This commit is contained in:
rak
2025-05-25 18:58:10 +02:00
commit ac35d7f8d9
27 changed files with 1048 additions and 0 deletions

View File

@@ -0,0 +1,50 @@
package com.rak.config
import io.smallrye.config.ConfigMapping
import java.util.*
/**
 * Typed view of the configuration found under the `scraper` prefix.
 *
 * The mapping describes a list of scrape sources; each source carries the selectors
 * used to locate model data (cards, regional sets) in that source's pages.
 */
@ConfigMapping(prefix = "scraper")
interface SourcesConfiguration {

    /** All configured scrape sources. */
    fun sources(): MutableList<SourceConfig>

    /** A single scrape source. */
    interface SourceConfig {

        /** Identifier of the source. */
        fun id(): String

        /** Name of the source. */
        fun name(): String

        /** Domain of the source. */
        fun domain(): String

        /** Optional list of URL patterns for this source. */
        fun urlPatterns(): Optional<MutableList<String>>

        /** Selector definitions for the models that can be read from this source. */
        fun selectors(): Selectors

        /** Groups the per-model selector definitions of one source. */
        interface Selectors {

            /** Selectors for a card; may be absent. */
            fun card(): Optional<CardDefinition>

            /** Selectors for a regional set; may be absent. */
            fun regionalSet(): Optional<RegionalSetDefinition>

            /** One selector step: a selector expression together with its kind. */
            interface StepDefinition {

                /** Kind of the selector, e.g. `css` or `xpath`. */
                fun type(): String

                /** The selector expression itself. */
                fun value(): String
            }

            /**
             * A selector expressed as a collection of steps.
             *
             * NOTE(review): the steps are typed as a [Set]; if their order matters to
             * consumers, confirm that the configuration framework preserves it.
             */
            interface SelectorDefinition {

                /** The steps that make up this selector. */
                fun steps(): Set<StepDefinition>
            }

            /** Base for model definitions that may declare a root selector. */
            interface AbstractModelDefinition {

                /** Optional selector of the model's root element. */
                fun root(): Optional<String>
            }

            /** Field selectors of a card. */
            interface CardDefinition {

                /** Selector for the card's name. */
                fun name(): SelectorDefinition

                /** Selector for the card's attack. */
                fun attack(): SelectorDefinition

                /** Selector for the card's effect. */
                fun effect(): SelectorDefinition
            }

            /** Field selectors of a regional set; inherits the optional root selector. */
            interface RegionalSetDefinition : AbstractModelDefinition {

                /** Selector for the regional set's id. */
                fun id(): SelectorDefinition

                /** Selector for the regional set's language. */
                fun language(): SelectorDefinition

                /** Selector for the regional set's region key. */
                fun regionKey(): SelectorDefinition
            }
        }
    }
}

View File

@@ -0,0 +1,56 @@
package com.rak.controller
import com.rak.config.SourcesConfiguration
import com.rak.service.ScrapeService
import io.quarkus.logging.Log
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.Produces
import jakarta.ws.rs.core.MediaType
import org.jboss.resteasy.reactive.RestQuery
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
/**
 * REST resource served under `/hello`.
 *
 * Fetches a page from one of the configured scrape sources and returns the regional-set
 * id that [ScrapeService] extracts from it.
 */
@Path("/hello")
class ExampleResource(
    private val sourcesConfiguration: SourcesConfiguration,
    private val scrapeService: ScrapeService
) {

    /**
     * Scrapes the regional-set id from `https://<source domain>/<path>`.
     *
     * @param provider id of the configured source to use; compared case-insensitively.
     * @param path page path below the source's domain; it is trimmed and spaces are
     *   replaced by underscores before the request is made.
     * @return the extracted text, or `"whoomp whoomp"` when the extraction yields nothing.
     * @throws IllegalArgumentException if no source with the given id is configured.
     * @throws NoSuchElementException if the source has no regional-set selectors or no
     *   root selector configured, or if the root selector matches nothing in the page.
     */
    @GET
    @Produces(MediaType.TEXT_PLAIN)
    fun hello(
        @RestQuery
        provider: String,
        @RestQuery
        path: String
    ): String {
        val source = sourcesConfiguration
            .sources()
            .firstOrNull { it.id().equals(provider, ignoreCase = true) }
            ?: throw IllegalArgumentException("Provider $provider not found")

        // Resolve the selector configuration before touching the network so that a
        // misconfigured source fails fast and with a descriptive message.
        val regionalSetSelector = source.selectors().regionalSet().orElseThrow {
            NoSuchElementException("Provider $provider has no regional set selectors configured")
        }
        val rootSelector = regionalSetSelector.root().orElseThrow {
            NoSuchElementException("Provider $provider has no regional set root selector configured")
        }

        val newPath: String = path
            .trim()
            .replace(" ", "_")
        Log.info(newPath)

        // NOTE(review): `path` is caller-supplied and is placed into the URL unescaped.
        val doc: Document = Jsoup.connect("https://${source.domain()}/$newPath").get()
        val regionalSetRoot = doc.selectFirst(rootSelector)
            ?: throw NoSuchElementException("Root selector '$rootSelector' matched nothing for path $newPath")

        return scrapeService.extractTextFromRootBySteps(regionalSetRoot, regionalSetSelector.id().steps())
            ?: "whoomp whoomp"
    }
}

View File

@@ -0,0 +1,4 @@
package com.rak.model.scrape
/**
 * Common base type for scraper implementations.
 *
 * It currently declares no members.
 */
abstract class AbstractScraper

View File

@@ -0,0 +1,6 @@
package com.rak.model.scrape
/**
 * [AbstractScraper] subtype; presumably meant to be backed by jsoup (inferred from the
 * name only — confirm).
 *
 * It currently adds no members of its own.
 */
class JsoupScraper : AbstractScraper()

View File

@@ -0,0 +1,5 @@
package com.rak.model.scrape
/**
 * Describes one scrape job.
 *
 * @property url the URL associated with this job.
 */
data class ScrapeJob(val url: String)

View File

@@ -0,0 +1,6 @@
package com.rak.model.scrape.selector
/** The kinds of selector expression known to the scraper. */
enum class Selector {
    /** A CSS selector. */
    CSS,

    /** An XPath expression. */
    XPATH,
}

View File

@@ -0,0 +1,29 @@
package com.rak.service
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
/**
 * REST client interface registered against `https://stage.code.quarkus.io/api`.
 *
 * Obtain an instance through injection:
 *
 * ```kotlin
 * @Inject
 * @RestClient
 * lateinit var myRemoteService: MyRemoteService
 *
 * fun doSomething() {
 *     val restClientExtensions = myRemoteService.getExtensionsById("io.quarkus:quarkus-rest-client")
 * }
 * ```
 */
@RegisterRestClient(baseUri = "https://stage.code.quarkus.io/api")
interface MyRemoteService {

    /**
     * One extension entry as returned by the `/extensions` endpoint.
     *
     * @property id identifier of the extension.
     * @property name name of the extension.
     * @property shortName short name of the extension.
     * @property keywords keywords attached to the extension.
     */
    data class Extension(
        val id: String,
        val name: String,
        val shortName: String,
        val keywords: List<String>,
    )

    /**
     * Issues `GET /extensions` with the given value as the `id` query parameter.
     *
     * @param id value sent as the `id` query parameter.
     * @return the extensions contained in the response.
     */
    @GET
    @Path("/extensions")
    fun getExtensionsById(@QueryParam("id") id: String): Set<Extension>
}

View File

@@ -0,0 +1,68 @@
package com.rak.service
import com.rak.config.SourcesConfiguration
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import java.util.concurrent.LinkedBlockingQueue
/**
 * Extracts text from a jsoup element tree by applying configured selector steps.
 */
@ApplicationScoped
class ScrapeService {

    /**
     * Walks from [root] through [steps] and returns the text found at the end.
     *
     * Steps are applied in the iteration order of [steps]; each step is evaluated
     * relative to the element selected by the previous one. A step whose type is
     * `xpath` (case-insensitive) is evaluated as XPath; any other type is evaluated as
     * a CSS selector.
     *
     * - An XPath step whose expression ends in `text()` ends the walk immediately and
     *   yields the text of the first matching text node; any remaining steps are ignored.
     * - Otherwise, once every step has been applied, the text of the element reached is
     *   returned. With no steps at all this is the text of [root].
     *
     * NOTE(review): [steps] is a [Set]; the walk is only meaningful if the set's
     * iteration order is the intended step order — confirm against how it is built.
     *
     * @param root element the first step is evaluated against; it is cloned and never modified.
     * @param steps selector steps to apply.
     * @return the extracted text, or `null` when any step matches nothing.
     */
    fun extractTextFromRootBySteps(
        root: Element,
        steps: Set<SourcesConfiguration.SourceConfig.Selectors.StepDefinition>
    ): String? {
        var currentElement: Element = root.clone()

        for (step in steps) {
            val expression = step.value()
            val isXpath = step.type().equals(XPATH_STEP_TYPE, ignoreCase = true)

            if (isXpath && TEXT_NODE_MATCHER.matches(expression)) {
                // A text() step selects text nodes rather than elements, so it must be
                // the terminal step. No match yields null instead of throwing.
                return currentElement
                    .selectXpath(expression, TextNode::class.java)
                    .firstOrNull()
                    ?.text()
            }

            val next: Element? = if (isXpath) {
                currentElement.selectXpath(expression).first()
            } else {
                currentElement.selectFirst(expression)
            }
            currentElement = next ?: return null
        }

        // No explicit text() step: default to the text of the element reached.
        return currentElement.text()
    }

    companion object {
        /** Step type that marks a step's value as an XPath expression. */
        private const val XPATH_STEP_TYPE = "xpath"

        /** Matches XPath expressions that end in `text()`, i.e. that target text nodes. */
        private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
    }
}