Trying out model

This commit is contained in:
2025-05-28 17:43:05 +02:00
parent ac35d7f8d9
commit 4b0be3bd4e
17 changed files with 220 additions and 47 deletions

View File

@@ -33,7 +33,7 @@ scraper:
language: language:
steps: steps:
- type: "xpath" - type: "xpath"
value: "//li/abbr/@title" value: "//li/abbr"
- type: "xpath" - type: "xpath"
value: "//abbr/@title" value: "//abbr/@title"
region-key: region-key:

View File

@@ -0,0 +1,14 @@
package com.rak.model.config;
import org.eclipse.microprofile.config.spi.Converter;
/**
 * Container for an experimental MicroProfile Config {@link Converter}.
 */
public class TestConverter {
    /**
     * Converter declared for {@link Attribute} values.
     *
     * NOTE(review): this is a stub — it performs no parsing and yields
     * {@code null} for every input. Confirm whether a real implementation
     * is still planned before relying on it.
     */
    public static class ActualConverter implements Converter<Attribute> {
        /**
         * @param value the raw configuration string; currently ignored
         * @return always {@code null}
         */
        @Override
        public Attribute convert(final String value) throws IllegalArgumentException, NullPointerException {
            // Placeholder result: nothing is derived from `value` yet.
            return null;
        }
    }
}

View File

@@ -13,7 +13,7 @@ interface SourcesConfiguration {
fun id(): String fun id(): String
fun name(): String fun name(): String
fun domain(): String fun domain(): String
fun urlPatterns(): Optional<MutableList<String>> fun urlPatterns(): Optional<MutableSet<String>>
fun selectors(): Selectors fun selectors(): Selectors
interface Selectors { interface Selectors {

View File

@@ -31,7 +31,7 @@ class ExampleResource(
provider: String, provider: String,
@RestQuery @RestQuery
path: String path: String
): String { ): Map<String, String> {
val sources = sourcesConfiguration val sources = sourcesConfiguration
.sources() .sources()
.filter { .filter {
@@ -44,13 +44,19 @@ class ExampleResource(
.trim() .trim()
.replace(" ", "_") .replace(" ", "_")
Log.info(newPath)
val doc: Document = Jsoup.connect("https://${source.domain()}/$newPath").get() val doc: Document = Jsoup.connect("https://${source.domain()}/$newPath").get()
val regionalSetSelector = source.selectors().regionalSet().get() val regionalSetSelector = source.selectors().regionalSet().get()
val regionalSetRoot = doc.selectFirst(regionalSetSelector.root().get())!! val regionalSetRoot = doc.selectFirst(regionalSetSelector.root().get())!!
return scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().id().steps()) ?: "whoomp whoomp" val setId: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().id().steps())
val setLanguage: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().language().steps())
val setKey: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().regionKey().steps())
return mapOf(
Pair("id", setId ?: "N/A"),
Pair("language", setLanguage ?: "N/A"),
Pair("key", setKey ?: "N/A"),
)
} }
} }

View File

@@ -0,0 +1,7 @@
package com.rak.model
/**
 * Kind of result an XPath expression points at, as classified by
 * `XPathUtil.getXPathTargetFromPath`.
 */
enum class XPathTarget {
    /** The path ends in `text()`. */
    TEXT,
    /** The path ends in an `@name` attribute reference. */
    ATTRIBUTE,
    /** Any path that is classified as neither [TEXT] nor [ATTRIBUTE]. */
    ELEMENT
}

View File

@@ -0,0 +1,5 @@
package com.rak.model.config
/**
 * A value to scrape, described by the [Step]s used to reach it.
 *
 * NOTE(review): `Set` makes no ordering promise in its contract — confirm
 * that the concrete set used here preserves the configured step order.
 *
 * @property steps the selector steps for this attribute
 */
data class Attribute(
    val steps: Set<Step>
)

View File

@@ -0,0 +1,8 @@
package com.rak.model.config
/**
 * [ConfigModel] holding the selectors for a card.
 *
 * @property root presumably the selector of the element the attributes are resolved against — TODO confirm
 * @property name steps locating the card name
 * @property attack steps locating the attack value
 * @property effect steps locating the effect text
 */
data class CardConfigModel(
    override val root: String,
    val name: Attribute,
    val attack: Attribute,
    val effect: Attribute,
) : ConfigModel

View File

@@ -0,0 +1,5 @@
package com.rak.model.config
/**
 * Common shape of a selector model: every implementation exposes a [root].
 */
interface ConfigModel {
    /** Root selector string; presumably mirrors the `root` key of the YAML selectors — TODO confirm. */
    val root: String
}

View File

@@ -0,0 +1,8 @@
package com.rak.model.config
/**
 * [ConfigModel] holding the selectors for a regional set.
 *
 * @property root presumably the selector of the element the attributes are resolved against — TODO confirm
 * @property id steps locating the set id
 * @property language steps locating the set language
 * @property regionKey steps locating the region key
 */
data class RegionalSetConfigModel(
    override val root: String,
    val id: Attribute,
    val language: Attribute,
    val regionKey: Attribute
) : ConfigModel

View File

@@ -0,0 +1,8 @@
package com.rak.model.config
/**
 * Immutable model of one scraper source.
 *
 * NOTE(review): the YAML source entries also carry a `domain` key, which has
 * no counterpart here — confirm whether that is intentional.
 *
 * @property id identifier of the source
 * @property name display name of the source
 * @property urlPatterns URL patterns associated with the source; may be empty
 * @property selectors selector models defined for the source
 */
data class SourceConfiguration(
    val id: String,
    val name: String,
    val urlPatterns: Set<String>,
    val selectors: Set<ConfigModel>
)

View File

@@ -0,0 +1,11 @@
package com.rak.model.config
/**
 * A single selector step.
 *
 * @property type the selector language of [value]
 * @property value the selector expression itself
 */
data class Step(
    val type: Type,
    val value: String
) {
    /** Selector languages a [Step] can be written in. */
    enum class Type {
        XPATH,
        CSS
    }
}

View File

@@ -1,68 +1,40 @@
package com.rak.service package com.rak.service
import com.rak.config.SourcesConfiguration import com.rak.config.SourcesConfiguration
import com.rak.model.XPathTarget
import com.rak.util.XPathUtil
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode import org.jsoup.nodes.TextNode
import org.jsoup.select.Evaluator
import java.util.concurrent.LinkedBlockingQueue import java.util.concurrent.LinkedBlockingQueue
@ApplicationScoped @ApplicationScoped
class ScrapeService { class ScrapeService {
companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
private fun evaluateXpath(element: Element, xpath: String): Element? {
return element.selectXpath(xpath).first()
}
private fun evaluateCssSelector(element: Element, cssSelector: String): Element? {
return null
}
// XPath
// - text()
// - last step (default to text())
// CSS
// - last step???
private fun untilText(): String? {
return null
}
}
fun extractTextFromRootBySteps( fun extractTextFromRootBySteps(
root: Element, root: Element,
steps: Set<SourcesConfiguration.SourceConfig.Selectors.StepDefinition> steps: Set<SourcesConfiguration.SourceConfig.Selectors.StepDefinition>
): String? { ): String? {
var currentElement: Element? = root.clone() var currentElement: Element? = root.clone()
val stepsAsQueue = LinkedBlockingQueue( var result: String? = null
steps
)
while (stepsAsQueue.isNotEmpty()) {
val step = stepsAsQueue.take()
val stepTargetsTextNode: Boolean = TEXT_NODE_MATCHER.matches(step.value())
for (index in 0 until steps.size) {
val currentStep = steps.elementAtOrNull(index) ?: return null
if (currentElement == null) { if (currentElement == null) {
return null throw IllegalStateException()
} }
currentElement = if (step.type() == "xpath") { if (index == steps.size - 1) {
if (stepTargetsTextNode) { result = XPathUtil.extractResult(currentElement, currentStep.value())
return currentElement.selectXpath(step.value(), TextNode::class.java).first().text()
} }
else { else {
currentElement.selectXpath(step.value()).first() currentElement = XPathUtil.getNextElement(currentElement, currentStep.value())
}
}
else {
currentElement.selectFirst(step.value())
} }
} }
return null
return result
} }
} }

View File

@@ -0,0 +1,31 @@
package com.rak.service
import com.rak.config.SourcesConfiguration
import com.rak.model.config.SourceConfiguration
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped
/**
 * Application-scoped bean that turns the entries of [SourcesConfiguration]
 * into [SourceConfiguration] models and keeps them in a private set.
 */
@ApplicationScoped
class SourceService(
    private val sourcesConfiguration: SourcesConfiguration,
) {

    private val sources: MutableSet<SourceConfiguration> = mutableSetOf()

    /**
     * Fills [sources] with one model per configured source.
     *
     * Missing URL patterns become an empty set; selectors are not mapped yet
     * and are always an empty set.
     */
    @PostConstruct
    fun init() {
        for (entry in sourcesConfiguration.sources()) {
            sources += SourceConfiguration(
                id = entry.id(),
                name = entry.name(),
                urlPatterns = entry.urlPatterns().orElse(mutableSetOf()),
                selectors = setOf(),
            )
        }
    }
}

View File

@@ -0,0 +1,54 @@
package com.rak.util
import com.rak.model.XPathTarget
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
/**
 * XPath helpers for walking selector steps over a jsoup [Element].
 *
 * Not instantiable; all functions live on the companion object.
 */
class XPathUtil private constructor() {

    companion object {

        /** Matches any path that ends in `text()`. */
        private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")

        /** Matches `//`-rooted lowercase paths ending in `@name`; group 1 is the attribute name. */
        private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")

        /**
         * Reads the attribute named at the end of [xpath] directly from [root].
         *
         * Only the attribute name is taken from [xpath]; the element part of
         * the path is not evaluated, so [root] must already be the element
         * that carries the attribute.
         *
         * @return the attribute value, or `null` when [xpath] is not an
         * attribute path or the value is blank
         */
        private fun extractTextFromAttribute(root: Element, xpath: String): String? {
            // `groupValues` is reached through a safe call, so the index lookup
            // has to be a safe `get` as well rather than a plain `[1]`.
            val attributeName = ATTRIBUTE_MATCHER.matchEntire(xpath)?.groupValues?.get(1) ?: return null
            return root.attr(attributeName).takeIf { it.isNotBlank() }
        }

        /**
         * Evaluates [xpath] against [root] for text nodes.
         *
         * @return the text of the first matching text node, or `null` when nothing matches
         */
        private fun extractTextFromNode(root: Element, xpath: String): String? {
            return root
                .selectXpath(xpath, TextNode::class.java)
                .firstOrNull()
                ?.text()
        }

        /**
         * Evaluates [path] against [root].
         *
         * @return the first matching element, or `null` when nothing matches
         */
        fun getNextElement(root: Element, path: String): Element? {
            return root.selectXpath(path).firstOrNull()
        }

        /**
         * Extracts a string from [root] according to what [path] targets.
         *
         * @return the text-node text for [XPathTarget.TEXT], the attribute
         * value for [XPathTarget.ATTRIBUTE], and `null` for [XPathTarget.ELEMENT]
         */
        fun extractResult(root: Element, path: String): String? {
            // Every target is listed explicitly so that a new enum entry
            // fails compilation here instead of silently yielding null.
            return when (getXPathTargetFromPath(path)) {
                XPathTarget.TEXT -> extractTextFromNode(root, path)
                XPathTarget.ATTRIBUTE -> extractTextFromAttribute(root, path)
                XPathTarget.ELEMENT -> null
            }
        }

        /**
         * Classifies [path] by its textual shape.
         *
         * The `text()` check runs first; a path matching neither pattern is
         * reported as [XPathTarget.ELEMENT].
         */
        fun getXPathTargetFromPath(path: String): XPathTarget {
            return when {
                TEXT_NODE_MATCHER.matches(path) -> XPathTarget.TEXT
                ATTRIBUTE_MATCHER.matches(path) -> XPathTarget.ATTRIBUTE
                else -> XPathTarget.ELEMENT
            }
        }
    }
}

View File

@@ -0,0 +1 @@
com.rak.model.config.TestConverter$ActualConverter

View File

@@ -1 +0,0 @@
quarkus.config.locations=sources.yml

View File

@@ -0,0 +1,44 @@
# Scraper source definitions: each list entry under `sources` describes one site.
scraper:
sources:
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
# Regular expressions; backslashes are doubled because the values are double-quoted.
url-patterns:
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
selectors:
card:
name:
# NOTE(review): the step walker visible in this commit passes every step
# value to XPath evaluation without looking at `type` — confirm that
# `css` steps are actually supported.
steps:
- type: "css"
value: "h1.product-title"
- type: "xpath"
value: "//h1[@itemprop='name']"
attack:
steps:
- type: "css"
value: ".atk-value"
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com"
url-patterns:
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
selectors:
regional-set:
# CSS selector of the element that the steps below are resolved against.
root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id:
steps:
- type: "xpath"
value: "//li/text()"
language:
# Two steps: the first moves to the <abbr> element, the second reads its `title` attribute.
steps:
- type: "xpath"
value: "//li/abbr"
- type: "xpath"
value: "//abbr/@title"
region-key:
steps:
- type: "xpath"
value: "//li/abbr/text()"
# NOTE(review): looks like a scratch entry — confirm whether it is still needed.
testing:
waaa: test