Trying out model

This commit is contained in:
2025-05-28 17:43:05 +02:00
parent ac35d7f8d9
commit 4b0be3bd4e
17 changed files with 220 additions and 47 deletions

View File

@@ -0,0 +1,14 @@
package com.rak.model.config;
import org.eclipse.microprofile.config.spi.Converter;
/**
 * Holder class for a placeholder MicroProfile Config converter.
 */
public class TestConverter {

    /**
     * Stub {@link Converter} for {@link Attribute}.
     *
     * <p>The implementation ignores its input and always yields {@code null};
     * no parsing of the configuration value takes place yet.
     */
    public static class ActualConverter implements Converter<Attribute> {

        /**
         * Converts a raw configuration value into an {@link Attribute}.
         *
         * @param value the raw configuration value (currently unused)
         * @return always {@code null}
         */
        @Override
        public Attribute convert(final String value) throws IllegalArgumentException, NullPointerException {
            return null;
        }
    }
}

View File

@@ -13,7 +13,7 @@ interface SourcesConfiguration {
fun id(): String
fun name(): String
fun domain(): String
fun urlPatterns(): Optional<MutableList<String>>
fun urlPatterns(): Optional<MutableSet<String>>
fun selectors(): Selectors
interface Selectors {

View File

@@ -31,7 +31,7 @@ class ExampleResource(
provider: String,
@RestQuery
path: String
): String {
): Map<String, String> {
val sources = sourcesConfiguration
.sources()
.filter {
@@ -44,13 +44,19 @@ class ExampleResource(
.trim()
.replace(" ", "_")
Log.info(newPath)
val doc: Document = Jsoup.connect("https://${source.domain()}/$newPath").get()
val regionalSetSelector = source.selectors().regionalSet().get()
val regionalSetRoot = doc.selectFirst(regionalSetSelector.root().get())!!
return scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().id().steps()) ?: "whoomp whoomp"
val setId: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().id().steps())
val setLanguage: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().language().steps())
val setKey: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().regionKey().steps())
return mapOf(
Pair("id", setId ?: "N/A"),
Pair("language", setLanguage ?: "N/A"),
Pair("key", setKey ?: "N/A"),
)
}
}

View File

@@ -0,0 +1,7 @@
package com.rak.model
/**
 * Classifies what an XPath expression points at.
 *
 * The declaration order (TEXT, ATTRIBUTE, ELEMENT) is part of the enum's
 * contract because it fixes each constant's ordinal.
 */
enum class XPathTarget {
    /** The expression addresses a text node. */
    TEXT,

    /** The expression addresses an attribute. */
    ATTRIBUTE,

    /** The expression addresses an element. */
    ELEMENT,
}

View File

@@ -0,0 +1,5 @@
package com.rak.model.config
/**
 * Configuration model for a single attribute.
 *
 * @property steps the set of [Step]s configured for this attribute
 */
data class Attribute(
    val steps: Set<Step>,
)

View File

@@ -0,0 +1,8 @@
package com.rak.model.config
/**
 * [ConfigModel] describing a card.
 *
 * @property root root value required by [ConfigModel]
 * @property name attribute configuration for the card's name
 * @property attack attribute configuration for the card's attack
 * @property effect attribute configuration for the card's effect
 */
data class CardConfigModel(
    override val root: String,
    val name: Attribute,
    val attack: Attribute,
    val effect: Attribute,
) : ConfigModel

View File

@@ -0,0 +1,5 @@
package com.rak.model.config
/**
 * Common contract for configuration models: every implementation exposes a [root] string.
 */
interface ConfigModel {
    /** Root value of this model; presumably a selector for the model's root node — TODO confirm. */
    val root: String
}

View File

@@ -0,0 +1,8 @@
package com.rak.model.config
/**
 * [ConfigModel] describing a regional set.
 *
 * @property root root value required by [ConfigModel]
 * @property id attribute configuration for the set's id
 * @property language attribute configuration for the set's language
 * @property regionKey attribute configuration for the set's region key
 */
data class RegionalSetConfigModel(
    override val root: String,
    val id: Attribute,
    val language: Attribute,
    val regionKey: Attribute,
) : ConfigModel

View File

@@ -0,0 +1,8 @@
package com.rak.model.config
/**
 * Immutable description of one configured source.
 *
 * @property id identifier of the source
 * @property name name of the source
 * @property urlPatterns URL patterns associated with the source
 * @property selectors the [ConfigModel]s configured for the source
 */
data class SourceConfiguration(
    val id: String,
    val name: String,
    val urlPatterns: Set<String>,
    val selectors: Set<ConfigModel>,
)

View File

@@ -0,0 +1,11 @@
package com.rak.model.config
/**
 * A single selector step.
 *
 * @property type the selector language the step is written in
 * @property value the selector expression itself
 */
data class Step(
    val type: Type,
    val value: String,
) {
    /** Selector languages a [Step] can be expressed in. */
    enum class Type {
        XPATH,
        CSS,
    }
}

View File

@@ -1,68 +1,40 @@
package com.rak.service
import com.rak.config.SourcesConfiguration
import com.rak.model.XPathTarget
import com.rak.util.XPathUtil
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import org.jsoup.select.Evaluator
import java.util.concurrent.LinkedBlockingQueue
@ApplicationScoped
class ScrapeService {
companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
private fun evaluateXpath(element: Element, xpath: String): Element? {
return element.selectXpath(xpath).first()
}
private fun evaluateCssSelector(element: Element, cssSelector: String): Element? {
return null
}
// XPath
// - text()
// - last step (default to text())
// CSS
// - last step???
private fun untilText(): String? {
return null
}
}
fun extractTextFromRootBySteps(
root: Element,
steps: Set<SourcesConfiguration.SourceConfig.Selectors.StepDefinition>
): String? {
var currentElement: Element? = root.clone()
val stepsAsQueue = LinkedBlockingQueue(
steps
)
while (stepsAsQueue.isNotEmpty()) {
val step = stepsAsQueue.take()
val stepTargetsTextNode: Boolean = TEXT_NODE_MATCHER.matches(step.value())
var result: String? = null
for (index in 0 until steps.size) {
val currentStep = steps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
return null
throw IllegalStateException()
}
currentElement = if (step.type() == "xpath") {
if (stepTargetsTextNode) {
return currentElement.selectXpath(step.value(), TextNode::class.java).first().text()
}
else {
currentElement.selectXpath(step.value()).first()
}
if (index == steps.size - 1) {
result = XPathUtil.extractResult(currentElement, currentStep.value())
}
else {
currentElement.selectFirst(step.value())
currentElement = XPathUtil.getNextElement(currentElement, currentStep.value())
}
}
return null
return result
}
}

View File

@@ -0,0 +1,31 @@
package com.rak.service
import com.rak.config.SourcesConfiguration
import com.rak.model.config.SourceConfiguration
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped
/**
 * Application-scoped bean that turns the entries of the injected
 * [SourcesConfiguration] into [SourceConfiguration] models.
 */
@ApplicationScoped
class SourceService(
    private val sourcesConfiguration: SourcesConfiguration,
) {

    /** Models collected by [init]; starts empty. */
    private val sources: MutableSet<SourceConfiguration> = mutableSetOf()

    /**
     * Builds one [SourceConfiguration] per configured source and stores it in [sources].
     *
     * Annotated with `@PostConstruct`, so it is meant to be invoked by the container
     * rather than called directly.
     */
    @PostConstruct
    fun init() {
        for (rawSource in sourcesConfiguration.sources()) {
            sources += SourceConfiguration(
                id = rawSource.id(),
                name = rawSource.name(),
                // URL patterns are optional in the configuration; fall back to an empty set.
                urlPatterns = rawSource.urlPatterns().orElse(mutableSetOf()),
                // Selectors are not mapped yet, so every model gets an empty set.
                selectors = setOf(),
            )
        }
    }
}

View File

@@ -0,0 +1,54 @@
package com.rak.util
import com.rak.model.XPathTarget
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
/**
 * Helpers for evaluating a single XPath expression against a jsoup [Element].
 *
 * The class cannot be instantiated; everything is exposed through the companion object.
 */
class XPathUtil private constructor() {
    companion object {
        /** Matches any path that ends in `text()`. */
        private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")

        /**
         * Matches paths that start with `//`, continue with lowercase letters and
         * slashes only, and end in `@` followed by a lowercase attribute name.
         * Group 1 captures the attribute name.
         */
        private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")

        /**
         * Reads the attribute named at the end of [xpath] directly from [root].
         *
         * Only the attribute name is taken from [xpath]; the element part of the
         * expression is not evaluated, so [root] must already be the element that
         * carries the attribute.
         *
         * @return the attribute value, or `null` when [xpath] is not an attribute
         *   path or the value is blank
         */
        private fun extractTextFromAttribute(root: Element, xpath: String): String? {
            // A safe call does not short-circuit a following index operator, so the
            // group has to be read with `?.get(1)` instead of `[1]`.
            val attributeName = ATTRIBUTE_MATCHER.matchEntire(xpath)?.groupValues?.get(1) ?: return null
            return root.attr(attributeName).ifBlank { null }
        }

        /**
         * Evaluates [xpath] against [root] for text nodes.
         *
         * @return the text of the first matching text node, or `null` when nothing matches
         */
        private fun extractTextFromNode(root: Element, xpath: String): String? =
            root
                .selectXpath(xpath, TextNode::class.java)
                .firstOrNull()
                ?.text()

        /**
         * Evaluates [path] against [root] for elements.
         *
         * @return the first matching element, or `null` when nothing matches
         */
        fun getNextElement(root: Element, path: String): Element? =
            root.selectXpath(path).firstOrNull()

        /**
         * Extracts a string result for [path], depending on what the path targets.
         *
         * @return the text-node text for [XPathTarget.TEXT], the attribute value for
         *   [XPathTarget.ATTRIBUTE], and `null` for [XPathTarget.ELEMENT]
         */
        fun extractResult(root: Element, path: String): String? =
            // Exhaustive on purpose: a new XPathTarget constant must be handled here explicitly.
            when (getXPathTargetFromPath(path)) {
                XPathTarget.TEXT -> extractTextFromNode(root, path)
                XPathTarget.ATTRIBUTE -> extractTextFromAttribute(root, path)
                XPathTarget.ELEMENT -> null
            }

        /**
         * Classifies [path] by its shape.
         *
         * The text check runs first, then the attribute check; anything else is
         * treated as an element path.
         */
        fun getXPathTargetFromPath(path: String): XPathTarget =
            when {
                TEXT_NODE_MATCHER.matches(path) -> XPathTarget.TEXT
                ATTRIBUTE_MATCHER.matches(path) -> XPathTarget.ATTRIBUTE
                else -> XPathTarget.ELEMENT
            }
    }
}

View File

@@ -0,0 +1 @@
com.rak.model.config.TestConverter$ActualConverter

View File

@@ -1 +0,0 @@
quarkus.config.locations=sources.yml

View File

@@ -33,10 +33,12 @@ scraper:
language:
steps:
- type: "xpath"
value: "//li/abbr/@title"
value: "//li/abbr"
- type: "xpath"
value: "//abbr/@title"
region-key:
steps:
- type: "xpath"
value: "//li/abbr/text()"
value: "//li/abbr/text()"
testing:
waaa: test