Implement XPath index access

This commit is contained in:
rak
2025-06-25 23:11:05 +02:00
parent 39c0ebfc7c
commit e97f9bdd61
6 changed files with 48 additions and 72 deletions

View File

@@ -6,7 +6,4 @@ import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
interface DiscriminatorConfig : ScrapeTargetFieldConfig {
@WithName("direction")
@WithConverter(DiscriminatorDirectionConverter::class)
fun getDiscriminatorDirection(): DiscriminatorDirection
}

View File

@@ -84,7 +84,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
}
}
protected fun extractAsMap(
protected fun extractSingle(
document: Element,
extractionConfig: T
): Map<String, String> {
@@ -108,7 +108,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return result
}
fun extractAsListOfMaps(
fun extractMulti(
element: Element,
extractionConfig: T
): List<Map<String, String>> {
@@ -143,7 +143,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return resultList
}
fun extractAsListOfMaps(
fun extractMulti(
elements: Elements,
extractionConfig: T
): List<Map<String, String>> {
@@ -174,64 +174,25 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return resultList
}
fun extractWithDiscriminator(
fun extractMultiWithDiscriminator(
element: Element,
extractionConfig: T
): List<List<Map<String, String>>>{
val rootElement = getRootElement(
val rootElements = getRootElements(
element,
extractionConfig.getRootConfig(),
extractionConfig.getDiscriminator().get().getRootConfig(),
Optional.empty<ExtractConfig>()
)
var rootElements = getRootElements(
element,
extractionConfig.getRootConfig(),
Optional.empty<ExtractConfig>()
)
val discriminatedElements = getElementsFromElementByExtractConfig(
rootElement,
extractionConfig.getDiscriminator().get().getRootConfig().get(),
)
val discriminations = mutableListOf<String>()
val result = mutableListOf<List<Map<String, String>>>()
for (element in discriminatedElements) {
val discriminatorValue: String = extractTextFromElementByTargetFieldConfig(
for(element in rootElements) {
result.add(extractMulti(
element,
extractionConfig.getDiscriminator().get()
) ?: throw ElementNotFoundException("")
discriminations.add(discriminatorValue)
extractionConfig
))
}
val definitiveElements = if (discriminations.size < rootElements.size) {
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
rootElements = Elements(rootElements.reversed())
}
while (discriminations.size < rootElements.size) {
rootElements.removeFirst()
}
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
rootElements = Elements(rootElements.reversed())
}
rootElements
} else {
rootElements
}
result.add(extractAsListOfMaps(
definitiveElements,
extractionConfig
))
return result
}

View File

@@ -3,13 +3,9 @@ package com.rak.service
import com.rak.config.model.CardPrintScrapeTargetConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.SetScrapeTargetConfig
import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
@ApplicationScoped
@@ -37,7 +33,7 @@ class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrin
providerConfig: ProviderConfig,
extractionConfig: CardPrintScrapeTargetConfig
): Collection<CardPrint> {
val objectAsListOfMaps = extractWithDiscriminator(element, extractionConfig)
val objectAsListOfMaps = extractMultiWithDiscriminator(element, extractionConfig)
return objectAsListOfMaps.map {
CardPrint.fromMap(it[0])

View File

@@ -36,8 +36,8 @@ class RegionalSetExtractionService(
providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig
): Collection<RegionalSet> {
val regionalSetList = extractAsListOfMaps(element, extractionConfig)
val cardPrintsInRegionalSet = extractAsListOfMaps(element, extractionConfig)
val regionalSetList = extractMulti(element, extractionConfig)
val cardPrintsInRegionalSet = extractMulti(element, extractionConfig)
val cardPrints = cardPrintExtractionService.extractMultiple(
element,

View File

@@ -3,12 +3,15 @@ package com.rak.util
import com.rak.model.XPathTarget
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements
import kotlin.coroutines.CoroutineContext
class XPathUtil private constructor() {
companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
// Matches a bracketed numeric index such as "[0]" or "[12]" inside a path; group 1 is the digits.
// Restricted to digits so that non-numeric predicates (e.g. "[b]") are never mistaken for an index,
// and so that indices with more than one digit are recognised.
private val INDEX_MATCHER: Regex = Regex("\\[(\\d+)\\]")
private fun extractTextFromAttribute(root: Element, xpath: String): String? {
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
@@ -20,6 +23,21 @@ class XPathUtil private constructor() {
}
}
/**
 * Evaluates [xpath] against [element], with support for a bracketed index in the path.
 *
 * When the path contains a bracketed numeric index (as recognised by `INDEX_MATCHER`), the part of
 * the path before the first such bracket is evaluated and the element at that zero-based position
 * in the result list is returned as a single-element [Elements]. An out-of-range index yields an
 * empty [Elements].
 *
 * When there is no bracketed index, or the bracket content is not a number (for example a
 * predicate such as `[b]`), the path is passed unchanged to jsoup.
 *
 * NOTE(review): anything following the index in the path is currently ignored, exactly as before
 * this change; callers chain a separate step for the remainder. Confirm this is intended.
 *
 * @param element the element the path is evaluated against
 * @param xpath the XPath expression, optionally containing one `[n]` index
 * @return the matching elements; never null
 */
private fun selectXpath(element: Element, xpath: String): Elements {
    val indexMatch = INDEX_MATCHER.find(xpath) ?: return element.selectXpath(xpath)
    // A non-numeric bracket is an ordinary XPath predicate, not an index: let jsoup handle it
    // instead of failing on the integer conversion.
    val index = indexMatch.groupValues[1].toIntOrNull() ?: return element.selectXpath(xpath)

    // Everything before the first bracketed index is the path that gets evaluated.
    val basePath = xpath.substring(0, indexMatch.range.first)
    val selected = element.selectXpath(basePath).getOrNull(index) ?: return Elements()
    return Elements(selected)
}
private fun extractTextFromNode(root: Element, xpath: String): String? {
return root
.selectXpath(xpath, TextNode::class.java)
@@ -27,7 +45,7 @@ class XPathUtil private constructor() {
}
fun getNextElement(element: Element, path: String): Element? {
return element.selectXpath(path).firstOrNull()
return selectXpath(element, path).firstOrNull()
}
fun extractResult(root: Element, path: String): String? {

View File

@@ -56,33 +56,37 @@ scraper:
value: "//li/abbr/text()"
card-print:
multi: true
root:
type: css
value: ".tabber.wds-tabber > div"
discriminator:
direction: asc
root:
type: css
value: ".wds-tabs__tab"
steps:
- type: xpath
value: "//li/div/a/text()"
value: ".wds-tab__content"
root:
type: css
value: "table > tbody > tr:has(> td)"
id:
steps:
- type: xpath
value: ".//table/tbody/tr[2]/td[1]/a/text()"
value: "./td/a[0]"
- type: xpath
value: "./text()"
name:
steps:
- type: xpath
value: ".//table/tbody/tr[2]/td[1]/a/text()"
value: "./td/a[1]"
- type: xpath
value: "./text()"
regional-name:
steps:
- type: xpath
value: ".//table/tbody/tr[2]/td[2]/a/text()"
value: "./td/a[2]"
- type: xpath
value: "./text()"
rarity:
steps:
- type: xpath
value: ".//table/tbody/tr[2]/td[3]/a/text()"
value: "./td/a[3]"
- type: xpath
value: "./text()"
card:
name:
root: