Implement XPath index access

This commit is contained in:
rak
2025-06-25 23:11:05 +02:00
parent 39c0ebfc7c
commit e97f9bdd61
6 changed files with 48 additions and 72 deletions

View File

@@ -6,7 +6,4 @@ import io.smallrye.config.WithConverter
import io.smallrye.config.WithName import io.smallrye.config.WithName
interface DiscriminatorConfig : ScrapeTargetFieldConfig { interface DiscriminatorConfig : ScrapeTargetFieldConfig {
@WithName("direction")
@WithConverter(DiscriminatorDirectionConverter::class)
fun getDiscriminatorDirection(): DiscriminatorDirection
} }

View File

@@ -84,7 +84,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
} }
} }
protected fun extractAsMap( protected fun extractSingle(
document: Element, document: Element,
extractionConfig: T extractionConfig: T
): Map<String, String> { ): Map<String, String> {
@@ -108,7 +108,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return result return result
} }
fun extractAsListOfMaps( fun extractMulti(
element: Element, element: Element,
extractionConfig: T extractionConfig: T
): List<Map<String, String>> { ): List<Map<String, String>> {
@@ -143,7 +143,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return resultList return resultList
} }
fun extractAsListOfMaps( fun extractMulti(
elements: Elements, elements: Elements,
extractionConfig: T extractionConfig: T
): List<Map<String, String>> { ): List<Map<String, String>> {
@@ -174,63 +174,24 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return resultList return resultList
} }
fun extractWithDiscriminator( fun extractMultiWithDiscriminator(
element: Element, element: Element,
extractionConfig: T extractionConfig: T
): List<List<Map<String, String>>>{ ): List<List<Map<String, String>>>{
val rootElement = getRootElement( val rootElements = getRootElements(
element, element,
extractionConfig.getRootConfig(), extractionConfig.getDiscriminator().get().getRootConfig(),
Optional.empty<ExtractConfig>() Optional.empty<ExtractConfig>()
) )
var rootElements = getRootElements(
element,
extractionConfig.getRootConfig(),
Optional.empty<ExtractConfig>()
)
val discriminatedElements = getElementsFromElementByExtractConfig(
rootElement,
extractionConfig.getDiscriminator().get().getRootConfig().get(),
)
val discriminations = mutableListOf<String>()
val result = mutableListOf<List<Map<String, String>>>() val result = mutableListOf<List<Map<String, String>>>()
for (element in discriminatedElements) { for(element in rootElements) {
val discriminatorValue: String = extractTextFromElementByTargetFieldConfig( result.add(extractMulti(
element, element,
extractionConfig.getDiscriminator().get()
) ?: throw ElementNotFoundException("")
discriminations.add(discriminatorValue)
}
val definitiveElements = if (discriminations.size < rootElements.size) {
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
rootElements = Elements(rootElements.reversed())
}
while (discriminations.size < rootElements.size) {
rootElements.removeFirst()
}
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
rootElements = Elements(rootElements.reversed())
}
rootElements
} else {
rootElements
}
result.add(extractAsListOfMaps(
definitiveElements,
extractionConfig extractionConfig
)) ))
}
return result return result
} }

View File

@@ -3,13 +3,9 @@ package com.rak.service
import com.rak.config.model.CardPrintScrapeTargetConfig import com.rak.config.model.CardPrintScrapeTargetConfig
import com.rak.config.model.ProviderConfig import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.SetScrapeTargetConfig
import com.rak.model.card.CardPrint import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException import com.rak.model.exception.NotImplementedException
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
@ApplicationScoped @ApplicationScoped
@@ -37,7 +33,7 @@ class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrin
providerConfig: ProviderConfig, providerConfig: ProviderConfig,
extractionConfig: CardPrintScrapeTargetConfig extractionConfig: CardPrintScrapeTargetConfig
): Collection<CardPrint> { ): Collection<CardPrint> {
val objectAsListOfMaps = extractWithDiscriminator(element, extractionConfig) val objectAsListOfMaps = extractMultiWithDiscriminator(element, extractionConfig)
return objectAsListOfMaps.map { return objectAsListOfMaps.map {
CardPrint.fromMap(it[0]) CardPrint.fromMap(it[0])

View File

@@ -36,8 +36,8 @@ class RegionalSetExtractionService(
providerConfig: ProviderConfig, providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig extractionConfig: SetScrapeTargetConfig
): Collection<RegionalSet> { ): Collection<RegionalSet> {
val regionalSetList = extractAsListOfMaps(element, extractionConfig) val regionalSetList = extractMulti(element, extractionConfig)
val cardPrintsInRegionalSet = extractAsListOfMaps(element, extractionConfig) val cardPrintsInRegionalSet = extractMulti(element, extractionConfig)
val cardPrints = cardPrintExtractionService.extractMultiple( val cardPrints = cardPrintExtractionService.extractMultiple(
element, element,

View File

@@ -3,12 +3,15 @@ package com.rak.util
import com.rak.model.XPathTarget import com.rak.model.XPathTarget
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements
import kotlin.coroutines.CoroutineContext
class XPathUtil private constructor() { class XPathUtil private constructor() {
companion object { companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$") private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$") private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
// Matches a purely numeric bracketed index such as "[0]" or "[12]" and captures the digits.
// Restricting the capture to digits keeps non-numeric XPath predicates (e.g. "[p]") from being
// mistaken for an index, and allows indexes with more than one digit.
private val INDEX_MATCHER: Regex = Regex("\\[(\\d+)\\]")
private fun extractTextFromAttribute(root: Element, xpath: String): String? { private fun extractTextFromAttribute(root: Element, xpath: String): String? {
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath) val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
@@ -20,6 +23,21 @@ class XPathUtil private constructor() {
} }
} }
/**
 * Evaluates [xpath] against [element], adding support for a bracketed numeric index.
 *
 * When [xpath] contains a bracketed token that parses as an integer (for example `./td/a[2]`),
 * the part of the path before that token is evaluated and the element at that zero-based
 * position of the result list is returned as a single-element [Elements]. If the position is
 * out of range, an empty [Elements] is returned.
 *
 * When no numeric index is present, the expression is passed to [Element.selectXpath] unchanged.
 *
 * NOTE(review): any path text following the index token is ignored, as in the previous
 * implementation; callers appear to express follow-up navigation as separate steps — confirm.
 */
private fun selectXpath(element: Element, xpath: String): Elements {
    // Pick the first bracketed token that is an actual number. Bracketed tokens that are not
    // numeric (e.g. a one-letter predicate) are skipped instead of failing the integer parse.
    val indexMatch = INDEX_MATCHER.findAll(xpath)
        .firstOrNull { it.groupValues[1].toIntOrNull() != null }
        ?: return element.selectXpath(xpath)

    val index = indexMatch.groupValues[1].toInt()
    val pathBeforeIndex = xpath.substring(0, indexMatch.range.first)

    // getOrNull replaces the previous catch of IndexOutOfBoundsException for missing positions.
    val selected = element.selectXpath(pathBeforeIndex).getOrNull(index)
    return if (selected != null) Elements(selected) else Elements()
}
private fun extractTextFromNode(root: Element, xpath: String): String? { private fun extractTextFromNode(root: Element, xpath: String): String? {
return root return root
.selectXpath(xpath, TextNode::class.java) .selectXpath(xpath, TextNode::class.java)
@@ -27,7 +45,7 @@ class XPathUtil private constructor() {
} }
fun getNextElement(element: Element, path: String): Element? { fun getNextElement(element: Element, path: String): Element? {
return element.selectXpath(path).firstOrNull() return selectXpath(element, path).firstOrNull()
} }
fun extractResult(root: Element, path: String): String? { fun extractResult(root: Element, path: String): String? {

View File

@@ -56,33 +56,37 @@ scraper:
value: "//li/abbr/text()" value: "//li/abbr/text()"
card-print: card-print:
multi: true multi: true
root:
type: css
value: ".tabber.wds-tabber > div"
discriminator: discriminator:
direction: asc
root: root:
type: css type: css
value: ".wds-tabs__tab" value: ".wds-tab__content"
steps: root:
- type: xpath type: css
value: "//li/div/a/text()" value: "table > tbody > tr:has(> td)"
id: id:
steps: steps:
- type: xpath - type: xpath
value: ".//table/tbody/tr[2]/td[1]/a/text()" value: "./td/a[0]"
- type: xpath
value: "./text()"
name: name:
steps: steps:
- type: xpath - type: xpath
value: ".//table/tbody/tr[2]/td[1]/a/text()" value: "./td/a[1]"
- type: xpath
value: "./text()"
regional-name: regional-name:
steps: steps:
- type: xpath - type: xpath
value: ".//table/tbody/tr[2]/td[2]/a/text()" value: "./td/a[2]"
- type: xpath
value: "./text()"
rarity: rarity:
steps: steps:
- type: xpath - type: xpath
value: ".//table/tbody/tr[2]/td[3]/a/text()" value: "./td/a[3]"
- type: xpath
value: "./text()"
card: card:
name: name:
root: root: