From e97f9bdd61f37245e52d67aca786c7508b64098c Mon Sep 17 00:00:00 2001 From: rak Date: Wed, 25 Jun 2025 23:11:05 +0200 Subject: [PATCH] Implement XPath index access --- .../rak/config/model/DiscriminatorConfig.kt | 3 - .../rak/service/AbstractExtractionService.kt | 59 ++++--------------- .../rak/service/CardPrintExtractionService.kt | 6 +- .../service/RegionalSetExtractionService.kt | 4 +- src/main/kotlin/com/rak/util/XPathUtil.kt | 20 ++++++- src/main/resources/application.yml | 28 +++++---- 6 files changed, 48 insertions(+), 72 deletions(-) diff --git a/src/main/kotlin/com/rak/config/model/DiscriminatorConfig.kt b/src/main/kotlin/com/rak/config/model/DiscriminatorConfig.kt index dc384b0..919f810 100644 --- a/src/main/kotlin/com/rak/config/model/DiscriminatorConfig.kt +++ b/src/main/kotlin/com/rak/config/model/DiscriminatorConfig.kt @@ -6,7 +6,4 @@ import io.smallrye.config.WithConverter import io.smallrye.config.WithName interface DiscriminatorConfig : ScrapeTargetFieldConfig { - @WithName("direction") - @WithConverter(DiscriminatorDirectionConverter::class) - fun getDiscriminatorDirection(): DiscriminatorDirection } \ No newline at end of file diff --git a/src/main/kotlin/com/rak/service/AbstractExtractionService.kt b/src/main/kotlin/com/rak/service/AbstractExtractionService.kt index 1a7b1b2..1a1c276 100644 --- a/src/main/kotlin/com/rak/service/AbstractExtractionService.kt +++ b/src/main/kotlin/com/rak/service/AbstractExtractionService.kt @@ -84,7 +84,7 @@ abstract class AbstractExtractionService { } } - protected fun extractAsMap( + protected fun extractSingle( document: Element, extractionConfig: T ): Map { @@ -108,7 +108,7 @@ abstract class AbstractExtractionService { return result } - fun extractAsListOfMaps( + fun extractMulti( element: Element, extractionConfig: T ): List> { @@ -143,7 +143,7 @@ abstract class AbstractExtractionService { return resultList } - fun extractAsListOfMaps( + fun extractMulti( elements: Elements, extractionConfig: T ): List> { @@ -174,64 +174,25 @@ abstract class AbstractExtractionService { return resultList } - fun extractWithDiscriminator( + fun extractMultiWithDiscriminator( element: Element, extractionConfig: T ): List>>{ - val rootElement = getRootElement( + val rootElements = getRootElements( element, - extractionConfig.getRootConfig(), + extractionConfig.getDiscriminator().get().getRootConfig(), Optional.empty() ) - var rootElements = getRootElements( - element, - extractionConfig.getRootConfig(), - Optional.empty() - ) - - val discriminatedElements = getElementsFromElementByExtractConfig( - rootElement, - extractionConfig.getDiscriminator().get().getRootConfig().get(), - ) - - val discriminations = mutableListOf() val result = mutableListOf>>() - for (element in discriminatedElements) { - val discriminatorValue: String = extractTextFromElementByTargetFieldConfig( + for(element in rootElements) { + result.add(extractMulti( element, - extractionConfig.getDiscriminator().get() - ) ?: throw ElementNotFoundException("") - - discriminations.add(discriminatorValue) + extractionConfig + )) } - val definitiveElements = if (discriminations.size < rootElements.size) { - if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) { - rootElements = Elements(rootElements.reversed()) - } - - while (discriminations.size < rootElements.size) { - rootElements.removeFirst() - } - - if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) { - rootElements = Elements(rootElements.reversed()) - } - - rootElements - } else { - rootElements - } - - result.add(extractAsListOfMaps( - definitiveElements, - extractionConfig - )) - - - return result } diff --git a/src/main/kotlin/com/rak/service/CardPrintExtractionService.kt b/src/main/kotlin/com/rak/service/CardPrintExtractionService.kt index a3e13f6..778799e 100644 --- a/src/main/kotlin/com/rak/service/CardPrintExtractionService.kt +++ b/src/main/kotlin/com/rak/service/CardPrintExtractionService.kt @@ -3,13 +3,9 @@ package com.rak.service import com.rak.config.model.CardPrintScrapeTargetConfig import com.rak.config.model.ProviderConfig import com.rak.config.model.ScrapeTargetFieldConfig -import com.rak.config.model.SetScrapeTargetConfig import com.rak.model.card.CardPrint import com.rak.model.exception.NotImplementedException -import com.rak.model.set.CardSet -import com.rak.model.set.RegionalSet import jakarta.enterprise.context.ApplicationScoped -import org.jsoup.nodes.Document import org.jsoup.nodes.Element @ApplicationScoped @@ -37,7 +33,7 @@ class CardPrintExtractionService : AbstractExtractionService { - val objectAsListOfMaps = extractWithDiscriminator(element, extractionConfig) + val objectAsListOfMaps = extractMultiWithDiscriminator(element, extractionConfig) return objectAsListOfMaps.map { CardPrint.fromMap(it[0]) diff --git a/src/main/kotlin/com/rak/service/RegionalSetExtractionService.kt b/src/main/kotlin/com/rak/service/RegionalSetExtractionService.kt index b475400..30c4b67 100644 --- a/src/main/kotlin/com/rak/service/RegionalSetExtractionService.kt +++ b/src/main/kotlin/com/rak/service/RegionalSetExtractionService.kt @@ -36,8 +36,8 @@ class RegionalSetExtractionService( providerConfig: ProviderConfig, extractionConfig: SetScrapeTargetConfig ): Collection { - val regionalSetList = extractAsListOfMaps(element, extractionConfig) - val cardPrintsInRegionalSet = extractAsListOfMaps(element, extractionConfig) + val regionalSetList = extractMulti(element, extractionConfig) + val cardPrintsInRegionalSet = extractMulti(element, extractionConfig) val cardPrints = cardPrintExtractionService.extractMultiple( element, diff --git a/src/main/kotlin/com/rak/util/XPathUtil.kt b/src/main/kotlin/com/rak/util/XPathUtil.kt index ae4c1a8..d848351 100644 --- a/src/main/kotlin/com/rak/util/XPathUtil.kt +++ b/src/main/kotlin/com/rak/util/XPathUtil.kt @@ -3,12 +3,15 @@ package com.rak.util import com.rak.model.XPathTarget import org.jsoup.nodes.Element import org.jsoup.nodes.TextNode +import org.jsoup.select.Elements +import kotlin.coroutines.CoroutineContext class XPathUtil private constructor() { companion object { private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$") private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$") + private val INDEX_MATCHER: Regex = Regex("\\[(\\w)\\]") private fun extractTextFromAttribute(root: Element, xpath: String): String? { val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath) @@ -20,6 +23,21 @@ class XPathUtil private constructor() { } } + private fun selectXpath(element: Element, xpath: String): Elements { + return if (xpath.contains(INDEX_MATCHER)) { + val index = INDEX_MATCHER.find(xpath)?.groupValues[1]!!.toInt() + val xpathHalves = xpath.split("[$index]") + + try { + Elements(element.selectXpath(xpathHalves[0])[index]) + } catch (_: IndexOutOfBoundsException) { + Elements() + } + } else { + element.selectXpath(xpath) + } + } + private fun extractTextFromNode(root: Element, xpath: String): String? { return root .selectXpath(xpath, TextNode::class.java) @@ -27,7 +45,7 @@ class XPathUtil private constructor() { } fun getNextElement(element: Element, path: String): Element? { - return element.selectXpath(path).firstOrNull() + return selectXpath(element, path).firstOrNull() } fun extractResult(root: Element, path: String): String? { diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index e1f4b39..7170a34 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -56,33 +56,37 @@ scraper: value: "//li/abbr/text()" card-print: multi: true - root: - type: css - value: ".tabber.wds-tabber > div" discriminator: - direction: asc root: type: css - value: ".wds-tabs__tab" - steps: - - type: xpath - value: "//li/div/a/text()" + value: ".wds-tab__content" + root: + type: css + value: "table > tbody > tr:has(> td)" id: steps: - type: xpath - value: ".//table/tbody/tr[2]/td[1]/a/text()" + value: "./td/a[0]" + - type: xpath + value: "./text()" name: steps: - type: xpath - value: ".//table/tbody/tr[2]/td[1]/a/text()" + value: "./td/a[1]" + - type: xpath + value: "./text()" regional-name: steps: - type: xpath - value: ".//table/tbody/tr[2]/td[2]/a/text()" + value: "./td/a[2]" + - type: xpath + value: "./text()" rarity: steps: - type: xpath - value: ".//table/tbody/tr[2]/td[3]/a/text()" + value: "./td/a[3]" + - type: xpath + value: "./text()" card: name: root: