Implement XPath index access
This commit is contained in:
@@ -6,7 +6,4 @@ import io.smallrye.config.WithConverter
|
||||
import io.smallrye.config.WithName
|
||||
|
||||
interface DiscriminatorConfig : ScrapeTargetFieldConfig {
|
||||
@WithName("direction")
|
||||
@WithConverter(DiscriminatorDirectionConverter::class)
|
||||
fun getDiscriminatorDirection(): DiscriminatorDirection
|
||||
}
|
||||
@@ -84,7 +84,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
}
|
||||
}
|
||||
|
||||
protected fun extractAsMap(
|
||||
protected fun extractSingle(
|
||||
document: Element,
|
||||
extractionConfig: T
|
||||
): Map<String, String> {
|
||||
@@ -108,7 +108,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
return result
|
||||
}
|
||||
|
||||
fun extractAsListOfMaps(
|
||||
fun extractMulti(
|
||||
element: Element,
|
||||
extractionConfig: T
|
||||
): List<Map<String, String>> {
|
||||
@@ -143,7 +143,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
return resultList
|
||||
}
|
||||
|
||||
fun extractAsListOfMaps(
|
||||
fun extractMulti(
|
||||
elements: Elements,
|
||||
extractionConfig: T
|
||||
): List<Map<String, String>> {
|
||||
@@ -174,63 +174,24 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
return resultList
|
||||
}
|
||||
|
||||
fun extractWithDiscriminator(
|
||||
fun extractMultiWithDiscriminator(
|
||||
element: Element,
|
||||
extractionConfig: T
|
||||
): List<List<Map<String, String>>>{
|
||||
val rootElement = getRootElement(
|
||||
val rootElements = getRootElements(
|
||||
element,
|
||||
extractionConfig.getRootConfig(),
|
||||
extractionConfig.getDiscriminator().get().getRootConfig(),
|
||||
Optional.empty<ExtractConfig>()
|
||||
)
|
||||
|
||||
var rootElements = getRootElements(
|
||||
element,
|
||||
extractionConfig.getRootConfig(),
|
||||
Optional.empty<ExtractConfig>()
|
||||
)
|
||||
|
||||
val discriminatedElements = getElementsFromElementByExtractConfig(
|
||||
rootElement,
|
||||
extractionConfig.getDiscriminator().get().getRootConfig().get(),
|
||||
)
|
||||
|
||||
val discriminations = mutableListOf<String>()
|
||||
val result = mutableListOf<List<Map<String, String>>>()
|
||||
|
||||
for (element in discriminatedElements) {
|
||||
val discriminatorValue: String = extractTextFromElementByTargetFieldConfig(
|
||||
for(element in rootElements) {
|
||||
result.add(extractMulti(
|
||||
element,
|
||||
extractionConfig.getDiscriminator().get()
|
||||
) ?: throw ElementNotFoundException("")
|
||||
|
||||
discriminations.add(discriminatorValue)
|
||||
}
|
||||
|
||||
val definitiveElements = if (discriminations.size < rootElements.size) {
|
||||
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
|
||||
rootElements = Elements(rootElements.reversed())
|
||||
}
|
||||
|
||||
while (discriminations.size < rootElements.size) {
|
||||
rootElements.removeFirst()
|
||||
}
|
||||
|
||||
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
|
||||
rootElements = Elements(rootElements.reversed())
|
||||
}
|
||||
|
||||
rootElements
|
||||
} else {
|
||||
rootElements
|
||||
}
|
||||
|
||||
result.add(extractAsListOfMaps(
|
||||
definitiveElements,
|
||||
extractionConfig
|
||||
))
|
||||
|
||||
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -3,13 +3,9 @@ package com.rak.service
|
||||
import com.rak.config.model.CardPrintScrapeTargetConfig
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
import com.rak.config.model.SetScrapeTargetConfig
|
||||
import com.rak.model.card.CardPrint
|
||||
import com.rak.model.exception.NotImplementedException
|
||||
import com.rak.model.set.CardSet
|
||||
import com.rak.model.set.RegionalSet
|
||||
import jakarta.enterprise.context.ApplicationScoped
|
||||
import org.jsoup.nodes.Document
|
||||
import org.jsoup.nodes.Element
|
||||
|
||||
@ApplicationScoped
|
||||
@@ -37,7 +33,7 @@ class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrin
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: CardPrintScrapeTargetConfig
|
||||
): Collection<CardPrint> {
|
||||
val objectAsListOfMaps = extractWithDiscriminator(element, extractionConfig)
|
||||
val objectAsListOfMaps = extractMultiWithDiscriminator(element, extractionConfig)
|
||||
|
||||
return objectAsListOfMaps.map {
|
||||
CardPrint.fromMap(it[0])
|
||||
|
||||
@@ -36,8 +36,8 @@ class RegionalSetExtractionService(
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
): Collection<RegionalSet> {
|
||||
val regionalSetList = extractAsListOfMaps(element, extractionConfig)
|
||||
val cardPrintsInRegionalSet = extractAsListOfMaps(element, extractionConfig)
|
||||
val regionalSetList = extractMulti(element, extractionConfig)
|
||||
val cardPrintsInRegionalSet = extractMulti(element, extractionConfig)
|
||||
|
||||
val cardPrints = cardPrintExtractionService.extractMultiple(
|
||||
element,
|
||||
|
||||
@@ -3,12 +3,15 @@ package com.rak.util
|
||||
import com.rak.model.XPathTarget
|
||||
import org.jsoup.nodes.Element
|
||||
import org.jsoup.nodes.TextNode
|
||||
import org.jsoup.select.Elements
|
||||
import kotlin.coroutines.CoroutineContext
|
||||
|
||||
class XPathUtil private constructor() {
|
||||
|
||||
companion object {
|
||||
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
|
||||
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
|
||||
// Matches a bracketed numeric index such as "[0]" or "[12]" and captures the digits.
// Restricted to digits so that ordinary XPath predicates like "[a]" or "[_]" are not
// mistaken for an index (they cannot be converted to an Int), and so that indexes
// above 9 are recognised as well.
private val INDEX_MATCHER: Regex = Regex("\\[(\\d+)\\]")
|
||||
|
||||
private fun extractTextFromAttribute(root: Element, xpath: String): String? {
|
||||
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
|
||||
@@ -20,6 +23,21 @@ class XPathUtil private constructor() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Evaluates [xpath] against [element], with support for a bracketed positional index.
 *
 * When [xpath] contains a bracketed index recognised by `INDEX_MATCHER`, the part of the
 * path before that index is evaluated and only the match at that position is returned.
 * The index is used directly as a zero-based position in the result list. Anything that
 * follows the index in [xpath] is not evaluated.
 *
 * If the bracketed token is not a valid integer, the path is evaluated unchanged instead
 * of failing with a NumberFormatException.
 *
 * @param element the element the path is evaluated against
 * @param xpath the XPath expression, optionally containing one bracketed index
 * @return the selected elements; empty when the index is out of range
 */
private fun selectXpath(element: Element, xpath: String): Elements {
    // Single regex scan: the match gives both the index text and where the prefix ends.
    val indexMatch = INDEX_MATCHER.find(xpath)
    val index = indexMatch?.groupValues?.get(1)?.toIntOrNull()

    // No index, or a bracketed token that is not a number: evaluate the path as-is.
    if (indexMatch == null || index == null) {
        return element.selectXpath(xpath)
    }

    // Everything before the bracketed index is the path that is actually evaluated.
    val basePath = xpath.substring(0, indexMatch.range.first)
    val selected = element.selectXpath(basePath).getOrNull(index)

    // Out-of-range index yields an empty result rather than an exception.
    return if (selected != null) Elements(selected) else Elements()
}
|
||||
|
||||
private fun extractTextFromNode(root: Element, xpath: String): String? {
|
||||
return root
|
||||
.selectXpath(xpath, TextNode::class.java)
|
||||
@@ -27,7 +45,7 @@ class XPathUtil private constructor() {
|
||||
}
|
||||
|
||||
fun getNextElement(element: Element, path: String): Element? {
|
||||
return element.selectXpath(path).firstOrNull()
|
||||
return selectXpath(element, path).firstOrNull()
|
||||
}
|
||||
|
||||
fun extractResult(root: Element, path: String): String? {
|
||||
|
||||
@@ -56,33 +56,37 @@ scraper:
|
||||
value: "//li/abbr/text()"
|
||||
card-print:
|
||||
multi: true
|
||||
root:
|
||||
type: css
|
||||
value: ".tabber.wds-tabber > div"
|
||||
discriminator:
|
||||
direction: asc
|
||||
root:
|
||||
type: css
|
||||
value: ".wds-tabs__tab"
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "//li/div/a/text()"
|
||||
value: ".wds-tab__content"
|
||||
root:
|
||||
type: css
|
||||
value: "table > tbody > tr:has(> td)"
|
||||
id:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: ".//table/tbody/tr[2]/td[1]/a/text()"
|
||||
value: "./td/a[0]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
name:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: ".//table/tbody/tr[2]/td[1]/a/text()"
|
||||
value: "./td/a[1]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
regional-name:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: ".//table/tbody/tr[2]/td[2]/a/text()"
|
||||
value: "./td/a[2]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
rarity:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: ".//table/tbody/tr[2]/td[3]/a/text()"
|
||||
value: "./td/a[3]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
card:
|
||||
name:
|
||||
root:
|
||||
|
||||
Reference in New Issue
Block a user