Implement XPath index access
This commit is contained in:
@@ -6,7 +6,4 @@ import io.smallrye.config.WithConverter
|
|||||||
import io.smallrye.config.WithName
|
import io.smallrye.config.WithName
|
||||||
|
|
||||||
interface DiscriminatorConfig : ScrapeTargetFieldConfig {
|
interface DiscriminatorConfig : ScrapeTargetFieldConfig {
|
||||||
@WithName("direction")
|
|
||||||
@WithConverter(DiscriminatorDirectionConverter::class)
|
|
||||||
fun getDiscriminatorDirection(): DiscriminatorDirection
|
|
||||||
}
|
}
|
||||||
@@ -84,7 +84,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected fun extractAsMap(
|
protected fun extractSingle(
|
||||||
document: Element,
|
document: Element,
|
||||||
extractionConfig: T
|
extractionConfig: T
|
||||||
): Map<String, String> {
|
): Map<String, String> {
|
||||||
@@ -108,7 +108,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
fun extractAsListOfMaps(
|
fun extractMulti(
|
||||||
element: Element,
|
element: Element,
|
||||||
extractionConfig: T
|
extractionConfig: T
|
||||||
): List<Map<String, String>> {
|
): List<Map<String, String>> {
|
||||||
@@ -143,7 +143,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
return resultList
|
return resultList
|
||||||
}
|
}
|
||||||
|
|
||||||
fun extractAsListOfMaps(
|
fun extractMulti(
|
||||||
elements: Elements,
|
elements: Elements,
|
||||||
extractionConfig: T
|
extractionConfig: T
|
||||||
): List<Map<String, String>> {
|
): List<Map<String, String>> {
|
||||||
@@ -174,64 +174,25 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
return resultList
|
return resultList
|
||||||
}
|
}
|
||||||
|
|
||||||
fun extractWithDiscriminator(
|
fun extractMultiWithDiscriminator(
|
||||||
element: Element,
|
element: Element,
|
||||||
extractionConfig: T
|
extractionConfig: T
|
||||||
): List<List<Map<String, String>>>{
|
): List<List<Map<String, String>>>{
|
||||||
val rootElement = getRootElement(
|
val rootElements = getRootElements(
|
||||||
element,
|
element,
|
||||||
extractionConfig.getRootConfig(),
|
extractionConfig.getDiscriminator().get().getRootConfig(),
|
||||||
Optional.empty<ExtractConfig>()
|
Optional.empty<ExtractConfig>()
|
||||||
)
|
)
|
||||||
|
|
||||||
var rootElements = getRootElements(
|
|
||||||
element,
|
|
||||||
extractionConfig.getRootConfig(),
|
|
||||||
Optional.empty<ExtractConfig>()
|
|
||||||
)
|
|
||||||
|
|
||||||
val discriminatedElements = getElementsFromElementByExtractConfig(
|
|
||||||
rootElement,
|
|
||||||
extractionConfig.getDiscriminator().get().getRootConfig().get(),
|
|
||||||
)
|
|
||||||
|
|
||||||
val discriminations = mutableListOf<String>()
|
|
||||||
val result = mutableListOf<List<Map<String, String>>>()
|
val result = mutableListOf<List<Map<String, String>>>()
|
||||||
|
|
||||||
for (element in discriminatedElements) {
|
for(element in rootElements) {
|
||||||
val discriminatorValue: String = extractTextFromElementByTargetFieldConfig(
|
result.add(extractMulti(
|
||||||
element,
|
element,
|
||||||
extractionConfig.getDiscriminator().get()
|
extractionConfig
|
||||||
) ?: throw ElementNotFoundException("")
|
))
|
||||||
|
|
||||||
discriminations.add(discriminatorValue)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
val definitiveElements = if (discriminations.size < rootElements.size) {
|
|
||||||
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
|
|
||||||
rootElements = Elements(rootElements.reversed())
|
|
||||||
}
|
|
||||||
|
|
||||||
while (discriminations.size < rootElements.size) {
|
|
||||||
rootElements.removeFirst()
|
|
||||||
}
|
|
||||||
|
|
||||||
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
|
|
||||||
rootElements = Elements(rootElements.reversed())
|
|
||||||
}
|
|
||||||
|
|
||||||
rootElements
|
|
||||||
} else {
|
|
||||||
rootElements
|
|
||||||
}
|
|
||||||
|
|
||||||
result.add(extractAsListOfMaps(
|
|
||||||
definitiveElements,
|
|
||||||
extractionConfig
|
|
||||||
))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3,13 +3,9 @@ package com.rak.service
|
|||||||
import com.rak.config.model.CardPrintScrapeTargetConfig
|
import com.rak.config.model.CardPrintScrapeTargetConfig
|
||||||
import com.rak.config.model.ProviderConfig
|
import com.rak.config.model.ProviderConfig
|
||||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||||
import com.rak.config.model.SetScrapeTargetConfig
|
|
||||||
import com.rak.model.card.CardPrint
|
import com.rak.model.card.CardPrint
|
||||||
import com.rak.model.exception.NotImplementedException
|
import com.rak.model.exception.NotImplementedException
|
||||||
import com.rak.model.set.CardSet
|
|
||||||
import com.rak.model.set.RegionalSet
|
|
||||||
import jakarta.enterprise.context.ApplicationScoped
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
import org.jsoup.nodes.Document
|
|
||||||
import org.jsoup.nodes.Element
|
import org.jsoup.nodes.Element
|
||||||
|
|
||||||
@ApplicationScoped
|
@ApplicationScoped
|
||||||
@@ -37,7 +33,7 @@ class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrin
|
|||||||
providerConfig: ProviderConfig,
|
providerConfig: ProviderConfig,
|
||||||
extractionConfig: CardPrintScrapeTargetConfig
|
extractionConfig: CardPrintScrapeTargetConfig
|
||||||
): Collection<CardPrint> {
|
): Collection<CardPrint> {
|
||||||
val objectAsListOfMaps = extractWithDiscriminator(element, extractionConfig)
|
val objectAsListOfMaps = extractMultiWithDiscriminator(element, extractionConfig)
|
||||||
|
|
||||||
return objectAsListOfMaps.map {
|
return objectAsListOfMaps.map {
|
||||||
CardPrint.fromMap(it[0])
|
CardPrint.fromMap(it[0])
|
||||||
|
|||||||
@@ -36,8 +36,8 @@ class RegionalSetExtractionService(
|
|||||||
providerConfig: ProviderConfig,
|
providerConfig: ProviderConfig,
|
||||||
extractionConfig: SetScrapeTargetConfig
|
extractionConfig: SetScrapeTargetConfig
|
||||||
): Collection<RegionalSet> {
|
): Collection<RegionalSet> {
|
||||||
val regionalSetList = extractAsListOfMaps(element, extractionConfig)
|
val regionalSetList = extractMulti(element, extractionConfig)
|
||||||
val cardPrintsInRegionalSet = extractAsListOfMaps(element, extractionConfig)
|
val cardPrintsInRegionalSet = extractMulti(element, extractionConfig)
|
||||||
|
|
||||||
val cardPrints = cardPrintExtractionService.extractMultiple(
|
val cardPrints = cardPrintExtractionService.extractMultiple(
|
||||||
element,
|
element,
|
||||||
|
|||||||
@@ -3,12 +3,15 @@ package com.rak.util
|
|||||||
import com.rak.model.XPathTarget
|
import com.rak.model.XPathTarget
|
||||||
import org.jsoup.nodes.Element
|
import org.jsoup.nodes.Element
|
||||||
import org.jsoup.nodes.TextNode
|
import org.jsoup.nodes.TextNode
|
||||||
|
import org.jsoup.select.Elements
|
||||||
|
import kotlin.coroutines.CoroutineContext
|
||||||
|
|
||||||
class XPathUtil private constructor() {
|
class XPathUtil private constructor() {
|
||||||
|
|
||||||
companion object {
|
companion object {
|
||||||
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
|
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
|
||||||
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
|
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
|
||||||
|
private val INDEX_MATCHER: Regex = Regex("\\[(\\w)\\]")
|
||||||
|
|
||||||
private fun extractTextFromAttribute(root: Element, xpath: String): String? {
|
private fun extractTextFromAttribute(root: Element, xpath: String): String? {
|
||||||
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
|
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
|
||||||
@@ -20,6 +23,21 @@ class XPathUtil private constructor() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Evaluates [xpath] against [element], with support for a zero-based index selector.
 *
 * When [xpath] contains a bracketed token recognised by `INDEX_MATCHER` whose content
 * is an integer (for example `./td/a[0]`), the part of the path in front of the first
 * such token is evaluated and only the match at that position is returned. The position
 * is a plain list index into the result set, so it is zero-based, unlike the one-based
 * positional predicates of standard XPath.
 *
 * Bracketed tokens that are not integers (for example the predicate in `//td[a]`) are
 * not treated as an index; the path is handed to the XPath engine unchanged instead of
 * failing on number parsing.
 *
 * NOTE(review): anything following the first index token is not evaluated. Multi-step
 * configurations appear to rely on this by continuing in a separate step; confirm
 * before passing paths with a trailing segment.
 *
 * @param element context element the path is evaluated against
 * @param xpath XPath expression, optionally containing one index token
 * @return the matching elements; empty when the requested index does not exist
 */
private fun selectXpath(element: Element, xpath: String): Elements {
    val indexMatch = INDEX_MATCHER.find(xpath)
    val index = indexMatch?.groupValues?.get(1)?.toIntOrNull()

    // No usable index token: behave exactly like a plain XPath selection.
    if (indexMatch == null || index == null) {
        return element.selectXpath(xpath)
    }

    // Cut the path at the matched token instead of re-splitting on a rebuilt literal.
    val pathBeforeIndex = xpath.substring(0, indexMatch.range.first)
    val indexedElement = element.selectXpath(pathBeforeIndex).getOrNull(index)

    return if (indexedElement != null) Elements(indexedElement) else Elements()
}
|
||||||
|
|
||||||
private fun extractTextFromNode(root: Element, xpath: String): String? {
|
private fun extractTextFromNode(root: Element, xpath: String): String? {
|
||||||
return root
|
return root
|
||||||
.selectXpath(xpath, TextNode::class.java)
|
.selectXpath(xpath, TextNode::class.java)
|
||||||
@@ -27,7 +45,7 @@ class XPathUtil private constructor() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fun getNextElement(element: Element, path: String): Element? {
|
fun getNextElement(element: Element, path: String): Element? {
|
||||||
return element.selectXpath(path).firstOrNull()
|
return selectXpath(element, path).firstOrNull()
|
||||||
}
|
}
|
||||||
|
|
||||||
fun extractResult(root: Element, path: String): String? {
|
fun extractResult(root: Element, path: String): String? {
|
||||||
|
|||||||
@@ -56,33 +56,37 @@ scraper:
|
|||||||
value: "//li/abbr/text()"
|
value: "//li/abbr/text()"
|
||||||
card-print:
|
card-print:
|
||||||
multi: true
|
multi: true
|
||||||
root:
|
|
||||||
type: css
|
|
||||||
value: ".tabber.wds-tabber > div"
|
|
||||||
discriminator:
|
discriminator:
|
||||||
direction: asc
|
|
||||||
root:
|
root:
|
||||||
type: css
|
type: css
|
||||||
value: ".wds-tabs__tab"
|
value: ".wds-tab__content"
|
||||||
steps:
|
root:
|
||||||
- type: xpath
|
type: css
|
||||||
value: "//li/div/a/text()"
|
value: "table > tbody > tr:has(> td)"
|
||||||
id:
|
id:
|
||||||
steps:
|
steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: ".//table/tbody/tr[2]/td[1]/a/text()"
|
value: "./td/a[0]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
name:
|
name:
|
||||||
steps:
|
steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: ".//table/tbody/tr[2]/td[1]/a/text()"
|
value: "./td/a[1]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
regional-name:
|
regional-name:
|
||||||
steps:
|
steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: ".//table/tbody/tr[2]/td[2]/a/text()"
|
value: "./td/a[2]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
rarity:
|
rarity:
|
||||||
steps:
|
steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: ".//table/tbody/tr[2]/td[3]/a/text()"
|
value: "./td/a[3]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
card:
|
card:
|
||||||
name:
|
name:
|
||||||
root:
|
root:
|
||||||
|
|||||||
Reference in New Issue
Block a user