Basic multi-method extraction

This commit is contained in:
2025-06-29 13:21:18 +02:00
parent 108b4c4c19
commit ee4ce4fd65
5 changed files with 180 additions and 141 deletions

View File

@@ -0,0 +1,11 @@
package com.rak.config.model
import io.smallrye.config.WithName
import java.util.Optional
/**
 * Configuration of a single extraction method: a list of extraction steps
 * together with an optional list of transformation steps.
 */
interface ExtractorConfig {
/** Extraction steps of this method, mapped to the `steps` key via [WithName]. */
@WithName("steps")
fun getExtractionSteps(): List<ExtractConfig>
/**
 * Transformation steps of this method, mapped to the `transform` key via [WithName].
 * Wrapped in [Optional] because the key may be omitted; check `isPresent` before use.
 */
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
}

View File

@@ -4,12 +4,12 @@ import io.smallrye.config.WithName
import java.util.* import java.util.*
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig { interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
@WithName("type")
fun getType(): String
@WithName("root") @WithName("root")
fun getRootConfig(): Optional<ExtractConfig> fun getRootConfig(): Optional<ExtractConfig>
@WithName("steps") @WithName("extractors")
fun getExtractionSteps(): List<ExtractConfig> fun getExtractionMethods(): List<ExtractorConfig>
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
@WithName("fallback") @WithName("fallback")
fun getFallbackConfiguration(): Optional<FieldConfigFallback> fun getFallbackConfiguration(): Optional<FieldConfigFallback>
} }

View File

@@ -10,6 +10,7 @@ import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.transform.TransformationRegistry import com.rak.model.transform.TransformationRegistry
import com.rak.util.CssUtil import com.rak.util.CssUtil
import com.rak.util.XPathUtil import com.rak.util.XPathUtil
import io.quarkus.logging.Log
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.select.Elements import org.jsoup.select.Elements
import java.util.Optional import java.util.Optional
@@ -174,54 +175,67 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
root: Element, root: Element,
extractionConfig: ScrapeTargetFieldConfig extractionConfig: ScrapeTargetFieldConfig
): String? { ): String? {
val extractionSteps = extractionConfig.getExtractionSteps() val extractionMethods = extractionConfig.getExtractionMethods()
val transformationSteps = extractionConfig.getOptionalTransformationSteps()
var currentElement: Element? = root.clone()
var result: String? = null var result: String? = null
try {
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
if (index == extractionSteps.size - 1) { for(extractionMethod in extractionMethods) {
result = when (currentStep.selectorType()) { val extractionSteps = extractionMethod.getExtractionSteps()
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString()) val transformationSteps = extractionMethod.getOptionalTransformationSteps()
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
var currentElement: Element? = root.clone()
var intermediateResult: String? = null
try {
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
if (index == extractionSteps.size - 1) {
intermediateResult = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
}
}
else {
currentElement = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
}
} }
} }
else {
currentElement = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
}
}
}
if (result == null) { if (intermediateResult == null) {
throw ElementNotFoundException("Result could not be extracted") throw ElementNotFoundException("Result could not be extracted")
} } else {
if (transformationSteps.isPresent) {
if (transformationSteps.isPresent) { intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
}
} catch (ex: RuntimeException) {
when (ex) {
is ElementNotFoundException,
is IllegalStateException -> {
if (extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
} else {
throw ex
} }
result = intermediateResult
break
}
} catch (ex: RuntimeException) {
when (ex) {
is ElementNotFoundException,
is IllegalStateException -> {
// if (extractionConfig.getFallbackConfiguration().isPresent) {
// intermediateResult = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
// } else {
// throw ex
// }
Log.warn("An extraction method failed")
}
else -> throw ex
} }
else -> throw ex
} }
} }
if (result == null && extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
}
return result return result
} }

View File

@@ -1,6 +1,5 @@
package com.rak.util package com.rak.util
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.rak.model.cc.CCIndexSuccessResponse import com.rak.model.cc.CCIndexSuccessResponse

View File

@@ -4,25 +4,25 @@ quarkus:
scraper: scraper:
sources: sources:
- id: konami-official # - id: konami-official
name: "Konami Official Database" # name: "Konami Official Database"
domain: "yugioh-card.com" # domain: "yugioh-card.com"
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$" # url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
targets: # targets:
card: # card:
root: # root:
type: css # type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" # value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
name: # name:
steps: # steps:
- type: "css" # - type: "css"
value: "h1.product-title" # value: "h1.product-title"
- type: "xpath" # - type: "xpath"
value: "//h1[@itemprop='name']" # value: "//h1[@itemprop='name']"
attack: # attack:
steps: # steps:
- type: "css" # - type: "css"
value: ".atk-value" # value: ".atk-value"
- id: ygo-fandom - id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki" name: "Yu-Gi-Oh Fandom Wiki"
@@ -34,98 +34,113 @@ scraper:
type: css type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id: id:
steps: type: int
- type: xpath extractors:
value: "//li/text()" - steps:
transform: - type: xpath
- name: "replace" value: "//li/text()"
parameters: [ transform:
" (", - name: "replace"
"" parameters: [
] " (",
""
]
language: language:
steps: type: int
- type: xpath extractors:
value: "//li/abbr" - steps:
- type: xpath - type: xpath
value: "//abbr/@title" value: "//li/abbr"
- type: xpath
value: "//abbr/@title"
region-key: region-key:
steps: type: int
- type: xpath extractors:
value: "//li/abbr/text()" - steps:
- type: xpath
value: "//li/abbr/text()"
card-print: card-print:
multi: true multi: true
discriminator:
root:
type: css
value: ".wds-tab__content"
root: root:
type: css type: css
value: "table > tbody > tr:has(> td)" value: "table > tbody > tr:has(> td)"
discriminator:
type: string
root:
type: css
value: ".wds-tab__content"
id: id:
steps: type: int
- type: xpath extractors:
value: "./td/a[0]" - steps:
- type: xpath - type: xpath
value: "./text()" value: "./td/a[0]"
- type: xpath
value: "./text()"
name: name:
steps: type: int
- type: xpath extractors:
value: "./td/a[1]" - steps:
- type: xpath - type: xpath
value: "./text()" value: "./td/a[1]"
- type: xpath
value: "./text()"
regional-name: regional-name:
fallback: fallback:
default: "N/A" default: "N/A"
steps: type: int
- type: xpath extractors:
value: "./td[2]" - steps:
- type: xpath - type: xpath
value: "./text()" value: "./td[2]"
transform: - type: xpath
- name: "removeInnerQuotes" value: "./text()"
parameters: [] transform:
- name: "removeInnerQuotes"
parameters: []
rarity: rarity:
fallback: fallback:
default: "N/A" default: "N/A"
steps: type: int
- type: xpath extractors:
value: "./td/a[3]" - steps:
- type: xpath - type: xpath
value: "./text()" value: "./td/a[3]"
card: - type: xpath
name: value: "./text()"
root: # card:
type: css # name:
value: ".cardTable" # root:
steps: # type: css
- type: "xpath" # value: ".cardTable"
value: "./tbody/tr[3]/th/text()" # steps:
description: # - type: "xpath"
root: # value: "./tbody/tr[3]/th/text()"
type: css # description:
value: ".cardTable" # root:
steps: # type: css
- type: "xpath" # value: ".cardTable"
value: "b:contains(Card descriptions)" # steps:
type: # - type: "xpath"
root: # value: "b:contains(Card descriptions)"
type: css # type:
value: ".cardTable" # root:
steps: # type: css
- type: "xpath" # value: ".cardTable"
value: "b:contains(Card descriptions)" # steps:
attack: # - type: "xpath"
root: # value: "b:contains(Card descriptions)"
type: css # attack:
value: ".cardTable" # root:
steps: # type: css
- type: "xpath" # value: ".cardTable"
value: "b:contains(Card descriptions)" # steps:
defense: # - type: "xpath"
root: # value: "b:contains(Card descriptions)"
type: css # defense:
value: ".cardTable" # root:
steps: # type: css
- type: "xpath" # value: ".cardTable"
value: "b:contains(Card descriptions)" # steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"