Basic multi-method extraction

This commit is contained in:
2025-06-29 13:21:18 +02:00
parent 108b4c4c19
commit ee4ce4fd65
5 changed files with 180 additions and 141 deletions

View File

@@ -0,0 +1,11 @@
package com.rak.config.model
import io.smallrye.config.WithName
import java.util.Optional
/**
 * Configuration mapping for a single extractor: a list of extraction steps
 * plus an optional list of transformation steps.
 *
 * The configuration key each accessor is bound to is given by its
 * [WithName] annotation.
 */
interface ExtractorConfig {

    /**
     * Extraction steps of this extractor, bound to the `steps` key.
     *
     * @return the configured [ExtractConfig] entries
     */
    @WithName("steps")
    fun getExtractionSteps(): List<ExtractConfig>

    /**
     * Transformation steps of this extractor, bound to the `transform` key.
     *
     * @return an [Optional] wrapping the configured [TransformationStepConfig]
     * entries; presumably empty when the key is not configured (verify against
     * the config framework's handling of [Optional] members)
     */
    @WithName("transform")
    fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
}

View File

@@ -4,12 +4,12 @@ import io.smallrye.config.WithName
import java.util.* import java.util.*
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig { interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
@WithName("type")
fun getType(): String
@WithName("root") @WithName("root")
fun getRootConfig(): Optional<ExtractConfig> fun getRootConfig(): Optional<ExtractConfig>
@WithName("steps") @WithName("extractors")
fun getExtractionSteps(): List<ExtractConfig> fun getExtractionMethods(): List<ExtractorConfig>
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
@WithName("fallback") @WithName("fallback")
fun getFallbackConfiguration(): Optional<FieldConfigFallback> fun getFallbackConfiguration(): Optional<FieldConfigFallback>
} }

View File

@@ -10,6 +10,7 @@ import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.transform.TransformationRegistry import com.rak.model.transform.TransformationRegistry
import com.rak.util.CssUtil import com.rak.util.CssUtil
import com.rak.util.XPathUtil import com.rak.util.XPathUtil
import io.quarkus.logging.Log
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.select.Elements import org.jsoup.select.Elements
import java.util.Optional import java.util.Optional
@@ -174,11 +175,16 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
root: Element, root: Element,
extractionConfig: ScrapeTargetFieldConfig extractionConfig: ScrapeTargetFieldConfig
): String? { ): String? {
val extractionSteps = extractionConfig.getExtractionSteps() val extractionMethods = extractionConfig.getExtractionMethods()
val transformationSteps = extractionConfig.getOptionalTransformationSteps() var result: String? = null
for(extractionMethod in extractionMethods) {
val extractionSteps = extractionMethod.getExtractionSteps()
val transformationSteps = extractionMethod.getOptionalTransformationSteps()
var currentElement: Element? = root.clone() var currentElement: Element? = root.clone()
var result: String? = null var intermediateResult: String? = null
try { try {
for (index in 0 until extractionSteps.size) { for (index in 0 until extractionSteps.size) {
@@ -188,7 +194,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
} }
if (index == extractionSteps.size - 1) { if (index == extractionSteps.size - 1) {
result = when (currentStep.selectorType()) { intermediateResult = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString()) Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString()) Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
} }
@@ -201,27 +207,35 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
} }
} }
if (result == null) { if (intermediateResult == null) {
throw ElementNotFoundException("Result could not be extracted") throw ElementNotFoundException("Result could not be extracted")
} else {
if (transformationSteps.isPresent) {
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
} }
if (transformationSteps.isPresent) { result = intermediateResult
result = transformationRegistry.applyTransformations(result, transformationSteps.get()) break
} }
} catch (ex: RuntimeException) { } catch (ex: RuntimeException) {
when (ex) { when (ex) {
is ElementNotFoundException, is ElementNotFoundException,
is IllegalStateException -> { is IllegalStateException -> {
if (extractionConfig.getFallbackConfiguration().isPresent) { // if (extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue() // intermediateResult = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
} else { // } else {
throw ex // throw ex
} // }
Log.warn("An extraction method failed")
} }
else -> throw ex else -> throw ex
} }
} }
}
if (result == null && extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
}
return result return result
} }

View File

@@ -1,6 +1,5 @@
package com.rak.util package com.rak.util
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.rak.model.cc.CCIndexSuccessResponse import com.rak.model.cc.CCIndexSuccessResponse

View File

@@ -4,25 +4,25 @@ quarkus:
scraper: scraper:
sources: sources:
- id: konami-official # - id: konami-official
name: "Konami Official Database" # name: "Konami Official Database"
domain: "yugioh-card.com" # domain: "yugioh-card.com"
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$" # url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
targets: # targets:
card: # card:
root: # root:
type: css # type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" # value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
name: # name:
steps: # steps:
- type: "css" # - type: "css"
value: "h1.product-title" # value: "h1.product-title"
- type: "xpath" # - type: "xpath"
value: "//h1[@itemprop='name']" # value: "//h1[@itemprop='name']"
attack: # attack:
steps: # steps:
- type: "css" # - type: "css"
value: ".atk-value" # value: ".atk-value"
- id: ygo-fandom - id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki" name: "Yu-Gi-Oh Fandom Wiki"
@@ -34,7 +34,9 @@ scraper:
type: css type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id: id:
steps: type: int
extractors:
- steps:
- type: xpath - type: xpath
value: "//li/text()" value: "//li/text()"
transform: transform:
@@ -44,32 +46,41 @@ scraper:
"" ""
] ]
language: language:
steps: type: int
extractors:
- steps:
- type: xpath - type: xpath
value: "//li/abbr" value: "//li/abbr"
- type: xpath - type: xpath
value: "//abbr/@title" value: "//abbr/@title"
region-key: region-key:
steps: type: int
extractors:
- steps:
- type: xpath - type: xpath
value: "//li/abbr/text()" value: "//li/abbr/text()"
card-print: card-print:
multi: true multi: true
discriminator:
root:
type: css
value: ".wds-tab__content"
root: root:
type: css type: css
value: "table > tbody > tr:has(> td)" value: "table > tbody > tr:has(> td)"
discriminator:
type: string
root:
type: css
value: ".wds-tab__content"
id: id:
steps: type: int
extractors:
- steps:
- type: xpath - type: xpath
value: "./td/a[0]" value: "./td/a[0]"
- type: xpath - type: xpath
value: "./text()" value: "./text()"
name: name:
steps: type: int
extractors:
- steps:
- type: xpath - type: xpath
value: "./td/a[1]" value: "./td/a[1]"
- type: xpath - type: xpath
@@ -77,7 +88,9 @@ scraper:
regional-name: regional-name:
fallback: fallback:
default: "N/A" default: "N/A"
steps: type: int
extractors:
- steps:
- type: xpath - type: xpath
value: "./td[2]" value: "./td[2]"
- type: xpath - type: xpath
@@ -88,44 +101,46 @@ scraper:
rarity: rarity:
fallback: fallback:
default: "N/A" default: "N/A"
steps: type: int
extractors:
- steps:
- type: xpath - type: xpath
value: "./td/a[3]" value: "./td/a[3]"
- type: xpath - type: xpath
value: "./text()" value: "./text()"
card: # card:
name: # name:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "./tbody/tr[3]/th/text()" # value: "./tbody/tr[3]/th/text()"
description: # description:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "b:contains(Card descriptions)" # value: "b:contains(Card descriptions)"
type: # type:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "b:contains(Card descriptions)" # value: "b:contains(Card descriptions)"
attack: # attack:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "b:contains(Card descriptions)" # value: "b:contains(Card descriptions)"
defense: # defense:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "b:contains(Card descriptions)" # value: "b:contains(Card descriptions)"