Basic multi-method extraction

This commit is contained in:
2025-06-29 13:21:18 +02:00
parent 108b4c4c19
commit ee4ce4fd65
5 changed files with 180 additions and 141 deletions

View File

@@ -0,0 +1,11 @@
package com.rak.config.model
import io.smallrye.config.WithName
import java.util.Optional
/**
 * Configuration of one extraction method for a field.
 *
 * A field lists its extraction methods under the `extractors` key; each entry
 * consists of a list of extraction steps and an optional list of
 * transformation steps.
 */
interface ExtractorConfig {

    /**
     * Extraction steps of this method, read from the `steps` key.
     *
     * @return the configured [ExtractConfig] entries, in declaration order
     */
    @WithName("steps")
    fun getExtractionSteps(): List<ExtractConfig>

    /**
     * Transformation steps of this method, read from the `transform` key.
     *
     * @return the configured [TransformationStepConfig] entries, or an empty
     *         [Optional] when the key is not set
     */
    @WithName("transform")
    fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
}

View File

@@ -4,12 +4,12 @@ import io.smallrye.config.WithName
import java.util.*
/**
 * Configuration of a single field of a scrape target; extends
 * [AbstractScrapeTargetFieldConfig].
 *
 * Every accessor is annotated with `@WithName`, which names the configuration
 * key the value is read from.
 *
 * NOTE(review): this listing comes from a diff hunk in which removed and added
 * lines are not marked. The `steps`/`transform` accessors and the `extractors`
 * accessor may not all exist in the same revision - confirm against the
 * checked-out file.
 */
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
// Value of the `type` key, exposed as a plain string.
@WithName("type")
fun getType(): String
// Optional `root` entry, described by an ExtractConfig.
@WithName("root")
fun getRootConfig(): Optional<ExtractConfig>
// List of ExtractConfig entries read from the `steps` key.
@WithName("steps")
fun getExtractionSteps(): List<ExtractConfig>
// Optional list of transformation steps read from the `transform` key.
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
// List of ExtractorConfig entries read from the `extractors` key.
@WithName("extractors")
fun getExtractionMethods(): List<ExtractorConfig>
// Optional `fallback` entry, described by a FieldConfigFallback.
@WithName("fallback")
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
}

View File

@@ -10,6 +10,7 @@ import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.transform.TransformationRegistry
import com.rak.util.CssUtil
import com.rak.util.XPathUtil
import io.quarkus.logging.Log
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
import java.util.Optional
@@ -174,11 +175,16 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
root: Element,
extractionConfig: ScrapeTargetFieldConfig
): String? {
val extractionSteps = extractionConfig.getExtractionSteps()
val transformationSteps = extractionConfig.getOptionalTransformationSteps()
val extractionMethods = extractionConfig.getExtractionMethods()
var result: String? = null
for(extractionMethod in extractionMethods) {
val extractionSteps = extractionMethod.getExtractionSteps()
val transformationSteps = extractionMethod.getOptionalTransformationSteps()
var currentElement: Element? = root.clone()
var result: String? = null
var intermediateResult: String? = null
try {
for (index in 0 until extractionSteps.size) {
@@ -188,7 +194,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
}
if (index == extractionSteps.size - 1) {
result = when (currentStep.selectorType()) {
intermediateResult = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
}
@@ -201,27 +207,35 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
}
}
if (result == null) {
if (intermediateResult == null) {
throw ElementNotFoundException("Result could not be extracted")
} else {
if (transformationSteps.isPresent) {
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
}
if (transformationSteps.isPresent) {
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
result = intermediateResult
break
}
} catch (ex: RuntimeException) {
when (ex) {
is ElementNotFoundException,
is IllegalStateException -> {
if (extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
} else {
throw ex
}
// if (extractionConfig.getFallbackConfiguration().isPresent) {
// intermediateResult = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
// } else {
// throw ex
// }
Log.warn("An extraction method failed")
}
else -> throw ex
}
}
}
if (result == null && extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
}
return result
}

View File

@@ -1,6 +1,5 @@
package com.rak.util
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.rak.model.cc.CCIndexSuccessResponse

View File

@@ -4,25 +4,25 @@ quarkus:
scraper:
sources:
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
targets:
card:
root:
type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
name:
steps:
- type: "css"
value: "h1.product-title"
- type: "xpath"
value: "//h1[@itemprop='name']"
attack:
steps:
- type: "css"
value: ".atk-value"
# - id: konami-official
# name: "Konami Official Database"
# domain: "yugioh-card.com"
# url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
# targets:
# card:
# root:
# type: css
# value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
# name:
# steps:
# - type: "css"
# value: "h1.product-title"
# - type: "xpath"
# value: "//h1[@itemprop='name']"
# attack:
# steps:
# - type: "css"
# value: ".atk-value"
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
@@ -34,7 +34,9 @@ scraper:
type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id:
steps:
type: int
extractors:
- steps:
- type: xpath
value: "//li/text()"
transform:
@@ -44,32 +46,41 @@ scraper:
""
]
language:
steps:
type: int
extractors:
- steps:
- type: xpath
value: "//li/abbr"
- type: xpath
value: "//abbr/@title"
region-key:
steps:
type: int
extractors:
- steps:
- type: xpath
value: "//li/abbr/text()"
card-print:
multi: true
discriminator:
root:
type: css
value: ".wds-tab__content"
root:
type: css
value: "table > tbody > tr:has(> td)"
discriminator:
type: string
root:
type: css
value: ".wds-tab__content"
id:
steps:
type: int
extractors:
- steps:
- type: xpath
value: "./td/a[0]"
- type: xpath
value: "./text()"
name:
steps:
type: int
extractors:
- steps:
- type: xpath
value: "./td/a[1]"
- type: xpath
@@ -77,7 +88,9 @@ scraper:
regional-name:
fallback:
default: "N/A"
steps:
type: int
extractors:
- steps:
- type: xpath
value: "./td[2]"
- type: xpath
@@ -88,44 +101,46 @@ scraper:
rarity:
fallback:
default: "N/A"
steps:
type: int
extractors:
- steps:
- type: xpath
value: "./td/a[3]"
- type: xpath
value: "./text()"
card:
name:
root:
type: css
value: ".cardTable"
steps:
- type: "xpath"
value: "./tbody/tr[3]/th/text()"
description:
root:
type: css
value: ".cardTable"
steps:
- type: "xpath"
value: "b:contains(Card descriptions)"
type:
root:
type: css
value: ".cardTable"
steps:
- type: "xpath"
value: "b:contains(Card descriptions)"
attack:
root:
type: css
value: ".cardTable"
steps:
- type: "xpath"
value: "b:contains(Card descriptions)"
defense:
root:
type: css
value: ".cardTable"
steps:
- type: "xpath"
value: "b:contains(Card descriptions)"
# card:
# name:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "./tbody/tr[3]/th/text()"
# description:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# type:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# attack:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# defense:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"