From ee4ce4fd65b0a0ba07b15f62a5633c8dae5e6d22 Mon Sep 17 00:00:00 2001 From: Katarina Date: Sun, 29 Jun 2025 13:21:18 +0200 Subject: [PATCH] Basic multi-method extraction --- .../com/rak/config/model/ExtractorConfig.kt | 11 + .../config/model/ScrapeTargetFieldConfig.kt | 8 +- .../extract/AbstractExtractionService.kt | 88 +++++--- src/main/kotlin/com/rak/util/NDJsonReader.kt | 1 - src/main/resources/application.yml | 213 ++++++++++-------- 5 files changed, 180 insertions(+), 141 deletions(-) create mode 100644 src/main/kotlin/com/rak/config/model/ExtractorConfig.kt diff --git a/src/main/kotlin/com/rak/config/model/ExtractorConfig.kt b/src/main/kotlin/com/rak/config/model/ExtractorConfig.kt new file mode 100644 index 0000000..09e699b --- /dev/null +++ b/src/main/kotlin/com/rak/config/model/ExtractorConfig.kt @@ -0,0 +1,11 @@ +package com.rak.config.model + +import io.smallrye.config.WithName +import java.util.Optional + +interface ExtractorConfig { + @WithName("steps") + fun getExtractionSteps(): List + @WithName("transform") + fun getOptionalTransformationSteps(): Optional> +} \ No newline at end of file diff --git a/src/main/kotlin/com/rak/config/model/ScrapeTargetFieldConfig.kt b/src/main/kotlin/com/rak/config/model/ScrapeTargetFieldConfig.kt index 7adb033..443ee6a 100644 --- a/src/main/kotlin/com/rak/config/model/ScrapeTargetFieldConfig.kt +++ b/src/main/kotlin/com/rak/config/model/ScrapeTargetFieldConfig.kt @@ -4,12 +4,12 @@ import io.smallrye.config.WithName import java.util.* interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig { + @WithName("type") + fun getType(): String @WithName("root") fun getRootConfig(): Optional - @WithName("steps") - fun getExtractionSteps(): List - @WithName("transform") - fun getOptionalTransformationSteps(): Optional> + @WithName("extractors") + fun getExtractionMethods(): List @WithName("fallback") fun getFallbackConfiguration(): Optional } \ No newline at end of file diff --git a/src/main/kotlin/com/rak/service/extract/AbstractExtractionService.kt b/src/main/kotlin/com/rak/service/extract/AbstractExtractionService.kt index 5c89763..2c20b32 100644 --- a/src/main/kotlin/com/rak/service/extract/AbstractExtractionService.kt +++ b/src/main/kotlin/com/rak/service/extract/AbstractExtractionService.kt @@ -10,6 +10,7 @@ import com.rak.model.exception.InvalidConfigurationException import com.rak.model.transform.TransformationRegistry import com.rak.util.CssUtil import com.rak.util.XPathUtil +import io.quarkus.logging.Log import org.jsoup.nodes.Element import org.jsoup.select.Elements import java.util.Optional @@ -174,54 +175,67 @@ abstract class AbstractExtractionService { root: Element, extractionConfig: ScrapeTargetFieldConfig ): String? { - val extractionSteps = extractionConfig.getExtractionSteps() - val transformationSteps = extractionConfig.getOptionalTransformationSteps() - - var currentElement: Element? = root.clone() + val extractionMethods = extractionConfig.getExtractionMethods() var result: String? = null - try { - for (index in 0 until extractionSteps.size) { - val currentStep = extractionSteps.elementAtOrNull(index) ?: return null - if (currentElement == null) { - throw IllegalStateException() - } - if (index == extractionSteps.size - 1) { - result = when (currentStep.selectorType()) { - Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString()) - Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString()) + for(extractionMethod in extractionMethods) { + val extractionSteps = extractionMethod.getExtractionSteps() + val transformationSteps = extractionMethod.getOptionalTransformationSteps() + + var currentElement: Element? = root.clone() + var intermediateResult: String? = null + + try { + for (index in 0 until extractionSteps.size) { + val currentStep = extractionSteps.elementAtOrNull(index) ?: return null + if (currentElement == null) { + throw IllegalStateException() + } + + if (index == extractionSteps.size - 1) { + intermediateResult = when (currentStep.selectorType()) { + Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString()) + Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString()) + } + } + else { + currentElement = when (currentStep.selectorType()) { + Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString()) + Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString()) + } } } - else { - currentElement = when (currentStep.selectorType()) { - Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString()) - Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString()) - } - } - } - if (result == null) { - throw ElementNotFoundException("Result could not be extracted") - } - - if (transformationSteps.isPresent) { - result = transformationRegistry.applyTransformations(result, transformationSteps.get()) - } - } catch (ex: RuntimeException) { - when (ex) { - is ElementNotFoundException, - is IllegalStateException -> { - if (extractionConfig.getFallbackConfiguration().isPresent) { - result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue() - } else { - throw ex + if (intermediateResult == null) { + throw ElementNotFoundException("Result could not be extracted") + } else { + if (transformationSteps.isPresent) { + intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get()) } + + result = intermediateResult + break + } + } catch (ex: RuntimeException) { + when (ex) { + is ElementNotFoundException, + is IllegalStateException -> { +// if (extractionConfig.getFallbackConfiguration().isPresent) { +// intermediateResult = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue() +// } else { +// throw ex +// } + Log.warn("An extraction method failed") + } + else -> throw ex } - else -> throw ex } } + if (result == null && extractionConfig.getFallbackConfiguration().isPresent) { + result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue() + } return result } diff --git a/src/main/kotlin/com/rak/util/NDJsonReader.kt b/src/main/kotlin/com/rak/util/NDJsonReader.kt index f4c01d6..bf29f21 100644 --- a/src/main/kotlin/com/rak/util/NDJsonReader.kt +++ b/src/main/kotlin/com/rak/util/NDJsonReader.kt @@ -1,6 +1,5 @@ package com.rak.util -import com.fasterxml.jackson.datatype.jsr310.JSR310Module import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper import com.rak.model.cc.CCIndexSuccessResponse diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 7e72b04..7b5c241 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -4,25 +4,25 @@ quarkus: scraper: sources: - - id: konami-official - name: "Konami Official Database" - domain: "yugioh-card.com" - url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$" - targets: - card: - root: - type: css - value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" - name: - steps: - - type: "css" - value: "h1.product-title" - - type: "xpath" - value: "//h1[@itemprop='name']" - attack: - steps: - - type: "css" - value: ".atk-value" +# - id: konami-official +# name: "Konami Official Database" +# domain: "yugioh-card.com" +# url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$" +# targets: +# card: +# root: +# type: css +# value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" +# name: +# steps: +# - type: "css" +# value: "h1.product-title" +# - type: "xpath" +# value: "//h1[@itemprop='name']" +# attack: +# steps: +# - type: "css" +# value: ".atk-value" - id: ygo-fandom name: "Yu-Gi-Oh Fandom Wiki" @@ -34,98 +34,113 @@ scraper: type: css value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" id: - steps: - - type: xpath - value: "//li/text()" - transform: - - name: "replace" - parameters: [ - " (", - "" - ] + type: int + extractors: + - steps: + - type: xpath + value: "//li/text()" + transform: + - name: "replace" + parameters: [ + " (", + "" + ] language: - steps: - - type: xpath - value: "//li/abbr" - - type: xpath - value: "//abbr/@title" + type: int + extractors: + - steps: + - type: xpath + value: "//li/abbr" + - type: xpath + value: "//abbr/@title" region-key: - steps: - - type: xpath - value: "//li/abbr/text()" + type: int + extractors: + - steps: + - type: xpath + value: "//li/abbr/text()" card-print: multi: true - discriminator: - root: - type: css - value: ".wds-tab__content" root: type: css value: "table > tbody > tr:has(> td)" + discriminator: + type: string + root: + type: css + value: ".wds-tab__content" id: - steps: - - type: xpath - value: "./td/a[0]" - - type: xpath - value: "./text()" + type: int + extractors: + - steps: + - type: xpath + value: "./td/a[0]" + - type: xpath + value: "./text()" name: - steps: - - type: xpath - value: "./td/a[1]" - - type: xpath - value: "./text()" + type: int + extractors: + - steps: + - type: xpath + value: "./td/a[1]" + - type: xpath + value: "./text()" regional-name: fallback: default: "N/A" - steps: - - type: xpath - value: "./td[2]" - - type: xpath - value: "./text()" - transform: - - name: "removeInnerQuotes" - parameters: [] + type: int + extractors: + - steps: + - type: xpath + value: "./td[2]" + - type: xpath + value: "./text()" + transform: + - name: "removeInnerQuotes" + parameters: [] rarity: fallback: default: "N/A" - steps: - - type: xpath - value: "./td/a[3]" - - type: xpath - value: "./text()" - card: - name: - root: - type: css - value: ".cardTable" - steps: - - type: "xpath" - value: "./tbody/tr[3]/th/text()" - description: - root: - type: css - value: ".cardTable" - steps: - - type: "xpath" - value: "b:contains(Card descriptions)" - type: - root: - type: css - value: ".cardTable" - steps: - - type: "xpath" - value: "b:contains(Card descriptions)" - attack: - root: - type: css - value: ".cardTable" - steps: - - type: "xpath" - value: "b:contains(Card descriptions)" - defense: - root: - type: css - value: ".cardTable" - steps: - - type: "xpath" - value: "b:contains(Card descriptions)" \ No newline at end of file + type: int + extractors: + - steps: + - type: xpath + value: "./td/a[3]" + - type: xpath + value: "./text()" +# card: +# name: +# root: +# type: css +# value: ".cardTable" +# steps: +# - type: "xpath" +# value: "./tbody/tr[3]/th/text()" +# description: +# root: +# type: css +# value: ".cardTable" +# steps: +# - type: "xpath" +# value: "b:contains(Card descriptions)" +# type: +# root: +# type: css +# value: ".cardTable" +# steps: +# - type: "xpath" +# value: "b:contains(Card descriptions)" +# attack: +# root: +# type: css +# value: ".cardTable" +# steps: +# - type: "xpath" +# value: "b:contains(Card descriptions)" +# defense: +# root: +# type: css +# value: ".cardTable" +# steps: +# - type: "xpath" +# value: "b:contains(Card descriptions)" \ No newline at end of file