From 2a79218a54a01f1abb26c5ad7854858ff08f3535 Mon Sep 17 00:00:00 2001 From: Katarina Date: Sun, 29 Jun 2025 14:52:09 +0200 Subject: [PATCH] Add RegEx validation Amend RegExReplace transformer Amend transformations --- .../rak/config/converter/PatternConverter.kt | 19 +++++++++++ .../config/model/ScrapeTargetFieldConfig.kt | 2 ++ .../com/rak/config/model/ValidationConfig.kt | 12 +++++++ .../exception/ValueValidationException.kt | 3 ++ .../model/transform/TransformationRegistry.kt | 7 ++-- .../extract/AbstractExtractionService.kt | 34 +++++++++++++++---- src/main/kotlin/com/rak/util/XPathUtil.kt | 5 +-- src/main/resources/application.yml | 31 ++++++++++++++++- 8 files changed, 102 insertions(+), 11 deletions(-) create mode 100644 src/main/kotlin/com/rak/config/converter/PatternConverter.kt create mode 100644 src/main/kotlin/com/rak/config/model/ValidationConfig.kt create mode 100644 src/main/kotlin/com/rak/model/exception/ValueValidationException.kt diff --git a/src/main/kotlin/com/rak/config/converter/PatternConverter.kt b/src/main/kotlin/com/rak/config/converter/PatternConverter.kt new file mode 100644 index 0000000..fcaede9 --- /dev/null +++ b/src/main/kotlin/com/rak/config/converter/PatternConverter.kt @@ -0,0 +1,19 @@ +package com.rak.config.converter + +import org.eclipse.microprofile.config.spi.Converter +import java.util.regex.Pattern +import java.util.regex.PatternSyntaxException + +class PatternConverter : Converter { + override fun convert(value: String): Pattern { + if (value.isBlank()) { + throw IllegalArgumentException("Pattern may not be empty") + } + + try { + return Pattern.compile(value) + } catch (_: PatternSyntaxException) { + throw IllegalStateException("'$value' is not a valid RegEx pattern") + } + } +} \ No newline at end of file diff --git a/src/main/kotlin/com/rak/config/model/ScrapeTargetFieldConfig.kt b/src/main/kotlin/com/rak/config/model/ScrapeTargetFieldConfig.kt index 443ee6a..78c4cdf 100644 --- a/src/main/kotlin/com/rak/config/model/ScrapeTargetFieldConfig.kt +++ b/src/main/kotlin/com/rak/config/model/ScrapeTargetFieldConfig.kt @@ -12,4 +12,6 @@ interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig { fun getExtractionMethods(): List @WithName("fallback") fun getFallbackConfiguration(): Optional + @WithName("validation") + fun getOptionalValidation(): Optional } \ No newline at end of file diff --git a/src/main/kotlin/com/rak/config/model/ValidationConfig.kt b/src/main/kotlin/com/rak/config/model/ValidationConfig.kt new file mode 100644 index 0000000..ab34b81 --- /dev/null +++ b/src/main/kotlin/com/rak/config/model/ValidationConfig.kt @@ -0,0 +1,12 @@ +package com.rak.config.model + +import com.rak.config.converter.PatternConverter +import io.smallrye.config.WithConverter +import io.smallrye.config.WithName +import java.util.regex.Pattern + +interface ValidationConfig { + @WithName("pattern") + @WithConverter(PatternConverter::class) + fun getRegexPatterns(): MutableList +} \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/exception/ValueValidationException.kt b/src/main/kotlin/com/rak/model/exception/ValueValidationException.kt new file mode 100644 index 0000000..58cc8cd --- /dev/null +++ b/src/main/kotlin/com/rak/model/exception/ValueValidationException.kt @@ -0,0 +1,3 @@ +package com.rak.model.exception + +class ValueValidationException(message: String) : RuntimeException(message) \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/transform/TransformationRegistry.kt b/src/main/kotlin/com/rak/model/transform/TransformationRegistry.kt index 28ce33d..5c5adee 100644 --- a/src/main/kotlin/com/rak/model/transform/TransformationRegistry.kt +++ b/src/main/kotlin/com/rak/model/transform/TransformationRegistry.kt @@ -22,8 +22,11 @@ class TransformationRegistry { input.replace(parameters[0], parameters[1]) } register("regexReplace") { input, params -> - require(params.size == 2) { - "'regexReplace' requires exactly 2 parameters" + require(params.size == 1 || params.size == 2) { + "'regexReplace' requires either 1 or 2 parameters" + } + if (params.size == 1) { + params.add("") } input.replace(params[0].toRegex(), params[1]) } diff --git a/src/main/kotlin/com/rak/service/extract/AbstractExtractionService.kt b/src/main/kotlin/com/rak/service/extract/AbstractExtractionService.kt index 2c20b32..8e3bbbf 100644 --- a/src/main/kotlin/com/rak/service/extract/AbstractExtractionService.kt +++ b/src/main/kotlin/com/rak/service/extract/AbstractExtractionService.kt @@ -1,19 +1,17 @@ package com.rak.service.extract -import com.rak.config.model.AbstractScrapeTargetConfig -import com.rak.config.model.ExtractConfig -import com.rak.config.model.ProviderConfig -import com.rak.config.model.ScrapeTargetFieldConfig +import com.rak.config.model.* import com.rak.model.Selector import com.rak.model.exception.ElementNotFoundException import com.rak.model.exception.InvalidConfigurationException +import com.rak.model.exception.ValueValidationException import com.rak.model.transform.TransformationRegistry import com.rak.util.CssUtil import com.rak.util.XPathUtil import io.quarkus.logging.Log import org.jsoup.nodes.Element import org.jsoup.select.Elements -import java.util.Optional +import java.util.* import kotlin.jvm.optionals.getOrElse // find root element from global or node config @@ -210,6 +208,12 @@ abstract class AbstractExtractionService { if (intermediateResult == null) { throw ElementNotFoundException("Result could not be extracted") } else { + try { + validateValue(intermediateResult, extractionConfig.getOptionalValidation()) + } catch (ex: ValueValidationException) { + throw ex + } + if (transformationSteps.isPresent) { intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get()) } @@ -226,8 +230,8 @@ abstract class AbstractExtractionService { // } else { // throw ex // } - Log.warn("An extraction method failed") } + is ValueValidationException -> Log.warn(ex.message) else -> throw ex } } @@ -240,4 +244,22 @@ abstract class AbstractExtractionService { return result } + private fun validateValue(value: String, validationConfig: Optional) { + if (!validationConfig.isPresent) { + return + } + + var validated = true + + for(regex in validationConfig.get().getRegexPatterns()) { + if (!value.matches(regex.toRegex())) { + validated = false + } + } + + if (!validated) { + throw ValueValidationException("'$value' does not validate against RegEx(s)") + } + } + } \ No newline at end of file diff --git a/src/main/kotlin/com/rak/util/XPathUtil.kt b/src/main/kotlin/com/rak/util/XPathUtil.kt index d848351..14f51ae 100644 --- a/src/main/kotlin/com/rak/util/XPathUtil.kt +++ b/src/main/kotlin/com/rak/util/XPathUtil.kt @@ -4,6 +4,7 @@ import com.rak.model.XPathTarget import org.jsoup.nodes.Element import org.jsoup.nodes.TextNode import org.jsoup.select.Elements +import java.util.regex.Pattern import kotlin.coroutines.CoroutineContext class XPathUtil private constructor() { @@ -40,8 +41,8 @@ class XPathUtil private constructor() { private fun extractTextFromNode(root: Element, xpath: String): String? { return root - .selectXpath(xpath, TextNode::class.java) - .firstOrNull()?.text() + .selectXpath(xpath.replace("/text()", "")) + .text() } fun getNextElement(element: Element, path: String): Element? { diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 7b5c241..0c2472a 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -77,14 +77,29 @@ scraper: value: "./td/a[0]" - type: xpath value: "./text()" + - steps: + - type: xpath + value: "./td/span/text()" + validation: + pattern: "^.+-.+\\\\d.+$" name: type: int extractors: - steps: - type: xpath - value: "./td/a[1]" + value: "./td[1]" - type: xpath value: "./text()" + transform: + - name: "regexReplace" + parameters: [ + "\\(.+\\)", + "" + ] + - name: "removeInnerQuotes" + parameters: [] + validation: + pattern: "^\".+\".*" regional-name: fallback: default: "N/A" @@ -98,6 +113,8 @@ scraper: transform: - name: "removeInnerQuotes" parameters: [] + validation: + pattern: "^\".+\"$" rarity: fallback: default: "N/A" @@ -108,6 +125,18 @@ scraper: value: "./td/a[3]" - type: xpath value: "./text()" + - steps: + - type: xpath + value: "./td/a[2]" + - type: xpath + value: "./text()" + - steps: + - type: xpath + value: "./td/a[1]" + - type: xpath + value: "./text()" + validation: + pattern: "^.*(Common|Rare|Print).*$" # card: # name: # root: