Add RegEx validation

Amend RegExReplace transformer
Amend transformations
This commit is contained in:
2025-06-29 14:52:09 +02:00
parent ee4ce4fd65
commit 2a79218a54
8 changed files with 102 additions and 11 deletions

View File

@@ -0,0 +1,19 @@
package com.rak.config.converter
import org.eclipse.microprofile.config.spi.Converter
import java.util.regex.Pattern
import java.util.regex.PatternSyntaxException
/**
 * Configuration [Converter] that compiles a configuration string into a [Pattern].
 */
class PatternConverter : Converter<Pattern> {
    /**
     * Compiles [value] as a regular expression.
     *
     * @param value the raw configuration value
     * @return the compiled pattern
     * @throws IllegalArgumentException if [value] is blank or is not a valid regular expression
     */
    override fun convert(value: String): Pattern {
        require(value.isNotBlank()) { "Pattern may not be empty" }
        return try {
            Pattern.compile(value)
        } catch (ex: PatternSyntaxException) {
            // Chain the original exception so the syntax error description and index are not lost.
            throw IllegalArgumentException("'$value' is not a valid RegEx pattern", ex)
        }
    }
}

View File

@@ -12,4 +12,6 @@ interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
fun getExtractionMethods(): List<ExtractorConfig> fun getExtractionMethods(): List<ExtractorConfig>
@WithName("fallback") @WithName("fallback")
fun getFallbackConfiguration(): Optional<FieldConfigFallback> fun getFallbackConfiguration(): Optional<FieldConfigFallback>
@WithName("validation")
fun getOptionalValidation(): Optional<ValidationConfig>
} }

View File

@@ -0,0 +1,12 @@
package com.rak.config.model
import com.rak.config.converter.PatternConverter
import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
import java.util.regex.Pattern
/**
 * Configuration mapping for the validation settings of an extracted value.
 */
interface ValidationConfig {
/**
 * Regular expressions read from the `pattern` key, each compiled by [PatternConverter].
 *
 * NOTE(review): the return type is a list, so confirm how the config mapping turns a single
 * scalar `pattern` value into list elements (e.g. whether commas inside a pattern are treated
 * as separators).
 */
@WithName("pattern")
@WithConverter(PatternConverter::class)
fun getRegexPatterns(): MutableList<Pattern>
}

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/**
 * Thrown when an extracted value does not match the configured validation pattern(s).
 *
 * @param message description of the validation failure
 */
class ValueValidationException(message: String) : RuntimeException(message)

View File

@@ -22,8 +22,11 @@ class TransformationRegistry {
input.replace(parameters[0], parameters[1]) input.replace(parameters[0], parameters[1])
} }
register("regexReplace") { input, params -> register("regexReplace") { input, params ->
require(params.size == 2) { require(params.size == 1 || params.size == 2) {
"'regexReplace' requires exactly 2 parameters" "'regexReplace' requires either 1 or 2 parameters"
}
if (params.size == 1) {
params.add("")
} }
input.replace(params[0].toRegex(), params[1]) input.replace(params[0].toRegex(), params[1])
} }

View File

@@ -1,19 +1,17 @@
package com.rak.service.extract package com.rak.service.extract
import com.rak.config.model.AbstractScrapeTargetConfig import com.rak.config.model.*
import com.rak.config.model.ExtractConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.model.Selector import com.rak.model.Selector
import com.rak.model.exception.ElementNotFoundException import com.rak.model.exception.ElementNotFoundException
import com.rak.model.exception.InvalidConfigurationException import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.exception.ValueValidationException
import com.rak.model.transform.TransformationRegistry import com.rak.model.transform.TransformationRegistry
import com.rak.util.CssUtil import com.rak.util.CssUtil
import com.rak.util.XPathUtil import com.rak.util.XPathUtil
import io.quarkus.logging.Log import io.quarkus.logging.Log
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.select.Elements import org.jsoup.select.Elements
import java.util.Optional import java.util.*
import kotlin.jvm.optionals.getOrElse import kotlin.jvm.optionals.getOrElse
// find root element from global or node config // find root element from global or node config
@@ -210,6 +208,12 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
if (intermediateResult == null) { if (intermediateResult == null) {
throw ElementNotFoundException("Result could not be extracted") throw ElementNotFoundException("Result could not be extracted")
} else { } else {
try {
validateValue(intermediateResult, extractionConfig.getOptionalValidation())
} catch (ex: ValueValidationException) {
throw ex
}
if (transformationSteps.isPresent) { if (transformationSteps.isPresent) {
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get()) intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
} }
@@ -226,8 +230,8 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
// } else { // } else {
// throw ex // throw ex
// } // }
Log.warn("An extraction method failed")
} }
is ValueValidationException -> Log.warn(ex.message)
else -> throw ex else -> throw ex
} }
} }
@@ -240,4 +244,22 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return result return result
} }
/**
 * Validates [value] against every pattern of the optional validation configuration.
 *
 * Does nothing when [validationConfig] is empty. Otherwise each configured pattern must match
 * the entire value; an empty pattern list therefore accepts any value.
 *
 * @param value the extracted value to check
 * @param validationConfig the optional validation settings for the field
 * @throws ValueValidationException if at least one pattern does not match [value]
 */
private fun validateValue(value: String, validationConfig: Optional<ValidationConfig>) {
    if (!validationConfig.isPresent) {
        return
    }

    // `all` stops at the first failing pattern; Matcher.matches() requires a full-string match,
    // which is the same semantics as String.matches(Regex).
    val matchesAll = validationConfig.get()
        .getRegexPatterns()
        .all { pattern -> pattern.matcher(value).matches() }

    if (!matchesAll) {
        throw ValueValidationException("'$value' does not validate against RegEx(s)")
    }
}
} }

View File

@@ -4,6 +4,7 @@ import com.rak.model.XPathTarget
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements import org.jsoup.select.Elements
import java.util.regex.Pattern
import kotlin.coroutines.CoroutineContext import kotlin.coroutines.CoroutineContext
class XPathUtil private constructor() { class XPathUtil private constructor() {
@@ -40,8 +41,8 @@ class XPathUtil private constructor() {
private fun extractTextFromNode(root: Element, xpath: String): String? { private fun extractTextFromNode(root: Element, xpath: String): String? {
return root return root
.selectXpath(xpath, TextNode::class.java) .selectXpath(xpath.replace("/text()", ""))
.firstOrNull()?.text() .text()
} }
fun getNextElement(element: Element, path: String): Element? { fun getNextElement(element: Element, path: String): Element? {

View File

@@ -77,14 +77,29 @@ scraper:
value: "./td/a[0]" value: "./td/a[0]"
- type: xpath - type: xpath
value: "./text()" value: "./text()"
- steps:
- type: xpath
value: "./td/span/text()"
validation:
  # In a double-quoted YAML scalar "\\" decodes to one backslash, so "\\d" yields the regex digit class \d.
  pattern: "^.+-.+\\d.+$"
name: name:
type: int type: int
extractors: extractors:
- steps: - steps:
- type: xpath - type: xpath
value: "./td/a[1]" value: "./td[1]"
- type: xpath - type: xpath
value: "./text()" value: "./text()"
transform:
- name: "regexReplace"
parameters: [
"\\(.+\\)",
""
]
- name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\".*"
regional-name: regional-name:
fallback: fallback:
default: "N/A" default: "N/A"
@@ -98,6 +113,8 @@ scraper:
transform: transform:
- name: "removeInnerQuotes" - name: "removeInnerQuotes"
parameters: [] parameters: []
validation:
pattern: "^\".+\"$"
rarity: rarity:
fallback: fallback:
default: "N/A" default: "N/A"
@@ -108,6 +125,18 @@ scraper:
value: "./td/a[3]" value: "./td/a[3]"
- type: xpath - type: xpath
value: "./text()" value: "./text()"
- steps:
- type: xpath
value: "./td/a[2]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/a[1]"
- type: xpath
value: "./text()"
validation:
pattern: "^.*(Common|Rare|Print).*$"
# card: # card:
# name: # name:
# root: # root: