Add RegEx validation

Amend RegExReplace transformer
Amend transformations
This commit is contained in:
2025-06-29 14:52:09 +02:00
parent ee4ce4fd65
commit 2a79218a54
8 changed files with 102 additions and 11 deletions

View File

@@ -0,0 +1,19 @@
package com.rak.config.converter
import org.eclipse.microprofile.config.spi.Converter
import java.util.regex.Pattern
import java.util.regex.PatternSyntaxException
/**
 * Config [Converter] that compiles a raw configuration string into a [Pattern].
 */
class PatternConverter : Converter<Pattern> {
    /**
     * Compiles [value] as a regular expression.
     *
     * @param value the raw regular expression text
     * @return the compiled pattern
     * @throws IllegalArgumentException if [value] is blank or is not a syntactically valid regular expression
     */
    override fun convert(value: String): Pattern {
        require(value.isNotBlank()) { "Pattern may not be empty" }
        return try {
            Pattern.compile(value)
        } catch (ex: PatternSyntaxException) {
            // An unconvertible value is an argument problem; keep the syntax error as the cause
            // so the offending position in the expression is not lost.
            throw IllegalArgumentException("'$value' is not a valid RegEx pattern", ex)
        }
    }
}

View File

@@ -12,4 +12,6 @@ interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
fun getExtractionMethods(): List<ExtractorConfig>
/** Optional fallback settings for this field, bound to the `fallback` name. */
@WithName("fallback")
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
/** Optional validation settings for this field, bound to the `validation` name; empty when none is configured. */
@WithName("validation")
fun getOptionalValidation(): Optional<ValidationConfig>
}

View File

@@ -0,0 +1,12 @@
package com.rak.config.model
import com.rak.config.converter.PatternConverter
import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
import java.util.regex.Pattern
/**
 * Validation settings holding the regular expressions a value is checked against.
 */
interface ValidationConfig {
/**
 * Regular expressions bound to the `pattern` name; each raw value is compiled
 * into a [Pattern] by [PatternConverter].
 */
@WithName("pattern")
@WithConverter(PatternConverter::class)
fun getRegexPatterns(): MutableList<Pattern>
}

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/**
 * Unchecked exception signalling that a value failed validation.
 *
 * @param message human-readable description of the failure
 */
class ValueValidationException(message: String) : RuntimeException(message)

View File

@@ -22,8 +22,11 @@ class TransformationRegistry {
input.replace(parameters[0], parameters[1])
}
// "regexReplace": replaces every match of the regular expression params[0] in the input
// with params[1]; when only the expression is given, matches are removed.
register("regexReplace") { input, params ->
    require(params.size == 1 || params.size == 2) {
        "'regexReplace' requires either 1 or 2 parameters"
    }
    // Default the replacement locally instead of appending to `params`,
    // so the caller's parameter list is never mutated.
    val replacement = params.getOrElse(1) { "" }
    input.replace(params[0].toRegex(), replacement)
}

View File

@@ -1,19 +1,17 @@
package com.rak.service.extract
import com.rak.config.model.AbstractScrapeTargetConfig
import com.rak.config.model.ExtractConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.*
import com.rak.model.Selector
import com.rak.model.exception.ElementNotFoundException
import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.exception.ValueValidationException
import com.rak.model.transform.TransformationRegistry
import com.rak.util.CssUtil
import com.rak.util.XPathUtil
import io.quarkus.logging.Log
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
import java.util.Optional
import java.util.*
import kotlin.jvm.optionals.getOrElse
// find root element from global or node config
@@ -210,6 +208,12 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
if (intermediateResult == null) {
throw ElementNotFoundException("Result could not be extracted")
} else {
try {
validateValue(intermediateResult, extractionConfig.getOptionalValidation())
} catch (ex: ValueValidationException) {
throw ex
}
if (transformationSteps.isPresent) {
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
}
@@ -226,8 +230,8 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
// } else {
// throw ex
// }
Log.warn("An extraction method failed")
}
is ValueValidationException -> Log.warn(ex.message)
else -> throw ex
}
}
@@ -240,4 +244,22 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return result
}
/**
 * Checks [value] against every regular expression of [validationConfig].
 *
 * Does nothing when no validation is configured. The whole value must match
 * each configured pattern.
 *
 * @param value the value to validate
 * @param validationConfig optional validation settings supplying the patterns
 * @throws ValueValidationException if at least one pattern does not match [value]
 */
private fun validateValue(value: String, validationConfig: Optional<ValidationConfig>) {
    if (!validationConfig.isPresent) {
        return
    }
    // `all` stops at the first mismatch; matching directly on the compiled Pattern
    // avoids wrapping it in a new Regex for every check.
    val matchesAll = validationConfig.get().getRegexPatterns().all { pattern ->
        pattern.matcher(value).matches()
    }
    if (!matchesAll) {
        throw ValueValidationException("'$value' does not validate against RegEx(s)")
    }
}
}

View File

@@ -4,6 +4,7 @@ import com.rak.model.XPathTarget
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements
import java.util.regex.Pattern
import kotlin.coroutines.CoroutineContext
class XPathUtil private constructor() {
@@ -40,8 +41,8 @@ class XPathUtil private constructor() {
/**
 * Returns the combined text of the elements selected by [xpath] relative to [root],
 * or `null` when the expression selects nothing.
 *
 * Every `/text()` step is stripped from the expression before evaluation, so
 * elements are selected and their text is read from the resulting selection.
 *
 * @param root element the expression is evaluated against
 * @param xpath XPath expression, optionally containing `/text()` steps
 * @return the selected elements' text, or `null` if no element matched
 */
private fun extractTextFromNode(root: Element, xpath: String): String? {
    val matches = root.selectXpath(xpath.replace("/text()", ""))
    // The text of an empty selection is "", which would hide a miss behind the
    // nullable return type; report "nothing matched" as null instead.
    return if (matches.isEmpty()) null else matches.text()
}
fun getNextElement(element: Element, path: String): Element? {

View File

@@ -77,14 +77,29 @@ scraper:
value: "./td/a[0]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/span/text()"
validation:
pattern: "^.+-.+\\\\d.+$"
name:
type: int
extractors:
- steps:
- type: xpath
value: "./td/a[1]"
value: "./td[1]"
- type: xpath
value: "./text()"
transform:
- name: "regexReplace"
parameters: [
"\\(.+\\)",
""
]
- name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\".*"
regional-name:
fallback:
default: "N/A"
@@ -98,6 +113,8 @@ scraper:
transform:
- name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\"$"
rarity:
fallback:
default: "N/A"
@@ -108,6 +125,18 @@ scraper:
value: "./td/a[3]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/a[2]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/a[1]"
- type: xpath
value: "./text()"
validation:
pattern: "^.*(Common|Rare|Print).*$"
# card:
# name:
# root: