Add RegEx validation
Amend RegExReplace transformer; amend transformations
This commit is contained in:
19
src/main/kotlin/com/rak/config/converter/PatternConverter.kt
Normal file
19
src/main/kotlin/com/rak/config/converter/PatternConverter.kt
Normal file
@@ -0,0 +1,19 @@
|
||||
package com.rak.config.converter
|
||||
|
||||
import org.eclipse.microprofile.config.spi.Converter
|
||||
import java.util.regex.Pattern
|
||||
import java.util.regex.PatternSyntaxException
|
||||
|
||||
/**
 * MicroProfile Config [Converter] that compiles a configuration value into a [Pattern].
 *
 * Blank values and values that are not valid regular expressions are rejected.
 */
class PatternConverter : Converter<Pattern> {
    /**
     * Compiles [value] as a regular expression.
     *
     * @param value the raw configuration string
     * @return the compiled pattern
     * @throws IllegalArgumentException if [value] is blank or is not a valid regular expression
     */
    override fun convert(value: String): Pattern {
        require(value.isNotBlank()) { "Pattern may not be empty" }

        return try {
            Pattern.compile(value)
        } catch (ex: PatternSyntaxException) {
            // The Converter contract reports unconvertible values with IllegalArgumentException;
            // the original exception is kept as the cause so the syntax error details are not lost.
            throw IllegalArgumentException("'$value' is not a valid RegEx pattern", ex)
        }
    }
}
|
||||
@@ -12,4 +12,6 @@ interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
|
||||
fun getExtractionMethods(): List<ExtractorConfig>
|
||||
@WithName("fallback")
|
||||
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
|
||||
@WithName("validation")
|
||||
fun getOptionalValidation(): Optional<ValidationConfig>
|
||||
}
|
||||
12
src/main/kotlin/com/rak/config/model/ValidationConfig.kt
Normal file
12
src/main/kotlin/com/rak/config/model/ValidationConfig.kt
Normal file
@@ -0,0 +1,12 @@
|
||||
package com.rak.config.model
|
||||
|
||||
import com.rak.config.converter.PatternConverter
|
||||
import io.smallrye.config.WithConverter
|
||||
import io.smallrye.config.WithName
|
||||
import java.util.regex.Pattern
|
||||
|
||||
/**
 * Configuration mapping that holds the validation settings of a field.
 */
interface ValidationConfig {
|
||||
/**
 * Regular expressions read from the `pattern` key.
 * Values are converted to [Pattern] instances by [PatternConverter].
 */
@WithName("pattern")
|
||||
@WithConverter(PatternConverter::class)
|
||||
fun getRegexPatterns(): MutableList<Pattern>
|
||||
}
|
||||
@@ -0,0 +1,3 @@
|
||||
package com.rak.model.exception
|
||||
|
||||
/** Runtime exception signalling that a value failed validation; [message] describes the failure. */
class ValueValidationException(message: String) : RuntimeException(message)
|
||||
@@ -22,8 +22,11 @@ class TransformationRegistry {
|
||||
input.replace(parameters[0], parameters[1])
|
||||
}
|
||||
// Replaces every match of the regular expression params[0] in the input.
// params[1] is the replacement text; when it is omitted, matches are removed.
register("regexReplace") { input, params ->
    require(params.size == 1 || params.size == 2) {
        "'regexReplace' requires either 1 or 2 parameters"
    }
    // Default the replacement locally instead of appending to `params`, so the
    // caller's parameter list is never modified.
    val replacement = params.getOrElse(1) { "" }
    input.replace(params[0].toRegex(), replacement)
}
|
||||
|
||||
@@ -1,19 +1,17 @@
|
||||
package com.rak.service.extract
|
||||
|
||||
import com.rak.config.model.AbstractScrapeTargetConfig
|
||||
import com.rak.config.model.ExtractConfig
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
import com.rak.config.model.*
|
||||
import com.rak.model.Selector
|
||||
import com.rak.model.exception.ElementNotFoundException
|
||||
import com.rak.model.exception.InvalidConfigurationException
|
||||
import com.rak.model.exception.ValueValidationException
|
||||
import com.rak.model.transform.TransformationRegistry
|
||||
import com.rak.util.CssUtil
|
||||
import com.rak.util.XPathUtil
|
||||
import io.quarkus.logging.Log
|
||||
import org.jsoup.nodes.Element
|
||||
import org.jsoup.select.Elements
|
||||
import java.util.Optional
|
||||
import java.util.*
|
||||
import kotlin.jvm.optionals.getOrElse
|
||||
|
||||
// find root element from global or node config
|
||||
@@ -210,6 +208,12 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
if (intermediateResult == null) {
|
||||
throw ElementNotFoundException("Result could not be extracted")
|
||||
} else {
|
||||
try {
|
||||
validateValue(intermediateResult, extractionConfig.getOptionalValidation())
|
||||
} catch (ex: ValueValidationException) {
|
||||
throw ex
|
||||
}
|
||||
|
||||
if (transformationSteps.isPresent) {
|
||||
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
|
||||
}
|
||||
@@ -226,8 +230,8 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
// } else {
|
||||
// throw ex
|
||||
// }
|
||||
Log.warn("An extraction method failed")
|
||||
}
|
||||
is ValueValidationException -> Log.warn(ex.message)
|
||||
else -> throw ex
|
||||
}
|
||||
}
|
||||
@@ -240,4 +244,22 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
return result
|
||||
}
|
||||
|
||||
/**
 * Checks [value] against every regular expression of an optional validation configuration.
 *
 * Does nothing when [validationConfig] is empty. Each pattern must match the entire value.
 *
 * @param value the text to validate
 * @param validationConfig the optional validation settings
 * @throws ValueValidationException if at least one pattern does not match [value]
 */
private fun validateValue(value: String, validationConfig: Optional<ValidationConfig>) {
    if (!validationConfig.isPresent) {
        return
    }

    // `any` stops at the first failing pattern. Matcher.matches() requires the whole
    // string to match, which is the same semantics as String.matches(Regex).
    val hasMismatch = validationConfig.get().getRegexPatterns()
        .any { pattern -> !pattern.matcher(value).matches() }

    if (hasMismatch) {
        throw ValueValidationException("'$value' does not validate against RegEx(s)")
    }
}
|
||||
|
||||
}
|
||||
@@ -4,6 +4,7 @@ import com.rak.model.XPathTarget
|
||||
import org.jsoup.nodes.Element
|
||||
import org.jsoup.nodes.TextNode
|
||||
import org.jsoup.select.Elements
|
||||
import java.util.regex.Pattern
|
||||
import kotlin.coroutines.CoroutineContext
|
||||
|
||||
class XPathUtil private constructor() {
|
||||
@@ -40,8 +41,8 @@ class XPathUtil private constructor() {
|
||||
|
||||
private fun extractTextFromNode(root: Element, xpath: String): String? {
|
||||
return root
|
||||
.selectXpath(xpath, TextNode::class.java)
|
||||
.firstOrNull()?.text()
|
||||
.selectXpath(xpath.replace("/text()", ""))
|
||||
.text()
|
||||
}
|
||||
|
||||
fun getNextElement(element: Element, path: String): Element? {
|
||||
|
||||
@@ -77,14 +77,29 @@ scraper:
|
||||
value: "./td/a[0]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/span/text()"
|
||||
validation:
|
||||
pattern: "^.+-.+\\\\d.+$"
|
||||
name:
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[1]"
|
||||
value: "./td[1]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
transform:
|
||||
- name: "regexReplace"
|
||||
parameters: [
|
||||
"\\(.+\\)",
|
||||
""
|
||||
]
|
||||
- name: "removeInnerQuotes"
|
||||
parameters: []
|
||||
validation:
|
||||
pattern: "^\".+\".*"
|
||||
regional-name:
|
||||
fallback:
|
||||
default: "N/A"
|
||||
@@ -98,6 +113,8 @@ scraper:
|
||||
transform:
|
||||
- name: "removeInnerQuotes"
|
||||
parameters: []
|
||||
validation:
|
||||
pattern: "^\".+\"$"
|
||||
rarity:
|
||||
fallback:
|
||||
default: "N/A"
|
||||
@@ -108,6 +125,18 @@ scraper:
|
||||
value: "./td/a[3]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[2]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[1]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
validation:
|
||||
pattern: "^.*(Common|Rare|Print).*$"
|
||||
# card:
|
||||
# name:
|
||||
# root:
|
||||
|
||||
Reference in New Issue
Block a user