Add RegEx validation
Amend RegExReplace transformer; amend transformations
This commit is contained in:
19
src/main/kotlin/com/rak/config/converter/PatternConverter.kt
Normal file
19
src/main/kotlin/com/rak/config/converter/PatternConverter.kt
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
package com.rak.config.converter
|
||||||
|
|
||||||
|
import org.eclipse.microprofile.config.spi.Converter
|
||||||
|
import java.util.regex.Pattern
|
||||||
|
import java.util.regex.PatternSyntaxException
|
||||||
|
|
||||||
|
/**
 * Config [Converter] that compiles a raw configuration string into a [Pattern].
 *
 * Both failure modes (blank input and invalid RegEx syntax) are reported as
 * [IllegalArgumentException], so a bad value is signalled the same way
 * regardless of why it could not be converted.
 */
class PatternConverter : Converter<Pattern> {
    /**
     * Compiles [value] into a [Pattern].
     *
     * @param value the raw configuration value
     * @return the compiled pattern
     * @throws IllegalArgumentException if [value] is blank or is not a valid regular expression
     */
    override fun convert(value: String): Pattern {
        require(value.isNotBlank()) { "Pattern may not be empty" }

        return try {
            Pattern.compile(value)
        } catch (ex: PatternSyntaxException) {
            // Keep the original exception as cause: it carries the position and description of the syntax error.
            throw IllegalArgumentException("'$value' is not a valid RegEx pattern", ex)
        }
    }
}
|
||||||
@@ -12,4 +12,6 @@ interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
|
|||||||
fun getExtractionMethods(): List<ExtractorConfig>
|
fun getExtractionMethods(): List<ExtractorConfig>
|
||||||
@WithName("fallback")
|
@WithName("fallback")
|
||||||
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
|
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
|
||||||
|
@WithName("validation")
|
||||||
|
fun getOptionalValidation(): Optional<ValidationConfig>
|
||||||
}
|
}
|
||||||
12
src/main/kotlin/com/rak/config/model/ValidationConfig.kt
Normal file
12
src/main/kotlin/com/rak/config/model/ValidationConfig.kt
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import com.rak.config.converter.PatternConverter
|
||||||
|
import io.smallrye.config.WithConverter
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
import java.util.regex.Pattern
|
||||||
|
|
||||||
|
/**
 * Configuration mapping describing how an extracted value is validated.
 *
 * The `pattern` entry is converted to [Pattern] instances by [PatternConverter].
 */
interface ValidationConfig {
|
||||||
|
@WithName("pattern")
|
||||||
|
@WithConverter(PatternConverter::class)
|
||||||
|
fun getRegexPatterns(): MutableList<Pattern>
|
||||||
|
}
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
package com.rak.model.exception
|
||||||
|
|
||||||
|
/** Signals that a value failed validation; [message] describes the failure. */
class ValueValidationException(message: String) : RuntimeException(message)
|
||||||
@@ -22,8 +22,11 @@ class TransformationRegistry {
|
|||||||
input.replace(parameters[0], parameters[1])
|
input.replace(parameters[0], parameters[1])
|
||||||
}
|
}
|
||||||
register("regexReplace") { input, params ->
|
register("regexReplace") { input, params ->
|
||||||
require(params.size == 2) {
|
require(params.size == 1 || params.size == 2) {
|
||||||
"'regexReplace' requires exactly 2 parameters"
|
"'regexReplace' requires either 1 or 2 parameters"
|
||||||
|
}
|
||||||
|
if (params.size == 1) {
|
||||||
|
params.add("")
|
||||||
}
|
}
|
||||||
input.replace(params[0].toRegex(), params[1])
|
input.replace(params[0].toRegex(), params[1])
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,19 +1,17 @@
|
|||||||
package com.rak.service.extract
|
package com.rak.service.extract
|
||||||
|
|
||||||
import com.rak.config.model.AbstractScrapeTargetConfig
|
import com.rak.config.model.*
|
||||||
import com.rak.config.model.ExtractConfig
|
|
||||||
import com.rak.config.model.ProviderConfig
|
|
||||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
|
||||||
import com.rak.model.Selector
|
import com.rak.model.Selector
|
||||||
import com.rak.model.exception.ElementNotFoundException
|
import com.rak.model.exception.ElementNotFoundException
|
||||||
import com.rak.model.exception.InvalidConfigurationException
|
import com.rak.model.exception.InvalidConfigurationException
|
||||||
|
import com.rak.model.exception.ValueValidationException
|
||||||
import com.rak.model.transform.TransformationRegistry
|
import com.rak.model.transform.TransformationRegistry
|
||||||
import com.rak.util.CssUtil
|
import com.rak.util.CssUtil
|
||||||
import com.rak.util.XPathUtil
|
import com.rak.util.XPathUtil
|
||||||
import io.quarkus.logging.Log
|
import io.quarkus.logging.Log
|
||||||
import org.jsoup.nodes.Element
|
import org.jsoup.nodes.Element
|
||||||
import org.jsoup.select.Elements
|
import org.jsoup.select.Elements
|
||||||
import java.util.Optional
|
import java.util.*
|
||||||
import kotlin.jvm.optionals.getOrElse
|
import kotlin.jvm.optionals.getOrElse
|
||||||
|
|
||||||
// find root element from global or node config
|
// find root element from global or node config
|
||||||
@@ -210,6 +208,12 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
if (intermediateResult == null) {
|
if (intermediateResult == null) {
|
||||||
throw ElementNotFoundException("Result could not be extracted")
|
throw ElementNotFoundException("Result could not be extracted")
|
||||||
} else {
|
} else {
|
||||||
|
try {
|
||||||
|
validateValue(intermediateResult, extractionConfig.getOptionalValidation())
|
||||||
|
} catch (ex: ValueValidationException) {
|
||||||
|
throw ex
|
||||||
|
}
|
||||||
|
|
||||||
if (transformationSteps.isPresent) {
|
if (transformationSteps.isPresent) {
|
||||||
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
|
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
|
||||||
}
|
}
|
||||||
@@ -226,8 +230,8 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
// } else {
|
// } else {
|
||||||
// throw ex
|
// throw ex
|
||||||
// }
|
// }
|
||||||
Log.warn("An extraction method failed")
|
|
||||||
}
|
}
|
||||||
|
is ValueValidationException -> Log.warn(ex.message)
|
||||||
else -> throw ex
|
else -> throw ex
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -240,4 +244,22 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Validates [value] against every RegEx pattern of the optional validation configuration.
 *
 * Does nothing when no validation is configured. Otherwise the whole value must
 * match each configured pattern.
 *
 * @param value the value to check
 * @param validationConfig the optional validation settings holding the patterns
 * @throws ValueValidationException if at least one pattern does not match the entire value
 */
private fun validateValue(value: String, validationConfig: Optional<ValidationConfig>) {
    if (!validationConfig.isPresent) {
        return
    }

    // The patterns are already compiled, so match with them directly instead of
    // wrapping each one in a new Regex; `all` stops at the first failing pattern.
    val validated = validationConfig.get()
        .getRegexPatterns()
        .all { pattern -> pattern.matcher(value).matches() }

    if (!validated) {
        throw ValueValidationException("'$value' does not validate against RegEx(s)")
    }
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -4,6 +4,7 @@ import com.rak.model.XPathTarget
|
|||||||
import org.jsoup.nodes.Element
|
import org.jsoup.nodes.Element
|
||||||
import org.jsoup.nodes.TextNode
|
import org.jsoup.nodes.TextNode
|
||||||
import org.jsoup.select.Elements
|
import org.jsoup.select.Elements
|
||||||
|
import java.util.regex.Pattern
|
||||||
import kotlin.coroutines.CoroutineContext
|
import kotlin.coroutines.CoroutineContext
|
||||||
|
|
||||||
class XPathUtil private constructor() {
|
class XPathUtil private constructor() {
|
||||||
@@ -40,8 +41,8 @@ class XPathUtil private constructor() {
|
|||||||
|
|
||||||
private fun extractTextFromNode(root: Element, xpath: String): String? {
|
private fun extractTextFromNode(root: Element, xpath: String): String? {
|
||||||
return root
|
return root
|
||||||
.selectXpath(xpath, TextNode::class.java)
|
.selectXpath(xpath.replace("/text()", ""))
|
||||||
.firstOrNull()?.text()
|
.text()
|
||||||
}
|
}
|
||||||
|
|
||||||
fun getNextElement(element: Element, path: String): Element? {
|
fun getNextElement(element: Element, path: String): Element? {
|
||||||
|
|||||||
@@ -77,14 +77,29 @@ scraper:
|
|||||||
value: "./td/a[0]"
|
value: "./td/a[0]"
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./text()"
|
value: "./text()"
|
||||||
|
- steps:
|
||||||
|
- type: xpath
|
||||||
|
value: "./td/span/text()"
|
||||||
|
validation:
|
||||||
|
pattern: "^.+-.+\\\\d.+$"
|
||||||
name:
|
name:
|
||||||
type: int
|
type: int
|
||||||
extractors:
|
extractors:
|
||||||
- steps:
|
- steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./td/a[1]"
|
value: "./td[1]"
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./text()"
|
value: "./text()"
|
||||||
|
transform:
|
||||||
|
- name: "regexReplace"
|
||||||
|
parameters: [
|
||||||
|
"\\(.+\\)",
|
||||||
|
""
|
||||||
|
]
|
||||||
|
- name: "removeInnerQuotes"
|
||||||
|
parameters: []
|
||||||
|
validation:
|
||||||
|
pattern: "^\".+\".*"
|
||||||
regional-name:
|
regional-name:
|
||||||
fallback:
|
fallback:
|
||||||
default: "N/A"
|
default: "N/A"
|
||||||
@@ -98,6 +113,8 @@ scraper:
|
|||||||
transform:
|
transform:
|
||||||
- name: "removeInnerQuotes"
|
- name: "removeInnerQuotes"
|
||||||
parameters: []
|
parameters: []
|
||||||
|
validation:
|
||||||
|
pattern: "^\".+\"$"
|
||||||
rarity:
|
rarity:
|
||||||
fallback:
|
fallback:
|
||||||
default: "N/A"
|
default: "N/A"
|
||||||
@@ -108,6 +125,18 @@ scraper:
|
|||||||
value: "./td/a[3]"
|
value: "./td/a[3]"
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./text()"
|
value: "./text()"
|
||||||
|
- steps:
|
||||||
|
- type: xpath
|
||||||
|
value: "./td/a[2]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
|
- steps:
|
||||||
|
- type: xpath
|
||||||
|
value: "./td/a[1]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
|
validation:
|
||||||
|
pattern: "^.*(Common|Rare|Print).*$"
|
||||||
# card:
|
# card:
|
||||||
# name:
|
# name:
|
||||||
# root:
|
# root:
|
||||||
|
|||||||
Reference in New Issue
Block a user