Amend transformation engine

This commit is contained in:
2025-06-26 12:40:51 +02:00
parent e97f9bdd61
commit 2289489fe1
12 changed files with 124 additions and 89 deletions

View File

@@ -1,11 +0,0 @@
package com.rak.config.converter
import jakarta.annotation.Priority
import org.eclipse.microprofile.config.spi.Converter
/**
 * Identity [Converter] for `String` configuration values: [convert] hands the
 * received value back unchanged.
 */
@Priority(1)
class EmptyStringConverter : Converter<String> {
    /** Returns [value] exactly as it was passed in. */
    override fun convert(value: String): String = value
}

View File

@@ -0,0 +1,15 @@
package com.rak.config.model
import io.smallrye.config.WithDefault
import io.smallrye.config.WithName
import java.util.Optional
/**
 * Config-mapping view of a field's fallback settings.
 *
 * Exposes optional `steps` and `transform` lists plus a `default` string that
 * is "N/A" when the key is not configured.
 */
interface FieldConfigFallback {
/** Extraction steps mapped from the `steps` key, wrapped in [Optional]. */
@WithName("steps")
fun getOptionalFallbackExtractionSteps(): Optional<List<ExtractConfig>>
/** Transformation steps mapped from the `transform` key, wrapped in [Optional]. */
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
/**
 * Value mapped from the `default` key; "N/A" when the key is not set.
 *
 * NOTE(review): despite "Optional" in the name, this returns a plain [String].
 */
@WithName("default")
@WithDefault("N/A")
fun getOptionalDefaultValue(): String
}

View File

@@ -10,4 +10,6 @@ interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
fun getExtractionSteps(): List<ExtractConfig>
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
@WithName("fallback")
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
}

View File

@@ -1,10 +1,8 @@
package com.rak.config.model
import com.rak.config.converter.EmptyStringConverter
import io.smallrye.config.WithConverter
import java.util.Optional
/**
 * Configuration of a single transformation step: the transformation's name and
 * its parameters.
 *
 * NOTE(review): two `parameters()` declarations are listed below. This looks like
 * the removed and the added line of a diff shown together; confirm which one is
 * current before relying on either signature.
 */
interface TransformationStepConfig {
/** Name of the transformation. */
fun name(): String
@WithConverter(EmptyStringConverter::class)
fun parameters(): MutableList<String>
/** Parameters of the transformation, wrapped in [Optional]. */
fun parameters(): Optional<MutableList<String>>
}

View File

@@ -2,5 +2,5 @@ package com.rak.model.transform
/**
 * A transformation that maps an input string to an output string using a list
 * of string parameters.
 *
 * NOTE(review): two `apply` declarations are listed below, differing only in
 * `List` vs `MutableList`. This looks like the removed and the added line of a
 * diff shown together; confirm which one is current.
 */
@FunctionalInterface
fun interface ParameterizedTransformation : AbstractTransformation {
fun apply(input: String, parameters: List<String>): String
fun apply(input: String, parameters: MutableList<String>): String
}

View File

@@ -11,9 +11,13 @@ class TransformationRegistry {
init {
register("trim") { it.trim() }
register("removeInnerQuotes") { it.replace("\"", "") }
// 'replace' takes the text to search for and, optionally, its replacement.
// A missing second parameter means "replace with the empty string".
register("replace") { input, parameters ->
    require(parameters.size == 1 || parameters.size == 2) {
        "'replace' requires either 1 or 2 parameters"
    }
    // Read the replacement without appending to `parameters`: the list is handed
    // in by the caller and must not be modified as a side effect of applying
    // the transformation.
    val replacement = parameters.getOrElse(1) { "" }
    input.replace(parameters[0], replacement)
}
@@ -39,14 +43,14 @@ class TransformationRegistry {
val parameters = transformationStep.parameters()
return when {
transformations.containsKey(name) -> {
if (parameters.isNotEmpty()) {
if (parameters.isPresent && parameters.get().isNotEmpty()) {
throw IllegalArgumentException("'$name' doesn't accept parameters")
} else {
transformations[name]!!
}
}
parameterizedTransformation.containsKey(name) -> {
if (parameters.isEmpty()) {
if (parameters.isPresent && parameters.get().isEmpty()) {
throw IllegalArgumentException("'$name' requires parameters")
} else {
parameterizedTransformation[name]!!
@@ -65,7 +69,7 @@ class TransformationRegistry {
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
is ParameterizedTransformation ->
parameterizedTransformation[step.name()]?.apply(current, step.parameters())
parameterizedTransformation[step.name()]?.apply(current, step.parameters().get())
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
else -> throw IllegalStateException("Invalid transformation type")

View File

@@ -4,7 +4,7 @@ import com.rak.config.model.AbstractScrapeTargetConfig
import com.rak.config.model.ExtractConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.model.DiscriminatorDirection
import com.rak.config.model.TransformationStepConfig
import com.rak.model.Selector
import com.rak.model.exception.ElementNotFoundException
import com.rak.model.exception.InvalidConfigurationException
@@ -35,7 +35,13 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
element: Element,
providerConfig: ProviderConfig,
extractionConfig: T
): Collection<E>
): List<E>
abstract fun extractNestedMultiples(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: T
): List<List<E>>
fun getRootElement(
element: Element,
@@ -143,37 +149,6 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return resultList
}
/**
 * Extracts every configured field from each element in [elements].
 *
 * Fields are processed one after another; for each field the text is extracted
 * from every element and stored in that element's result map.
 *
 * @param elements root elements to extract from
 * @param extractionConfig supplies the identifier-to-field-config items to extract
 * @return one map per element, in element order, keyed by field identifier;
 *   an empty list when there are no items or no elements
 * @throws ElementNotFoundException if a field yields no text for an element
 */
fun extractMulti(
    elements: Elements,
    extractionConfig: T
): List<Map<String, String>> {
    val resultList = mutableListOf<MutableMap<String, String>>()

    extractionConfig.getItems().forEach { (identifier, fieldConfig) ->
        elements.forEachIndexed { index, rootElement ->
            val extractedText = extractTextFromElementByTargetFieldConfig(
                rootElement,
                fieldConfig
            ) ?: throw ElementNotFoundException("Could not find element for '$identifier'")

            // The first processed field creates the per-element maps (indices grow
            // sequentially); later fields find and reuse them.
            val mapToModify = resultList.getOrNull(index)
                ?: mutableMapOf<String, String>().also { resultList.add(it) }
            mapToModify[identifier] = extractedText
        }
    }

    return resultList
}
fun extractMultiWithDiscriminator(
element: Element,
extractionConfig: T
@@ -206,33 +181,48 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
var currentElement: Element? = root.clone()
var result: String? = null
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
try {
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
if (index == extractionSteps.size - 1) {
result = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
if (index == extractionSteps.size - 1) {
result = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
}
}
else {
currentElement = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
}
}
}
else {
currentElement = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
if (result == null) {
throw ElementNotFoundException("Result could not be extracted")
}
if (transformationSteps.isPresent) {
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
}
} catch (ex: RuntimeException) {
when (ex) {
is ElementNotFoundException,
is IllegalStateException -> {
if (extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
} else {
throw ex
}
}
else -> throw ex
}
}
if (result == null) {
throw ElementNotFoundException("Result could not be extracted")
}
if (transformationSteps.isPresent) {
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
}
return result
}

View File

@@ -32,11 +32,22 @@ class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrin
element: Element,
providerConfig: ProviderConfig,
extractionConfig: CardPrintScrapeTargetConfig
): Collection<CardPrint> {
val objectAsListOfMaps = extractMultiWithDiscriminator(element, extractionConfig)
): List<CardPrint> {
throw NotImplementedException("Not implemented")
}
return objectAsListOfMaps.map {
CardPrint.fromMap(it[0])
/**
 * Extracts grouped card prints from [element].
 *
 * Delegates to [extractMultiWithDiscriminator] and converts every resulting
 * field map into a [CardPrint], keeping the nested list structure.
 */
override fun extractNestedMultiples(
    element: Element,
    providerConfig: ProviderConfig,
    extractionConfig: CardPrintScrapeTargetConfig
): List<List<CardPrint>> {
    val groupedFieldMaps: List<List<Map<String, String>>> =
        extractMultiWithDiscriminator(element, extractionConfig)

    return groupedFieldMaps.map { group ->
        group.map { fieldMap -> CardPrint.fromMap(fieldMap) }
    }
}
}

View File

@@ -4,6 +4,7 @@ import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.SetScrapeTargetConfig
import com.rak.config.model.SourcesConfig
import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException
import com.rak.model.set.RegionalSet
import jakarta.enterprise.context.ApplicationScoped
@@ -35,18 +36,27 @@ class RegionalSetExtractionService(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig
): Collection<RegionalSet> {
): List<RegionalSet> {
val regionalSetList = extractMulti(element, extractionConfig)
val cardPrintsInRegionalSet = extractMulti(element, extractionConfig)
val cardPrints = cardPrintExtractionService.extractMultiple(
val cardPrintGroups: List<List<CardPrint>> = cardPrintExtractionService.extractNestedMultiples(
element,
providerConfig,
providerConfig.getTargets().getCardPrintConfiguration().get()
)
return regionalSetList.map {
RegionalSet.fromMap(it, cardPrints)
// Pair each RegionalSet with its CardPrint group by index
return regionalSetList.mapIndexed { index, regionalSetMap ->
val cardPrintsForSet = cardPrintGroups.getOrElse(index) { emptyList() }
RegionalSet.fromMap(regionalSetMap, cardPrintsForSet)
}
}
/**
 * Nested extraction is not implemented for regional sets.
 *
 * @throws NotImplementedException always
 */
override fun extractNestedMultiples(
    element: Element,
    providerConfig: ProviderConfig,
    extractionConfig: SetScrapeTargetConfig
): List<List<RegionalSet>> = throw NotImplementedException("Not implemented")
}

View File

@@ -36,7 +36,15 @@ class SetExtractionService(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig
): Collection<CardSet> {
): List<CardSet> {
throw NotImplementedException("Not implemented")
}
/**
 * Nested extraction is not implemented for card sets.
 *
 * @throws NotImplementedException always
 */
override fun extractNestedMultiples(
    element: Element,
    providerConfig: ProviderConfig,
    extractionConfig: SetScrapeTargetConfig
): List<List<CardSet>> = throw NotImplementedException("Not implemented")
}

View File

@@ -41,9 +41,10 @@ scraper:
value: "//li/text()"
transform:
- name: "replace"
parameters:
- " ("
- ""
parameters: [
" (",
""
]
language:
steps:
- type: xpath
@@ -76,12 +77,19 @@ scraper:
- type: xpath
value: "./text()"
regional-name:
fallback:
default: "N/A"
steps:
- type: xpath
value: "./td/a[2]"
value: "./td[2]"
- type: xpath
value: "./text()"
transform:
- name: "removeInnerQuotes"
parameters: []
rarity:
fallback:
default: "N/A"
steps:
- type: xpath
value: "./td/a[3]"