Amend transformation engine
This commit is contained in:
@@ -1,11 +0,0 @@
|
||||
package com.rak.config.converter
|
||||
|
||||
import jakarta.annotation.Priority
|
||||
import org.eclipse.microprofile.config.spi.Converter
|
||||
|
||||
/**
 * Config [Converter] for `String` values that hands the raw value back unchanged.
 *
 * Registered with `@Priority(1)`.
 * NOTE(review): the name suggests it exists so that empty strings survive config
 * conversion — confirm against the converter rules of the config library in use.
 */
@Priority(1)
|
||||
class EmptyStringConverter : Converter<String> {
|
||||
/** Returns [value] exactly as received; no trimming or validation is applied. */
override fun convert(value: String): String {
|
||||
return value
|
||||
}
|
||||
}
|
||||
15
src/main/kotlin/com/rak/config/model/FieldConfigFallback.kt
Normal file
15
src/main/kotlin/com/rak/config/model/FieldConfigFallback.kt
Normal file
@@ -0,0 +1,15 @@
|
||||
package com.rak.config.model
|
||||
|
||||
import io.smallrye.config.WithDefault
|
||||
import io.smallrye.config.WithName
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configuration view of a field's `fallback` section.
 *
 * In the visible extraction code only [getOptionalDefaultValue] is read: it becomes
 * the field's result when extraction fails with `ElementNotFoundException` or
 * `IllegalStateException` and a fallback is configured.
 */
interface FieldConfigFallback {
|
||||
// Bound to the `steps` key; absent when the key is not configured.
// NOTE(review): presumably alternative extraction steps — no visible code reads this yet; confirm.
@WithName("steps")
|
||||
fun getOptionalFallbackExtractionSteps(): Optional<List<ExtractConfig>>
|
||||
// Bound to the `transform` key; absent when the key is not configured.
// NOTE(review): presumably transformations for the fallback value — not read by the visible code; confirm.
@WithName("transform")
|
||||
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
|
||||
// Bound to the `default` key and defaulted to "N/A" when the key is missing.
// Despite the "Optional" in the name, the return type is a plain String.
@WithName("default")
|
||||
@WithDefault("N/A")
|
||||
fun getOptionalDefaultValue(): String
|
||||
}
|
||||
@@ -10,4 +10,6 @@ interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
|
||||
fun getExtractionSteps(): List<ExtractConfig>
|
||||
@WithName("transform")
|
||||
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
|
||||
@WithName("fallback")
|
||||
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
|
||||
}
|
||||
@@ -1,10 +1,8 @@
|
||||
package com.rak.config.model
|
||||
|
||||
import com.rak.config.converter.EmptyStringConverter
|
||||
import io.smallrye.config.WithConverter
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configuration of a single transformation step: the transformation's name plus
 * the parameters passed to it.
 *
 * NOTE(review): this span is a diff hunk and shows both sides of the change. The
 * `@WithConverter`-annotated `MutableList<String>` declaration appears to be the
 * removed variant and the `Optional<MutableList<String>>` declaration its
 * replacement — confirm against the resolved file.
 */
interface TransformationStepConfig {
|
||||
// Name used to look the transformation up in the registry.
fun name(): String
|
||||
// Earlier variant: parameters always present, converted via EmptyStringConverter.
@WithConverter(EmptyStringConverter::class)
|
||||
fun parameters(): MutableList<String>
|
||||
// Later variant: parameters may be absent; callers check `isPresent` before reading.
fun parameters(): Optional<MutableList<String>>
|
||||
}
|
||||
@@ -2,5 +2,5 @@ package com.rak.model.transform
|
||||
|
||||
/**
 * A transformation that maps an input string to an output string using a list of
 * string parameters.
 *
 * NOTE(review): this span is a diff hunk and shows both sides of the change; the
 * `List<String>` signature appears to be the removed variant and the
 * `MutableList<String>` signature its replacement — confirm against the resolved file.
 */
@FunctionalInterface
|
||||
fun interface ParameterizedTransformation : AbstractTransformation {
|
||||
// Earlier signature: read-only parameter list.
fun apply(input: String, parameters: List<String>): String
|
||||
// Later signature: mutable parameter list, which lets an implementation modify the list it is given.
fun apply(input: String, parameters: MutableList<String>): String
|
||||
}
|
||||
@@ -11,9 +11,13 @@ class TransformationRegistry {
|
||||
|
||||
init {
|
||||
register("trim") { it.trim() }
|
||||
register("removeInnerQuotes") { it.replace("\"", "") }
|
||||
// 'replace' transformation: parameters[0] is the text to search for and
// parameters[1] the replacement. With a single parameter the replacement is the
// empty string, i.e. every occurrence of parameters[0] is removed.
register("replace") { input, parameters ->
    require(parameters.size == 1 || parameters.size == 2) {
        "'replace' requires either 1 or 2 parameters"
    }
    // Derive the replacement locally instead of appending "" to `parameters`:
    // the list is the one held by the step configuration, so mutating it would
    // alter the configuration for every later call and would throw if the list
    // handed in is read-only.
    val replacement = parameters.getOrElse(1) { "" }
    input.replace(parameters[0], replacement)
}
|
||||
@@ -39,14 +43,14 @@ class TransformationRegistry {
|
||||
val parameters = transformationStep.parameters()
|
||||
return when {
|
||||
transformations.containsKey(name) -> {
|
||||
if (parameters.isNotEmpty()) {
|
||||
if (parameters.isPresent && parameters.get().isNotEmpty()) {
|
||||
throw IllegalArgumentException("'$name' doesn't accept parameters")
|
||||
} else {
|
||||
transformations[name]!!
|
||||
}
|
||||
}
|
||||
parameterizedTransformation.containsKey(name) -> {
|
||||
if (parameters.isEmpty()) {
|
||||
if (parameters.isPresent && parameters.get().isEmpty()) {
|
||||
throw IllegalArgumentException("'$name' requires parameters")
|
||||
} else {
|
||||
parameterizedTransformation[name]!!
|
||||
@@ -65,7 +69,7 @@ class TransformationRegistry {
|
||||
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
|
||||
|
||||
is ParameterizedTransformation ->
|
||||
parameterizedTransformation[step.name()]?.apply(current, step.parameters())
|
||||
parameterizedTransformation[step.name()]?.apply(current, step.parameters().get())
|
||||
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
|
||||
|
||||
else -> throw IllegalStateException("Invalid transformation type")
|
||||
|
||||
@@ -4,7 +4,7 @@ import com.rak.config.model.AbstractScrapeTargetConfig
|
||||
import com.rak.config.model.ExtractConfig
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
import com.rak.model.DiscriminatorDirection
|
||||
import com.rak.config.model.TransformationStepConfig
|
||||
import com.rak.model.Selector
|
||||
import com.rak.model.exception.ElementNotFoundException
|
||||
import com.rak.model.exception.InvalidConfigurationException
|
||||
@@ -35,7 +35,13 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: T
|
||||
): Collection<E>
|
||||
): List<E>
|
||||
|
||||
/**
 * Extraction hook that returns groups of [E]: an outer list whose entries are
 * themselves lists of extracted objects.
 *
 * Of the implementations visible in this change, the card-print service maps each
 * inner list produced by `extractMultiWithDiscriminator` to its model type, while
 * the set services throw `NotImplementedException`.
 */
abstract fun extractNestedMultiples(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: T
|
||||
): List<List<E>>
|
||||
|
||||
fun getRootElement(
|
||||
element: Element,
|
||||
@@ -143,37 +149,6 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
return resultList
|
||||
}
|
||||
|
||||
/**
 * Builds one map per entry of [elements], keyed by the item identifiers from
 * `extractionConfig.getItems()`.
 *
 * Items form the outer loop and elements the inner one, so the map for index `i`
 * is created the first time that index is visited and filled in by later items.
 *
 * @param elements root elements; each index gets its own result map
 * @param extractionConfig supplies the identifier-to-field-config items
 * @return the maps in element order; empty when there are no items or no elements
 * @throws ElementNotFoundException when a field's extraction returns null
 */
fun extractMulti(
|
||||
elements: Elements,
|
||||
extractionConfig: T
|
||||
): List<Map<String, String>> {
|
||||
val resultList = mutableListOf<MutableMap<String, String>>()
|
||||
|
||||
// TODO: refactor this — every element is revisited once per configured item
extractionConfig.getItems().forEach { (identifier, fieldConfig) ->
|
||||
for(index in 0..elements.size - 1) {
|
||||
val rootElement = elements[index]
|
||||
// A null extraction result is treated as "element not found" for this identifier.
val extractedText = extractTextFromElementByTargetFieldConfig(
|
||||
rootElement,
|
||||
fieldConfig
|
||||
) ?: throw ElementNotFoundException("Could not find element for '$identifier'")
|
||||
|
||||
// Reuse the map already stored for this index, or create and append one on the
// first visit; the missing index is detected via IndexOutOfBoundsException.
val mapToModify: MutableMap<String, String> = try {
|
||||
resultList[index]
|
||||
} catch (_: IndexOutOfBoundsException) {
|
||||
val newMap = mutableMapOf<String, String>()
|
||||
resultList.add(newMap)
|
||||
newMap
|
||||
}
|
||||
|
||||
mapToModify.put(identifier, extractedText)
|
||||
}
|
||||
}
|
||||
|
||||
return resultList
|
||||
}
|
||||
|
||||
fun extractMultiWithDiscriminator(
|
||||
element: Element,
|
||||
extractionConfig: T
|
||||
@@ -206,33 +181,48 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
var currentElement: Element? = root.clone()
|
||||
var result: String? = null
|
||||
|
||||
for (index in 0 until extractionSteps.size) {
|
||||
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
|
||||
if (currentElement == null) {
|
||||
throw IllegalStateException()
|
||||
}
|
||||
try {
|
||||
for (index in 0 until extractionSteps.size) {
|
||||
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
|
||||
if (currentElement == null) {
|
||||
throw IllegalStateException()
|
||||
}
|
||||
|
||||
if (index == extractionSteps.size - 1) {
|
||||
result = when (currentStep.selectorType()) {
|
||||
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||
if (index == extractionSteps.size - 1) {
|
||||
result = when (currentStep.selectorType()) {
|
||||
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||
}
|
||||
}
|
||||
else {
|
||||
currentElement = when (currentStep.selectorType()) {
|
||||
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
currentElement = when (currentStep.selectorType()) {
|
||||
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||
|
||||
if (result == null) {
|
||||
throw ElementNotFoundException("Result could not be extracted")
|
||||
}
|
||||
|
||||
if (transformationSteps.isPresent) {
|
||||
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
|
||||
}
|
||||
} catch (ex: RuntimeException) {
|
||||
when (ex) {
|
||||
is ElementNotFoundException,
|
||||
is IllegalStateException -> {
|
||||
if (extractionConfig.getFallbackConfiguration().isPresent) {
|
||||
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||
} else {
|
||||
throw ex
|
||||
}
|
||||
}
|
||||
else -> throw ex
|
||||
}
|
||||
}
|
||||
|
||||
if (result == null) {
|
||||
throw ElementNotFoundException("Result could not be extracted")
|
||||
}
|
||||
|
||||
if (transformationSteps.isPresent) {
|
||||
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -32,11 +32,22 @@ class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrin
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: CardPrintScrapeTargetConfig
|
||||
): Collection<CardPrint> {
|
||||
val objectAsListOfMaps = extractMultiWithDiscriminator(element, extractionConfig)
|
||||
): List<CardPrint> {
|
||||
throw NotImplementedException("Not implemented")
|
||||
}
|
||||
|
||||
return objectAsListOfMaps.map {
|
||||
CardPrint.fromMap(it[0])
|
||||
/**
 * Extracts card prints in groups: calls `extractMultiWithDiscriminator` and turns
 * every map of every inner list into a [CardPrint] via `CardPrint.fromMap`,
 * keeping the grouping and order of the extracted lists.
 *
 * `providerConfig` is not used by this implementation.
 */
override fun extractNestedMultiples(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: CardPrintScrapeTargetConfig
|
||||
): List<List<CardPrint>> {
|
||||
val objectAsListOfMaps: List<List<Map<String, String>>> = extractMultiWithDiscriminator(
|
||||
element,
|
||||
extractionConfig
|
||||
)
|
||||
|
||||
// Outer list = groups, inner list = one map per card print in that group.
return objectAsListOfMaps.map { innerList ->
|
||||
innerList.map { map -> CardPrint.fromMap(map) }
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4,6 +4,7 @@ import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
import com.rak.config.model.SetScrapeTargetConfig
|
||||
import com.rak.config.model.SourcesConfig
|
||||
import com.rak.model.card.CardPrint
|
||||
import com.rak.model.exception.NotImplementedException
|
||||
import com.rak.model.set.RegionalSet
|
||||
import jakarta.enterprise.context.ApplicationScoped
|
||||
@@ -35,18 +36,27 @@ class RegionalSetExtractionService(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
): Collection<RegionalSet> {
|
||||
): List<RegionalSet> {
|
||||
val regionalSetList = extractMulti(element, extractionConfig)
|
||||
val cardPrintsInRegionalSet = extractMulti(element, extractionConfig)
|
||||
|
||||
val cardPrints = cardPrintExtractionService.extractMultiple(
|
||||
val cardPrintGroups: List<List<CardPrint>> = cardPrintExtractionService.extractNestedMultiples(
|
||||
element,
|
||||
providerConfig,
|
||||
providerConfig.getTargets().getCardPrintConfiguration().get()
|
||||
)
|
||||
|
||||
return regionalSetList.map {
|
||||
RegionalSet.fromMap(it, cardPrints)
|
||||
// Pair each RegionalSet with its CardPrint group by index
|
||||
return regionalSetList.mapIndexed { index, regionalSetMap ->
|
||||
val cardPrintsForSet = cardPrintGroups.getOrElse(index) { emptyList() }
|
||||
RegionalSet.fromMap(regionalSetMap, cardPrintsForSet)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Nested extraction is not supported by this service.
 *
 * @throws NotImplementedException always
 */
override fun extractNestedMultiples(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
): List<List<RegionalSet>> {
|
||||
throw NotImplementedException("Not implemented")
|
||||
}
|
||||
}
|
||||
@@ -36,7 +36,15 @@ class SetExtractionService(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
): Collection<CardSet> {
|
||||
): List<CardSet> {
|
||||
throw NotImplementedException("Not implemented")
|
||||
}
|
||||
|
||||
/**
 * Nested extraction is not supported by this service.
 *
 * @throws NotImplementedException always
 */
override fun extractNestedMultiples(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
): List<List<CardSet>> {
|
||||
throw NotImplementedException("Not implemented")
|
||||
}
|
||||
}
|
||||
@@ -1,2 +1,2 @@
|
||||
com.rak.config.converter.TypeSelectorConverter
|
||||
com.rak.config.converter.DiscriminatorDirectionConverter
|
||||
com.rak.config.converter.DiscriminatorDirectionConverter
|
||||
@@ -41,9 +41,10 @@ scraper:
|
||||
value: "//li/text()"
|
||||
transform:
|
||||
- name: "replace"
|
||||
parameters:
|
||||
- " ("
|
||||
- ""
|
||||
parameters: [
|
||||
" (",
|
||||
""
|
||||
]
|
||||
language:
|
||||
steps:
|
||||
- type: xpath
|
||||
@@ -76,12 +77,19 @@ scraper:
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
regional-name:
|
||||
fallback:
|
||||
default: "N/A"
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "./td/a[2]"
|
||||
value: "./td[2]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
transform:
|
||||
- name: "removeInnerQuotes"
|
||||
parameters: []
|
||||
rarity:
|
||||
fallback:
|
||||
default: "N/A"
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "./td/a[3]"
|
||||
|
||||
Reference in New Issue
Block a user