Amend transformation engine
This commit is contained in:
@@ -1,11 +0,0 @@
|
|||||||
package com.rak.config.converter
|
|
||||||
|
|
||||||
import jakarta.annotation.Priority
|
|
||||||
import org.eclipse.microprofile.config.spi.Converter
|
|
||||||
|
|
||||||
/**
 * Pass-through [Converter] for `String` configuration values.
 *
 * The incoming value is handed back exactly as received; no trimming,
 * validation or substitution takes place. The converter is registered
 * with priority 1.
 *
 * NOTE(review): presumably this exists so that empty strings survive
 * conversion instead of being treated as missing values - confirm against
 * the config implementation in use.
 */
@Priority(1)
class EmptyStringConverter : Converter<String> {
    /** Returns [value] unchanged. */
    override fun convert(value: String): String = value
}
|
|
||||||
15
src/main/kotlin/com/rak/config/model/FieldConfigFallback.kt
Normal file
15
src/main/kotlin/com/rak/config/model/FieldConfigFallback.kt
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import io.smallrye.config.WithDefault
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
import java.util.Optional
|
||||||
|
|
||||||
|
/**
 * Fallback settings attached to a single scrape-target field.
 *
 * Each accessor is bound to a configuration key through its `@WithName`
 * annotation; the method names themselves are part of the mapping contract
 * and must not be renamed.
 */
interface FieldConfigFallback {

    /**
     * Alternative extraction steps, read from the `steps` key.
     * Empty when the key is not configured.
     */
    @WithName("steps")
    fun getOptionalFallbackExtractionSteps(): Optional<List<ExtractConfig>>

    /**
     * Transformation steps, read from the `transform` key.
     * Empty when the key is not configured.
     */
    @WithName("transform")
    fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>

    /**
     * Replacement value, read from the `default` key.
     *
     * Despite the "Optional" in its name this accessor always yields a
     * string: `@WithDefault` supplies `"N/A"` when the key is absent.
     */
    @WithName("default")
    @WithDefault("N/A")
    fun getOptionalDefaultValue(): String
}
|
||||||
@@ -10,4 +10,6 @@ interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
|
|||||||
fun getExtractionSteps(): List<ExtractConfig>
|
fun getExtractionSteps(): List<ExtractConfig>
|
||||||
@WithName("transform")
|
@WithName("transform")
|
||||||
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
|
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
|
||||||
|
@WithName("fallback")
|
||||||
|
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
|
||||||
}
|
}
|
||||||
@@ -1,10 +1,8 @@
|
|||||||
package com.rak.config.model
|
package com.rak.config.model
|
||||||
|
|
||||||
import com.rak.config.converter.EmptyStringConverter
|
import java.util.Optional
|
||||||
import io.smallrye.config.WithConverter
|
|
||||||
|
|
||||||
interface TransformationStepConfig {
|
interface TransformationStepConfig {
|
||||||
fun name(): String
|
fun name(): String
|
||||||
@WithConverter(EmptyStringConverter::class)
|
fun parameters(): Optional<MutableList<String>>
|
||||||
fun parameters(): MutableList<String>
|
|
||||||
}
|
}
|
||||||
@@ -2,5 +2,5 @@ package com.rak.model.transform
|
|||||||
|
|
||||||
@FunctionalInterface
|
@FunctionalInterface
|
||||||
fun interface ParameterizedTransformation : AbstractTransformation {
|
fun interface ParameterizedTransformation : AbstractTransformation {
|
||||||
fun apply(input: String, parameters: List<String>): String
|
fun apply(input: String, parameters: MutableList<String>): String
|
||||||
}
|
}
|
||||||
@@ -11,9 +11,13 @@ class TransformationRegistry {
|
|||||||
|
|
||||||
init {
|
init {
|
||||||
register("trim") { it.trim() }
|
register("trim") { it.trim() }
|
||||||
|
register("removeInnerQuotes") { it.replace("\"", "") }
|
||||||
register("replace") { input, parameters ->
|
register("replace") { input, parameters ->
|
||||||
require(parameters.size == 2) {
|
require(parameters.size == 1 || parameters.size == 2) {
|
||||||
"'replace' requires exactly 2 parameters"
|
"'replace' requires either 1 or 2 parameters"
|
||||||
|
}
|
||||||
|
if (parameters.size == 1) {
|
||||||
|
parameters.add("")
|
||||||
}
|
}
|
||||||
input.replace(parameters[0], parameters[1])
|
input.replace(parameters[0], parameters[1])
|
||||||
}
|
}
|
||||||
@@ -39,14 +43,14 @@ class TransformationRegistry {
|
|||||||
val parameters = transformationStep.parameters()
|
val parameters = transformationStep.parameters()
|
||||||
return when {
|
return when {
|
||||||
transformations.containsKey(name) -> {
|
transformations.containsKey(name) -> {
|
||||||
if (parameters.isNotEmpty()) {
|
if (parameters.isPresent && parameters.get().isNotEmpty()) {
|
||||||
throw IllegalArgumentException("'$name' doesn't accept parameters")
|
throw IllegalArgumentException("'$name' doesn't accept parameters")
|
||||||
} else {
|
} else {
|
||||||
transformations[name]!!
|
transformations[name]!!
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
parameterizedTransformation.containsKey(name) -> {
|
parameterizedTransformation.containsKey(name) -> {
|
||||||
if (parameters.isEmpty()) {
|
if (parameters.isPresent && parameters.get().isEmpty()) {
|
||||||
throw IllegalArgumentException("'$name' requires parameters")
|
throw IllegalArgumentException("'$name' requires parameters")
|
||||||
} else {
|
} else {
|
||||||
parameterizedTransformation[name]!!
|
parameterizedTransformation[name]!!
|
||||||
@@ -65,7 +69,7 @@ class TransformationRegistry {
|
|||||||
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
|
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
|
||||||
|
|
||||||
is ParameterizedTransformation ->
|
is ParameterizedTransformation ->
|
||||||
parameterizedTransformation[step.name()]?.apply(current, step.parameters())
|
parameterizedTransformation[step.name()]?.apply(current, step.parameters().get())
|
||||||
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
|
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
|
||||||
|
|
||||||
else -> throw IllegalStateException("Invalid transformation type")
|
else -> throw IllegalStateException("Invalid transformation type")
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import com.rak.config.model.AbstractScrapeTargetConfig
|
|||||||
import com.rak.config.model.ExtractConfig
|
import com.rak.config.model.ExtractConfig
|
||||||
import com.rak.config.model.ProviderConfig
|
import com.rak.config.model.ProviderConfig
|
||||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||||
import com.rak.model.DiscriminatorDirection
|
import com.rak.config.model.TransformationStepConfig
|
||||||
import com.rak.model.Selector
|
import com.rak.model.Selector
|
||||||
import com.rak.model.exception.ElementNotFoundException
|
import com.rak.model.exception.ElementNotFoundException
|
||||||
import com.rak.model.exception.InvalidConfigurationException
|
import com.rak.model.exception.InvalidConfigurationException
|
||||||
@@ -35,7 +35,13 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
element: Element,
|
element: Element,
|
||||||
providerConfig: ProviderConfig,
|
providerConfig: ProviderConfig,
|
||||||
extractionConfig: T
|
extractionConfig: T
|
||||||
): Collection<E>
|
): List<E>
|
||||||
|
|
||||||
|
abstract fun extractNestedMultiples(
|
||||||
|
element: Element,
|
||||||
|
providerConfig: ProviderConfig,
|
||||||
|
extractionConfig: T
|
||||||
|
): List<List<E>>
|
||||||
|
|
||||||
fun getRootElement(
|
fun getRootElement(
|
||||||
element: Element,
|
element: Element,
|
||||||
@@ -143,37 +149,6 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
return resultList
|
return resultList
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Extracts every configured field from each of [elements], producing one
 * map per element.
 *
 * Fields are processed in the order delivered by
 * `extractionConfig.getItems()`. For each field the elements are visited in
 * order, and the extracted text is stored under the field identifier in the
 * map that sits at the element's position in the result.
 *
 * @param elements root elements; the n-th result map belongs to the n-th element
 * @param extractionConfig supplies the identifier / field-config pairs to extract
 * @return one map per element, in element order; empty when there are no
 *   elements or no configured fields
 * @throws ElementNotFoundException if extraction returns `null` for any
 *   field on any element
 */
fun extractMulti(
    elements: Elements,
    extractionConfig: T
): List<Map<String, String>> {
    val rows = mutableListOf<MutableMap<String, String>>()

    extractionConfig.getItems().forEach { (identifier, fieldConfig) ->
        for (index in elements.indices) {
            val extractedText = extractTextFromElementByTargetFieldConfig(
                elements[index],
                fieldConfig
            ) ?: throw ElementNotFoundException("Could not find element for '$identifier'")

            // Rows are created on demand while the first field is processed;
            // a bounds-checked lookup replaces the former practice of
            // catching IndexOutOfBoundsException to detect a missing row.
            val row = rows.getOrNull(index)
                ?: mutableMapOf<String, String>().also { rows.add(it) }

            row[identifier] = extractedText
        }
    }

    return rows
}
|
|
||||||
|
|
||||||
fun extractMultiWithDiscriminator(
|
fun extractMultiWithDiscriminator(
|
||||||
element: Element,
|
element: Element,
|
||||||
extractionConfig: T
|
extractionConfig: T
|
||||||
@@ -206,33 +181,48 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
var currentElement: Element? = root.clone()
|
var currentElement: Element? = root.clone()
|
||||||
var result: String? = null
|
var result: String? = null
|
||||||
|
|
||||||
for (index in 0 until extractionSteps.size) {
|
try {
|
||||||
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
|
for (index in 0 until extractionSteps.size) {
|
||||||
if (currentElement == null) {
|
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
|
||||||
throw IllegalStateException()
|
if (currentElement == null) {
|
||||||
}
|
throw IllegalStateException()
|
||||||
|
}
|
||||||
|
|
||||||
if (index == extractionSteps.size - 1) {
|
if (index == extractionSteps.size - 1) {
|
||||||
result = when (currentStep.selectorType()) {
|
result = when (currentStep.selectorType()) {
|
||||||
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
|
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||||
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
|
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
currentElement = when (currentStep.selectorType()) {
|
||||||
|
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||||
|
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
currentElement = when (currentStep.selectorType()) {
|
if (result == null) {
|
||||||
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
|
throw ElementNotFoundException("Result could not be extracted")
|
||||||
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
|
}
|
||||||
|
|
||||||
|
if (transformationSteps.isPresent) {
|
||||||
|
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
|
||||||
|
}
|
||||||
|
} catch (ex: RuntimeException) {
|
||||||
|
when (ex) {
|
||||||
|
is ElementNotFoundException,
|
||||||
|
is IllegalStateException -> {
|
||||||
|
if (extractionConfig.getFallbackConfiguration().isPresent) {
|
||||||
|
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||||
|
} else {
|
||||||
|
throw ex
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
else -> throw ex
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (result == null) {
|
|
||||||
throw ElementNotFoundException("Result could not be extracted")
|
|
||||||
}
|
|
||||||
|
|
||||||
if (transformationSteps.isPresent) {
|
|
||||||
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
|
|
||||||
}
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,11 +32,22 @@ class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrin
|
|||||||
element: Element,
|
element: Element,
|
||||||
providerConfig: ProviderConfig,
|
providerConfig: ProviderConfig,
|
||||||
extractionConfig: CardPrintScrapeTargetConfig
|
extractionConfig: CardPrintScrapeTargetConfig
|
||||||
): Collection<CardPrint> {
|
): List<CardPrint> {
|
||||||
val objectAsListOfMaps = extractMultiWithDiscriminator(element, extractionConfig)
|
throw NotImplementedException("Not implemented")
|
||||||
|
}
|
||||||
|
|
||||||
return objectAsListOfMaps.map {
|
override fun extractNestedMultiples(
|
||||||
CardPrint.fromMap(it[0])
|
element: Element,
|
||||||
|
providerConfig: ProviderConfig,
|
||||||
|
extractionConfig: CardPrintScrapeTargetConfig
|
||||||
|
): List<List<CardPrint>> {
|
||||||
|
val objectAsListOfMaps: List<List<Map<String, String>>> = extractMultiWithDiscriminator(
|
||||||
|
element,
|
||||||
|
extractionConfig
|
||||||
|
)
|
||||||
|
|
||||||
|
return objectAsListOfMaps.map { innerList ->
|
||||||
|
innerList.map { map -> CardPrint.fromMap(map) }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -4,6 +4,7 @@ import com.rak.config.model.ProviderConfig
|
|||||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||||
import com.rak.config.model.SetScrapeTargetConfig
|
import com.rak.config.model.SetScrapeTargetConfig
|
||||||
import com.rak.config.model.SourcesConfig
|
import com.rak.config.model.SourcesConfig
|
||||||
|
import com.rak.model.card.CardPrint
|
||||||
import com.rak.model.exception.NotImplementedException
|
import com.rak.model.exception.NotImplementedException
|
||||||
import com.rak.model.set.RegionalSet
|
import com.rak.model.set.RegionalSet
|
||||||
import jakarta.enterprise.context.ApplicationScoped
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
@@ -35,18 +36,27 @@ class RegionalSetExtractionService(
|
|||||||
element: Element,
|
element: Element,
|
||||||
providerConfig: ProviderConfig,
|
providerConfig: ProviderConfig,
|
||||||
extractionConfig: SetScrapeTargetConfig
|
extractionConfig: SetScrapeTargetConfig
|
||||||
): Collection<RegionalSet> {
|
): List<RegionalSet> {
|
||||||
val regionalSetList = extractMulti(element, extractionConfig)
|
val regionalSetList = extractMulti(element, extractionConfig)
|
||||||
val cardPrintsInRegionalSet = extractMulti(element, extractionConfig)
|
|
||||||
|
|
||||||
val cardPrints = cardPrintExtractionService.extractMultiple(
|
val cardPrintGroups: List<List<CardPrint>> = cardPrintExtractionService.extractNestedMultiples(
|
||||||
element,
|
element,
|
||||||
providerConfig,
|
providerConfig,
|
||||||
providerConfig.getTargets().getCardPrintConfiguration().get()
|
providerConfig.getTargets().getCardPrintConfiguration().get()
|
||||||
)
|
)
|
||||||
|
|
||||||
return regionalSetList.map {
|
// Pair each RegionalSet with its CardPrint group by index
|
||||||
RegionalSet.fromMap(it, cardPrints)
|
return regionalSetList.mapIndexed { index, regionalSetMap ->
|
||||||
|
val cardPrintsForSet = cardPrintGroups.getOrElse(index) { emptyList() }
|
||||||
|
RegionalSet.fromMap(regionalSetMap, cardPrintsForSet)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Nested extraction is not available for regional sets.
 *
 * None of the arguments are inspected; the call fails unconditionally.
 *
 * @throws NotImplementedException always
 */
override fun extractNestedMultiples(
    element: Element,
    providerConfig: ProviderConfig,
    extractionConfig: SetScrapeTargetConfig
): List<List<RegionalSet>> = throw NotImplementedException("Not implemented")
|
||||||
}
|
}
|
||||||
@@ -36,7 +36,15 @@ class SetExtractionService(
|
|||||||
element: Element,
|
element: Element,
|
||||||
providerConfig: ProviderConfig,
|
providerConfig: ProviderConfig,
|
||||||
extractionConfig: SetScrapeTargetConfig
|
extractionConfig: SetScrapeTargetConfig
|
||||||
): Collection<CardSet> {
|
): List<CardSet> {
|
||||||
|
throw NotImplementedException("Not implemented")
|
||||||
|
}
|
||||||
|
|
||||||
|
override fun extractNestedMultiples(
|
||||||
|
element: Element,
|
||||||
|
providerConfig: ProviderConfig,
|
||||||
|
extractionConfig: SetScrapeTargetConfig
|
||||||
|
): List<List<CardSet>> {
|
||||||
throw NotImplementedException("Not implemented")
|
throw NotImplementedException("Not implemented")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -41,9 +41,10 @@ scraper:
|
|||||||
value: "//li/text()"
|
value: "//li/text()"
|
||||||
transform:
|
transform:
|
||||||
- name: "replace"
|
- name: "replace"
|
||||||
parameters:
|
parameters: [
|
||||||
- " ("
|
" (",
|
||||||
- ""
|
""
|
||||||
|
]
|
||||||
language:
|
language:
|
||||||
steps:
|
steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
@@ -76,12 +77,19 @@ scraper:
|
|||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./text()"
|
value: "./text()"
|
||||||
regional-name:
|
regional-name:
|
||||||
|
fallback:
|
||||||
|
default: "N/A"
|
||||||
steps:
|
steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./td/a[2]"
|
value: "./td[2]"
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./text()"
|
value: "./text()"
|
||||||
|
transform:
|
||||||
|
- name: "removeInnerQuotes"
|
||||||
|
parameters: []
|
||||||
rarity:
|
rarity:
|
||||||
|
fallback:
|
||||||
|
default: "N/A"
|
||||||
steps:
|
steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./td/a[3]"
|
value: "./td/a[3]"
|
||||||
|
|||||||
Reference in New Issue
Block a user