Basic multi-method extraction
This commit is contained in:
11
src/main/kotlin/com/rak/config/model/ExtractorConfig.kt
Normal file
11
src/main/kotlin/com/rak/config/model/ExtractorConfig.kt
Normal file
@@ -0,0 +1,11 @@
|
||||
package com.rak.config.model
|
||||
|
||||
import io.smallrye.config.WithName
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configuration mapping for a single extraction method of a scrape target field.
 *
 * A field may list several of these under its `extractors` key; each one carries
 * its own selector steps and an optional list of transformations to apply to the
 * extracted value.
 */
interface ExtractorConfig {
|
||||
/** Selector steps for this extractor, bound to the `steps` configuration key. */
@WithName("steps")
|
||||
fun getExtractionSteps(): List<ExtractConfig>
|
||||
/**
 * Transformation steps bound to the `transform` configuration key.
 * Empty when the key is not configured for this extractor.
 */
@WithName("transform")
|
||||
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
|
||||
}
|
||||
@@ -4,12 +4,12 @@ import io.smallrye.config.WithName
|
||||
import java.util.*
|
||||
|
||||
/**
 * Configuration mapping for one field of a scrape target.
 *
 * NOTE(review): this view appears to show both the field-level `steps`/`transform`
 * accessors and the `extractors` accessor side by side; confirm which of them are
 * meant to remain on the interface.
 */
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
|
||||
/** Declared type of the field, bound to the `type` configuration key. */
@WithName("type")
|
||||
fun getType(): String
|
||||
/** Optional root selector for this field, bound to the `root` configuration key. */
@WithName("root")
|
||||
fun getRootConfig(): Optional<ExtractConfig>
|
||||
/** Field-level selector steps, bound to the `steps` configuration key. */
@WithName("steps")
|
||||
fun getExtractionSteps(): List<ExtractConfig>
|
||||
/** Optional field-level transformation steps, bound to the `transform` configuration key. */
@WithName("transform")
|
||||
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
|
||||
/** Extraction methods configured for this field, bound to the `extractors` configuration key. */
@WithName("extractors")
|
||||
fun getExtractionMethods(): List<ExtractorConfig>
|
||||
/** Optional fallback settings for this field, bound to the `fallback` configuration key. */
@WithName("fallback")
|
||||
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
|
||||
}
|
||||
@@ -10,6 +10,7 @@ import com.rak.model.exception.InvalidConfigurationException
|
||||
import com.rak.model.transform.TransformationRegistry
|
||||
import com.rak.util.CssUtil
|
||||
import com.rak.util.XPathUtil
|
||||
import io.quarkus.logging.Log
|
||||
import org.jsoup.nodes.Element
|
||||
import org.jsoup.select.Elements
|
||||
import java.util.Optional
|
||||
@@ -174,54 +175,67 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
root: Element,
|
||||
extractionConfig: ScrapeTargetFieldConfig
|
||||
): String? {
|
||||
val extractionSteps = extractionConfig.getExtractionSteps()
|
||||
val transformationSteps = extractionConfig.getOptionalTransformationSteps()
|
||||
|
||||
var currentElement: Element? = root.clone()
|
||||
val extractionMethods = extractionConfig.getExtractionMethods()
|
||||
var result: String? = null
|
||||
|
||||
try {
|
||||
for (index in 0 until extractionSteps.size) {
|
||||
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
|
||||
if (currentElement == null) {
|
||||
throw IllegalStateException()
|
||||
}
|
||||
|
||||
if (index == extractionSteps.size - 1) {
|
||||
result = when (currentStep.selectorType()) {
|
||||
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||
for(extractionMethod in extractionMethods) {
|
||||
val extractionSteps = extractionMethod.getExtractionSteps()
|
||||
val transformationSteps = extractionMethod.getOptionalTransformationSteps()
|
||||
|
||||
var currentElement: Element? = root.clone()
|
||||
var intermediateResult: String? = null
|
||||
|
||||
try {
|
||||
for (index in 0 until extractionSteps.size) {
|
||||
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
|
||||
if (currentElement == null) {
|
||||
throw IllegalStateException()
|
||||
}
|
||||
|
||||
if (index == extractionSteps.size - 1) {
|
||||
intermediateResult = when (currentStep.selectorType()) {
|
||||
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||
}
|
||||
}
|
||||
else {
|
||||
currentElement = when (currentStep.selectorType()) {
|
||||
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
currentElement = when (currentStep.selectorType()) {
|
||||
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (result == null) {
|
||||
throw ElementNotFoundException("Result could not be extracted")
|
||||
}
|
||||
|
||||
if (transformationSteps.isPresent) {
|
||||
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
|
||||
}
|
||||
} catch (ex: RuntimeException) {
|
||||
when (ex) {
|
||||
is ElementNotFoundException,
|
||||
is IllegalStateException -> {
|
||||
if (extractionConfig.getFallbackConfiguration().isPresent) {
|
||||
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||
} else {
|
||||
throw ex
|
||||
if (intermediateResult == null) {
|
||||
throw ElementNotFoundException("Result could not be extracted")
|
||||
} else {
|
||||
if (transformationSteps.isPresent) {
|
||||
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
|
||||
}
|
||||
|
||||
result = intermediateResult
|
||||
break
|
||||
}
|
||||
} catch (ex: RuntimeException) {
|
||||
when (ex) {
|
||||
is ElementNotFoundException,
|
||||
is IllegalStateException -> {
|
||||
// if (extractionConfig.getFallbackConfiguration().isPresent) {
|
||||
// intermediateResult = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||
// } else {
|
||||
// throw ex
|
||||
// }
|
||||
Log.warn("An extraction method failed")
|
||||
}
|
||||
else -> throw ex
|
||||
}
|
||||
else -> throw ex
|
||||
}
|
||||
}
|
||||
|
||||
if (result == null && extractionConfig.getFallbackConfiguration().isPresent) {
|
||||
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
package com.rak.util
|
||||
|
||||
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
|
||||
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
|
||||
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
|
||||
import com.rak.model.cc.CCIndexSuccessResponse
|
||||
|
||||
@@ -4,25 +4,25 @@ quarkus:
|
||||
|
||||
scraper:
|
||||
sources:
|
||||
- id: konami-official
|
||||
name: "Konami Official Database"
|
||||
domain: "yugioh-card.com"
|
||||
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||
targets:
|
||||
card:
|
||||
root:
|
||||
type: css
|
||||
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||
name:
|
||||
steps:
|
||||
- type: "css"
|
||||
value: "h1.product-title"
|
||||
- type: "xpath"
|
||||
value: "//h1[@itemprop='name']"
|
||||
attack:
|
||||
steps:
|
||||
- type: "css"
|
||||
value: ".atk-value"
|
||||
# - id: konami-official
|
||||
# name: "Konami Official Database"
|
||||
# domain: "yugioh-card.com"
|
||||
# url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||
# targets:
|
||||
# card:
|
||||
# root:
|
||||
# type: css
|
||||
# value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||
# name:
|
||||
# steps:
|
||||
# - type: "css"
|
||||
# value: "h1.product-title"
|
||||
# - type: "xpath"
|
||||
# value: "//h1[@itemprop='name']"
|
||||
# attack:
|
||||
# steps:
|
||||
# - type: "css"
|
||||
# value: ".atk-value"
|
||||
|
||||
- id: ygo-fandom
|
||||
name: "Yu-Gi-Oh Fandom Wiki"
|
||||
@@ -34,98 +34,113 @@ scraper:
|
||||
type: css
|
||||
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||
id:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "//li/text()"
|
||||
transform:
|
||||
- name: "replace"
|
||||
parameters: [
|
||||
" (",
|
||||
""
|
||||
]
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "//li/text()"
|
||||
transform:
|
||||
- name: "replace"
|
||||
parameters: [
|
||||
" (",
|
||||
""
|
||||
]
|
||||
language:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "//li/abbr"
|
||||
- type: xpath
|
||||
value: "//abbr/@title"
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "//li/abbr"
|
||||
- type: xpath
|
||||
value: "//abbr/@title"
|
||||
region-key:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "//li/abbr/text()"
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "//li/abbr/text()"
|
||||
card-print:
|
||||
multi: true
|
||||
discriminator:
|
||||
root:
|
||||
type: css
|
||||
value: ".wds-tab__content"
|
||||
root:
|
||||
type: css
|
||||
value: "table > tbody > tr:has(> td)"
|
||||
discriminator:
|
||||
type: string
|
||||
root:
|
||||
type: css
|
||||
value: ".wds-tab__content"
|
||||
id:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "./td/a[0]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[0]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
name:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "./td/a[1]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[1]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
regional-name:
|
||||
fallback:
|
||||
default: "N/A"
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "./td[2]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
transform:
|
||||
- name: "removeInnerQuotes"
|
||||
parameters: []
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td[2]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
transform:
|
||||
- name: "removeInnerQuotes"
|
||||
parameters: []
|
||||
rarity:
|
||||
fallback:
|
||||
default: "N/A"
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "./td/a[3]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
card:
|
||||
name:
|
||||
root:
|
||||
type: css
|
||||
value: ".cardTable"
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "./tbody/tr[3]/th/text()"
|
||||
description:
|
||||
root:
|
||||
type: css
|
||||
value: ".cardTable"
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "b:contains(Card descriptions)"
|
||||
type:
|
||||
root:
|
||||
type: css
|
||||
value: ".cardTable"
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "b:contains(Card descriptions)"
|
||||
attack:
|
||||
root:
|
||||
type: css
|
||||
value: ".cardTable"
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "b:contains(Card descriptions)"
|
||||
defense:
|
||||
root:
|
||||
type: css
|
||||
value: ".cardTable"
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "b:contains(Card descriptions)"
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[3]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
# card:
|
||||
# name:
|
||||
# root:
|
||||
# type: css
|
||||
# value: ".cardTable"
|
||||
# steps:
|
||||
# - type: "xpath"
|
||||
# value: "./tbody/tr[3]/th/text()"
|
||||
# description:
|
||||
# root:
|
||||
# type: css
|
||||
# value: ".cardTable"
|
||||
# steps:
|
||||
# - type: "xpath"
|
||||
# value: "b:contains(Card descriptions)"
|
||||
# type:
|
||||
# root:
|
||||
# type: css
|
||||
# value: ".cardTable"
|
||||
# steps:
|
||||
# - type: "xpath"
|
||||
# value: "b:contains(Card descriptions)"
|
||||
# attack:
|
||||
# root:
|
||||
# type: css
|
||||
# value: ".cardTable"
|
||||
# steps:
|
||||
# - type: "xpath"
|
||||
# value: "b:contains(Card descriptions)"
|
||||
# defense:
|
||||
# root:
|
||||
# type: css
|
||||
# value: ".cardTable"
|
||||
# steps:
|
||||
# - type: "xpath"
|
||||
# value: "b:contains(Card descriptions)"
|
||||
Reference in New Issue
Block a user