Basic multi-method extraction
This commit is contained in:
11
src/main/kotlin/com/rak/config/model/ExtractorConfig.kt
Normal file
11
src/main/kotlin/com/rak/config/model/ExtractorConfig.kt
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
import java.util.Optional
|
||||||
|
|
||||||
|
/**
 * Configuration of a single extraction method for a scrape target field.
 *
 * One method consists of an ordered list of selector steps and an optional
 * list of transformation steps applied to the extracted text.
 */
interface ExtractorConfig {
|
||||||
|
/**
 * Ordered selector steps of this extraction method, bound to the `steps`
 * configuration key.
 */
@WithName("steps")
|
||||||
|
fun getExtractionSteps(): List<ExtractConfig>
|
||||||
|
/**
 * Transformation steps for the extracted value, bound to the `transform`
 * configuration key; the [Optional] may be empty.
 */
@WithName("transform")
|
||||||
|
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
|
||||||
|
}
|
||||||
@@ -4,12 +4,12 @@ import io.smallrye.config.WithName
|
|||||||
import java.util.*
|
import java.util.*
|
||||||
|
|
||||||
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
|
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
|
||||||
|
@WithName("type")
|
||||||
|
fun getType(): String
|
||||||
@WithName("root")
|
@WithName("root")
|
||||||
fun getRootConfig(): Optional<ExtractConfig>
|
fun getRootConfig(): Optional<ExtractConfig>
|
||||||
@WithName("steps")
|
@WithName("extractors")
|
||||||
fun getExtractionSteps(): List<ExtractConfig>
|
fun getExtractionMethods(): List<ExtractorConfig>
|
||||||
@WithName("transform")
|
|
||||||
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
|
|
||||||
@WithName("fallback")
|
@WithName("fallback")
|
||||||
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
|
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
|
||||||
}
|
}
|
||||||
@@ -10,6 +10,7 @@ import com.rak.model.exception.InvalidConfigurationException
|
|||||||
import com.rak.model.transform.TransformationRegistry
|
import com.rak.model.transform.TransformationRegistry
|
||||||
import com.rak.util.CssUtil
|
import com.rak.util.CssUtil
|
||||||
import com.rak.util.XPathUtil
|
import com.rak.util.XPathUtil
|
||||||
|
import io.quarkus.logging.Log
|
||||||
import org.jsoup.nodes.Element
|
import org.jsoup.nodes.Element
|
||||||
import org.jsoup.select.Elements
|
import org.jsoup.select.Elements
|
||||||
import java.util.Optional
|
import java.util.Optional
|
||||||
@@ -174,54 +175,67 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
|||||||
root: Element,
|
root: Element,
|
||||||
extractionConfig: ScrapeTargetFieldConfig
|
extractionConfig: ScrapeTargetFieldConfig
|
||||||
): String? {
|
): String? {
|
||||||
val extractionSteps = extractionConfig.getExtractionSteps()
|
val extractionMethods = extractionConfig.getExtractionMethods()
|
||||||
val transformationSteps = extractionConfig.getOptionalTransformationSteps()
|
|
||||||
|
|
||||||
var currentElement: Element? = root.clone()
|
|
||||||
var result: String? = null
|
var result: String? = null
|
||||||
|
|
||||||
try {
|
|
||||||
for (index in 0 until extractionSteps.size) {
|
|
||||||
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
|
|
||||||
if (currentElement == null) {
|
|
||||||
throw IllegalStateException()
|
|
||||||
}
|
|
||||||
|
|
||||||
if (index == extractionSteps.size - 1) {
|
for(extractionMethod in extractionMethods) {
|
||||||
result = when (currentStep.selectorType()) {
|
val extractionSteps = extractionMethod.getExtractionSteps()
|
||||||
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
|
val transformationSteps = extractionMethod.getOptionalTransformationSteps()
|
||||||
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
|
|
||||||
|
var currentElement: Element? = root.clone()
|
||||||
|
var intermediateResult: String? = null
|
||||||
|
|
||||||
|
try {
|
||||||
|
for (index in 0 until extractionSteps.size) {
|
||||||
|
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
|
||||||
|
if (currentElement == null) {
|
||||||
|
throw IllegalStateException()
|
||||||
|
}
|
||||||
|
|
||||||
|
if (index == extractionSteps.size - 1) {
|
||||||
|
intermediateResult = when (currentStep.selectorType()) {
|
||||||
|
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||||
|
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
currentElement = when (currentStep.selectorType()) {
|
||||||
|
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||||
|
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
currentElement = when (currentStep.selectorType()) {
|
|
||||||
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
|
|
||||||
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (result == null) {
|
if (intermediateResult == null) {
|
||||||
throw ElementNotFoundException("Result could not be extracted")
|
throw ElementNotFoundException("Result could not be extracted")
|
||||||
}
|
} else {
|
||||||
|
if (transformationSteps.isPresent) {
|
||||||
if (transformationSteps.isPresent) {
|
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
|
||||||
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
|
|
||||||
}
|
|
||||||
} catch (ex: RuntimeException) {
|
|
||||||
when (ex) {
|
|
||||||
is ElementNotFoundException,
|
|
||||||
is IllegalStateException -> {
|
|
||||||
if (extractionConfig.getFallbackConfiguration().isPresent) {
|
|
||||||
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
|
||||||
} else {
|
|
||||||
throw ex
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
result = intermediateResult
|
||||||
|
break
|
||||||
|
}
|
||||||
|
} catch (ex: RuntimeException) {
|
||||||
|
when (ex) {
|
||||||
|
is ElementNotFoundException,
|
||||||
|
is IllegalStateException -> {
|
||||||
|
// if (extractionConfig.getFallbackConfiguration().isPresent) {
|
||||||
|
// intermediateResult = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||||
|
// } else {
|
||||||
|
// throw ex
|
||||||
|
// }
|
||||||
|
Log.warn("An extraction method failed")
|
||||||
|
}
|
||||||
|
else -> throw ex
|
||||||
}
|
}
|
||||||
else -> throw ex
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (result == null && extractionConfig.getFallbackConfiguration().isPresent) {
|
||||||
|
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||||
|
}
|
||||||
|
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
package com.rak.util
|
package com.rak.util
|
||||||
|
|
||||||
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
|
|
||||||
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
|
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
|
||||||
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
|
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
|
||||||
import com.rak.model.cc.CCIndexSuccessResponse
|
import com.rak.model.cc.CCIndexSuccessResponse
|
||||||
|
|||||||
@@ -4,25 +4,25 @@ quarkus:
|
|||||||
|
|
||||||
scraper:
|
scraper:
|
||||||
sources:
|
sources:
|
||||||
- id: konami-official
|
# - id: konami-official
|
||||||
name: "Konami Official Database"
|
# name: "Konami Official Database"
|
||||||
domain: "yugioh-card.com"
|
# domain: "yugioh-card.com"
|
||||||
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
# url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||||
targets:
|
# targets:
|
||||||
card:
|
# card:
|
||||||
root:
|
# root:
|
||||||
type: css
|
# type: css
|
||||||
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
# value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||||
name:
|
# name:
|
||||||
steps:
|
# steps:
|
||||||
- type: "css"
|
# - type: "css"
|
||||||
value: "h1.product-title"
|
# value: "h1.product-title"
|
||||||
- type: "xpath"
|
# - type: "xpath"
|
||||||
value: "//h1[@itemprop='name']"
|
# value: "//h1[@itemprop='name']"
|
||||||
attack:
|
# attack:
|
||||||
steps:
|
# steps:
|
||||||
- type: "css"
|
# - type: "css"
|
||||||
value: ".atk-value"
|
# value: ".atk-value"
|
||||||
|
|
||||||
- id: ygo-fandom
|
- id: ygo-fandom
|
||||||
name: "Yu-Gi-Oh Fandom Wiki"
|
name: "Yu-Gi-Oh Fandom Wiki"
|
||||||
@@ -34,98 +34,113 @@ scraper:
|
|||||||
type: css
|
type: css
|
||||||
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||||
id:
|
id:
|
||||||
steps:
|
type: int
|
||||||
- type: xpath
|
extractors:
|
||||||
value: "//li/text()"
|
- steps:
|
||||||
transform:
|
- type: xpath
|
||||||
- name: "replace"
|
value: "//li/text()"
|
||||||
parameters: [
|
transform:
|
||||||
" (",
|
- name: "replace"
|
||||||
""
|
parameters: [
|
||||||
]
|
" (",
|
||||||
|
""
|
||||||
|
]
|
||||||
language:
|
language:
|
||||||
steps:
|
type: int
|
||||||
- type: xpath
|
extractors:
|
||||||
value: "//li/abbr"
|
- steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "//abbr/@title"
|
value: "//li/abbr"
|
||||||
|
- type: xpath
|
||||||
|
value: "//abbr/@title"
|
||||||
region-key:
|
region-key:
|
||||||
steps:
|
type: int
|
||||||
- type: xpath
|
extractors:
|
||||||
value: "//li/abbr/text()"
|
- steps:
|
||||||
|
- type: xpath
|
||||||
|
value: "//li/abbr/text()"
|
||||||
card-print:
|
card-print:
|
||||||
multi: true
|
multi: true
|
||||||
discriminator:
|
|
||||||
root:
|
|
||||||
type: css
|
|
||||||
value: ".wds-tab__content"
|
|
||||||
root:
|
root:
|
||||||
type: css
|
type: css
|
||||||
value: "table > tbody > tr:has(> td)"
|
value: "table > tbody > tr:has(> td)"
|
||||||
|
discriminator:
|
||||||
|
type: string
|
||||||
|
root:
|
||||||
|
type: css
|
||||||
|
value: ".wds-tab__content"
|
||||||
id:
|
id:
|
||||||
steps:
|
type: int
|
||||||
- type: xpath
|
extractors:
|
||||||
value: "./td/a[0]"
|
- steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./text()"
|
value: "./td/a[0]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
name:
|
name:
|
||||||
steps:
|
type: int
|
||||||
- type: xpath
|
extractors:
|
||||||
value: "./td/a[1]"
|
- steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./text()"
|
value: "./td/a[1]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
regional-name:
|
regional-name:
|
||||||
fallback:
|
fallback:
|
||||||
default: "N/A"
|
default: "N/A"
|
||||||
steps:
|
type: int
|
||||||
- type: xpath
|
extractors:
|
||||||
value: "./td[2]"
|
- steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./text()"
|
value: "./td[2]"
|
||||||
transform:
|
- type: xpath
|
||||||
- name: "removeInnerQuotes"
|
value: "./text()"
|
||||||
parameters: []
|
transform:
|
||||||
|
- name: "removeInnerQuotes"
|
||||||
|
parameters: []
|
||||||
rarity:
|
rarity:
|
||||||
fallback:
|
fallback:
|
||||||
default: "N/A"
|
default: "N/A"
|
||||||
steps:
|
type: int
|
||||||
- type: xpath
|
extractors:
|
||||||
value: "./td/a[3]"
|
- steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "./text()"
|
value: "./td/a[3]"
|
||||||
card:
|
- type: xpath
|
||||||
name:
|
value: "./text()"
|
||||||
root:
|
# card:
|
||||||
type: css
|
# name:
|
||||||
value: ".cardTable"
|
# root:
|
||||||
steps:
|
# type: css
|
||||||
- type: "xpath"
|
# value: ".cardTable"
|
||||||
value: "./tbody/tr[3]/th/text()"
|
# steps:
|
||||||
description:
|
# - type: "xpath"
|
||||||
root:
|
# value: "./tbody/tr[3]/th/text()"
|
||||||
type: css
|
# description:
|
||||||
value: ".cardTable"
|
# root:
|
||||||
steps:
|
# type: css
|
||||||
- type: "xpath"
|
# value: ".cardTable"
|
||||||
value: "b:contains(Card descriptions)"
|
# steps:
|
||||||
type:
|
# - type: "xpath"
|
||||||
root:
|
# value: "b:contains(Card descriptions)"
|
||||||
type: css
|
# type:
|
||||||
value: ".cardTable"
|
# root:
|
||||||
steps:
|
# type: css
|
||||||
- type: "xpath"
|
# value: ".cardTable"
|
||||||
value: "b:contains(Card descriptions)"
|
# steps:
|
||||||
attack:
|
# - type: "xpath"
|
||||||
root:
|
# value: "b:contains(Card descriptions)"
|
||||||
type: css
|
# attack:
|
||||||
value: ".cardTable"
|
# root:
|
||||||
steps:
|
# type: css
|
||||||
- type: "xpath"
|
# value: ".cardTable"
|
||||||
value: "b:contains(Card descriptions)"
|
# steps:
|
||||||
defense:
|
# - type: "xpath"
|
||||||
root:
|
# value: "b:contains(Card descriptions)"
|
||||||
type: css
|
# defense:
|
||||||
value: ".cardTable"
|
# root:
|
||||||
steps:
|
# type: css
|
||||||
- type: "xpath"
|
# value: ".cardTable"
|
||||||
value: "b:contains(Card descriptions)"
|
# steps:
|
||||||
|
# - type: "xpath"
|
||||||
|
# value: "b:contains(Card descriptions)"
|
||||||
Reference in New Issue
Block a user