Compare commits
4 Commits
108b4c4c19
...
5930da7a4c
| Author | SHA1 | Date | |
|---|---|---|---|
| 5930da7a4c | |||
| 8a0777e557 | |||
| 2a79218a54 | |||
| ee4ce4fd65 |
19
src/main/kotlin/com/rak/config/converter/PatternConverter.kt
Normal file
19
src/main/kotlin/com/rak/config/converter/PatternConverter.kt
Normal file
@@ -0,0 +1,19 @@
|
||||
package com.rak.config.converter
|
||||
|
||||
import org.eclipse.microprofile.config.spi.Converter
|
||||
import java.util.regex.Pattern
|
||||
import java.util.regex.PatternSyntaxException
|
||||
|
||||
/**
 * MicroProfile Config [Converter] that compiles a configuration value into a [Pattern].
 *
 * Blank values and values that are not valid regular expressions are both rejected with an
 * [IllegalArgumentException], the failure type the [Converter] contract defines for values
 * that cannot be converted.
 */
class PatternConverter : Converter<Pattern> {
    /**
     * Compiles [value] into a [Pattern].
     *
     * @param value the raw configuration value
     * @return the compiled pattern
     * @throws IllegalArgumentException if [value] is blank or is not a valid regular expression
     */
    override fun convert(value: String): Pattern {
        require(value.isNotBlank()) { "Pattern may not be empty" }

        return try {
            Pattern.compile(value)
        } catch (ex: PatternSyntaxException) {
            // Keep the syntax error as the cause so its description and index are not lost.
            throw IllegalArgumentException("'$value' is not a valid RegEx pattern", ex)
        }
    }
}
|
||||
11
src/main/kotlin/com/rak/config/model/ExtractorConfig.kt
Normal file
11
src/main/kotlin/com/rak/config/model/ExtractorConfig.kt
Normal file
@@ -0,0 +1,11 @@
|
||||
package com.rak.config.model
|
||||
|
||||
import io.smallrye.config.WithName
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configuration of a single extractor: a list of extraction steps plus an optional list of
 * transformation steps.
 */
interface ExtractorConfig {
    /** Extraction steps, mapped from the `steps` key. */
    @WithName("steps")
    fun getExtractionSteps(): List<ExtractConfig>

    /** Transformation steps, mapped from the `transform` key and wrapped in an [Optional]. */
    @WithName("transform")
    fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
}
|
||||
@@ -0,0 +1,12 @@
|
||||
package com.rak.config.model
|
||||
|
||||
import io.smallrye.config.WithName
|
||||
|
||||
/**
 * Scrape target configuration for regional sets, exposing one field configuration each for
 * the `id`, `language` and `region-key` keys.
 */
interface RegionalSetScrapeTargetConfig : AbstractScrapeTargetConfig {
    /** Field configuration mapped from the `id` key. */
    @WithName("id")
    fun getIdConfig(): ScrapeTargetFieldConfig

    /** Field configuration mapped from the `language` key. */
    @WithName("language")
    fun getLanguageConfig(): ScrapeTargetFieldConfig

    /** Field configuration mapped from the `region-key` key. */
    @WithName("region-key")
    fun getRegionKeyConfig(): ScrapeTargetFieldConfig
}
|
||||
@@ -4,12 +4,14 @@ import io.smallrye.config.WithName
|
||||
import java.util.*
|
||||
|
||||
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
|
||||
@WithName("type")
|
||||
fun getType(): String
|
||||
@WithName("root")
|
||||
fun getRootConfig(): Optional<ExtractConfig>
|
||||
@WithName("steps")
|
||||
fun getExtractionSteps(): List<ExtractConfig>
|
||||
@WithName("transform")
|
||||
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
|
||||
@WithName("extractors")
|
||||
fun getExtractionMethods(): List<ExtractorConfig>
|
||||
@WithName("fallback")
|
||||
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
|
||||
@WithName("validation")
|
||||
fun getOptionalValidation(): Optional<ValidationConfig>
|
||||
}
|
||||
@@ -3,10 +3,6 @@ package com.rak.config.model
|
||||
import io.smallrye.config.WithName
|
||||
|
||||
interface SetScrapeTargetConfig : AbstractScrapeTargetConfig {
|
||||
@WithName("id")
|
||||
fun getIdConfig(): ScrapeTargetFieldConfig
|
||||
@WithName("language")
|
||||
fun getLanguageConfig(): ScrapeTargetFieldConfig
|
||||
@WithName("region-key")
|
||||
fun getRegionKeyConfig(): ScrapeTargetFieldConfig
|
||||
@WithName("name")
|
||||
fun getNameConfig(): ScrapeTargetFieldConfig
|
||||
}
|
||||
@@ -8,6 +8,8 @@ interface TargetsConfig {
|
||||
fun getCardConfig(): Optional<CardScrapeTargetConfig>
|
||||
@WithName("set")
|
||||
fun getSetConfig(): Optional<SetScrapeTargetConfig>
|
||||
@WithName("regional-set")
|
||||
fun getRegionalSetConfig(): Optional<RegionalSetScrapeTargetConfig>
|
||||
@WithName("card-print")
|
||||
fun getCardPrintConfiguration(): Optional<CardPrintScrapeTargetConfig>
|
||||
}
|
||||
12
src/main/kotlin/com/rak/config/model/ValidationConfig.kt
Normal file
12
src/main/kotlin/com/rak/config/model/ValidationConfig.kt
Normal file
@@ -0,0 +1,12 @@
|
||||
package com.rak.config.model
|
||||
|
||||
import com.rak.config.converter.PatternConverter
|
||||
import io.smallrye.config.WithConverter
|
||||
import io.smallrye.config.WithName
|
||||
import java.util.regex.Pattern
|
||||
|
||||
/** Validation settings: the regular expressions a value is checked against. */
interface ValidationConfig {
    /**
     * Regular expressions mapped from the `pattern` key; each entry is converted to a
     * [Pattern] by [PatternConverter].
     */
    @WithName("pattern")
    @WithConverter(PatternConverter::class)
    fun getRegexPatterns(): MutableList<Pattern>
}
|
||||
@@ -0,0 +1,3 @@
|
||||
package com.rak.model.exception
|
||||
|
||||
/**
 * Unchecked exception signalling that a value failed validation.
 *
 * @param message description of the validation failure
 */
class ValueValidationException(message: String) : RuntimeException(message)
|
||||
@@ -1,12 +1,15 @@
|
||||
package com.rak.model.set
|
||||
|
||||
import kotlin.collections.Set
|
||||
|
||||
data class CardSet(
|
||||
var name: String,
|
||||
val regionalSets: Set<RegionalSet>
|
||||
) {
|
||||
companion object {
|
||||
|
||||
fun fromMap(map: Map<String, String>, regionalSet: Set<RegionalSet>): CardSet {
|
||||
return CardSet(
|
||||
map["name"] ?: throw IllegalStateException("Parameter 'name' not found"),
|
||||
regionalSet
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -22,8 +22,11 @@ class TransformationRegistry {
|
||||
input.replace(parameters[0], parameters[1])
|
||||
}
|
||||
// Replaces every match of the regular expression in params[0] with params[1].
// When only the pattern is supplied, matches are replaced with the empty string.
register("regexReplace") { input, params ->
    require(params.size == 1 || params.size == 2) {
        "'regexReplace' requires either 1 or 2 parameters"
    }
    // Default the replacement locally instead of appending to `params`, so the list handed
    // in by the caller is never mutated.
    val replacement = params.getOrElse(1) { "" }
    input.replace(params[0].toRegex(), replacement)
}
|
||||
|
||||
@@ -7,7 +7,7 @@ import com.rak.model.exception.TargetNotFoundException
|
||||
import com.rak.model.set.CardSet
|
||||
import com.rak.model.set.RegionalSet
|
||||
import com.rak.service.extract.RegionalSetExtractionService
|
||||
import com.rak.service.extract.SetExtractionService
|
||||
import com.rak.service.extract.CardSetExtractionService
|
||||
import io.quarkus.logging.Log
|
||||
import jakarta.enterprise.context.ApplicationScoped
|
||||
import org.jsoup.Jsoup
|
||||
@@ -17,7 +17,7 @@ import java.lang.Exception
|
||||
@ApplicationScoped
|
||||
class ScrapeService(
|
||||
private val sourceService: SourceService,
|
||||
private val setExtractionService: SetExtractionService,
|
||||
private val cardSetExtractionService: CardSetExtractionService,
|
||||
private val regionalSetExtractionService: RegionalSetExtractionService,
|
||||
private val commonCrawlService: CommonCrawlService
|
||||
) {
|
||||
@@ -59,13 +59,11 @@ class ScrapeService(
|
||||
}
|
||||
}
|
||||
|
||||
return setExtractionService.extract(
|
||||
return cardSetExtractionService.extract(
|
||||
document,
|
||||
source,
|
||||
source.getTargets().getSetConfig().get()
|
||||
).apply {
|
||||
name = setName
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
fun scrapeRegionalSet(
|
||||
@@ -77,7 +75,7 @@ class ScrapeService(
|
||||
val path: String = normalizePath(setName)
|
||||
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
|
||||
|
||||
return regionalSetExtractionService.extract(document, source, source.getTargets().getSetConfig().get())
|
||||
return regionalSetExtractionService.extract(document, source, source.getTargets().getRegionalSetConfig().get())
|
||||
}
|
||||
|
||||
fun scrapeCard(
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
package com.rak.service
|
||||
|
||||
import com.rak.config.model.CardScrapeTargetConfig
|
||||
import com.rak.config.model.SetScrapeTargetConfig
|
||||
import com.rak.config.model.RegionalSetScrapeTargetConfig
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.SourcesConfig
|
||||
import com.rak.model.exception.InvalidConfigurationException
|
||||
@@ -21,7 +21,7 @@ class SourceService(
|
||||
}
|
||||
|
||||
private fun validateSource(providerConfig: ProviderConfig) {
|
||||
val optionalRegionalSetConfig = providerConfig.getTargets().getSetConfig()
|
||||
val optionalRegionalSetConfig = providerConfig.getTargets().getRegionalSetConfig()
|
||||
val optionalCardConfig = providerConfig.getTargets().getCardConfig()
|
||||
|
||||
if (optionalRegionalSetConfig.isPresent) {
|
||||
@@ -33,7 +33,7 @@ class SourceService(
|
||||
}
|
||||
}
|
||||
|
||||
private fun validateSetExtractConfig(setExtractConfig: SetScrapeTargetConfig) {
|
||||
private fun validateSetExtractConfig(setExtractConfig: RegionalSetScrapeTargetConfig) {
|
||||
val selectors = listOf(
|
||||
setExtractConfig.getLanguageConfig(),
|
||||
setExtractConfig.getIdConfig(),
|
||||
|
||||
@@ -1,18 +1,17 @@
|
||||
package com.rak.service.extract
|
||||
|
||||
import com.rak.config.model.AbstractScrapeTargetConfig
|
||||
import com.rak.config.model.ExtractConfig
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
import com.rak.config.model.*
|
||||
import com.rak.model.Selector
|
||||
import com.rak.model.exception.ElementNotFoundException
|
||||
import com.rak.model.exception.InvalidConfigurationException
|
||||
import com.rak.model.exception.ValueValidationException
|
||||
import com.rak.model.transform.TransformationRegistry
|
||||
import com.rak.util.CssUtil
|
||||
import com.rak.util.XPathUtil
|
||||
import io.quarkus.logging.Log
|
||||
import org.jsoup.nodes.Element
|
||||
import org.jsoup.select.Elements
|
||||
import java.util.Optional
|
||||
import java.util.*
|
||||
import kotlin.jvm.optionals.getOrElse
|
||||
|
||||
// find root element from global or node config
|
||||
@@ -174,11 +173,16 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
root: Element,
|
||||
extractionConfig: ScrapeTargetFieldConfig
|
||||
): String? {
|
||||
val extractionSteps = extractionConfig.getExtractionSteps()
|
||||
val transformationSteps = extractionConfig.getOptionalTransformationSteps()
|
||||
val extractionMethods = extractionConfig.getExtractionMethods()
|
||||
var result: String? = null
|
||||
|
||||
|
||||
for(extractionMethod in extractionMethods) {
|
||||
val extractionSteps = extractionMethod.getExtractionSteps()
|
||||
val transformationSteps = extractionMethod.getOptionalTransformationSteps()
|
||||
|
||||
var currentElement: Element? = root.clone()
|
||||
var result: String? = null
|
||||
var intermediateResult: String? = null
|
||||
|
||||
try {
|
||||
for (index in 0 until extractionSteps.size) {
|
||||
@@ -188,7 +192,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
}
|
||||
|
||||
if (index == extractionSteps.size - 1) {
|
||||
result = when (currentStep.selectorType()) {
|
||||
intermediateResult = when (currentStep.selectorType()) {
|
||||
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
|
||||
}
|
||||
@@ -201,29 +205,61 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
}
|
||||
}
|
||||
|
||||
if (result == null) {
|
||||
if (intermediateResult == null) {
|
||||
throw ElementNotFoundException("Result could not be extracted")
|
||||
} else {
|
||||
try {
|
||||
validateValue(intermediateResult, extractionConfig.getOptionalValidation())
|
||||
} catch (ex: ValueValidationException) {
|
||||
throw ex
|
||||
}
|
||||
|
||||
if (transformationSteps.isPresent) {
|
||||
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
|
||||
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
|
||||
}
|
||||
|
||||
result = intermediateResult
|
||||
break
|
||||
}
|
||||
} catch (ex: RuntimeException) {
|
||||
when (ex) {
|
||||
is ElementNotFoundException,
|
||||
is IllegalStateException -> {
|
||||
if (extractionConfig.getFallbackConfiguration().isPresent) {
|
||||
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||
} else {
|
||||
throw ex
|
||||
}
|
||||
// if (extractionConfig.getFallbackConfiguration().isPresent) {
|
||||
// intermediateResult = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||
// } else {
|
||||
// throw ex
|
||||
// }
|
||||
}
|
||||
is ValueValidationException -> Log.warn(ex.message)
|
||||
else -> throw ex
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (result == null && extractionConfig.getFallbackConfiguration().isPresent) {
|
||||
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
/**
 * Validates [value] against the regular expressions of [validationConfig].
 *
 * Returns without doing anything when [validationConfig] is empty. Otherwise [value] must
 * fully match every configured pattern.
 *
 * @param value the value to validate
 * @param validationConfig optional validation settings holding the patterns
 * @throws ValueValidationException if [value] does not match at least one of the patterns
 */
private fun validateValue(value: String, validationConfig: Optional<ValidationConfig>) {
    // Guard clause: nothing to validate against.
    val config = validationConfig.orElse(null) ?: return

    val matchesEveryPattern = config.getRegexPatterns().all { pattern ->
        pattern.matcher(value).matches()
    }

    if (!matchesEveryPattern) {
        throw ValueValidationException("'$value' does not validate against RegEx(s)")
    }
}
|
||||
|
||||
}
|
||||
@@ -9,15 +9,13 @@ import jakarta.enterprise.context.ApplicationScoped
|
||||
import org.jsoup.nodes.Element
|
||||
|
||||
@ApplicationScoped
|
||||
class SetExtractionService(
|
||||
class CardSetExtractionService(
|
||||
private val regionalSetExtractionService: RegionalSetExtractionService
|
||||
) : AbstractExtractionService<CardSet, SetScrapeTargetConfig>() {
|
||||
|
||||
override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> {
|
||||
return mapOf(
|
||||
Pair("prefix", this.getIdConfig()),
|
||||
Pair("regionCode", this.getRegionKeyConfig()),
|
||||
Pair("region", this.getLanguageConfig()),
|
||||
Pair("name", this.getNameConfig()),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -26,9 +24,15 @@ class SetExtractionService(
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
): CardSet {
|
||||
return CardSet(
|
||||
"test",
|
||||
regionalSetExtractionService.extractMultiple(element, providerConfig, extractionConfig).toSet()
|
||||
val set = extractSingle(element, extractionConfig)
|
||||
|
||||
return CardSet.fromMap(
|
||||
set,
|
||||
regionalSetExtractionService.extractMultiple(
|
||||
element,
|
||||
providerConfig,
|
||||
providerConfig.getTargets().getRegionalSetConfig().get()
|
||||
).toSet()
|
||||
)
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ package com.rak.service.extract
|
||||
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
import com.rak.config.model.SetScrapeTargetConfig
|
||||
import com.rak.config.model.RegionalSetScrapeTargetConfig
|
||||
import com.rak.config.model.SourcesConfig
|
||||
import com.rak.model.card.CardPrint
|
||||
import com.rak.model.exception.NotImplementedException
|
||||
@@ -14,9 +14,9 @@ import org.jsoup.nodes.Element
|
||||
class RegionalSetExtractionService(
|
||||
private val cardPrintExtractionService: CardPrintExtractionService,
|
||||
private val sourcesConfig: SourcesConfig
|
||||
) : AbstractExtractionService<RegionalSet, SetScrapeTargetConfig>() {
|
||||
) : AbstractExtractionService<RegionalSet, RegionalSetScrapeTargetConfig>() {
|
||||
|
||||
override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> {
|
||||
override fun RegionalSetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> {
|
||||
return mapOf(
|
||||
Pair("prefix", this.getIdConfig()),
|
||||
Pair("regionCode", this.getRegionKeyConfig()),
|
||||
@@ -27,7 +27,7 @@ class RegionalSetExtractionService(
|
||||
override fun extract(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
extractionConfig: RegionalSetScrapeTargetConfig
|
||||
): RegionalSet {
|
||||
throw NotImplementedException("Not implemented")
|
||||
}
|
||||
@@ -35,7 +35,7 @@ class RegionalSetExtractionService(
|
||||
override fun extractMultiple(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
extractionConfig: RegionalSetScrapeTargetConfig
|
||||
): List<RegionalSet> {
|
||||
val regionalSetList = extractMulti(element, extractionConfig)
|
||||
|
||||
@@ -55,7 +55,7 @@ class RegionalSetExtractionService(
|
||||
override fun extractNestedMultiples(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
extractionConfig: RegionalSetScrapeTargetConfig
|
||||
): List<List<RegionalSet>> {
|
||||
throw NotImplementedException("Not implemented")
|
||||
}
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
package com.rak.util
|
||||
|
||||
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
|
||||
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
|
||||
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
|
||||
import com.rak.model.cc.CCIndexSuccessResponse
|
||||
|
||||
@@ -4,6 +4,7 @@ import com.rak.model.XPathTarget
|
||||
import org.jsoup.nodes.Element
|
||||
import org.jsoup.nodes.TextNode
|
||||
import org.jsoup.select.Elements
|
||||
import java.util.regex.Pattern
|
||||
import kotlin.coroutines.CoroutineContext
|
||||
|
||||
class XPathUtil private constructor() {
|
||||
@@ -40,8 +41,8 @@ class XPathUtil private constructor() {
|
||||
|
||||
private fun extractTextFromNode(root: Element, xpath: String): String? {
|
||||
return root
|
||||
.selectXpath(xpath, TextNode::class.java)
|
||||
.firstOrNull()?.text()
|
||||
.selectXpath(xpath.replace("/text()", ""))
|
||||
.text()
|
||||
}
|
||||
|
||||
fun getNextElement(element: Element, path: String): Element? {
|
||||
|
||||
@@ -4,25 +4,25 @@ quarkus:
|
||||
|
||||
scraper:
|
||||
sources:
|
||||
- id: konami-official
|
||||
name: "Konami Official Database"
|
||||
domain: "yugioh-card.com"
|
||||
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||
targets:
|
||||
card:
|
||||
root:
|
||||
type: css
|
||||
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||
name:
|
||||
steps:
|
||||
- type: "css"
|
||||
value: "h1.product-title"
|
||||
- type: "xpath"
|
||||
value: "//h1[@itemprop='name']"
|
||||
attack:
|
||||
steps:
|
||||
- type: "css"
|
||||
value: ".atk-value"
|
||||
# - id: konami-official
|
||||
# name: "Konami Official Database"
|
||||
# domain: "yugioh-card.com"
|
||||
# url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||
# targets:
|
||||
# card:
|
||||
# root:
|
||||
# type: css
|
||||
# value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||
# name:
|
||||
# steps:
|
||||
# - type: "css"
|
||||
# value: "h1.product-title"
|
||||
# - type: "xpath"
|
||||
# value: "//h1[@itemprop='name']"
|
||||
# attack:
|
||||
# steps:
|
||||
# - type: "css"
|
||||
# value: ".atk-value"
|
||||
|
||||
- id: ygo-fandom
|
||||
name: "Yu-Gi-Oh Fandom Wiki"
|
||||
@@ -30,54 +30,98 @@ scraper:
|
||||
url-pattern: "https://yugioh.fandom.com/wiki/%s"
|
||||
targets:
|
||||
set:
|
||||
root:
|
||||
type: css
|
||||
value: "aside > .pi-title"
|
||||
name:
|
||||
type: string
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "//h2/text()"
|
||||
regional-set:
|
||||
root:
|
||||
type: css
|
||||
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||
id:
|
||||
steps:
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "//li/text()"
|
||||
transform:
|
||||
- name: "replace"
|
||||
- name: "regexReplace"
|
||||
parameters: [
|
||||
" (",
|
||||
" *\\(.+\\)",
|
||||
""
|
||||
]
|
||||
language:
|
||||
steps:
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "//li/abbr"
|
||||
- type: xpath
|
||||
value: "//abbr/@title"
|
||||
region-key:
|
||||
steps:
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "//li/abbr/text()"
|
||||
card-print:
|
||||
multi: true
|
||||
discriminator:
|
||||
root:
|
||||
type: css
|
||||
value: ".wds-tab__content"
|
||||
root:
|
||||
type: css
|
||||
value: "table > tbody > tr:has(> td)"
|
||||
discriminator:
|
||||
type: string
|
||||
root:
|
||||
type: css
|
||||
value: ".wds-tab__content"
|
||||
id:
|
||||
steps:
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[0]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
name:
|
||||
steps:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[1]"
|
||||
value: "./td/span/text()"
|
||||
transform:
|
||||
- name: "regexReplace"
|
||||
parameters: [
|
||||
" .+",
|
||||
""
|
||||
]
|
||||
validation:
|
||||
pattern: "^.+-.+\\\\d.+$"
|
||||
name:
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td[1]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
transform:
|
||||
- name: "regexReplace"
|
||||
parameters: [
|
||||
"\\(.+\\)",
|
||||
""
|
||||
]
|
||||
- name: "removeInnerQuotes"
|
||||
parameters: []
|
||||
validation:
|
||||
pattern: "^\".+\".*"
|
||||
regional-name:
|
||||
fallback:
|
||||
default: "N/A"
|
||||
steps:
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td[2]"
|
||||
- type: xpath
|
||||
@@ -85,47 +129,63 @@ scraper:
|
||||
transform:
|
||||
- name: "removeInnerQuotes"
|
||||
parameters: []
|
||||
validation:
|
||||
pattern: "^\".+\"$"
|
||||
rarity:
|
||||
fallback:
|
||||
default: "N/A"
|
||||
steps:
|
||||
type: int
|
||||
extractors:
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[3]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
card:
|
||||
name:
|
||||
root:
|
||||
type: css
|
||||
value: ".cardTable"
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "./tbody/tr[3]/th/text()"
|
||||
description:
|
||||
root:
|
||||
type: css
|
||||
value: ".cardTable"
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "b:contains(Card descriptions)"
|
||||
type:
|
||||
root:
|
||||
type: css
|
||||
value: ".cardTable"
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "b:contains(Card descriptions)"
|
||||
attack:
|
||||
root:
|
||||
type: css
|
||||
value: ".cardTable"
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "b:contains(Card descriptions)"
|
||||
defense:
|
||||
root:
|
||||
type: css
|
||||
value: ".cardTable"
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "b:contains(Card descriptions)"
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[2]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
- steps:
|
||||
- type: xpath
|
||||
value: "./td/a[1]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
validation:
|
||||
pattern: "^.*(Common|Rare|Print).*$"
|
||||
# card:
|
||||
# name:
|
||||
# root:
|
||||
# type: css
|
||||
# value: ".cardTable"
|
||||
# steps:
|
||||
# - type: "xpath"
|
||||
# value: "./tbody/tr[3]/th/text()"
|
||||
# description:
|
||||
# root:
|
||||
# type: css
|
||||
# value: ".cardTable"
|
||||
# steps:
|
||||
# - type: "xpath"
|
||||
# value: "b:contains(Card descriptions)"
|
||||
# type:
|
||||
# root:
|
||||
# type: css
|
||||
# value: ".cardTable"
|
||||
# steps:
|
||||
# - type: "xpath"
|
||||
# value: "b:contains(Card descriptions)"
|
||||
# attack:
|
||||
# root:
|
||||
# type: css
|
||||
# value: ".cardTable"
|
||||
# steps:
|
||||
# - type: "xpath"
|
||||
# value: "b:contains(Card descriptions)"
|
||||
# defense:
|
||||
# root:
|
||||
# type: css
|
||||
# value: ".cardTable"
|
||||
# steps:
|
||||
# - type: "xpath"
|
||||
# value: "b:contains(Card descriptions)"
|
||||
Reference in New Issue
Block a user