Compare commits

...

8 Commits

Author SHA1 Message Date
7860819029 Add CI/CD 2025-07-15 19:14:54 +02:00
304490b52e Correct YGO Fandom name transformation regex 2025-07-06 15:05:51 +02:00
ce5b87c34e Minor model adjustments 2025-07-01 12:54:56 +02:00
a9f6efc818 Minor config adjustment 2025-07-01 12:54:32 +02:00
5930da7a4c Split Set/RegionalSet properly 2025-06-29 16:49:30 +02:00
8a0777e557 Minor config amend
Regards Set ID
2025-06-29 14:56:00 +02:00
2a79218a54 Add RegEx validation
Amend RegExReplace transformer
Amend transformations
2025-06-29 14:52:09 +02:00
ee4ce4fd65 Basic multi-method extraction 2025-06-29 13:21:18 +02:00
24 changed files with 414 additions and 214 deletions

View File

@@ -0,0 +1,32 @@
# Workflow that builds the project with Gradle and pushes the resulting container image.
name: Create and Push Release
on:
# Only the manual workflow_dispatch event is declared, so no push/PR triggers this workflow.
workflow_dispatch:
env:
# NOTE(review): none of the four variables below is referenced by the steps in this workflow — confirm they are still needed.
AUTHENTIK_URL: https://auth.smoothbrain.win
REGISTRY_URL: gitea.smoothbrain.win
IMAGE_OWNER: rak
IMAGE_NAME: dex-scraper-java
jobs:
release:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup JDK
# NOTE(review): the action is referenced by the moving "main" ref rather than a fixed tag/commit — confirm this is intended.
uses: https://gitea.smoothbrain.win/rak/setup-java@main
with:
distribution: 'corretto'
java-version: '21.0.6'
cache: 'gradle'
- name: Build & Push Image
env:
# Registry credentials are taken from repository secrets; presumably picked up by Quarkus as
# quarkus.container-image.username / quarkus.container-image.password — verify against the Quarkus docs.
QUARKUS_CONTAINER_IMAGE_USERNAME: ${{ secrets.CI_SERVICE_ACCOUNT }}
QUARKUS_CONTAINER_IMAGE_PASSWORD: ${{ secrets.CI_SERVICE_ACCOUNT_PASSWORD }}
# Runs a clean Gradle build with the container-image push flag enabled.
run: |
./gradlew clean build \
-Dquarkus.container-image.push=true

View File

@@ -22,6 +22,7 @@ dependencies {
implementation("io.quarkus:quarkus-rest-client-kotlin-serialization") implementation("io.quarkus:quarkus-rest-client-kotlin-serialization")
implementation("io.quarkus:quarkus-rest-jackson") implementation("io.quarkus:quarkus-rest-jackson")
implementation("io.quarkus:quarkus-kotlin") implementation("io.quarkus:quarkus-kotlin")
implementation("io.quarkus:quarkus-smallrye-fault-tolerance")
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8") implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
implementation("io.quarkus:quarkus-arc") implementation("io.quarkus:quarkus-arc")
implementation("org.jsoup:jsoup:1.20.1") implementation("org.jsoup:jsoup:1.20.1")

View File

@@ -0,0 +1,19 @@
package com.rak.config.converter
import org.eclipse.microprofile.config.spi.Converter
import java.util.regex.Pattern
import java.util.regex.PatternSyntaxException
/**
 * MicroProfile Config [Converter] that compiles a configuration string into a [Pattern].
 *
 * Every conversion failure is reported as an [IllegalArgumentException], so that a blank value
 * and a syntactically invalid expression are signalled the same way.
 */
class PatternConverter : Converter<Pattern> {

    /**
     * Compiles [value] as a regular expression.
     *
     * @param value raw configuration value holding the regular expression
     * @return the compiled pattern
     * @throws IllegalArgumentException if [value] is blank or is not a valid regular expression
     */
    override fun convert(value: String): Pattern {
        require(value.isNotBlank()) { "Pattern may not be empty" }

        return try {
            Pattern.compile(value)
        } catch (ex: PatternSyntaxException) {
            // Keep the original exception as the cause so the syntax error's description
            // and position are still available to whoever diagnoses the bad configuration.
            throw IllegalArgumentException("'$value' is not a valid RegEx pattern", ex)
        }
    }
}

View File

@@ -0,0 +1,11 @@
package com.rak.config.model
import io.smallrye.config.WithName
import java.util.Optional
/**
 * Configuration of a single extraction method: a list of extraction steps plus an
 * optional list of transformation steps.
 */
interface ExtractorConfig {
// Extraction steps of this method, read from the "steps" configuration key.
@WithName("steps")
fun getExtractionSteps(): List<ExtractConfig>
// Transformation steps read from the "transform" configuration key; empty Optional when the key is absent.
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
}

View File

@@ -0,0 +1,12 @@
package com.rak.config.model
import io.smallrye.config.WithName
/**
 * Scrape target configuration for a regional set, made up of three field configurations:
 * id, language and region key.
 */
interface RegionalSetScrapeTargetConfig : AbstractScrapeTargetConfig {
// Field configuration read from the "id" configuration key.
@WithName("id")
fun getIdConfig(): ScrapeTargetFieldConfig
// Field configuration read from the "language" configuration key.
@WithName("language")
fun getLanguageConfig(): ScrapeTargetFieldConfig
// Field configuration read from the "region-key" configuration key.
@WithName("region-key")
fun getRegionKeyConfig(): ScrapeTargetFieldConfig
}

View File

@@ -1,15 +1,21 @@
package com.rak.config.model package com.rak.config.model
import io.smallrye.config.WithDefault
import io.smallrye.config.WithName import io.smallrye.config.WithName
import java.util.* import java.util.*
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig { interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
@WithName("type")
fun getType(): String
@WithName("nullable")
@WithDefault("false")
fun isNullable(): Boolean
@WithName("root") @WithName("root")
fun getRootConfig(): Optional<ExtractConfig> fun getRootConfig(): Optional<ExtractConfig>
@WithName("steps") @WithName("extractors")
fun getExtractionSteps(): List<ExtractConfig> fun getExtractionMethods(): List<ExtractorConfig>
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
@WithName("fallback") @WithName("fallback")
fun getFallbackConfiguration(): Optional<FieldConfigFallback> fun getFallbackConfiguration(): Optional<FieldConfigFallback>
@WithName("validation")
fun getOptionalValidation(): Optional<ValidationConfig>
} }

View File

@@ -3,10 +3,6 @@ package com.rak.config.model
import io.smallrye.config.WithName import io.smallrye.config.WithName
interface SetScrapeTargetConfig : AbstractScrapeTargetConfig { interface SetScrapeTargetConfig : AbstractScrapeTargetConfig {
@WithName("id") @WithName("name")
fun getIdConfig(): ScrapeTargetFieldConfig fun getNameConfig(): ScrapeTargetFieldConfig
@WithName("language")
fun getLanguageConfig(): ScrapeTargetFieldConfig
@WithName("region-key")
fun getRegionKeyConfig(): ScrapeTargetFieldConfig
} }

View File

@@ -8,6 +8,8 @@ interface TargetsConfig {
fun getCardConfig(): Optional<CardScrapeTargetConfig> fun getCardConfig(): Optional<CardScrapeTargetConfig>
@WithName("set") @WithName("set")
fun getSetConfig(): Optional<SetScrapeTargetConfig> fun getSetConfig(): Optional<SetScrapeTargetConfig>
@WithName("regional-set")
fun getRegionalSetConfig(): Optional<RegionalSetScrapeTargetConfig>
@WithName("card-print") @WithName("card-print")
fun getCardPrintConfiguration(): Optional<CardPrintScrapeTargetConfig> fun getCardPrintConfiguration(): Optional<CardPrintScrapeTargetConfig>
} }

View File

@@ -0,0 +1,12 @@
package com.rak.config.model
import com.rak.config.converter.PatternConverter
import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
import java.util.regex.Pattern
/**
 * Validation rules for an extracted field value, expressed as regular expressions.
 */
interface ValidationConfig {
// Patterns read from the "pattern" configuration key; each entry is compiled by PatternConverter.
@WithName("pattern")
@WithConverter(PatternConverter::class)
fun getRegexPatterns(): MutableList<Pattern>
}

View File

@@ -1,9 +1,7 @@
package com.rak.model.card package com.rak.model.card
import com.rak.model.set.RegionalSet
data class CardPrint( data class CardPrint(
val id: String, var id: Int,
val name: String, val name: String,
val regionalName: String? = null, val regionalName: String? = null,
val rarity: String val rarity: String
@@ -11,10 +9,17 @@ data class CardPrint(
companion object { companion object {
fun fromMap(map: Map<String, String>): CardPrint { fun fromMap(map: Map<String, String>): CardPrint {
val regionalNameValue = map["regionalName"]
val regionalName = if (regionalNameValue == "") {
null
} else {
regionalNameValue
}
return CardPrint( return CardPrint(
map["id"] ?: throw IllegalStateException("Parameter 'prefix' not found"), map["id"]?.toInt() ?: throw IllegalStateException("Parameter 'prefix' not found"),
map["name"] ?: throw IllegalStateException("Parameter 'region' not found"), map["name"] ?: throw IllegalStateException("Parameter 'region' not found"),
map["regionalName"], regionalName,
map["rarity"] ?: throw IllegalStateException("Parameter 'regionCode' not found"), map["rarity"] ?: throw IllegalStateException("Parameter 'regionCode' not found"),
) )
} }

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/**
 * Unchecked exception carrying a [message] that describes a value which failed validation.
 */
class ValueValidationException(message: String) : RuntimeException(message)

View File

@@ -1,12 +1,15 @@
package com.rak.model.set package com.rak.model.set
import kotlin.collections.Set
data class CardSet( data class CardSet(
var name: String, var name: String,
val regionalSets: Set<RegionalSet> val regionalSets: Set<RegionalSet>
) { ) {
companion object { companion object {
/**
 * Builds a [CardSet] from a map of extracted values and the regional sets belonging to it.
 *
 * @param map extracted values; must contain the key `name`
 * @param regionalSet regional sets to attach to the created card set
 * @return the assembled card set
 * @throws IllegalStateException if [map] has no `name` entry
 */
fun fromMap(map: Map<String, String>, regionalSet: Set<RegionalSet>): CardSet {
    val setName = checkNotNull(map["name"]) { "Parameter 'name' not found" }
    return CardSet(name = setName, regionalSets = regionalSet)
}
} }
} }

View File

@@ -22,28 +22,6 @@ data class RegionalSet(
) )
} }
fun flattenFromMemberLists(
idList: List<String>,
languageList: List<String>,
regionKeyAliasList: List<String>,
): MutableSet<RegionalSet> {
if (idList.size != languageList.size && idList.size != regionKeyAliasList.size) {
throw IllegalArgumentException("Lists have to be the same size")
}
val regionalSetList: MutableSet<RegionalSet> = mutableSetOf()
for (index in 0..idList.size - 1) {
regionalSetList.add(RegionalSet(
prefix = idList[index],
region = languageList[index],
regionCode = regionKeyAliasList[index],
listOf(),
numberOfCards = -1
))
}
return regionalSetList
}
} }
} }

View File

@@ -11,7 +11,7 @@ class TransformationRegistry {
init { init {
register("trim") { it.trim() } register("trim") { it.trim() }
register("removeInnerQuotes") { it.replace("\"", "") } register("removeInnerQuotes") { it.replace(Regex("^\""), "").replace(Regex("\"$"), "") }
register("replace") { input, parameters -> register("replace") { input, parameters ->
require(parameters.size == 1 || parameters.size == 2) { require(parameters.size == 1 || parameters.size == 2) {
"'replace' requires either 1 or 2 parameters" "'replace' requires either 1 or 2 parameters"
@@ -22,8 +22,11 @@ class TransformationRegistry {
input.replace(parameters[0], parameters[1]) input.replace(parameters[0], parameters[1])
} }
register("regexReplace") { input, params -> register("regexReplace") { input, params ->
require(params.size == 2) { require(params.size == 1 || params.size == 2) {
"'regexReplace' requires exactly 2 parameters" "'regexReplace' requires either 1 or 2 parameters"
}
if (params.size == 1) {
params.add("")
} }
input.replace(params[0].toRegex(), params[1]) input.replace(params[0].toRegex(), params[1])
} }

View File

@@ -46,7 +46,7 @@ class CommonCrawlService(
crawlName.indexName crawlName.indexName
)) ))
} catch (ex: RuntimeException) { } catch (ex: RuntimeException) {
Log.warn("Error occurred querying crawl '${crawlName.indexName}' for URL $url") Log.warn("Error occurred querying crawl '${crawlName.indexName}' for URL $url", ex)
} }
} }

View File

@@ -7,7 +7,7 @@ import com.rak.model.exception.TargetNotFoundException
import com.rak.model.set.CardSet import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet import com.rak.model.set.RegionalSet
import com.rak.service.extract.RegionalSetExtractionService import com.rak.service.extract.RegionalSetExtractionService
import com.rak.service.extract.SetExtractionService import com.rak.service.extract.CardSetExtractionService
import io.quarkus.logging.Log import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.Jsoup import org.jsoup.Jsoup
@@ -17,7 +17,7 @@ import java.lang.Exception
@ApplicationScoped @ApplicationScoped
class ScrapeService( class ScrapeService(
private val sourceService: SourceService, private val sourceService: SourceService,
private val setExtractionService: SetExtractionService, private val cardSetExtractionService: CardSetExtractionService,
private val regionalSetExtractionService: RegionalSetExtractionService, private val regionalSetExtractionService: RegionalSetExtractionService,
private val commonCrawlService: CommonCrawlService private val commonCrawlService: CommonCrawlService
) { ) {
@@ -54,18 +54,16 @@ class ScrapeService(
try { try {
document = Jsoup.connect(url).get() document = Jsoup.connect(url).get()
} catch(ex: Exception) { } catch(ex: Exception) {
Log.warn("Error occurred during Jsoup query") Log.warn("Error occurred during Jsoup query", ex)
throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'") throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'")
} }
} }
return setExtractionService.extract( return cardSetExtractionService.extract(
document, document,
source, source,
source.getTargets().getSetConfig().get() source.getTargets().getSetConfig().get()
).apply { )
name = setName
}
} }
fun scrapeRegionalSet( fun scrapeRegionalSet(
@@ -77,7 +75,7 @@ class ScrapeService(
val path: String = normalizePath(setName) val path: String = normalizePath(setName)
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get() val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
return regionalSetExtractionService.extract(document, source, source.getTargets().getSetConfig().get()) return regionalSetExtractionService.extract(document, source, source.getTargets().getRegionalSetConfig().get())
} }
fun scrapeCard( fun scrapeCard(

View File

@@ -1,7 +1,7 @@
package com.rak.service package com.rak.service
import com.rak.config.model.CardScrapeTargetConfig import com.rak.config.model.CardScrapeTargetConfig
import com.rak.config.model.SetScrapeTargetConfig import com.rak.config.model.RegionalSetScrapeTargetConfig
import com.rak.config.model.ProviderConfig import com.rak.config.model.ProviderConfig
import com.rak.config.model.SourcesConfig import com.rak.config.model.SourcesConfig
import com.rak.model.exception.InvalidConfigurationException import com.rak.model.exception.InvalidConfigurationException
@@ -21,7 +21,7 @@ class SourceService(
} }
private fun validateSource(providerConfig: ProviderConfig) { private fun validateSource(providerConfig: ProviderConfig) {
val optionalRegionalSetConfig = providerConfig.getTargets().getSetConfig() val optionalRegionalSetConfig = providerConfig.getTargets().getRegionalSetConfig()
val optionalCardConfig = providerConfig.getTargets().getCardConfig() val optionalCardConfig = providerConfig.getTargets().getCardConfig()
if (optionalRegionalSetConfig.isPresent) { if (optionalRegionalSetConfig.isPresent) {
@@ -33,7 +33,7 @@ class SourceService(
} }
} }
private fun validateSetExtractConfig(setExtractConfig: SetScrapeTargetConfig) { private fun validateSetExtractConfig(setExtractConfig: RegionalSetScrapeTargetConfig) {
val selectors = listOf( val selectors = listOf(
setExtractConfig.getLanguageConfig(), setExtractConfig.getLanguageConfig(),
setExtractConfig.getIdConfig(), setExtractConfig.getIdConfig(),

View File

@@ -6,14 +6,17 @@ import io.netty.buffer.ByteBufInputStream
import io.quarkus.rest.client.reactive.ClientQueryParam import io.quarkus.rest.client.reactive.ClientQueryParam
import io.quarkus.rest.client.reactive.NotBody import io.quarkus.rest.client.reactive.NotBody
import io.quarkus.rest.client.reactive.Url import io.quarkus.rest.client.reactive.Url
import io.smallrye.faulttolerance.api.RateLimit
import jakarta.ws.rs.Consumes import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET import jakarta.ws.rs.GET
import jakarta.ws.rs.Path import jakarta.ws.rs.Path
import jakarta.ws.rs.PathParam import jakarta.ws.rs.PathParam
import jakarta.ws.rs.QueryParam import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.faulttolerance.Bulkhead
import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam
import org.eclipse.microprofile.rest.client.annotation.RegisterProvider import org.eclipse.microprofile.rest.client.annotation.RegisterProvider
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
import java.time.temporal.ChronoUnit
@RegisterRestClient(baseUri = "whatever") @RegisterRestClient(baseUri = "whatever")
@RegisterProvider(NDJsonReader::class) @RegisterProvider(NDJsonReader::class)
@@ -23,6 +26,11 @@ interface CommonCrawlRestClient {
@ClientQueryParam(name = "output", value = ["json"]) @ClientQueryParam(name = "output", value = ["json"])
@Path("/{index}-index") @Path("/{index}-index")
@Consumes("text/x-ndjson") @Consumes("text/x-ndjson")
@RateLimit(
value = 1,
minSpacing = 5
)
@Bulkhead
fun queryIndex( fun queryIndex(
@Url @Url
baseUrl: String, baseUrl: String,

View File

@@ -1,18 +1,17 @@
package com.rak.service.extract package com.rak.service.extract
import com.rak.config.model.AbstractScrapeTargetConfig import com.rak.config.model.*
import com.rak.config.model.ExtractConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.model.Selector import com.rak.model.Selector
import com.rak.model.exception.ElementNotFoundException import com.rak.model.exception.ElementNotFoundException
import com.rak.model.exception.InvalidConfigurationException import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.exception.ValueValidationException
import com.rak.model.transform.TransformationRegistry import com.rak.model.transform.TransformationRegistry
import com.rak.util.CssUtil import com.rak.util.CssUtil
import com.rak.util.XPathUtil import com.rak.util.XPathUtil
import io.quarkus.logging.Log
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.select.Elements import org.jsoup.select.Elements
import java.util.Optional import java.util.*
import kotlin.jvm.optionals.getOrElse import kotlin.jvm.optionals.getOrElse
// find root element from global or node config // find root element from global or node config
@@ -131,7 +130,11 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
val extractedText = extractTextFromElementByTargetFieldConfig( val extractedText = extractTextFromElementByTargetFieldConfig(
rootElement, rootElement,
fieldConfig fieldConfig
) ?: throw ElementNotFoundException("Could not find element for '$identifier'") ) ?: if (fieldConfig.isNullable()) {
""
} else {
throw ElementNotFoundException("Could not find element for '$identifier'")
}
val mapToModify: MutableMap<String, String> = try { val mapToModify: MutableMap<String, String> = try {
resultList[index] resultList[index]
@@ -174,56 +177,87 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
root: Element, root: Element,
extractionConfig: ScrapeTargetFieldConfig extractionConfig: ScrapeTargetFieldConfig
): String? { ): String? {
val extractionSteps = extractionConfig.getExtractionSteps() val extractionMethods = extractionConfig.getExtractionMethods()
val transformationSteps = extractionConfig.getOptionalTransformationSteps()
var currentElement: Element? = root.clone()
var result: String? = null var result: String? = null
try {
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
if (index == extractionSteps.size - 1) { for(extractionMethod in extractionMethods) {
result = when (currentStep.selectorType()) { val extractionSteps = extractionMethod.getExtractionSteps()
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString()) val transformationSteps = extractionMethod.getOptionalTransformationSteps()
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
var currentElement: Element? = root.clone()
var intermediateResult: String? = null
try {
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
if (index == extractionSteps.size - 1) {
intermediateResult = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
}
}
else {
currentElement = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
}
} }
} }
else {
currentElement = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
}
}
}
if (result == null) { if (intermediateResult == null) {
throw ElementNotFoundException("Result could not be extracted") throw ElementNotFoundException("Result could not be extracted")
} } else {
try {
if (transformationSteps.isPresent) { validateValue(intermediateResult, extractionConfig.getOptionalValidation())
result = transformationRegistry.applyTransformations(result, transformationSteps.get()) } catch (ex: ValueValidationException) {
}
} catch (ex: RuntimeException) {
when (ex) {
is ElementNotFoundException,
is IllegalStateException -> {
if (extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
} else {
throw ex throw ex
} }
if (transformationSteps.isPresent) {
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
}
result = intermediateResult
break
}
} catch (ex: RuntimeException) {
when (ex) {
is ElementNotFoundException,
is IllegalStateException,
is ValueValidationException -> Log.debug(ex.message)
else -> throw ex
} }
else -> throw ex
} }
} }
if (result == null && extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
}
return result return result
} }
/**
 * Validates an extracted value against the optional validation configuration.
 *
 * Nothing is checked when [validationConfig] is empty. Otherwise [value] has to match
 * every configured pattern in full.
 *
 * @param value the extracted text to validate
 * @param validationConfig optional validation rules of the field
 * @throws ValueValidationException if [value] does not match at least one configured pattern
 */
private fun validateValue(value: String, validationConfig: Optional<ValidationConfig>) {
    // No validation configured: every value is accepted.
    val patterns = validationConfig.orElse(null)?.getRegexPatterns() ?: return

    val matchesEveryPattern = patterns.all { pattern -> value.matches(pattern.toRegex()) }
    if (!matchesEveryPattern) {
        throw ValueValidationException("'$value' does not validate against RegEx(s)")
    }
}
} }

View File

@@ -9,15 +9,13 @@ import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
@ApplicationScoped @ApplicationScoped
class SetExtractionService( class CardSetExtractionService(
private val regionalSetExtractionService: RegionalSetExtractionService private val regionalSetExtractionService: RegionalSetExtractionService
) : AbstractExtractionService<CardSet, SetScrapeTargetConfig>() { ) : AbstractExtractionService<CardSet, SetScrapeTargetConfig>() {
override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> { override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> {
return mapOf( return mapOf(
Pair("prefix", this.getIdConfig()), Pair("name", this.getNameConfig()),
Pair("regionCode", this.getRegionKeyConfig()),
Pair("region", this.getLanguageConfig()),
) )
} }
@@ -26,9 +24,15 @@ class SetExtractionService(
providerConfig: ProviderConfig, providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig extractionConfig: SetScrapeTargetConfig
): CardSet { ): CardSet {
return CardSet( val set = extractSingle(element, extractionConfig)
"test",
regionalSetExtractionService.extractMultiple(element, providerConfig, extractionConfig).toSet() return CardSet.fromMap(
set,
regionalSetExtractionService.extractMultiple(
element,
providerConfig,
providerConfig.getTargets().getRegionalSetConfig().get()
).toSet()
) )
} }

View File

@@ -2,7 +2,7 @@ package com.rak.service.extract
import com.rak.config.model.ProviderConfig import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.SetScrapeTargetConfig import com.rak.config.model.RegionalSetScrapeTargetConfig
import com.rak.config.model.SourcesConfig import com.rak.config.model.SourcesConfig
import com.rak.model.card.CardPrint import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException import com.rak.model.exception.NotImplementedException
@@ -14,9 +14,9 @@ import org.jsoup.nodes.Element
class RegionalSetExtractionService( class RegionalSetExtractionService(
private val cardPrintExtractionService: CardPrintExtractionService, private val cardPrintExtractionService: CardPrintExtractionService,
private val sourcesConfig: SourcesConfig private val sourcesConfig: SourcesConfig
) : AbstractExtractionService<RegionalSet, SetScrapeTargetConfig>() { ) : AbstractExtractionService<RegionalSet, RegionalSetScrapeTargetConfig>() {
override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> { override fun RegionalSetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> {
return mapOf( return mapOf(
Pair("prefix", this.getIdConfig()), Pair("prefix", this.getIdConfig()),
Pair("regionCode", this.getRegionKeyConfig()), Pair("regionCode", this.getRegionKeyConfig()),
@@ -27,7 +27,7 @@ class RegionalSetExtractionService(
override fun extract( override fun extract(
element: Element, element: Element,
providerConfig: ProviderConfig, providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig extractionConfig: RegionalSetScrapeTargetConfig
): RegionalSet { ): RegionalSet {
throw NotImplementedException("Not implemented") throw NotImplementedException("Not implemented")
} }
@@ -35,7 +35,7 @@ class RegionalSetExtractionService(
override fun extractMultiple( override fun extractMultiple(
element: Element, element: Element,
providerConfig: ProviderConfig, providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig extractionConfig: RegionalSetScrapeTargetConfig
): List<RegionalSet> { ): List<RegionalSet> {
val regionalSetList = extractMulti(element, extractionConfig) val regionalSetList = extractMulti(element, extractionConfig)
@@ -55,7 +55,7 @@ class RegionalSetExtractionService(
override fun extractNestedMultiples( override fun extractNestedMultiples(
element: Element, element: Element,
providerConfig: ProviderConfig, providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig extractionConfig: RegionalSetScrapeTargetConfig
): List<List<RegionalSet>> { ): List<List<RegionalSet>> {
throw NotImplementedException("Not implemented") throw NotImplementedException("Not implemented")
} }

View File

@@ -1,6 +1,5 @@
package com.rak.util package com.rak.util
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.rak.model.cc.CCIndexSuccessResponse import com.rak.model.cc.CCIndexSuccessResponse

View File

@@ -4,6 +4,7 @@ import com.rak.model.XPathTarget
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements import org.jsoup.select.Elements
import java.util.regex.Pattern
import kotlin.coroutines.CoroutineContext import kotlin.coroutines.CoroutineContext
class XPathUtil private constructor() { class XPathUtil private constructor() {
@@ -40,8 +41,8 @@ class XPathUtil private constructor() {
private fun extractTextFromNode(root: Element, xpath: String): String? { private fun extractTextFromNode(root: Element, xpath: String): String? {
return root return root
.selectXpath(xpath, TextNode::class.java) .selectXpath(xpath.replace("/text()", ""))
.firstOrNull()?.text() .text()
} }
fun getNextElement(element: Element, path: String): Element? { fun getNextElement(element: Element, path: String): Element? {

View File

@@ -1,28 +1,37 @@
quarkus: quarkus:
container-image:
registry: gitea.smoothbrain.win
group: rak
build: true
additional-tags: latest
http: http:
port: 8081 port: 8081
live-reload:
instrumentation: true
scraper: scraper:
sources: sources:
- id: konami-official # - id: konami-official
name: "Konami Official Database" # name: "Konami Official Database"
domain: "yugioh-card.com" # domain: "yugioh-card.com"
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$" # url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
targets: # targets:
card: # card:
root: # root:
type: css # type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" # value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
name: # name:
steps: # steps:
- type: "css" # - type: "css"
value: "h1.product-title" # value: "h1.product-title"
- type: "xpath" # - type: "xpath"
value: "//h1[@itemprop='name']" # value: "//h1[@itemprop='name']"
attack: # attack:
steps: # steps:
- type: "css" # - type: "css"
value: ".atk-value" # value: ".atk-value"
- id: ygo-fandom - id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki" name: "Yu-Gi-Oh Fandom Wiki"
@@ -30,102 +39,166 @@ scraper:
url-pattern: "https://yugioh.fandom.com/wiki/%s" url-pattern: "https://yugioh.fandom.com/wiki/%s"
targets: targets:
set: set:
root:
type: css
value: "aside > .pi-title"
name:
type: string
extractors:
- steps:
- type: xpath
value: "//h2/text()"
regional-set:
root: root:
type: css type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id: id:
steps: type: int
- type: xpath extractors:
value: "//li/text()" - steps:
transform: - type: xpath
- name: "replace" value: "//li/text()"
parameters: [ transform:
" (", - name: "regexReplace"
"" parameters: [
] " *\\(.+\\)",
""
]
language: language:
steps: type: int
- type: xpath extractors:
value: "//li/abbr" - steps:
- type: xpath - type: xpath
value: "//abbr/@title" value: "//li/abbr"
- type: xpath
value: "//abbr/@title"
region-key: region-key:
steps: type: int
- type: xpath extractors:
value: "//li/abbr/text()" - steps:
- type: xpath
value: "//li/abbr/text()"
card-print: card-print:
multi: true multi: true
discriminator:
root:
type: css
value: ".wds-tab__content"
root: root:
type: css type: css
value: "table > tbody > tr:has(> td)" value: "table > tbody > tr:has(> td)"
discriminator:
type: string
root:
type: css
value: ".wds-tab__content"
id: id:
steps: type: int
- type: xpath extractors:
value: "./td/a[0]" - steps:
- type: xpath - type: xpath
value: "./text()" value: "./td/a[0]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/span/text()"
transform:
- name: "regexReplace"
parameters: [
" .+",
""
]
- name: "regexReplace"
parameters: [
".+-[A-Za-z]*0?",
""
]
validation:
pattern: "^.+-.+\\\\d.+$"
name: name:
steps: type: int
- type: xpath extractors:
value: "./td/a[1]" - steps:
- type: xpath - type: xpath
value: "./text()" value: "./td[1]"
- type: xpath
value: "./text()"
transform:
- name: "regexReplace"
parameters: [
" ?\\(.+\\)",
""
]
- name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\".*"
regional-name: regional-name:
fallback: type: int
default: "N/A" nullable: true
steps: extractors:
- type: xpath - steps:
value: "./td[2]" - type: xpath
- type: xpath value: "./td[2]"
value: "./text()" - type: xpath
transform: value: "./text()"
- name: "removeInnerQuotes" transform:
parameters: [] - name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\"$"
rarity: rarity:
fallback: fallback:
default: "N/A" default: "N/A"
steps: type: int
- type: xpath extractors:
value: "./td/a[3]" - steps:
- type: xpath - type: xpath
value: "./text()" value: "./td/a[3]"
card: - type: xpath
name: value: "./text()"
root: - steps:
type: css - type: xpath
value: ".cardTable" value: "./td/a[2]"
steps: - type: xpath
- type: "xpath" value: "./text()"
value: "./tbody/tr[3]/th/text()" - steps:
description: - type: xpath
root: value: "./td/a[1]"
type: css - type: xpath
value: ".cardTable" value: "./text()"
steps: validation:
- type: "xpath" pattern: "^.*(Common|Rare|Print).*$"
value: "b:contains(Card descriptions)" # card:
type: # name:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "b:contains(Card descriptions)" # value: "./tbody/tr[3]/th/text()"
attack: # description:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "b:contains(Card descriptions)" # value: "b:contains(Card descriptions)"
defense: # type:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "b:contains(Card descriptions)" # value: "b:contains(Card descriptions)"
# attack:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# defense:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"