Compare commits

24 Commits

Author SHA1 Message Date
7860819029 Add CI/CD 2025-07-15 19:14:54 +02:00
304490b52e Correct YGO Fandom name transformation regex 2025-07-06 15:05:51 +02:00
ce5b87c34e Minor moddel adjustments 2025-07-01 12:54:56 +02:00
a9f6efc818 Minor config adjustment 2025-07-01 12:54:32 +02:00
5930da7a4c Split Set/RegionalSet properly 2025-06-29 16:49:30 +02:00
8a0777e557 Minor config amend
Regards Set ID
2025-06-29 14:56:00 +02:00
2a79218a54 Add RegEx validation
Amend RegExReplace transformer
Amend transformations
2025-06-29 14:52:09 +02:00
ee4ce4fd65 Basic multi-method extraction 2025-06-29 13:21:18 +02:00
108b4c4c19 Basic exception mapping 2025-06-26 17:17:10 +02:00
8f934bc2b9 Basic CommonCrawl integration 2025-06-26 17:05:50 +02:00
a6ed98c36e Remove old config file 2025-06-26 13:04:14 +02:00
052bdd6a52 Refactor packages
Remove ExtractionService
2025-06-26 12:48:19 +02:00
edc604231f Change project name 2025-06-26 12:46:01 +02:00
2289489fe1 Amend transformation engine 2025-06-26 12:40:51 +02:00
rak
e97f9bdd61 Implement XPath index access 2025-06-25 23:11:05 +02:00
39c0ebfc7c Attempt to implement CardPrints 2025-06-25 21:06:34 +02:00
e0330e7baa Remove isMulti 2025-06-25 14:17:58 +02:00
3808fe153e Amend naming schema 2025-06-25 14:17:35 +02:00
0196308c10 Amend naming schema 2025-06-25 14:17:13 +02:00
72af626e54 Amend naming schema 2025-06-25 14:10:04 +02:00
ce64f90a66 Refactor extraction logic
Add required models
2025-06-25 13:57:44 +02:00
284723c978 Implement transformation application 2025-06-24 15:53:20 +02:00
8cc9a64111 Add Transformation model 2025-06-24 15:23:12 +02:00
9db3753105 Add Transformation model 2025-06-03 18:38:18 +02:00
80 changed files with 1710 additions and 264 deletions

View File

@@ -0,0 +1,32 @@
# Workflow that checks out the repository, sets up a JDK and runs the Gradle build with
# container-image pushing enabled.
name: Create and Push Release
on:
# Only trigger listed: manual dispatch.
workflow_dispatch:
# NOTE(review): none of these variables is referenced by the steps visible below; presumably
# they are consumed by the Gradle/Quarkus build configuration — confirm.
env:
AUTHENTIK_URL: https://auth.smoothbrain.win
REGISTRY_URL: gitea.smoothbrain.win
IMAGE_OWNER: rak
IMAGE_NAME: dex-scraper-java
jobs:
release:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup JDK
# NOTE(review): this action is referenced by the moving `main` ref rather than a fixed tag or commit.
uses: https://gitea.smoothbrain.win/rak/setup-java@main
with:
distribution: 'corretto'
java-version: '21.0.6'
cache: 'gradle'
- name: Build & Push Image
# Registry credentials come from repository secrets and are exposed to the build as
# QUARKUS_CONTAINER_IMAGE_* environment variables.
env:
QUARKUS_CONTAINER_IMAGE_USERNAME: ${{ secrets.CI_SERVICE_ACCOUNT }}
QUARKUS_CONTAINER_IMAGE_PASSWORD: ${{ secrets.CI_SERVICE_ACCOUNT_PASSWORD }}
run: |
./gradlew clean build \
-Dquarkus.container-image.push=true

View File

@@ -22,9 +22,13 @@ dependencies {
implementation("io.quarkus:quarkus-rest-client-kotlin-serialization")
implementation("io.quarkus:quarkus-rest-jackson")
implementation("io.quarkus:quarkus-kotlin")
implementation("io.quarkus:quarkus-smallrye-fault-tolerance")
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
implementation("org.jsoup:jsoup:1.20.1")
implementation("io.quarkus:quarkus-arc")
implementation("org.jsoup:jsoup:1.20.1")
implementation("org.netpreserve.commons:webarchive-commons:2.0.1")
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.19.0")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.19.1")
testImplementation("io.quarkus:quarkus-junit5")
testImplementation("io.rest-assured:rest-assured")
}

View File

@@ -10,4 +10,4 @@ pluginManagement {
id(quarkusPluginId) version quarkusPluginVersion
}
}
rootProject.name = "jsoup-scraper"
rootProject.name = "dex-scraper"

View File

@@ -1,42 +0,0 @@
# Scraper source definitions (this file is removed by the change set shown here).
# Each source declares an id, a display name, a domain, URL patterns and per-item selectors.
scraper:
sources:
# Source 1: card selectors only.
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
url-patterns:
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
selectors:
card:
name:
# Two steps of different selector types (css, then xpath).
steps:
- type: "css"
value: "h1.product-title"
- type: "xpath"
value: "//h1[@itemprop='name']"
attack:
steps:
- type: "css"
value: ".atk-value"
# Source 2: regional-set selectors only.
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com"
url-patterns:
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
selectors:
regional-set:
# NOTE(review): `root` is a plain string here, while the steps below are type/value pairs.
root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id:
steps:
- type: "xpath"
value: "//li/text()"
language:
steps:
- type: "xpath"
value: "//li/abbr"
- type: "xpath"
value: "//abbr/@title"
region-key:
steps:
- type: "xpath"
value: "//li/abbr/text()"

View File

@@ -1,12 +0,0 @@
package com.rak.config
import io.smallrye.config.WithName
/**
 * Selector definitions for a card, bound to the `name`, `attack` and `effect` configuration keys.
 */
interface CardDefinition {
/** Selector definition mapped from the `name` key. */
@WithName("name")
fun nameSelector(): SelectorDefinition
/** Selector definition mapped from the `attack` key. */
@WithName("attack")
fun attackSelector(): SelectorDefinition
/** Selector definition mapped from the `effect` key. */
@WithName("effect")
fun effectSelector(): SelectorDefinition
}

View File

@@ -1,8 +0,0 @@
package com.rak.config
import java.util.*
/** Groups the optional per-item selector definitions (card, regional set). */
interface Items {
/** Optional card selector definition. */
fun card(): Optional<CardDefinition>
/** Optional regional-set selector definition. */
fun regionalSet(): Optional<RegionalSetDefinition>
}

View File

@@ -1,13 +0,0 @@
package com.rak.config
import com.rak.config.converter.AbstractModelDefinition
import io.smallrye.config.WithName
/**
 * Selector definitions for a regional set, bound to the `id`, `language` and `region-key`
 * configuration keys; also inherits the members of [AbstractModelDefinition].
 */
interface RegionalSetDefinition : AbstractModelDefinition {
/** Selector definition mapped from the `id` key. */
@WithName("id")
fun idSelector(): SelectorDefinition
/** Selector definition mapped from the `language` key. */
@WithName("language")
fun languageSelector(): SelectorDefinition
/** Selector definition mapped from the `region-key` key. */
@WithName("region-key")
fun regionKeySelector(): SelectorDefinition
}

View File

@@ -1,5 +0,0 @@
package com.rak.config
/** A selector described as a set of [Step]s. */
interface SelectorDefinition {
/** The steps that make up this selector. */
fun steps(): Set<Step>
}

View File

@@ -1,9 +0,0 @@
package com.rak.config.converter
import com.rak.config.Step
import io.smallrye.config.WithName
/** Base for model definitions that expose a root selector step under the `root` key. */
interface AbstractModelDefinition {
/** Root selector step mapped from the `root` key. */
@WithName("root")
fun rootSelector(): Step
}

View File

@@ -0,0 +1,10 @@
package com.rak.config.converter
import com.rak.model.DiscriminatorDirection
import org.eclipse.microprofile.config.spi.Converter
/**
 * Configuration converter that turns a textual direction into a [DiscriminatorDirection].
 *
 * Lookup is delegated to [DiscriminatorDirection.from]. A value that does not match any
 * direction is rejected instead of being silently converted to `null`, so a misspelled
 * configuration value is reported rather than treated as "not set".
 */
class DiscriminatorDirectionConverter : Converter<DiscriminatorDirection> {

    /**
     * @param value raw configuration value
     * @return the matching direction, or `null` when [value] is blank
     * @throws IllegalArgumentException when [value] is not blank and matches no direction
     */
    override fun convert(value: String): DiscriminatorDirection? {
        if (value.isBlank()) {
            return null
        }
        return DiscriminatorDirection.from(value)
            ?: throw IllegalArgumentException("'$value' is not a valid discriminator direction")
    }
}

View File

@@ -0,0 +1,19 @@
package com.rak.config.converter
import org.eclipse.microprofile.config.spi.Converter
import java.util.regex.Pattern
import java.util.regex.PatternSyntaxException
/**
 * Configuration converter that compiles a configuration value into a [Pattern].
 *
 * Both failure modes (blank value, invalid syntax) are reported as [IllegalArgumentException];
 * the original [PatternSyntaxException] is kept as the cause so the position of the syntax
 * error is not lost.
 */
class PatternConverter : Converter<Pattern> {

    /**
     * @param value raw regular expression taken from the configuration
     * @return the compiled pattern
     * @throws IllegalArgumentException when [value] is blank or is not a valid regular expression
     */
    override fun convert(value: String): Pattern {
        require(value.isNotBlank()) { "Pattern may not be empty" }
        return try {
            Pattern.compile(value)
        } catch (ex: PatternSyntaxException) {
            throw IllegalArgumentException("'$value' is not a valid RegEx pattern", ex)
        }
    }
}

View File

@@ -1,6 +1,6 @@
package com.rak.config.converter
import com.rak.model.scrape.selector.Selector
import com.rak.model.Selector
import org.eclipse.microprofile.config.spi.Converter
class TypeSelectorConverter : Converter<Selector> {

View File

@@ -0,0 +1,15 @@
package com.rak.config.model
import io.smallrye.config.WithDefault
import io.smallrye.config.WithName
import java.util.*
/**
 * Settings shared by all scrape target configurations: an optional root, a `multi` flag and an
 * optional discriminator.
 */
interface AbstractScrapeTargetConfig {
/** Optional extraction root mapped from the `root` key. */
@WithName("root")
fun getRootConfig(): Optional<ExtractConfig>
/** Flag mapped from the `multi` key; defaults to `false`. */
@WithName("multi")
@WithDefault("false")
fun isMulti(): Boolean
/** Optional discriminator mapped from the `discriminator` key. */
@WithName("discriminator")
fun getDiscriminator(): Optional<DiscriminatorConfig>
}

View File

@@ -0,0 +1,3 @@
package com.rak.config.model
/** Member-less base interface for field-level scrape target configurations. */
interface AbstractScrapeTargetFieldConfig

View File

@@ -0,0 +1,14 @@
package com.rak.config.model
import io.smallrye.config.WithName
/**
 * Scrape target configuration for card prints, bound to the `id`, `name`, `regional-name` and
 * `rarity` keys.
 */
interface CardPrintScrapeTargetConfig : AbstractScrapeTargetConfig {
/** Field configuration mapped from the `id` key. */
@WithName("id")
fun getIdConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `name` key. */
@WithName("name")
fun getNameConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `regional-name` key. */
@WithName("regional-name")
fun getRegionNameConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `rarity` key. */
@WithName("rarity")
fun getRarityConfig(): ScrapeTargetFieldConfig
}

View File

@@ -0,0 +1,16 @@
package com.rak.config.model
import io.smallrye.config.WithName
/**
 * Scrape target configuration for cards, bound to the `name`, `description`, `type`, `attack`
 * and `defense` keys.
 */
interface CardScrapeTargetConfig : AbstractScrapeTargetConfig {
/** Field configuration mapped from the `name` key. */
@WithName("name")
fun getEnglishNameConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `description` key. */
@WithName("description")
fun getDescriptionConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `type` key. */
@WithName("type")
fun getCardTypeConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `attack` key. */
@WithName("attack")
fun getAttackConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `defense` key. */
@WithName("defense")
fun getDefenseConfig(): ScrapeTargetFieldConfig
}

View File

@@ -0,0 +1,9 @@
package com.rak.config.model
import com.rak.config.converter.DiscriminatorDirectionConverter
import com.rak.model.DiscriminatorDirection
import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
/**
 * Discriminator configuration; currently adds no members beyond those inherited from
 * [ScrapeTargetFieldConfig].
 */
interface DiscriminatorConfig : ScrapeTargetFieldConfig {
}

View File

@@ -1,13 +1,14 @@
package com.rak.config
package com.rak.config.model
import com.rak.config.converter.TypeSelectorConverter
import com.rak.model.scrape.selector.Selector
import com.rak.model.Selector
import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
interface Step {
interface ExtractConfig {
@WithConverter(TypeSelectorConverter::class)
@WithName("type")
fun selectorType(): Selector // e.g. css or xpath
fun value(): String
fun selectorType(): Selector
@WithName("value")
fun getQueryString(): String
}

View File

@@ -0,0 +1,11 @@
package com.rak.config.model
import io.smallrye.config.WithName
import java.util.Optional
/** One extraction method: a list of extraction steps plus optional transformation steps. */
interface ExtractorConfig {
/** Extraction steps mapped from the `steps` key. */
@WithName("steps")
fun getExtractionSteps(): List<ExtractConfig>
/** Optional transformation steps mapped from the `transform` key. */
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
}

View File

@@ -0,0 +1,15 @@
package com.rak.config.model
import io.smallrye.config.WithDefault
import io.smallrye.config.WithName
import java.util.Optional
/**
 * Fallback configuration of a field: optional extraction steps, optional transformation steps
 * and a default value.
 */
interface FieldConfigFallback {
/** Optional fallback extraction steps mapped from the `steps` key. */
@WithName("steps")
fun getOptionalFallbackExtractionSteps(): Optional<List<ExtractConfig>>
/** Optional transformation steps mapped from the `transform` key. */
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
/** Value mapped from the `default` key; "N/A" when the key is absent (not an Optional, despite the name). */
@WithName("default")
@WithDefault("N/A")
fun getOptionalDefaultValue(): String
}

View File

@@ -1,9 +1,9 @@
package com.rak.config
package com.rak.config.model
import io.smallrye.config.WithName
import java.util.*
interface SourceConfig {
interface ProviderConfig {
@WithName("id")
fun getId(): String
@@ -11,9 +11,9 @@ interface SourceConfig {
fun getName(): String
@WithName("domain")
fun getDomain(): String
@WithName("url-patterns")
fun getUrlPatterns(): Optional<MutableSet<String>>
@WithName("selectors")
fun getItems(): Items
@WithName("url-pattern")
fun getUrlPattern(): String
@WithName("targets")
fun getTargets(): TargetsConfig
}

View File

@@ -0,0 +1,12 @@
package com.rak.config.model
import io.smallrye.config.WithName
/**
 * Scrape target configuration for regional sets, bound to the `id`, `language` and
 * `region-key` keys.
 */
interface RegionalSetScrapeTargetConfig : AbstractScrapeTargetConfig {
/** Field configuration mapped from the `id` key. */
@WithName("id")
fun getIdConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `language` key. */
@WithName("language")
fun getLanguageConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `region-key` key. */
@WithName("region-key")
fun getRegionKeyConfig(): ScrapeTargetFieldConfig
}

View File

@@ -0,0 +1,21 @@
package com.rak.config.model
import io.smallrye.config.WithDefault
import io.smallrye.config.WithName
import java.util.*
/**
 * Configuration of a single scraped field: its type, nullability, optional root, extraction
 * methods, optional fallback and optional validation.
 */
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
/** Value mapped from the `type` key. */
@WithName("type")
fun getType(): String
/** Flag mapped from the `nullable` key; defaults to `false`. */
@WithName("nullable")
@WithDefault("false")
fun isNullable(): Boolean
/** Optional field-specific extraction root mapped from the `root` key. */
@WithName("root")
fun getRootConfig(): Optional<ExtractConfig>
/** Extraction methods mapped from the `extractors` key. */
@WithName("extractors")
fun getExtractionMethods(): List<ExtractorConfig>
/** Optional fallback mapped from the `fallback` key. */
@WithName("fallback")
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
/** Optional validation mapped from the `validation` key. */
@WithName("validation")
fun getOptionalValidation(): Optional<ValidationConfig>
}

View File

@@ -0,0 +1,8 @@
package com.rak.config.model
import io.smallrye.config.WithName
/** Scrape target configuration for sets, bound to the `name` key. */
interface SetScrapeTargetConfig : AbstractScrapeTargetConfig {
/** Field configuration mapped from the `name` key. */
@WithName("name")
fun getNameConfig(): ScrapeTargetFieldConfig
}

View File

@@ -1,12 +1,12 @@
package com.rak.config
package com.rak.config.model
import io.smallrye.config.ConfigMapping
import io.smallrye.config.WithName
@ConfigMapping(prefix = "scraper")
interface SourcesConfiguration {
interface SourcesConfig {
@WithName("sources")
fun getSources(): MutableList<SourceConfig>
fun getSources(): MutableList<ProviderConfig>
}

View File

@@ -0,0 +1,15 @@
package com.rak.config.model
import io.smallrye.config.WithName
import java.util.*
/**
 * The optional scrape targets of a provider, bound to the `card`, `set`, `regional-set` and
 * `card-print` keys.
 */
interface TargetsConfig {
/** Optional card target mapped from the `card` key. */
@WithName("card")
fun getCardConfig(): Optional<CardScrapeTargetConfig>
/** Optional set target mapped from the `set` key. */
@WithName("set")
fun getSetConfig(): Optional<SetScrapeTargetConfig>
/** Optional regional-set target mapped from the `regional-set` key. */
@WithName("regional-set")
fun getRegionalSetConfig(): Optional<RegionalSetScrapeTargetConfig>
/** Optional card-print target mapped from the `card-print` key. */
@WithName("card-print")
fun getCardPrintConfiguration(): Optional<CardPrintScrapeTargetConfig>
}

View File

@@ -0,0 +1,8 @@
package com.rak.config.model
import java.util.Optional
/** One transformation step: the transformation's name and its optional parameter list. */
interface TransformationStepConfig {
/** Name of the transformation. */
fun name(): String
/** Optional parameters for the transformation. */
fun parameters(): Optional<MutableList<String>>
}

View File

@@ -0,0 +1,12 @@
package com.rak.config.model
import com.rak.config.converter.PatternConverter
import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
import java.util.regex.Pattern
/** Validation settings of a field, expressed as regular expression patterns. */
interface ValidationConfig {
/** Patterns mapped from the `pattern` key; each entry is converted by [PatternConverter]. */
@WithName("pattern")
@WithConverter(PatternConverter::class)
fun getRegexPatterns(): MutableList<Pattern>
}

View File

@@ -1,8 +1,11 @@
package com.rak.controller
import com.rak.config.SourcesConfiguration
import com.rak.model.card.Card
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import com.rak.service.CommonCrawlService
import com.rak.service.ScrapeService
import com.rak.service.SourceService
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
@@ -13,16 +16,11 @@ import org.jboss.resteasy.reactive.RestQuery
@Path("/api")
class ExampleResource(
private val sourcesConfiguration: SourcesConfiguration,
class ScrapeController(
private val scrapeService: ScrapeService,
private val sourceService: SourceService
private val commonCrawlService: CommonCrawlService
) {
companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("text\\(\\)$")
}
@GET
@Path("/{provider}/set")
@Produces(MediaType.APPLICATION_JSON)
@@ -32,8 +30,24 @@ class ExampleResource(
provider: String,
@RestQuery
setName: String
): Map<String, String> {
return scrapeService.extractSet(
): CardSet {
return scrapeService.scrapeSet(
provider,
setName
)
}
@GET
@Path("/{provider}/regionalSet")
@Produces(MediaType.APPLICATION_JSON)
@Consumes(MediaType.APPLICATION_JSON)
fun scrapeRegionalSet(
@RestPath
provider: String,
@RestQuery
setName: String
): RegionalSet {
return scrapeService.scrapeRegionalSet(
provider,
setName
)
@@ -48,8 +62,8 @@ class ExampleResource(
provider: String,
@RestQuery
cardName: String
): Map<String, String> {
return scrapeService.extractCard(
): Card? {
return scrapeService.scrapeCard(
provider,
cardName
)

View File

@@ -0,0 +1,17 @@
package com.rak.model
/**
 * Direction of a discriminator, identified by a lower-case textual [value].
 */
enum class DiscriminatorDirection(val value: String) {
    ASC("asc"),
    DESC("desc");

    companion object {
        /**
         * Looks up the direction whose [DiscriminatorDirection.value] equals [value] exactly
         * (case-sensitive).
         *
         * @return the matching direction, or `null` when none matches
         */
        fun from(value: String): DiscriminatorDirection? =
            entries.firstOrNull { candidate -> candidate.value == value }
    }
}

View File

@@ -0,0 +1,5 @@
package com.rak.model
/** Error payload carrying a single human-readable [message]. */
data class ErrorResponse(
val message: String
)

View File

@@ -1,4 +1,4 @@
package com.rak.model.scrape.selector
package com.rak.model
enum class Selector {
CSS,

View File

@@ -0,0 +1,11 @@
package com.rak.model.card
/** Attribute values used by the card model. */
enum class Attribute {
WIND,
WATER,
FIRE,
EARTH,
LIGHT,
DARK,
DIVINE;
}

View File

@@ -0,0 +1,8 @@
package com.rak.model.card
/** Base type of all card models; declares the properties every card carries. */
abstract class Card {
/** Numeric identifier of the card. */
abstract val id: Int
/** Category of the card. */
abstract val cardType: CardType
/** Description text of the card. */
abstract val description: String
/** Name of the card. */
abstract val name: String
}

View File

@@ -0,0 +1,28 @@
package com.rak.model.card
/**
 * A single print of a card.
 *
 * @property id numeric identifier of the print
 * @property name name of the print
 * @property regionalName regional name, or `null` when none is available
 * @property rarity rarity of the print
 */
data class CardPrint(
    var id: Int,
    val name: String,
    val regionalName: String? = null,
    val rarity: String
) {
    companion object {
        /**
         * Builds a [CardPrint] from extracted key/value pairs.
         *
         * Reads the keys `id`, `name`, `regionalName` and `rarity`. An empty or absent
         * `regionalName` becomes `null`.
         *
         * @throws IllegalStateException when `id`, `name` or `rarity` is missing; the message
         * names the key that is actually missing
         * @throws NumberFormatException when `id` is present but not a valid integer
         */
        fun fromMap(map: Map<String, String>): CardPrint {
            val id = map["id"]?.toInt() ?: throw IllegalStateException("Parameter 'id' not found")
            val name = map["name"] ?: throw IllegalStateException("Parameter 'name' not found")
            // An empty string means "no regional name" and is normalised to null.
            val regionalName = map["regionalName"]?.takeUnless { it.isEmpty() }
            val rarity = map["rarity"] ?: throw IllegalStateException("Parameter 'rarity' not found")

            return CardPrint(id, name, regionalName, rarity)
        }
    }
}

View File

@@ -0,0 +1,8 @@
package com.rak.model.card
/** Top-level card categories, including an explicit [UNKNOWN] value. */
enum class CardType {
MONSTER,
SPELL,
TRAP,
UNKNOWN
}

View File

@@ -0,0 +1,3 @@
package com.rak.model.card
/** Member-less marker interface for card sub-type enums. */
interface ICardType

View File

@@ -0,0 +1,12 @@
package com.rak.model.card
/** The eight link arrow positions used by the card model. */
enum class LinkArrow {
TOP_LEFT,
TOP,
TOP_RIGHT,
LEFT,
RIGHT,
BOTTOM_LEFT,
BOTTOM,
BOTTOM_RIGHT;
}

View File

@@ -0,0 +1,20 @@
package com.rak.model.card
/**
 * Monster card model.
 *
 * The nullable properties default to `null` and [isPendulum] defaults to `false`;
 * [subType], [monsterType], [attribute] and [linkArrows] must always be supplied.
 */
data class MonsterCard(
override val id: Int,
override val cardType: CardType,
override val description: String,
override val name: String,
val monsterEffect: String? = null,
val attack: Int? = null,
val defense: Int? = null,
val level: Int? = null,
val isPendulum: Boolean = false,
val pendulumScale: Int? = null,
val pendulumEffect: String? = null,
val linkValue: Int? = null,
val subType: MonsterCardType,
val monsterType: MonsterType,
val attribute: Attribute,
val linkArrows: Set<LinkArrow>
) : Card()

View File

@@ -0,0 +1,11 @@
package com.rak.model.card
/** Sub-types of monster cards. */
enum class MonsterCardType : ICardType {
NORMAL,
EFFECT,
RITUAL,
FUSION,
SYNCHRO,
XYZ,
LINK
}

View File

@@ -0,0 +1,32 @@
package com.rak.model.card
// TODO: add a string value per constant so the proper display names can be represented.
// TODO: consider adding an unknown type.
/** Monster types used by the card model. */
enum class MonsterType {
AQUA,
BEAST,
BEAST_WARRIOR,
CREATOR_GOD,
CYBERSE,
DINOSAUR,
DIVINE_BEAST,
DRAGON,
FAIRY,
FIEND,
FISH,
INSECT,
ILLUSION,
MACHINE,
PLANT,
PSYCHIC,
PYRO,
REPTILE,
ROCK,
SEA_SERPENT,
SPELLCASTER,
THUNDER,
WARRIOR,
WINGED_BEAST,
WYRM,
ZOMBIE
}

View File

@@ -0,0 +1,9 @@
package com.rak.model.card
/** Spell card model: the common [Card] properties plus a spell [subType]. */
data class SpellCard(
override val id: Int,
override val cardType: CardType,
override val description: String,
override val name: String,
val subType: SpellCardType
) : Card()

View File

@@ -0,0 +1,11 @@
package com.rak.model.card
// TODO: fix the underscore for all types by giving each constant a string value.
/** Sub-types of spell cards. */
enum class SpellCardType {
NORMAL,
CONTINUOUS,
EQUIP,
QUICK_PLAY,
FIELD,
RITUAL
}

View File

@@ -0,0 +1,9 @@
package com.rak.model.card
/** Trap card model: the common [Card] properties plus a trap [subType]. */
data class TrapCard(
override val id: Int,
override val cardType: CardType,
override val description: String,
override val name: String,
val subType: TrapCardType
) : Card()

View File

@@ -0,0 +1,7 @@
package com.rak.model.card
/** Sub-types of trap cards. */
enum class TrapCardType {
NORMAL,
CONTINUOUS,
COUNTER
}

View File

@@ -0,0 +1,5 @@
package com.rak.model.cc
/** Error payload of an index query, carrying a single [message]. */
data class CCIndexErrorResponse(
val message: String
)

View File

@@ -0,0 +1,22 @@
package com.rak.model.cc
import com.fasterxml.jackson.annotation.JsonProperty
import java.time.Instant
/**
 * Successful index query result.
 *
 * Properties whose JSON name differs from the Kotlin name are bound through [JsonProperty]
 * (`urlkey`, `mime-detected`, `filename`).
 */
data class CCIndexSuccessResponse(
@JsonProperty("urlkey")
val urlKey: String,
val timestamp: Instant,
val url: String,
val mime: String,
@JsonProperty("mime-detected")
val mimeDetected: String,
val status: String,
val digest: String,
// NOTE(review): presumably the byte length and byte offset of the record inside the archive
// file named by `fileName` — confirm against the index API.
val length: Int,
val offset: Int,
@JsonProperty("filename")
val fileName: String,
val languages: String,
val encoding: String,
)

View File

@@ -0,0 +1,9 @@
package com.rak.model.cc
/**
 * Crawl indices that can be queried, each identified by its [indexName].
 *
 * Every constant name mirrors its index name (`CC_<year>_<week>` maps to
 * `CC-MAIN-<year>-<week>`). `CC_2025_05` previously pointed at "CC-MAIN-2024-05", which
 * contradicted its own name.
 */
enum class CCIndices(val indexName: String) {
    CC_2025_21("CC-MAIN-2025-21"),
    CC_2025_05("CC-MAIN-2025-05"),
    CC_2024_46("CC-MAIN-2024-46"),
    CC_2024_26("CC-MAIN-2024-26"),
    CC_2023_50("CC-MAIN-2023-50");
}

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Unchecked exception for an element that could not be found; carries only a message. */
class ElementNotFoundException(message: String) : RuntimeException(message)

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Unchecked exception for an invalid configuration; carries only a message. */
class InvalidConfigurationException(message: String) : RuntimeException(message)

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Unchecked exception for functionality that is not implemented; carries only a message. */
class NotImplementedException(message: String) : RuntimeException(message)

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Unchecked exception for a scrape target that could not be found; carries only a message. */
class TargetNotFoundException(message: String) : RuntimeException(message)

View File

@@ -0,0 +1,7 @@
package com.rak.model.exception
import java.lang.RuntimeException
/** Unchecked exception for a query a provider does not support; carries only a message. */
class UnsupportedQueryForProviderException(
message: String,
) : RuntimeException(message)

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Unchecked exception for a value that failed validation; carries only a message. */
class ValueValidationException(message: String) : RuntimeException(message)

View File

@@ -0,0 +1,18 @@
package com.rak.model.exception.mapper
import com.rak.model.ErrorResponse
import com.rak.model.exception.NotImplementedException
import jakarta.ws.rs.core.Response
import jakarta.ws.rs.ext.ExceptionMapper
import jakarta.ws.rs.ext.Provider
/**
 * Maps [NotImplementedException] to an HTTP response with an [ErrorResponse] body.
 *
 * The status is 501 (Not Implemented): the server lacks the requested functionality. The
 * previously used 405 (Method Not Allowed) concerns the HTTP method of the request and
 * obliges the server to send an `Allow` header, which this mapper never did.
 */
@Provider
class NotImplementedExceptionMapper : ExceptionMapper<NotImplementedException> {

    /**
     * @param exception the exception raised by the application
     * @return a 501 response whose entity carries the exception message, or a generic
     * message when the exception has none
     */
    override fun toResponse(exception: NotImplementedException): Response {
        val body = ErrorResponse(exception.message ?: "Provider does not implement this method")
        return Response.status(Response.Status.NOT_IMPLEMENTED)
            .entity(body)
            .build()
    }
}

View File

@@ -0,0 +1,19 @@
package com.rak.model.exception.mapper
import com.rak.model.ErrorResponse
import com.rak.model.exception.NotImplementedException
import com.rak.model.exception.TargetNotFoundException
import jakarta.ws.rs.core.Response
import jakarta.ws.rs.ext.ExceptionMapper
import jakarta.ws.rs.ext.Provider
/**
 * Maps [TargetNotFoundException] to a 404 response with an [ErrorResponse] body.
 */
@Provider
class TargetNotFoundExceptionMapper : ExceptionMapper<TargetNotFoundException> {

    /**
     * @param exception the exception raised by the application
     * @return a 404 response whose entity carries the exception message, or a generic
     * message when the exception has none
     */
    override fun toResponse(exception: TargetNotFoundException): Response {
        val body = ErrorResponse(exception.message ?: "Scrape target could not be found")
        val builder = Response.status(404)
        builder.entity(body)
        return builder.build()
    }
}

View File

@@ -1,4 +0,0 @@
package com.rak.model.scrape
/** Empty abstract base class for scrapers (removed by the change set shown here). */
abstract class AbstractScraper{
}

View File

@@ -1,6 +0,0 @@
package com.rak.model.scrape
/** Empty [AbstractScraper] subclass (removed by the change set shown here). */
class JsoupScraper : AbstractScraper() {
}

View File

@@ -1,5 +0,0 @@
package com.rak.model.scrape
/** A scrape job described by a single [url]. */
data class ScrapeJob(
val url: String,
)

View File

@@ -0,0 +1,15 @@
package com.rak.model.set
/**
 * A card set together with its regional variants.
 *
 * @property name name of the set
 * @property regionalSets regional sets belonging to this set
 */
data class CardSet(
    var name: String,
    val regionalSets: Set<RegionalSet>
) {
    companion object {
        /**
         * Builds a [CardSet] from extracted key/value pairs and already-built regional sets.
         *
         * @throws IllegalStateException when the `name` key is missing from [map]
         */
        fun fromMap(map: Map<String, String>, regionalSet: Set<RegionalSet>): CardSet {
            val setName = checkNotNull(map["name"]) { "Parameter 'name' not found" }
            return CardSet(name = setName, regionalSets = regionalSet)
        }
    }
}

View File

@@ -0,0 +1,27 @@
package com.rak.model.set
import com.rak.model.card.CardPrint
/**
 * A regional variant of a card set.
 *
 * @property prefix value read from the `prefix` key
 * @property region value read from the `region` key
 * @property regionCode value read from the `regionCode` key
 * @property cardPrints card prints contained in this regional set
 * @property numberOfCards number of card prints
 */
data class RegionalSet(
    val prefix: String,
    val region: String,
    val regionCode: String,
    val cardPrints: Collection<CardPrint>,
    val numberOfCards: Int
) {
    companion object {
        /**
         * Builds a [RegionalSet] from extracted key/value pairs and its card prints;
         * [numberOfCards] is set to the size of [cardPrints].
         *
         * @throws IllegalStateException when `prefix`, `region` or `regionCode` is missing
         */
        fun fromMap(map: Map<String, String>, cardPrints: Collection<CardPrint>): RegionalSet {
            // Keys are checked in the same order as the constructor parameters.
            val prefixValue = checkNotNull(map["prefix"]) { "Parameter 'prefix' not found" }
            val regionValue = checkNotNull(map["region"]) { "Parameter 'region' not found" }
            val regionCodeValue = checkNotNull(map["regionCode"]) { "Parameter 'regionCode' not found" }

            return RegionalSet(
                prefix = prefixValue,
                region = regionValue,
                regionCode = regionCodeValue,
                cardPrints = cardPrints,
                numberOfCards = cardPrints.size
            )
        }
    }
}

View File

@@ -0,0 +1,3 @@
package com.rak.model.transform
/** Member-less common supertype of [Transformation] and [ParameterizedTransformation]. */
interface AbstractTransformation

View File

@@ -0,0 +1,6 @@
package com.rak.model.transform
/** A string transformation that takes additional string parameters. */
@FunctionalInterface
fun interface ParameterizedTransformation : AbstractTransformation {
/** Transforms [input] using [parameters] and returns the result. */
fun apply(input: String, parameters: MutableList<String>): String
}

View File

@@ -0,0 +1,6 @@
package com.rak.model.transform
/** A string transformation without parameters. */
@FunctionalInterface
fun interface Transformation : AbstractTransformation {
/** Transforms [input] and returns the result. */
fun apply(input: String): String
}

View File

@@ -0,0 +1,83 @@
package com.rak.model.transform
import com.rak.config.model.TransformationStepConfig
import java.util.concurrent.ConcurrentHashMap
/**
 * Registry of named string transformations.
 *
 * Two kinds are kept in separate maps: plain [Transformation]s, which take only the input, and
 * [ParameterizedTransformation]s, which also take a parameter list. The built-ins `trim`,
 * `removeInnerQuotes`, `replace` and `regexReplace` are registered on construction.
 */
class TransformationRegistry {

    private val transformations: ConcurrentHashMap<String, Transformation> = ConcurrentHashMap()
    private val parameterizedTransformation: ConcurrentHashMap<String, ParameterizedTransformation> =
        ConcurrentHashMap()

    init {
        register("trim") { it.trim() }
        register("removeInnerQuotes") { it.replace(LEADING_QUOTE, "").replace(TRAILING_QUOTE, "") }
        register("replace") { input, parameters ->
            require(parameters.size == 1 || parameters.size == 2) {
                "'replace' requires either 1 or 2 parameters"
            }
            // A missing replacement means "remove"; the configuration-owned list is never mutated.
            input.replace(parameters[0], parameters.getOrElse(1) { "" })
        }
        register("regexReplace") { input, params ->
            require(params.size == 1 || params.size == 2) {
                "'regexReplace' requires either 1 or 2 parameters"
            }
            input.replace(params[0].toRegex(), params.getOrElse(1) { "" })
        }
    }

    /** Registers (or replaces) the parameterless transformation known as [name]. */
    fun register(name: String, transformation: Transformation) {
        transformations[name] = transformation
    }

    /** Registers (or replaces) the parameterized transformation known as [name]. */
    fun register(name: String, transformation: ParameterizedTransformation) {
        parameterizedTransformation[name] = transformation
    }

    /**
     * Resolves the transformation referenced by [transformationStep] and validates its
     * parameters.
     *
     * @throws IllegalArgumentException when the name is unknown, when parameters are given to a
     * parameterless transformation, or when a parameterized transformation has no parameters
     * (absent or empty list)
     */
    fun getTransformation(transformationStep: TransformationStepConfig): AbstractTransformation {
        val name = transformationStep.name()
        val parameters = transformationStep.parameters()
        val hasParameters = parameters.isPresent && parameters.get().isNotEmpty()

        val plain = transformations[name]
        val parameterized = parameterizedTransformation[name]
        return when {
            plain != null -> {
                require(!hasParameters) { "'$name' doesn't accept parameters" }
                plain
            }

            parameterized != null -> {
                // An absent list is rejected too; otherwise the later Optional.get() would fail.
                require(hasParameters) { "'$name' requires parameters" }
                parameterized
            }

            else -> throw IllegalArgumentException("Unknown transformation: '$name'")
        }
    }

    /**
     * Applies [steps] to [input] in order, feeding each result into the next step.
     *
     * @return the transformed value, or [input] unchanged when [steps] is empty
     * @throws IllegalArgumentException when a step fails the checks of [getTransformation]
     */
    fun applyTransformations(input: String, steps: List<TransformationStepConfig>): String =
        steps.fold(input) { current, step ->
            when (val transformation = getTransformation(step)) {
                is Transformation -> transformation.apply(current)
                is ParameterizedTransformation -> transformation.apply(current, step.parameters().get())
                else -> throw IllegalStateException("Invalid transformation type")
            }
        }

    private companion object {
        // Compiled once instead of on every `removeInnerQuotes` invocation.
        private val LEADING_QUOTE = Regex("^\"")
        private val TRAILING_QUOTE = Regex("\"$")
    }
}

View File

@@ -0,0 +1,92 @@
package com.rak.service
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.cc.CCIndices
import com.rak.service.client.CommonCrawlRestClient
import io.netty.buffer.ByteBufInputStream
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped
import org.archive.format.http.HttpResponseParser
import org.archive.io.warc.WARCReaderFactory
import org.eclipse.microprofile.rest.client.inject.RestClient
import org.jsoup.helper.DataUtil
import org.jsoup.nodes.Document
/**
 * Looks up URLs in the crawl indices listed in [CCIndices] and loads the archived page of an
 * index entry as a jsoup [Document].
 */
@ApplicationScoped
class CommonCrawlService(
    @RestClient
    private val commonCrawlRestClient: CommonCrawlRestClient
) {

    companion object {
        private const val INDEX_QUERY_URL: String = "http://index.commoncrawl.org"
        private const val DATA_URL: String = "http://data.commoncrawl.org"
    }

    /** Queries the single index [CCIndices.CC_2024_46] for [url]. */
    fun queryIndex(
        url: String
    ): CCIndexSuccessResponse {
        return commonCrawlRestClient.queryIndex(
            INDEX_QUERY_URL,
            url,
            CCIndices.CC_2024_46.indexName
        )
    }

    /**
     * Queries every index in [CCIndices] for [url].
     *
     * A failing query is logged as a warning and skipped, so the result holds only the
     * successful responses and may be empty.
     */
    fun queryAllCrawlIndices(
        url: String
    ): List<CCIndexSuccessResponse> {
        val responses = mutableListOf<CCIndexSuccessResponse>()
        for (crawlName in CCIndices.entries) {
            try {
                responses.add(
                    commonCrawlRestClient.queryIndex(
                        INDEX_QUERY_URL,
                        url,
                        crawlName.indexName
                    )
                )
            } catch (ex: RuntimeException) {
                Log.warn("Error occurred querying crawl '${crawlName.indexName}' for URL $url", ex)
            }
        }
        return responses
    }

    /**
     * Downloads the archive slice described by [ccIndexSuccessResponse] and parses the HTTP
     * response in its first record into a [Document].
     *
     * The reader is identified by the entry's own file name instead of a hard-coded one, and
     * the downloaded stream is closed once parsing has finished.
     *
     * @param baseUri base URI handed to jsoup for resolving relative links
     * @return the parsed document, or `null` when the archive slice contains no record
     */
    fun getDocument(
        ccIndexSuccessResponse: CCIndexSuccessResponse,
        baseUri: String
    ): Document? {
        val warcStream: ByteBufInputStream = commonCrawlRestClient.getWarcArchive(
            DATA_URL,
            ccIndexSuccessResponse.fileName,
            ccIndexSuccessResponse.length,
            ccIndexSuccessResponse.offset
        )

        return warcStream.use { stream ->
            val reader = WARCReaderFactory.get(
                ccIndexSuccessResponse.fileName,
                stream,
                true
            )
            // Only the first record is read, as before.
            val firstRecord = reader.firstOrNull() ?: return@use null
            val httpResponse = HttpResponseParser().parse(firstRecord.buffered())
            // NOTE(review): the charset is fixed to UTF-8; the response's Content-Type is not consulted.
            DataUtil.load(
                httpResponse.buffered(),
                "UTF-8",
                baseUri
            )
        }
    }
}

View File

@@ -1,29 +0,0 @@
package com.rak.service
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
/**
* REST client for the `/extensions` endpoint below `https://stage.code.quarkus.io/api`
* (removed by the change set shown here).
*
* Usage via injection:
*
* ```kotlin
* @Inject
* @RestClient
* lateinit var myRemoteService: MyRemoteService
*
* fun doSomething() {
* val restClientExtensions = myRemoteService.getExtensionsById("io.quarkus:quarkus-rest-client")
* }
* ```
*/
@RegisterRestClient(baseUri = "https://stage.code.quarkus.io/api")
interface MyRemoteService {
/** Issues `GET /extensions` with [id] as the `id` query parameter. */
@GET
@Path("/extensions")
fun getExtensionsById(@QueryParam("id") id: String): Set<Extension>
/** Extension entry as returned by [getExtensionsById]. */
data class Extension(val id: String, val name: String, val shortName: String, val keywords: List<String>)
}
}

View File

@@ -1,89 +1,89 @@
package com.rak.service
import com.rak.config.RegionalSetDefinition
import com.rak.config.SourcesConfiguration
import com.rak.config.Step
import com.rak.util.XPathUtil
import com.rak.config.model.ProviderConfig
import com.rak.model.card.Card
import com.rak.model.exception.NotImplementedException
import com.rak.model.exception.TargetNotFoundException
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import com.rak.service.extract.RegionalSetExtractionService
import com.rak.service.extract.CardSetExtractionService
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import java.lang.Exception
@ApplicationScoped
class ScrapeService(
private val sourceService: SourceService
private val sourceService: SourceService,
private val cardSetExtractionService: CardSetExtractionService,
private val regionalSetExtractionService: RegionalSetExtractionService,
private val commonCrawlService: CommonCrawlService
) {
companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("text\\(\\)$")
fun ProviderConfig.buildUrl(targetName: String): String {
return this.getUrlPattern().format(targetName)
}
private fun extractTextFromRootBySteps(
root: Element,
steps: Set<Step>
): String? {
var currentElement: Element? = root.clone()
var result: String? = null
fun scrapeSet(
provider: String,
setName: String,
): CardSet {
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
for (index in 0 until steps.size) {
val currentStep = steps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
val path: String = normalizePath(setName)
val url = source.buildUrl(path)
val ccIndexResponses = commonCrawlService.queryAllCrawlIndices(url).sortedBy { it.timestamp }
if (index == steps.size - 1) {
result = XPathUtil.extractResult(currentElement, currentStep.value())
}
else {
currentElement = XPathUtil.getNextElement(currentElement, currentStep.value())
var document: Document? = null
for (indexResponse in ccIndexResponses) {
document = commonCrawlService.getDocument(
indexResponse,
source.getDomain()
)
if (document != null) {
break
}
}
if (document == null) {
// Fallback to Jsoup directly
try {
document = Jsoup.connect(url).get()
} catch(ex: Exception) {
Log.warn("Error occurred during Jsoup query", ex)
throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'")
}
}
return result
return cardSetExtractionService.extract(
document,
source,
source.getTargets().getSetConfig().get()
)
}
fun extractSet(
fun scrapeRegionalSet(
provider: String,
setName: String,
): Map<String, String> {
val source =
sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
): RegionalSet {
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
val path: String = normalizePath(setName)
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
val regionalSetSelector = source.getItems().regionalSet().get()
val regionalSetRoot = document.selectFirst(regionalSetSelector.rootSelector().value())!!
val setId: String? = extractTextFromRootBySteps(
regionalSetRoot,
regionalSetSelector.idSelector().steps()
)
val setLanguage: String? = extractTextFromRootBySteps(
regionalSetRoot,
regionalSetSelector.languageSelector().steps()
)
val setKey: String? = extractTextFromRootBySteps(
regionalSetRoot,
regionalSetSelector.regionKeySelector().steps()
)
return mapOf(
Pair("id", setId ?: "N/A"),
Pair("language", setLanguage ?: "N/A"),
Pair("key", setKey ?: "N/A"),
)
return regionalSetExtractionService.extract(document, source, source.getTargets().getRegionalSetConfig().get())
}
fun extractCard(
fun scrapeCard(
provider: String,
cardName: String,
): Map<String, String> {
val path: String = normalizePath(cardName)
return mapOf()
): Card? {
throw NotImplementedException("Not implemented")
}
// Placeholder normalization: the given path is currently returned unchanged.
private fun normalizePath(path: String): String = path

View File

@@ -1,15 +1,86 @@
package com.rak.service
import com.rak.config.SourceConfig
import com.rak.config.SourcesConfiguration
import com.rak.config.model.CardScrapeTargetConfig
import com.rak.config.model.RegionalSetScrapeTargetConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.SourcesConfig
import com.rak.model.exception.InvalidConfigurationException
import io.quarkus.runtime.Startup
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped
@Startup
@ApplicationScoped
class SourceService (
val sourcesConfiguration: SourcesConfiguration
class SourceService(
val sourcesConfiguration: SourcesConfig
) {
fun getSources(): Set<SourceConfig> = sourcesConfiguration.getSources().toSet()
fun getSourceById(id: String): SourceConfig? = getSources().firstOrNull { it.getId() == id }
/**
 * Runs after construction and validates every configured source.
 * Any validation failure propagates out of this method.
 */
@PostConstruct
fun init() {
    for (source in sourcesConfiguration.getSources()) {
        validateSource(source)
    }
}
/**
 * Validates the regional-set and card target configurations of one provider.
 * A target that is not configured is skipped.
 */
private fun validateSource(providerConfig: ProviderConfig) {
    val targets = providerConfig.getTargets()
    targets.getRegionalSetConfig().ifPresent { regionalSetConfig ->
        validateSetExtractConfig(regionalSetConfig)
    }
    targets.getCardConfig().ifPresent { cardConfig ->
        validateCardExtractConfig(cardConfig)
    }
}
/**
 * Checks that a regional-set target uses either one global extraction root
 * or a dedicated root on every field, never a mix of both.
 *
 * @throws InvalidConfigurationException when the root configuration is inconsistent
 */
private fun validateSetExtractConfig(setExtractConfig: RegionalSetScrapeTargetConfig) {
    val fieldConfigs = listOf(
        setExtractConfig.getLanguageConfig(),
        setExtractConfig.getIdConfig(),
        setExtractConfig.getRegionKeyConfig()
    )
    val hasGlobalRoot = setExtractConfig.getRootConfig().isPresent
    val dedicatedRootCount = fieldConfigs.count { it.getRootConfig().isPresent }

    // A global root and dedicated per-field roots are mutually exclusive.
    if (hasGlobalRoot && dedicatedRootCount > 0) {
        throw InvalidConfigurationException(
            "Dedicated extraction roots cannot be set when a global extraction root is configured"
        )
    }
    // Without a global root, every single field needs its own root.
    if (!hasGlobalRoot && dedicatedRootCount < fieldConfigs.size) {
        throw InvalidConfigurationException(
            "Dedicated extraction roots have to be set when a global extraction root is not configured"
        )
    }
}
/**
 * Checks that a card target uses either one global extraction root
 * or a dedicated root on every field, never a mix of both.
 *
 * @throws InvalidConfigurationException when the root configuration is inconsistent
 */
private fun validateCardExtractConfig(cardScrapeTargetConfig: CardScrapeTargetConfig) {
    val fieldRoots = listOf(
        cardScrapeTargetConfig.getEnglishNameConfig(),
        cardScrapeTargetConfig.getDescriptionConfig(),
        cardScrapeTargetConfig.getCardTypeConfig(),
        cardScrapeTargetConfig.getAttackConfig(),
        cardScrapeTargetConfig.getDefenseConfig(),
    ).map { fieldConfig -> fieldConfig.getRootConfig() }

    when {
        cardScrapeTargetConfig.getRootConfig().isPresent -> {
            // Global root configured: no field may bring its own root.
            if (!fieldRoots.none { it.isPresent }) {
                throw InvalidConfigurationException(
                    "Dedicated extraction roots cannot be set when a global extraction root is configured"
                )
            }
        }
        // No global root: every field has to bring its own root.
        !fieldRoots.all { it.isPresent } -> throw InvalidConfigurationException(
            "Dedicated extraction roots have to be set when a global extraction root is not configured"
        )
    }
}
/** Returns the configured sources as a set. */
fun getSources(): Set<ProviderConfig> {
    return sourcesConfiguration.getSources().toSet()
}
/** Returns the first source whose id equals [id], or `null` when none matches. */
fun getSourceById(id: String): ProviderConfig? {
    return getSources().find { source -> source.getId() == id }
}
}

View File

@@ -0,0 +1,57 @@
package com.rak.service.client
import com.rak.util.NDJsonReader
import com.rak.model.cc.CCIndexSuccessResponse
import io.netty.buffer.ByteBufInputStream
import io.quarkus.rest.client.reactive.ClientQueryParam
import io.quarkus.rest.client.reactive.NotBody
import io.quarkus.rest.client.reactive.Url
import io.smallrye.faulttolerance.api.RateLimit
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.PathParam
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.faulttolerance.Bulkhead
import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam
import org.eclipse.microprofile.rest.client.annotation.RegisterProvider
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
import java.time.temporal.ChronoUnit
/**
 * REST client with two GET operations: querying an index endpoint and fetching
 * a byte range of an archive file. Index responses are decoded through the
 * registered [NDJsonReader].
 *
 * Both methods receive the base URL per call through their `@Url` parameter.
 * NOTE(review): the `baseUri` value on the annotation looks like a placeholder
 * that is expected to be overridden by `@Url` on every call - confirm.
 */
@RegisterRestClient(baseUri = "whatever")
@RegisterProvider(NDJsonReader::class)
interface CommonCrawlRestClient {
/**
 * Queries `/{index}-index` with the query parameters `output=json` and `url=<queryUrl>`.
 *
 * NOTE(review): the rate limit sets `value = 1` and `minSpacing = 5` without
 * explicit units, so the annotation's default units apply. `ChronoUnit` is
 * imported by this file but not used here - confirm the intended units.
 *
 * @param baseUrl base URL the request is sent to
 * @param queryUrl value of the `url` query parameter
 * @param indexName value substituted for `{index}` in the path
 */
@GET
@ClientQueryParam(name = "output", value = ["json"])
@Path("/{index}-index")
@Consumes("text/x-ndjson")
@RateLimit(
value = 1,
minSpacing = 5
)
@Bulkhead
fun queryIndex(
@Url
baseUrl: String,
@QueryParam("url")
queryUrl: String,
@PathParam("index")
indexName: String
): CCIndexSuccessResponse
/**
 * Fetches `/{fileName}` with a `Range` header computed by
 * `com.rak.util.HttpUtil.computeHeader`.
 *
 * That helper reads the method parameters at positions 2 and 3, so
 * [fileLength] and [fileOffset] must stay the third and fourth parameters.
 * NOTE(review): both are marked `@NotBody`, presumably so they only feed the
 * header computation and are not sent as the request body - confirm.
 *
 * @param baseUrl base URL the request is sent to
 * @param fileName value substituted for `{fileName}` in the path
 * @param fileLength number of bytes to request
 * @param fileOffset offset of the first requested byte
 */
@GET
@Path("/{fileName}")
@ClientHeaderParam(name = "Range", value = ["{com.rak.util.HttpUtil.computeHeader}"])
fun getWarcArchive(
@Url
baseUrl: String,
@PathParam("fileName")
fileName: String,
@NotBody
fileLength: Int,
@NotBody
fileOffset: Int
): ByteBufInputStream
}

View File

@@ -0,0 +1,263 @@
package com.rak.service.extract
import com.rak.config.model.*
import com.rak.model.Selector
import com.rak.model.exception.ElementNotFoundException
import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.exception.ValueValidationException
import com.rak.model.transform.TransformationRegistry
import com.rak.util.CssUtil
import com.rak.util.XPathUtil
import io.quarkus.logging.Log
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
import java.util.*
import kotlin.jvm.optionals.getOrElse
// find root element from global or node config
// get field target configs as list
// extract item from root element via field config
abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
private val transformationRegistry = TransformationRegistry()
abstract fun T.getItems(): Map<String, ScrapeTargetFieldConfig>
abstract fun extract(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: T
): E
abstract fun extractMultiple(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: T
): List<E>
abstract fun extractNestedMultiples(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: T
): List<List<E>>
/**
 * Resolves the first root element selected by the global root configuration
 * or, when that is absent, by the node-specific root configuration.
 *
 * @param element element the root selector is evaluated against
 * @param globalRootExtractConfig root configuration shared by all fields, if any
 * @param nodeRootExtractConfig root configuration dedicated to one field, if any
 * @return the first element matched by the effective root selector
 * @throws ElementNotFoundException when the effective root selector matches nothing
 * @throws InvalidConfigurationException when neither root configuration is present
 */
fun getRootElement(
    element: Element,
    globalRootExtractConfig: Optional<ExtractConfig>,
    nodeRootExtractConfig: Optional<ExtractConfig>
): Element {
    return getRootElements(
        element,
        globalRootExtractConfig,
        nodeRootExtractConfig
    ).firstOrNull()
        // Previously thrown with an empty message, which made failures hard to trace.
        ?: throw ElementNotFoundException("Configured extraction root did not match any element")
}
/**
 * Resolves all root elements selected by the effective root configuration.
 * The global configuration wins; the node-specific one is only used when the
 * global one is absent.
 *
 * @param element element the root selector is evaluated against
 * @param globalRootExtractConfig root configuration shared by all fields, if any
 * @param nodeRootExtractConfig root configuration dedicated to one field, if any
 * @return every element matched by the effective root selector (possibly empty)
 * @throws InvalidConfigurationException when neither root configuration is present
 */
fun getRootElements(
    element: Element,
    globalRootExtractConfig: Optional<ExtractConfig>,
    nodeRootExtractConfig: Optional<ExtractConfig>
): Elements {
    val rootExtractConfig: ExtractConfig = globalRootExtractConfig.getOrElse {
        nodeRootExtractConfig.orElseThrow {
            // Previously thrown with an empty message, which made failures hard to trace.
            InvalidConfigurationException(
                "Neither a global nor a dedicated extraction root is configured"
            )
        }
    }

    return getElementsFromElementByExtractConfig(
        element,
        rootExtractConfig
    )
}
/**
 * Returns the first element selected by [step] below [element],
 * or `null` when the selector matches nothing.
 */
protected fun getElementFromDocumentByExtractConfig(
    element: Element,
    step: ExtractConfig,
): Element? {
    val matches = getElementsFromElementByExtractConfig(element, step)
    return matches.firstOrNull()
}
/**
 * Evaluates the query of [step] against [element]: as a CSS query when the
 * selector type is [Selector.CSS], as an XPath query otherwise.
 */
protected fun getElementsFromElementByExtractConfig(
    element: Element,
    step: ExtractConfig,
): Elements = when (step.selectorType()) {
    Selector.CSS -> element.select(step.getQueryString())
    else -> element.selectXpath(step.getQueryString())
}
/**
 * Extracts one value per configured field from [document].
 *
 * For every field, the root element is resolved first and the field's text is
 * then extracted relative to it.
 *
 * @return field identifier mapped to the extracted text, in configuration order
 * @throws ElementNotFoundException when a root or a field value cannot be found
 */
protected fun extractSingle(
    document: Element,
    extractionConfig: T
): Map<String, String> =
    extractionConfig.getItems().mapValues { (identifier, fieldConfig) ->
        val fieldRoot = getRootElement(
            document,
            extractionConfig.getRootConfig(),
            fieldConfig.getRootConfig()
        )
        extractTextFromElementByTargetFieldConfig(fieldRoot, fieldConfig)
            ?: throw ElementNotFoundException("Could not find element for '$identifier'")
    }
/**
 * Extracts a list of records from [element]. Every root element matched for a
 * field contributes that field's value to the record at the same position.
 *
 * A field whose value cannot be extracted is stored as an empty string when
 * the field is nullable; otherwise extraction fails.
 *
 * @return one map per root position, keyed by field identifier
 * @throws ElementNotFoundException when a non-nullable field yields no value
 */
fun extractMulti(
    element: Element,
    extractionConfig: T
): List<Map<String, String>> {
    val resultList = mutableListOf<MutableMap<String, String>>()

    extractionConfig.getItems().forEach { (identifier, fieldConfig) ->
        val rootElements = getRootElements(
            element,
            extractionConfig.getRootConfig(),
            fieldConfig.getRootConfig()
        )

        rootElements.forEachIndexed { index, rootElement ->
            val extractedText = extractTextFromElementByTargetFieldConfig(
                rootElement,
                fieldConfig
            ) ?: if (fieldConfig.isNullable()) {
                ""
            } else {
                throw ElementNotFoundException("Could not find element for '$identifier'")
            }

            // Positions are visited in ascending order, so a missing record is
            // always the next one to append. This replaces the former
            // IndexOutOfBoundsException-driven control flow.
            val record = resultList.getOrNull(index)
                ?: mutableMapOf<String, String>().also { resultList.add(it) }
            record[identifier] = extractedText
        }
    }

    return resultList
}
/**
 * Splits [element] into groups using the discriminator's root selector and
 * runs [extractMulti] on every group.
 *
 * @return one list of records per discriminator group, in document order
 * @throws InvalidConfigurationException when no discriminator, or no
 *         discriminator root, is configured
 */
fun extractMultiWithDiscriminator(
    element: Element,
    extractionConfig: T
): List<List<Map<String, String>>> {
    // Fail with a descriptive configuration error instead of a bare Optional.get() failure.
    val discriminator = extractionConfig.getDiscriminator().orElseThrow {
        InvalidConfigurationException(
            "A discriminator has to be configured to extract discriminated groups"
        )
    }
    val groupRoots = getRootElements(
        element,
        discriminator.getRootConfig(),
        Optional.empty<ExtractConfig>()
    )

    // A distinct lambda parameter name avoids shadowing the `element` parameter.
    return groupRoots.map { groupRoot -> extractMulti(groupRoot, extractionConfig) }
}
/**
 * Extracts the text of one field relative to [root].
 *
 * The configured extraction methods are tried in order. Within a method, every
 * step but the last navigates to the next element and the last step produces
 * the text. The text is validated first and transformed afterwards. The first
 * method that yields a validated value wins. A method that fails (missing
 * element, missing result, validation failure) is logged at debug level and
 * the next one is tried.
 *
 * @return the extracted and transformed text; the configured fallback default
 *         when no method succeeded; `null` when there is neither
 */
private fun extractTextFromElementByTargetFieldConfig(
    root: Element,
    extractionConfig: ScrapeTargetFieldConfig
): String? {
    val extractionMethods = extractionConfig.getExtractionMethods()
    var result: String? = null

    for (extractionMethod in extractionMethods) {
        val extractionSteps = extractionMethod.getExtractionSteps()
        val transformationSteps = extractionMethod.getOptionalTransformationSteps()
        // Every method starts from its own copy of the root element.
        var currentElement: Element? = root.clone()
        var intermediateResult: String? = null

        try {
            for (index in 0 until extractionSteps.size) {
                val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
                // A null element means the previous navigation step matched nothing.
                val stepRoot = checkNotNull(currentElement) {
                    "Extraction step ${index - 1} matched no element, cannot apply step $index"
                }

                if (index == extractionSteps.size - 1) {
                    intermediateResult = when (currentStep.selectorType()) {
                        Selector.CSS -> CssUtil.extractResult(stepRoot, currentStep.getQueryString())
                        Selector.XPATH -> XPathUtil.extractResult(stepRoot, currentStep.getQueryString())
                    }
                } else {
                    currentElement = when (currentStep.selectorType()) {
                        Selector.CSS -> CssUtil.getNextElement(stepRoot, currentStep.getQueryString())
                        Selector.XPATH -> XPathUtil.getNextElement(stepRoot, currentStep.getQueryString())
                    }
                }
            }

            val extracted = intermediateResult
                ?: throw ElementNotFoundException("Result could not be extracted")

            // Validation sees the raw value; transformations run only afterwards.
            validateValue(extracted, extractionConfig.getOptionalValidation())

            result = if (transformationSteps.isPresent) {
                transformationRegistry.applyTransformations(extracted, transformationSteps.get())
            } else {
                extracted
            }
            break
        } catch (ex: RuntimeException) {
            when (ex) {
                // Expected per-method failures: log and fall through to the next method.
                is ElementNotFoundException,
                is IllegalStateException,
                is ValueValidationException -> Log.debug(ex.message)
                else -> throw ex
            }
        }
    }

    if (result == null && extractionConfig.getFallbackConfiguration().isPresent) {
        result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
    }

    return result
}
/**
 * Validates [value] against every configured regular expression.
 * The value has to match all patterns in full. Nothing is checked when no
 * validation is configured.
 *
 * @throws ValueValidationException when at least one pattern does not match
 */
private fun validateValue(value: String, validationConfig: Optional<ValidationConfig>) {
    if (!validationConfig.isPresent) {
        return
    }

    // `all` stops at the first failing pattern instead of evaluating the rest.
    val matchesAllPatterns = validationConfig.get().getRegexPatterns().all { pattern ->
        value.matches(pattern.toRegex())
    }

    if (!matchesAllPatterns) {
        throw ValueValidationException("'$value' does not validate against RegEx(s)")
    }
}
}

View File

@@ -0,0 +1,53 @@
package com.rak.service.extract
import com.rak.config.model.CardPrintScrapeTargetConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element
/**
 * Extraction service for [CardPrint]s. Only grouped (nested) extraction is
 * implemented; single and flat multi extraction throw [NotImplementedException].
 */
@ApplicationScoped
class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrintScrapeTargetConfig>() {

    /** Maps the record keys used for [CardPrint] to their field configurations. */
    override fun CardPrintScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> = mapOf(
        "id" to getIdConfig(),
        "name" to getNameConfig(),
        "regionalName" to getRegionNameConfig(),
        "rarity" to getRarityConfig(),
    )

    /** Not implemented. */
    override fun extract(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: CardPrintScrapeTargetConfig
    ): CardPrint = throw NotImplementedException("Not implemented")

    /** Not implemented. */
    override fun extractMultiple(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: CardPrintScrapeTargetConfig
    ): List<CardPrint> = throw NotImplementedException("Not implemented")

    /**
     * Extracts card prints grouped by the configured discriminator and converts
     * every extracted record into a [CardPrint].
     */
    override fun extractNestedMultiples(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: CardPrintScrapeTargetConfig
    ): List<List<CardPrint>> {
        val groupedRecords = extractMultiWithDiscriminator(element, extractionConfig)
        return groupedRecords.map { group ->
            group.map { record -> CardPrint.fromMap(record) }
        }
    }
}

View File

@@ -0,0 +1,54 @@
package com.rak.service.extract
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.SetScrapeTargetConfig
import com.rak.model.exception.NotImplementedException
import com.rak.model.set.CardSet
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element
/**
 * Extraction service for [CardSet]s. A card set is built from its own fields
 * plus the regional sets extracted from the same element. Multi and nested
 * extraction throw [NotImplementedException].
 */
@ApplicationScoped
class CardSetExtractionService(
    private val regionalSetExtractionService: RegionalSetExtractionService
) : AbstractExtractionService<CardSet, SetScrapeTargetConfig>() {

    /** Maps the record keys used for [CardSet] to their field configurations. */
    override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> =
        mapOf("name" to getNameConfig())

    /**
     * Extracts the set's own fields first, then its regional sets using the
     * provider's regional-set configuration, and combines both into a [CardSet].
     */
    override fun extract(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): CardSet {
        val setFields = extractSingle(element, extractionConfig)
        val regionalSets = regionalSetExtractionService.extractMultiple(
            element,
            providerConfig,
            providerConfig.getTargets().getRegionalSetConfig().get()
        ).toSet()

        return CardSet.fromMap(setFields, regionalSets)
    }

    /** Not implemented. */
    override fun extractMultiple(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): List<CardSet> = throw NotImplementedException("Not implemented")

    /** Not implemented. */
    override fun extractNestedMultiples(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): List<List<CardSet>> = throw NotImplementedException("Not implemented")
}

View File

@@ -0,0 +1,62 @@
package com.rak.service.extract
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.RegionalSetScrapeTargetConfig
import com.rak.config.model.SourcesConfig
import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException
import com.rak.model.set.RegionalSet
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element
/**
 * Extraction service for [RegionalSet]s. Only flat multi extraction is
 * implemented; single and nested extraction throw [NotImplementedException].
 */
@ApplicationScoped
class RegionalSetExtractionService(
    private val cardPrintExtractionService: CardPrintExtractionService,
    private val sourcesConfig: SourcesConfig
) : AbstractExtractionService<RegionalSet, RegionalSetScrapeTargetConfig>() {

    /** Maps the record keys used for [RegionalSet] to their field configurations. */
    override fun RegionalSetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> = mapOf(
        "prefix" to getIdConfig(),
        "regionCode" to getRegionKeyConfig(),
        "region" to getLanguageConfig(),
    )

    /** Not implemented. */
    override fun extract(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: RegionalSetScrapeTargetConfig
    ): RegionalSet = throw NotImplementedException("Not implemented")

    /**
     * Extracts all regional sets and attaches to each the card print group at
     * the same position. A regional set without a matching group gets an empty
     * list of card prints.
     */
    override fun extractMultiple(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: RegionalSetScrapeTargetConfig
    ): List<RegionalSet> {
        val regionalSetRecords = extractMulti(element, extractionConfig)
        val printGroups: List<List<CardPrint>> = cardPrintExtractionService.extractNestedMultiples(
            element,
            providerConfig,
            providerConfig.getTargets().getCardPrintConfiguration().get()
        )

        return regionalSetRecords.withIndex().map { (position, record) ->
            RegionalSet.fromMap(record, printGroups.getOrNull(position).orEmpty())
        }
    }

    /** Not implemented. */
    override fun extractNestedMultiples(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: RegionalSetScrapeTargetConfig
    ): List<List<RegionalSet>> = throw NotImplementedException("Not implemented")
}

View File

@@ -0,0 +1,19 @@
package com.rak.util
import org.jsoup.nodes.Element
/** CSS-selector helpers for jsoup elements. Not instantiable. */
class CssUtil private constructor() {
    companion object {
        /** Returns the first element matched by the CSS query [path] below [element], or `null`. */
        fun getNextElement(element: Element, path: String): Element? =
            element.select(path).firstOrNull()

        /** Returns the text of the first element matched by the CSS query [path] below [root], or `null`. */
        fun extractResult(root: Element, path: String): String? {
            val firstMatch = root.select(path).firstOrNull()
            return firstMatch?.text()
        }
    }
}

View File

@@ -0,0 +1,23 @@
package com.rak.util
import io.quarkus.rest.client.reactive.ComputedParamContext
/** HTTP helpers used by the REST clients. */
class HttpUtil {
    companion object {
        private const val HEADER_FORMAT_STRING: String = "bytes=%d-%d"

        // Positions of the length/offset arguments in the annotated client method.
        private const val FILE_LENGTH_PARAMETER_INDEX: Int = 2
        private const val FILE_OFFSET_PARAMETER_INDEX: Int = 3

        /**
         * Computes the value of an HTTP `Range` header from the invoked client
         * method's parameters: the third parameter is read as the length and
         * the fourth as the offset.
         *
         * The range end is inclusive, hence `offset + length - 1`. The
         * arithmetic is done in `Long` so the sum cannot overflow for large
         * offsets.
         *
         * @param context invocation context providing the method parameters
         * @return header value of the form `bytes=<first>-<last>`
         * @throws IndexOutOfBoundsException when the method has fewer than four parameters
         * @throws NumberFormatException when a parameter is not an integer
         */
        @JvmStatic
        fun computeHeader(context: ComputedParamContext): String {
            val parameters = context.methodParameters()
            val fileLength = parameters[FILE_LENGTH_PARAMETER_INDEX].value().toString().toLong()
            val fileOffset = parameters[FILE_OFFSET_PARAMETER_INDEX].value().toString().toLong()
            return HEADER_FORMAT_STRING.format(fileOffset, fileOffset + fileLength - 1)
        }
    }
}

View File

@@ -0,0 +1,44 @@
package com.rak.util
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.rak.model.cc.CCIndexSuccessResponse
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.core.MediaType
import jakarta.ws.rs.core.MultivaluedMap
import jakarta.ws.rs.ext.MessageBodyReader
import jakarta.ws.rs.ext.Provider
import java.io.BufferedReader
import java.io.InputStream
import java.io.InputStreamReader
import java.lang.reflect.Type
/**
 * Message body reader for `text/x-ndjson` responses that are mapped to
 * [CCIndexSuccessResponse].
 *
 * Only the first non-blank line of the body is parsed; any further NDJSON
 * records are ignored.
 */
@Provider
@Consumes("text/x-ndjson") // Handles NDJSON content
class NDJsonReader : MessageBodyReader<CCIndexSuccessResponse> {
    private val objectMapper = jacksonObjectMapper().registerModule(JavaTimeModule())

    /** Accepts exactly [CCIndexSuccessResponse] as the target type. */
    override fun isReadable(
        type: Class<*>?,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?
    ): Boolean {
        return type == CCIndexSuccessResponse::class.java
    }

    /**
     * Parses the first non-blank line of [entityStream] as a
     * [CCIndexSuccessResponse]. The stream is closed afterwards.
     *
     * @throws java.io.IOException when the body contains no record or the
     *         record cannot be parsed
     */
    override fun readFrom(
        type: Class<CCIndexSuccessResponse>,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?,
        httpHeaders: MultivaluedMap<String, String>?,
        entityStream: InputStream
    ): CCIndexSuccessResponse {
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        BufferedReader(InputStreamReader(entityStream, Charsets.UTF_8)).use { reader ->
            // An empty body used to hand `null` to Jackson; fail with a clear I/O error instead.
            val firstRecord = reader.lineSequence().firstOrNull { it.isNotBlank() }
                ?: throw java.io.IOException("NDJSON response did not contain any record")
            return objectMapper.readValue(firstRecord, CCIndexSuccessResponse::class.java)
        }
    }
}

View File

@@ -3,12 +3,16 @@ package com.rak.util
import com.rak.model.XPathTarget
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements
import java.util.regex.Pattern
import kotlin.coroutines.CoroutineContext
class XPathUtil private constructor() {
companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
private val INDEX_MATCHER: Regex = Regex("\\[(\\w)\\]")
private fun extractTextFromAttribute(root: Element, xpath: String): String? {
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
@@ -20,14 +24,29 @@ class XPathUtil private constructor() {
}
}
private fun extractTextFromNode(root: Element, xpath: String): String? {
return root
.selectXpath(xpath, TextNode::class.java)
.firstOrNull()?.text()
/**
 * Evaluates [xpath] against [element] with support for a single-character
 * index in square brackets, e.g. `./td/a[0]`.
 *
 * When a numeric index is present, the part of the expression before the index
 * is evaluated and the match at that position is returned. The index is used
 * as a zero-based list index (not XPath's native one-based position), and
 * anything following the index is ignored. An index beyond the last match
 * yields an empty result. Expressions without such an index, or with a
 * non-numeric bracket such as `[a]`, are evaluated as plain XPath.
 */
private fun selectXpath(element: Element, xpath: String): Elements {
    // toIntOrNull: INDEX_MATCHER also matches non-digit word characters, which
    // previously crashed with a NumberFormatException.
    val index = INDEX_MATCHER.find(xpath)?.groupValues?.get(1)?.toIntOrNull()
        ?: return element.selectXpath(xpath)

    val xpathBeforeIndex = xpath.substringBefore("[$index]")
    val selected = element.selectXpath(xpathBeforeIndex).getOrNull(index)

    return if (selected != null) Elements(selected) else Elements()
}
fun getNextElement(root: Element, path: String): Element? {
return root.selectXpath(path).firstOrNull()
/**
 * Strips `/text()` from [xpath], selects the remaining expression below
 * [root], and returns the text of the selection.
 */
private fun extractTextFromNode(root: Element, xpath: String): String? {
    val elementXpath = xpath.replace("/text()", "")
    val selection = root.selectXpath(elementXpath)
    return selection.text()
}
/** Returns the first element selected by [path] below [element], or `null` when nothing matches. */
fun getNextElement(element: Element, path: String): Element? =
    selectXpath(element, path).firstOrNull()
fun extractResult(root: Element, path: String): String? {

View File

@@ -1 +1,2 @@
com.rak.config.converter.TypeSelectorConverter
com.rak.config.converter.DiscriminatorDirectionConverter

View File

@@ -1,48 +1,204 @@
quarkus:
container-image:
registry: gitea.smoothbrain.win
group: rak
build: true
additional-tags: latest
http:
port: 8081
live-reload:
instrumentation: true
scraper:
sources:
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
url-patterns:
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
selectors:
card:
name:
steps:
- type: "css"
value: "h1.product-title"
- type: "xpath"
value: "//h1[@itemprop='name']"
attack:
steps:
- type: "css"
value: ".atk-value"
# - id: konami-official
# name: "Konami Official Database"
# domain: "yugioh-card.com"
# url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
# targets:
# card:
# root:
# type: css
# value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
# name:
# steps:
# - type: "css"
# value: "h1.product-title"
# - type: "xpath"
# value: "//h1[@itemprop='name']"
# attack:
# steps:
# - type: "css"
# value: ".atk-value"
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com"
url-patterns:
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
selectors:
url-pattern: "https://yugioh.fandom.com/wiki/%s"
targets:
set:
root:
type: css
value: "aside > .pi-title"
name:
type: string
extractors:
- steps:
- type: xpath
value: "//h2/text()"
regional-set:
root:
type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id:
steps:
- type: xpath
value: "//li/text()"
type: int
extractors:
- steps:
- type: xpath
value: "//li/text()"
transform:
- name: "regexReplace"
parameters: [
" *\\(.+\\)",
""
]
language:
steps:
- type: xpath
value: "//li/abbr"
- type: xpath
value: "//abbr/@title"
type: int
extractors:
- steps:
- type: xpath
value: "//li/abbr"
- type: xpath
value: "//abbr/@title"
region-key:
steps:
- type: xpath
value: "//li/abbr/text()"
type: int
extractors:
- steps:
- type: xpath
value: "//li/abbr/text()"
card-print:
multi: true
root:
type: css
value: "table > tbody > tr:has(> td)"
discriminator:
type: string
root:
type: css
value: ".wds-tab__content"
id:
type: int
extractors:
- steps:
- type: xpath
value: "./td/a[0]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/span/text()"
transform:
- name: "regexReplace"
parameters: [
" .+",
""
]
- name: "regexReplace"
parameters: [
".+-[A-Za-z]*0?",
""
]
validation:
pattern: "^.+-.+\\\\d.+$"
name:
type: int
extractors:
- steps:
- type: xpath
value: "./td[1]"
- type: xpath
value: "./text()"
transform:
- name: "regexReplace"
parameters: [
" ?\\(.+\\)",
""
]
- name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\".*"
regional-name:
type: int
nullable: true
extractors:
- steps:
- type: xpath
value: "./td[2]"
- type: xpath
value: "./text()"
transform:
- name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\"$"
rarity:
fallback:
default: "N/A"
type: int
extractors:
- steps:
- type: xpath
value: "./td/a[3]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/a[2]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/a[1]"
- type: xpath
value: "./text()"
validation:
pattern: "^.*(Common|Rare|Print).*$"
# card:
# name:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "./tbody/tr[3]/th/text()"
# description:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# type:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# attack:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# defense:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"