24 Commits

Author SHA1 Message Date
rak 7860819029 Add CI/CD 2025-07-15 19:14:54 +02:00
rak 304490b52e Correct YGO Fandom name transformation regex 2025-07-06 15:05:51 +02:00
rak ce5b87c34e Minor moddel adjustments 2025-07-01 12:54:56 +02:00
rak a9f6efc818 Minor config adjustment 2025-07-01 12:54:32 +02:00
rak 5930da7a4c Split Set/RegionalSet properly 2025-06-29 16:49:30 +02:00
rak 8a0777e557 Minor config amend
Regards Set ID
2025-06-29 14:56:00 +02:00
rak 2a79218a54 Add RegEx validation
Amend RegExReplace transformer
Amend transformations
2025-06-29 14:52:09 +02:00
rak ee4ce4fd65 Basic multi-method extraction 2025-06-29 13:21:18 +02:00
rak 108b4c4c19 Basic exception mapping 2025-06-26 17:17:10 +02:00
rak 8f934bc2b9 Basic CommonCrawl integration 2025-06-26 17:05:50 +02:00
rak a6ed98c36e Remove old config file 2025-06-26 13:04:14 +02:00
rak 052bdd6a52 Refactor packages
Remove ExtractionService
2025-06-26 12:48:19 +02:00
rak edc604231f Change project name 2025-06-26 12:46:01 +02:00
rak 2289489fe1 Amend transformation engine 2025-06-26 12:40:51 +02:00
rak e97f9bdd61 Implement XPath index access 2025-06-25 23:11:05 +02:00
rak 39c0ebfc7c Attempt to implement CardPrints 2025-06-25 21:06:34 +02:00
rak e0330e7baa Remove isMulti 2025-06-25 14:17:58 +02:00
rak 3808fe153e Amend naming schema 2025-06-25 14:17:35 +02:00
rak 0196308c10 Amend naming schema 2025-06-25 14:17:13 +02:00
rak 72af626e54 Amend naming schema 2025-06-25 14:10:04 +02:00
rak ce64f90a66 Refactor extraction logic
Add required models
2025-06-25 13:57:44 +02:00
rak 284723c978 Implement transformation application 2025-06-24 15:53:20 +02:00
rak 8cc9a64111 Add Transformation model 2025-06-24 15:23:12 +02:00
rak 9db3753105 Add Transformation model 2025-06-03 18:38:18 +02:00
80 changed files with 1710 additions and 264 deletions
+32
View File
@@ -0,0 +1,32 @@
# Manually triggered workflow (workflow_dispatch) that builds the project with Gradle
# and asks the Quarkus container-image extension to push the resulting image.
name: Create and Push Release
on:
workflow_dispatch:
# NOTE(review): none of the four variables below are referenced by the steps shown here —
# confirm the Gradle/Quarkus build actually reads them, otherwise they are dead configuration.
env:
AUTHENTIK_URL: https://auth.smoothbrain.win
REGISTRY_URL: gitea.smoothbrain.win
IMAGE_OWNER: rak
IMAGE_NAME: dex-scraper-java
jobs:
release:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup JDK
# NOTE(review): this action is referenced by the moving ref `main`, not a fixed tag or commit.
uses: https://gitea.smoothbrain.win/rak/setup-java@main
with:
distribution: 'corretto'
java-version: '21.0.6'
cache: 'gradle'
- name: Build & Push Image
# Credentials are taken from repository secrets; presumably used for the registry login — verify.
env:
QUARKUS_CONTAINER_IMAGE_USERNAME: ${{ secrets.CI_SERVICE_ACCOUNT }}
QUARKUS_CONTAINER_IMAGE_PASSWORD: ${{ secrets.CI_SERVICE_ACCOUNT_PASSWORD }}
run: |
./gradlew clean build \
-Dquarkus.container-image.push=true
+5 -1
View File
@@ -22,9 +22,13 @@ dependencies {
implementation("io.quarkus:quarkus-rest-client-kotlin-serialization") implementation("io.quarkus:quarkus-rest-client-kotlin-serialization")
implementation("io.quarkus:quarkus-rest-jackson") implementation("io.quarkus:quarkus-rest-jackson")
implementation("io.quarkus:quarkus-kotlin") implementation("io.quarkus:quarkus-kotlin")
implementation("io.quarkus:quarkus-smallrye-fault-tolerance")
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8") implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
implementation("org.jsoup:jsoup:1.20.1")
implementation("io.quarkus:quarkus-arc") implementation("io.quarkus:quarkus-arc")
implementation("org.jsoup:jsoup:1.20.1")
implementation("org.netpreserve.commons:webarchive-commons:2.0.1")
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.19.0")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.19.1")
testImplementation("io.quarkus:quarkus-junit5") testImplementation("io.quarkus:quarkus-junit5")
testImplementation("io.rest-assured:rest-assured") testImplementation("io.rest-assured:rest-assured")
} }
+1 -1
View File
@@ -10,4 +10,4 @@ pluginManagement {
id(quarkusPluginId) version quarkusPluginVersion id(quarkusPluginId) version quarkusPluginVersion
} }
} }
rootProject.name = "jsoup-scraper" rootProject.name = "dex-scraper"
-42
View File
@@ -1,42 +0,0 @@
# Scraper source definitions. Each source declares an id, a display name, a domain,
# URL patterns and, per target, the selector steps used to extract each field.
scraper:
sources:
# Source with CSS and XPath steps for the card name and a CSS step for the attack value.
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
url-patterns:
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
selectors:
card:
name:
steps:
- type: "css"
value: "h1.product-title"
- type: "xpath"
value: "//h1[@itemprop='name']"
attack:
steps:
- type: "css"
value: ".atk-value"
# Source describing a regional set: a root selector plus XPath steps per field.
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com"
url-patterns:
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
selectors:
regional-set:
root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id:
steps:
- type: "xpath"
value: "//li/text()"
language:
steps:
- type: "xpath"
value: "//li/abbr"
- type: "xpath"
value: "//abbr/@title"
region-key:
steps:
- type: "xpath"
value: "//li/abbr/text()"
@@ -1,12 +0,0 @@
package com.rak.config
import io.smallrye.config.WithName
/** Selector definitions for the individual fields of a card. */
interface CardDefinition {

    /** Selector bound to the `name` config key. */
    @WithName("name")
    fun nameSelector(): SelectorDefinition

    /** Selector bound to the `attack` config key. */
    @WithName("attack")
    fun attackSelector(): SelectorDefinition

    /** Selector bound to the `effect` config key. */
    @WithName("effect")
    fun effectSelector(): SelectorDefinition
}
-8
View File
@@ -1,8 +0,0 @@
package com.rak.config
import java.util.*
/** Optional selector groups a source may define, one per scrape target. */
interface Items {

    /** Card selectors; empty when the source does not define them. */
    fun card(): Optional<CardDefinition>

    /** Regional-set selectors; empty when the source does not define them. */
    fun regionalSet(): Optional<RegionalSetDefinition>
}
@@ -1,13 +0,0 @@
package com.rak.config
import com.rak.config.converter.AbstractModelDefinition
import io.smallrye.config.WithName
/** Selector definitions for the fields of a regional set. */
interface RegionalSetDefinition : AbstractModelDefinition {

    /** Selector bound to the `id` config key. */
    @WithName("id")
    fun idSelector(): SelectorDefinition

    /** Selector bound to the `language` config key. */
    @WithName("language")
    fun languageSelector(): SelectorDefinition

    /** Selector bound to the `region-key` config key. */
    @WithName("region-key")
    fun regionKeySelector(): SelectorDefinition
}
@@ -1,5 +0,0 @@
package com.rak.config
/** A field selector expressed as a set of extraction steps. */
interface SelectorDefinition {

    /** The steps that make up this selector. */
    fun steps(): Set<Step>
}
@@ -1,9 +0,0 @@
package com.rak.config.converter
import com.rak.config.Step
import io.smallrye.config.WithName
/** Base for model definitions that are anchored at a root selector. */
interface AbstractModelDefinition {

    /** Step bound to the `root` config key. */
    @WithName("root")
    fun rootSelector(): Step
}
@@ -0,0 +1,10 @@
package com.rak.config.converter
import com.rak.model.DiscriminatorDirection
import org.eclipse.microprofile.config.spi.Converter
/** Config converter that maps a raw string onto a [DiscriminatorDirection]. */
class DiscriminatorDirectionConverter : Converter<DiscriminatorDirection> {

    /** Delegates to [DiscriminatorDirection.from]; yields `null` when no constant matches. */
    override fun convert(value: String): DiscriminatorDirection? = DiscriminatorDirection.from(value)
}
@@ -0,0 +1,19 @@
package com.rak.config.converter
import org.eclipse.microprofile.config.spi.Converter
import java.util.regex.Pattern
import java.util.regex.PatternSyntaxException
/** Config converter that compiles a raw string into a [Pattern]. */
class PatternConverter : Converter<Pattern> {

    /**
     * Compiles [value] as a regular expression.
     *
     * @param value the raw configuration value
     * @return the compiled pattern
     * @throws IllegalArgumentException if [value] is blank or is not a valid regular expression;
     * for an invalid expression the original [PatternSyntaxException] is attached as the cause
     */
    override fun convert(value: String): Pattern {
        require(value.isNotBlank()) { "Pattern may not be empty" }
        return try {
            Pattern.compile(value)
        } catch (ex: PatternSyntaxException) {
            // Same exception type as the blank check; keep the cause so the syntax error position is not lost.
            throw IllegalArgumentException("'$value' is not a valid RegEx pattern", ex)
        }
    }
}
@@ -1,6 +1,6 @@
package com.rak.config.converter package com.rak.config.converter
import com.rak.model.scrape.selector.Selector import com.rak.model.Selector
import org.eclipse.microprofile.config.spi.Converter import org.eclipse.microprofile.config.spi.Converter
class TypeSelectorConverter : Converter<Selector> { class TypeSelectorConverter : Converter<Selector> {
@@ -0,0 +1,15 @@
package com.rak.config.model
import io.smallrye.config.WithDefault
import io.smallrye.config.WithName
import java.util.*
/** Settings shared by every scrape target configuration. */
interface AbstractScrapeTargetConfig {

    /** Optional extraction config bound to the `root` key. */
    @WithName("root")
    fun getRootConfig(): Optional<ExtractConfig>

    /** Value of the `multi` key; defaults to `false`. */
    @WithName("multi")
    @WithDefault("false")
    fun isMulti(): Boolean

    /** Optional discriminator config bound to the `discriminator` key. */
    @WithName("discriminator")
    fun getDiscriminator(): Optional<DiscriminatorConfig>
}
@@ -0,0 +1,3 @@
package com.rak.config.model
/** Marker interface for field-level scrape target configurations; declares no members. */
interface AbstractScrapeTargetFieldConfig
@@ -0,0 +1,14 @@
package com.rak.config.model
import io.smallrye.config.WithName
/** Field configurations for the card-print scrape target. */
interface CardPrintScrapeTargetConfig : AbstractScrapeTargetConfig {

    /** Field config bound to the `id` key. */
    @WithName("id")
    fun getIdConfig(): ScrapeTargetFieldConfig

    /** Field config bound to the `name` key. */
    @WithName("name")
    fun getNameConfig(): ScrapeTargetFieldConfig

    /** Field config bound to the `regional-name` key. */
    @WithName("regional-name")
    fun getRegionNameConfig(): ScrapeTargetFieldConfig

    /** Field config bound to the `rarity` key. */
    @WithName("rarity")
    fun getRarityConfig(): ScrapeTargetFieldConfig
}
@@ -0,0 +1,16 @@
package com.rak.config.model
import io.smallrye.config.WithName
/** Field configurations for the card scrape target. */
interface CardScrapeTargetConfig : AbstractScrapeTargetConfig {

    /** Field config bound to the `name` key. */
    @WithName("name")
    fun getEnglishNameConfig(): ScrapeTargetFieldConfig

    /** Field config bound to the `description` key. */
    @WithName("description")
    fun getDescriptionConfig(): ScrapeTargetFieldConfig

    /** Field config bound to the `type` key. */
    @WithName("type")
    fun getCardTypeConfig(): ScrapeTargetFieldConfig

    /** Field config bound to the `attack` key. */
    @WithName("attack")
    fun getAttackConfig(): ScrapeTargetFieldConfig

    /** Field config bound to the `defense` key. */
    @WithName("defense")
    fun getDefenseConfig(): ScrapeTargetFieldConfig
}
@@ -0,0 +1,9 @@
package com.rak.config.model
import com.rak.config.converter.DiscriminatorDirectionConverter
import com.rak.model.DiscriminatorDirection
import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
/** Discriminator configuration; adds no members beyond [ScrapeTargetFieldConfig]. */
interface DiscriminatorConfig : ScrapeTargetFieldConfig
@@ -1,13 +1,14 @@
package com.rak.config package com.rak.config.model
import com.rak.config.converter.TypeSelectorConverter import com.rak.config.converter.TypeSelectorConverter
import com.rak.model.scrape.selector.Selector import com.rak.model.Selector
import io.smallrye.config.WithConverter import io.smallrye.config.WithConverter
import io.smallrye.config.WithName import io.smallrye.config.WithName
interface Step { interface ExtractConfig {
@WithConverter(TypeSelectorConverter::class) @WithConverter(TypeSelectorConverter::class)
@WithName("type") @WithName("type")
fun selectorType(): Selector // e.g. css or xpath fun selectorType(): Selector
fun value(): String @WithName("value")
fun getQueryString(): String
} }
@@ -0,0 +1,11 @@
package com.rak.config.model
import io.smallrye.config.WithName
import java.util.Optional
/** One extraction method: a list of extraction steps plus optional transformations. */
interface ExtractorConfig {

    /** Extraction steps bound to the `steps` key. */
    @WithName("steps")
    fun getExtractionSteps(): List<ExtractConfig>

    /** Optional transformation steps bound to the `transform` key. */
    @WithName("transform")
    fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
}
@@ -0,0 +1,15 @@
package com.rak.config.model
import io.smallrye.config.WithDefault
import io.smallrye.config.WithName
import java.util.Optional
/** Fallback settings of a field configuration. */
interface FieldConfigFallback {

    /** Optional fallback extraction steps bound to the `steps` key. */
    @WithName("steps")
    fun getOptionalFallbackExtractionSteps(): Optional<List<ExtractConfig>>

    /** Optional transformation steps bound to the `transform` key. */
    @WithName("transform")
    fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>

    /** Value of the `default` key; `"N/A"` when the key is not configured. */
    @WithName("default")
    @WithDefault("N/A")
    fun getOptionalDefaultValue(): String
}
@@ -1,9 +1,9 @@
package com.rak.config package com.rak.config.model
import io.smallrye.config.WithName import io.smallrye.config.WithName
import java.util.* import java.util.*
interface SourceConfig { interface ProviderConfig {
@WithName("id") @WithName("id")
fun getId(): String fun getId(): String
@@ -11,9 +11,9 @@ interface SourceConfig {
fun getName(): String fun getName(): String
@WithName("domain") @WithName("domain")
fun getDomain(): String fun getDomain(): String
@WithName("url-patterns") @WithName("url-pattern")
fun getUrlPatterns(): Optional<MutableSet<String>> fun getUrlPattern(): String
@WithName("selectors") @WithName("targets")
fun getItems(): Items fun getTargets(): TargetsConfig
} }
@@ -0,0 +1,12 @@
package com.rak.config.model
import io.smallrye.config.WithName
/** Field configurations for the regional-set scrape target. */
interface RegionalSetScrapeTargetConfig : AbstractScrapeTargetConfig {

    /** Field config bound to the `id` key. */
    @WithName("id")
    fun getIdConfig(): ScrapeTargetFieldConfig

    /** Field config bound to the `language` key. */
    @WithName("language")
    fun getLanguageConfig(): ScrapeTargetFieldConfig

    /** Field config bound to the `region-key` key. */
    @WithName("region-key")
    fun getRegionKeyConfig(): ScrapeTargetFieldConfig
}
@@ -0,0 +1,21 @@
package com.rak.config.model
import io.smallrye.config.WithDefault
import io.smallrye.config.WithName
import java.util.*
/** Configuration of a single scraped field. */
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {

    /** Value of the `type` key. */
    @WithName("type")
    fun getType(): String

    /** Value of the `nullable` key; defaults to `false`. */
    @WithName("nullable")
    @WithDefault("false")
    fun isNullable(): Boolean

    /** Optional extraction config bound to the `root` key. */
    @WithName("root")
    fun getRootConfig(): Optional<ExtractConfig>

    /** Extraction methods bound to the `extractors` key. */
    @WithName("extractors")
    fun getExtractionMethods(): List<ExtractorConfig>

    /** Optional fallback settings bound to the `fallback` key. */
    @WithName("fallback")
    fun getFallbackConfiguration(): Optional<FieldConfigFallback>

    /** Optional validation settings bound to the `validation` key. */
    @WithName("validation")
    fun getOptionalValidation(): Optional<ValidationConfig>
}
@@ -0,0 +1,8 @@
package com.rak.config.model
import io.smallrye.config.WithName
/** Field configurations for the set scrape target. */
interface SetScrapeTargetConfig : AbstractScrapeTargetConfig {

    /** Field config bound to the `name` key. */
    @WithName("name")
    fun getNameConfig(): ScrapeTargetFieldConfig
}
@@ -1,12 +1,12 @@
package com.rak.config package com.rak.config.model
import io.smallrye.config.ConfigMapping import io.smallrye.config.ConfigMapping
import io.smallrye.config.WithName import io.smallrye.config.WithName
@ConfigMapping(prefix = "scraper") @ConfigMapping(prefix = "scraper")
interface SourcesConfiguration { interface SourcesConfig {
@WithName("sources") @WithName("sources")
fun getSources(): MutableList<SourceConfig> fun getSources(): MutableList<ProviderConfig>
} }
@@ -0,0 +1,15 @@
package com.rak.config.model
import io.smallrye.config.WithName
import java.util.*
/** The scrape targets a provider may configure; every target is optional. */
interface TargetsConfig {

    /** Target config bound to the `card` key. */
    @WithName("card")
    fun getCardConfig(): Optional<CardScrapeTargetConfig>

    /** Target config bound to the `set` key. */
    @WithName("set")
    fun getSetConfig(): Optional<SetScrapeTargetConfig>

    /** Target config bound to the `regional-set` key. */
    @WithName("regional-set")
    fun getRegionalSetConfig(): Optional<RegionalSetScrapeTargetConfig>

    /** Target config bound to the `card-print` key. */
    @WithName("card-print")
    fun getCardPrintConfiguration(): Optional<CardPrintScrapeTargetConfig>
}
@@ -0,0 +1,8 @@
package com.rak.config.model
import java.util.Optional
/** A configured transformation step: a transformation name and its optional parameters. */
interface TransformationStepConfig {

    /** Name of the transformation to apply. */
    fun name(): String

    /** Parameters handed to the transformation, if any were configured. */
    fun parameters(): Optional<MutableList<String>>
}
@@ -0,0 +1,12 @@
package com.rak.config.model
import com.rak.config.converter.PatternConverter
import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
import java.util.regex.Pattern
/** Validation settings of a field. */
interface ValidationConfig {

    /** Patterns bound to the `pattern` key, each converted by [PatternConverter]. */
    @WithName("pattern")
    @WithConverter(PatternConverter::class)
    fun getRegexPatterns(): MutableList<Pattern>
}
@@ -1,8 +1,11 @@
package com.rak.controller package com.rak.controller
import com.rak.config.SourcesConfiguration import com.rak.model.card.Card
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import com.rak.service.CommonCrawlService
import com.rak.service.ScrapeService import com.rak.service.ScrapeService
import com.rak.service.SourceService
import jakarta.ws.rs.Consumes import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET import jakarta.ws.rs.GET
import jakarta.ws.rs.Path import jakarta.ws.rs.Path
@@ -13,16 +16,11 @@ import org.jboss.resteasy.reactive.RestQuery
@Path("/api") @Path("/api")
class ExampleResource( class ScrapeController(
private val sourcesConfiguration: SourcesConfiguration,
private val scrapeService: ScrapeService, private val scrapeService: ScrapeService,
private val sourceService: SourceService private val commonCrawlService: CommonCrawlService
) { ) {
companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("text\\(\\)$")
}
@GET @GET
@Path("/{provider}/set") @Path("/{provider}/set")
@Produces(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON)
@@ -32,8 +30,24 @@ class ExampleResource(
provider: String, provider: String,
@RestQuery @RestQuery
setName: String setName: String
): Map<String, String> { ): CardSet {
return scrapeService.extractSet( return scrapeService.scrapeSet(
provider,
setName
)
}
@GET
@Path("/{provider}/regionalSet")
@Produces(MediaType.APPLICATION_JSON)
@Consumes(MediaType.APPLICATION_JSON)
fun scrapeRegionalSet(
@RestPath
provider: String,
@RestQuery
setName: String
): RegionalSet {
return scrapeService.scrapeRegionalSet(
provider, provider,
setName setName
) )
@@ -48,8 +62,8 @@ class ExampleResource(
provider: String, provider: String,
@RestQuery @RestQuery
cardName: String cardName: String
): Map<String, String> { ): Card? {
return scrapeService.extractCard( return scrapeService.scrapeCard(
provider, provider,
cardName cardName
) )
@@ -0,0 +1,17 @@
package com.rak.model
/** Direction of a discriminator, identified by its lowercase string [value]. */
enum class DiscriminatorDirection(val value: String) {
    ASC("asc"),
    DESC("desc");

    companion object {
        /**
         * Looks up the constant whose [DiscriminatorDirection.value] equals [value] exactly.
         *
         * @return the matching constant, or `null` when there is none
         */
        fun from(value: String): DiscriminatorDirection? =
            DiscriminatorDirection.entries.firstOrNull { candidate -> candidate.value == value }
    }
}
@@ -0,0 +1,5 @@
package com.rak.model
/** Error payload carrying a single [message]. */
data class ErrorResponse(val message: String)
@@ -1,4 +1,4 @@
package com.rak.model.scrape.selector package com.rak.model
enum class Selector { enum class Selector {
CSS, CSS,
@@ -0,0 +1,11 @@
package com.rak.model.card
/** Card attributes. */
enum class Attribute {
    WIND,
    WATER,
    FIRE,
    EARTH,
    LIGHT,
    DARK,
    DIVINE,
}
@@ -0,0 +1,8 @@
package com.rak.model.card
/** Base type for all cards; subclasses supply the four common properties. */
abstract class Card {

    /** Numeric identifier of the card. */
    abstract val id: Int

    /** Category of the card. */
    abstract val cardType: CardType

    /** Description text of the card. */
    abstract val description: String

    /** Name of the card. */
    abstract val name: String
}
@@ -0,0 +1,28 @@
package com.rak.model.card
/**
 * A single print of a card.
 *
 * @property id numeric identifier of the print
 * @property name name of the print
 * @property regionalName regional name, or `null` when none is available
 * @property rarity rarity label
 */
data class CardPrint(
    var id: Int,
    val name: String,
    val regionalName: String? = null,
    val rarity: String
) {

    companion object {
        /**
         * Builds a [CardPrint] from the keys `id`, `name`, `regionalName` and `rarity` of [map].
         *
         * A missing or empty `regionalName` becomes `null`.
         *
         * @throws IllegalStateException if `id`, `name` or `rarity` is missing; the message names the missing key
         * @throws NumberFormatException if `id` is present but is not a valid integer
         */
        fun fromMap(map: Map<String, String>): CardPrint {
            // An empty string is treated the same as an absent regional name.
            val regionalName = map["regionalName"]?.takeUnless { it == "" }

            return CardPrint(
                map["id"]?.toInt() ?: throw IllegalStateException("Parameter 'id' not found"),
                map["name"] ?: throw IllegalStateException("Parameter 'name' not found"),
                regionalName,
                map["rarity"] ?: throw IllegalStateException("Parameter 'rarity' not found"),
            )
        }
    }
}
@@ -0,0 +1,8 @@
package com.rak.model.card
/** Top-level card categories, including an [UNKNOWN] constant. */
enum class CardType {
    MONSTER,
    SPELL,
    TRAP,
    UNKNOWN,
}
@@ -0,0 +1,3 @@
package com.rak.model.card
/** Marker interface for card sub-type enums; declares no members. */
interface ICardType
@@ -0,0 +1,12 @@
package com.rak.model.card
/** The eight link arrow positions. */
enum class LinkArrow {
    TOP_LEFT,
    TOP,
    TOP_RIGHT,
    LEFT,
    RIGHT,
    BOTTOM_LEFT,
    BOTTOM,
    BOTTOM_RIGHT,
}
@@ -0,0 +1,20 @@
package com.rak.model.card
/**
 * A monster card.
 *
 * The nullable properties default to `null` and [isPendulum] defaults to `false`;
 * [subType], [monsterType], [attribute] and [linkArrows] must always be supplied.
 */
data class MonsterCard(
    // Properties shared with every Card.
    override val id: Int,
    override val cardType: CardType,
    override val description: String,
    override val name: String,

    // Optional monster-specific values.
    val monsterEffect: String? = null,
    val attack: Int? = null,
    val defense: Int? = null,
    val level: Int? = null,
    val isPendulum: Boolean = false,
    val pendulumScale: Int? = null,
    val pendulumEffect: String? = null,
    val linkValue: Int? = null,

    // Mandatory classification.
    val subType: MonsterCardType,
    val monsterType: MonsterType,
    val attribute: Attribute,
    val linkArrows: Set<LinkArrow>
) : Card()
@@ -0,0 +1,11 @@
package com.rak.model.card
/** Sub-types of monster cards. */
enum class MonsterCardType : ICardType {
    NORMAL,
    EFFECT,
    RITUAL,
    FUSION,
    SYNCHRO,
    XYZ,
    LINK,
}
@@ -0,0 +1,32 @@
package com.rak.model.card
// TODO string value for proper names
// TODO consider adding unknown type
/** Monster types, declared in roughly alphabetical order. */
enum class MonsterType {
    AQUA,
    BEAST,
    BEAST_WARRIOR,
    CREATOR_GOD,
    CYBERSE,
    DINOSAUR,
    DIVINE_BEAST,
    DRAGON,
    FAIRY,
    FIEND,
    FISH,
    INSECT,
    ILLUSION,
    MACHINE,
    PLANT,
    PSYCHIC,
    PYRO,
    REPTILE,
    ROCK,
    SEA_SERPENT,
    SPELLCASTER,
    THUNDER,
    WARRIOR,
    WINGED_BEAST,
    WYRM,
    ZOMBIE,
}
@@ -0,0 +1,9 @@
package com.rak.model.card
/** A spell card: the common [Card] properties plus its [subType]. */
data class SpellCard(
    // Properties shared with every Card.
    override val id: Int,
    override val cardType: CardType,
    override val description: String,
    override val name: String,

    // Spell-specific classification.
    val subType: SpellCardType
) : Card()
@@ -0,0 +1,11 @@
package com.rak.model.card
// TODO fix underscore for all types with string value
/** Sub-types of spell cards. */
enum class SpellCardType {
    NORMAL,
    CONTINUOUS,
    EQUIP,
    QUICK_PLAY,
    FIELD,
    RITUAL,
}
@@ -0,0 +1,9 @@
package com.rak.model.card
/** A trap card: the common [Card] properties plus its [subType]. */
data class TrapCard(
    // Properties shared with every Card.
    override val id: Int,
    override val cardType: CardType,
    override val description: String,
    override val name: String,

    // Trap-specific classification.
    val subType: TrapCardType
) : Card()
@@ -0,0 +1,7 @@
package com.rak.model.card
/** Sub-types of trap cards. */
enum class TrapCardType {
    NORMAL,
    CONTINUOUS,
    COUNTER,
}
@@ -0,0 +1,5 @@
package com.rak.model.cc
/** Error payload of an index query, carrying a single [message]. */
data class CCIndexErrorResponse(val message: String)
@@ -0,0 +1,22 @@
package com.rak.model.cc
import com.fasterxml.jackson.annotation.JsonProperty
import java.time.Instant
/**
 * One successful index query result.
 *
 * Three properties are read from JSON names that differ from the Kotlin property name:
 * `urlkey`, `mime-detected` and `filename`.
 */
data class CCIndexSuccessResponse(
    @JsonProperty("urlkey") val urlKey: String,
    val timestamp: Instant,
    val url: String,
    val mime: String,
    @JsonProperty("mime-detected") val mimeDetected: String,
    val status: String,
    val digest: String,
    val length: Int,
    val offset: Int,
    @JsonProperty("filename") val fileName: String,
    val languages: String,
    val encoding: String,
)
@@ -0,0 +1,9 @@
package com.rak.model.cc
/**
 * Crawl indices that can be queried, newest first.
 *
 * @property indexName the crawl identifier in `CC-MAIN-<year>-<week>` form; it always
 * matches the year and week encoded in the constant's own name
 */
enum class CCIndices(val indexName: String) {
    CC_2025_21("CC-MAIN-2025-21"),
    // Previously mapped to "CC-MAIN-2024-05", which contradicted the constant name and the descending order.
    CC_2025_05("CC-MAIN-2025-05"),
    CC_2024_46("CC-MAIN-2024-46"),
    CC_2024_26("CC-MAIN-2024-26"),
    CC_2023_50("CC-MAIN-2023-50");
}
@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Unchecked exception carrying only a [message]. */
class ElementNotFoundException(message: String) : RuntimeException(message)
@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Unchecked exception carrying only a [message]. */
class InvalidConfigurationException(message: String) : RuntimeException(message)
@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Unchecked exception carrying only a [message]. */
class NotImplementedException(message: String) : RuntimeException(message)
@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Unchecked exception carrying only a [message]. */
class TargetNotFoundException(message: String) : RuntimeException(message)
@@ -0,0 +1,7 @@
package com.rak.model.exception
import java.lang.RuntimeException
/** Unchecked exception carrying only a [message]. */
class UnsupportedQueryForProviderException(message: String) : RuntimeException(message)
@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Unchecked exception carrying only a [message]. */
class ValueValidationException(message: String) : RuntimeException(message)
@@ -0,0 +1,18 @@
package com.rak.model.exception.mapper
import com.rak.model.ErrorResponse
import com.rak.model.exception.NotImplementedException
import jakarta.ws.rs.core.Response
import jakarta.ws.rs.ext.ExceptionMapper
import jakarta.ws.rs.ext.Provider
/** Maps a [NotImplementedException] to an HTTP 405 response with an [ErrorResponse] entity. */
@Provider
class NotImplementedExceptionMapper : ExceptionMapper<NotImplementedException> {

    /** Uses the exception message as the response message, or a fixed default when it is `null`. */
    override fun toResponse(exception: NotImplementedException): Response {
        val body = ErrorResponse(exception.message ?: "Provider does not implement this method")
        return Response.status(405).entity(body).build()
    }
}
@@ -0,0 +1,19 @@
package com.rak.model.exception.mapper
import com.rak.model.ErrorResponse
import com.rak.model.exception.NotImplementedException
import com.rak.model.exception.TargetNotFoundException
import jakarta.ws.rs.core.Response
import jakarta.ws.rs.ext.ExceptionMapper
import jakarta.ws.rs.ext.Provider
/** Maps a [TargetNotFoundException] to an HTTP 404 response with an [ErrorResponse] entity. */
@Provider
class TargetNotFoundExceptionMapper : ExceptionMapper<TargetNotFoundException> {

    /** Uses the exception message as the response message, or a fixed default when it is `null`. */
    override fun toResponse(exception: TargetNotFoundException): Response {
        val body = ErrorResponse(exception.message ?: "Scrape target could not be found")
        return Response.status(404).entity(body).build()
    }
}
@@ -1,4 +0,0 @@
package com.rak.model.scrape
/** Empty base class for scrapers; declares no members. */
abstract class AbstractScraper
@@ -1,6 +0,0 @@
package com.rak.model.scrape
/** Empty [AbstractScraper] subclass; declares no members. */
class JsoupScraper : AbstractScraper()
@@ -1,5 +0,0 @@
package com.rak.model.scrape
/** A scrape job identified by the [url] to fetch. */
data class ScrapeJob(val url: String)
@@ -0,0 +1,15 @@
package com.rak.model.set
/**
 * A card set together with its regional variants.
 *
 * @property name name of the set
 * @property regionalSets regional sets belonging to this set
 */
data class CardSet(
    var name: String,
    val regionalSets: Set<RegionalSet>
) {

    companion object {
        /**
         * Builds a [CardSet] from the `name` key of [map] and the given [regionalSet] collection.
         *
         * @throws IllegalStateException if [map] has no `name` entry
         */
        fun fromMap(map: Map<String, String>, regionalSet: Set<RegionalSet>): CardSet {
            val setName = map["name"] ?: throw IllegalStateException("Parameter 'name' not found")
            return CardSet(name = setName, regionalSets = regionalSet)
        }
    }
}
@@ -0,0 +1,27 @@
package com.rak.model.set
import com.rak.model.card.CardPrint
data class RegionalSet(
val prefix: String,
val region: String,
val regionCode: String,
val cardPrints: Collection<CardPrint>,
val numberOfCards: Int
) {
companion object {
fun fromMap(map: Map<String, String>, cardPrints: Collection<CardPrint>): RegionalSet {
return RegionalSet(
map["prefix"] ?: throw IllegalStateException("Parameter 'prefix' not found"),
map["region"] ?: throw IllegalStateException("Parameter 'region' not found"),
map["regionCode"] ?: throw IllegalStateException("Parameter 'regionCode' not found"),
cardPrints,
cardPrints.size
)
}
}
}
@@ -0,0 +1,3 @@
package com.rak.model.transform
/** Common supertype of all transformation kinds; declares no members. */
interface AbstractTransformation
@@ -0,0 +1,6 @@
package com.rak.model.transform
/** A string transformation that additionally receives a list of parameters. */
@FunctionalInterface
fun interface ParameterizedTransformation : AbstractTransformation {

    /**
     * Transforms [input] using [parameters].
     *
     * @return the transformed string
     */
    fun apply(input: String, parameters: MutableList<String>): String
}
@@ -0,0 +1,6 @@
package com.rak.model.transform
/** A string transformation that takes no parameters. */
@FunctionalInterface
fun interface Transformation : AbstractTransformation {

    /**
     * Transforms [input].
     *
     * @return the transformed string
     */
    fun apply(input: String): String
}
@@ -0,0 +1,83 @@
package com.rak.model.transform
import com.rak.config.model.TransformationStepConfig
import java.util.concurrent.ConcurrentHashMap
/**
 * Registry of named string transformations.
 *
 * Holds two kinds of entries: plain [Transformation]s and [ParameterizedTransformation]s.
 * Four transformations are registered on construction: `trim`, `removeInnerQuotes`,
 * `replace` and `regexReplace`.
 */
class TransformationRegistry {

    private val transformations: ConcurrentHashMap<String, Transformation> = ConcurrentHashMap()
    private val parameterizedTransformation: ConcurrentHashMap<String, ParameterizedTransformation> =
        ConcurrentHashMap()

    init {
        register("trim") { it.trim() }
        register("removeInnerQuotes") { it.replace(LEADING_QUOTE, "").replace(TRAILING_QUOTE, "") }
        register("replace") { input, parameters ->
            require(parameters.size == 1 || parameters.size == 2) {
                "'replace' requires either 1 or 2 parameters"
            }
            // A missing second parameter means "replace with nothing"; the caller's list is left untouched.
            input.replace(parameters[0], parameters.getOrElse(1) { "" })
        }
        register("regexReplace") { input, params ->
            require(params.size == 1 || params.size == 2) {
                "'regexReplace' requires either 1 or 2 parameters"
            }
            // Same defaulting as 'replace', without mutating the parameter list.
            input.replace(params[0].toRegex(), params.getOrElse(1) { "" })
        }
    }

    /** Registers (or replaces) the parameterless [transformation] under [name]. */
    fun register(name: String, transformation: Transformation) {
        transformations[name] = transformation
    }

    /** Registers (or replaces) the parameterized [transformation] under [name]. */
    fun register(name: String, transformation: ParameterizedTransformation) {
        parameterizedTransformation[name] = transformation
    }

    /**
     * Resolves the transformation named by [transformationStep] and validates its parameters.
     *
     * Plain transformations are looked up before parameterized ones.
     *
     * @return the registered transformation
     * @throws IllegalArgumentException if the name is unknown, if a plain transformation is given
     * parameters, or if a parameterized transformation is given none (absent or empty list)
     */
    fun getTransformation(transformationStep: TransformationStepConfig): AbstractTransformation {
        val name = transformationStep.name()
        val parameters: List<String>? = transformationStep.parameters().orElse(null)
        val hasParameters = !parameters.isNullOrEmpty()

        transformations[name]?.let { plain ->
            require(!hasParameters) { "'$name' doesn't accept parameters" }
            return plain
        }
        parameterizedTransformation[name]?.let { parameterized ->
            // Absent parameters are rejected here so the later Optional.get() cannot fail.
            require(hasParameters) { "'$name' requires parameters" }
            return parameterized
        }
        throw IllegalArgumentException("Unknown transformation: '$name'")
    }

    /**
     * Applies [steps] to [input] in order, feeding each result into the next step.
     *
     * @return the fully transformed string; [input] itself when [steps] is empty
     * @throws IllegalArgumentException under the same conditions as [getTransformation]
     */
    fun applyTransformations(input: String, steps: List<TransformationStepConfig>): String =
        steps.fold(input) { current, step ->
            when (val transformation = getTransformation(step)) {
                is Transformation -> transformation.apply(current)
                // Hand out a copy so a transformation can never modify the configuration's own list.
                is ParameterizedTransformation ->
                    transformation.apply(current, step.parameters().get().toMutableList())
                else -> throw IllegalStateException("Invalid transformation type")
            }
        }

    private companion object {
        // Compiled once instead of on every 'removeInnerQuotes' call.
        private val LEADING_QUOTE = Regex("^\"")
        private val TRAILING_QUOTE = Regex("\"$")
    }
}
@@ -0,0 +1,92 @@
package com.rak.service
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.cc.CCIndices
import com.rak.service.client.CommonCrawlRestClient
import io.netty.buffer.ByteBufInputStream
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped
import org.archive.format.http.HttpResponseParser
import org.archive.io.warc.WARCReaderFactory
import org.eclipse.microprofile.rest.client.inject.RestClient
import org.jsoup.helper.DataUtil
import org.jsoup.nodes.Document
/**
 * Queries crawl indices through [CommonCrawlRestClient] and loads archived pages as jsoup documents.
 */
@ApplicationScoped
class CommonCrawlService(
    @RestClient
    private val commonCrawlRestClient: CommonCrawlRestClient
) {

    /**
     * Queries the single index [CCIndices.CC_2024_46] for [url].
     *
     * @return the index entry returned by the REST client
     */
    fun queryIndex(
        url: String
    ): CCIndexSuccessResponse {
        return commonCrawlRestClient.queryIndex(
            INDEX_QUERY_URL,
            url,
            CCIndices.CC_2024_46.indexName
        )
    }

    /**
     * Queries every index in [CCIndices] for [url].
     *
     * A [RuntimeException] from one index is logged as a warning and that index is skipped,
     * so the result only contains the successful responses, in [CCIndices] declaration order.
     */
    fun queryAllCrawlIndices(
        url: String
    ): List<CCIndexSuccessResponse> {
        return CCIndices.entries.mapNotNull { crawlName ->
            try {
                commonCrawlRestClient.queryIndex(
                    INDEX_QUERY_URL,
                    url,
                    crawlName.indexName
                )
            } catch (ex: RuntimeException) {
                Log.warn("Error occurred querying crawl '${crawlName.indexName}' for URL $url", ex)
                null
            }
        }
    }

    /**
     * Downloads the archive slice described by [ccIndexSuccessResponse] and parses the HTTP
     * response in its first record into a [Document].
     *
     * The body is always decoded as UTF-8.
     *
     * @param baseUri base URI handed to jsoup
     * @return the parsed document, or `null` when the archive contains no record
     */
    fun getDocument(
        ccIndexSuccessResponse: CCIndexSuccessResponse,
        baseUri: String
    ): Document? {
        val archiveStream: ByteBufInputStream = commonCrawlRestClient.getWarcArchive(
            DATA_URL,
            ccIndexSuccessResponse.fileName,
            ccIndexSuccessResponse.length,
            ccIndexSuccessResponse.offset
        )

        // Close the downloaded stream once the document has been built (or parsing failed).
        return archiveStream.use { stream ->
            // Name the reader after the archive that was actually fetched, not a fixed sample file.
            val reader = WARCReaderFactory.get(
                ccIndexSuccessResponse.fileName,
                stream,
                true
            )

            val records = reader.iterator()
            if (!records.hasNext()) {
                return@use null
            }

            // Only the first record is relevant: the request above fetched a single record's byte range.
            val http = HttpResponseParser().parse(records.next().buffered())
            DataUtil.load(
                http.buffered(),
                "UTF-8",
                baseUri
            )
        }
    }

    companion object {
        private const val INDEX_QUERY_URL: String = "http://index.commoncrawl.org"
        private const val DATA_URL: String = "http://data.commoncrawl.org"
    }
}
@@ -1,29 +0,0 @@
package com.rak.service
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
/**
* To use it via injection.
*
* ```kotlin
* @Inject
* @RestClient
* lateinit var myRemoteService: MyRemoteService
*
* fun doSomething() {
* val restClientExtensions = myRemoteService.getExtensionsById("io.quarkus:quarkus-rest-client")
* }
* ```
*/
@RegisterRestClient(baseUri = "https://stage.code.quarkus.io/api")
interface MyRemoteService {

    /** Issues `GET /extensions`, passing [id] as the `id` query parameter. */
    @GET
    @Path("/extensions")
    fun getExtensionsById(@QueryParam("id") id: String): Set<Extension>

    /** One extension entry of the response. */
    data class Extension(
        val id: String,
        val name: String,
        val shortName: String,
        val keywords: List<String>,
    )
}
@@ -1,89 +1,89 @@
package com.rak.service package com.rak.service
import com.rak.config.RegionalSetDefinition import com.rak.config.model.ProviderConfig
import com.rak.config.SourcesConfiguration import com.rak.model.card.Card
import com.rak.config.Step import com.rak.model.exception.NotImplementedException
import com.rak.util.XPathUtil import com.rak.model.exception.TargetNotFoundException
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import com.rak.service.extract.RegionalSetExtractionService
import com.rak.service.extract.CardSetExtractionService
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.Jsoup import org.jsoup.Jsoup
import org.jsoup.nodes.Document import org.jsoup.nodes.Document
import org.jsoup.nodes.Element import java.lang.Exception
@ApplicationScoped @ApplicationScoped
class ScrapeService( class ScrapeService(
private val sourceService: SourceService private val sourceService: SourceService,
private val cardSetExtractionService: CardSetExtractionService,
private val regionalSetExtractionService: RegionalSetExtractionService,
private val commonCrawlService: CommonCrawlService
) { ) {
companion object { fun ProviderConfig.buildUrl(targetName: String): String {
private val TEXT_NODE_MATCHER: Regex = Regex("text\\(\\)$") return this.getUrlPattern().format(targetName)
} }
private fun extractTextFromRootBySteps( fun scrapeSet(
root: Element,
steps: Set<Step>
): String? {
var currentElement: Element? = root.clone()
var result: String? = null
for (index in 0 until steps.size) {
val currentStep = steps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
if (index == steps.size - 1) {
result = XPathUtil.extractResult(currentElement, currentStep.value())
}
else {
currentElement = XPathUtil.getNextElement(currentElement, currentStep.value())
}
}
return result
}
fun extractSet(
provider: String, provider: String,
setName: String, setName: String,
): Map<String, String> { ): CardSet {
val source = val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
val path: String = normalizePath(setName)
val url = source.buildUrl(path)
val ccIndexResponses = commonCrawlService.queryAllCrawlIndices(url).sortedBy { it.timestamp }
var document: Document? = null
for (indexResponse in ccIndexResponses) {
document = commonCrawlService.getDocument(
indexResponse,
source.getDomain()
)
if (document != null) {
break
}
}
if (document == null) {
// Fallback to Jsoup directly
try {
document = Jsoup.connect(url).get()
} catch(ex: Exception) {
Log.warn("Error occurred during Jsoup query", ex)
throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'")
}
}
return cardSetExtractionService.extract(
document,
source,
source.getTargets().getSetConfig().get()
)
}
fun scrapeRegionalSet(
provider: String,
setName: String,
): RegionalSet {
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
val path: String = normalizePath(setName) val path: String = normalizePath(setName)
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get() val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
val regionalSetSelector = source.getItems().regionalSet().get()
val regionalSetRoot = document.selectFirst(regionalSetSelector.rootSelector().value())!! return regionalSetExtractionService.extract(document, source, source.getTargets().getRegionalSetConfig().get())
val setId: String? = extractTextFromRootBySteps(
regionalSetRoot,
regionalSetSelector.idSelector().steps()
)
val setLanguage: String? = extractTextFromRootBySteps(
regionalSetRoot,
regionalSetSelector.languageSelector().steps()
)
val setKey: String? = extractTextFromRootBySteps(
regionalSetRoot,
regionalSetSelector.regionKeySelector().steps()
)
return mapOf(
Pair("id", setId ?: "N/A"),
Pair("language", setLanguage ?: "N/A"),
Pair("key", setKey ?: "N/A"),
)
} }
fun scrapeCard(
fun extractCard(
provider: String, provider: String,
cardName: String, cardName: String,
): Map<String, String> { ): Card? {
val path: String = normalizePath(cardName) throw NotImplementedException("Not implemented")
return mapOf()
} }
private fun normalizePath(path: String): String = path private fun normalizePath(path: String): String = path
@@ -1,15 +1,86 @@
package com.rak.service package com.rak.service
import com.rak.config.SourceConfig import com.rak.config.model.CardScrapeTargetConfig
import com.rak.config.SourcesConfiguration import com.rak.config.model.RegionalSetScrapeTargetConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.SourcesConfig
import com.rak.model.exception.InvalidConfigurationException
import io.quarkus.runtime.Startup
import jakarta.annotation.PostConstruct
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
@Startup
@ApplicationScoped @ApplicationScoped
class SourceService( class SourceService(
val sourcesConfiguration: SourcesConfiguration val sourcesConfiguration: SourcesConfig
) { ) {
fun getSources(): Set<SourceConfig> = sourcesConfiguration.getSources().toSet() @PostConstruct
fun getSourceById(id: String): SourceConfig? = getSources().firstOrNull { it.getId() == id } fun init() {
sourcesConfiguration.getSources().forEach { validateSource(it) }
}
/**
 * Validates the optional regional-set and card target configurations of one provider.
 * A target that is not configured is skipped.
 */
private fun validateSource(providerConfig: ProviderConfig) {
    val regionalSetTarget = providerConfig.getTargets().getRegionalSetConfig()
    val cardTarget = providerConfig.getTargets().getCardConfig()

    regionalSetTarget.ifPresent { validateSetExtractConfig(it) }
    cardTarget.ifPresent { validateCardExtractConfig(it) }
}
/**
 * Checks that a regional-set target uses either one global extraction root or a dedicated
 * root on every field (language, id, region key), but never a mixture of both.
 *
 * @throws InvalidConfigurationException when a dedicated root exists next to a global root,
 * or when no global root exists and at least one field lacks its own root
 */
private fun validateSetExtractConfig(setExtractConfig: RegionalSetScrapeTargetConfig) {
    val fieldConfigs = listOf(
        setExtractConfig.getLanguageConfig(),
        setExtractConfig.getIdConfig(),
        setExtractConfig.getRegionKeyConfig(),
    )

    if (setExtractConfig.getRootConfig().isPresent) {
        // A global root excludes any per-field root.
        val dedicatedRootFound = fieldConfigs.any { it.getRootConfig().isPresent }
        if (dedicatedRootFound) {
            throw InvalidConfigurationException(
                "Dedicated extraction roots cannot be set when a global extraction root is configured"
            )
        }
        return
    }

    // Without a global root every field has to bring its own.
    val everyFieldHasRoot = fieldConfigs.all { it.getRootConfig().isPresent }
    if (!everyFieldHasRoot) {
        throw InvalidConfigurationException(
            "Dedicated extraction roots have to be set when a global extraction root is not configured"
        )
    }
}
/**
 * Checks that a card target uses either one global extraction root or a dedicated root on
 * every field (english name, description, card type, attack, defense), but never both.
 *
 * @throws InvalidConfigurationException when a dedicated root exists next to a global root,
 * or when no global root exists and at least one field lacks its own root
 */
private fun validateCardExtractConfig(cardScrapeTargetConfig: CardScrapeTargetConfig) {
    val fieldConfigs = listOf(
        cardScrapeTargetConfig.getEnglishNameConfig(),
        cardScrapeTargetConfig.getDescriptionConfig(),
        cardScrapeTargetConfig.getCardTypeConfig(),
        cardScrapeTargetConfig.getAttackConfig(),
        cardScrapeTargetConfig.getDefenseConfig(),
    )

    if (cardScrapeTargetConfig.getRootConfig().isPresent) {
        // A global root excludes any per-field root.
        val dedicatedRootFound = fieldConfigs.any { it.getRootConfig().isPresent }
        if (dedicatedRootFound) {
            throw InvalidConfigurationException(
                "Dedicated extraction roots cannot be set when a global extraction root is configured"
            )
        }
        return
    }

    // Without a global root every field has to bring its own.
    val everyFieldHasRoot = fieldConfigs.all { it.getRootConfig().isPresent }
    if (!everyFieldHasRoot) {
        throw InvalidConfigurationException(
            "Dedicated extraction roots have to be set when a global extraction root is not configured"
        )
    }
}
/** Returns all configured providers as a set. */
fun getSources(): Set<ProviderConfig> {
    return sourcesConfiguration.getSources().toSet()
}
/** Returns the provider whose id equals [id], or `null` when no configured provider matches. */
fun getSourceById(id: String): ProviderConfig? {
    return getSources().find { candidate -> candidate.getId() == id }
}
} }
@@ -0,0 +1,57 @@
package com.rak.service.client
import com.rak.util.NDJsonReader
import com.rak.model.cc.CCIndexSuccessResponse
import io.netty.buffer.ByteBufInputStream
import io.quarkus.rest.client.reactive.ClientQueryParam
import io.quarkus.rest.client.reactive.NotBody
import io.quarkus.rest.client.reactive.Url
import io.smallrye.faulttolerance.api.RateLimit
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.PathParam
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.faulttolerance.Bulkhead
import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam
import org.eclipse.microprofile.rest.client.annotation.RegisterProvider
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
import java.time.temporal.ChronoUnit
/**
 * REST client used to query a CommonCrawl index and to download a byte range of a WARC archive.
 *
 * Responses of [queryIndex] are decoded by the registered [NDJsonReader].
 * NOTE(review): `baseUri = "whatever"` looks like a placeholder, since every method takes the
 * real base URL through its `@Url` parameter; confirm against the Quarkus REST client docs.
 */
@RegisterRestClient(baseUri = "whatever")
@RegisterProvider(NDJsonReader::class)
interface CommonCrawlRestClient {
/**
 * Sends `GET {baseUrl}/{indexName}-index?url={queryUrl}&output=json`.
 *
 * The call is guarded by `@RateLimit(value = 1, minSpacing = 5)` and `@Bulkhead`.
 * NOTE(review): the time units of the rate limit are the annotation defaults; confirm them.
 *
 * @param baseUrl base URL of the index server
 * @param queryUrl value of the `url` query parameter
 * @param indexName index identifier inserted into the `/{index}-index` path
 * @return the decoded index response
 */
@GET
@ClientQueryParam(name = "output", value = ["json"])
@Path("/{index}-index")
@Consumes("text/x-ndjson")
@RateLimit(
value = 1,
minSpacing = 5
)
@Bulkhead
fun queryIndex(
@Url
baseUrl: String,
@QueryParam("url")
queryUrl: String,
@PathParam("index")
indexName: String
): CCIndexSuccessResponse
/**
 * Sends `GET {baseUrl}/{fileName}` with a `Range` header computed by
 * `com.rak.util.HttpUtil.computeHeader`.
 *
 * [fileLength] and [fileOffset] are marked `@NotBody`; they only feed the header computation.
 * `HttpUtil.computeHeader` reads them by position (method parameters 2 and 3), so the
 * parameter order of this method must not change.
 *
 * @param baseUrl base URL of the archive server
 * @param fileName path of the archive file
 * @param fileLength number of bytes to request
 * @param fileOffset offset of the first requested byte
 * @return the response body as a stream
 */
@GET
@Path("/{fileName}")
@ClientHeaderParam(name = "Range", value = ["{com.rak.util.HttpUtil.computeHeader}"])
fun getWarcArchive(
@Url
baseUrl: String,
@PathParam("fileName")
fileName: String,
@NotBody
fileLength: Int,
@NotBody
fileOffset: Int
): ByteBufInputStream
}
@@ -0,0 +1,263 @@
package com.rak.service.extract
import com.rak.config.model.*
import com.rak.model.Selector
import com.rak.model.exception.ElementNotFoundException
import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.exception.ValueValidationException
import com.rak.model.transform.TransformationRegistry
import com.rak.util.CssUtil
import com.rak.util.XPathUtil
import io.quarkus.logging.Log
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
import java.util.*
import kotlin.jvm.optionals.getOrElse
// find root element from global or node config
// get field target configs as list
// extract item from root element via field config
abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
private val transformationRegistry = TransformationRegistry()
/**
 * Maps each result key to the field configuration used to extract its value.
 * The keys become the keys of the maps produced by the extraction helpers of this class.
 */
abstract fun T.getItems(): Map<String, ScrapeTargetFieldConfig>
/**
 * Extracts a single result of type [E] from [element].
 *
 * @param element element (typically the document) to extract from
 * @param providerConfig configuration of the provider the element belongs to
 * @param extractionConfig target configuration describing what to extract
 */
abstract fun extract(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: T
): E
/**
 * Extracts a list of results of type [E] from [element].
 *
 * @param element element (typically the document) to extract from
 * @param providerConfig configuration of the provider the element belongs to
 * @param extractionConfig target configuration describing what to extract
 */
abstract fun extractMultiple(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: T
): List<E>
/**
 * Extracts grouped results: one inner list of [E] per group found in [element].
 * NOTE(review): presumably the groups are the discriminator roots of [extractionConfig]
 * (see [extractMultiWithDiscriminator]); confirm against the implementations.
 *
 * @param element element (typically the document) to extract from
 * @param providerConfig configuration of the provider the element belongs to
 * @param extractionConfig target configuration describing what to extract
 */
abstract fun extractNestedMultiples(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: T
): List<List<E>>
/**
 * Resolves the single root element extraction should start from.
 *
 * The global root configuration wins over the field-level one; the first element matched by
 * the chosen configuration is returned.
 *
 * @param element element the root selector is evaluated against
 * @param globalRootExtractConfig root configured for the whole target, if any
 * @param nodeRootExtractConfig root configured for a single field, if any
 * @throws ElementNotFoundException when the chosen root selector matches nothing
 * @throws InvalidConfigurationException when neither root configuration is present
 */
fun getRootElement(
    element: Element,
    globalRootExtractConfig: Optional<ExtractConfig>,
    nodeRootExtractConfig: Optional<ExtractConfig>
): Element {
    return getRootElements(
        element,
        globalRootExtractConfig,
        nodeRootExtractConfig
    ).firstOrNull()
        // The original threw with an empty message, which made the failure undiagnosable.
        ?: throw ElementNotFoundException("No element matched the configured extraction root")
}
/**
 * Resolves all root elements extraction should start from.
 *
 * The global root configuration wins over the field-level one; every element matched by the
 * chosen configuration is returned (possibly none).
 *
 * @param element element the root selector is evaluated against
 * @param globalRootExtractConfig root configured for the whole target, if any
 * @param nodeRootExtractConfig root configured for a single field, if any
 * @throws InvalidConfigurationException when neither root configuration is present
 */
fun getRootElements(
    element: Element,
    globalRootExtractConfig: Optional<ExtractConfig>,
    nodeRootExtractConfig: Optional<ExtractConfig>
): Elements {
    val rootExtractConfig: ExtractConfig = globalRootExtractConfig.getOrElse {
        nodeRootExtractConfig.orElseThrow {
            // The original threw with an empty message, which hid the cause of the failure.
            InvalidConfigurationException(
                "Neither a global nor a dedicated extraction root is configured"
            )
        }
    }

    return getElementsFromElementByExtractConfig(
        element,
        rootExtractConfig
    )
}
/**
 * Returns the first element of [element] matched by [step], or `null` when nothing matches.
 */
protected fun getElementFromDocumentByExtractConfig(
    element: Element,
    step: ExtractConfig,
): Element? = getElementsFromElementByExtractConfig(element, step).firstOrNull()
/**
 * Selects all elements below [element] that match the query of [step].
 * CSS steps are evaluated as CSS selectors; every other selector type is evaluated as XPath.
 */
protected fun getElementsFromElementByExtractConfig(
    element: Element,
    step: ExtractConfig,
): Elements = when (step.selectorType()) {
    Selector.CSS -> element.select(step.getQueryString())
    else -> element.selectXpath(step.getQueryString())
}
/**
 * Extracts one value per configured field and returns them keyed by field identifier.
 *
 * @param document element the roots of the fields are resolved against
 * @param extractionConfig target configuration providing the fields and the global root
 * @throws ElementNotFoundException when a field yields no value
 */
protected fun extractSingle(
    document: Element,
    extractionConfig: T
): Map<String, String> {
    return extractionConfig.getItems().mapValues { (identifier, fieldConfig) ->
        val fieldRoot = getRootElement(
            document,
            extractionConfig.getRootConfig(),
            fieldConfig.getRootConfig()
        )

        extractTextFromElementByTargetFieldConfig(fieldRoot, fieldConfig)
            ?: throw ElementNotFoundException("Could not find element for '$identifier'")
    }
}
/**
 * Extracts every configured field from every matching root element.
 *
 * The n-th map of the result collects the values extracted from the n-th root element of each
 * field, keyed by field identifier.
 *
 * @param element element the roots of the fields are resolved against
 * @param extractionConfig target configuration providing the fields and the global root
 * @return one map per root element position
 * @throws ElementNotFoundException when a non-nullable field yields no value; nullable fields
 * are stored as an empty string instead
 */
fun extractMulti(
    element: Element,
    extractionConfig: T
): List<Map<String, String>> {
    val resultList = mutableListOf<MutableMap<String, String>>()

    extractionConfig.getItems().forEach { (identifier, fieldConfig) ->
        val rootElements = getRootElements(
            element,
            extractionConfig.getRootConfig(),
            fieldConfig.getRootConfig()
        )

        rootElements.forEachIndexed { index, rootElement ->
            val extractedText = extractTextFromElementByTargetFieldConfig(
                rootElement,
                fieldConfig
            ) ?: if (fieldConfig.isNullable()) {
                ""
            } else {
                throw ElementNotFoundException("Could not find element for '$identifier'")
            }

            // Reuse the map of this position or append a new one. Positions are visited in
            // ascending order, so a missing map is always the next one to append.
            val mapToModify = resultList.getOrNull(index)
                ?: mutableMapOf<String, String>().also { resultList.add(it) }

            mapToModify[identifier] = extractedText
        }
    }

    return resultList
}
/**
 * Splits [element] into groups using the root of the configured discriminator and runs
 * [extractMulti] on every group.
 *
 * @param element element the discriminator root is evaluated against
 * @param extractionConfig target configuration; its discriminator has to be present
 * @return one [extractMulti] result per discriminator root element
 */
fun extractMultiWithDiscriminator(
    element: Element,
    extractionConfig: T
): List<List<Map<String, String>>> {
    val groupRoots = getRootElements(
        element,
        extractionConfig.getDiscriminator().get().getRootConfig(),
        Optional.empty<ExtractConfig>()
    )

    return groupRoots.map { groupRoot -> extractMulti(groupRoot, extractionConfig) }
}
/**
 * Runs the configured extraction methods in order and returns the value of the first method
 * that succeeds.
 *
 * For each method the steps are walked from a copy of [root]: every step but the last
 * navigates to the next element, the last step extracts the text. The text is validated and,
 * when transformation steps are configured, transformed. A method that fails with
 * [ElementNotFoundException], [IllegalStateException] or [ValueValidationException] is logged
 * at debug level and the next method is tried; any other runtime exception is rethrown.
 *
 * @param root element the extraction steps start from
 * @param extractionConfig field configuration with methods, validation and fallback
 * @return the extracted value, the configured fallback default when no method succeeded and a
 * fallback is present, or `null`
 */
private fun extractTextFromElementByTargetFieldConfig(
    root: Element,
    extractionConfig: ScrapeTargetFieldConfig
): String? {
    val extractionMethods = extractionConfig.getExtractionMethods()
    var result: String? = null

    for (extractionMethod in extractionMethods) {
        val extractionSteps = extractionMethod.getExtractionSteps()
        val transformationSteps = extractionMethod.getOptionalTransformationSteps()
        val lastStepIndex = extractionSteps.size - 1

        // Every method starts from its own copy of the root element.
        var currentElement: Element? = root.clone()
        var intermediateResult: String? = null

        try {
            for ((index, currentStep) in extractionSteps.withIndex()) {
                // A previous navigation step matched nothing: give up on this method.
                val stepRoot = currentElement
                    ?: throw IllegalStateException(
                        "Extraction step $index has no element to start from"
                    )

                if (index == lastStepIndex) {
                    intermediateResult = when (currentStep.selectorType()) {
                        Selector.CSS -> CssUtil.extractResult(stepRoot, currentStep.getQueryString())
                        Selector.XPATH -> XPathUtil.extractResult(stepRoot, currentStep.getQueryString())
                    }
                } else {
                    currentElement = when (currentStep.selectorType()) {
                        Selector.CSS -> CssUtil.getNextElement(stepRoot, currentStep.getQueryString())
                        Selector.XPATH -> XPathUtil.getNextElement(stepRoot, currentStep.getQueryString())
                    }
                }
            }

            val extracted = intermediateResult
                ?: throw ElementNotFoundException("Result could not be extracted")

            // Throws ValueValidationException, which is handled below like any failed method.
            validateValue(extracted, extractionConfig.getOptionalValidation())

            result = if (transformationSteps.isPresent) {
                transformationRegistry.applyTransformations(extracted, transformationSteps.get())
            } else {
                extracted
            }
            break
        } catch (ex: RuntimeException) {
            when (ex) {
                is ElementNotFoundException,
                is IllegalStateException,
                is ValueValidationException -> Log.debug(ex.message)
                else -> throw ex
            }
        }
    }

    if (result == null && extractionConfig.getFallbackConfiguration().isPresent) {
        result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
    }

    return result
}
/**
 * Validates [value] against every regular expression of [validationConfig].
 * An absent configuration means there is nothing to validate.
 *
 * @throws ValueValidationException when at least one pattern does not match the whole value
 */
private fun validateValue(value: String, validationConfig: Optional<ValidationConfig>) {
    if (!validationConfig.isPresent) return

    // Every pattern is evaluated, even after the first mismatch.
    val mismatchCount = validationConfig.get().getRegexPatterns()
        .count { pattern -> !value.matches(pattern.toRegex()) }

    if (mismatchCount > 0) {
        throw ValueValidationException("'$value' does not validate against RegEx(s)")
    }
}
}
@@ -0,0 +1,53 @@
package com.rak.service.extract
import com.rak.config.model.CardPrintScrapeTargetConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element
/**
 * Extraction service for [CardPrint]s. Only grouped extraction
 * ([extractNestedMultiples]) is implemented.
 */
@ApplicationScoped
class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrintScrapeTargetConfig>() {

    /** Field identifiers of a card print mapped to their field configurations. */
    override fun CardPrintScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> = mapOf(
        "id" to this.getIdConfig(),
        "name" to this.getNameConfig(),
        "regionalName" to this.getRegionNameConfig(),
        "rarity" to this.getRarityConfig(),
    )

    /** Not supported for card prints. */
    override fun extract(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: CardPrintScrapeTargetConfig
    ): CardPrint = throw NotImplementedException("Not implemented")

    /** Not supported for card prints. */
    override fun extractMultiple(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: CardPrintScrapeTargetConfig
    ): List<CardPrint> = throw NotImplementedException("Not implemented")

    /**
     * Extracts the card prints of every discriminator group and converts each extracted
     * field map into a [CardPrint].
     */
    override fun extractNestedMultiples(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: CardPrintScrapeTargetConfig
    ): List<List<CardPrint>> {
        val groupedFieldMaps: List<List<Map<String, String>>> =
            extractMultiWithDiscriminator(element, extractionConfig)

        return groupedFieldMaps.map { group ->
            group.map { fields -> CardPrint.fromMap(fields) }
        }
    }
}
@@ -0,0 +1,54 @@
package com.rak.service.extract
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.SetScrapeTargetConfig
import com.rak.model.exception.NotImplementedException
import com.rak.model.set.CardSet
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element
/**
 * Extraction service for [CardSet]s. Only single extraction ([extract]) is implemented.
 */
@ApplicationScoped
class CardSetExtractionService(
    private val regionalSetExtractionService: RegionalSetExtractionService
) : AbstractExtractionService<CardSet, SetScrapeTargetConfig>() {

    /** Field identifiers of a set mapped to their field configurations. */
    override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> = mapOf(
        "name" to this.getNameConfig(),
    )

    /**
     * Extracts the set fields from [element] and combines them with the regional sets
     * extracted from the same element using the provider's regional-set configuration.
     */
    override fun extract(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): CardSet {
        val setFields = extractSingle(element, extractionConfig)
        val regionalSets = regionalSetExtractionService.extractMultiple(
            element,
            providerConfig,
            providerConfig.getTargets().getRegionalSetConfig().get()
        ).toSet()

        return CardSet.fromMap(setFields, regionalSets)
    }

    /** Not supported for card sets. */
    override fun extractMultiple(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): List<CardSet> = throw NotImplementedException("Not implemented")

    /** Not supported for card sets. */
    override fun extractNestedMultiples(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): List<List<CardSet>> = throw NotImplementedException("Not implemented")
}
@@ -0,0 +1,62 @@
package com.rak.service.extract
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.RegionalSetScrapeTargetConfig
import com.rak.config.model.SourcesConfig
import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException
import com.rak.model.set.RegionalSet
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element
/**
 * Extraction service for [RegionalSet]s. Only list extraction ([extractMultiple]) is
 * implemented.
 */
@ApplicationScoped
class RegionalSetExtractionService(
    private val cardPrintExtractionService: CardPrintExtractionService,
    private val sourcesConfig: SourcesConfig
) : AbstractExtractionService<RegionalSet, RegionalSetScrapeTargetConfig>() {

    /** Field identifiers of a regional set mapped to their field configurations. */
    override fun RegionalSetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> = mapOf(
        "prefix" to this.getIdConfig(),
        "regionCode" to this.getRegionKeyConfig(),
        "region" to this.getLanguageConfig(),
    )

    /** Not supported for regional sets. */
    override fun extract(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: RegionalSetScrapeTargetConfig
    ): RegionalSet = throw NotImplementedException("Not implemented")

    /**
     * Extracts all regional sets from [element] and attaches to each the card print group
     * found at the same position; a regional set without a matching group gets no prints.
     */
    override fun extractMultiple(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: RegionalSetScrapeTargetConfig
    ): List<RegionalSet> {
        val regionalSetFieldMaps = extractMulti(element, extractionConfig)
        val printsPerSet: List<List<CardPrint>> = cardPrintExtractionService.extractNestedMultiples(
            element,
            providerConfig,
            providerConfig.getTargets().getCardPrintConfiguration().get()
        )

        // Regional sets and card print groups are paired purely by position.
        return regionalSetFieldMaps.mapIndexed { position, fields ->
            val prints = printsPerSet.getOrNull(position) ?: emptyList()
            RegionalSet.fromMap(fields, prints)
        }
    }

    /** Not supported for regional sets. */
    override fun extractNestedMultiples(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: RegionalSetScrapeTargetConfig
    ): List<List<RegionalSet>> = throw NotImplementedException("Not implemented")
}
+19
View File
@@ -0,0 +1,19 @@
package com.rak.util
import org.jsoup.nodes.Element
/** CSS-selector helpers for navigating and reading Jsoup elements. Not instantiable. */
class CssUtil private constructor() {

    companion object {

        /** Returns the first element below [element] matching the CSS [path], or `null`. */
        fun getNextElement(element: Element, path: String): Element? =
            element.select(path).firstOrNull()

        /** Returns the text of the first element below [root] matching the CSS [path], or `null`. */
        fun extractResult(root: Element, path: String): String? {
            val firstMatch = root.select(path).firstOrNull() ?: return null
            return firstMatch.text()
        }
    }
}
+23
View File
@@ -0,0 +1,23 @@
package com.rak.util
import io.quarkus.rest.client.reactive.ComputedParamContext
/** Helpers for computing HTTP header values of REST client calls. */
class HttpUtil {

    companion object {

        private const val HEADER_FORMAT_STRING: String = "bytes=%d-%d"

        /**
         * Computes the value of a `Range` header from the parameters of the invoked client
         * method.
         *
         * The method parameters at positions 2 and 3 are read as the length and the offset of
         * the requested byte range; the resulting range is `offset .. offset + length - 1`.
         *
         * @param context invocation context giving access to the method parameters
         * @return header value of the form `bytes=<first>-<last>`
         * @throws NumberFormatException when one of the two parameters is not an integer
         */
        @JvmStatic
        fun computeHeader(context: ComputedParamContext): String {
            val fileLengthContext = context.methodParameters().subList(2, 4)

            // Use Long arithmetic: offset + length may exceed Int.MAX_VALUE and would
            // otherwise wrap around to a negative end position.
            val fileLength = fileLengthContext[0].value().toString().toLong()
            val fileOffset = fileLengthContext[1].value().toString().toLong()
            val lastByte = fileOffset + fileLength - 1

            // Locale.ROOT keeps the digits ASCII regardless of the JVM default locale.
            return HEADER_FORMAT_STRING.format(java.util.Locale.ROOT, fileOffset, lastByte)
        }
    }
}
@@ -0,0 +1,44 @@
package com.rak.util
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.rak.model.cc.CCIndexSuccessResponse
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.core.MediaType
import jakarta.ws.rs.core.MultivaluedMap
import jakarta.ws.rs.ext.MessageBodyReader
import jakarta.ws.rs.ext.Provider
import java.io.BufferedReader
import java.io.InputStream
import java.io.InputStreamReader
import java.lang.reflect.Type
/**
 * Message body reader for `text/x-ndjson` responses that are mapped to
 * [CCIndexSuccessResponse].
 *
 * Only the first line of the body is parsed; any further NDJSON records are ignored.
 */
@Provider
@Consumes("text/x-ndjson") // Handles NDJSON content
class NDJsonReader : MessageBodyReader<CCIndexSuccessResponse> {

    private val objectMapper = jacksonObjectMapper().registerModule(JavaTimeModule())

    /** Accepts exactly [CCIndexSuccessResponse]; the media type is not inspected. */
    override fun isReadable(
        type: Class<*>?,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?
    ): Boolean {
        return type == CCIndexSuccessResponse::class.java
    }

    /**
     * Reads the first line of [entityStream] and deserializes it as JSON.
     * The stream is closed afterwards.
     *
     * @throws IllegalArgumentException when the body contains no line at all
     */
    override fun readFrom(
        type: Class<CCIndexSuccessResponse>,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?,
        httpHeaders: MultivaluedMap<String, String>?,
        entityStream: InputStream
    ): CCIndexSuccessResponse {
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        return entityStream.bufferedReader(Charsets.UTF_8).use { reader ->
            val firstLine = requireNotNull(reader.readLine()) {
                "NDJSON response body is empty"
            }
            objectMapper.readValue(firstLine, CCIndexSuccessResponse::class.java)
        }
    }
}
+25 -6
View File
@@ -3,12 +3,16 @@ package com.rak.util
import com.rak.model.XPathTarget import com.rak.model.XPathTarget
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements
import java.util.regex.Pattern
import kotlin.coroutines.CoroutineContext
class XPathUtil private constructor() { class XPathUtil private constructor() {
companion object { companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$") private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$") private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
private val INDEX_MATCHER: Regex = Regex("\\[(\\w)\\]")
private fun extractTextFromAttribute(root: Element, xpath: String): String? { private fun extractTextFromAttribute(root: Element, xpath: String): String? {
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath) val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
@@ -20,14 +24,29 @@ class XPathUtil private constructor() {
} }
} }
private fun extractTextFromNode(root: Element, xpath: String): String? { private fun selectXpath(element: Element, xpath: String): Elements {
return root return if (xpath.contains(INDEX_MATCHER)) {
.selectXpath(xpath, TextNode::class.java) val index = INDEX_MATCHER.find(xpath)?.groupValues[1]!!.toInt()
.firstOrNull()?.text() val xpathHalves = xpath.split("[$index]")
try {
Elements(element.selectXpath(xpathHalves[0])[index])
} catch (_: IndexOutOfBoundsException) {
Elements()
}
} else {
element.selectXpath(xpath)
}
} }
fun getNextElement(root: Element, path: String): Element? { private fun extractTextFromNode(root: Element, xpath: String): String? {
return root.selectXpath(path).firstOrNull() return root
.selectXpath(xpath.replace("/text()", ""))
.text()
}
fun getNextElement(element: Element, path: String): Element? {
return selectXpath(element, path).firstOrNull()
} }
fun extractResult(root: Element, path: String): String? { fun extractResult(root: Element, path: String): String? {
@@ -1 +1,2 @@
com.rak.config.converter.TypeSelectorConverter com.rak.config.converter.TypeSelectorConverter
com.rak.config.converter.DiscriminatorDirectionConverter
+179 -23
View File
@@ -1,48 +1,204 @@
quarkus: quarkus:
container-image:
registry: gitea.smoothbrain.win
group: rak
build: true
additional-tags: latest
http: http:
port: 8081 port: 8081
live-reload:
instrumentation: true
scraper: scraper:
sources: sources:
- id: konami-official # - id: konami-official
name: "Konami Official Database" # name: "Konami Official Database"
domain: "yugioh-card.com" # domain: "yugioh-card.com"
url-patterns: # url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$" # targets:
selectors: # card:
card: # root:
name: # type: css
steps: # value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
- type: "css" # name:
value: "h1.product-title" # steps:
- type: "xpath" # - type: "css"
value: "//h1[@itemprop='name']" # value: "h1.product-title"
attack: # - type: "xpath"
steps: # value: "//h1[@itemprop='name']"
- type: "css" # attack:
value: ".atk-value" # steps:
# - type: "css"
# value: ".atk-value"
- id: ygo-fandom - id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki" name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com" domain: "yugioh.fandom.com"
url-patterns: url-pattern: "https://yugioh.fandom.com/wiki/%s"
- "^https://yugioh\\.fandom\\.com/wiki/.*$" targets:
selectors: set:
root:
type: css
value: "aside > .pi-title"
name:
type: string
extractors:
- steps:
- type: xpath
value: "//h2/text()"
regional-set: regional-set:
root: root:
type: css type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id: id:
steps: type: int
extractors:
- steps:
- type: xpath - type: xpath
value: "//li/text()" value: "//li/text()"
transform:
- name: "regexReplace"
parameters: [
" *\\(.+\\)",
""
]
language: language:
steps: type: int
extractors:
- steps:
- type: xpath - type: xpath
value: "//li/abbr" value: "//li/abbr"
- type: xpath - type: xpath
value: "//abbr/@title" value: "//abbr/@title"
region-key: region-key:
steps: type: int
extractors:
- steps:
- type: xpath - type: xpath
value: "//li/abbr/text()" value: "//li/abbr/text()"
card-print:
multi: true
root:
type: css
value: "table > tbody > tr:has(> td)"
discriminator:
type: string
root:
type: css
value: ".wds-tab__content"
id:
type: int
extractors:
- steps:
- type: xpath
value: "./td/a[0]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/span/text()"
transform:
- name: "regexReplace"
parameters: [
" .+",
""
]
- name: "regexReplace"
parameters: [
".+-[A-Za-z]*0?",
""
]
validation:
pattern: "^.+-.+\\\\d.+$"
name:
type: int
extractors:
- steps:
- type: xpath
value: "./td[1]"
- type: xpath
value: "./text()"
transform:
- name: "regexReplace"
parameters: [
" ?\\(.+\\)",
""
]
- name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\".*"
regional-name:
type: int
nullable: true
extractors:
- steps:
- type: xpath
value: "./td[2]"
- type: xpath
value: "./text()"
transform:
- name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\"$"
rarity:
fallback:
default: "N/A"
type: int
extractors:
- steps:
- type: xpath
value: "./td/a[3]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/a[2]"
- type: xpath
value: "./text()"
- steps:
- type: xpath
value: "./td/a[1]"
- type: xpath
value: "./text()"
validation:
pattern: "^.*(Common|Rare|Print).*$"
# card:
# name:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "./tbody/tr[3]/th/text()"
# description:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# type:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# attack:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# defense:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"