Compare commits
16 Commits
69ff62c4c0
...
108b4c4c19
| Author | SHA1 | Date | |
|---|---|---|---|
| 108b4c4c19 | |||
| 8f934bc2b9 | |||
| a6ed98c36e | |||
| 052bdd6a52 | |||
| edc604231f | |||
| 2289489fe1 | |||
| e97f9bdd61 | |||
| 39c0ebfc7c | |||
| e0330e7baa | |||
| 3808fe153e | |||
| 0196308c10 | |||
| 72af626e54 | |||
| ce64f90a66 | |||
| 284723c978 | |||
| 8cc9a64111 | |||
| 9db3753105 |
@@ -23,8 +23,11 @@ dependencies {
|
|||||||
implementation("io.quarkus:quarkus-rest-jackson")
|
implementation("io.quarkus:quarkus-rest-jackson")
|
||||||
implementation("io.quarkus:quarkus-kotlin")
|
implementation("io.quarkus:quarkus-kotlin")
|
||||||
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
|
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
|
||||||
implementation("org.jsoup:jsoup:1.20.1")
|
|
||||||
implementation("io.quarkus:quarkus-arc")
|
implementation("io.quarkus:quarkus-arc")
|
||||||
|
implementation("org.jsoup:jsoup:1.20.1")
|
||||||
|
implementation("org.netpreserve.commons:webarchive-commons:2.0.1")
|
||||||
|
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.19.0")
|
||||||
|
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.19.1")
|
||||||
testImplementation("io.quarkus:quarkus-junit5")
|
testImplementation("io.quarkus:quarkus-junit5")
|
||||||
testImplementation("io.rest-assured:rest-assured")
|
testImplementation("io.rest-assured:rest-assured")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,4 +10,4 @@ pluginManagement {
|
|||||||
id(quarkusPluginId) version quarkusPluginVersion
|
id(quarkusPluginId) version quarkusPluginVersion
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
rootProject.name = "jsoup-scraper"
|
rootProject.name = "dex-scraper"
|
||||||
|
|||||||
42
sources.yml
42
sources.yml
@@ -1,42 +0,0 @@
|
|||||||
scraper:
|
|
||||||
sources:
|
|
||||||
- id: konami-official
|
|
||||||
name: "Konami Official Database"
|
|
||||||
domain: "yugioh-card.com"
|
|
||||||
url-patterns:
|
|
||||||
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
|
||||||
selectors:
|
|
||||||
card:
|
|
||||||
name:
|
|
||||||
steps:
|
|
||||||
- type: "css"
|
|
||||||
value: "h1.product-title"
|
|
||||||
- type: "xpath"
|
|
||||||
value: "//h1[@itemprop='name']"
|
|
||||||
attack:
|
|
||||||
steps:
|
|
||||||
- type: "css"
|
|
||||||
value: ".atk-value"
|
|
||||||
|
|
||||||
- id: ygo-fandom
|
|
||||||
name: "Yu-Gi-Oh Fandom Wiki"
|
|
||||||
domain: "yugioh.fandom.com"
|
|
||||||
url-patterns:
|
|
||||||
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
|
|
||||||
selectors:
|
|
||||||
regional-set:
|
|
||||||
root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
|
||||||
id:
|
|
||||||
steps:
|
|
||||||
- type: "xpath"
|
|
||||||
value: "//li/text()"
|
|
||||||
language:
|
|
||||||
steps:
|
|
||||||
- type: "xpath"
|
|
||||||
value: "//li/abbr"
|
|
||||||
- type: "xpath"
|
|
||||||
value: "//abbr/@title"
|
|
||||||
region-key:
|
|
||||||
steps:
|
|
||||||
- type: "xpath"
|
|
||||||
value: "//li/abbr/text()"
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
package com.rak.config
|
|
||||||
|
|
||||||
import io.smallrye.config.WithName
|
|
||||||
|
|
||||||
interface CardDefinition {
|
|
||||||
@WithName("name")
|
|
||||||
fun nameSelector(): SelectorDefinition
|
|
||||||
@WithName("attack")
|
|
||||||
fun attackSelector(): SelectorDefinition
|
|
||||||
@WithName("effect")
|
|
||||||
fun effectSelector(): SelectorDefinition
|
|
||||||
}
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
package com.rak.config
|
|
||||||
|
|
||||||
import java.util.*
|
|
||||||
|
|
||||||
interface Items {
|
|
||||||
fun card(): Optional<CardDefinition>
|
|
||||||
fun regionalSet(): Optional<RegionalSetDefinition>
|
|
||||||
}
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
package com.rak.config
|
|
||||||
|
|
||||||
import com.rak.config.converter.AbstractModelDefinition
|
|
||||||
import io.smallrye.config.WithName
|
|
||||||
|
|
||||||
interface RegionalSetDefinition : AbstractModelDefinition {
|
|
||||||
@WithName("id")
|
|
||||||
fun idSelector(): SelectorDefinition
|
|
||||||
@WithName("language")
|
|
||||||
fun languageSelector(): SelectorDefinition
|
|
||||||
@WithName("region-key")
|
|
||||||
fun regionKeySelector(): SelectorDefinition
|
|
||||||
}
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
package com.rak.config
|
|
||||||
|
|
||||||
interface SelectorDefinition {
|
|
||||||
fun steps(): Set<Step>
|
|
||||||
}
|
|
||||||
@@ -1,9 +0,0 @@
|
|||||||
package com.rak.config.converter
|
|
||||||
|
|
||||||
import com.rak.config.Step
|
|
||||||
import io.smallrye.config.WithName
|
|
||||||
|
|
||||||
interface AbstractModelDefinition {
|
|
||||||
@WithName("root")
|
|
||||||
fun rootSelector(): Step
|
|
||||||
}
|
|
||||||
@@ -0,0 +1,10 @@
|
|||||||
|
package com.rak.config.converter
|
||||||
|
|
||||||
|
import com.rak.model.DiscriminatorDirection
|
||||||
|
import org.eclipse.microprofile.config.spi.Converter
|
||||||
|
|
||||||
|
/**
 * MicroProfile Config [Converter] for [DiscriminatorDirection].
 *
 * Conversion is delegated to [DiscriminatorDirection.from].
 */
class DiscriminatorDirectionConverter : Converter<DiscriminatorDirection> {
    /** Returns whatever [DiscriminatorDirection.from] yields for [value] (may be `null`). */
    override fun convert(value: String): DiscriminatorDirection? = DiscriminatorDirection.from(value)
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
package com.rak.config.converter
|
package com.rak.config.converter
|
||||||
|
|
||||||
import com.rak.model.scrape.selector.Selector
|
import com.rak.model.Selector
|
||||||
import org.eclipse.microprofile.config.spi.Converter
|
import org.eclipse.microprofile.config.spi.Converter
|
||||||
|
|
||||||
class TypeSelectorConverter : Converter<Selector> {
|
class TypeSelectorConverter : Converter<Selector> {
|
||||||
|
|||||||
@@ -0,0 +1,15 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import io.smallrye.config.WithDefault
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
import java.util.*
|
||||||
|
|
||||||
|
/**
 * Base config mapping for a scrape target, exposing the `root`, `multi` and
 * `discriminator` keys.
 */
interface AbstractScrapeTargetConfig {
    /** Optional extraction config mapped from the `root` key. */
    @WithName("root")
    fun getRootConfig(): Optional<ExtractConfig>

    /** Value of the `multi` key; `false` when the key is not configured. */
    @WithName("multi")
    @WithDefault("false")
    fun isMulti(): Boolean

    /** Optional discriminator config mapped from the `discriminator` key. */
    @WithName("discriminator")
    fun getDiscriminator(): Optional<DiscriminatorConfig>
}
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
/** Marker supertype for scrape-target field configurations; declares no members. */
interface AbstractScrapeTargetFieldConfig
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
|
||||||
|
/**
 * Scrape-target config mapping for card prints.
 *
 * Adds the field keys `id`, `name`, `regional-name` and `rarity` to the keys
 * inherited from [AbstractScrapeTargetConfig].
 */
interface CardPrintScrapeTargetConfig : AbstractScrapeTargetConfig {
    /** Field config mapped from the `id` key. */
    @WithName("id")
    fun getIdConfig(): ScrapeTargetFieldConfig

    /** Field config mapped from the `name` key. */
    @WithName("name")
    fun getNameConfig(): ScrapeTargetFieldConfig

    /** Field config mapped from the `regional-name` key. */
    @WithName("regional-name")
    fun getRegionNameConfig(): ScrapeTargetFieldConfig

    /** Field config mapped from the `rarity` key. */
    @WithName("rarity")
    fun getRarityConfig(): ScrapeTargetFieldConfig
}
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
|
||||||
|
/**
 * Scrape-target config mapping for cards.
 *
 * Adds the field keys `name`, `description`, `type`, `attack` and `defense`
 * to the keys inherited from [AbstractScrapeTargetConfig].
 */
interface CardScrapeTargetConfig : AbstractScrapeTargetConfig {
    /** Field config mapped from the `name` key. */
    @WithName("name")
    fun getEnglishNameConfig(): ScrapeTargetFieldConfig

    /** Field config mapped from the `description` key. */
    @WithName("description")
    fun getDescriptionConfig(): ScrapeTargetFieldConfig

    /** Field config mapped from the `type` key. */
    @WithName("type")
    fun getCardTypeConfig(): ScrapeTargetFieldConfig

    /** Field config mapped from the `attack` key. */
    @WithName("attack")
    fun getAttackConfig(): ScrapeTargetFieldConfig

    /** Field config mapped from the `defense` key. */
    @WithName("defense")
    fun getDefenseConfig(): ScrapeTargetFieldConfig
}
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import com.rak.config.converter.DiscriminatorDirectionConverter
|
||||||
|
import com.rak.model.DiscriminatorDirection
|
||||||
|
import io.smallrye.config.WithConverter
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
|
||||||
|
/**
 * Config mapping for a discriminator. It currently adds no members of its own
 * on top of [ScrapeTargetFieldConfig].
 */
interface DiscriminatorConfig : ScrapeTargetFieldConfig
|
||||||
@@ -1,13 +1,14 @@
|
|||||||
package com.rak.config
|
package com.rak.config.model
|
||||||
|
|
||||||
import com.rak.config.converter.TypeSelectorConverter
|
import com.rak.config.converter.TypeSelectorConverter
|
||||||
import com.rak.model.scrape.selector.Selector
|
import com.rak.model.Selector
|
||||||
import io.smallrye.config.WithConverter
|
import io.smallrye.config.WithConverter
|
||||||
import io.smallrye.config.WithName
|
import io.smallrye.config.WithName
|
||||||
|
|
||||||
interface Step {
|
interface ExtractConfig {
|
||||||
@WithConverter(TypeSelectorConverter::class)
|
@WithConverter(TypeSelectorConverter::class)
|
||||||
@WithName("type")
|
@WithName("type")
|
||||||
fun selectorType(): Selector // e.g. css or xpath
|
fun selectorType(): Selector
|
||||||
fun value(): String
|
@WithName("value")
|
||||||
|
fun getQueryString(): String
|
||||||
}
|
}
|
||||||
15
src/main/kotlin/com/rak/config/model/FieldConfigFallback.kt
Normal file
15
src/main/kotlin/com/rak/config/model/FieldConfigFallback.kt
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import io.smallrye.config.WithDefault
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
import java.util.Optional
|
||||||
|
|
||||||
|
/**
 * Config mapping for the `fallback` section of a field: optional extraction
 * steps, optional transformation steps and a default value.
 */
interface FieldConfigFallback {
    /** Optional list of extraction steps mapped from the `steps` key. */
    @WithName("steps")
    fun getOptionalFallbackExtractionSteps(): Optional<List<ExtractConfig>>

    /** Optional list of transformation steps mapped from the `transform` key. */
    @WithName("transform")
    fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>

    /** Value of the `default` key; `"N/A"` when the key is not configured. */
    @WithName("default")
    @WithDefault("N/A")
    fun getOptionalDefaultValue(): String
}
|
||||||
@@ -1,9 +1,9 @@
|
|||||||
package com.rak.config
|
package com.rak.config.model
|
||||||
|
|
||||||
import io.smallrye.config.WithName
|
import io.smallrye.config.WithName
|
||||||
import java.util.*
|
import java.util.*
|
||||||
|
|
||||||
interface SourceConfig {
|
interface ProviderConfig {
|
||||||
|
|
||||||
@WithName("id")
|
@WithName("id")
|
||||||
fun getId(): String
|
fun getId(): String
|
||||||
@@ -11,9 +11,9 @@ interface SourceConfig {
|
|||||||
fun getName(): String
|
fun getName(): String
|
||||||
@WithName("domain")
|
@WithName("domain")
|
||||||
fun getDomain(): String
|
fun getDomain(): String
|
||||||
@WithName("url-patterns")
|
@WithName("url-pattern")
|
||||||
fun getUrlPatterns(): Optional<MutableSet<String>>
|
fun getUrlPattern(): String
|
||||||
@WithName("selectors")
|
@WithName("targets")
|
||||||
fun getItems(): Items
|
fun getTargets(): TargetsConfig
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
import java.util.*
|
||||||
|
|
||||||
|
/**
 * Config mapping for a single scraped field, exposing the keys `root`,
 * `steps`, `transform` and `fallback`.
 */
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
    /** Optional extraction config mapped from the `root` key. */
    @WithName("root")
    fun getRootConfig(): Optional<ExtractConfig>

    /** Extraction steps mapped from the `steps` key. */
    @WithName("steps")
    fun getExtractionSteps(): List<ExtractConfig>

    /** Optional transformation steps mapped from the `transform` key. */
    @WithName("transform")
    fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>

    /** Optional fallback section mapped from the `fallback` key. */
    @WithName("fallback")
    fun getFallbackConfiguration(): Optional<FieldConfigFallback>
}
|
||||||
@@ -0,0 +1,12 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
|
||||||
|
/**
 * Scrape-target config mapping for sets.
 *
 * Adds the field keys `id`, `language` and `region-key` to the keys inherited
 * from [AbstractScrapeTargetConfig].
 */
interface SetScrapeTargetConfig : AbstractScrapeTargetConfig {
    /** Field config mapped from the `id` key. */
    @WithName("id")
    fun getIdConfig(): ScrapeTargetFieldConfig

    /** Field config mapped from the `language` key. */
    @WithName("language")
    fun getLanguageConfig(): ScrapeTargetFieldConfig

    /** Field config mapped from the `region-key` key. */
    @WithName("region-key")
    fun getRegionKeyConfig(): ScrapeTargetFieldConfig
}
|
||||||
@@ -1,12 +1,12 @@
|
|||||||
package com.rak.config
|
package com.rak.config.model
|
||||||
|
|
||||||
import io.smallrye.config.ConfigMapping
|
import io.smallrye.config.ConfigMapping
|
||||||
import io.smallrye.config.WithName
|
import io.smallrye.config.WithName
|
||||||
|
|
||||||
@ConfigMapping(prefix = "scraper")
|
@ConfigMapping(prefix = "scraper")
|
||||||
interface SourcesConfiguration {
|
interface SourcesConfig {
|
||||||
|
|
||||||
@WithName("sources")
|
@WithName("sources")
|
||||||
fun getSources(): MutableList<SourceConfig>
|
fun getSources(): MutableList<ProviderConfig>
|
||||||
|
|
||||||
}
|
}
|
||||||
13
src/main/kotlin/com/rak/config/model/TargetsConfig.kt
Normal file
13
src/main/kotlin/com/rak/config/model/TargetsConfig.kt
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import io.smallrye.config.WithName
|
||||||
|
import java.util.*
|
||||||
|
|
||||||
|
/**
 * Config mapping that groups the optional scrape targets `card`, `set` and
 * `card-print`.
 */
interface TargetsConfig {
    /** Optional card target mapped from the `card` key. */
    @WithName("card")
    fun getCardConfig(): Optional<CardScrapeTargetConfig>

    /** Optional set target mapped from the `set` key. */
    @WithName("set")
    fun getSetConfig(): Optional<SetScrapeTargetConfig>

    /** Optional card-print target mapped from the `card-print` key. */
    @WithName("card-print")
    fun getCardPrintConfiguration(): Optional<CardPrintScrapeTargetConfig>
}
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
package com.rak.config.model
|
||||||
|
|
||||||
|
import java.util.Optional
|
||||||
|
|
||||||
|
/**
 * Config mapping for one transformation step: the transformation's name plus
 * an optional list of string parameters.
 */
interface TransformationStepConfig {
    /** Name of the transformation. */
    fun name(): String

    /** Parameters of the step, if any were configured. */
    fun parameters(): Optional<MutableList<String>>
}
|
||||||
@@ -1,8 +1,11 @@
|
|||||||
package com.rak.controller
|
package com.rak.controller
|
||||||
|
|
||||||
import com.rak.config.SourcesConfiguration
|
import com.rak.model.card.Card
|
||||||
|
import com.rak.model.cc.CCIndexSuccessResponse
|
||||||
|
import com.rak.model.set.CardSet
|
||||||
|
import com.rak.model.set.RegionalSet
|
||||||
|
import com.rak.service.CommonCrawlService
|
||||||
import com.rak.service.ScrapeService
|
import com.rak.service.ScrapeService
|
||||||
import com.rak.service.SourceService
|
|
||||||
import jakarta.ws.rs.Consumes
|
import jakarta.ws.rs.Consumes
|
||||||
import jakarta.ws.rs.GET
|
import jakarta.ws.rs.GET
|
||||||
import jakarta.ws.rs.Path
|
import jakarta.ws.rs.Path
|
||||||
@@ -13,16 +16,11 @@ import org.jboss.resteasy.reactive.RestQuery
|
|||||||
|
|
||||||
|
|
||||||
@Path("/api")
|
@Path("/api")
|
||||||
class ExampleResource(
|
class ScrapeController(
|
||||||
private val sourcesConfiguration: SourcesConfiguration,
|
|
||||||
private val scrapeService: ScrapeService,
|
private val scrapeService: ScrapeService,
|
||||||
private val sourceService: SourceService
|
private val commonCrawlService: CommonCrawlService
|
||||||
) {
|
) {
|
||||||
|
|
||||||
companion object {
|
|
||||||
private val TEXT_NODE_MATCHER: Regex = Regex("text\\(\\)$")
|
|
||||||
}
|
|
||||||
|
|
||||||
@GET
|
@GET
|
||||||
@Path("/{provider}/set")
|
@Path("/{provider}/set")
|
||||||
@Produces(MediaType.APPLICATION_JSON)
|
@Produces(MediaType.APPLICATION_JSON)
|
||||||
@@ -32,8 +30,24 @@ class ExampleResource(
|
|||||||
provider: String,
|
provider: String,
|
||||||
@RestQuery
|
@RestQuery
|
||||||
setName: String
|
setName: String
|
||||||
): Map<String, String> {
|
): CardSet {
|
||||||
return scrapeService.extractSet(
|
return scrapeService.scrapeSet(
|
||||||
|
provider,
|
||||||
|
setName
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
@GET
|
||||||
|
@Path("/{provider}/regionalSet")
|
||||||
|
@Produces(MediaType.APPLICATION_JSON)
|
||||||
|
@Consumes(MediaType.APPLICATION_JSON)
|
||||||
|
fun scrapeRegionalSet(
|
||||||
|
@RestPath
|
||||||
|
provider: String,
|
||||||
|
@RestQuery
|
||||||
|
setName: String
|
||||||
|
): RegionalSet {
|
||||||
|
return scrapeService.scrapeRegionalSet(
|
||||||
provider,
|
provider,
|
||||||
setName
|
setName
|
||||||
)
|
)
|
||||||
@@ -48,8 +62,8 @@ class ExampleResource(
|
|||||||
provider: String,
|
provider: String,
|
||||||
@RestQuery
|
@RestQuery
|
||||||
cardName: String
|
cardName: String
|
||||||
): Map<String, String> {
|
): Card? {
|
||||||
return scrapeService.extractCard(
|
return scrapeService.scrapeCard(
|
||||||
provider,
|
provider,
|
||||||
cardName
|
cardName
|
||||||
)
|
)
|
||||||
17
src/main/kotlin/com/rak/model/DiscriminatorDirection.kt
Normal file
17
src/main/kotlin/com/rak/model/DiscriminatorDirection.kt
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
package com.rak.model
|
||||||
|
|
||||||
|
/**
 * Direction of a discriminator, identified by its lowercase string [value].
 */
enum class DiscriminatorDirection(val value: String) {
    ASC("asc"),
    DESC("desc");

    companion object {
        /**
         * Looks up the entry whose [DiscriminatorDirection.value] is exactly
         * equal (case-sensitive) to [value].
         *
         * @return the matching entry, or `null` when there is none
         */
        fun from(value: String): DiscriminatorDirection? =
            DiscriminatorDirection.entries.firstOrNull { candidate -> candidate.value == value }
    }
}
|
||||||
5
src/main/kotlin/com/rak/model/ErrorResponse.kt
Normal file
5
src/main/kotlin/com/rak/model/ErrorResponse.kt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
package com.rak.model
|
||||||
|
|
||||||
|
/**
 * Response payload that carries a single error [message].
 */
data class ErrorResponse(val message: String)
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
package com.rak.model.scrape.selector
|
package com.rak.model
|
||||||
|
|
||||||
enum class Selector {
|
enum class Selector {
|
||||||
CSS,
|
CSS,
|
||||||
11
src/main/kotlin/com/rak/model/card/Attribute.kt
Normal file
11
src/main/kotlin/com/rak/model/card/Attribute.kt
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
/** Card attribute values. */
enum class Attribute {
    WIND,
    WATER,
    FIRE,
    EARTH,
    LIGHT,
    DARK,
    DIVINE
}
|
||||||
8
src/main/kotlin/com/rak/model/card/Card.kt
Normal file
8
src/main/kotlin/com/rak/model/card/Card.kt
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
/**
 * Base type of all card models. Subclasses provide the numeric [id], the
 * [cardType], the [description] text and the [name].
 */
abstract class Card {
    /** Numeric identifier of the card. */
    abstract val id: Int

    /** Top-level category of the card. */
    abstract val cardType: CardType

    /** Description text of the card. */
    abstract val description: String

    /** Name of the card. */
    abstract val name: String
}
|
||||||
23
src/main/kotlin/com/rak/model/card/CardPrint.kt
Normal file
23
src/main/kotlin/com/rak/model/card/CardPrint.kt
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
import com.rak.model.set.RegionalSet
|
||||||
|
|
||||||
|
/**
 * Value holder for a card print: its [id], [name], optional [regionalName]
 * and [rarity].
 */
data class CardPrint(
    val id: String,
    val name: String,
    val regionalName: String? = null,
    val rarity: String,
) {

    companion object {
        /**
         * Builds a [CardPrint] from a string map.
         *
         * The keys `id`, `name` and `rarity` are required; `regionalName` is
         * optional and becomes `null` when absent.
         *
         * @param map source values keyed by property name
         * @return the populated [CardPrint]
         * @throws IllegalStateException when a required key is missing; the
         *   message names the key that was actually looked up
         */
        fun fromMap(map: Map<String, String>): CardPrint = CardPrint(
            id = map.required("id"),
            name = map.required("name"),
            regionalName = map["regionalName"],
            rarity = map.required("rarity"),
        )

        /** Returns the value stored under [key] or fails with a message naming [key]. */
        private fun Map<String, String>.required(key: String): String =
            this[key] ?: throw IllegalStateException("Parameter '$key' not found")
    }
}
|
||||||
8
src/main/kotlin/com/rak/model/card/CardType.kt
Normal file
8
src/main/kotlin/com/rak/model/card/CardType.kt
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
/** Top-level card categories, including an [UNKNOWN] entry. */
enum class CardType {
    MONSTER,
    SPELL,
    TRAP,
    UNKNOWN,
}
|
||||||
3
src/main/kotlin/com/rak/model/card/ICardType.kt
Normal file
3
src/main/kotlin/com/rak/model/card/ICardType.kt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
/** Marker interface for card sub-type enums; declares no members. */
interface ICardType
|
||||||
12
src/main/kotlin/com/rak/model/card/LinkArrow.kt
Normal file
12
src/main/kotlin/com/rak/model/card/LinkArrow.kt
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
/** The eight possible link-arrow positions. */
enum class LinkArrow {
    TOP_LEFT,
    TOP,
    TOP_RIGHT,
    LEFT,
    RIGHT,
    BOTTOM_LEFT,
    BOTTOM,
    BOTTOM_RIGHT
}
|
||||||
20
src/main/kotlin/com/rak/model/card/MonsterCard.kt
Normal file
20
src/main/kotlin/com/rak/model/card/MonsterCard.kt
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
/**
 * [Card] model for monsters.
 *
 * Besides the properties inherited from [Card] it carries optional stats
 * (which default to `null`), the pendulum flag (defaults to `false`), and the
 * required [subType], [monsterType], [attribute] and [linkArrows].
 */
data class MonsterCard(
    override val id: Int,
    override val cardType: CardType,
    override val description: String,
    override val name: String,
    val monsterEffect: String? = null,
    val attack: Int? = null,
    val defense: Int? = null,
    val level: Int? = null,
    val isPendulum: Boolean = false,
    val pendulumScale: Int? = null,
    val pendulumEffect: String? = null,
    val linkValue: Int? = null,
    val subType: MonsterCardType,
    val monsterType: MonsterType,
    val attribute: Attribute,
    val linkArrows: Set<LinkArrow>,
) : Card()
|
||||||
11
src/main/kotlin/com/rak/model/card/MonsterCardType.kt
Normal file
11
src/main/kotlin/com/rak/model/card/MonsterCardType.kt
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
/** Monster card sub-types; implements the [ICardType] marker. */
enum class MonsterCardType : ICardType {
    NORMAL,
    EFFECT,
    RITUAL,
    FUSION,
    SYNCHRO,
    XYZ,
    LINK,
}
|
||||||
32
src/main/kotlin/com/rak/model/card/MonsterType.kt
Normal file
32
src/main/kotlin/com/rak/model/card/MonsterType.kt
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
// TODO string value for proper names
|
||||||
|
// TODO consider adding unknown type
|
||||||
|
/** Monster types, listed as plain enum constants without display names. */
enum class MonsterType {
    AQUA,
    BEAST,
    BEAST_WARRIOR,
    CREATOR_GOD,
    CYBERSE,
    DINOSAUR,
    DIVINE_BEAST,
    DRAGON,
    FAIRY,
    FIEND,
    FISH,
    INSECT,
    ILLUSION,
    MACHINE,
    PLANT,
    PSYCHIC,
    PYRO,
    REPTILE,
    ROCK,
    SEA_SERPENT,
    SPELLCASTER,
    THUNDER,
    WARRIOR,
    WINGED_BEAST,
    WYRM,
    ZOMBIE,
}
|
||||||
9
src/main/kotlin/com/rak/model/card/SpellCard.kt
Normal file
9
src/main/kotlin/com/rak/model/card/SpellCard.kt
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
/**
 * [Card] model for spells; adds the spell [subType] to the properties
 * inherited from [Card].
 */
data class SpellCard(
    override val id: Int,
    override val cardType: CardType,
    override val description: String,
    override val name: String,
    val subType: SpellCardType,
) : Card()
|
||||||
11
src/main/kotlin/com/rak/model/card/SpellCardType.kt
Normal file
11
src/main/kotlin/com/rak/model/card/SpellCardType.kt
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
// TODO fix underscore for all types with string value
|
||||||
|
/** Spell card sub-types. */
enum class SpellCardType {
    NORMAL,
    CONTINUOUS,
    EQUIP,
    QUICK_PLAY,
    FIELD,
    RITUAL,
}
|
||||||
9
src/main/kotlin/com/rak/model/card/TrapCard.kt
Normal file
9
src/main/kotlin/com/rak/model/card/TrapCard.kt
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
/**
 * [Card] model for traps; adds the trap [subType] to the properties inherited
 * from [Card].
 */
data class TrapCard(
    override val id: Int,
    override val cardType: CardType,
    override val description: String,
    override val name: String,
    val subType: TrapCardType,
) : Card()
|
||||||
7
src/main/kotlin/com/rak/model/card/TrapCardType.kt
Normal file
7
src/main/kotlin/com/rak/model/card/TrapCardType.kt
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
package com.rak.model.card
|
||||||
|
|
||||||
|
/** Trap card sub-types. */
enum class TrapCardType {
    NORMAL,
    CONTINUOUS,
    COUNTER,
}
|
||||||
5
src/main/kotlin/com/rak/model/cc/CCIndexErrorResponse.kt
Normal file
5
src/main/kotlin/com/rak/model/cc/CCIndexErrorResponse.kt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
package com.rak.model.cc
|
||||||
|
|
||||||
|
/**
 * Error payload consisting of a single [message].
 */
data class CCIndexErrorResponse(val message: String)
|
||||||
22
src/main/kotlin/com/rak/model/cc/CCIndexSuccessResponse.kt
Normal file
22
src/main/kotlin/com/rak/model/cc/CCIndexSuccessResponse.kt
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
package com.rak.model.cc
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.annotation.JsonProperty
|
||||||
|
import java.time.Instant
|
||||||
|
|
||||||
|
/**
 * One successful index record.
 *
 * The JSON names `urlkey`, `mime-detected` and `filename` are bound explicitly
 * through [JsonProperty]; all other properties carry no explicit JSON name.
 * NOTE(review): presumably a Common Crawl index record, judging by the
 * package name — confirm against the calling service.
 */
data class CCIndexSuccessResponse(
    @JsonProperty("urlkey")
    val urlKey: String,
    val timestamp: Instant,
    val url: String,
    val mime: String,
    @JsonProperty("mime-detected")
    val mimeDetected: String,
    val status: String,
    val digest: String,
    val length: Int,
    val offset: Int,
    @JsonProperty("filename")
    val fileName: String,
    val languages: String,
    val encoding: String,
)
|
||||||
9
src/main/kotlin/com/rak/model/cc/CCIndices.kt
Normal file
9
src/main/kotlin/com/rak/model/cc/CCIndices.kt
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
package com.rak.model.cc
|
||||||
|
|
||||||
|
/**
 * Known crawl indices, each carrying its index identifier as [indexName].
 *
 * Entries are listed newest first, and each constant name mirrors the
 * year/week suffix of its [indexName].
 */
enum class CCIndices(val indexName: String) {
    CC_2025_21("CC-MAIN-2025-21"),

    // Was "CC-MAIN-2024-05": that disagreed with the constant name and broke
    // the newest-first order of the entries.
    CC_2025_05("CC-MAIN-2025-05"),
    CC_2024_46("CC-MAIN-2024-46"),
    CC_2024_26("CC-MAIN-2024-26"),
    CC_2023_50("CC-MAIN-2023-50"),
}
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
package com.rak.model.exception
|
||||||
|
|
||||||
|
/**
 * Unchecked exception carrying a [message].
 * NOTE(review): presumably raised when an element cannot be located — the throw sites are not visible here.
 */
class ElementNotFoundException(message: String) : RuntimeException(message)
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
package com.rak.model.exception
|
||||||
|
|
||||||
|
/**
 * Unchecked exception carrying a [message].
 * NOTE(review): presumably signals an invalid scraper configuration — the throw sites are not visible here.
 */
class InvalidConfigurationException(message: String) : RuntimeException(message)
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
package com.rak.model.exception
|
||||||
|
|
||||||
|
/** Unchecked exception carrying a [message]; `NotImplementedExceptionMapper` turns it into an HTTP 405 response. */
class NotImplementedException(message: String) : RuntimeException(message)
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
package com.rak.model.exception
|
||||||
|
|
||||||
|
/** Unchecked exception carrying a [message]; `TargetNotFoundExceptionMapper` turns it into an HTTP 404 response. */
class TargetNotFoundException(message: String) : RuntimeException(message)
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
package com.rak.model.exception
|
||||||
|
|
||||||
|
import java.lang.RuntimeException
|
||||||
|
|
||||||
|
/**
 * Unchecked exception carrying a [message].
 * NOTE(review): presumably raised when a provider does not support the requested query — throw sites are not visible here.
 */
class UnsupportedQueryForProviderException(message: String) : RuntimeException(message)
|
||||||
@@ -0,0 +1,18 @@
|
|||||||
|
package com.rak.model.exception.mapper
|
||||||
|
|
||||||
|
import com.rak.model.ErrorResponse
|
||||||
|
import com.rak.model.exception.NotImplementedException
|
||||||
|
import jakarta.ws.rs.core.Response
|
||||||
|
import jakarta.ws.rs.ext.ExceptionMapper
|
||||||
|
import jakarta.ws.rs.ext.Provider
|
||||||
|
|
||||||
|
/**
 * JAX-RS mapper that converts a [NotImplementedException] into an HTTP 405
 * response whose entity is an [ErrorResponse].
 */
@Provider
class NotImplementedExceptionMapper : ExceptionMapper<NotImplementedException> {
    /**
     * Builds the 405 response; when the exception has no message a generic
     * text is used instead.
     */
    override fun toResponse(exception: NotImplementedException): Response {
        val payload = ErrorResponse(exception.message ?: "Provider does not implement this method")
        return Response.status(405).entity(payload).build()
    }
}
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
package com.rak.model.exception.mapper
|
||||||
|
|
||||||
|
import com.rak.model.ErrorResponse
|
||||||
|
import com.rak.model.exception.NotImplementedException
|
||||||
|
import com.rak.model.exception.TargetNotFoundException
|
||||||
|
import jakarta.ws.rs.core.Response
|
||||||
|
import jakarta.ws.rs.ext.ExceptionMapper
|
||||||
|
import jakarta.ws.rs.ext.Provider
|
||||||
|
|
||||||
|
/**
 * JAX-RS mapper that converts a [TargetNotFoundException] into an HTTP 404
 * response whose entity is an [ErrorResponse].
 */
@Provider
class TargetNotFoundExceptionMapper : ExceptionMapper<TargetNotFoundException> {
    /**
     * Builds the 404 response; when the exception has no message a generic
     * text is used instead.
     */
    override fun toResponse(exception: TargetNotFoundException): Response {
        val payload = ErrorResponse(exception.message ?: "Scrape target could not be found")
        return Response.status(404).entity(payload).build()
    }
}
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
package com.rak.model.scrape
|
|
||||||
|
|
||||||
abstract class AbstractScraper{
|
|
||||||
}
|
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
package com.rak.model.scrape
|
|
||||||
|
|
||||||
class JsoupScraper : AbstractScraper() {
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
package com.rak.model.scrape
|
|
||||||
|
|
||||||
data class ScrapeJob(
|
|
||||||
val url: String,
|
|
||||||
)
|
|
||||||
12
src/main/kotlin/com/rak/model/set/CardSet.kt
Normal file
12
src/main/kotlin/com/rak/model/set/CardSet.kt
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
package com.rak.model.set
|
||||||
|
|
||||||
|
import kotlin.collections.Set
|
||||||
|
|
||||||
|
/**
 * A card set: its (mutable) [name] and the [regionalSets] that belong to it.
 */
data class CardSet(
    var name: String,
    val regionalSets: Set<RegionalSet>,
) {
    // Intentionally empty; kept so `CardSet.Companion` stays available.
    companion object
}
|
||||||
49
src/main/kotlin/com/rak/model/set/RegionalSet.kt
Normal file
49
src/main/kotlin/com/rak/model/set/RegionalSet.kt
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
package com.rak.model.set
|
||||||
|
|
||||||
|
import com.rak.model.card.CardPrint
|
||||||
|
|
||||||
|
/**
 * Regional variant of a set: its [prefix], [region], [regionCode], the
 * [cardPrints] it contains and the [numberOfCards].
 */
data class RegionalSet(
    val prefix: String,
    val region: String,
    val regionCode: String,
    val cardPrints: Collection<CardPrint>,
    val numberOfCards: Int,
) {

    companion object {

        /**
         * Builds a [RegionalSet] from a string map plus its card prints.
         *
         * @param map must contain the keys `prefix`, `region` and `regionCode`
         * @param cardPrints prints of the set; their count becomes [numberOfCards]
         * @throws IllegalStateException when one of the required keys is missing
         */
        fun fromMap(map: Map<String, String>, cardPrints: Collection<CardPrint>): RegionalSet =
            RegionalSet(
                prefix = map["prefix"] ?: throw IllegalStateException("Parameter 'prefix' not found"),
                region = map["region"] ?: throw IllegalStateException("Parameter 'region' not found"),
                regionCode = map["regionCode"] ?: throw IllegalStateException("Parameter 'regionCode' not found"),
                cardPrints = cardPrints,
                numberOfCards = cardPrints.size,
            )

        /**
         * Combines three parallel lists into regional sets: element `i` of each
         * list supplies prefix, region and region code of the `i`-th set.
         *
         * The resulting sets have no card prints and a [numberOfCards] of `-1`.
         *
         * @throws IllegalArgumentException unless all three lists have the same size
         */
        fun flattenFromMemberLists(
            idList: List<String>,
            languageList: List<String>,
            regionKeyAliasList: List<String>,
        ): MutableSet<RegionalSet> {
            // Must be `||`: with `&&` a single mismatched list slipped through and
            // then either raised IndexOutOfBoundsException or was silently truncated.
            if (idList.size != languageList.size || idList.size != regionKeyAliasList.size) {
                throw IllegalArgumentException("Lists have to be the same size")
            }

            return idList.indices.mapTo(mutableSetOf()) { index ->
                RegionalSet(
                    prefix = idList[index],
                    region = languageList[index],
                    regionCode = regionKeyAliasList[index],
                    cardPrints = listOf(),
                    numberOfCards = -1,
                )
            }
        }
    }
}
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
package com.rak.model.transform
|
||||||
|
|
||||||
|
/** Common supertype of `Transformation` and `ParameterizedTransformation`; declares no members. */
interface AbstractTransformation
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
package com.rak.model.transform
|
||||||
|
|
||||||
|
/**
 * String transformation that takes additional string parameters.
 */
@FunctionalInterface
fun interface ParameterizedTransformation : AbstractTransformation {
    /**
     * Transforms [input] using [parameters].
     *
     * @return the transformed string
     */
    fun apply(input: String, parameters: MutableList<String>): String
}
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
package com.rak.model.transform
|
||||||
|
|
||||||
|
/**
 * String transformation without parameters.
 */
@FunctionalInterface
fun interface Transformation : AbstractTransformation {
    /**
     * Transforms [input].
     *
     * @return the transformed string
     */
    fun apply(input: String): String
}
|
||||||
@@ -0,0 +1,80 @@
|
|||||||
|
package com.rak.model.transform
|
||||||
|
|
||||||
|
import com.rak.config.model.TransformationStepConfig
|
||||||
|
import java.util.concurrent.ConcurrentHashMap
|
||||||
|
|
||||||
|
/**
 * Registry of named string transformations.
 *
 * Two kinds are kept in separate maps: plain [Transformation]s (input only) and
 * [ParameterizedTransformation]s (input plus string parameters). On construction the registry
 * registers `trim`, `removeInnerQuotes`, `replace` and `regexReplace`.
 */
class TransformationRegistry {

    private val transformations: ConcurrentHashMap<String, Transformation> = ConcurrentHashMap()
    private val parameterizedTransformation: ConcurrentHashMap<String, ParameterizedTransformation> =
        ConcurrentHashMap()

    init {
        register("trim") { it.trim() }
        register("removeInnerQuotes") { it.replace("\"", "") }
        register("replace") { input, parameters ->
            require(parameters.size == 1 || parameters.size == 2) {
                "'replace' requires either 1 or 2 parameters"
            }
            // With a single parameter the match is replaced by the empty string. The default is
            // computed locally instead of being appended to the caller's list, so the
            // configuration-owned parameter list is never modified.
            val replacement = parameters.getOrElse(1) { "" }
            input.replace(parameters[0], replacement)
        }
        register("regexReplace") { input, params ->
            require(params.size == 2) {
                "'regexReplace' requires exactly 2 parameters"
            }
            input.replace(params[0].toRegex(), params[1])
        }
    }

    /** Registers (or replaces) the parameterless transformation stored under [name]. */
    fun register(name: String, transformation: Transformation) {
        transformations[name] = transformation
    }

    /** Registers (or replaces) the parameterized transformation stored under [name]. */
    fun register(name: String, transformation: ParameterizedTransformation) {
        parameterizedTransformation[name] = transformation
    }

    /**
     * Resolves the transformation named by [transformationStep] and validates its parameters.
     *
     * Plain transformations are looked up first, then parameterized ones.
     *
     * @return the registered [Transformation] or [ParameterizedTransformation]
     * @throws IllegalArgumentException if the name is unknown, if parameters are supplied to a
     * plain transformation, or if a parameterized transformation has absent or empty parameters
     */
    fun getTransformation(transformationStep: TransformationStepConfig): AbstractTransformation {
        val name = transformationStep.name()
        val parameters = transformationStep.parameters()
        val hasParameters = parameters.isPresent && parameters.get().isNotEmpty()

        val plain = transformations[name]
        if (plain != null) {
            require(!hasParameters) { "'$name' doesn't accept parameters" }
            return plain
        }

        val parameterized = parameterizedTransformation[name]
        if (parameterized != null) {
            // An absent parameter list is rejected here as well; otherwise it would only fail
            // later with a NoSuchElementException when the parameters are read.
            require(hasParameters) { "'$name' requires parameters" }
            return parameterized
        }

        throw IllegalArgumentException("Unknown transformation: '$name'")
    }

    /**
     * Applies [steps] to [input] in order, feeding each step's output into the next one.
     *
     * @return the text after the last step, or [input] unchanged when [steps] is empty
     * @throws IllegalArgumentException if a step fails the checks of [getTransformation]
     */
    fun applyTransformations(input: String, steps: List<TransformationStepConfig>): String =
        steps.fold(input) { current, step ->
            when (val transformation = getTransformation(step)) {
                is Transformation -> transformation.apply(current)

                // Presence of the parameters was verified by getTransformation. A copy is handed
                // over so an implementation cannot alter the configured list.
                is ParameterizedTransformation ->
                    transformation.apply(current, step.parameters().get().toMutableList())

                else -> throw IllegalStateException("Invalid transformation type")
            }
        }
}
|
||||||
92
src/main/kotlin/com/rak/service/CommonCrawlService.kt
Normal file
92
src/main/kotlin/com/rak/service/CommonCrawlService.kt
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
package com.rak.service
|
||||||
|
|
||||||
|
import com.rak.model.cc.CCIndexSuccessResponse
|
||||||
|
import com.rak.model.cc.CCIndices
|
||||||
|
import com.rak.service.client.CommonCrawlRestClient
|
||||||
|
import io.netty.buffer.ByteBufInputStream
|
||||||
|
import io.quarkus.logging.Log
|
||||||
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
|
import org.archive.format.http.HttpResponseParser
|
||||||
|
import org.archive.io.warc.WARCReaderFactory
|
||||||
|
import org.eclipse.microprofile.rest.client.inject.RestClient
|
||||||
|
import org.jsoup.helper.DataUtil
|
||||||
|
import org.jsoup.nodes.Document
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Looks pages up in the Common Crawl index and loads archived copies of them as jsoup documents.
 */
@ApplicationScoped
class CommonCrawlService(
    @RestClient
    private val commonCrawlRestClient: CommonCrawlRestClient
) {

    companion object {
        private const val INDEX_QUERY_URL: String = "http://index.commoncrawl.org"
        private const val DATA_URL: String = "http://data.commoncrawl.org"
    }

    /**
     * Queries the `CC_2024_46` crawl index for [url].
     *
     * @return the index entry returned by the REST client
     */
    fun queryIndex(
        url: String
    ): CCIndexSuccessResponse {
        return commonCrawlRestClient.queryIndex(
            INDEX_QUERY_URL,
            url,
            CCIndices.CC_2024_46.indexName
        )
    }

    /**
     * Queries every crawl listed in [CCIndices] for [url].
     *
     * A crawl whose query throws a [RuntimeException] is logged as a warning and skipped.
     *
     * @return the responses of all crawls that answered, in [CCIndices] declaration order
     */
    fun queryAllCrawlIndices(
        url: String
    ): List<CCIndexSuccessResponse> {
        val responses = mutableListOf<CCIndexSuccessResponse>()
        for (crawl in CCIndices.entries) {
            try {
                responses.add(
                    commonCrawlRestClient.queryIndex(
                        INDEX_QUERY_URL,
                        url,
                        crawl.indexName
                    )
                )
            } catch (ex: RuntimeException) {
                // Pass the exception along so the cause of the failure shows up in the log.
                Log.warn("Error occurred querying crawl '${crawl.indexName}' for URL $url", ex)
            }
        }

        return responses
    }

    /**
     * Downloads the WARC slice described by [ccIndexSuccessResponse] and parses the HTTP response
     * of its first record into a jsoup [Document].
     *
     * The body is always decoded as UTF-8.
     *
     * @param ccIndexSuccessResponse index entry supplying the file name, length and offset
     * @param baseUri base URI handed to jsoup for the parsed document
     * @return the parsed document, or `null` when the archive contains no record
     */
    fun getDocument(
        ccIndexSuccessResponse: CCIndexSuccessResponse,
        baseUri: String
    ): Document? {
        val warcFileName = ccIndexSuccessResponse.fileName
        val warcStream: ByteBufInputStream = commonCrawlRestClient.getWarcArchive(
            DATA_URL,
            warcFileName,
            ccIndexSuccessResponse.length,
            ccIndexSuccessResponse.offset
        )

        // `use` closes the downloaded stream on every exit path, including the early return.
        return warcStream.use { stream ->
            // Identify the archive by the file that was actually requested rather than by a
            // fixed sample name.
            val reader = WARCReaderFactory.get(
                warcFileName,
                stream,
                true
            )

            // Only the first record of the slice is used.
            val firstRecord = reader.firstOrNull() ?: return null
            val httpResponse = HttpResponseParser().parse(firstRecord.buffered())

            DataUtil.load(
                httpResponse.buffered(),
                "UTF-8",
                baseUri
            )
        }
    }
}
|
||||||
@@ -1,29 +0,0 @@
|
|||||||
package com.rak.service
|
|
||||||
|
|
||||||
import jakarta.ws.rs.GET
|
|
||||||
import jakarta.ws.rs.Path
|
|
||||||
import jakarta.ws.rs.QueryParam
|
|
||||||
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
|
|
||||||
|
|
||||||
/**
|
|
||||||
* To use it via injection.
|
|
||||||
*
|
|
||||||
* ```kotlin
|
|
||||||
* @Inject
|
|
||||||
* @RestClient
|
|
||||||
* lateinit var myRemoteService: MyRemoteService
|
|
||||||
*
|
|
||||||
* fun doSomething() {
|
|
||||||
* val restClientExtensions = myRemoteService.getExtensionsById("io.quarkus:quarkus-rest-client")
|
|
||||||
* }
|
|
||||||
* ```
|
|
||||||
*/
|
|
||||||
@RegisterRestClient(baseUri = "https://stage.code.quarkus.io/api")
|
|
||||||
interface MyRemoteService {
|
|
||||||
|
|
||||||
@GET
|
|
||||||
@Path("/extensions")
|
|
||||||
fun getExtensionsById(@QueryParam("id") id: String): Set<Extension>
|
|
||||||
|
|
||||||
data class Extension(val id: String, val name: String, val shortName: String, val keywords: List<String>)
|
|
||||||
}
|
|
||||||
@@ -1,89 +1,91 @@
|
|||||||
package com.rak.service
|
package com.rak.service
|
||||||
|
|
||||||
import com.rak.config.RegionalSetDefinition
|
import com.rak.config.model.ProviderConfig
|
||||||
import com.rak.config.SourcesConfiguration
|
import com.rak.model.card.Card
|
||||||
import com.rak.config.Step
|
import com.rak.model.exception.NotImplementedException
|
||||||
import com.rak.util.XPathUtil
|
import com.rak.model.exception.TargetNotFoundException
|
||||||
|
import com.rak.model.set.CardSet
|
||||||
|
import com.rak.model.set.RegionalSet
|
||||||
|
import com.rak.service.extract.RegionalSetExtractionService
|
||||||
|
import com.rak.service.extract.SetExtractionService
|
||||||
|
import io.quarkus.logging.Log
|
||||||
import jakarta.enterprise.context.ApplicationScoped
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
import org.jsoup.Jsoup
|
import org.jsoup.Jsoup
|
||||||
import org.jsoup.nodes.Document
|
import org.jsoup.nodes.Document
|
||||||
import org.jsoup.nodes.Element
|
import java.lang.Exception
|
||||||
|
|
||||||
@ApplicationScoped
|
@ApplicationScoped
|
||||||
class ScrapeService(
|
class ScrapeService(
|
||||||
private val sourceService: SourceService
|
private val sourceService: SourceService,
|
||||||
|
private val setExtractionService: SetExtractionService,
|
||||||
|
private val regionalSetExtractionService: RegionalSetExtractionService,
|
||||||
|
private val commonCrawlService: CommonCrawlService
|
||||||
) {
|
) {
|
||||||
|
|
||||||
companion object {
|
fun ProviderConfig.buildUrl(targetName: String): String {
|
||||||
private val TEXT_NODE_MATCHER: Regex = Regex("text\\(\\)$")
|
return this.getUrlPattern().format(targetName)
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun extractTextFromRootBySteps(
|
fun scrapeSet(
|
||||||
root: Element,
|
|
||||||
steps: Set<Step>
|
|
||||||
): String? {
|
|
||||||
var currentElement: Element? = root.clone()
|
|
||||||
var result: String? = null
|
|
||||||
|
|
||||||
for (index in 0 until steps.size) {
|
|
||||||
val currentStep = steps.elementAtOrNull(index) ?: return null
|
|
||||||
if (currentElement == null) {
|
|
||||||
throw IllegalStateException()
|
|
||||||
}
|
|
||||||
|
|
||||||
if (index == steps.size - 1) {
|
|
||||||
result = XPathUtil.extractResult(currentElement, currentStep.value())
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
currentElement = XPathUtil.getNextElement(currentElement, currentStep.value())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
fun extractSet(
|
|
||||||
provider: String,
|
provider: String,
|
||||||
setName: String,
|
setName: String,
|
||||||
): Map<String, String> {
|
): CardSet {
|
||||||
val source =
|
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
|
||||||
sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
|
|
||||||
|
|
||||||
|
val path: String = normalizePath(setName)
|
||||||
|
val url = source.buildUrl(path)
|
||||||
|
val ccIndexResponses = commonCrawlService.queryAllCrawlIndices(url).sortedBy { it.timestamp }
|
||||||
|
|
||||||
|
var document: Document? = null
|
||||||
|
|
||||||
|
for (indexResponse in ccIndexResponses) {
|
||||||
|
document = commonCrawlService.getDocument(
|
||||||
|
indexResponse,
|
||||||
|
source.getDomain()
|
||||||
|
)
|
||||||
|
|
||||||
|
if (document != null) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (document == null) {
|
||||||
|
// Fallback to Jsoup directly
|
||||||
|
try {
|
||||||
|
document = Jsoup.connect(url).get()
|
||||||
|
} catch(ex: Exception) {
|
||||||
|
Log.warn("Error occurred during Jsoup query")
|
||||||
|
throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return setExtractionService.extract(
|
||||||
|
document,
|
||||||
|
source,
|
||||||
|
source.getTargets().getSetConfig().get()
|
||||||
|
).apply {
|
||||||
|
name = setName
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fun scrapeRegionalSet(
|
||||||
|
provider: String,
|
||||||
|
setName: String,
|
||||||
|
): RegionalSet {
|
||||||
|
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
|
||||||
|
|
||||||
val path: String = normalizePath(setName)
|
val path: String = normalizePath(setName)
|
||||||
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
|
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
|
||||||
val regionalSetSelector = source.getItems().regionalSet().get()
|
|
||||||
|
|
||||||
val regionalSetRoot = document.selectFirst(regionalSetSelector.rootSelector().value())!!
|
return regionalSetExtractionService.extract(document, source, source.getTargets().getSetConfig().get())
|
||||||
|
|
||||||
val setId: String? = extractTextFromRootBySteps(
|
|
||||||
regionalSetRoot,
|
|
||||||
regionalSetSelector.idSelector().steps()
|
|
||||||
)
|
|
||||||
val setLanguage: String? = extractTextFromRootBySteps(
|
|
||||||
regionalSetRoot,
|
|
||||||
regionalSetSelector.languageSelector().steps()
|
|
||||||
)
|
|
||||||
val setKey: String? = extractTextFromRootBySteps(
|
|
||||||
regionalSetRoot,
|
|
||||||
regionalSetSelector.regionKeySelector().steps()
|
|
||||||
)
|
|
||||||
|
|
||||||
return mapOf(
|
|
||||||
Pair("id", setId ?: "N/A"),
|
|
||||||
Pair("language", setLanguage ?: "N/A"),
|
|
||||||
Pair("key", setKey ?: "N/A"),
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fun scrapeCard(
|
||||||
fun extractCard(
|
|
||||||
provider: String,
|
provider: String,
|
||||||
cardName: String,
|
cardName: String,
|
||||||
): Map<String, String> {
|
): Card? {
|
||||||
val path: String = normalizePath(cardName)
|
throw NotImplementedException("Not implemented")
|
||||||
return mapOf()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private fun normalizePath(path: String): String = path
|
private fun normalizePath(path: String): String = path
|
||||||
|
|||||||
@@ -1,15 +1,86 @@
|
|||||||
package com.rak.service
|
package com.rak.service
|
||||||
|
|
||||||
import com.rak.config.SourceConfig
|
import com.rak.config.model.CardScrapeTargetConfig
|
||||||
import com.rak.config.SourcesConfiguration
|
import com.rak.config.model.SetScrapeTargetConfig
|
||||||
|
import com.rak.config.model.ProviderConfig
|
||||||
|
import com.rak.config.model.SourcesConfig
|
||||||
|
import com.rak.model.exception.InvalidConfigurationException
|
||||||
|
import io.quarkus.runtime.Startup
|
||||||
|
import jakarta.annotation.PostConstruct
|
||||||
import jakarta.enterprise.context.ApplicationScoped
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
|
|
||||||
|
@Startup
|
||||||
@ApplicationScoped
|
@ApplicationScoped
|
||||||
class SourceService(
|
class SourceService(
|
||||||
val sourcesConfiguration: SourcesConfiguration
|
val sourcesConfiguration: SourcesConfig
|
||||||
) {
|
) {
|
||||||
|
|
||||||
fun getSources(): Set<SourceConfig> = sourcesConfiguration.getSources().toSet()
|
@PostConstruct
|
||||||
fun getSourceById(id: String): SourceConfig? = getSources().firstOrNull { it.getId() == id }
|
fun init() {
|
||||||
|
sourcesConfiguration.getSources().forEach { validateSource(it) }
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Validates the set and card target configurations of [providerConfig], each only when present.
 */
private fun validateSource(providerConfig: ProviderConfig) {
    val setTarget = providerConfig.getTargets().getSetConfig()
    val cardTarget = providerConfig.getTargets().getCardConfig()

    setTarget.ifPresent { validateSetExtractConfig(it) }
    cardTarget.ifPresent { validateCardExtractConfig(it) }
}
|
||||||
|
|
||||||
|
/**
 * Checks that the set target uses either one global extraction root or a dedicated root on each
 * of its language, id and region-key fields, never a mixture of both.
 *
 * @throws InvalidConfigurationException if the roots are configured inconsistently
 */
private fun validateSetExtractConfig(setExtractConfig: SetScrapeTargetConfig) {
    val fieldConfigs = listOf(
        setExtractConfig.getLanguageConfig(),
        setExtractConfig.getIdConfig(),
        setExtractConfig.getRegionKeyConfig()
    )

    if (setExtractConfig.getRootConfig().isPresent) {
        // Global root configured: no field may bring its own root.
        val anyDedicatedRoot = fieldConfigs.any { it.getRootConfig().isPresent }
        if (anyDedicatedRoot) {
            throw InvalidConfigurationException(
                "Dedicated extraction roots cannot be set when a global extraction root is configured"
            )
        }
        return
    }

    // No global root: every field has to bring its own.
    val allDedicatedRoots = fieldConfigs.all { it.getRootConfig().isPresent }
    if (!allDedicatedRoots) {
        throw InvalidConfigurationException(
            "Dedicated extraction roots have to be set when a global extraction root is not configured"
        )
    }
}
|
||||||
|
|
||||||
|
/**
 * Checks that the card target uses either one global extraction root or a dedicated root on each
 * of its English-name, description, card-type, attack and defense fields, never a mixture.
 *
 * @throws InvalidConfigurationException if the roots are configured inconsistently
 */
private fun validateCardExtractConfig(cardScrapeTargetConfig: CardScrapeTargetConfig) {
    val fieldConfigs = listOf(
        cardScrapeTargetConfig.getEnglishNameConfig(),
        cardScrapeTargetConfig.getDescriptionConfig(),
        cardScrapeTargetConfig.getCardTypeConfig(),
        cardScrapeTargetConfig.getAttackConfig(),
        cardScrapeTargetConfig.getDefenseConfig(),
    )

    if (cardScrapeTargetConfig.getRootConfig().isPresent) {
        // Global root configured: no field may bring its own root.
        val anyDedicatedRoot = fieldConfigs.any { it.getRootConfig().isPresent }
        if (anyDedicatedRoot) {
            throw InvalidConfigurationException(
                "Dedicated extraction roots cannot be set when a global extraction root is configured"
            )
        }
        return
    }

    // No global root: every field has to bring its own.
    val allDedicatedRoots = fieldConfigs.all { it.getRootConfig().isPresent }
    if (!allDedicatedRoots) {
        throw InvalidConfigurationException(
            "Dedicated extraction roots have to be set when a global extraction root is not configured"
        )
    }
}
|
||||||
|
|
||||||
|
/** Returns the configured providers as a set. */
fun getSources(): Set<ProviderConfig> {
    val configuredSources = sourcesConfiguration.getSources()
    return configuredSources.toSet()
}
|
||||||
|
/** Returns the first configured provider whose id equals [id], or `null` if there is none. */
fun getSourceById(id: String): ProviderConfig? {
    for (candidate in getSources()) {
        if (candidate.getId() == id) {
            return candidate
        }
    }
    return null
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -0,0 +1,49 @@
|
|||||||
|
package com.rak.service.client
|
||||||
|
|
||||||
|
import com.rak.util.NDJsonReader
|
||||||
|
import com.rak.model.cc.CCIndexSuccessResponse
|
||||||
|
import io.netty.buffer.ByteBufInputStream
|
||||||
|
import io.quarkus.rest.client.reactive.ClientQueryParam
|
||||||
|
import io.quarkus.rest.client.reactive.NotBody
|
||||||
|
import io.quarkus.rest.client.reactive.Url
|
||||||
|
import jakarta.ws.rs.Consumes
|
||||||
|
import jakarta.ws.rs.GET
|
||||||
|
import jakarta.ws.rs.Path
|
||||||
|
import jakarta.ws.rs.PathParam
|
||||||
|
import jakarta.ws.rs.QueryParam
|
||||||
|
import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam
|
||||||
|
import org.eclipse.microprofile.rest.client.annotation.RegisterProvider
|
||||||
|
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
|
||||||
|
|
||||||
|
/**
 * REST client for the Common Crawl index and data endpoints.
 *
 * Both methods take the target base URL as an `@Url` argument, so the `baseUri` given to
 * `@RegisterRestClient` is only a placeholder value.
 */
@RegisterRestClient(baseUri = "whatever")
@RegisterProvider(NDJsonReader::class)
interface CommonCrawlRestClient {

    /**
     * Issues `GET {baseUrl}/{indexName}-index?url={queryUrl}&output=json` and reads the
     * `text/x-ndjson` response as a [CCIndexSuccessResponse].
     */
    @GET
    @Path("/{index}-index")
    @Consumes("text/x-ndjson")
    @ClientQueryParam(name = "output", value = ["json"])
    fun queryIndex(
        @Url baseUrl: String,
        @QueryParam("url") queryUrl: String,
        @PathParam("index") indexName: String
    ): CCIndexSuccessResponse

    /**
     * Issues `GET {baseUrl}/{fileName}` with a `Range` header whose value is supplied by
     * `com.rak.util.HttpUtil.computeHeader`.
     *
     * [fileLength] and [fileOffset] are marked `@NotBody`, i.e. they are not sent as the request
     * body. NOTE(review): presumably they are the inputs to the computed `Range` header; confirm
     * against `HttpUtil.computeHeader`.
     */
    @GET
    @Path("/{fileName}")
    @ClientHeaderParam(name = "Range", value = ["{com.rak.util.HttpUtil.computeHeader}"])
    fun getWarcArchive(
        @Url baseUrl: String,
        @PathParam("fileName") fileName: String,
        @NotBody fileLength: Int,
        @NotBody fileOffset: Int
    ): ByteBufInputStream
}
|
||||||
@@ -0,0 +1,229 @@
|
|||||||
|
package com.rak.service.extract
|
||||||
|
|
||||||
|
import com.rak.config.model.AbstractScrapeTargetConfig
|
||||||
|
import com.rak.config.model.ExtractConfig
|
||||||
|
import com.rak.config.model.ProviderConfig
|
||||||
|
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||||
|
import com.rak.model.Selector
|
||||||
|
import com.rak.model.exception.ElementNotFoundException
|
||||||
|
import com.rak.model.exception.InvalidConfigurationException
|
||||||
|
import com.rak.model.transform.TransformationRegistry
|
||||||
|
import com.rak.util.CssUtil
|
||||||
|
import com.rak.util.XPathUtil
|
||||||
|
import org.jsoup.nodes.Element
|
||||||
|
import org.jsoup.select.Elements
|
||||||
|
import java.util.Optional
|
||||||
|
import kotlin.jvm.optionals.getOrElse
|
||||||
|
|
||||||
|
// find root element from global or node config
|
||||||
|
// get field target configs as list
|
||||||
|
// extract item from root element via field config
|
||||||
|
|
||||||
|
/**
 * Base class for services that extract model objects of type [E] from a jsoup [Element] according
 * to a scrape target configuration of type [T].
 *
 * Subclasses name the fields to extract through [getItems]; this class resolves root elements,
 * walks each field's extraction steps, applies the configured transformations and returns the
 * values as string maps for the subclass to convert.
 */
abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {

    private val transformationRegistry = TransformationRegistry()

    /** Maps each result key to the field configuration that produces its value. */
    abstract fun T.getItems(): Map<String, ScrapeTargetFieldConfig>

    /** Extracts a single [E] from [element]. */
    abstract fun extract(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: T
    ): E

    /** Extracts a list of [E] from [element]. */
    abstract fun extractMultiple(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: T
    ): List<E>

    /** Extracts grouped lists of [E] from [element]. */
    abstract fun extractNestedMultiples(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: T
    ): List<List<E>>

    /**
     * Returns the first element selected by the effective root configuration.
     *
     * @throws ElementNotFoundException if the root selector matches nothing
     * @throws InvalidConfigurationException if neither root configuration is present
     */
    fun getRootElement(
        element: Element,
        globalRootExtractConfig: Optional<ExtractConfig>,
        nodeRootExtractConfig: Optional<ExtractConfig>
    ): Element {
        return getRootElements(
            element,
            globalRootExtractConfig,
            nodeRootExtractConfig
        ).firstOrNull()
            ?: throw ElementNotFoundException("No element matched the configured extraction root")
    }

    /**
     * Returns every element selected by the effective root configuration: the global one when
     * present, otherwise the field-level one.
     *
     * @throws InvalidConfigurationException if neither root configuration is present
     */
    fun getRootElements(
        element: Element,
        globalRootExtractConfig: Optional<ExtractConfig>,
        nodeRootExtractConfig: Optional<ExtractConfig>
    ): Elements {
        val rootExtractConfig: ExtractConfig = globalRootExtractConfig.getOrElse {
            nodeRootExtractConfig.orElseThrow {
                InvalidConfigurationException(
                    "Neither a global nor a dedicated extraction root is configured"
                )
            }
        }

        return getElementsFromElementByExtractConfig(
            element,
            rootExtractConfig
        )
    }

    /** Returns the first element matched by [step] below [element], or `null` if none matches. */
    protected fun getElementFromDocumentByExtractConfig(
        element: Element,
        step: ExtractConfig,
    ): Element? {
        return getElementsFromElementByExtractConfig(element, step).firstOrNull()
    }

    /**
     * Runs the query of [step] against [element]: as a CSS selector when the selector type is
     * [Selector.CSS], otherwise as an XPath expression.
     */
    protected fun getElementsFromElementByExtractConfig(
        element: Element,
        step: ExtractConfig,
    ): Elements {
        return if (step.selectorType() == Selector.CSS) {
            element.select(step.getQueryString())
        } else {
            element.selectXpath(step.getQueryString())
        }
    }

    /**
     * Extracts one value per configured field, each from the first matching root element.
     *
     * @return a map from field identifier to extracted text
     * @throws ElementNotFoundException if a root or a field value cannot be found
     */
    protected fun extractSingle(
        document: Element,
        extractionConfig: T
    ): Map<String, String> {
        val result = mutableMapOf<String, String>()

        extractionConfig.getItems().forEach { (identifier, fieldConfig) ->
            val rootElement = getRootElement(
                document,
                extractionConfig.getRootConfig(),
                fieldConfig.getRootConfig()
            )

            val extractedText = extractTextFromElementByTargetFieldConfig(
                rootElement,
                fieldConfig
            ) ?: throw ElementNotFoundException("Could not find element for '$identifier'")

            result[identifier] = extractedText
        }

        return result
    }

    /**
     * Extracts every configured field once per matching root element.
     *
     * The value taken from the n-th root element of a field is stored in the n-th result map, so
     * each map collects the values that share a position across all fields.
     *
     * @return one map per root-element position, keyed by field identifier
     * @throws ElementNotFoundException if a field value cannot be found in a root element
     */
    fun extractMulti(
        element: Element,
        extractionConfig: T
    ): List<Map<String, String>> {
        val resultList = mutableListOf<MutableMap<String, String>>()

        extractionConfig.getItems().forEach { (identifier, fieldConfig) ->
            val rootElements = getRootElements(
                element,
                extractionConfig.getRootConfig(),
                fieldConfig.getRootConfig()
            )

            rootElements.forEachIndexed { index, rootElement ->
                val extractedText = extractTextFromElementByTargetFieldConfig(
                    rootElement,
                    fieldConfig
                ) ?: throw ElementNotFoundException("Could not find element for '$identifier'")

                // Reuse the map at this position, or append a new one when the position does not
                // exist yet; a bounds check replaces the former catch of IndexOutOfBoundsException.
                val mapToModify = resultList.getOrNull(index)
                    ?: mutableMapOf<String, String>().also { resultList.add(it) }

                mapToModify[identifier] = extractedText
            }
        }

        return resultList
    }

    /**
     * Splits [element] into groups using the discriminator's root configuration and runs
     * [extractMulti] on every group.
     *
     * @return the [extractMulti] result of each group, in match order
     * @throws InvalidConfigurationException if no discriminator is configured
     */
    fun extractMultiWithDiscriminator(
        element: Element,
        extractionConfig: T
    ): List<List<Map<String, String>>> {
        val discriminator = extractionConfig.getDiscriminator().orElseThrow {
            InvalidConfigurationException("A discriminator is required to extract grouped items")
        }

        val groupRoots = getRootElements(
            element,
            discriminator.getRootConfig(),
            Optional.empty<ExtractConfig>()
        )

        return groupRoots.map { groupRoot ->
            extractMulti(
                groupRoot,
                extractionConfig
            )
        }
    }

    /**
     * Walks the extraction steps of [extractionConfig] starting at a clone of [root], reads the
     * text at the final step and applies the optional transformation steps.
     *
     * If the walk fails with an [ElementNotFoundException] or [IllegalStateException] and a
     * fallback is configured, the fallback's default value is returned instead; any other
     * failure, or a failure without fallback, is rethrown.
     *
     * @return the extracted (and transformed) text, the fallback default, or `null` when a step
     * cannot be read from the step collection
     */
    private fun extractTextFromElementByTargetFieldConfig(
        root: Element,
        extractionConfig: ScrapeTargetFieldConfig
    ): String? {
        val extractionSteps = extractionConfig.getExtractionSteps()
        val transformationSteps = extractionConfig.getOptionalTransformationSteps()

        var currentElement: Element? = root.clone()
        var result: String? = null

        try {
            for (index in 0 until extractionSteps.size) {
                val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
                if (currentElement == null) {
                    // A previous step produced no element to continue from.
                    throw IllegalStateException()
                }

                if (index == extractionSteps.size - 1) {
                    // Final step: read the text result.
                    result = when (currentStep.selectorType()) {
                        Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
                        Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
                    }
                } else {
                    // Intermediate step: move on to the next element.
                    currentElement = when (currentStep.selectorType()) {
                        Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
                        Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
                    }
                }
            }

            if (result == null) {
                throw ElementNotFoundException("Result could not be extracted")
            }

            if (transformationSteps.isPresent) {
                result = transformationRegistry.applyTransformations(result, transformationSteps.get())
            }
        } catch (ex: RuntimeException) {
            val recoverable = ex is ElementNotFoundException || ex is IllegalStateException
            val fallback = extractionConfig.getFallbackConfiguration()
            if (!recoverable || !fallback.isPresent) {
                throw ex
            }
            result = fallback.get().getOptionalDefaultValue()
        }

        return result
    }
}
|
||||||
@@ -0,0 +1,53 @@
|
|||||||
|
package com.rak.service.extract
|
||||||
|
|
||||||
|
import com.rak.config.model.CardPrintScrapeTargetConfig
|
||||||
|
import com.rak.config.model.ProviderConfig
|
||||||
|
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||||
|
import com.rak.model.card.CardPrint
|
||||||
|
import com.rak.model.exception.NotImplementedException
|
||||||
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
|
import org.jsoup.nodes.Element
|
||||||
|
|
||||||
|
/**
 * Extraction service for [CardPrint]s. Only grouped extraction is implemented; the single and
 * flat-list variants throw [NotImplementedException].
 */
@ApplicationScoped
class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrintScrapeTargetConfig>() {

    /** Field configurations keyed by `id`, `name`, `regionalName` and `rarity`. */
    override fun CardPrintScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> =
        mapOf(
            "id" to getIdConfig(),
            "name" to getNameConfig(),
            "regionalName" to getRegionNameConfig(),
            "rarity" to getRarityConfig(),
        )

    /** Not supported; always throws [NotImplementedException]. */
    override fun extract(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: CardPrintScrapeTargetConfig
    ): CardPrint = throw NotImplementedException("Not implemented")

    /** Not supported; always throws [NotImplementedException]. */
    override fun extractMultiple(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: CardPrintScrapeTargetConfig
    ): List<CardPrint> = throw NotImplementedException("Not implemented")

    /**
     * Extracts the discriminator-grouped field maps from [element] and converts each map into a
     * [CardPrint] with [CardPrint.fromMap], keeping the grouping.
     */
    override fun extractNestedMultiples(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: CardPrintScrapeTargetConfig
    ): List<List<CardPrint>> {
        val groupedFieldMaps = extractMultiWithDiscriminator(element, extractionConfig)
        val groupedPrints = mutableListOf<List<CardPrint>>()
        for (group in groupedFieldMaps) {
            groupedPrints.add(group.map { fieldMap -> CardPrint.fromMap(fieldMap) })
        }
        return groupedPrints
    }
}
|
||||||
@@ -0,0 +1,62 @@
|
|||||||
|
package com.rak.service.extract
|
||||||
|
|
||||||
|
import com.rak.config.model.ProviderConfig
|
||||||
|
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||||
|
import com.rak.config.model.SetScrapeTargetConfig
|
||||||
|
import com.rak.config.model.SourcesConfig
|
||||||
|
import com.rak.model.card.CardPrint
|
||||||
|
import com.rak.model.exception.NotImplementedException
|
||||||
|
import com.rak.model.set.RegionalSet
|
||||||
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
|
import org.jsoup.nodes.Element
|
||||||
|
|
||||||
|
/**
 * Extraction service for [RegionalSet]s. Only the flat-list variant is implemented; the single
 * and grouped variants throw [NotImplementedException].
 */
@ApplicationScoped
class RegionalSetExtractionService(
    private val cardPrintExtractionService: CardPrintExtractionService,
    private val sourcesConfig: SourcesConfig
) : AbstractExtractionService<RegionalSet, SetScrapeTargetConfig>() {

    /** Field configurations keyed by `prefix`, `regionCode` and `region`. */
    override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> =
        mapOf(
            "prefix" to getIdConfig(),
            "regionCode" to getRegionKeyConfig(),
            "region" to getLanguageConfig(),
        )

    /** Not supported; always throws [NotImplementedException]. */
    override fun extract(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): RegionalSet = throw NotImplementedException("Not implemented")

    /**
     * Extracts the regional-set field maps and the grouped card prints from [element], then
     * builds one [RegionalSet] per field map.
     *
     * Sets and card-print groups are matched by position; a set without a group at its position
     * receives an empty card-print list.
     */
    override fun extractMultiple(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): List<RegionalSet> {
        val regionalSetMaps = extractMulti(element, extractionConfig)

        val cardPrintConfig = providerConfig.getTargets().getCardPrintConfiguration().get()
        val cardPrintGroups: List<List<CardPrint>> =
            cardPrintExtractionService.extractNestedMultiples(element, providerConfig, cardPrintConfig)

        val regionalSets = mutableListOf<RegionalSet>()
        for ((position, regionalSetMap) in regionalSetMaps.withIndex()) {
            val cardPrintsForSet = cardPrintGroups.getOrNull(position).orEmpty()
            regionalSets.add(RegionalSet.fromMap(regionalSetMap, cardPrintsForSet))
        }
        return regionalSets
    }

    /** Not supported; always throws [NotImplementedException]. */
    override fun extractNestedMultiples(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): List<List<RegionalSet>> = throw NotImplementedException("Not implemented")
}
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
package com.rak.service.extract
|
||||||
|
|
||||||
|
import com.rak.config.model.ProviderConfig
|
||||||
|
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||||
|
import com.rak.config.model.SetScrapeTargetConfig
|
||||||
|
import com.rak.model.exception.NotImplementedException
|
||||||
|
import com.rak.model.set.CardSet
|
||||||
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
|
import org.jsoup.nodes.Element
|
||||||
|
|
||||||
|
/**
 * Extraction service producing a [CardSet] from a parsed document element.
 *
 * Only [extract] is implemented; the multi-item variants always throw
 * [NotImplementedException].
 */
@ApplicationScoped
class SetExtractionService(
    private val regionalSetExtractionService: RegionalSetExtractionService,
) : AbstractExtractionService<CardSet, SetScrapeTargetConfig>() {

    /**
     * Maps the output keys `prefix`, `regionCode` and `region` to the field
     * configurations supplied by this [SetScrapeTargetConfig].
     */
    override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> =
        mapOf(
            "prefix" to this.getIdConfig(),
            "regionCode" to this.getRegionKeyConfig(),
            "region" to this.getLanguageConfig(),
        )

    /**
     * Builds a [CardSet] whose regional sets are everything the
     * [RegionalSetExtractionService] finds under [element], de-duplicated into a set.
     */
    override fun extract(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): CardSet {
        // NOTE(review): the first constructor argument is the fixed literal "test" —
        // this looks like a placeholder; confirm what value is intended here.
        val placeholderName = "test"
        val regionalSets = regionalSetExtractionService
            .extractMultiple(element, providerConfig, extractionConfig)
            .toSet()
        return CardSet(placeholderName, regionalSets)
    }

    /** Multi-item extraction is not supported by this service. */
    override fun extractMultiple(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): List<CardSet> = throw NotImplementedException("Not implemented")

    /** Nested extraction is not supported by this service. */
    override fun extractNestedMultiples(
        element: Element,
        providerConfig: ProviderConfig,
        extractionConfig: SetScrapeTargetConfig
    ): List<List<CardSet>> = throw NotImplementedException("Not implemented")
}
|
||||||
19
src/main/kotlin/com/rak/util/CssUtil.kt
Normal file
19
src/main/kotlin/com/rak/util/CssUtil.kt
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
package com.rak.util
|
||||||
|
|
||||||
|
import org.jsoup.nodes.Element
|
||||||
|
|
||||||
|
/**
 * Helpers for evaluating CSS selector queries against jsoup elements.
 *
 * The class cannot be instantiated; all helpers live on the companion object.
 */
class CssUtil private constructor() {

    companion object {

        /**
         * Returns the first element produced by running the selector [path]
         * against [element], or `null` when the selection is empty.
         */
        fun getNextElement(element: Element, path: String): Element? =
            element.select(path).firstOrNull()

        /**
         * Returns the text of the first element produced by running the selector
         * [path] against [root], or `null` when the selection is empty.
         */
        fun extractResult(root: Element, path: String): String? {
            val firstMatch = root.select(path).firstOrNull()
            return firstMatch?.text()
        }
    }
}
|
||||||
23
src/main/kotlin/com/rak/util/HttpUtil.kt
Normal file
23
src/main/kotlin/com/rak/util/HttpUtil.kt
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
package com.rak.util
|
||||||
|
|
||||||
|
import io.quarkus.rest.client.reactive.ComputedParamContext
|
||||||
|
|
||||||
|
/**
 * HTTP helpers used when computing request header values.
 */
class HttpUtil {

    companion object {

        /** Template for a byte-range header value: first and last byte position, both inclusive. */
        private const val HEADER_FORMAT_STRING: String = "bytes=%d-%d"

        /**
         * Computes a byte-range header value of the form `bytes=<first>-<last>`.
         *
         * The method parameters at positions 2 and 3 (zero-based) of [context] are read
         * as the length and the offset respectively; the resulting range starts at the
         * offset and covers `length` bytes, so the last position is `offset + length - 1`.
         *
         * Values are parsed as [Long] so that offsets beyond the 32-bit range are
         * accepted, and the result is formatted with the root locale so that the digits
         * are always ASCII regardless of the JVM's default locale.
         *
         * @param context the computed-parameter context supplying the method parameters
         * @return the formatted range value
         * @throws NumberFormatException if either parameter's string form is not an integer
         * @throws IndexOutOfBoundsException if fewer than four method parameters are present
         */
        @JvmStatic
        fun computeHeader(context: ComputedParamContext): String {
            val rangeParameters = context.methodParameters().subList(2, 4)

            val fileLength = rangeParameters[0].value().toString().toLong()
            val fileOffset = rangeParameters[1].value().toString().toLong()

            val lastBytePosition = fileOffset + fileLength - 1
            return HEADER_FORMAT_STRING.format(java.util.Locale.ROOT, fileOffset, lastBytePosition)
        }
    }
}
|
||||||
45
src/main/kotlin/com/rak/util/NDJsonReader.kt
Normal file
45
src/main/kotlin/com/rak/util/NDJsonReader.kt
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
package com.rak.util
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
|
||||||
|
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
|
||||||
|
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
|
||||||
|
import com.rak.model.cc.CCIndexSuccessResponse
|
||||||
|
import jakarta.ws.rs.Consumes
|
||||||
|
import jakarta.ws.rs.core.MediaType
|
||||||
|
import jakarta.ws.rs.core.MultivaluedMap
|
||||||
|
import jakarta.ws.rs.ext.MessageBodyReader
|
||||||
|
import jakarta.ws.rs.ext.Provider
|
||||||
|
import java.io.BufferedReader
|
||||||
|
import java.io.InputStream
|
||||||
|
import java.io.InputStreamReader
|
||||||
|
import java.lang.reflect.Type
|
||||||
|
|
||||||
|
/**
 * Message body reader that turns a `text/x-ndjson` payload into a
 * [CCIndexSuccessResponse].
 *
 * Only the first line of the body is parsed; any further lines are ignored.
 */
@Provider
@Consumes("text/x-ndjson") // Handles NDJSON content
class NDJsonReader : MessageBodyReader<CCIndexSuccessResponse> {

    private val objectMapper = jacksonObjectMapper().registerModule(JavaTimeModule())

    /** Accepts only requests for exactly [CCIndexSuccessResponse]; the media type is not inspected here. */
    override fun isReadable(
        type: Class<*>?,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?
    ): Boolean = type == CCIndexSuccessResponse::class.java

    /**
     * Reads the first line of [entityStream] and deserializes it as a
     * [CCIndexSuccessResponse].
     *
     * The stream is decoded as UTF-8 (the encoding JSON text is exchanged in) rather
     * than the platform default charset, and is closed once the first line has been read.
     *
     * @throws IllegalArgumentException if the body contains no line at all
     */
    override fun readFrom(
        type: Class<CCIndexSuccessResponse>,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?,
        httpHeaders: MultivaluedMap<String, String>?,
        entityStream: InputStream
    ): CCIndexSuccessResponse =
        entityStream.bufferedReader(Charsets.UTF_8).use { reader ->
            // readLine() returns null on an empty stream; fail with a clear message
            // instead of handing null to the JSON parser.
            val firstLine = requireNotNull(reader.readLine()) {
                "NDJSON response body is empty"
            }
            objectMapper.readValue(firstLine, CCIndexSuccessResponse::class.java)
        }
}
|
||||||
@@ -3,12 +3,15 @@ package com.rak.util
|
|||||||
import com.rak.model.XPathTarget
|
import com.rak.model.XPathTarget
|
||||||
import org.jsoup.nodes.Element
|
import org.jsoup.nodes.Element
|
||||||
import org.jsoup.nodes.TextNode
|
import org.jsoup.nodes.TextNode
|
||||||
|
import org.jsoup.select.Elements
|
||||||
|
import kotlin.coroutines.CoroutineContext
|
||||||
|
|
||||||
class XPathUtil private constructor() {
|
class XPathUtil private constructor() {
|
||||||
|
|
||||||
companion object {
|
companion object {
|
||||||
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
|
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
|
||||||
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
|
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
|
||||||
|
private val INDEX_MATCHER: Regex = Regex("\\[(\\w)\\]")
|
||||||
|
|
||||||
private fun extractTextFromAttribute(root: Element, xpath: String): String? {
|
private fun extractTextFromAttribute(root: Element, xpath: String): String? {
|
||||||
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
|
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
|
||||||
@@ -20,14 +23,29 @@ class XPathUtil private constructor() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Evaluates [xpath] against [element], with support for a zero-based `[n]`
 * index token that is resolved here rather than by the XPath engine.
 *
 * When the first bracket token matched by `INDEX_MATCHER` is numeric, the part of
 * [xpath] before that token is evaluated and only the element at that zero-based
 * position is returned (an empty [Elements] when the position is out of range).
 * In every other case the expression is passed to the XPath engine unchanged.
 *
 * NOTE(review): anything following the index token is discarded — presumably
 * the token is always expected at the end of the expression; confirm.
 */
private fun selectXpath(element: Element, xpath: String): Elements {
    val indexMatch = INDEX_MATCHER.find(xpath)
        ?: return element.selectXpath(xpath)

    // The matcher also accepts letters and '_' (e.g. a `[p]` predicate); those are
    // not custom indices, so let the XPath engine handle them instead of crashing.
    val index = indexMatch.groupValues[1].toIntOrNull()
        ?: return element.selectXpath(xpath)

    val pathBeforeIndex = xpath.substring(0, indexMatch.range.first)
    val selected = element.selectXpath(pathBeforeIndex).getOrNull(index)

    return if (selected != null) Elements(selected) else Elements()
}
|
||||||
|
|
||||||
private fun extractTextFromNode(root: Element, xpath: String): String? {
|
private fun extractTextFromNode(root: Element, xpath: String): String? {
|
||||||
return root
|
return root
|
||||||
.selectXpath(xpath, TextNode::class.java)
|
.selectXpath(xpath, TextNode::class.java)
|
||||||
.firstOrNull()?.text()
|
.firstOrNull()?.text()
|
||||||
}
|
}
|
||||||
|
|
||||||
fun getNextElement(root: Element, path: String): Element? {
|
fun getNextElement(element: Element, path: String): Element? {
|
||||||
return root.selectXpath(path).firstOrNull()
|
return selectXpath(element, path).firstOrNull()
|
||||||
}
|
}
|
||||||
|
|
||||||
fun extractResult(root: Element, path: String): String? {
|
fun extractResult(root: Element, path: String): String? {
|
||||||
|
|||||||
@@ -1 +1,2 @@
|
|||||||
com.rak.config.converter.TypeSelectorConverter
|
com.rak.config.converter.TypeSelectorConverter
|
||||||
|
com.rak.config.converter.DiscriminatorDirectionConverter
|
||||||
@@ -7,10 +7,12 @@ scraper:
|
|||||||
- id: konami-official
|
- id: konami-official
|
||||||
name: "Konami Official Database"
|
name: "Konami Official Database"
|
||||||
domain: "yugioh-card.com"
|
domain: "yugioh-card.com"
|
||||||
url-patterns:
|
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||||
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
targets:
|
||||||
selectors:
|
|
||||||
card:
|
card:
|
||||||
|
root:
|
||||||
|
type: css
|
||||||
|
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||||
name:
|
name:
|
||||||
steps:
|
steps:
|
||||||
- type: "css"
|
- type: "css"
|
||||||
@@ -25,10 +27,9 @@ scraper:
|
|||||||
- id: ygo-fandom
|
- id: ygo-fandom
|
||||||
name: "Yu-Gi-Oh Fandom Wiki"
|
name: "Yu-Gi-Oh Fandom Wiki"
|
||||||
domain: "yugioh.fandom.com"
|
domain: "yugioh.fandom.com"
|
||||||
url-patterns:
|
url-pattern: "https://yugioh.fandom.com/wiki/%s"
|
||||||
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
|
targets:
|
||||||
selectors:
|
set:
|
||||||
regional-set:
|
|
||||||
root:
|
root:
|
||||||
type: css
|
type: css
|
||||||
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||||
@@ -36,6 +37,12 @@ scraper:
|
|||||||
steps:
|
steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "//li/text()"
|
value: "//li/text()"
|
||||||
|
transform:
|
||||||
|
- name: "replace"
|
||||||
|
parameters: [
|
||||||
|
" (",
|
||||||
|
""
|
||||||
|
]
|
||||||
language:
|
language:
|
||||||
steps:
|
steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
@@ -46,3 +53,79 @@ scraper:
|
|||||||
steps:
|
steps:
|
||||||
- type: xpath
|
- type: xpath
|
||||||
value: "//li/abbr/text()"
|
value: "//li/abbr/text()"
|
||||||
|
card-print:
|
||||||
|
multi: true
|
||||||
|
discriminator:
|
||||||
|
root:
|
||||||
|
type: css
|
||||||
|
value: ".wds-tab__content"
|
||||||
|
root:
|
||||||
|
type: css
|
||||||
|
value: "table > tbody > tr:has(> td)"
|
||||||
|
id:
|
||||||
|
steps:
|
||||||
|
- type: xpath
|
||||||
|
value: "./td/a[0]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
|
name:
|
||||||
|
steps:
|
||||||
|
- type: xpath
|
||||||
|
value: "./td/a[1]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
|
regional-name:
|
||||||
|
fallback:
|
||||||
|
default: "N/A"
|
||||||
|
steps:
|
||||||
|
- type: xpath
|
||||||
|
value: "./td[2]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
|
transform:
|
||||||
|
- name: "removeInnerQuotes"
|
||||||
|
parameters: []
|
||||||
|
rarity:
|
||||||
|
fallback:
|
||||||
|
default: "N/A"
|
||||||
|
steps:
|
||||||
|
- type: xpath
|
||||||
|
value: "./td/a[3]"
|
||||||
|
- type: xpath
|
||||||
|
value: "./text()"
|
||||||
|
card:
|
||||||
|
name:
|
||||||
|
root:
|
||||||
|
type: css
|
||||||
|
value: ".cardTable"
|
||||||
|
steps:
|
||||||
|
- type: "xpath"
|
||||||
|
value: "./tbody/tr[3]/th/text()"
|
||||||
|
description:
|
||||||
|
root:
|
||||||
|
type: css
|
||||||
|
value: ".cardTable"
|
||||||
|
steps:
|
||||||
|
- type: "xpath"
|
||||||
|
value: "b:contains(Card descriptions)"
|
||||||
|
type:
|
||||||
|
root:
|
||||||
|
type: css
|
||||||
|
value: ".cardTable"
|
||||||
|
steps:
|
||||||
|
- type: "xpath"
|
||||||
|
value: "b:contains(Card descriptions)"
|
||||||
|
attack:
|
||||||
|
root:
|
||||||
|
type: css
|
||||||
|
value: ".cardTable"
|
||||||
|
steps:
|
||||||
|
- type: "xpath"
|
||||||
|
value: "b:contains(Card descriptions)"
|
||||||
|
defense:
|
||||||
|
root:
|
||||||
|
type: css
|
||||||
|
value: ".cardTable"
|
||||||
|
steps:
|
||||||
|
- type: "xpath"
|
||||||
|
value: "b:contains(Card descriptions)"
|
||||||
Reference in New Issue
Block a user