Compare commits

16 Commits

Author SHA1 Message Date
786c11981b Bump version 2025-07-19 16:43:15 +02:00
7671c05893 Add missing transformation 2025-07-19 16:42:12 +02:00
de1c4fadd7 Add missing dep 2025-07-15 19:21:38 +02:00
7860819029 Add CI/CD 2025-07-15 19:14:54 +02:00
304490b52e Correct YGO Fandom name transformation regex 2025-07-06 15:05:51 +02:00
ce5b87c34e Minor model adjustments 2025-07-01 12:54:56 +02:00
a9f6efc818 Minor config adjustment 2025-07-01 12:54:32 +02:00
5930da7a4c Split Set/RegionalSet properly 2025-06-29 16:49:30 +02:00
8a0777e557 Minor config amend
Regards Set ID
2025-06-29 14:56:00 +02:00
2a79218a54 Add RegEx validation
Amend RegExReplace transformer
Amend transformations
2025-06-29 14:52:09 +02:00
ee4ce4fd65 Basic multi-method extraction 2025-06-29 13:21:18 +02:00
108b4c4c19 Basic exception mapping 2025-06-26 17:17:10 +02:00
8f934bc2b9 Basic CommonCrawl integration 2025-06-26 17:05:50 +02:00
a6ed98c36e Remove old config file 2025-06-26 13:04:14 +02:00
052bdd6a52 Refactor packages
Remove ExtractionService
2025-06-26 12:48:19 +02:00
edc604231f Change project name 2025-06-26 12:46:01 +02:00
39 changed files with 765 additions and 572 deletions

View File

@@ -0,0 +1,32 @@
# Release workflow: checks out the repository, sets up a JDK and runs the Gradle
# build with image pushing enabled through the quarkus.container-image.push property.
name: Create and Push Release
on:
# Only started manually; there is no push/tag trigger.
workflow_dispatch:
env:
# NOTE(review): none of the four variables below is referenced by a step in this
# workflow — presumably they are read by the build configuration; confirm.
AUTHENTIK_URL: https://auth.smoothbrain.win
REGISTRY_URL: gitea.smoothbrain.win
IMAGE_OWNER: rak
IMAGE_NAME: dex-scraper-java
jobs:
release:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup JDK
# NOTE(review): the action is referenced by the moving ref "main", not a fixed tag or commit.
uses: https://gitea.smoothbrain.win/rak/setup-java@main
with:
distribution: 'corretto'
java-version: '21.0.6'
cache: 'gradle'
- name: Build & Push Image
env:
# Credentials for the image push, taken from the repository secrets.
QUARKUS_CONTAINER_IMAGE_USERNAME: ${{ secrets.CI_SERVICE_ACCOUNT }}
QUARKUS_CONTAINER_IMAGE_PASSWORD: ${{ secrets.CI_SERVICE_ACCOUNT_PASSWORD }}
# Clean build; the system property switches on pushing the built image.
run: |
./gradlew clean build \
-Dquarkus.container-image.push=true

View File

@@ -14,6 +14,7 @@ val quarkusPlatformArtifactId: String by project
val quarkusPlatformVersion: String by project val quarkusPlatformVersion: String by project
dependencies { dependencies {
implementation("io.quarkus:quarkus-container-image-docker")
implementation("io.quarkus:quarkus-config-yaml") implementation("io.quarkus:quarkus-config-yaml")
implementation(enforcedPlatform("${quarkusPlatformGroupId}:${quarkusPlatformArtifactId}:${quarkusPlatformVersion}")) implementation(enforcedPlatform("${quarkusPlatformGroupId}:${quarkusPlatformArtifactId}:${quarkusPlatformVersion}"))
implementation("io.quarkus:quarkus-rest") implementation("io.quarkus:quarkus-rest")
@@ -22,15 +23,19 @@ dependencies {
implementation("io.quarkus:quarkus-rest-client-kotlin-serialization") implementation("io.quarkus:quarkus-rest-client-kotlin-serialization")
implementation("io.quarkus:quarkus-rest-jackson") implementation("io.quarkus:quarkus-rest-jackson")
implementation("io.quarkus:quarkus-kotlin") implementation("io.quarkus:quarkus-kotlin")
implementation("io.quarkus:quarkus-smallrye-fault-tolerance")
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8") implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
implementation("org.jsoup:jsoup:1.20.1")
implementation("io.quarkus:quarkus-arc") implementation("io.quarkus:quarkus-arc")
implementation("org.jsoup:jsoup:1.20.1")
implementation("org.netpreserve.commons:webarchive-commons:2.0.1")
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.19.0")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.19.1")
testImplementation("io.quarkus:quarkus-junit5") testImplementation("io.quarkus:quarkus-junit5")
testImplementation("io.rest-assured:rest-assured") testImplementation("io.rest-assured:rest-assured")
} }
group = "com.rak" group = "com.rak"
version = "1.0-SNAPSHOT" version = "0.0.2"
java { java {
sourceCompatibility = JavaVersion.VERSION_21 sourceCompatibility = JavaVersion.VERSION_21

View File

@@ -10,4 +10,4 @@ pluginManagement {
id(quarkusPluginId) version quarkusPluginVersion id(quarkusPluginId) version quarkusPluginVersion
} }
} }
rootProject.name = "jsoup-scraper" rootProject.name = "dex-scraper"

View File

@@ -1,47 +0,0 @@
# Scraper source definitions: each source has an id, a display name, a domain,
# URL patterns and per-target selectors made of ordered css/xpath steps.
scraper:
sources:
# Source 1: selectors for a card's name and attack value.
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
url-patterns:
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
selectors:
card:
name:
steps:
- type: "css"
value: "h1.product-title"
- type: "xpath"
value: "//h1[@itemprop='name']"
attack:
steps:
- type: "css"
value: ".atk-value"
# Source 2: selectors for regional sets (id, language, region key).
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com"
url-patterns:
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
selectors:
regional-set:
# CSS selector for the list items the field steps below are evaluated against
# (presumably one item per regional set — confirm against the page markup).
root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id:
steps:
- type: "xpath"
value: "//li/text()"
# Post-processing: "replace" with the two parameters " (" and "".
transform:
- name: "replace"
parameters:
- " ("
- ""
language:
steps:
- type: "xpath"
value: "//li/abbr"
- type: "xpath"
value: "//abbr/@title"
region-key:
steps:
- type: "xpath"
value: "//li/abbr/text()"

View File

@@ -0,0 +1,19 @@
package com.rak.config.converter
import org.eclipse.microprofile.config.spi.Converter
import java.util.regex.Pattern
import java.util.regex.PatternSyntaxException
/**
 * Config [Converter] that compiles a configuration value into a [Pattern].
 *
 * @throws IllegalArgumentException if the value is blank
 * @throws IllegalStateException if the value is not a syntactically valid regular expression;
 * the underlying [PatternSyntaxException] is attached as the cause
 */
class PatternConverter : Converter<Pattern> {
    override fun convert(value: String): Pattern {
        require(value.isNotBlank()) { "Pattern may not be empty" }
        return try {
            Pattern.compile(value)
        } catch (ex: PatternSyntaxException) {
            // Chain the cause so the position/description of the syntax error is not lost.
            throw IllegalStateException("'$value' is not a valid RegEx pattern", ex)
        }
    }
}

View File

@@ -0,0 +1,11 @@
package com.rak.config.model
import io.smallrye.config.WithName
import java.util.Optional
/**
 * Configuration of one extraction method: a list of extraction steps plus
 * optional transformation steps.
 */
interface ExtractorConfig {
/** Extraction steps, mapped from the `steps` key. */
@WithName("steps")
fun getExtractionSteps(): List<ExtractConfig>
/** Transformation steps, mapped from the `transform` key; the key is optional. */
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
}

View File

@@ -11,8 +11,8 @@ interface ProviderConfig {
fun getName(): String fun getName(): String
@WithName("domain") @WithName("domain")
fun getDomain(): String fun getDomain(): String
@WithName("url-patterns") @WithName("url-pattern")
fun getUrlPatterns(): Optional<MutableSet<String>> fun getUrlPattern(): String
@WithName("targets") @WithName("targets")
fun getTargets(): TargetsConfig fun getTargets(): TargetsConfig

View File

@@ -0,0 +1,12 @@
package com.rak.config.model
import io.smallrye.config.WithName
/**
 * Scrape target configuration for a regional set, made of three field
 * configurations: id, language and region key.
 */
interface RegionalSetScrapeTargetConfig : AbstractScrapeTargetConfig {
/** Field configuration mapped from the `id` key. */
@WithName("id")
fun getIdConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `language` key. */
@WithName("language")
fun getLanguageConfig(): ScrapeTargetFieldConfig
/** Field configuration mapped from the `region-key` key. */
@WithName("region-key")
fun getRegionKeyConfig(): ScrapeTargetFieldConfig
}

View File

@@ -1,15 +1,21 @@
package com.rak.config.model package com.rak.config.model
import io.smallrye.config.WithDefault
import io.smallrye.config.WithName import io.smallrye.config.WithName
import java.util.* import java.util.*
interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig { interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
@WithName("type")
fun getType(): String
@WithName("nullable")
@WithDefault("false")
fun isNullable(): Boolean
@WithName("root") @WithName("root")
fun getRootConfig(): Optional<ExtractConfig> fun getRootConfig(): Optional<ExtractConfig>
@WithName("steps") @WithName("extractors")
fun getExtractionSteps(): List<ExtractConfig> fun getExtractionMethods(): List<ExtractorConfig>
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
@WithName("fallback") @WithName("fallback")
fun getFallbackConfiguration(): Optional<FieldConfigFallback> fun getFallbackConfiguration(): Optional<FieldConfigFallback>
@WithName("validation")
fun getOptionalValidation(): Optional<ValidationConfig>
} }

View File

@@ -3,10 +3,6 @@ package com.rak.config.model
import io.smallrye.config.WithName import io.smallrye.config.WithName
interface SetScrapeTargetConfig : AbstractScrapeTargetConfig { interface SetScrapeTargetConfig : AbstractScrapeTargetConfig {
@WithName("id") @WithName("name")
fun getIdConfig(): ScrapeTargetFieldConfig fun getNameConfig(): ScrapeTargetFieldConfig
@WithName("language")
fun getLanguageConfig(): ScrapeTargetFieldConfig
@WithName("region-key")
fun getRegionKeyConfig(): ScrapeTargetFieldConfig
} }

View File

@@ -8,6 +8,8 @@ interface TargetsConfig {
fun getCardConfig(): Optional<CardScrapeTargetConfig> fun getCardConfig(): Optional<CardScrapeTargetConfig>
@WithName("set") @WithName("set")
fun getSetConfig(): Optional<SetScrapeTargetConfig> fun getSetConfig(): Optional<SetScrapeTargetConfig>
@WithName("regional-set")
fun getRegionalSetConfig(): Optional<RegionalSetScrapeTargetConfig>
@WithName("card-print") @WithName("card-print")
fun getCardPrintConfiguration(): Optional<CardPrintScrapeTargetConfig> fun getCardPrintConfiguration(): Optional<CardPrintScrapeTargetConfig>
} }

View File

@@ -0,0 +1,12 @@
package com.rak.config.model
import com.rak.config.converter.PatternConverter
import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
import java.util.regex.Pattern
/**
 * Validation settings of a scrape target field.
 */
interface ValidationConfig {
/**
 * Regular expressions mapped from the `pattern` key, converted with [PatternConverter].
 * NOTE(review): the key is singular while the return type is a list — confirm the expected config shape.
 */
@WithName("pattern")
@WithConverter(PatternConverter::class)
fun getRegexPatterns(): MutableList<Pattern>
}

View File

@@ -1,8 +1,10 @@
package com.rak.controller package com.rak.controller
import com.rak.model.card.Card import com.rak.model.card.Card
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.set.CardSet import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet import com.rak.model.set.RegionalSet
import com.rak.service.CommonCrawlService
import com.rak.service.ScrapeService import com.rak.service.ScrapeService
import jakarta.ws.rs.Consumes import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET import jakarta.ws.rs.GET
@@ -16,10 +18,9 @@ import org.jboss.resteasy.reactive.RestQuery
@Path("/api") @Path("/api")
class ScrapeController( class ScrapeController(
private val scrapeService: ScrapeService, private val scrapeService: ScrapeService,
private val commonCrawlService: CommonCrawlService
) { ) {
@GET @GET
@Path("/{provider}/set") @Path("/{provider}/set")
@Produces(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON)

View File

@@ -0,0 +1,5 @@
package com.rak.model
/**
 * Response body carrying a single error [message].
 */
data class ErrorResponse(
val message: String
)

View File

@@ -1,9 +1,7 @@
package com.rak.model.card package com.rak.model.card
import com.rak.model.set.RegionalSet
data class CardPrint( data class CardPrint(
val id: String, var id: Int,
val name: String, val name: String,
val regionalName: String? = null, val regionalName: String? = null,
val rarity: String val rarity: String
@@ -11,10 +9,17 @@ data class CardPrint(
companion object { companion object {
fun fromMap(map: Map<String, String>): CardPrint { fun fromMap(map: Map<String, String>): CardPrint {
val regionalNameValue = map["regionalName"]
val regionalName = if (regionalNameValue == "") {
null
} else {
regionalNameValue
}
return CardPrint( return CardPrint(
map["id"] ?: throw IllegalStateException("Parameter 'prefix' not found"), map["id"]?.toInt() ?: throw IllegalStateException("Parameter 'prefix' not found"),
map["name"] ?: throw IllegalStateException("Parameter 'region' not found"), map["name"] ?: throw IllegalStateException("Parameter 'region' not found"),
map["regionalName"], regionalName,
map["rarity"] ?: throw IllegalStateException("Parameter 'regionCode' not found"), map["rarity"] ?: throw IllegalStateException("Parameter 'regionCode' not found"),
) )
} }

View File

@@ -0,0 +1,5 @@
package com.rak.model.cc
/**
 * Error payload with a single [message].
 * NOTE(review): presumably the error body returned by the Common Crawl index — confirm against the API.
 */
data class CCIndexErrorResponse(
val message: String
)

View File

@@ -0,0 +1,22 @@
package com.rak.model.cc
import com.fasterxml.jackson.annotation.JsonProperty
import java.time.Instant
/**
 * One index record describing an archived capture.
 *
 * The JSON names `urlkey`, `mime-detected` and `filename` are mapped explicitly;
 * all other properties use their Kotlin name.
 *
 * NOTE(review): [timestamp] is typed as [Instant] — verify that the index's timestamp
 * format is deserialized as intended.
 */
data class CCIndexSuccessResponse(
@JsonProperty("urlkey")
val urlKey: String,
val timestamp: Instant,
val url: String,
val mime: String,
@JsonProperty("mime-detected")
val mimeDetected: String,
val status: String,
val digest: String,
// Presumably the byte length and byte offset of the record inside the archive file — TODO confirm.
val length: Int,
val offset: Int,
@JsonProperty("filename")
val fileName: String,
val languages: String,
val encoding: String,
)

View File

@@ -0,0 +1,9 @@
package com.rak.model.cc
/**
 * Common Crawl crawl identifiers that are queried, newest first.
 *
 * Each constant name mirrors its [indexName] (`CC_yyyy_ww` maps to `CC-MAIN-yyyy-ww`).
 */
enum class CCIndices(val indexName: String) {
    CC_2025_21("CC-MAIN-2025-21"),
    // Was "CC-MAIN-2024-05", which contradicted the constant name and the descending order.
    CC_2025_05("CC-MAIN-2025-05"),
    CC_2024_46("CC-MAIN-2024-46"),
    CC_2024_26("CC-MAIN-2024-26"),
    CC_2023_50("CC-MAIN-2023-50"),
}

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/** Runtime exception signalling that a requested scrape target could not be found. */
class TargetNotFoundException(message: String) : RuntimeException(message)

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/**
 * Runtime exception for a value that fails validation.
 * NOTE(review): the throwing site is not visible here — presumably raised when an extracted value
 * does not match its configured validation pattern; confirm.
 */
class ValueValidationException(message: String) : RuntimeException(message)

View File

@@ -0,0 +1,18 @@
package com.rak.model.exception.mapper
import com.rak.model.ErrorResponse
import com.rak.model.exception.NotImplementedException
import jakarta.ws.rs.core.Response
import jakarta.ws.rs.ext.ExceptionMapper
import jakarta.ws.rs.ext.Provider
/**
 * Maps a [NotImplementedException] to an HTTP 405 response whose entity is an
 * [ErrorResponse] holding the exception message (or a default text when it has none).
 */
@Provider
class NotImplementedExceptionMapper : ExceptionMapper<NotImplementedException> {
    override fun toResponse(exception: NotImplementedException): Response {
        val text = exception.message ?: "Provider does not implement this method"
        val body = ErrorResponse(text)
        return Response.status(405)
            .entity(body)
            .build()
    }
}

View File

@@ -0,0 +1,19 @@
package com.rak.model.exception.mapper
import com.rak.model.ErrorResponse
import com.rak.model.exception.NotImplementedException
import com.rak.model.exception.TargetNotFoundException
import jakarta.ws.rs.core.Response
import jakarta.ws.rs.ext.ExceptionMapper
import jakarta.ws.rs.ext.Provider
/**
 * Maps a [TargetNotFoundException] to an HTTP 404 response whose entity is an
 * [ErrorResponse] holding the exception message (or a default text when it has none).
 */
@Provider
class TargetNotFoundExceptionMapper : ExceptionMapper<TargetNotFoundException> {
    override fun toResponse(exception: TargetNotFoundException): Response {
        val text = exception.message ?: "Scrape target could not be found"
        val body = ErrorResponse(text)
        return Response.status(404)
            .entity(body)
            .build()
    }
}

View File

@@ -1,12 +1,15 @@
package com.rak.model.set package com.rak.model.set
import kotlin.collections.Set
data class CardSet( data class CardSet(
val name: String, var name: String,
val regionalSets: Set<RegionalSet> val regionalSets: Set<RegionalSet>
) { ) {
companion object { companion object {
fun fromMap(map: Map<String, String>, regionalSet: Set<RegionalSet>): CardSet {
return CardSet(
map["name"] ?: throw IllegalStateException("Parameter 'name' not found"),
regionalSet
)
}
} }
} }

View File

@@ -22,28 +22,6 @@ data class RegionalSet(
) )
} }
/**
 * Builds one [RegionalSet] per index from three parallel lists.
 *
 * Every set is created with an empty list as fourth argument and `numberOfCards = -1`.
 *
 * @param idList values used as `prefix`
 * @param languageList values used as `region`
 * @param regionKeyAliasList values used as `regionCode`
 * @throws IllegalArgumentException if the three lists do not all have the same size
 */
fun flattenFromMemberLists(
    idList: List<String>,
    languageList: List<String>,
    regionKeyAliasList: List<String>,
): MutableSet<RegionalSet> {
    // The lists are indexed in lockstep, so each one must match. The former `&&` check
    // only rejected input when BOTH other lists differed in size.
    require(idList.size == languageList.size && idList.size == regionKeyAliasList.size) {
        "Lists have to be the same size"
    }

    return idList.indices.mapTo(mutableSetOf()) { index ->
        RegionalSet(
            prefix = idList[index],
            region = languageList[index],
            regionCode = regionKeyAliasList[index],
            listOf(),
            numberOfCards = -1
        )
    }
}
} }
} }

View File

@@ -11,7 +11,7 @@ class TransformationRegistry {
init { init {
register("trim") { it.trim() } register("trim") { it.trim() }
register("removeInnerQuotes") { it.replace("\"", "") } register("removeInnerQuotes") { it.replace(Regex("^\""), "").replace(Regex("\"$"), "") }
register("replace") { input, parameters -> register("replace") { input, parameters ->
require(parameters.size == 1 || parameters.size == 2) { require(parameters.size == 1 || parameters.size == 2) {
"'replace' requires either 1 or 2 parameters" "'replace' requires either 1 or 2 parameters"
@@ -22,8 +22,11 @@ class TransformationRegistry {
input.replace(parameters[0], parameters[1]) input.replace(parameters[0], parameters[1])
} }
register("regexReplace") { input, params -> register("regexReplace") { input, params ->
require(params.size == 2) { require(params.size == 1 || params.size == 2) {
"'regexReplace' requires exactly 2 parameters" "'regexReplace' requires either 1 or 2 parameters"
}
if (params.size == 1) {
params.add("")
} }
input.replace(params[0].toRegex(), params[1]) input.replace(params[0].toRegex(), params[1])
} }

View File

@@ -0,0 +1,92 @@
package com.rak.service
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.cc.CCIndices
import com.rak.service.client.CommonCrawlRestClient
import io.netty.buffer.ByteBufInputStream
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped
import org.archive.format.http.HttpResponseParser
import org.archive.io.warc.WARCReaderFactory
import org.eclipse.microprofile.rest.client.inject.RestClient
import org.jsoup.helper.DataUtil
import org.jsoup.nodes.Document
/**
 * Queries the Common Crawl index through [CommonCrawlRestClient] and loads an archived
 * capture as a jsoup [Document].
 */
@ApplicationScoped
class CommonCrawlService(
    @RestClient
    private val commonCrawlRestClient: CommonCrawlRestClient
) {

    /**
     * Looks up [url] in the single crawl [CCIndices.CC_2024_46].
     *
     * @return the index record returned by the REST client
     */
    fun queryIndex(
        url: String
    ): CCIndexSuccessResponse {
        return commonCrawlRestClient.queryIndex(
            INDEX_QUERY_URL,
            url,
            CCIndices.CC_2024_46.indexName
        )
    }

    /**
     * Looks up [url] in every crawl listed in [CCIndices].
     *
     * A crawl whose query fails with a [RuntimeException] is logged and skipped (best effort).
     *
     * @return the records of all crawls that answered, in [CCIndices] declaration order
     */
    fun queryAllCrawlIndices(
        url: String
    ): List<CCIndexSuccessResponse> {
        val responses = mutableListOf<CCIndexSuccessResponse>()

        for (crawlName in CCIndices.entries) {
            try {
                responses.add(
                    commonCrawlRestClient.queryIndex(
                        INDEX_QUERY_URL,
                        url,
                        crawlName.indexName
                    )
                )
            } catch (ex: RuntimeException) {
                Log.warn("Error occurred querying crawl '${crawlName.indexName}' for URL $url", ex)
            }
        }

        return responses
    }

    /**
     * Downloads the archive range described by [ccIndexSuccessResponse] and parses the HTTP
     * response of its first WARC record into a [Document].
     *
     * @param ccIndexSuccessResponse index record providing file name, length and offset
     * @param baseUri base URI handed to jsoup
     * @return the parsed document, or `null` when the archive range contains no record
     */
    fun getDocument(
        ccIndexSuccessResponse: CCIndexSuccessResponse,
        baseUri: String
    ): Document? {
        val warcStream: ByteBufInputStream = commonCrawlRestClient.getWarcArchive(
            DATA_URL,
            ccIndexSuccessResponse.fileName,
            ccIndexSuccessResponse.length,
            ccIndexSuccessResponse.offset
        )

        // Identify the reader by the real archive name from the index record instead of a
        // hard-coded name left over from one specific capture.
        // NOTE(review): assumes index file names end in ".warc.gz" like the former constant — confirm.
        val reader = WARCReaderFactory.get(
            ccIndexSuccessResponse.fileName,
            warcStream,
            true
        )

        // `use` closes the reader once the document has been parsed; previously it was never closed.
        return reader.use { warcReader ->
            val record = warcReader.firstOrNull() ?: return null
            val http = HttpResponseParser().parse(record.buffered())
            DataUtil.load(
                http.buffered(),
                "UTF-8",
                baseUri
            )
        }
    }

    companion object {
        private const val INDEX_QUERY_URL: String = "http://index.commoncrawl.org"
        private const val DATA_URL: String = "http://data.commoncrawl.org"
    }
}

View File

@@ -1,273 +0,0 @@
package com.rak.service
import com.rak.config.model.CardPrintScrapeTargetConfig
import com.rak.config.model.ExtractConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.model.Selector
import com.rak.model.card.Card
import com.rak.model.card.CardPrint
import com.rak.model.exception.ElementNotFoundException
import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import com.rak.model.transform.TransformationRegistry
import com.rak.util.XPathUtil
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
import java.util.Optional
/**
 * Extracts sets, regional sets and cards from parsed documents, driven by the selector
 * configuration of a provider looked up through [SourceService].
 */
@ApplicationScoped
class ExtractionService(
    private val sourceService: SourceService,
) {

    private val transformationRegistry = TransformationRegistry()

    /** Builds a [CardSet] named [setName] from the regional sets found below [root]. */
    fun extractSet(setName: String, root: Element, provider: String): CardSet {
        return CardSet(
            name = setName,
            regionalSets = extractRegionalSets(root, provider)
        )
    }

    /**
     * Resolves the root element for an extraction.
     *
     * The global root configuration wins; the node-level one is only consulted when no global
     * one is present.
     *
     * @throws InvalidConfigurationException if neither configuration is present
     * @throws ElementNotFoundException if the configured root cannot be found in [document]
     */
    fun getRootElement(
        document: Document,
        globalRootExtractConfig: Optional<ExtractConfig>,
        nodeRootExtractConfig: Optional<ExtractConfig>
    ): Element {
        // orElseGet defers the fallback: the former orElse(orElseThrow { .. }) evaluated the
        // node-level lookup eagerly and threw even when a global root was configured.
        val rootExtractConfig: ExtractConfig = globalRootExtractConfig.orElseGet {
            nodeRootExtractConfig.orElseThrow {
                InvalidConfigurationException("")
            }
        }

        return getElementFromDocumentByExtractConfig(document, rootExtractConfig)
            ?: throw ElementNotFoundException("No root could be found")
    }

    /**
     * Runs the name extraction for a card print.
     *
     * The extracted name is not used yet; the function always returns `null` unless the
     * extraction throws.
     */
    fun extractCardPrint(document: Document, cardPrintConfig: CardPrintScrapeTargetConfig): CardPrint? {
        extractTextFromElementByTargetFieldConfig(
            getRootElement(
                document,
                cardPrintConfig.getRootConfig(),
                cardPrintConfig.getNameConfig().getRootConfig()
            ),
            cardPrintConfig.getNameConfig()
        )

        return null
    }

    /**
     * Extracts one [RegionalSet] (id, language, region key) from [root].
     *
     * With a target-level root configured, all three fields are read relative to [root];
     * otherwise each field is read relative to its own configured root.
     *
     * @throws IllegalArgumentException if [provider] is unknown
     * @throws IllegalStateException if a field cannot be extracted
     */
    fun extractRegionalSet(root: Element, provider: String): RegionalSet {
        val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
        val setExtractionConfig = source.getTargets().getSetConfig().get()

        val setId: String
        val setLanguage: String
        val setKey: String

        if (setExtractionConfig.getRootConfig().isPresent) {
            setId = extractRequiredText(root, setExtractionConfig.getIdConfig(), "id")
            setLanguage = extractRequiredText(root, setExtractionConfig.getLanguageConfig(), "language")
            setKey = extractRequiredText(root, setExtractionConfig.getRegionKeyConfig(), "key")
        } else {
            // Language and key used to be read with the id configuration (copy-paste slip);
            // each field now uses its own configuration and root, as extractRegionalSets does.
            setId = extractRequiredTextFromFieldRoot(root, setExtractionConfig.getIdConfig(), "id")
            setLanguage = extractRequiredTextFromFieldRoot(root, setExtractionConfig.getLanguageConfig(), "language")
            setKey = extractRequiredTextFromFieldRoot(root, setExtractionConfig.getRegionKeyConfig(), "key")
        }

        return RegionalSet(
            setId,
            setLanguage,
            setKey,
            listOf(),
            -1
        )
    }

    /**
     * Extracts all regional sets below [root].
     *
     * With a target-level root configured, every matching element is turned into one set via
     * [extractRegionalSet]; otherwise the three fields are collected as parallel lists and
     * combined with `RegionalSet.flattenFromMemberLists`.
     *
     * @throws IllegalArgumentException if [provider] is unknown
     * @throws RuntimeException if a [NoSuchElementException] occurs in the per-field branch
     */
    fun extractRegionalSets(root: Element, provider: String): Set<RegionalSet> {
        val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
        val setExtractionConfig = source.getTargets().getSetConfig().get()

        if (setExtractionConfig.getRootConfig().isPresent) {
            val regionalSetRoots: Elements = getElementsFromDocumentByExtractConfig(
                root,
                setExtractionConfig.getRootConfig().get()
            )

            return regionalSetRoots.map { extractRegionalSet(it, provider) }.toSet()
        }

        try {
            val setIds = extractAllRequiredTexts(root, setExtractionConfig.getIdConfig(), "id")
            // These two used to report "Parameter 'id' ..." on failure.
            val languages = extractAllRequiredTexts(root, setExtractionConfig.getLanguageConfig(), "language")
            val setKeys = extractAllRequiredTexts(root, setExtractionConfig.getRegionKeyConfig(), "key")

            return RegionalSet.flattenFromMemberLists(
                setIds,
                languages,
                setKeys
            )
        } catch (ex: NoSuchElementException) {
            // Replaces the former placeholder message and keeps the cause.
            throw RuntimeException("Regional sets could not be extracted for provider '$provider'", ex)
        }
    }

    /**
     * Runs the card extraction for [provider].
     *
     * Returns `null` in every non-throwing case: the extracted values are not assembled into
     * a [Card] yet.
     */
    fun extractCard(root: Document, provider: String): Card? {
        val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
        val cardSelector = source.getTargets().getCardConfig().get()
        val rootConfigurationOptional = cardSelector.getRootConfig()

        if (!rootConfigurationOptional.isPresent) {
            return null
        }

        val rootElement: Element = getElementFromDocumentByExtractConfig(
            root,
            rootConfigurationOptional.get()
        ) ?: throw ElementNotFoundException("TODO make this better")

        // NOTE(review): card type and description are read with the english-name configuration,
        // which looks like a copy-paste placeholder. The matching getters are not visible here,
        // so the calls are kept as they were.
        val englishCardName: String = extractRequiredText(rootElement, cardSelector.getEnglishNameConfig(), "name")
        val cardType: String = extractRequiredText(rootElement, cardSelector.getEnglishNameConfig(), "name")
        val description: String = extractRequiredText(rootElement, cardSelector.getEnglishNameConfig(), "name")

        return null
    }

    /** Extracts a field relative to [root] or fails with "Parameter '<name>' could not be found". */
    private fun extractRequiredText(
        root: Element,
        fieldConfig: ScrapeTargetFieldConfig,
        parameterName: String
    ): String {
        return extractTextFromElementByTargetFieldConfig(root, fieldConfig)
            ?: throw IllegalStateException("Parameter '$parameterName' could not be found")
    }

    /** Resolves the field's own root below [parent] and extracts the field from it. */
    private fun extractRequiredTextFromFieldRoot(
        parent: Element,
        fieldConfig: ScrapeTargetFieldConfig,
        parameterName: String
    ): String {
        val fieldRoot = getElementFromDocumentByExtractConfig(parent, fieldConfig.getRootConfig().get())
            ?: throw ElementNotFoundException("TODO fix this")
        return extractRequiredText(fieldRoot, fieldConfig, parameterName)
    }

    /** Extracts the field once for every element matching the field's own root below [parent]. */
    private fun extractAllRequiredTexts(
        parent: Element,
        fieldConfig: ScrapeTargetFieldConfig,
        parameterName: String
    ): List<String> {
        return getElementsFromDocumentByExtractConfig(parent, fieldConfig.getRootConfig().get())
            .map { extractRequiredText(it, fieldConfig, parameterName) }
    }

    /** Selects all elements matching [step], as CSS for [Selector.CSS] and as XPath otherwise. */
    private fun getElementsFromDocumentByExtractConfig(
        document: Element,
        step: ExtractConfig
    ): Elements {
        return if (step.selectorType() == Selector.CSS) {
            document.select(step.getQueryString())
        } else {
            document.selectXpath(step.getQueryString())
        }
    }

    /**
     * Selects the first element matching [step].
     *
     * @throws ElementNotFoundException if nothing matches (so `null` is never actually returned)
     */
    private fun getElementFromDocumentByExtractConfig(
        document: Element,
        step: ExtractConfig,
    ): Element? {
        return if (step.selectorType() == Selector.CSS) {
            document.select(step.getQueryString()).firstOrNull() ?: throw ElementNotFoundException("")
        } else {
            document.selectXpath(step.getQueryString()).firstOrNull() ?: throw ElementNotFoundException("")
        }
    }

    /**
     * Walks the configured extraction steps starting at a clone of [root]: every step but the
     * last moves to the next element, the last step yields the text result. Configured
     * transformations are applied afterwards.
     *
     * @throws ElementNotFoundException if no result was produced (including an empty step list)
     * @throws IllegalStateException if an intermediate step yields no element
     */
    private fun extractTextFromElementByTargetFieldConfig(
        root: Element,
        extractionConfig: ScrapeTargetFieldConfig
    ): String? {
        val extractionSteps = extractionConfig.getExtractionSteps()
        val transformationSteps = extractionConfig.getOptionalTransformationSteps()

        var currentElement: Element? = root.clone()
        var result: String? = null

        for (index in 0 until extractionSteps.size) {
            val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
            if (currentElement == null) {
                throw IllegalStateException()
            }

            if (index == extractionSteps.size - 1) {
                result = XPathUtil.extractResult(currentElement, currentStep.getQueryString())
            } else {
                currentElement = XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
            }
        }

        if (result == null) {
            throw ElementNotFoundException("Result could not be extracted")
        }

        if (transformationSteps.isPresent) {
            result = transformationRegistry.applyTransformations(result, transformationSteps.get())
        }

        return result
    }
}

View File

@@ -1,20 +1,30 @@
package com.rak.service package com.rak.service
import com.rak.config.model.ProviderConfig
import com.rak.model.card.Card import com.rak.model.card.Card
import com.rak.model.exception.NotImplementedException
import com.rak.model.exception.TargetNotFoundException
import com.rak.model.set.CardSet import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet import com.rak.model.set.RegionalSet
import com.rak.service.extract.RegionalSetExtractionService
import com.rak.service.extract.CardSetExtractionService
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.Jsoup import org.jsoup.Jsoup
import org.jsoup.nodes.Document import org.jsoup.nodes.Document
import java.lang.Exception
@ApplicationScoped @ApplicationScoped
class ScrapeService( class ScrapeService(
private val sourceService: SourceService, private val sourceService: SourceService,
private val extractionService: ExtractionService, private val cardSetExtractionService: CardSetExtractionService,
private val setExtractionService: SetExtractionService, private val regionalSetExtractionService: RegionalSetExtractionService,
private val regionalSetExtractionService: RegionalSetExtractionService private val commonCrawlService: CommonCrawlService
) { ) {
/**
 * Produces the URL for [targetName] by using this provider's URL pattern as a format
 * string with [targetName] as its only argument.
 */
fun ProviderConfig.buildUrl(targetName: String): String =
    getUrlPattern().format(targetName)
fun scrapeSet( fun scrapeSet(
provider: String, provider: String,
@@ -23,10 +33,37 @@ class ScrapeService(
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found") val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
val path: String = normalizePath(setName) val path: String = normalizePath(setName)
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get() val url = source.buildUrl(path)
val ccIndexResponses = commonCrawlService.queryAllCrawlIndices(url).sortedBy { it.timestamp }
// return extractionService.extractSet(setName, document, provider) var document: Document? = null
return setExtractionService.extract(document, source, source.getTargets().getSetConfig().get())
for (indexResponse in ccIndexResponses) {
document = commonCrawlService.getDocument(
indexResponse,
source.getDomain()
)
if (document != null) {
break
}
}
if (document == null) {
// Fallback to Jsoup directly
try {
document = Jsoup.connect(url).get()
} catch(ex: Exception) {
Log.warn("Error occurred during Jsoup query", ex)
throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'")
}
}
return cardSetExtractionService.extract(
document,
source,
source.getTargets().getSetConfig().get()
)
} }
fun scrapeRegionalSet( fun scrapeRegionalSet(
@@ -38,19 +75,15 @@ class ScrapeService(
val path: String = normalizePath(setName) val path: String = normalizePath(setName)
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get() val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
return regionalSetExtractionService.extract(document, source, source.getTargets().getSetConfig().get()) return regionalSetExtractionService.extract(document, source, source.getTargets().getRegionalSetConfig().get())
} }
fun scrapeCard( fun scrapeCard(
provider: String, provider: String,
cardName: String, cardName: String,
): Card? { ): Card? {
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found") throw NotImplementedException("Not implemented")
val path: String = normalizePath(cardName)
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
return extractionService.extractCard(document, provider)
} }
private fun normalizePath(path: String): String = path private fun normalizePath(path: String): String = path

View File

@@ -1,7 +1,7 @@
package com.rak.service package com.rak.service
import com.rak.config.model.CardScrapeTargetConfig import com.rak.config.model.CardScrapeTargetConfig
import com.rak.config.model.SetScrapeTargetConfig import com.rak.config.model.RegionalSetScrapeTargetConfig
import com.rak.config.model.ProviderConfig import com.rak.config.model.ProviderConfig
import com.rak.config.model.SourcesConfig import com.rak.config.model.SourcesConfig
import com.rak.model.exception.InvalidConfigurationException import com.rak.model.exception.InvalidConfigurationException
@@ -21,7 +21,7 @@ class SourceService(
} }
private fun validateSource(providerConfig: ProviderConfig) { private fun validateSource(providerConfig: ProviderConfig) {
val optionalRegionalSetConfig = providerConfig.getTargets().getSetConfig() val optionalRegionalSetConfig = providerConfig.getTargets().getRegionalSetConfig()
val optionalCardConfig = providerConfig.getTargets().getCardConfig() val optionalCardConfig = providerConfig.getTargets().getCardConfig()
if (optionalRegionalSetConfig.isPresent) { if (optionalRegionalSetConfig.isPresent) {
@@ -33,7 +33,7 @@ class SourceService(
} }
} }
private fun validateSetExtractConfig(setExtractConfig: SetScrapeTargetConfig) { private fun validateSetExtractConfig(setExtractConfig: RegionalSetScrapeTargetConfig) {
val selectors = listOf( val selectors = listOf(
setExtractConfig.getLanguageConfig(), setExtractConfig.getLanguageConfig(),
setExtractConfig.getIdConfig(), setExtractConfig.getIdConfig(),

View File

@@ -0,0 +1,57 @@
package com.rak.service.client
import com.rak.util.NDJsonReader
import com.rak.model.cc.CCIndexSuccessResponse
import io.netty.buffer.ByteBufInputStream
import io.quarkus.rest.client.reactive.ClientQueryParam
import io.quarkus.rest.client.reactive.NotBody
import io.quarkus.rest.client.reactive.Url
import io.smallrye.faulttolerance.api.RateLimit
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.PathParam
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.faulttolerance.Bulkhead
import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam
import org.eclipse.microprofile.rest.client.annotation.RegisterProvider
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
import java.time.temporal.ChronoUnit
/**
 * REST client used to query a Common Crawl index and to download (parts of) WARC archives.
 *
 * Responses of [queryIndex] are decoded by the registered [NDJsonReader] provider.
 *
 * NOTE(review): `baseUri = "whatever"` looks like a placeholder; every method takes an `@Url`
 * parameter that presumably supplies the real base URL per call - confirm.
 */
@RegisterRestClient(baseUri = "whatever")
@RegisterProvider(NDJsonReader::class)
interface CommonCrawlRestClient {
/**
 * Issues `GET {baseUrl}/{indexName}-index?output=json&url={queryUrl}`.
 *
 * NOTE(review): on a client interface `@Consumes` usually describes the request body while
 * `@Produces` drives the `Accept` header - confirm that `@Consumes` is what is intended here.
 * NOTE(review): the units of `value`/`minSpacing` are not set explicitly, so the annotation
 * defaults apply - confirm they match the intended request pacing.
 *
 * @param baseUrl base URL of the index server
 * @param queryUrl value sent as the `url` query parameter
 * @param indexName index name substituted into the `/{index}-index` path
 * @return the decoded index response
 */
@GET
@ClientQueryParam(name = "output", value = ["json"])
@Path("/{index}-index")
@Consumes("text/x-ndjson")
@RateLimit(
value = 1,
minSpacing = 5
)
@Bulkhead
fun queryIndex(
@Url
baseUrl: String,
@QueryParam("url")
queryUrl: String,
@PathParam("index")
indexName: String
): CCIndexSuccessResponse
/**
 * Issues `GET {baseUrl}/{fileName}` with a `Range` header computed by
 * `com.rak.util.HttpUtil.computeHeader`.
 *
 * `fileLength` and `fileOffset` are marked `@NotBody`; they only feed the header computation.
 * `HttpUtil.computeHeader` reads them by position (method parameters 2 and 3), so the
 * parameter order of this method must not be changed without adjusting that helper.
 *
 * @param baseUrl base URL of the archive server
 * @param fileName archive path substituted into `/{fileName}`
 * @param fileLength number of bytes to request
 * @param fileOffset offset of the first requested byte
 * @return a stream over the response body
 */
@GET
@Path("/{fileName}")
@ClientHeaderParam(name = "Range", value = ["{com.rak.util.HttpUtil.computeHeader}"])
fun getWarcArchive(
@Url
baseUrl: String,
@PathParam("fileName")
fileName: String,
@NotBody
fileLength: Int,
@NotBody
fileOffset: Int
): ByteBufInputStream
}

View File

@@ -1,19 +1,17 @@
package com.rak.service package com.rak.service.extract
import com.rak.config.model.AbstractScrapeTargetConfig import com.rak.config.model.*
import com.rak.config.model.ExtractConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.TransformationStepConfig
import com.rak.model.Selector import com.rak.model.Selector
import com.rak.model.exception.ElementNotFoundException import com.rak.model.exception.ElementNotFoundException
import com.rak.model.exception.InvalidConfigurationException import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.exception.ValueValidationException
import com.rak.model.transform.TransformationRegistry import com.rak.model.transform.TransformationRegistry
import com.rak.util.CssUtil import com.rak.util.CssUtil
import com.rak.util.XPathUtil import com.rak.util.XPathUtil
import io.quarkus.logging.Log
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.select.Elements import org.jsoup.select.Elements
import java.util.Optional import java.util.*
import kotlin.jvm.optionals.getOrElse import kotlin.jvm.optionals.getOrElse
// find root element from global or node config // find root element from global or node config
@@ -132,7 +130,11 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
val extractedText = extractTextFromElementByTargetFieldConfig( val extractedText = extractTextFromElementByTargetFieldConfig(
rootElement, rootElement,
fieldConfig fieldConfig
) ?: throw ElementNotFoundException("Could not find element for '$identifier'") ) ?: if (fieldConfig.isNullable()) {
""
} else {
throw ElementNotFoundException("Could not find element for '$identifier'")
}
val mapToModify: MutableMap<String, String> = try { val mapToModify: MutableMap<String, String> = try {
resultList[index] resultList[index]
@@ -175,56 +177,87 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
root: Element, root: Element,
extractionConfig: ScrapeTargetFieldConfig extractionConfig: ScrapeTargetFieldConfig
): String? { ): String? {
val extractionSteps = extractionConfig.getExtractionSteps() val extractionMethods = extractionConfig.getExtractionMethods()
val transformationSteps = extractionConfig.getOptionalTransformationSteps()
var currentElement: Element? = root.clone()
var result: String? = null var result: String? = null
try {
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
if (index == extractionSteps.size - 1) { for(extractionMethod in extractionMethods) {
result = when (currentStep.selectorType()) { val extractionSteps = extractionMethod.getExtractionSteps()
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString()) val transformationSteps = extractionMethod.getOptionalTransformationSteps()
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
var currentElement: Element? = root.clone()
var intermediateResult: String? = null
try {
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
if (index == extractionSteps.size - 1) {
intermediateResult = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
}
}
else {
currentElement = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
}
} }
} }
else {
currentElement = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
}
}
}
if (result == null) { if (intermediateResult == null) {
throw ElementNotFoundException("Result could not be extracted") throw ElementNotFoundException("Result could not be extracted")
} } else {
try {
if (transformationSteps.isPresent) { validateValue(intermediateResult, extractionConfig.getOptionalValidation())
result = transformationRegistry.applyTransformations(result, transformationSteps.get()) } catch (ex: ValueValidationException) {
}
} catch (ex: RuntimeException) {
when (ex) {
is ElementNotFoundException,
is IllegalStateException -> {
if (extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
} else {
throw ex throw ex
} }
if (transformationSteps.isPresent) {
intermediateResult = transformationRegistry.applyTransformations(intermediateResult, transformationSteps.get())
}
result = intermediateResult
break
}
} catch (ex: RuntimeException) {
when (ex) {
is ElementNotFoundException,
is IllegalStateException,
is ValueValidationException -> Log.debug(ex.message)
else -> throw ex
} }
else -> throw ex
} }
} }
if (result == null && extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
}
return result return result
} }
/**
 * Checks [value] against every regular expression of the optional validation config.
 *
 * Nothing happens when no validation config is present. Each configured pattern must match
 * the whole value; all patterns are evaluated before the verdict is reached.
 *
 * @throws ValueValidationException if at least one pattern does not match [value]
 */
private fun validateValue(value: String, validationConfig: Optional<ValidationConfig>) {
    val config = validationConfig.orElse(null) ?: return

    // Count instead of short-circuiting so that every pattern is still compiled and applied.
    val mismatches = config.getRegexPatterns().count { pattern ->
        !value.matches(pattern.toRegex())
    }

    if (mismatches > 0) {
        throw ValueValidationException("'$value' does not validate against RegEx(s)")
    }
}
} }

View File

@@ -1,4 +1,4 @@
package com.rak.service package com.rak.service.extract
import com.rak.config.model.CardPrintScrapeTargetConfig import com.rak.config.model.CardPrintScrapeTargetConfig
import com.rak.config.model.ProviderConfig import com.rak.config.model.ProviderConfig

View File

@@ -1,4 +1,4 @@
package com.rak.service package com.rak.service.extract
import com.rak.config.model.ProviderConfig import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig import com.rak.config.model.ScrapeTargetFieldConfig
@@ -9,15 +9,13 @@ import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
@ApplicationScoped @ApplicationScoped
class SetExtractionService( class CardSetExtractionService(
private val regionalSetExtractionService: RegionalSetExtractionService private val regionalSetExtractionService: RegionalSetExtractionService
) : AbstractExtractionService<CardSet, SetScrapeTargetConfig>() { ) : AbstractExtractionService<CardSet, SetScrapeTargetConfig>() {
override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> { override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> {
return mapOf( return mapOf(
Pair("prefix", this.getIdConfig()), Pair("name", this.getNameConfig()),
Pair("regionCode", this.getRegionKeyConfig()),
Pair("region", this.getLanguageConfig()),
) )
} }
@@ -26,9 +24,15 @@ class SetExtractionService(
providerConfig: ProviderConfig, providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig extractionConfig: SetScrapeTargetConfig
): CardSet { ): CardSet {
return CardSet( val set = extractSingle(element, extractionConfig)
"test",
regionalSetExtractionService.extractMultiple(element, providerConfig, extractionConfig).toSet() return CardSet.fromMap(
set,
regionalSetExtractionService.extractMultiple(
element,
providerConfig,
providerConfig.getTargets().getRegionalSetConfig().get()
).toSet()
) )
} }

View File

@@ -1,8 +1,8 @@
package com.rak.service package com.rak.service.extract
import com.rak.config.model.ProviderConfig import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.SetScrapeTargetConfig import com.rak.config.model.RegionalSetScrapeTargetConfig
import com.rak.config.model.SourcesConfig import com.rak.config.model.SourcesConfig
import com.rak.model.card.CardPrint import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException import com.rak.model.exception.NotImplementedException
@@ -14,9 +14,9 @@ import org.jsoup.nodes.Element
class RegionalSetExtractionService( class RegionalSetExtractionService(
private val cardPrintExtractionService: CardPrintExtractionService, private val cardPrintExtractionService: CardPrintExtractionService,
private val sourcesConfig: SourcesConfig private val sourcesConfig: SourcesConfig
) : AbstractExtractionService<RegionalSet, SetScrapeTargetConfig>() { ) : AbstractExtractionService<RegionalSet, RegionalSetScrapeTargetConfig>() {
override fun SetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> { override fun RegionalSetScrapeTargetConfig.getItems(): Map<String, ScrapeTargetFieldConfig> {
return mapOf( return mapOf(
Pair("prefix", this.getIdConfig()), Pair("prefix", this.getIdConfig()),
Pair("regionCode", this.getRegionKeyConfig()), Pair("regionCode", this.getRegionKeyConfig()),
@@ -27,7 +27,7 @@ class RegionalSetExtractionService(
override fun extract( override fun extract(
element: Element, element: Element,
providerConfig: ProviderConfig, providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig extractionConfig: RegionalSetScrapeTargetConfig
): RegionalSet { ): RegionalSet {
throw NotImplementedException("Not implemented") throw NotImplementedException("Not implemented")
} }
@@ -35,7 +35,7 @@ class RegionalSetExtractionService(
override fun extractMultiple( override fun extractMultiple(
element: Element, element: Element,
providerConfig: ProviderConfig, providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig extractionConfig: RegionalSetScrapeTargetConfig
): List<RegionalSet> { ): List<RegionalSet> {
val regionalSetList = extractMulti(element, extractionConfig) val regionalSetList = extractMulti(element, extractionConfig)
@@ -55,7 +55,7 @@ class RegionalSetExtractionService(
override fun extractNestedMultiples( override fun extractNestedMultiples(
element: Element, element: Element,
providerConfig: ProviderConfig, providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig extractionConfig: RegionalSetScrapeTargetConfig
): List<List<RegionalSet>> { ): List<List<RegionalSet>> {
throw NotImplementedException("Not implemented") throw NotImplementedException("Not implemented")
} }

View File

@@ -0,0 +1,23 @@
package com.rak.util
import io.quarkus.rest.client.reactive.ComputedParamContext
/**
 * HTTP helpers for the REST clients.
 */
class HttpUtil {

    companion object {

        /**
         * Computes the value of an HTTP `Range` header (`bytes=<first>-<last>`) for a REST
         * client call.
         *
         * The method parameters of the invoked client method are read by position:
         * index 2 is the number of bytes to fetch, index 3 the offset of the first byte.
         * The last byte position is inclusive, hence `offset + length - 1`.
         *
         * The values are parsed as `Long` so the sum cannot overflow, and the header is built
         * with a string template instead of a locale-sensitive `%d` format, so the digits are
         * always plain ASCII regardless of the JVM default locale.
         *
         * @param context invocation context supplied by the REST client
         * @return the header value, e.g. `bytes=100-149` for offset 100 and length 50
         * @throws IndexOutOfBoundsException if the client method has fewer than four parameters
         * @throws NumberFormatException if a parameter value is not an integral number
         */
        @JvmStatic
        fun computeHeader(context: ComputedParamContext): String {
            val parameters = context.methodParameters()
            val fileLength = parameters[2].value().toString().toLong()
            val fileOffset = parameters[3].value().toString().toLong()
            val lastByte = fileOffset + fileLength - 1

            return "bytes=$fileOffset-$lastByte"
        }
    }
}

View File

@@ -0,0 +1,44 @@
package com.rak.util
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.rak.model.cc.CCIndexSuccessResponse
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.core.MediaType
import jakarta.ws.rs.core.MultivaluedMap
import jakarta.ws.rs.ext.MessageBodyReader
import jakarta.ws.rs.ext.Provider
import java.io.BufferedReader
import java.io.InputStream
import java.io.InputStreamReader
import java.lang.reflect.Type
/**
 * Message body reader that decodes the first record of an NDJSON (`text/x-ndjson`) response
 * into a [CCIndexSuccessResponse].
 *
 * Only the first non-blank line of the body is parsed; any further records are ignored.
 */
@Provider
@Consumes("text/x-ndjson") // Handles NDJSON content
class NDJsonReader : MessageBodyReader<CCIndexSuccessResponse> {

    private val objectMapper = jacksonObjectMapper().registerModule(JavaTimeModule())

    /** Accepts exactly [CCIndexSuccessResponse] as target type; the media type is not inspected. */
    override fun isReadable(
        type: Class<*>?,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?
    ): Boolean = type == CCIndexSuccessResponse::class.java

    /**
     * Reads the entity stream as UTF-8 text and deserializes its first non-blank line.
     *
     * The stream is closed once the record has been read.
     *
     * @throws IllegalArgumentException if the body contains no record at all
     */
    override fun readFrom(
        type: Class<CCIndexSuccessResponse>,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?,
        httpHeaders: MultivaluedMap<String, String>?,
        entityStream: InputStream
    ): CCIndexSuccessResponse =
        // NDJSON is defined as UTF-8; do not depend on the platform default charset.
        entityStream.bufferedReader(Charsets.UTF_8).use { reader ->
            val firstRecord = requireNotNull(reader.lineSequence().firstOrNull { it.isNotBlank() }) {
                "NDJSON response body contains no records"
            }
            objectMapper.readValue(firstRecord, CCIndexSuccessResponse::class.java)
        }
}

View File

@@ -4,6 +4,7 @@ import com.rak.model.XPathTarget
import org.jsoup.nodes.Element import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements import org.jsoup.select.Elements
import java.util.regex.Pattern
import kotlin.coroutines.CoroutineContext import kotlin.coroutines.CoroutineContext
class XPathUtil private constructor() { class XPathUtil private constructor() {
@@ -40,8 +41,8 @@ class XPathUtil private constructor() {
private fun extractTextFromNode(root: Element, xpath: String): String? { private fun extractTextFromNode(root: Element, xpath: String): String? {
return root return root
.selectXpath(xpath, TextNode::class.java) .selectXpath(xpath.replace("/text()", ""))
.firstOrNull()?.text() .text()
} }
fun getNextElement(element: Element, path: String): Element? { fun getNextElement(element: Element, path: String): Element? {

View File

@@ -1,133 +1,210 @@
quarkus: quarkus:
container-image:
registry: gitea.smoothbrain.win
group: rak
build: true
additional-tags: latest
http: http:
port: 8081 port: 8081
live-reload:
instrumentation: true
scraper: scraper:
sources: sources:
- id: konami-official # - id: konami-official
name: "Konami Official Database" # name: "Konami Official Database"
domain: "yugioh-card.com" # domain: "yugioh-card.com"
url-patterns: # url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$" # targets:
targets: # card:
card: # root:
root: # type: css
type: css # value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" # name:
name: # steps:
steps: # - type: "css"
- type: "css" # value: "h1.product-title"
value: "h1.product-title" # - type: "xpath"
- type: "xpath" # value: "//h1[@itemprop='name']"
value: "//h1[@itemprop='name']" # attack:
attack: # steps:
steps: # - type: "css"
- type: "css" # value: ".atk-value"
value: ".atk-value"
- id: ygo-fandom - id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki" name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com" domain: "yugioh.fandom.com"
url-patterns: url-pattern: "https://yugioh.fandom.com/wiki/%s"
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
targets: targets:
set: set:
root:
type: css
value: "aside > .pi-title"
name:
type: string
extractors:
- steps:
- type: xpath
value: "//h2/text()"
regional-set:
root: root:
type: css type: css
value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" value: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id: id:
steps: type: int
- type: xpath extractors:
value: "//li/text()" - steps:
transform: - type: xpath
- name: "replace" value: "//li/text()"
parameters: [ transform:
" (", - name: "regexReplace"
"" parameters: [
] " *\\(.+\\)",
""
]
language: language:
steps: type: int
- type: xpath extractors:
value: "//li/abbr" - steps:
- type: xpath - type: xpath
value: "//abbr/@title" value: "//li/abbr"
- type: xpath
value: "//abbr/@title"
region-key: region-key:
steps: type: int
- type: xpath extractors:
value: "//li/abbr/text()" - steps:
- type: xpath
value: "//li/abbr/text()"
card-print: card-print:
multi: true multi: true
discriminator:
root:
type: css
value: ".wds-tab__content"
root: root:
type: css type: css
value: "table > tbody > tr:has(> td)" value: "table > tbody > tr:has(> td)"
discriminator:
type: string
root:
type: css
value: ".wds-tab__content"
id: id:
steps: type: int
- type: xpath extractors:
value: "./td/a[0]" - steps:
- type: xpath - type: xpath
value: "./text()" value: "./td/a[0]"
- type: xpath
value: "./text()"
transform:
- name: "regexReplace"
parameters: [
".+-[A-Za-z]*0?",
""
]
- steps:
- type: xpath
value: "./td/span/text()"
transform:
- name: "regexReplace"
parameters: [
" .+",
""
]
- name: "regexReplace"
parameters: [
".+-[A-Za-z]*0?",
""
]
validation:
pattern: "^.+-.+\\\\d.+$"
name: name:
steps: type: int
- type: xpath extractors:
value: "./td/a[1]" - steps:
- type: xpath - type: xpath
value: "./text()" value: "./td[1]"
- type: xpath
value: "./text()"
transform:
- name: "regexReplace"
parameters: [
" ?\\(.+\\)",
""
]
- name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\".*"
regional-name: regional-name:
fallback: type: int
default: "N/A" nullable: true
steps: extractors:
- type: xpath - steps:
value: "./td[2]" - type: xpath
- type: xpath value: "./td[2]"
value: "./text()" - type: xpath
transform: value: "./text()"
- name: "removeInnerQuotes" transform:
parameters: [] - name: "removeInnerQuotes"
parameters: []
validation:
pattern: "^\".+\"$"
rarity: rarity:
fallback: fallback:
default: "N/A" default: "N/A"
steps: type: int
- type: xpath extractors:
value: "./td/a[3]" - steps:
- type: xpath - type: xpath
value: "./text()" value: "./td/a[3]"
card: - type: xpath
name: value: "./text()"
root: - steps:
type: css - type: xpath
value: ".cardTable" value: "./td/a[2]"
steps: - type: xpath
- type: "xpath" value: "./text()"
value: "./tbody/tr[3]/th/text()" - steps:
description: - type: xpath
root: value: "./td/a[1]"
type: css - type: xpath
value: ".cardTable" value: "./text()"
steps: validation:
- type: "xpath" pattern: "^.*(Common|Rare|Print).*$"
value: "b:contains(Card descriptions)" # card:
type: # name:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "b:contains(Card descriptions)" # value: "./tbody/tr[3]/th/text()"
attack: # description:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "b:contains(Card descriptions)" # value: "b:contains(Card descriptions)"
defense: # type:
root: # root:
type: css # type: css
value: ".cardTable" # value: ".cardTable"
steps: # steps:
- type: "xpath" # - type: "xpath"
value: "b:contains(Card descriptions)" # value: "b:contains(Card descriptions)"
# attack:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"
# defense:
# root:
# type: css
# value: ".cardTable"
# steps:
# - type: "xpath"
# value: "b:contains(Card descriptions)"

View File

@@ -1,20 +0,0 @@
package com.rak
import io.quarkus.test.junit.QuarkusTest
import io.restassured.RestAssured.given
import org.hamcrest.CoreMatchers.`is`
import org.junit.jupiter.api.Test
/**
 * Smoke test for the example `/hello` endpoint.
 */
@QuarkusTest
class ExampleResourceTest {

    /** Requests `/hello` and expects HTTP 200 with the default Quarkus greeting as body. */
    @Test
    fun testHelloEndpoint() {
        // Request phase
        val response = given().`when`().get("/hello")

        // Verification phase
        response.then()
            .statusCode(200)
            .body(`is`("Hello from Quarkus REST"))
    }
}