Compare commits

7 Commits

Author SHA1 Message Date
108b4c4c19 Basic exception mapping 2025-06-26 17:17:10 +02:00
8f934bc2b9 Basic CommonCrawl integration 2025-06-26 17:05:50 +02:00
a6ed98c36e Remove old config file 2025-06-26 13:04:14 +02:00
052bdd6a52 Refactor packages
Remove ExtractionService
2025-06-26 12:48:19 +02:00
edc604231f Change project name 2025-06-26 12:46:01 +02:00
2289489fe1 Amend transformation engine 2025-06-26 12:40:51 +02:00
rak
e97f9bdd61 Implement XPath index access 2025-06-25 23:11:05 +02:00
33 changed files with 519 additions and 502 deletions

View File

@@ -23,8 +23,11 @@ dependencies {
implementation("io.quarkus:quarkus-rest-jackson")
implementation("io.quarkus:quarkus-kotlin")
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
implementation("org.jsoup:jsoup:1.20.1")
implementation("io.quarkus:quarkus-arc")
implementation("org.jsoup:jsoup:1.20.1")
implementation("org.netpreserve.commons:webarchive-commons:2.0.1")
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.19.0")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.19.1")
testImplementation("io.quarkus:quarkus-junit5")
testImplementation("io.rest-assured:rest-assured")
}

View File

@@ -10,4 +10,4 @@ pluginManagement {
id(quarkusPluginId) version quarkusPluginVersion
}
}
rootProject.name = "jsoup-scraper"
rootProject.name = "dex-scraper"

View File

@@ -1,47 +0,0 @@
scraper:
sources:
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
url-patterns:
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
selectors:
card:
name:
steps:
- type: "css"
value: "h1.product-title"
- type: "xpath"
value: "//h1[@itemprop='name']"
attack:
steps:
- type: "css"
value: ".atk-value"
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com"
url-patterns:
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
selectors:
regional-set:
root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id:
steps:
- type: "xpath"
value: "//li/text()"
transform:
- name: "replace"
parameters:
- " ("
- ""
language:
steps:
- type: "xpath"
value: "//li/abbr"
- type: "xpath"
value: "//abbr/@title"
region-key:
steps:
- type: "xpath"
value: "//li/abbr/text()"

View File

@@ -1,11 +0,0 @@
package com.rak.config.converter
import jakarta.annotation.Priority
import org.eclipse.microprofile.config.spi.Converter
@Priority(1)
class EmptyStringConverter : Converter<String> {
override fun convert(value: String): String {
return value
}
}

View File

@@ -6,7 +6,4 @@ import io.smallrye.config.WithConverter
import io.smallrye.config.WithName
interface DiscriminatorConfig : ScrapeTargetFieldConfig {
@WithName("direction")
@WithConverter(DiscriminatorDirectionConverter::class)
fun getDiscriminatorDirection(): DiscriminatorDirection
}

View File

@@ -0,0 +1,15 @@
package com.rak.config.model
import io.smallrye.config.WithDefault
import io.smallrye.config.WithName
import java.util.Optional
interface FieldConfigFallback {
@WithName("steps")
fun getOptionalFallbackExtractionSteps(): Optional<List<ExtractConfig>>
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
@WithName("default")
@WithDefault("N/A")
fun getOptionalDefaultValue(): String
}

View File

@@ -11,8 +11,8 @@ interface ProviderConfig {
fun getName(): String
@WithName("domain")
fun getDomain(): String
@WithName("url-patterns")
fun getUrlPatterns(): Optional<MutableSet<String>>
@WithName("url-pattern")
fun getUrlPattern(): String
@WithName("targets")
fun getTargets(): TargetsConfig

View File

@@ -10,4 +10,6 @@ interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
fun getExtractionSteps(): List<ExtractConfig>
@WithName("transform")
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
@WithName("fallback")
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
}

View File

@@ -1,10 +1,8 @@
package com.rak.config.model
import com.rak.config.converter.EmptyStringConverter
import io.smallrye.config.WithConverter
import java.util.Optional
interface TransformationStepConfig {
fun name(): String
@WithConverter(EmptyStringConverter::class)
fun parameters(): MutableList<String>
fun parameters(): Optional<MutableList<String>>
}

View File

@@ -1,8 +1,10 @@
package com.rak.controller
import com.rak.model.card.Card
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import com.rak.service.CommonCrawlService
import com.rak.service.ScrapeService
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET
@@ -16,10 +18,9 @@ import org.jboss.resteasy.reactive.RestQuery
@Path("/api")
class ScrapeController(
private val scrapeService: ScrapeService,
private val commonCrawlService: CommonCrawlService
) {
@GET
@Path("/{provider}/set")
@Produces(MediaType.APPLICATION_JSON)

View File

@@ -0,0 +1,5 @@
package com.rak.model
data class ErrorResponse(
val message: String
)

View File

@@ -0,0 +1,5 @@
package com.rak.model.cc
data class CCIndexErrorResponse(
val message: String
)

View File

@@ -0,0 +1,22 @@
package com.rak.model.cc
import com.fasterxml.jackson.annotation.JsonProperty
import java.time.Instant
data class CCIndexSuccessResponse(
@JsonProperty("urlkey")
val urlKey: String,
val timestamp: Instant,
val url: String,
val mime: String,
@JsonProperty("mime-detected")
val mimeDetected: String,
val status: String,
val digest: String,
val length: Int,
val offset: Int,
@JsonProperty("filename")
val fileName: String,
val languages: String,
val encoding: String,
)

View File

@@ -0,0 +1,9 @@
package com.rak.model.cc
enum class CCIndices(val indexName: String) {
CC_2025_21("CC-MAIN-2025-21"),
CC_2025_05("CC-MAIN-2024-05"),
CC_2024_46("CC-MAIN-2024-46"),
CC_2024_26("CC-MAIN-2024-26"),
CC_2023_50("CC-MAIN-2023-50");
}

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
class TargetNotFoundException(message: String) : RuntimeException(message)

View File

@@ -0,0 +1,18 @@
package com.rak.model.exception.mapper
import com.rak.model.ErrorResponse
import com.rak.model.exception.NotImplementedException
import jakarta.ws.rs.core.Response
import jakarta.ws.rs.ext.ExceptionMapper
import jakarta.ws.rs.ext.Provider
@Provider
class NotImplementedExceptionMapper : ExceptionMapper<NotImplementedException> {
override fun toResponse(exception: NotImplementedException): Response {
return Response.status(405).entity(
ErrorResponse(
exception.message ?: "Provider does not implement this method"
)
).build()
}
}

View File

@@ -0,0 +1,19 @@
package com.rak.model.exception.mapper
import com.rak.model.ErrorResponse
import com.rak.model.exception.NotImplementedException
import com.rak.model.exception.TargetNotFoundException
import jakarta.ws.rs.core.Response
import jakarta.ws.rs.ext.ExceptionMapper
import jakarta.ws.rs.ext.Provider
@Provider
class TargetNotFoundExceptionMapper : ExceptionMapper<TargetNotFoundException> {
override fun toResponse(exception: TargetNotFoundException): Response {
return Response.status(404).entity(
ErrorResponse(
exception.message ?: "Scrape target could not be found"
)
).build()
}
}

View File

@@ -3,7 +3,7 @@ package com.rak.model.set
import kotlin.collections.Set
data class CardSet(
val name: String,
var name: String,
val regionalSets: Set<RegionalSet>
) {
companion object {

View File

@@ -2,5 +2,5 @@ package com.rak.model.transform
@FunctionalInterface
fun interface ParameterizedTransformation : AbstractTransformation {
fun apply(input: String, parameters: List<String>): String
fun apply(input: String, parameters: MutableList<String>): String
}

View File

@@ -11,9 +11,13 @@ class TransformationRegistry {
init {
register("trim") { it.trim() }
register("removeInnerQuotes") { it.replace("\"", "") }
register("replace") { input, parameters ->
require(parameters.size == 2) {
"'replace' requires exactly 2 parameters"
require(parameters.size == 1 || parameters.size == 2) {
"'replace' requires either 1 or 2 parameters"
}
if (parameters.size == 1) {
parameters.add("")
}
input.replace(parameters[0], parameters[1])
}
@@ -39,14 +43,14 @@ class TransformationRegistry {
val parameters = transformationStep.parameters()
return when {
transformations.containsKey(name) -> {
if (parameters.isNotEmpty()) {
if (parameters.isPresent && parameters.get().isNotEmpty()) {
throw IllegalArgumentException("'$name' doesn't accept parameters")
} else {
transformations[name]!!
}
}
parameterizedTransformation.containsKey(name) -> {
if (parameters.isEmpty()) {
if (parameters.isPresent && parameters.get().isEmpty()) {
throw IllegalArgumentException("'$name' requires parameters")
} else {
parameterizedTransformation[name]!!
@@ -65,7 +69,7 @@ class TransformationRegistry {
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
is ParameterizedTransformation ->
parameterizedTransformation[step.name()]?.apply(current, step.parameters())
parameterizedTransformation[step.name()]?.apply(current, step.parameters().get())
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
else -> throw IllegalStateException("Invalid transformation type")

View File

@@ -0,0 +1,92 @@
package com.rak.service
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.cc.CCIndices
import com.rak.service.client.CommonCrawlRestClient
import io.netty.buffer.ByteBufInputStream
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped
import org.archive.format.http.HttpResponseParser
import org.archive.io.warc.WARCReaderFactory
import org.eclipse.microprofile.rest.client.inject.RestClient
import org.jsoup.helper.DataUtil
import org.jsoup.nodes.Document
@ApplicationScoped
class CommonCrawlService(
@RestClient
private val commonCrawlRestClient: CommonCrawlRestClient
) {
companion object {
private const val INDEX_QUERY_URL: String = "http://index.commoncrawl.org"
private const val DATA_URL: String = "http://data.commoncrawl.org"
}
fun queryIndex(
url: String
): CCIndexSuccessResponse {
return commonCrawlRestClient.queryIndex(
INDEX_QUERY_URL,
url,
CCIndices.CC_2024_46.indexName
)
}
fun queryAllCrawlIndices(
url: String
): List<CCIndexSuccessResponse> {
val responses = mutableListOf<CCIndexSuccessResponse>()
for (crawlName in CCIndices.entries) {
try {
responses.add(commonCrawlRestClient.queryIndex(
INDEX_QUERY_URL,
url,
crawlName.indexName
))
} catch (ex: RuntimeException) {
Log.warn("Error occurred querying crawl '${crawlName.indexName}' for URL $url")
}
}
return responses
}
fun getDocument(
ccIndexSuccessResponse: CCIndexSuccessResponse,
baseUri: String
): Document? {
val fileName = "CC-MAIN-20241106230027-20241107020027-00740.warc.gz"
val buf: ByteBufInputStream = commonCrawlRestClient.getWarcArchive(
DATA_URL,
ccIndexSuccessResponse.fileName,
ccIndexSuccessResponse.length,
ccIndexSuccessResponse.offset
)
val test = WARCReaderFactory.get(
fileName,
buf,
true
)
val parser = HttpResponseParser()
for(record in test) {
val http = parser.parse(record.buffered())
val charSet = http.headers.get("charset")
val doc = DataUtil.load(
http.buffered(),
"UTF-8",
baseUri
)
return doc
}
return null
}
}

View File

@@ -1,273 +0,0 @@
package com.rak.service
import com.rak.config.model.CardPrintScrapeTargetConfig
import com.rak.config.model.ExtractConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.model.Selector
import com.rak.model.card.Card
import com.rak.model.card.CardPrint
import com.rak.model.exception.ElementNotFoundException
import com.rak.model.exception.InvalidConfigurationException
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import com.rak.model.transform.TransformationRegistry
import com.rak.util.XPathUtil
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
import java.util.Optional
@ApplicationScoped
class ExtractionService(
private val sourceService: SourceService,
) {
private val transformationRegistry = TransformationRegistry()
fun extractSet(setName: String, root: Element, provider: String): CardSet {
return CardSet(
name = setName,
regionalSets = extractRegionalSets(root, provider)
)
}
fun getRootElement(
document: Document,
globalRootExtractConfig: Optional<ExtractConfig>,
nodeRootExtractConfig: Optional<ExtractConfig>
): Element {
val rootExtractConfig: ExtractConfig = globalRootExtractConfig.orElse(
nodeRootExtractConfig.orElseThrow {
InvalidConfigurationException("")
})
return getElementFromDocumentByExtractConfig(document, rootExtractConfig) ?: throw ElementNotFoundException("No root could be found")
}
fun extractCardPrint(document: Document, cardPrintConfig: CardPrintScrapeTargetConfig): CardPrint? {
val cardName = extractTextFromElementByTargetFieldConfig(
getRootElement(
document,
cardPrintConfig.getRootConfig(),
cardPrintConfig.getNameConfig().getRootConfig()
),
cardPrintConfig.getNameConfig()
)
return null
}
fun extractRegionalSet(root: Element, provider: String): RegionalSet {
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
val setExtractionConfig = source.getTargets().getSetConfig().get()
if (setExtractionConfig.getRootConfig().isPresent) {
val setId: String = extractTextFromElementByTargetFieldConfig(
root,
setExtractionConfig.getIdConfig(),
) ?: throw IllegalStateException("Parameter 'id' could not be found")
val setLanguage: String = extractTextFromElementByTargetFieldConfig(
root,
setExtractionConfig.getLanguageConfig()
) ?: throw IllegalStateException("Parameter 'language' could not be found")
val setKey: String = extractTextFromElementByTargetFieldConfig(
root,
setExtractionConfig.getRegionKeyConfig()
) ?: throw IllegalStateException("Parameter 'key' could not be found")
return RegionalSet(
setId,
setLanguage,
setKey,
listOf(),
-1
)
} else {
val setIdConfiguration = setExtractionConfig.getIdConfig()
val rootConfiguration = setIdConfiguration.getRootConfig().get()
val setIdRoot = getElementFromDocumentByExtractConfig(root, rootConfiguration) ?: throw ElementNotFoundException("TODO fix this")
val setId: String = extractTextFromElementByTargetFieldConfig(
setIdRoot,
setIdConfiguration
) ?: throw IllegalStateException("Parameter 'id' could not be found")
val setLanguageConfiguration = setExtractionConfig.getIdConfig()
val setLanguageRoot = getElementFromDocumentByExtractConfig(root, rootConfiguration) ?: throw ElementNotFoundException("TODO fix this")
val setLanguage: String = extractTextFromElementByTargetFieldConfig(
setLanguageRoot,
setLanguageConfiguration
) ?: throw IllegalStateException("Parameter 'language' could not be found")
val setKeyConfiguration = setExtractionConfig.getIdConfig()
val setKeyRoot = getElementFromDocumentByExtractConfig(root, rootConfiguration) ?: throw ElementNotFoundException("TODO fix this")
val setKey: String = extractTextFromElementByTargetFieldConfig(
setKeyRoot,
setKeyConfiguration
) ?: throw IllegalStateException("Parameter 'key' could not be found")
return RegionalSet(
setId,
setLanguage,
setKey,
listOf(),
-1
)
}
}
fun extractRegionalSets(root: Element, provider: String): Set<RegionalSet> {
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
val setExtractionConfig = source.getTargets().getSetConfig().get()
if (setExtractionConfig.getRootConfig().isPresent) {
val rootConfiguration = setExtractionConfig.getRootConfig().get()
val regionalSetRoots: Elements = getElementsFromDocumentByExtractConfig(
root,
rootConfiguration
)
return regionalSetRoots.map {
extractRegionalSet(
it,
provider
)
}.toSet()
} else {
try {
val setIdConfiguration = setExtractionConfig.getIdConfig()
val setIdRoot = getElementsFromDocumentByExtractConfig(root, setIdConfiguration.getRootConfig().get())
val setIds = setIdRoot.map {
extractTextFromElementByTargetFieldConfig(
it,
setIdConfiguration
) ?: throw IllegalStateException("Parameter 'id' could not be found")
}
val languageConfiguration = setExtractionConfig.getLanguageConfig()
val languageRoot = getElementsFromDocumentByExtractConfig(root, languageConfiguration.getRootConfig().get())
val languages = languageRoot.map {
extractTextFromElementByTargetFieldConfig(
it,
languageConfiguration
) ?: throw IllegalStateException("Parameter 'id' could not be found")
}
val setKeyConfiguration = setExtractionConfig.getRegionKeyConfig()
val setKeyRoot = getElementsFromDocumentByExtractConfig(root, setKeyConfiguration.getRootConfig().get())
val setKeys = setKeyRoot.map {
extractTextFromElementByTargetFieldConfig(
it,
setKeyConfiguration
) ?: throw IllegalStateException("Parameter 'id' could not be found")
}
return RegionalSet.flattenFromMemberLists(
setIds,
languages,
setKeys
)
} catch (ex: NoSuchElementException) {
throw RuntimeException("sdfgs") // TODO handle me
}
}
}
fun extractCard(root: Document, provider: String): Card? {
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
val cardSelector = source.getTargets().getCardConfig().get()
val rootConfigurationOptional = cardSelector.getRootConfig()
if (rootConfigurationOptional.isPresent) {
val rootConfiguration = rootConfigurationOptional.get()
val rootElement: Element = getElementFromDocumentByExtractConfig(
root,
rootConfiguration
) ?: throw ElementNotFoundException("TODO make this better")
val englishCardName: String = extractTextFromElementByTargetFieldConfig(
rootElement,
cardSelector.getEnglishNameConfig()
) ?: throw IllegalStateException("Parameter 'name' could not be found")
val cardType: String = extractTextFromElementByTargetFieldConfig(
rootElement,
cardSelector.getEnglishNameConfig()
) ?: throw IllegalStateException("Parameter 'name' could not be found")
val description: String = extractTextFromElementByTargetFieldConfig(
rootElement,
cardSelector.getEnglishNameConfig()
) ?: throw IllegalStateException("Parameter 'name' could not be found")
return null
} else {
return null
}
}
private fun getElementsFromDocumentByExtractConfig(
document: Element,
step: ExtractConfig
): Elements {
return if (step.selectorType() == Selector.CSS) {
document.select(step.getQueryString())
} else {
document.selectXpath(step.getQueryString())
}
}
private fun getElementFromDocumentByExtractConfig(
document: Element,
step: ExtractConfig,
): Element? {
return if (step.selectorType() == Selector.CSS) {
document.select(step.getQueryString()).firstOrNull() ?: throw ElementNotFoundException("")
} else {
document.selectXpath(step.getQueryString()).firstOrNull() ?: throw ElementNotFoundException("")
}
}
private fun extractTextFromElementByTargetFieldConfig(
root: Element,
extractionConfig: ScrapeTargetFieldConfig
): String? {
val extractionSteps = extractionConfig.getExtractionSteps()
val transformationSteps = extractionConfig.getOptionalTransformationSteps()
var currentElement: Element? = root.clone()
var result: String? = null
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
if (index == extractionSteps.size - 1) {
result = XPathUtil.extractResult(currentElement, currentStep.getQueryString())
}
else {
currentElement = XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
}
}
if (result == null) {
throw ElementNotFoundException("Result could not be extracted")
}
if (transformationSteps.isPresent) {
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
}
return result
}
}

View File

@@ -1,20 +1,30 @@
package com.rak.service
import com.rak.config.model.ProviderConfig
import com.rak.model.card.Card
import com.rak.model.exception.NotImplementedException
import com.rak.model.exception.TargetNotFoundException
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import com.rak.service.extract.RegionalSetExtractionService
import com.rak.service.extract.SetExtractionService
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import java.lang.Exception
@ApplicationScoped
class ScrapeService(
private val sourceService: SourceService,
private val extractionService: ExtractionService,
private val setExtractionService: SetExtractionService,
private val regionalSetExtractionService: RegionalSetExtractionService
private val regionalSetExtractionService: RegionalSetExtractionService,
private val commonCrawlService: CommonCrawlService
) {
fun ProviderConfig.buildUrl(targetName: String): String {
return this.getUrlPattern().format(targetName)
}
fun scrapeSet(
provider: String,
@@ -23,10 +33,39 @@ class ScrapeService(
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
val path: String = normalizePath(setName)
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
val url = source.buildUrl(path)
val ccIndexResponses = commonCrawlService.queryAllCrawlIndices(url).sortedBy { it.timestamp }
// return extractionService.extractSet(setName, document, provider)
return setExtractionService.extract(document, source, source.getTargets().getSetConfig().get())
var document: Document? = null
for (indexResponse in ccIndexResponses) {
document = commonCrawlService.getDocument(
indexResponse,
source.getDomain()
)
if (document != null) {
break
}
}
if (document == null) {
// Fallback to Jsoup directly
try {
document = Jsoup.connect(url).get()
} catch(ex: Exception) {
Log.warn("Error occurred during Jsoup query")
throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'")
}
}
return setExtractionService.extract(
document,
source,
source.getTargets().getSetConfig().get()
).apply {
name = setName
}
}
fun scrapeRegionalSet(
@@ -45,12 +84,8 @@ class ScrapeService(
provider: String,
cardName: String,
): Card? {
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
throw NotImplementedException("Not implemented")
val path: String = normalizePath(cardName)
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
return extractionService.extractCard(document, provider)
}
private fun normalizePath(path: String): String = path

View File

@@ -0,0 +1,49 @@
package com.rak.service.client
import com.rak.util.NDJsonReader
import com.rak.model.cc.CCIndexSuccessResponse
import io.netty.buffer.ByteBufInputStream
import io.quarkus.rest.client.reactive.ClientQueryParam
import io.quarkus.rest.client.reactive.NotBody
import io.quarkus.rest.client.reactive.Url
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.PathParam
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam
import org.eclipse.microprofile.rest.client.annotation.RegisterProvider
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
@RegisterRestClient(baseUri = "whatever")
@RegisterProvider(NDJsonReader::class)
interface CommonCrawlRestClient {
@GET
@ClientQueryParam(name = "output", value = ["json"])
@Path("/{index}-index")
@Consumes("text/x-ndjson")
fun queryIndex(
@Url
baseUrl: String,
@QueryParam("url")
queryUrl: String,
@PathParam("index")
indexName: String
): CCIndexSuccessResponse
@GET
@Path("/{fileName}")
@ClientHeaderParam(name = "Range", value = ["{com.rak.util.HttpUtil.computeHeader}"])
fun getWarcArchive(
@Url
baseUrl: String,
@PathParam("fileName")
fileName: String,
@NotBody
fileLength: Int,
@NotBody
fileOffset: Int
): ByteBufInputStream
}

View File

@@ -1,10 +1,9 @@
package com.rak.service
package com.rak.service.extract
import com.rak.config.model.AbstractScrapeTargetConfig
import com.rak.config.model.ExtractConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.model.DiscriminatorDirection
import com.rak.model.Selector
import com.rak.model.exception.ElementNotFoundException
import com.rak.model.exception.InvalidConfigurationException
@@ -35,7 +34,13 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
element: Element,
providerConfig: ProviderConfig,
extractionConfig: T
): Collection<E>
): List<E>
abstract fun extractNestedMultiples(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: T
): List<List<E>>
fun getRootElement(
element: Element,
@@ -84,7 +89,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
}
}
protected fun extractAsMap(
protected fun extractSingle(
document: Element,
extractionConfig: T
): Map<String, String> {
@@ -108,7 +113,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return result
}
fun extractAsListOfMaps(
fun extractMulti(
element: Element,
extractionConfig: T
): List<Map<String, String>> {
@@ -143,95 +148,25 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
return resultList
}
fun extractAsListOfMaps(
elements: Elements,
extractionConfig: T
): List<Map<String, String>> {
val resultList = mutableListOf<MutableMap<String, String>>()
// refactor this
extractionConfig.getItems().forEach { (identifier, fieldConfig) ->
for(index in 0..elements.size - 1) {
val rootElement = elements[index]
val extractedText = extractTextFromElementByTargetFieldConfig(
rootElement,
fieldConfig
) ?: throw ElementNotFoundException("Could not find element for '$identifier'")
val mapToModify: MutableMap<String, String> = try {
resultList[index]
} catch (_: IndexOutOfBoundsException) {
val newMap = mutableMapOf<String, String>()
resultList.add(newMap)
newMap
}
mapToModify.put(identifier, extractedText)
}
}
return resultList
}
fun extractWithDiscriminator(
fun extractMultiWithDiscriminator(
element: Element,
extractionConfig: T
): List<List<Map<String, String>>>{
val rootElement = getRootElement(
val rootElements = getRootElements(
element,
extractionConfig.getRootConfig(),
extractionConfig.getDiscriminator().get().getRootConfig(),
Optional.empty<ExtractConfig>()
)
var rootElements = getRootElements(
element,
extractionConfig.getRootConfig(),
Optional.empty<ExtractConfig>()
)
val discriminatedElements = getElementsFromElementByExtractConfig(
rootElement,
extractionConfig.getDiscriminator().get().getRootConfig().get(),
)
val discriminations = mutableListOf<String>()
val result = mutableListOf<List<Map<String, String>>>()
for (element in discriminatedElements) {
val discriminatorValue: String = extractTextFromElementByTargetFieldConfig(
for(element in rootElements) {
result.add(extractMulti(
element,
extractionConfig.getDiscriminator().get()
) ?: throw ElementNotFoundException("")
discriminations.add(discriminatorValue)
extractionConfig
))
}
val definitiveElements = if (discriminations.size < rootElements.size) {
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
rootElements = Elements(rootElements.reversed())
}
while (discriminations.size < rootElements.size) {
rootElements.removeFirst()
}
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
rootElements = Elements(rootElements.reversed())
}
rootElements
} else {
rootElements
}
result.add(extractAsListOfMaps(
definitiveElements,
extractionConfig
))
return result
}
@@ -245,33 +180,48 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
var currentElement: Element? = root.clone()
var result: String? = null
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
try {
for (index in 0 until extractionSteps.size) {
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
if (currentElement == null) {
throw IllegalStateException()
}
if (index == extractionSteps.size - 1) {
result = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
if (index == extractionSteps.size - 1) {
result = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.extractResult(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.extractResult(currentElement, currentStep.getQueryString())
}
}
else {
currentElement = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
}
}
}
else {
currentElement = when (currentStep.selectorType()) {
Selector.CSS -> CssUtil.getNextElement(currentElement, currentStep.getQueryString())
Selector.XPATH -> XPathUtil.getNextElement(currentElement, currentStep.getQueryString())
if (result == null) {
throw ElementNotFoundException("Result could not be extracted")
}
if (transformationSteps.isPresent) {
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
}
} catch (ex: RuntimeException) {
when (ex) {
is ElementNotFoundException,
is IllegalStateException -> {
if (extractionConfig.getFallbackConfiguration().isPresent) {
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
} else {
throw ex
}
}
else -> throw ex
}
}
if (result == null) {
throw ElementNotFoundException("Result could not be extracted")
}
if (transformationSteps.isPresent) {
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
}
return result
}

View File

@@ -1,15 +1,11 @@
package com.rak.service
package com.rak.service.extract
import com.rak.config.model.CardPrintScrapeTargetConfig
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.SetScrapeTargetConfig
import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
@ApplicationScoped
@@ -36,11 +32,22 @@ class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrin
element: Element,
providerConfig: ProviderConfig,
extractionConfig: CardPrintScrapeTargetConfig
): Collection<CardPrint> {
val objectAsListOfMaps = extractWithDiscriminator(element, extractionConfig)
): List<CardPrint> {
throw NotImplementedException("Not implemented")
}
return objectAsListOfMaps.map {
CardPrint.fromMap(it[0])
override fun extractNestedMultiples(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: CardPrintScrapeTargetConfig
): List<List<CardPrint>> {
val objectAsListOfMaps: List<List<Map<String, String>>> = extractMultiWithDiscriminator(
element,
extractionConfig
)
return objectAsListOfMaps.map { innerList ->
innerList.map { map -> CardPrint.fromMap(map) }
}
}
}

View File

@@ -1,9 +1,10 @@
package com.rak.service
package com.rak.service.extract
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
import com.rak.config.model.SetScrapeTargetConfig
import com.rak.config.model.SourcesConfig
import com.rak.model.card.CardPrint
import com.rak.model.exception.NotImplementedException
import com.rak.model.set.RegionalSet
import jakarta.enterprise.context.ApplicationScoped
@@ -35,18 +36,27 @@ class RegionalSetExtractionService(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig
): Collection<RegionalSet> {
val regionalSetList = extractAsListOfMaps(element, extractionConfig)
val cardPrintsInRegionalSet = extractAsListOfMaps(element, extractionConfig)
): List<RegionalSet> {
val regionalSetList = extractMulti(element, extractionConfig)
val cardPrints = cardPrintExtractionService.extractMultiple(
val cardPrintGroups: List<List<CardPrint>> = cardPrintExtractionService.extractNestedMultiples(
element,
providerConfig,
providerConfig.getTargets().getCardPrintConfiguration().get()
)
return regionalSetList.map {
RegionalSet.fromMap(it, cardPrints)
// Pair each RegionalSet with its CardPrint group by index
return regionalSetList.mapIndexed { index, regionalSetMap ->
val cardPrintsForSet = cardPrintGroups.getOrElse(index) { emptyList() }
RegionalSet.fromMap(regionalSetMap, cardPrintsForSet)
}
}
override fun extractNestedMultiples(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig
): List<List<RegionalSet>> {
throw NotImplementedException("Not implemented")
}
}

View File

@@ -1,4 +1,4 @@
package com.rak.service
package com.rak.service.extract
import com.rak.config.model.ProviderConfig
import com.rak.config.model.ScrapeTargetFieldConfig
@@ -36,7 +36,15 @@ class SetExtractionService(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig
): Collection<CardSet> {
): List<CardSet> {
throw NotImplementedException("Not implemented")
}
override fun extractNestedMultiples(
element: Element,
providerConfig: ProviderConfig,
extractionConfig: SetScrapeTargetConfig
): List<List<CardSet>> {
throw NotImplementedException("Not implemented")
}
}

View File

@@ -0,0 +1,23 @@
package com.rak.util
import io.quarkus.rest.client.reactive.ComputedParamContext
class HttpUtil {
companion object {
private const val HEADER_FORMAT_STRING: String = "bytes=%d-%d"
@JvmStatic
fun computeHeader(context: ComputedParamContext): String {
val fileLengthContext = context.methodParameters().subList(2, 4)
val fileLength = fileLengthContext[0].value().toString().toInt()
val fileOffset = fileLengthContext[1].value().toString().toInt()
return HEADER_FORMAT_STRING.format(fileOffset, fileOffset + fileLength - 1)
}
}
}

View File

@@ -0,0 +1,45 @@
package com.rak.util
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.rak.model.cc.CCIndexSuccessResponse
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.core.MediaType
import jakarta.ws.rs.core.MultivaluedMap
import jakarta.ws.rs.ext.MessageBodyReader
import jakarta.ws.rs.ext.Provider
import java.io.BufferedReader
import java.io.InputStream
import java.io.InputStreamReader
import java.lang.reflect.Type
@Provider
@Consumes("text/x-ndjson") // Handles NDJSON content
class NDJsonReader : MessageBodyReader<CCIndexSuccessResponse> {
private val objectMapper = jacksonObjectMapper().registerModule(JavaTimeModule())
override fun isReadable(
type: Class<*>?,
genericType: Type?,
annotations: Array<out Annotation>?,
mediaType: MediaType?
): Boolean {
return type == CCIndexSuccessResponse::class.java
}
override fun readFrom(
type: Class<CCIndexSuccessResponse>,
genericType: Type?,
annotations: Array<out Annotation>?,
mediaType: MediaType?,
httpHeaders: MultivaluedMap<String, String>?,
entityStream: InputStream
): CCIndexSuccessResponse {
BufferedReader(InputStreamReader(entityStream)).use { reader ->
val firstLine = reader.readLine()
return objectMapper.readValue(firstLine, CCIndexSuccessResponse::class.java)
}
}
}

View File

@@ -3,12 +3,15 @@ package com.rak.util
import com.rak.model.XPathTarget
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import org.jsoup.select.Elements
import kotlin.coroutines.CoroutineContext
class XPathUtil private constructor() {
companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
private val INDEX_MATCHER: Regex = Regex("\\[(\\w)\\]")
private fun extractTextFromAttribute(root: Element, xpath: String): String? {
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
@@ -20,6 +23,21 @@ class XPathUtil private constructor() {
}
}
private fun selectXpath(element: Element, xpath: String): Elements {
return if (xpath.contains(INDEX_MATCHER)) {
val index = INDEX_MATCHER.find(xpath)?.groupValues[1]!!.toInt()
val xpathHalves = xpath.split("[$index]")
try {
Elements(element.selectXpath(xpathHalves[0])[index])
} catch (_: IndexOutOfBoundsException) {
Elements()
}
} else {
element.selectXpath(xpath)
}
}
private fun extractTextFromNode(root: Element, xpath: String): String? {
return root
.selectXpath(xpath, TextNode::class.java)
@@ -27,7 +45,7 @@ class XPathUtil private constructor() {
}
fun getNextElement(element: Element, path: String): Element? {
return element.selectXpath(path).firstOrNull()
return selectXpath(element, path).firstOrNull()
}
fun extractResult(root: Element, path: String): String? {

View File

@@ -7,8 +7,7 @@ scraper:
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
url-patterns:
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
targets:
card:
root:
@@ -28,8 +27,7 @@ scraper:
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com"
url-patterns:
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
url-pattern: "https://yugioh.fandom.com/wiki/%s"
targets:
set:
root:
@@ -41,9 +39,10 @@ scraper:
value: "//li/text()"
transform:
- name: "replace"
parameters:
- " ("
- ""
parameters: [
" (",
""
]
language:
steps:
- type: xpath
@@ -56,33 +55,44 @@ scraper:
value: "//li/abbr/text()"
card-print:
multi: true
root:
type: css
value: ".tabber.wds-tabber > div"
discriminator:
direction: asc
root:
type: css
value: ".wds-tabs__tab"
steps:
- type: xpath
value: "//li/div/a/text()"
value: ".wds-tab__content"
root:
type: css
value: "table > tbody > tr:has(> td)"
id:
steps:
- type: xpath
value: ".//table/tbody/tr[2]/td[1]/a/text()"
value: "./td/a[0]"
- type: xpath
value: "./text()"
name:
steps:
- type: xpath
value: ".//table/tbody/tr[2]/td[1]/a/text()"
value: "./td/a[1]"
- type: xpath
value: "./text()"
regional-name:
fallback:
default: "N/A"
steps:
- type: xpath
value: ".//table/tbody/tr[2]/td[2]/a/text()"
value: "./td[2]"
- type: xpath
value: "./text()"
transform:
- name: "removeInnerQuotes"
parameters: []
rarity:
fallback:
default: "N/A"
steps:
- type: xpath
value: ".//table/tbody/tr[2]/td[3]/a/text()"
value: "./td/a[3]"
- type: xpath
value: "./text()"
card:
name:
root: