Compare commits
7 Commits
39c0ebfc7c
...
transforma
| Author | SHA1 | Date | |
|---|---|---|---|
| 108b4c4c19 | |||
| 8f934bc2b9 | |||
| a6ed98c36e | |||
| 052bdd6a52 | |||
| edc604231f | |||
| 2289489fe1 | |||
| e97f9bdd61 |
@@ -23,8 +23,11 @@ dependencies {
|
||||
implementation("io.quarkus:quarkus-rest-jackson")
|
||||
implementation("io.quarkus:quarkus-kotlin")
|
||||
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
|
||||
implementation("org.jsoup:jsoup:1.20.1")
|
||||
implementation("io.quarkus:quarkus-arc")
|
||||
implementation("org.jsoup:jsoup:1.20.1")
|
||||
implementation("org.netpreserve.commons:webarchive-commons:2.0.1")
|
||||
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.19.0")
|
||||
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.19.1")
|
||||
testImplementation("io.quarkus:quarkus-junit5")
|
||||
testImplementation("io.rest-assured:rest-assured")
|
||||
}
|
||||
|
||||
@@ -10,4 +10,4 @@ pluginManagement {
|
||||
id(quarkusPluginId) version quarkusPluginVersion
|
||||
}
|
||||
}
|
||||
rootProject.name = "jsoup-scraper"
|
||||
rootProject.name = "dex-scraper"
|
||||
|
||||
47
sources.yml
47
sources.yml
@@ -1,47 +0,0 @@
|
||||
scraper:
|
||||
sources:
|
||||
- id: konami-official
|
||||
name: "Konami Official Database"
|
||||
domain: "yugioh-card.com"
|
||||
url-patterns:
|
||||
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||
selectors:
|
||||
card:
|
||||
name:
|
||||
steps:
|
||||
- type: "css"
|
||||
value: "h1.product-title"
|
||||
- type: "xpath"
|
||||
value: "//h1[@itemprop='name']"
|
||||
attack:
|
||||
steps:
|
||||
- type: "css"
|
||||
value: ".atk-value"
|
||||
|
||||
- id: ygo-fandom
|
||||
name: "Yu-Gi-Oh Fandom Wiki"
|
||||
domain: "yugioh.fandom.com"
|
||||
url-patterns:
|
||||
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
|
||||
selectors:
|
||||
regional-set:
|
||||
root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||
id:
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "//li/text()"
|
||||
transform:
|
||||
- name: "replace"
|
||||
parameters:
|
||||
- " ("
|
||||
- ""
|
||||
language:
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "//li/abbr"
|
||||
- type: "xpath"
|
||||
value: "//abbr/@title"
|
||||
region-key:
|
||||
steps:
|
||||
- type: "xpath"
|
||||
value: "//li/abbr/text()"
|
||||
@@ -1,11 +0,0 @@
|
||||
package com.rak.config.converter
|
||||
|
||||
import jakarta.annotation.Priority
|
||||
import org.eclipse.microprofile.config.spi.Converter
|
||||
|
||||
/**
 * Identity [Converter] for `String` configuration values: the raw value is handed back unchanged.
 *
 * NOTE(review): presumably registered so that empty strings (for example an empty
 * transformation parameter) survive configuration conversion — confirm against the
 * default String handling of the config implementation in use.
 */
@Priority(1)
class EmptyStringConverter : Converter<String> {

    /** Returns [value] exactly as it was received. */
    override fun convert(value: String): String = value
}
|
||||
@@ -6,7 +6,4 @@ import io.smallrye.config.WithConverter
|
||||
import io.smallrye.config.WithName
|
||||
|
||||
interface DiscriminatorConfig : ScrapeTargetFieldConfig {
|
||||
@WithName("direction")
|
||||
@WithConverter(DiscriminatorDirectionConverter::class)
|
||||
fun getDiscriminatorDirection(): DiscriminatorDirection
|
||||
}
|
||||
15
src/main/kotlin/com/rak/config/model/FieldConfigFallback.kt
Normal file
15
src/main/kotlin/com/rak/config/model/FieldConfigFallback.kt
Normal file
@@ -0,0 +1,15 @@
|
||||
package com.rak.config.model
|
||||
|
||||
import io.smallrye.config.WithDefault
|
||||
import io.smallrye.config.WithName
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Configuration mapping that describes the fallback behaviour of a single scrape-target field.
 *
 * Each accessor is bound to a configuration key through [WithName].
 */
interface FieldConfigFallback {

    /** Extraction steps bound to the `steps` key; empty when the key is not configured. */
    @WithName("steps")
    fun getOptionalFallbackExtractionSteps(): Optional<List<ExtractConfig>>

    /** Transformation steps bound to the `transform` key; empty when the key is not configured. */
    @WithName("transform")
    fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>

    /**
     * Value bound to the `default` key, falling back to `"N/A"` when the key is not configured.
     *
     * Despite the "Optional" in its name this accessor returns a plain [String], never an
     * [Optional]; the name is kept because it is part of the mapping contract.
     */
    @WithName("default")
    @WithDefault("N/A")
    fun getOptionalDefaultValue(): String
}
|
||||
@@ -11,8 +11,8 @@ interface ProviderConfig {
|
||||
fun getName(): String
|
||||
@WithName("domain")
|
||||
fun getDomain(): String
|
||||
@WithName("url-patterns")
|
||||
fun getUrlPatterns(): Optional<MutableSet<String>>
|
||||
@WithName("url-pattern")
|
||||
fun getUrlPattern(): String
|
||||
@WithName("targets")
|
||||
fun getTargets(): TargetsConfig
|
||||
|
||||
|
||||
@@ -10,4 +10,6 @@ interface ScrapeTargetFieldConfig : AbstractScrapeTargetFieldConfig {
|
||||
fun getExtractionSteps(): List<ExtractConfig>
|
||||
@WithName("transform")
|
||||
fun getOptionalTransformationSteps(): Optional<List<TransformationStepConfig>>
|
||||
@WithName("fallback")
|
||||
fun getFallbackConfiguration(): Optional<FieldConfigFallback>
|
||||
}
|
||||
@@ -1,10 +1,8 @@
|
||||
package com.rak.config.model
|
||||
|
||||
import com.rak.config.converter.EmptyStringConverter
|
||||
import io.smallrye.config.WithConverter
|
||||
import java.util.Optional
|
||||
|
||||
interface TransformationStepConfig {
|
||||
fun name(): String
|
||||
@WithConverter(EmptyStringConverter::class)
|
||||
fun parameters(): MutableList<String>
|
||||
fun parameters(): Optional<MutableList<String>>
|
||||
}
|
||||
@@ -1,8 +1,10 @@
|
||||
package com.rak.controller
|
||||
|
||||
import com.rak.model.card.Card
|
||||
import com.rak.model.cc.CCIndexSuccessResponse
|
||||
import com.rak.model.set.CardSet
|
||||
import com.rak.model.set.RegionalSet
|
||||
import com.rak.service.CommonCrawlService
|
||||
import com.rak.service.ScrapeService
|
||||
import jakarta.ws.rs.Consumes
|
||||
import jakarta.ws.rs.GET
|
||||
@@ -16,10 +18,9 @@ import org.jboss.resteasy.reactive.RestQuery
|
||||
@Path("/api")
|
||||
class ScrapeController(
|
||||
private val scrapeService: ScrapeService,
|
||||
private val commonCrawlService: CommonCrawlService
|
||||
) {
|
||||
|
||||
|
||||
|
||||
@GET
|
||||
@Path("/{provider}/set")
|
||||
@Produces(MediaType.APPLICATION_JSON)
|
||||
|
||||
5
src/main/kotlin/com/rak/model/ErrorResponse.kt
Normal file
5
src/main/kotlin/com/rak/model/ErrorResponse.kt
Normal file
@@ -0,0 +1,5 @@
|
||||
package com.rak.model
|
||||
|
||||
/**
 * Response body returned to API clients when a request fails.
 *
 * @property message human-readable description of the failure
 */
data class ErrorResponse(
    val message: String
)
|
||||
5
src/main/kotlin/com/rak/model/cc/CCIndexErrorResponse.kt
Normal file
5
src/main/kotlin/com/rak/model/cc/CCIndexErrorResponse.kt
Normal file
@@ -0,0 +1,5 @@
|
||||
package com.rak.model.cc
|
||||
|
||||
/**
 * Error payload of a Common Crawl index query.
 *
 * @property message error text carried by the payload
 */
data class CCIndexErrorResponse(
    val message: String
)
|
||||
22
src/main/kotlin/com/rak/model/cc/CCIndexSuccessResponse.kt
Normal file
22
src/main/kotlin/com/rak/model/cc/CCIndexSuccessResponse.kt
Normal file
@@ -0,0 +1,22 @@
|
||||
package com.rak.model.cc
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonProperty
|
||||
import java.time.Instant
|
||||
|
||||
/**
 * One record returned by a successful Common Crawl index query.
 *
 * Properties whose JSON name differs from the Kotlin name are mapped with [JsonProperty]
 * (`urlkey`, `mime-detected`, `filename`); all others bind by their own name.
 *
 * NOTE(review): the index presumably reports `timestamp` as a compact digit string rather
 * than an ISO-8601 instant — confirm that the configured deserializer interprets it as intended.
 * NOTE(review): [length] and [offset] are `Int`; verify that archive offsets can never exceed
 * `Int.MAX_VALUE`.
 *
 * @property fileName path of the WARC archive that holds the capture
 * @property length number of bytes the capture occupies inside the archive
 * @property offset byte position of the capture inside the archive
 */
data class CCIndexSuccessResponse(
    @JsonProperty("urlkey")
    val urlKey: String,
    val timestamp: Instant,
    val url: String,
    val mime: String,
    @JsonProperty("mime-detected")
    val mimeDetected: String,
    val status: String,
    val digest: String,
    val length: Int,
    val offset: Int,
    @JsonProperty("filename")
    val fileName: String,
    val languages: String,
    val encoding: String,
)
|
||||
9
src/main/kotlin/com/rak/model/cc/CCIndices.kt
Normal file
9
src/main/kotlin/com/rak/model/cc/CCIndices.kt
Normal file
@@ -0,0 +1,9 @@
|
||||
package com.rak.model.cc
|
||||
|
||||
/**
 * Common Crawl crawls that are queried for captures, listed newest first.
 *
 * @property indexName identifier of the crawl as used in index query paths
 */
enum class CCIndices(val indexName: String) {
    CC_2025_21("CC-MAIN-2025-21"),

    // Was "CC-MAIN-2024-05": that disagreed with the constant name and broke the
    // newest-first ordering of this list.
    CC_2025_05("CC-MAIN-2025-05"),
    CC_2024_46("CC-MAIN-2024-46"),
    CC_2024_26("CC-MAIN-2024-26"),
    CC_2023_50("CC-MAIN-2023-50");
}
|
||||
@@ -0,0 +1,3 @@
|
||||
package com.rak.model.exception
|
||||
|
||||
/**
 * Signals that a requested scrape target could not be found.
 *
 * @param message description of the target that is missing
 */
class TargetNotFoundException(
    message: String,
) : RuntimeException(message)
|
||||
@@ -0,0 +1,18 @@
|
||||
package com.rak.model.exception.mapper
|
||||
|
||||
import com.rak.model.ErrorResponse
|
||||
import com.rak.model.exception.NotImplementedException
|
||||
import jakarta.ws.rs.core.Response
|
||||
import jakarta.ws.rs.ext.ExceptionMapper
|
||||
import jakarta.ws.rs.ext.Provider
|
||||
|
||||
/**
 * Translates a [NotImplementedException] into an HTTP 405 response whose entity is an
 * [ErrorResponse].
 *
 * NOTE(review): 405 means "Method Not Allowed"; 501 "Not Implemented" may describe this
 * situation better — confirm with API consumers before changing the status.
 */
@Provider
class NotImplementedExceptionMapper : ExceptionMapper<NotImplementedException> {

    /**
     * Builds the error response for [exception].
     *
     * @return a 405 response carrying the exception message, or a generic text when the
     * exception has no message
     */
    override fun toResponse(exception: NotImplementedException): Response {
        val body = ErrorResponse(
            exception.message ?: "Provider does not implement this method"
        )
        return Response.status(405).entity(body).build()
    }
}
|
||||
@@ -0,0 +1,19 @@
|
||||
package com.rak.model.exception.mapper
|
||||
|
||||
import com.rak.model.ErrorResponse
|
||||
import com.rak.model.exception.NotImplementedException
|
||||
import com.rak.model.exception.TargetNotFoundException
|
||||
import jakarta.ws.rs.core.Response
|
||||
import jakarta.ws.rs.ext.ExceptionMapper
|
||||
import jakarta.ws.rs.ext.Provider
|
||||
|
||||
/**
 * Translates a [TargetNotFoundException] into an HTTP 404 response whose entity is an
 * [ErrorResponse].
 */
@Provider
class TargetNotFoundExceptionMapper : ExceptionMapper<TargetNotFoundException> {

    /**
     * Builds the error response for [exception].
     *
     * @return a 404 response carrying the exception message, or a generic text when the
     * exception has no message
     */
    override fun toResponse(exception: TargetNotFoundException): Response {
        val body = ErrorResponse(
            exception.message ?: "Scrape target could not be found"
        )
        return Response.status(404).entity(body).build()
    }
}
|
||||
@@ -3,7 +3,7 @@ package com.rak.model.set
|
||||
import kotlin.collections.Set
|
||||
|
||||
data class CardSet(
|
||||
val name: String,
|
||||
var name: String,
|
||||
val regionalSets: Set<RegionalSet>
|
||||
) {
|
||||
companion object {
|
||||
|
||||
@@ -2,5 +2,5 @@ package com.rak.model.transform
|
||||
|
||||
@FunctionalInterface
|
||||
fun interface ParameterizedTransformation : AbstractTransformation {
|
||||
fun apply(input: String, parameters: List<String>): String
|
||||
fun apply(input: String, parameters: MutableList<String>): String
|
||||
}
|
||||
@@ -11,9 +11,13 @@ class TransformationRegistry {
|
||||
|
||||
init {
|
||||
register("trim") { it.trim() }
|
||||
register("removeInnerQuotes") { it.replace("\"", "") }
|
||||
register("replace") { input, parameters ->
|
||||
require(parameters.size == 2) {
|
||||
"'replace' requires exactly 2 parameters"
|
||||
require(parameters.size == 1 || parameters.size == 2) {
|
||||
"'replace' requires either 1 or 2 parameters"
|
||||
}
|
||||
if (parameters.size == 1) {
|
||||
parameters.add("")
|
||||
}
|
||||
input.replace(parameters[0], parameters[1])
|
||||
}
|
||||
@@ -39,14 +43,14 @@ class TransformationRegistry {
|
||||
val parameters = transformationStep.parameters()
|
||||
return when {
|
||||
transformations.containsKey(name) -> {
|
||||
if (parameters.isNotEmpty()) {
|
||||
if (parameters.isPresent && parameters.get().isNotEmpty()) {
|
||||
throw IllegalArgumentException("'$name' doesn't accept parameters")
|
||||
} else {
|
||||
transformations[name]!!
|
||||
}
|
||||
}
|
||||
parameterizedTransformation.containsKey(name) -> {
|
||||
if (parameters.isEmpty()) {
|
||||
if (parameters.isPresent && parameters.get().isEmpty()) {
|
||||
throw IllegalArgumentException("'$name' requires parameters")
|
||||
} else {
|
||||
parameterizedTransformation[name]!!
|
||||
@@ -65,7 +69,7 @@ class TransformationRegistry {
|
||||
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
|
||||
|
||||
is ParameterizedTransformation ->
|
||||
parameterizedTransformation[step.name()]?.apply(current, step.parameters())
|
||||
parameterizedTransformation[step.name()]?.apply(current, step.parameters().get())
|
||||
?: throw IllegalArgumentException("Unknown transformation: ${step.name()}")
|
||||
|
||||
else -> throw IllegalStateException("Invalid transformation type")
|
||||
|
||||
92
src/main/kotlin/com/rak/service/CommonCrawlService.kt
Normal file
92
src/main/kotlin/com/rak/service/CommonCrawlService.kt
Normal file
@@ -0,0 +1,92 @@
|
||||
package com.rak.service
|
||||
|
||||
import com.rak.model.cc.CCIndexSuccessResponse
|
||||
import com.rak.model.cc.CCIndices
|
||||
import com.rak.service.client.CommonCrawlRestClient
|
||||
import io.netty.buffer.ByteBufInputStream
|
||||
import io.quarkus.logging.Log
|
||||
import jakarta.enterprise.context.ApplicationScoped
|
||||
import org.archive.format.http.HttpResponseParser
|
||||
import org.archive.io.warc.WARCReaderFactory
|
||||
import org.eclipse.microprofile.rest.client.inject.RestClient
|
||||
import org.jsoup.helper.DataUtil
|
||||
import org.jsoup.nodes.Document
|
||||
|
||||
|
||||
/**
 * Queries the Common Crawl index for captures of a URL and loads a captured page from the
 * WARC archive an index record points to.
 */
@ApplicationScoped
class CommonCrawlService(
    @RestClient
    private val commonCrawlRestClient: CommonCrawlRestClient
) {

    /**
     * Queries the fixed crawl [CCIndices.CC_2024_46] for [url].
     *
     * @param url the URL to look up in the index
     * @return the index record returned by the REST client
     */
    fun queryIndex(
        url: String
    ): CCIndexSuccessResponse {
        return commonCrawlRestClient.queryIndex(
            INDEX_QUERY_URL,
            url,
            CCIndices.CC_2024_46.indexName
        )
    }

    /**
     * Queries every crawl listed in [CCIndices] for [url].
     *
     * A crawl whose query fails with a [RuntimeException] is logged (including the cause)
     * and left out of the result, so one failing crawl does not abort the whole lookup.
     *
     * @param url the URL to look up in each index
     * @return the records of all crawls that answered, in [CCIndices] declaration order
     */
    fun queryAllCrawlIndices(
        url: String
    ): List<CCIndexSuccessResponse> {
        val responses = mutableListOf<CCIndexSuccessResponse>()
        for (crawl in CCIndices.entries) {
            try {
                responses.add(
                    commonCrawlRestClient.queryIndex(
                        INDEX_QUERY_URL,
                        url,
                        crawl.indexName
                    )
                )
            } catch (ex: RuntimeException) {
                // Best effort: keep going, but do not lose the reason for the failure.
                Log.warn("Error occurred querying crawl '${crawl.indexName}' for URL $url", ex)
            }
        }

        return responses
    }

    /**
     * Downloads the archive slice described by [ccIndexSuccessResponse] and parses the first
     * record in it into a document.
     *
     * The archive is opened under the file name reported by the index record (previously a
     * hard-coded debug file name was passed to the reader), and the reader is closed once
     * the document has been parsed.
     *
     * @param ccIndexSuccessResponse index record naming the archive file, offset and length
     * @param baseUri base URI handed to the HTML parser
     * @return the parsed document, or `null` when the archive slice contains no record
     */
    fun getDocument(
        ccIndexSuccessResponse: CCIndexSuccessResponse,
        baseUri: String
    ): Document? {
        val archiveStream: ByteBufInputStream = commonCrawlRestClient.getWarcArchive(
            DATA_URL,
            ccIndexSuccessResponse.fileName,
            ccIndexSuccessResponse.length,
            ccIndexSuccessResponse.offset
        )

        val archiveReader = WARCReaderFactory.get(
            ccIndexSuccessResponse.fileName,
            archiveStream,
            true
        )

        return archiveReader.use { reader ->
            // Only the first record is of interest: the requested byte range covers one capture.
            reader.firstOrNull()?.let { record ->
                val httpResponse = HttpResponseParser().parse(record.buffered())
                // NOTE(review): the charset is fixed to UTF-8; the capture's own encoding is not consulted.
                DataUtil.load(
                    httpResponse.buffered(),
                    "UTF-8",
                    baseUri
                )
            }
        }
    }

    companion object {
        private const val INDEX_QUERY_URL: String = "http://index.commoncrawl.org"
        private const val DATA_URL: String = "http://data.commoncrawl.org"
    }
}
|
||||
@@ -1,273 +0,0 @@
|
||||
package com.rak.service
|
||||
|
||||
import com.rak.config.model.CardPrintScrapeTargetConfig
|
||||
import com.rak.config.model.ExtractConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
import com.rak.model.Selector
|
||||
import com.rak.model.card.Card
|
||||
import com.rak.model.card.CardPrint
|
||||
import com.rak.model.exception.ElementNotFoundException
|
||||
import com.rak.model.exception.InvalidConfigurationException
|
||||
import com.rak.model.set.CardSet
|
||||
import com.rak.model.set.RegionalSet
|
||||
import com.rak.model.transform.TransformationRegistry
|
||||
import com.rak.util.XPathUtil
|
||||
import jakarta.enterprise.context.ApplicationScoped
|
||||
import org.jsoup.nodes.Document
|
||||
import org.jsoup.nodes.Element
|
||||
import org.jsoup.select.Elements
|
||||
import java.util.Optional
|
||||
|
||||
/**
 * Extracts sets, regional sets and cards from parsed HTML, driven by the selector
 * configuration of a provider looked up through [SourceService].
 */
@ApplicationScoped
class ExtractionService(
    private val sourceService: SourceService,
) {

    private val transformationRegistry = TransformationRegistry()

    /**
     * Builds a [CardSet] named [setName] from the regional sets found below [root].
     *
     * @throws IllegalArgumentException when [provider] is unknown
     */
    fun extractSet(setName: String, root: Element, provider: String): CardSet {
        return CardSet(
            name = setName,
            regionalSets = extractRegionalSets(root, provider)
        )
    }

    /**
     * Resolves the root element to extract from, preferring [globalRootExtractConfig] over
     * [nodeRootExtractConfig].
     *
     * The field-level configuration is only consulted when no global one exists; previously
     * it was evaluated eagerly, so a missing field-level root threw even though a global
     * root was configured.
     *
     * @throws InvalidConfigurationException when neither configuration is present
     * @throws ElementNotFoundException when the selector matches nothing
     */
    fun getRootElement(
        document: Document,
        globalRootExtractConfig: Optional<ExtractConfig>,
        nodeRootExtractConfig: Optional<ExtractConfig>
    ): Element {
        val rootExtractConfig: ExtractConfig = globalRootExtractConfig.orElseGet {
            nodeRootExtractConfig.orElseThrow {
                InvalidConfigurationException("")
            }
        }

        return requireElement(document, rootExtractConfig, "No root could be found")
    }

    /**
     * Resolves the card name for a card print and currently always returns `null`.
     *
     * The name lookup is still performed, so its failures surface as exceptions.
     */
    fun extractCardPrint(document: Document, cardPrintConfig: CardPrintScrapeTargetConfig): CardPrint? {
        // TODO: the extracted name is not used yet; building the CardPrint is unfinished.
        extractTextFromElementByTargetFieldConfig(
            getRootElement(
                document,
                cardPrintConfig.getRootConfig(),
                cardPrintConfig.getNameConfig().getRootConfig()
            ),
            cardPrintConfig.getNameConfig()
        )

        return null
    }

    /**
     * Extracts one [RegionalSet] (id, language, region key) from [root].
     *
     * When the set configuration has a shared root, all three fields are read relative to
     * [root]. Otherwise each field is read relative to the element selected by its own root
     * configuration. Previously the language and key were read with the id configuration
     * and id root, which produced three copies of the id.
     *
     * @throws IllegalArgumentException when [provider] is unknown
     * @throws IllegalStateException when a field yields no text
     * @throws ElementNotFoundException when a selector matches nothing
     */
    fun extractRegionalSet(root: Element, provider: String): RegionalSet {
        val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
        val setExtractionConfig = source.getTargets().getSetConfig().get()

        val idConfiguration = setExtractionConfig.getIdConfig()
        val languageConfiguration = setExtractionConfig.getLanguageConfig()
        val keyConfiguration = setExtractionConfig.getRegionKeyConfig()

        if (setExtractionConfig.getRootConfig().isPresent) {
            val setId = requireText(root, idConfiguration, "id")
            val setLanguage = requireText(root, languageConfiguration, "language")
            val setKey = requireText(root, keyConfiguration, "key")

            return RegionalSet(
                setId,
                setLanguage,
                setKey,
                listOf(),
                -1
            )
        }

        val setIdRoot = requireElement(root, idConfiguration.getRootConfig().get(), "TODO fix this")
        val setId = requireText(setIdRoot, idConfiguration, "id")

        val setLanguageRoot = requireElement(root, languageConfiguration.getRootConfig().get(), "TODO fix this")
        val setLanguage = requireText(setLanguageRoot, languageConfiguration, "language")

        val setKeyRoot = requireElement(root, keyConfiguration.getRootConfig().get(), "TODO fix this")
        val setKey = requireText(setKeyRoot, keyConfiguration, "key")

        return RegionalSet(
            setId,
            setLanguage,
            setKey,
            listOf(),
            -1
        )
    }

    /**
     * Extracts every regional set found below [root].
     *
     * With a shared root configuration each matched element is turned into one
     * [RegionalSet]. Without it, ids, languages and region keys are collected as three
     * separate lists and combined by [RegionalSet.flattenFromMemberLists].
     *
     * @throws IllegalArgumentException when [provider] is unknown
     * @throws RuntimeException when a [NoSuchElementException] occurs while the separate
     * lists are collected; the original exception is attached as cause
     */
    fun extractRegionalSets(root: Element, provider: String): Set<RegionalSet> {
        val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
        val setExtractionConfig = source.getTargets().getSetConfig().get()
        val sharedRootConfiguration = setExtractionConfig.getRootConfig()

        if (sharedRootConfiguration.isPresent) {
            val regionalSetRoots: Elements = getElementsFromDocumentByExtractConfig(
                root,
                sharedRootConfiguration.get()
            )

            return regionalSetRoots.map { extractRegionalSet(it, provider) }.toSet()
        }

        try {
            val setIdConfiguration = setExtractionConfig.getIdConfig()
            val setIds = getElementsFromDocumentByExtractConfig(root, setIdConfiguration.getRootConfig().get())
                .map { requireText(it, setIdConfiguration, "id") }

            val languageConfiguration = setExtractionConfig.getLanguageConfig()
            val languages = getElementsFromDocumentByExtractConfig(root, languageConfiguration.getRootConfig().get())
                .map { requireText(it, languageConfiguration, "language") }

            val setKeyConfiguration = setExtractionConfig.getRegionKeyConfig()
            val setKeys = getElementsFromDocumentByExtractConfig(root, setKeyConfiguration.getRootConfig().get())
                .map { requireText(it, setKeyConfiguration, "key") }

            return RegionalSet.flattenFromMemberLists(
                setIds,
                languages,
                setKeys
            )
        } catch (ex: NoSuchElementException) {
            // TODO handle me: still a generic RuntimeException, but no longer hides the cause.
            throw RuntimeException("Regional set fields could not be resolved for provider '$provider'", ex)
        }
    }

    /**
     * Validates that the card fields can be read and currently always returns `null`.
     *
     * @throws IllegalArgumentException when [provider] is unknown
     * @throws ElementNotFoundException when the card root or a field cannot be found
     */
    fun extractCard(root: Document, provider: String): Card? {
        val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
        val cardSelector = source.getTargets().getCardConfig().get()

        val rootConfigurationOptional = cardSelector.getRootConfig()
        if (!rootConfigurationOptional.isPresent) {
            return null
        }

        val rootElement: Element = requireElement(root, rootConfigurationOptional.get(), "TODO make this better")

        // NOTE(review): all three lookups use the English-name configuration, so card type and
        // description are not really extracted yet; the dedicated configurations are not
        // visible from this class — confirm and replace. Building the Card is unfinished.
        val englishCardName: String = requireText(rootElement, cardSelector.getEnglishNameConfig(), "name")
        val cardType: String = requireText(rootElement, cardSelector.getEnglishNameConfig(), "name")
        val description: String = requireText(rootElement, cardSelector.getEnglishNameConfig(), "name")

        return null
    }

    /** Selects all elements below [document] that match [step], using CSS or XPath as configured. */
    private fun getElementsFromDocumentByExtractConfig(
        document: Element,
        step: ExtractConfig
    ): Elements {
        return if (step.selectorType() == Selector.CSS) {
            document.select(step.getQueryString())
        } else {
            document.selectXpath(step.getQueryString())
        }
    }

    /**
     * Selects the first element below [document] that matches [step].
     *
     * Although the return type is nullable, a missing match is reported by throwing
     * [ElementNotFoundException]; `null` is never returned.
     */
    private fun getElementFromDocumentByExtractConfig(
        document: Element,
        step: ExtractConfig,
    ): Element? {
        return getElementsFromDocumentByExtractConfig(document, step).firstOrNull()
            ?: throw ElementNotFoundException("")
    }

    /** Like [getElementFromDocumentByExtractConfig] but non-null, using [message] if it ever yields `null`. */
    private fun requireElement(document: Element, step: ExtractConfig, message: String): Element {
        return getElementFromDocumentByExtractConfig(document, step) ?: throw ElementNotFoundException(message)
    }

    /** Extracts the text of a field and fails with a message naming [parameter] when nothing is returned. */
    private fun requireText(root: Element, fieldConfig: ScrapeTargetFieldConfig, parameter: String): String {
        return extractTextFromElementByTargetFieldConfig(root, fieldConfig)
            ?: throw IllegalStateException("Parameter '$parameter' could not be found")
    }

    /**
     * Walks the extraction steps of [extractionConfig] starting from a copy of [root]: every
     * step but the last moves to the next element, the last step produces the text. The
     * configured transformations, if any, are applied to that text.
     *
     * @throws IllegalStateException when an intermediate step leaves no element to continue from
     * @throws ElementNotFoundException when no text could be extracted
     */
    private fun extractTextFromElementByTargetFieldConfig(
        root: Element,
        extractionConfig: ScrapeTargetFieldConfig
    ): String? {
        val extractionSteps = extractionConfig.getExtractionSteps()
        val transformationSteps = extractionConfig.getOptionalTransformationSteps()

        // Work on a clone so that the caller's element is never touched by the step helpers.
        var currentElement: Element? = root.clone()
        var result: String? = null

        for ((index, currentStep) in extractionSteps.withIndex()) {
            val element = currentElement
                ?: throw IllegalStateException("Extraction step $index has no element to operate on")

            if (index == extractionSteps.lastIndex) {
                result = XPathUtil.extractResult(element, currentStep.getQueryString())
            } else {
                currentElement = XPathUtil.getNextElement(element, currentStep.getQueryString())
            }
        }

        val extracted = result ?: throw ElementNotFoundException("Result could not be extracted")

        return if (transformationSteps.isPresent) {
            transformationRegistry.applyTransformations(extracted, transformationSteps.get())
        } else {
            extracted
        }
    }
}
|
||||
@@ -1,20 +1,30 @@
|
||||
package com.rak.service
|
||||
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.model.card.Card
|
||||
import com.rak.model.exception.NotImplementedException
|
||||
import com.rak.model.exception.TargetNotFoundException
|
||||
import com.rak.model.set.CardSet
|
||||
import com.rak.model.set.RegionalSet
|
||||
import com.rak.service.extract.RegionalSetExtractionService
|
||||
import com.rak.service.extract.SetExtractionService
|
||||
import io.quarkus.logging.Log
|
||||
import jakarta.enterprise.context.ApplicationScoped
|
||||
import org.jsoup.Jsoup
|
||||
import org.jsoup.nodes.Document
|
||||
import java.lang.Exception
|
||||
|
||||
@ApplicationScoped
|
||||
class ScrapeService(
|
||||
private val sourceService: SourceService,
|
||||
private val extractionService: ExtractionService,
|
||||
private val setExtractionService: SetExtractionService,
|
||||
private val regionalSetExtractionService: RegionalSetExtractionService
|
||||
private val regionalSetExtractionService: RegionalSetExtractionService,
|
||||
private val commonCrawlService: CommonCrawlService
|
||||
) {
|
||||
|
||||
fun ProviderConfig.buildUrl(targetName: String): String {
|
||||
return this.getUrlPattern().format(targetName)
|
||||
}
|
||||
|
||||
fun scrapeSet(
|
||||
provider: String,
|
||||
@@ -23,10 +33,39 @@ class ScrapeService(
|
||||
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
|
||||
|
||||
val path: String = normalizePath(setName)
|
||||
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
|
||||
val url = source.buildUrl(path)
|
||||
val ccIndexResponses = commonCrawlService.queryAllCrawlIndices(url).sortedBy { it.timestamp }
|
||||
|
||||
// return extractionService.extractSet(setName, document, provider)
|
||||
return setExtractionService.extract(document, source, source.getTargets().getSetConfig().get())
|
||||
var document: Document? = null
|
||||
|
||||
for (indexResponse in ccIndexResponses) {
|
||||
document = commonCrawlService.getDocument(
|
||||
indexResponse,
|
||||
source.getDomain()
|
||||
)
|
||||
|
||||
if (document != null) {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if (document == null) {
|
||||
// Fallback to Jsoup directly
|
||||
try {
|
||||
document = Jsoup.connect(url).get()
|
||||
} catch(ex: Exception) {
|
||||
Log.warn("Error occurred during Jsoup query")
|
||||
throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'")
|
||||
}
|
||||
}
|
||||
|
||||
return setExtractionService.extract(
|
||||
document,
|
||||
source,
|
||||
source.getTargets().getSetConfig().get()
|
||||
).apply {
|
||||
name = setName
|
||||
}
|
||||
}
|
||||
|
||||
fun scrapeRegionalSet(
|
||||
@@ -45,12 +84,8 @@ class ScrapeService(
|
||||
provider: String,
|
||||
cardName: String,
|
||||
): Card? {
|
||||
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
|
||||
throw NotImplementedException("Not implemented")
|
||||
|
||||
val path: String = normalizePath(cardName)
|
||||
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
|
||||
|
||||
return extractionService.extractCard(document, provider)
|
||||
}
|
||||
|
||||
private fun normalizePath(path: String): String = path
|
||||
|
||||
@@ -0,0 +1,49 @@
|
||||
package com.rak.service.client
|
||||
|
||||
import com.rak.util.NDJsonReader
|
||||
import com.rak.model.cc.CCIndexSuccessResponse
|
||||
import io.netty.buffer.ByteBufInputStream
|
||||
import io.quarkus.rest.client.reactive.ClientQueryParam
|
||||
import io.quarkus.rest.client.reactive.NotBody
|
||||
import io.quarkus.rest.client.reactive.Url
|
||||
import jakarta.ws.rs.Consumes
|
||||
import jakarta.ws.rs.GET
|
||||
import jakarta.ws.rs.Path
|
||||
import jakarta.ws.rs.PathParam
|
||||
import jakarta.ws.rs.QueryParam
|
||||
import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam
|
||||
import org.eclipse.microprofile.rest.client.annotation.RegisterProvider
|
||||
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
|
||||
|
||||
/**
 * REST client for the Common Crawl index and data endpoints.
 *
 * The `baseUri` given to [RegisterRestClient] is only a placeholder: both methods take the
 * real base URL as an [Url] parameter on every call.
 */
@RegisterRestClient(baseUri = "whatever")
@RegisterProvider(NDJsonReader::class)
interface CommonCrawlRestClient {

    /**
     * Queries the index of one crawl for captures of a URL, requesting JSON output.
     *
     * NOTE(review): [Consumes] describes the request body on a client interface; if the intent
     * is to accept `text/x-ndjson` responses, `@Produces` may be the right annotation — confirm.
     *
     * @param baseUrl base URL of the index server
     * @param queryUrl the URL to look up, sent as the `url` query parameter
     * @param indexName crawl identifier, substituted into the `/{index}-index` path
     * @return the index record read from the response
     */
    @GET
    @ClientQueryParam(name = "output", value = ["json"])
    @Path("/{index}-index")
    @Consumes("text/x-ndjson")
    fun queryIndex(
        @Url
        baseUrl: String,
        @QueryParam("url")
        queryUrl: String,
        @PathParam("index")
        indexName: String
    ): CCIndexSuccessResponse

    /**
     * Downloads part of a WARC archive.
     *
     * The `Range` header is computed by `com.rak.util.HttpUtil.computeHeader`; [fileLength]
     * and [fileOffset] are marked [NotBody], so they are not sent as the request body.
     * NOTE(review): presumably the header computation reads these two values — verify in HttpUtil.
     *
     * @param baseUrl base URL of the data server
     * @param fileName archive path, substituted into the `/{fileName}` path
     * @param fileLength number of bytes to fetch
     * @param fileOffset byte position at which the requested slice starts
     * @return a stream over the downloaded bytes
     */
    @GET
    @Path("/{fileName}")
    @ClientHeaderParam(name = "Range", value = ["{com.rak.util.HttpUtil.computeHeader}"])
    fun getWarcArchive(
        @Url
        baseUrl: String,
        @PathParam("fileName")
        fileName: String,
        @NotBody
        fileLength: Int,
        @NotBody
        fileOffset: Int
    ): ByteBufInputStream
}
|
||||
@@ -1,10 +1,9 @@
|
||||
package com.rak.service
|
||||
package com.rak.service.extract
|
||||
|
||||
import com.rak.config.model.AbstractScrapeTargetConfig
|
||||
import com.rak.config.model.ExtractConfig
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
import com.rak.model.DiscriminatorDirection
|
||||
import com.rak.model.Selector
|
||||
import com.rak.model.exception.ElementNotFoundException
|
||||
import com.rak.model.exception.InvalidConfigurationException
|
||||
@@ -35,7 +34,13 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: T
|
||||
): Collection<E>
|
||||
): List<E>
|
||||
|
||||
abstract fun extractNestedMultiples(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: T
|
||||
): List<List<E>>
|
||||
|
||||
fun getRootElement(
|
||||
element: Element,
|
||||
@@ -84,7 +89,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
}
|
||||
}
|
||||
|
||||
protected fun extractAsMap(
|
||||
protected fun extractSingle(
|
||||
document: Element,
|
||||
extractionConfig: T
|
||||
): Map<String, String> {
|
||||
@@ -108,7 +113,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
return result
|
||||
}
|
||||
|
||||
fun extractAsListOfMaps(
|
||||
fun extractMulti(
|
||||
element: Element,
|
||||
extractionConfig: T
|
||||
): List<Map<String, String>> {
|
||||
@@ -143,94 +148,24 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
return resultList
|
||||
}
|
||||
|
||||
fun extractAsListOfMaps(
|
||||
elements: Elements,
|
||||
extractionConfig: T
|
||||
): List<Map<String, String>> {
|
||||
val resultList = mutableListOf<MutableMap<String, String>>()
|
||||
|
||||
|
||||
// refactor this
|
||||
extractionConfig.getItems().forEach { (identifier, fieldConfig) ->
|
||||
for(index in 0..elements.size - 1) {
|
||||
val rootElement = elements[index]
|
||||
val extractedText = extractTextFromElementByTargetFieldConfig(
|
||||
rootElement,
|
||||
fieldConfig
|
||||
) ?: throw ElementNotFoundException("Could not find element for '$identifier'")
|
||||
|
||||
val mapToModify: MutableMap<String, String> = try {
|
||||
resultList[index]
|
||||
} catch (_: IndexOutOfBoundsException) {
|
||||
val newMap = mutableMapOf<String, String>()
|
||||
resultList.add(newMap)
|
||||
newMap
|
||||
}
|
||||
|
||||
mapToModify.put(identifier, extractedText)
|
||||
}
|
||||
}
|
||||
|
||||
return resultList
|
||||
}
|
||||
|
||||
fun extractWithDiscriminator(
|
||||
fun extractMultiWithDiscriminator(
|
||||
element: Element,
|
||||
extractionConfig: T
|
||||
): List<List<Map<String, String>>>{
|
||||
val rootElement = getRootElement(
|
||||
val rootElements = getRootElements(
|
||||
element,
|
||||
extractionConfig.getRootConfig(),
|
||||
extractionConfig.getDiscriminator().get().getRootConfig(),
|
||||
Optional.empty<ExtractConfig>()
|
||||
)
|
||||
|
||||
var rootElements = getRootElements(
|
||||
element,
|
||||
extractionConfig.getRootConfig(),
|
||||
Optional.empty<ExtractConfig>()
|
||||
)
|
||||
|
||||
val discriminatedElements = getElementsFromElementByExtractConfig(
|
||||
rootElement,
|
||||
extractionConfig.getDiscriminator().get().getRootConfig().get(),
|
||||
)
|
||||
|
||||
val discriminations = mutableListOf<String>()
|
||||
val result = mutableListOf<List<Map<String, String>>>()
|
||||
|
||||
for (element in discriminatedElements) {
|
||||
val discriminatorValue: String = extractTextFromElementByTargetFieldConfig(
|
||||
for(element in rootElements) {
|
||||
result.add(extractMulti(
|
||||
element,
|
||||
extractionConfig.getDiscriminator().get()
|
||||
) ?: throw ElementNotFoundException("")
|
||||
|
||||
discriminations.add(discriminatorValue)
|
||||
}
|
||||
|
||||
val definitiveElements = if (discriminations.size < rootElements.size) {
|
||||
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
|
||||
rootElements = Elements(rootElements.reversed())
|
||||
}
|
||||
|
||||
while (discriminations.size < rootElements.size) {
|
||||
rootElements.removeFirst()
|
||||
}
|
||||
|
||||
if (extractionConfig.getDiscriminator().get().getDiscriminatorDirection() == DiscriminatorDirection.DESC) {
|
||||
rootElements = Elements(rootElements.reversed())
|
||||
}
|
||||
|
||||
rootElements
|
||||
} else {
|
||||
rootElements
|
||||
}
|
||||
|
||||
result.add(extractAsListOfMaps(
|
||||
definitiveElements,
|
||||
extractionConfig
|
||||
))
|
||||
|
||||
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
@@ -245,6 +180,7 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
var currentElement: Element? = root.clone()
|
||||
var result: String? = null
|
||||
|
||||
try {
|
||||
for (index in 0 until extractionSteps.size) {
|
||||
val currentStep = extractionSteps.elementAtOrNull(index) ?: return null
|
||||
if (currentElement == null) {
|
||||
@@ -272,6 +208,20 @@ abstract class AbstractExtractionService<E, T : AbstractScrapeTargetConfig> {
|
||||
if (transformationSteps.isPresent) {
|
||||
result = transformationRegistry.applyTransformations(result, transformationSteps.get())
|
||||
}
|
||||
} catch (ex: RuntimeException) {
|
||||
when (ex) {
|
||||
is ElementNotFoundException,
|
||||
is IllegalStateException -> {
|
||||
if (extractionConfig.getFallbackConfiguration().isPresent) {
|
||||
result = extractionConfig.getFallbackConfiguration().get().getOptionalDefaultValue()
|
||||
} else {
|
||||
throw ex
|
||||
}
|
||||
}
|
||||
else -> throw ex
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return result
|
||||
}
|
||||
@@ -1,15 +1,11 @@
|
||||
package com.rak.service
|
||||
package com.rak.service.extract
|
||||
|
||||
import com.rak.config.model.CardPrintScrapeTargetConfig
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
import com.rak.config.model.SetScrapeTargetConfig
|
||||
import com.rak.model.card.CardPrint
|
||||
import com.rak.model.exception.NotImplementedException
|
||||
import com.rak.model.set.CardSet
|
||||
import com.rak.model.set.RegionalSet
|
||||
import jakarta.enterprise.context.ApplicationScoped
|
||||
import org.jsoup.nodes.Document
|
||||
import org.jsoup.nodes.Element
|
||||
|
||||
@ApplicationScoped
|
||||
@@ -36,11 +32,22 @@ class CardPrintExtractionService : AbstractExtractionService<CardPrint, CardPrin
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: CardPrintScrapeTargetConfig
|
||||
): Collection<CardPrint> {
|
||||
val objectAsListOfMaps = extractWithDiscriminator(element, extractionConfig)
|
||||
): List<CardPrint> {
|
||||
throw NotImplementedException("Not implemented")
|
||||
}
|
||||
|
||||
return objectAsListOfMaps.map {
|
||||
CardPrint.fromMap(it[0])
|
||||
/**
 * Converts every group produced by [extractMultiWithDiscriminator] into a list of
 * [CardPrint]s, keeping both the group order and the order inside each group.
 *
 * @param element element the extraction starts from
 * @param providerConfig provider configuration (not read by this implementation)
 * @param extractionConfig card-print target configuration handed to the extractor
 * @return one list of card prints per extracted group
 */
override fun extractNestedMultiples(
    element: Element,
    providerConfig: ProviderConfig,
    extractionConfig: CardPrintScrapeTargetConfig
): List<List<CardPrint>> =
    extractMultiWithDiscriminator(element, extractionConfig).map { group ->
        group.map { fields -> CardPrint.fromMap(fields) }
    }
|
||||
}
|
||||
@@ -1,9 +1,10 @@
|
||||
package com.rak.service
|
||||
package com.rak.service.extract
|
||||
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
import com.rak.config.model.SetScrapeTargetConfig
|
||||
import com.rak.config.model.SourcesConfig
|
||||
import com.rak.model.card.CardPrint
|
||||
import com.rak.model.exception.NotImplementedException
|
||||
import com.rak.model.set.RegionalSet
|
||||
import jakarta.enterprise.context.ApplicationScoped
|
||||
@@ -35,18 +36,27 @@ class RegionalSetExtractionService(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
): Collection<RegionalSet> {
|
||||
val regionalSetList = extractAsListOfMaps(element, extractionConfig)
|
||||
val cardPrintsInRegionalSet = extractAsListOfMaps(element, extractionConfig)
|
||||
): List<RegionalSet> {
|
||||
val regionalSetList = extractMulti(element, extractionConfig)
|
||||
|
||||
val cardPrints = cardPrintExtractionService.extractMultiple(
|
||||
val cardPrintGroups: List<List<CardPrint>> = cardPrintExtractionService.extractNestedMultiples(
|
||||
element,
|
||||
providerConfig,
|
||||
providerConfig.getTargets().getCardPrintConfiguration().get()
|
||||
)
|
||||
|
||||
return regionalSetList.map {
|
||||
RegionalSet.fromMap(it, cardPrints)
|
||||
// Pair each RegionalSet with its CardPrint group by index
|
||||
return regionalSetList.mapIndexed { index, regionalSetMap ->
|
||||
val cardPrintsForSet = cardPrintGroups.getOrElse(index) { emptyList() }
|
||||
RegionalSet.fromMap(regionalSetMap, cardPrintsForSet)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Nested extraction is not supported for regional sets.
 *
 * @throws NotImplementedException always
 */
override fun extractNestedMultiples(
    element: Element,
    providerConfig: ProviderConfig,
    extractionConfig: SetScrapeTargetConfig
): List<List<RegionalSet>> = throw NotImplementedException("Not implemented")
|
||||
}
|
||||
@@ -1,4 +1,4 @@
|
||||
package com.rak.service
|
||||
package com.rak.service.extract
|
||||
|
||||
import com.rak.config.model.ProviderConfig
|
||||
import com.rak.config.model.ScrapeTargetFieldConfig
|
||||
@@ -36,7 +36,15 @@ class SetExtractionService(
|
||||
element: Element,
|
||||
providerConfig: ProviderConfig,
|
||||
extractionConfig: SetScrapeTargetConfig
|
||||
): Collection<CardSet> {
|
||||
): List<CardSet> {
|
||||
throw NotImplementedException("Not implemented")
|
||||
}
|
||||
|
||||
/**
 * Nested extraction is not supported for card sets.
 *
 * @throws NotImplementedException always
 */
override fun extractNestedMultiples(
    element: Element,
    providerConfig: ProviderConfig,
    extractionConfig: SetScrapeTargetConfig
): List<List<CardSet>> = throw NotImplementedException("Not implemented")
|
||||
}
|
||||
23
src/main/kotlin/com/rak/util/HttpUtil.kt
Normal file
23
src/main/kotlin/com/rak/util/HttpUtil.kt
Normal file
@@ -0,0 +1,23 @@
|
||||
package com.rak.util
|
||||
|
||||
import io.quarkus.rest.client.reactive.ComputedParamContext
|
||||
|
||||
/**
 * Helpers for computing HTTP header values from REST-client method arguments.
 */
class HttpUtil {

    companion object {

        /** Byte-range template: first and last byte position, both inclusive. */
        private const val HEADER_FORMAT_STRING: String = "bytes=%d-%d"

        /**
         * Builds a `bytes=<first>-<last>` value from the arguments of the invoked
         * client method.
         *
         * The method parameter at index 2 is read as the length and the one at
         * index 3 as the offset. The last byte position is `offset + length - 1`
         * and is computed in `Long` so that large offsets cannot overflow.
         *
         * NOTE(review): presumably used as the value of an HTTP `Range` header —
         * confirm against the REST client interface that references this method.
         *
         * @param context invocation context exposing the client method's parameters
         * @return the formatted byte-range string
         * @throws IndexOutOfBoundsException if the method has fewer than four parameters
         * @throws NumberFormatException if either value is not an integral number
         */
        @JvmStatic
        fun computeHeader(context: ComputedParamContext): String {
            val rangeParameters = context.methodParameters().subList(2, 4)

            val fileLength = rangeParameters[0].value().toString().toLong()
            val fileOffset = rangeParameters[1].value().toString().toLong()

            return HEADER_FORMAT_STRING.format(fileOffset, fileOffset + fileLength - 1)
        }
    }
}
|
||||
45
src/main/kotlin/com/rak/util/NDJsonReader.kt
Normal file
45
src/main/kotlin/com/rak/util/NDJsonReader.kt
Normal file
@@ -0,0 +1,45 @@
|
||||
package com.rak.util
|
||||
|
||||
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
|
||||
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
|
||||
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
|
||||
import com.rak.model.cc.CCIndexSuccessResponse
|
||||
import jakarta.ws.rs.Consumes
|
||||
import jakarta.ws.rs.core.MediaType
|
||||
import jakarta.ws.rs.core.MultivaluedMap
|
||||
import jakarta.ws.rs.ext.MessageBodyReader
|
||||
import jakarta.ws.rs.ext.Provider
|
||||
import java.io.BufferedReader
|
||||
import java.io.InputStream
|
||||
import java.io.InputStreamReader
|
||||
import java.lang.reflect.Type
|
||||
|
||||
/**
 * JAX-RS entity reader that maps a `text/x-ndjson` body to a [CCIndexSuccessResponse].
 *
 * Only the first line of the body is parsed; any further lines are ignored.
 */
@Provider
@Consumes("text/x-ndjson") // Handles NDJSON content
class NDJsonReader : MessageBodyReader<CCIndexSuccessResponse> {

    // Jackson mapper with Kotlin support plus java.time (JSR-310) handling.
    private val objectMapper = jacksonObjectMapper().registerModule(JavaTimeModule())

    /**
     * Accepts exactly [CCIndexSuccessResponse] as the target type; the generic type,
     * annotations and media type are not inspected.
     */
    override fun isReadable(
        type: Class<*>?,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?
    ): Boolean = type == CCIndexSuccessResponse::class.java

    /**
     * Reads the first line of [entityStream] and deserializes it.
     *
     * The body is decoded as UTF-8 rather than with the platform default charset.
     * The stream is closed once the first line has been handled.
     *
     * @return the response parsed from the first NDJSON line
     * @throws jakarta.ws.rs.core.NoContentException if the body is empty or its first line is blank
     */
    override fun readFrom(
        type: Class<CCIndexSuccessResponse>,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?,
        httpHeaders: MultivaluedMap<String, String>?,
        entityStream: InputStream
    ): CCIndexSuccessResponse =
        entityStream.bufferedReader(Charsets.UTF_8).use { reader ->
            // readLine() returns null at end-of-stream; fail with a clear, typed error
            // instead of handing null (or an empty string) to Jackson.
            val firstLine = reader.readLine()?.takeIf { it.isNotBlank() }
                ?: throw jakarta.ws.rs.core.NoContentException("NDJSON response body is empty")

            objectMapper.readValue(firstLine, CCIndexSuccessResponse::class.java)
        }
}
|
||||
@@ -3,12 +3,15 @@ package com.rak.util
|
||||
import com.rak.model.XPathTarget
|
||||
import org.jsoup.nodes.Element
|
||||
import org.jsoup.nodes.TextNode
|
||||
import org.jsoup.select.Elements
|
||||
import kotlin.coroutines.CoroutineContext
|
||||
|
||||
class XPathUtil private constructor() {
|
||||
|
||||
companion object {
|
||||
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
|
||||
private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$")
|
||||
private val INDEX_MATCHER: Regex = Regex("\\[(\\w)\\]")
|
||||
|
||||
private fun extractTextFromAttribute(root: Element, xpath: String): String? {
|
||||
val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath)
|
||||
@@ -20,6 +23,21 @@ class XPathUtil private constructor() {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Evaluates [xpath] against [element], with support for a zero-based `[n]` index.
 *
 * When the expression contains a numeric index matched by [INDEX_MATCHER], the part
 * of the expression before that index is evaluated and only the n-th match (counted
 * from zero) is returned. Everything after the index is ignored. If there is no n-th
 * match the result is empty.
 *
 * Expressions without such an index, or whose bracket content is not a number
 * (for example a predicate like `[a]`), are passed to jsoup unchanged.
 *
 * @param element element the expression is evaluated against
 * @param xpath XPath expression, optionally carrying one zero-based index
 * @return the selected elements, possibly empty
 */
private fun selectXpath(element: Element, xpath: String): Elements {
    val indexMatch = INDEX_MATCHER.find(xpath)
    // A non-numeric bracket is an ordinary XPath predicate, not one of our indexes.
    val index = indexMatch?.groupValues?.get(1)?.toIntOrNull()
        ?: return element.selectXpath(xpath)

    val pathBeforeIndex = xpath.substring(0, indexMatch.range.first)
    val selected = element.selectXpath(pathBeforeIndex).getOrNull(index)

    return if (selected != null) Elements(selected) else Elements()
}
|
||||
|
||||
private fun extractTextFromNode(root: Element, xpath: String): String? {
|
||||
return root
|
||||
.selectXpath(xpath, TextNode::class.java)
|
||||
@@ -27,7 +45,7 @@ class XPathUtil private constructor() {
|
||||
}
|
||||
|
||||
fun getNextElement(element: Element, path: String): Element? {
|
||||
return element.selectXpath(path).firstOrNull()
|
||||
return selectXpath(element, path).firstOrNull()
|
||||
}
|
||||
|
||||
fun extractResult(root: Element, path: String): String? {
|
||||
|
||||
@@ -7,8 +7,7 @@ scraper:
|
||||
- id: konami-official
|
||||
name: "Konami Official Database"
|
||||
domain: "yugioh-card.com"
|
||||
url-patterns:
|
||||
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||
targets:
|
||||
card:
|
||||
root:
|
||||
@@ -28,8 +27,7 @@ scraper:
|
||||
- id: ygo-fandom
|
||||
name: "Yu-Gi-Oh Fandom Wiki"
|
||||
domain: "yugioh.fandom.com"
|
||||
url-patterns:
|
||||
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
|
||||
url-pattern: "https://yugioh.fandom.com/wiki/%s"
|
||||
targets:
|
||||
set:
|
||||
root:
|
||||
@@ -41,9 +39,10 @@ scraper:
|
||||
value: "//li/text()"
|
||||
transform:
|
||||
- name: "replace"
|
||||
parameters:
|
||||
- " ("
|
||||
- ""
|
||||
parameters: [
|
||||
" (",
|
||||
""
|
||||
]
|
||||
language:
|
||||
steps:
|
||||
- type: xpath
|
||||
@@ -56,33 +55,44 @@ scraper:
|
||||
value: "//li/abbr/text()"
|
||||
card-print:
|
||||
multi: true
|
||||
root:
|
||||
type: css
|
||||
value: ".tabber.wds-tabber > div"
|
||||
discriminator:
|
||||
direction: asc
|
||||
root:
|
||||
type: css
|
||||
value: ".wds-tabs__tab"
|
||||
steps:
|
||||
- type: xpath
|
||||
value: "//li/div/a/text()"
|
||||
value: ".wds-tab__content"
|
||||
root:
|
||||
type: css
|
||||
value: "table > tbody > tr:has(> td)"
|
||||
id:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: ".//table/tbody/tr[2]/td[1]/a/text()"
|
||||
value: "./td/a[0]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
name:
|
||||
steps:
|
||||
- type: xpath
|
||||
value: ".//table/tbody/tr[2]/td[1]/a/text()"
|
||||
value: "./td/a[1]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
regional-name:
|
||||
fallback:
|
||||
default: "N/A"
|
||||
steps:
|
||||
- type: xpath
|
||||
value: ".//table/tbody/tr[2]/td[2]/a/text()"
|
||||
value: "./td[2]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
transform:
|
||||
- name: "removeInnerQuotes"
|
||||
parameters: []
|
||||
rarity:
|
||||
fallback:
|
||||
default: "N/A"
|
||||
steps:
|
||||
- type: xpath
|
||||
value: ".//table/tbody/tr[2]/td[3]/a/text()"
|
||||
value: "./td/a[3]"
|
||||
- type: xpath
|
||||
value: "./text()"
|
||||
card:
|
||||
name:
|
||||
root:
|
||||
|
||||
Reference in New Issue
Block a user