diff --git a/build.gradle.kts b/build.gradle.kts index 38e29b5..1210b3e 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -23,8 +23,12 @@ dependencies { implementation("io.quarkus:quarkus-rest-jackson") implementation("io.quarkus:quarkus-kotlin") implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8") - implementation("org.jsoup:jsoup:1.20.1") implementation("io.quarkus:quarkus-arc") + implementation("org.jsoup:jsoup:1.20.1") + implementation("org.netpreserve.commons:webarchive-commons:2.0.1") + // No explicit Jackson versions: the two modules were pinned to different releases (2.19.0 / 2.19.1). Assumes the platform BOM that versions the other entries also manages Jackson - confirm. + implementation("com.fasterxml.jackson.module:jackson-module-kotlin") + implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310") testImplementation("io.quarkus:quarkus-junit5") testImplementation("io.rest-assured:rest-assured") } diff --git a/src/main/kotlin/com/rak/config/model/ProviderConfig.kt b/src/main/kotlin/com/rak/config/model/ProviderConfig.kt index 919f0a4..8c77119 100644 --- a/src/main/kotlin/com/rak/config/model/ProviderConfig.kt +++ b/src/main/kotlin/com/rak/config/model/ProviderConfig.kt @@ -11,8 +11,8 @@ interface ProviderConfig { fun getName(): String @WithName("domain") fun getDomain(): String - @WithName("url-patterns") - fun getUrlPatterns(): Optional> + @WithName("url-pattern") + fun getUrlPattern(): String @WithName("targets") fun getTargets(): TargetsConfig diff --git a/src/main/kotlin/com/rak/controller/ScrapeController.kt b/src/main/kotlin/com/rak/controller/ScrapeController.kt index 86eeb61..841b710 100644 --- a/src/main/kotlin/com/rak/controller/ScrapeController.kt +++ b/src/main/kotlin/com/rak/controller/ScrapeController.kt @@ -1,8 +1,10 @@ package com.rak.controller import com.rak.model.card.Card +import com.rak.model.cc.CCIndexSuccessResponse import com.rak.model.set.CardSet import com.rak.model.set.RegionalSet +import com.rak.service.CommonCrawlService import com.rak.service.ScrapeService import jakarta.ws.rs.Consumes import jakarta.ws.rs.GET @@ -16,10 +18,9 @@ import org.jboss.resteasy.reactive.RestQuery 
@Path("/api") class ScrapeController( private val scrapeService: ScrapeService, + private val commonCrawlService: CommonCrawlService ) { - - @GET @Path("/{provider}/set") @Produces(MediaType.APPLICATION_JSON) diff --git a/src/main/kotlin/com/rak/model/cc/CCIndexErrorResponse.kt b/src/main/kotlin/com/rak/model/cc/CCIndexErrorResponse.kt new file mode 100644 index 0000000..b1f97b8 --- /dev/null +++ b/src/main/kotlin/com/rak/model/cc/CCIndexErrorResponse.kt @@ -0,0 +1,5 @@ +package com.rak.model.cc + +data class CCIndexErrorResponse( + val message: String +) \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/cc/CCIndexSuccessResponse.kt b/src/main/kotlin/com/rak/model/cc/CCIndexSuccessResponse.kt new file mode 100644 index 0000000..920ad3e --- /dev/null +++ b/src/main/kotlin/com/rak/model/cc/CCIndexSuccessResponse.kt @@ -0,0 +1,30 @@ +package com.rak.model.cc + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties +import com.fasterxml.jackson.annotation.JsonProperty +import java.time.Instant + +/** + * One record returned by a Common Crawl index query. + * + * Unknown JSON properties are ignored and [languages] / [encoding] may be absent, so a record with + * extra or missing optional fields still deserializes (assumes not every record carries both - confirm). + */ +@JsonIgnoreProperties(ignoreUnknown = true) +data class CCIndexSuccessResponse( + @JsonProperty("urlkey") + val urlKey: String, + val timestamp: Instant, + val url: String, + val mime: String, + @JsonProperty("mime-detected") + val mimeDetected: String, + val status: String, + val digest: String, + val length: Int, + val offset: Int, + @JsonProperty("filename") + val fileName: String, + val languages: String? = null, + val encoding: String? = null, +) \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/cc/CCIndices.kt b/src/main/kotlin/com/rak/model/cc/CCIndices.kt new file mode 100644 index 0000000..3294bf4 --- /dev/null +++ b/src/main/kotlin/com/rak/model/cc/CCIndices.kt @@ -0,0 +1,13 @@ +package com.rak.model.cc + +/** + * Common Crawl crawl identifiers, listed newest first. + * Each constant name mirrors the year and week of the `CC-MAIN-<year>-<week>` id it carries. + */ +enum class CCIndices(val indexName: String) { + CC_2025_21("CC-MAIN-2025-21"), + CC_2025_05("CC-MAIN-2025-05"), + CC_2024_46("CC-MAIN-2024-46"), + CC_2024_26("CC-MAIN-2024-26"), + CC_2023_50("CC-MAIN-2023-50"); +} \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/exception/TargetNotFoundException.kt 
b/src/main/kotlin/com/rak/model/exception/TargetNotFoundException.kt new file mode 100644 index 0000000..f767cde --- /dev/null +++ b/src/main/kotlin/com/rak/model/exception/TargetNotFoundException.kt @@ -0,0 +1,3 @@ +package com.rak.model.exception + +class TargetNotFoundException(message: String) : RuntimeException(message) \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/set/CardSet.kt b/src/main/kotlin/com/rak/model/set/CardSet.kt index 2685bed..0a9fd49 100644 --- a/src/main/kotlin/com/rak/model/set/CardSet.kt +++ b/src/main/kotlin/com/rak/model/set/CardSet.kt @@ -3,7 +3,7 @@ package com.rak.model.set import kotlin.collections.Set data class CardSet( - val name: String, + var name: String, val regionalSets: Set ) { companion object { diff --git a/src/main/kotlin/com/rak/service/CommonCrawlService.kt b/src/main/kotlin/com/rak/service/CommonCrawlService.kt new file mode 100644 index 0000000..11c5a41 --- /dev/null +++ b/src/main/kotlin/com/rak/service/CommonCrawlService.kt @@ -0,0 +1,94 @@ +package com.rak.service + +import com.rak.model.cc.CCIndexSuccessResponse +import com.rak.model.cc.CCIndices +import com.rak.service.client.CommonCrawlRestClient +import io.netty.buffer.ByteBufInputStream +import io.quarkus.logging.Log +import jakarta.enterprise.context.ApplicationScoped +import org.archive.format.http.HttpResponseParser +import org.archive.io.warc.WARCReaderFactory +import org.eclipse.microprofile.rest.client.inject.RestClient +import org.jsoup.helper.DataUtil +import org.jsoup.nodes.Document + + +/** Queries the Common Crawl index and loads archived pages as Jsoup documents. */ +@ApplicationScoped +class CommonCrawlService( + @RestClient + private val commonCrawlRestClient: CommonCrawlRestClient +) { + + companion object { + private const val INDEX_QUERY_URL: String = "https://index.commoncrawl.org" + private const val DATA_URL: String = "https://data.commoncrawl.org" + } + + /** Queries the single crawl [CCIndices.CC_2024_46] for [url]. */ + fun queryIndex( + url: String + ): CCIndexSuccessResponse { + return commonCrawlRestClient.queryIndex( + INDEX_QUERY_URL, + url, 
+ CCIndices.CC_2024_46.indexName + ) + } + + /** + * Queries every crawl in [CCIndices] for [url]. + * + * A crawl whose lookup throws a [RuntimeException] is logged and skipped, so the result may be empty. + */ + fun queryAllCrawlIndices( + url: String + ): List<CCIndexSuccessResponse> { + val responses = mutableListOf<CCIndexSuccessResponse>() + for (crawlName in CCIndices.entries) { + try { + responses.add(commonCrawlRestClient.queryIndex( + INDEX_QUERY_URL, + url, + crawlName.indexName + )) + } catch (ex: RuntimeException) { + // Pass the exception along so the log shows why the lookup failed. + Log.warn("Error occurred querying crawl '${crawlName.indexName}' for URL $url", ex) + } + } + + return responses + } + + /** + * Downloads the byte range named by [ccIndexSuccessResponse] and parses the HTTP response of its first record. + * + * @param baseUri base URI passed to Jsoup when loading the document + * @return the parsed document, or `null` when the archive yields no record + */ + fun getDocument( + ccIndexSuccessResponse: CCIndexSuccessResponse, + baseUri: String + ): Document? { + val buf: ByteBufInputStream = commonCrawlRestClient.getWarcArchive( + DATA_URL, + ccIndexSuccessResponse.fileName, + ccIndexSuccessResponse.length, + ccIndexSuccessResponse.offset + ) + + // use {} closes the downloaded stream on every exit path. + buf.use { stream -> + // Name the reader after the archive that was fetched, not a fixed sample file. + val records = WARCReaderFactory.get(ccIndexSuccessResponse.fileName, stream, true).iterator() + if (!records.hasNext()) { + return null + } + + val http = HttpResponseParser().parse(records.next().buffered()) + return DataUtil.load(http.buffered(), "UTF-8", baseUri) + } + } + +} \ No newline at end of file diff --git a/src/main/kotlin/com/rak/service/ScrapeService.kt b/src/main/kotlin/com/rak/service/ScrapeService.kt index 268e903..e9bc480 100644 --- a/src/main/kotlin/com/rak/service/ScrapeService.kt +++ b/src/main/kotlin/com/rak/service/ScrapeService.kt @@ -1,22 +1,30 @@ package com.rak.service +import com.rak.config.model.ProviderConfig import com.rak.model.card.Card import com.rak.model.exception.NotImplementedException +import com.rak.model.exception.TargetNotFoundException import com.rak.model.set.CardSet import com.rak.model.set.RegionalSet import com.rak.service.extract.RegionalSetExtractionService import com.rak.service.extract.SetExtractionService +import io.quarkus.logging.Log import jakarta.enterprise.context.ApplicationScoped import org.jsoup.Jsoup import org.jsoup.nodes.Document +import java.lang.Exception 
@ApplicationScoped class ScrapeService( private val sourceService: SourceService, private val setExtractionService: SetExtractionService, - private val regionalSetExtractionService: RegionalSetExtractionService + private val regionalSetExtractionService: RegionalSetExtractionService, + private val commonCrawlService: CommonCrawlService ) { + fun ProviderConfig.buildUrl(targetName: String): String { + return this.getUrlPattern().format(targetName) + } fun scrapeSet( provider: String, @@ -25,9 +33,28 @@ class ScrapeService( val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found") val path: String = normalizePath(setName) - val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get() + val url = source.buildUrl(path) + // Newest capture first, so the freshest archived copy is tried before older ones. + val ccIndexResponses = commonCrawlService.queryAllCrawlIndices(url).sortedByDescending { it.timestamp } - return setExtractionService.extract(document, source, source.getTargets().getSetConfig().get()) 
+ + // First capture that yields a document wins; otherwise fetch the live page with Jsoup. + // The page URL (not the bare domain) is the base URI, the same location the Jsoup fallback fetches. + val document: Document = ccIndexResponses.firstNotNullOfOrNull { indexResponse -> + commonCrawlService.getDocument(indexResponse, url) + } ?: try { + Jsoup.connect(url).get() + } catch (ex: Exception) { + Log.warn("Error occurred during Jsoup query", ex) + throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'") + } + + // copy() returns the renamed set without mutating the extracted instance. + return setExtractionService.extract( + document, + source, + source.getTargets().getSetConfig().get() + ).copy(name = setName) } fun scrapeRegionalSet( diff --git a/src/main/kotlin/com/rak/service/client/CommonCrawlRestClient.kt b/src/main/kotlin/com/rak/service/client/CommonCrawlRestClient.kt new file mode 100644 index 0000000..a8332e5 --- /dev/null +++ b/src/main/kotlin/com/rak/service/client/CommonCrawlRestClient.kt @@ -0,0 +1,49 @@ +package com.rak.service.client + +import com.rak.util.NDJsonReader +import com.rak.model.cc.CCIndexSuccessResponse +import io.netty.buffer.ByteBufInputStream +import io.quarkus.rest.client.reactive.ClientQueryParam +import io.quarkus.rest.client.reactive.NotBody +import io.quarkus.rest.client.reactive.Url +import jakarta.ws.rs.Consumes +import jakarta.ws.rs.GET +import jakarta.ws.rs.Path +import jakarta.ws.rs.PathParam +import jakarta.ws.rs.QueryParam +import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam +import org.eclipse.microprofile.rest.client.annotation.RegisterProvider +import org.eclipse.microprofile.rest.client.inject.RegisterRestClient + +@RegisterRestClient(baseUri = "whatever") +@RegisterProvider(NDJsonReader::class) +interface CommonCrawlRestClient { + + @GET + @ClientQueryParam(name = "output", value = ["json"]) + @Path("/{index}-index") + @Consumes("text/x-ndjson") + fun queryIndex( + @Url + baseUrl: String, + @QueryParam("url") + queryUrl: String, + @PathParam("index") + indexName: String + ): CCIndexSuccessResponse + + @GET + 
@Path("/{fileName}") + @ClientHeaderParam(name = "Range", value = ["{com.rak.util.HttpUtil.computeHeader}"]) + fun getWarcArchive( + @Url + baseUrl: String, + @PathParam("fileName") + fileName: String, + @NotBody + fileLength: Int, + @NotBody + fileOffset: Int + ): ByteBufInputStream + +} \ No newline at end of file diff --git a/src/main/kotlin/com/rak/util/HttpUtil.kt b/src/main/kotlin/com/rak/util/HttpUtil.kt new file mode 100644 index 0000000..29c42d6 --- /dev/null +++ b/src/main/kotlin/com/rak/util/HttpUtil.kt @@ -0,0 +1,31 @@ +package com.rak.util + +import io.quarkus.rest.client.reactive.ComputedParamContext + +class HttpUtil { + + companion object { + + private const val HEADER_FORMAT_STRING: String = "bytes=%d-%d" + + /** + * Computes the `Range` header value for a rest-client call. + * + * Reads method parameters 2 and 3 as length and offset, so the annotated method must keep + * the `(baseUrl, fileName, fileLength, fileOffset)` parameter order. + * + * @return `bytes=<offset>-<offset + length - 1>` + */ + @JvmStatic + fun computeHeader(context: ComputedParamContext): String { + val fileLengthContext = context.methodParameters().subList(2, 4) + + val fileLength = fileLengthContext[0].value().toString().toInt() + val fileOffset = fileLengthContext[1].value().toString().toInt() + + return HEADER_FORMAT_STRING.format(fileOffset, fileOffset + fileLength - 1) + } + + } + +} \ No newline at end of file diff --git a/src/main/kotlin/com/rak/util/NDJsonReader.kt b/src/main/kotlin/com/rak/util/NDJsonReader.kt new file mode 100644 index 0000000..f4c01d6 --- /dev/null +++ b/src/main/kotlin/com/rak/util/NDJsonReader.kt @@ -0,0 +1,57 @@ +package com.rak.util + +import com.fasterxml.jackson.datatype.jsr310.JSR310Module +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule +import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper +import com.rak.model.cc.CCIndexSuccessResponse +import jakarta.ws.rs.Consumes +import jakarta.ws.rs.ProcessingException +import jakarta.ws.rs.core.MediaType +import jakarta.ws.rs.core.MultivaluedMap +import jakarta.ws.rs.ext.MessageBodyReader +import jakarta.ws.rs.ext.Provider +import java.io.BufferedReader +import java.io.InputStream +import java.io.InputStreamReader +import java.lang.reflect.Type + +/** + * Reads a `text/x-ndjson` body into a single [CCIndexSuccessResponse]. + * + * Only the first line of the body is parsed; any further lines are not read. + */ +@Provider 
+@Consumes("text/x-ndjson") // Handles NDJSON content +class NDJsonReader : MessageBodyReader<CCIndexSuccessResponse> { + + private val objectMapper = jacksonObjectMapper().registerModule(JavaTimeModule()) + + override fun isReadable( + type: Class<*>?, + genericType: Type?, + annotations: Array<Annotation>?, + mediaType: MediaType? + ): Boolean { + return type == CCIndexSuccessResponse::class.java + } + + /** + * @throws ProcessingException when the body is empty or its first line is blank + */ + override fun readFrom( + type: Class<CCIndexSuccessResponse>, + genericType: Type?, + annotations: Array<Annotation>?, + mediaType: MediaType?, + httpHeaders: MultivaluedMap<String, String>?, + entityStream: InputStream + ): CCIndexSuccessResponse { + // Decode as UTF-8 explicitly instead of relying on the JVM default charset. + BufferedReader(InputStreamReader(entityStream, Charsets.UTF_8)).use { reader -> + // readLine() returns null on an empty body, which readValue() cannot accept. + val firstLine = reader.readLine()?.takeIf { it.isNotBlank() } + ?: throw ProcessingException("Common Crawl index returned an empty NDJSON body") + return objectMapper.readValue(firstLine, CCIndexSuccessResponse::class.java) + } + } +} \ No newline at end of file diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 640d7c8..7e72b04 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -7,8 +7,8 @@ scraper: - id: konami-official name: "Konami Official Database" domain: "yugioh-card.com" - url-patterns: - - "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$" + # FIXME: still a regex, but url-pattern is now expanded with String.format and needs a %s placeholder (compare ygo-fandom below). + url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$" targets: card: root: @@ -28,8 +28,7 @@ scraper: - id: ygo-fandom name: "Yu-Gi-Oh Fandom Wiki" domain: "yugioh.fandom.com" - url-patterns: - - "^https://yugioh\\.fandom\\.com/wiki/.*$" + url-pattern: "https://yugioh.fandom.com/wiki/%s" targets: set: root: