Basic CommonCrawl integration

This commit is contained in:
2025-06-26 15:08:41 +02:00
parent a6ed98c36e
commit 8f934bc2b9
14 changed files with 301 additions and 13 deletions

View File

@@ -23,8 +23,11 @@ dependencies {
implementation("io.quarkus:quarkus-rest-jackson")
implementation("io.quarkus:quarkus-kotlin")
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
implementation("org.jsoup:jsoup:1.20.1")
implementation("io.quarkus:quarkus-arc")
implementation("org.jsoup:jsoup:1.20.1")
implementation("org.netpreserve.commons:webarchive-commons:2.0.1")
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.19.0")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.19.1")
testImplementation("io.quarkus:quarkus-junit5")
testImplementation("io.rest-assured:rest-assured")
}

View File

@@ -11,8 +11,8 @@ interface ProviderConfig {
fun getName(): String
@WithName("domain")
fun getDomain(): String
@WithName("url-patterns")
fun getUrlPatterns(): Optional<MutableSet<String>>
@WithName("url-pattern")
fun getUrlPattern(): String
@WithName("targets")
fun getTargets(): TargetsConfig

View File

@@ -1,8 +1,10 @@
package com.rak.controller
import com.rak.model.card.Card
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import com.rak.service.CommonCrawlService
import com.rak.service.ScrapeService
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET
@@ -16,10 +18,9 @@ import org.jboss.resteasy.reactive.RestQuery
@Path("/api")
class ScrapeController(
private val scrapeService: ScrapeService,
private val commonCrawlService: CommonCrawlService
) {
@GET
@Path("/{provider}/set")
@Produces(MediaType.APPLICATION_JSON)

View File

@@ -0,0 +1,5 @@
package com.rak.model.cc
data class CCIndexErrorResponse(
val message: String
)

View File

@@ -0,0 +1,22 @@
package com.rak.model.cc
import com.fasterxml.jackson.annotation.JsonProperty
import java.time.Instant
data class CCIndexSuccessResponse(
@JsonProperty("urlkey")
val urlKey: String,
val timestamp: Instant,
val url: String,
val mime: String,
@JsonProperty("mime-detected")
val mimeDetected: String,
val status: String,
val digest: String,
val length: Int,
val offset: Int,
@JsonProperty("filename")
val fileName: String,
val languages: String,
val encoding: String,
)

View File

@@ -0,0 +1,9 @@
package com.rak.model.cc
enum class CCIndices(val indexName: String) {
CC_2025_21("CC-MAIN-2025-21"),
CC_2025_05("CC-MAIN-2024-05"),
CC_2024_46("CC-MAIN-2024-46"),
CC_2024_26("CC-MAIN-2024-26"),
CC_2023_50("CC-MAIN-2023-50");
}

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
class TargetNotFoundException(message: String) : RuntimeException(message)

View File

@@ -3,7 +3,7 @@ package com.rak.model.set
import kotlin.collections.Set
data class CardSet(
val name: String,
var name: String,
val regionalSets: Set<RegionalSet>
) {
companion object {

View File

@@ -0,0 +1,92 @@
package com.rak.service
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.cc.CCIndices
import com.rak.service.client.CommonCrawlRestClient
import io.netty.buffer.ByteBufInputStream
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped
import org.archive.format.http.HttpResponseParser
import org.archive.io.warc.WARCReaderFactory
import org.eclipse.microprofile.rest.client.inject.RestClient
import org.jsoup.helper.DataUtil
import org.jsoup.nodes.Document
@ApplicationScoped
class CommonCrawlService(
@RestClient
private val commonCrawlRestClient: CommonCrawlRestClient
) {
companion object {
private const val INDEX_QUERY_URL: String = "http://index.commoncrawl.org"
private const val DATA_URL: String = "http://data.commoncrawl.org"
}
fun queryIndex(
url: String
): CCIndexSuccessResponse {
return commonCrawlRestClient.queryIndex(
INDEX_QUERY_URL,
url,
CCIndices.CC_2024_46.indexName
)
}
fun queryAllCrawlIndices(
url: String
): List<CCIndexSuccessResponse> {
val responses = mutableListOf<CCIndexSuccessResponse>()
for (crawlName in CCIndices.entries) {
try {
responses.add(commonCrawlRestClient.queryIndex(
INDEX_QUERY_URL,
url,
crawlName.indexName
))
} catch (ex: RuntimeException) {
Log.warn("Error occurred querying crawl '${crawlName.indexName}' for URL $url")
}
}
return responses
}
fun getDocument(
ccIndexSuccessResponse: CCIndexSuccessResponse,
baseUri: String
): Document? {
val fileName = "CC-MAIN-20241106230027-20241107020027-00740.warc.gz"
val buf: ByteBufInputStream = commonCrawlRestClient.getWarcArchive(
DATA_URL,
ccIndexSuccessResponse.fileName,
ccIndexSuccessResponse.length,
ccIndexSuccessResponse.offset
)
val test = WARCReaderFactory.get(
fileName,
buf,
true
)
val parser = HttpResponseParser()
for(record in test) {
val http = parser.parse(record.buffered())
val charSet = http.headers.get("charset")
val doc = DataUtil.load(
http.buffered(),
"UTF-8",
baseUri
)
return doc
}
return null
}
}

View File

@@ -1,22 +1,30 @@
package com.rak.service
import com.rak.config.model.ProviderConfig
import com.rak.model.card.Card
import com.rak.model.exception.NotImplementedException
import com.rak.model.exception.TargetNotFoundException
import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet
import com.rak.service.extract.RegionalSetExtractionService
import com.rak.service.extract.SetExtractionService
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import java.lang.Exception
@ApplicationScoped
class ScrapeService(
private val sourceService: SourceService,
private val setExtractionService: SetExtractionService,
private val regionalSetExtractionService: RegionalSetExtractionService
private val regionalSetExtractionService: RegionalSetExtractionService,
private val commonCrawlService: CommonCrawlService
) {
fun ProviderConfig.buildUrl(targetName: String): String {
return this.getUrlPattern().format(targetName)
}
fun scrapeSet(
provider: String,
@@ -25,9 +33,39 @@ class ScrapeService(
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
val path: String = normalizePath(setName)
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
val url = source.buildUrl(path)
val ccIndexResponses = commonCrawlService.queryAllCrawlIndices(url).sortedBy { it.timestamp }
return setExtractionService.extract(document, source, source.getTargets().getSetConfig().get())
var document: Document? = null
for (indexResponse in ccIndexResponses) {
document = commonCrawlService.getDocument(
indexResponse,
source.getDomain()
)
if (document != null) {
break
}
}
if (document == null) {
// Fallback to Jsoup directly
try {
document = Jsoup.connect(url).get()
} catch(ex: Exception) {
Log.warn("Error occurred during Jsoup query")
throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'")
}
}
return setExtractionService.extract(
document,
source,
source.getTargets().getSetConfig().get()
).apply {
name = setName
}
}
fun scrapeRegionalSet(

View File

@@ -0,0 +1,49 @@
package com.rak.service.client
import com.rak.util.NDJsonReader
import com.rak.model.cc.CCIndexSuccessResponse
import io.netty.buffer.ByteBufInputStream
import io.quarkus.rest.client.reactive.ClientQueryParam
import io.quarkus.rest.client.reactive.NotBody
import io.quarkus.rest.client.reactive.Url
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.PathParam
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam
import org.eclipse.microprofile.rest.client.annotation.RegisterProvider
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
@RegisterRestClient(baseUri = "whatever")
@RegisterProvider(NDJsonReader::class)
interface CommonCrawlRestClient {
@GET
@ClientQueryParam(name = "output", value = ["json"])
@Path("/{index}-index")
@Consumes("text/x-ndjson")
fun queryIndex(
@Url
baseUrl: String,
@QueryParam("url")
queryUrl: String,
@PathParam("index")
indexName: String
): CCIndexSuccessResponse
@GET
@Path("/{fileName}")
@ClientHeaderParam(name = "Range", value = ["{com.rak.util.HttpUtil.computeHeader}"])
fun getWarcArchive(
@Url
baseUrl: String,
@PathParam("fileName")
fileName: String,
@NotBody
fileLength: Int,
@NotBody
fileOffset: Int
): ByteBufInputStream
}

View File

@@ -0,0 +1,23 @@
package com.rak.util
import io.quarkus.rest.client.reactive.ComputedParamContext
class HttpUtil {
companion object {
private const val HEADER_FORMAT_STRING: String = "bytes=%d-%d"
@JvmStatic
fun computeHeader(context: ComputedParamContext): String {
val fileLengthContext = context.methodParameters().subList(2, 4)
val fileLength = fileLengthContext[0].value().toString().toInt()
val fileOffset = fileLengthContext[1].value().toString().toInt()
return HEADER_FORMAT_STRING.format(fileOffset, fileOffset + fileLength - 1)
}
}
}

View File

@@ -0,0 +1,45 @@
package com.rak.util
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.rak.model.cc.CCIndexSuccessResponse
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.core.MediaType
import jakarta.ws.rs.core.MultivaluedMap
import jakarta.ws.rs.ext.MessageBodyReader
import jakarta.ws.rs.ext.Provider
import java.io.BufferedReader
import java.io.InputStream
import java.io.InputStreamReader
import java.lang.reflect.Type
@Provider
@Consumes("text/x-ndjson") // Handles NDJSON content
class NDJsonReader : MessageBodyReader<CCIndexSuccessResponse> {
private val objectMapper = jacksonObjectMapper().registerModule(JavaTimeModule())
override fun isReadable(
type: Class<*>?,
genericType: Type?,
annotations: Array<out Annotation>?,
mediaType: MediaType?
): Boolean {
return type == CCIndexSuccessResponse::class.java
}
override fun readFrom(
type: Class<CCIndexSuccessResponse>,
genericType: Type?,
annotations: Array<out Annotation>?,
mediaType: MediaType?,
httpHeaders: MultivaluedMap<String, String>?,
entityStream: InputStream
): CCIndexSuccessResponse {
BufferedReader(InputStreamReader(entityStream)).use { reader ->
val firstLine = reader.readLine()
return objectMapper.readValue(firstLine, CCIndexSuccessResponse::class.java)
}
}
}

View File

@@ -7,8 +7,7 @@ scraper:
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
url-patterns:
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
targets:
card:
root:
@@ -28,8 +27,7 @@ scraper:
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com"
url-patterns:
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
url-pattern: "https://yugioh.fandom.com/wiki/%s"
targets:
set:
root: