Basic CommonCrawl integration

This commit is contained in:
2025-06-26 15:08:41 +02:00
parent a6ed98c36e
commit 8f934bc2b9
14 changed files with 301 additions and 13 deletions

View File

@@ -23,8 +23,11 @@ dependencies {
implementation("io.quarkus:quarkus-rest-jackson") implementation("io.quarkus:quarkus-rest-jackson")
implementation("io.quarkus:quarkus-kotlin") implementation("io.quarkus:quarkus-kotlin")
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8") implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
implementation("org.jsoup:jsoup:1.20.1")
implementation("io.quarkus:quarkus-arc") implementation("io.quarkus:quarkus-arc")
implementation("org.jsoup:jsoup:1.20.1")
implementation("org.netpreserve.commons:webarchive-commons:2.0.1")
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.19.0")
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.19.1")
testImplementation("io.quarkus:quarkus-junit5") testImplementation("io.quarkus:quarkus-junit5")
testImplementation("io.rest-assured:rest-assured") testImplementation("io.rest-assured:rest-assured")
} }

View File

@@ -11,8 +11,8 @@ interface ProviderConfig {
fun getName(): String fun getName(): String
@WithName("domain") @WithName("domain")
fun getDomain(): String fun getDomain(): String
@WithName("url-patterns") @WithName("url-pattern")
fun getUrlPatterns(): Optional<MutableSet<String>> fun getUrlPattern(): String
@WithName("targets") @WithName("targets")
fun getTargets(): TargetsConfig fun getTargets(): TargetsConfig

View File

@@ -1,8 +1,10 @@
package com.rak.controller package com.rak.controller
import com.rak.model.card.Card import com.rak.model.card.Card
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.set.CardSet import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet import com.rak.model.set.RegionalSet
import com.rak.service.CommonCrawlService
import com.rak.service.ScrapeService import com.rak.service.ScrapeService
import jakarta.ws.rs.Consumes import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET import jakarta.ws.rs.GET
@@ -16,10 +18,9 @@ import org.jboss.resteasy.reactive.RestQuery
@Path("/api") @Path("/api")
class ScrapeController( class ScrapeController(
private val scrapeService: ScrapeService, private val scrapeService: ScrapeService,
private val commonCrawlService: CommonCrawlService
) { ) {
@GET @GET
@Path("/{provider}/set") @Path("/{provider}/set")
@Produces(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON)

View File

@@ -0,0 +1,5 @@
package com.rak.model.cc
/**
 * Payload that carries a single textual [message].
 *
 * NOTE(review): judging by the name this models the error body of a Common Crawl
 * index query; confirm the intended use against the code that consumes it.
 */
data class CCIndexErrorResponse(val message: String)

View File

@@ -0,0 +1,22 @@
package com.rak.model.cc
import com.fasterxml.jackson.annotation.JsonProperty
import java.time.Instant
/**
 * One record of a Common Crawl index query result, bound from JSON by Jackson.
 *
 * JSON keys whose spelling differs from the Kotlin property name are mapped with
 * [JsonProperty]. Keys that have no matching property are ignored, and the descriptive
 * fields [mimeDetected], [languages] and [encoding] default to an empty string, so a
 * record that carries extra keys or lacks those optional ones still binds instead of
 * failing the whole lookup (Jackson's Kotlin module applies constructor defaults for
 * absent keys).
 *
 * NOTE(review): the index appears to deliver `timestamp` as a 14-digit
 * `yyyyMMddHHmmss` string. Default `Instant` deserialization treats an all-digit string
 * as an epoch value, so the resulting instant is presumably only usable for ordering,
 * not as a real date — confirm and add an explicit format if real dates are needed.
 *
 * @property urlKey value of the `urlkey` JSON key
 * @property timestamp capture time (see the note above)
 * @property url captured URL
 * @property mime MIME type reported for the capture
 * @property mimeDetected value of the `mime-detected` JSON key; empty when absent
 * @property status HTTP status of the capture, kept as text
 * @property digest content digest of the capture
 * @property length number of bytes the record occupies in the archive file
 * @property offset byte offset of the record inside the archive file
 * @property fileName value of the `filename` JSON key, i.e. the archive file path
 * @property languages detected languages; empty when absent
 * @property encoding detected character encoding; empty when absent
 */
@com.fasterxml.jackson.annotation.JsonIgnoreProperties(ignoreUnknown = true)
data class CCIndexSuccessResponse(
    @JsonProperty("urlkey")
    val urlKey: String,
    val timestamp: Instant,
    val url: String,
    val mime: String,
    @JsonProperty("mime-detected")
    val mimeDetected: String = "",
    val status: String,
    val digest: String,
    val length: Int,
    val offset: Int,
    @JsonProperty("filename")
    val fileName: String,
    val languages: String = "",
    val encoding: String = "",
)

View File

@@ -0,0 +1,9 @@
package com.rak.model.cc
/**
 * Common Crawl crawl identifiers known to the application, listed newest first.
 *
 * Each constant name mirrors its identifier (`CC_<year>_<week>` for
 * `CC-MAIN-<year>-<week>`), so the two must always agree.
 *
 * @property indexName crawl identifier string handed to the index query
 */
enum class CCIndices(val indexName: String) {
    CC_2025_21("CC-MAIN-2025-21"),
    // Was "CC-MAIN-2024-05", which contradicted both the constant name and the ordering.
    CC_2025_05("CC-MAIN-2025-05"),
    CC_2024_46("CC-MAIN-2024-46"),
    CC_2024_26("CC-MAIN-2024-26"),
    CC_2023_50("CC-MAIN-2023-50"),
}

View File

@@ -0,0 +1,3 @@
package com.rak.model.exception
/**
 * Unchecked exception whose [message] describes the target that could not be found.
 */
class TargetNotFoundException(
    message: String,
) : RuntimeException(message)

View File

@@ -3,7 +3,7 @@ package com.rak.model.set
import kotlin.collections.Set import kotlin.collections.Set
data class CardSet( data class CardSet(
val name: String, var name: String,
val regionalSets: Set<RegionalSet> val regionalSets: Set<RegionalSet>
) { ) {
companion object { companion object {

View File

@@ -0,0 +1,92 @@
package com.rak.service
import com.rak.model.cc.CCIndexSuccessResponse
import com.rak.model.cc.CCIndices
import com.rak.service.client.CommonCrawlRestClient
import io.netty.buffer.ByteBufInputStream
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped
import org.archive.format.http.HttpResponseParser
import org.archive.io.warc.WARCReaderFactory
import org.eclipse.microprofile.rest.client.inject.RestClient
import org.jsoup.helper.DataUtil
import org.jsoup.nodes.Document
/**
 * Looks URLs up in the Common Crawl index and loads the archived capture of a page
 * as a jsoup [Document].
 */
@ApplicationScoped
class CommonCrawlService(
    @RestClient
    private val commonCrawlRestClient: CommonCrawlRestClient
) {

    /**
     * Queries the index of the single crawl [CCIndices.CC_2024_46] for [url].
     *
     * @param url URL to look up
     * @return the index record produced by the REST client
     */
    fun queryIndex(
        url: String
    ): CCIndexSuccessResponse {
        return commonCrawlRestClient.queryIndex(
            INDEX_QUERY_URL,
            url,
            CCIndices.CC_2024_46.indexName
        )
    }

    /**
     * Queries every crawl in [CCIndices] for [url], in declaration order.
     *
     * A crawl whose query throws a [RuntimeException] is logged (including the failure
     * message) and skipped, so one failing crawl does not abort the others.
     *
     * @param url URL to look up
     * @return the records of the crawls that answered; empty when none did
     */
    fun queryAllCrawlIndices(
        url: String
    ): List<CCIndexSuccessResponse> {
        return CCIndices.entries.mapNotNull { crawl ->
            try {
                commonCrawlRestClient.queryIndex(
                    INDEX_QUERY_URL,
                    url,
                    crawl.indexName
                )
            } catch (ex: RuntimeException) {
                Log.warn("Error occurred querying crawl '${crawl.indexName}' for URL $url: ${ex.message}")
                null
            }
        }
    }

    /**
     * Downloads the byte range described by [ccIndexSuccessResponse] and parses the
     * first WARC record in it into a [Document].
     *
     * The record payload is parsed as an HTTP response and its body is loaded by jsoup
     * as UTF-8. The downloaded stream is closed before returning.
     *
     * NOTE(review): the charset is fixed to UTF-8 although the index record carries an
     * `encoding` value — confirm whether that should be honoured.
     *
     * @param ccIndexSuccessResponse index record naming the archive file, offset and length
     * @param baseUri base URI handed to jsoup for resolving relative links
     * @return the parsed document, or `null` when the archive yields no record
     */
    fun getDocument(
        ccIndexSuccessResponse: CCIndexSuccessResponse,
        baseUri: String
    ): Document? {
        val archiveStream: ByteBufInputStream = commonCrawlRestClient.getWarcArchive(
            DATA_URL,
            ccIndexSuccessResponse.fileName,
            ccIndexSuccessResponse.length,
            ccIndexSuccessResponse.offset
        )
        return archiveStream.use { stream ->
            // Pass the record's own file name: the reader factory inspects the name,
            // so a hard-coded sample name would misdescribe any other archive.
            val warcReader = WARCReaderFactory.get(
                ccIndexSuccessResponse.fileName,
                stream,
                true
            )
            val firstRecord = warcReader.firstOrNull() ?: return@use null
            val httpResponse = HttpResponseParser().parse(firstRecord.buffered())
            DataUtil.load(
                httpResponse.buffered(),
                "UTF-8",
                baseUri
            )
        }
    }

    companion object {
        private const val INDEX_QUERY_URL: String = "http://index.commoncrawl.org"
        private const val DATA_URL: String = "http://data.commoncrawl.org"
    }
}

View File

@@ -1,22 +1,30 @@
package com.rak.service package com.rak.service
import com.rak.config.model.ProviderConfig
import com.rak.model.card.Card import com.rak.model.card.Card
import com.rak.model.exception.NotImplementedException import com.rak.model.exception.NotImplementedException
import com.rak.model.exception.TargetNotFoundException
import com.rak.model.set.CardSet import com.rak.model.set.CardSet
import com.rak.model.set.RegionalSet import com.rak.model.set.RegionalSet
import com.rak.service.extract.RegionalSetExtractionService import com.rak.service.extract.RegionalSetExtractionService
import com.rak.service.extract.SetExtractionService import com.rak.service.extract.SetExtractionService
import io.quarkus.logging.Log
import jakarta.enterprise.context.ApplicationScoped import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.Jsoup import org.jsoup.Jsoup
import org.jsoup.nodes.Document import org.jsoup.nodes.Document
import java.lang.Exception
@ApplicationScoped @ApplicationScoped
class ScrapeService( class ScrapeService(
private val sourceService: SourceService, private val sourceService: SourceService,
private val setExtractionService: SetExtractionService, private val setExtractionService: SetExtractionService,
private val regionalSetExtractionService: RegionalSetExtractionService private val regionalSetExtractionService: RegionalSetExtractionService,
private val commonCrawlService: CommonCrawlService
) { ) {
/**
 * Builds a URL by substituting [targetName] into this provider's URL pattern,
 * which is treated as a [String.format] template.
 */
fun ProviderConfig.buildUrl(targetName: String): String = getUrlPattern().format(targetName)
fun scrapeSet( fun scrapeSet(
provider: String, provider: String,
@@ -25,9 +33,39 @@ class ScrapeService(
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found") val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
val path: String = normalizePath(setName) val path: String = normalizePath(setName)
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get() val url = source.buildUrl(path)
val ccIndexResponses = commonCrawlService.queryAllCrawlIndices(url).sortedBy { it.timestamp }
return setExtractionService.extract(document, source, source.getTargets().getSetConfig().get()) var document: Document? = null
for (indexResponse in ccIndexResponses) {
document = commonCrawlService.getDocument(
indexResponse,
source.getDomain()
)
if (document != null) {
break
}
}
if (document == null) {
// Fallback to Jsoup directly
try {
document = Jsoup.connect(url).get()
} catch(ex: Exception) {
Log.warn("Error occurred during Jsoup query")
throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'")
}
}
return setExtractionService.extract(
document,
source,
source.getTargets().getSetConfig().get()
).apply {
name = setName
}
} }
fun scrapeRegionalSet( fun scrapeRegionalSet(

View File

@@ -0,0 +1,49 @@
package com.rak.service.client
import com.rak.util.NDJsonReader
import com.rak.model.cc.CCIndexSuccessResponse
import io.netty.buffer.ByteBufInputStream
import io.quarkus.rest.client.reactive.ClientQueryParam
import io.quarkus.rest.client.reactive.NotBody
import io.quarkus.rest.client.reactive.Url
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.PathParam
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam
import org.eclipse.microprofile.rest.client.annotation.RegisterProvider
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
/**
 * Declarative REST client for the Common Crawl index and data endpoints.
 *
 * Every method receives its target host through a `@Url` parameter; the `baseUri`
 * registered here is only a placeholder value.
 */
@RegisterRestClient(baseUri = "whatever")
@RegisterProvider(NDJsonReader::class)
interface CommonCrawlRestClient {

    /**
     * Issues `GET /{indexName}-index?url={queryUrl}&output=json` against [baseUrl].
     *
     * NOTE(review): `@Consumes` on a client method describes the request body type;
     * if the intent was to declare the response type, `@Produces` may be what is wanted — confirm.
     *
     * @param baseUrl host to send the request to
     * @param queryUrl URL to look up, sent as the `url` query parameter
     * @param indexName crawl identifier placed in the path
     */
    @GET
    @Path("/{index}-index")
    @Consumes("text/x-ndjson")
    @ClientQueryParam(name = "output", value = ["json"])
    fun queryIndex(
        @Url baseUrl: String,
        @QueryParam("url") queryUrl: String,
        @PathParam("index") indexName: String,
    ): CCIndexSuccessResponse

    /**
     * Issues `GET /{fileName}` against [baseUrl] with a computed `Range` header.
     *
     * [fileLength] and [fileOffset] are not sent as a body; they exist so that
     * `com.rak.util.HttpUtil.computeHeader` can read them. That helper reads the third
     * and fourth parameter by position, so the parameter order must not change.
     *
     * @param baseUrl host to send the request to
     * @param fileName archive path placed in the request path
     * @param fileLength number of bytes to request
     * @param fileOffset first byte to request
     */
    @GET
    @Path("/{fileName}")
    @ClientHeaderParam(name = "Range", value = ["{com.rak.util.HttpUtil.computeHeader}"])
    fun getWarcArchive(
        @Url baseUrl: String,
        @PathParam("fileName") fileName: String,
        @NotBody fileLength: Int,
        @NotBody fileOffset: Int,
    ): ByteBufInputStream
}

View File

@@ -0,0 +1,23 @@
package com.rak.util
import io.quarkus.rest.client.reactive.ComputedParamContext
/**
 * Helpers that compute HTTP header values for REST client calls.
 */
class HttpUtil {

    companion object {
        // Positions of the length and offset arguments in the client method's parameter list.
        private const val LENGTH_PARAMETER_INDEX: Int = 2
        private const val OFFSET_PARAMETER_INDEX: Int = 3

        /**
         * Computes the value of a byte-range header from the invoked client method's arguments.
         *
         * The third argument is read as the number of bytes and the fourth as the first
         * byte; the result is `bytes=<offset>-<offset + length - 1>`.
         *
         * The value is assembled with a string template rather than [String.format],
         * because `%d` is rendered with the default locale's digits and a header must
         * always use ASCII digits. The arithmetic is done in `Long` so that
         * `offset + length` cannot wrap around.
         *
         * @param context invocation context giving access to the method arguments
         * @return the header value
         * @throws IllegalArgumentException if the offset is negative or the length is not positive
         * @throws NumberFormatException if either argument is not an integer
         * @throws IndexOutOfBoundsException if the method has fewer than four parameters
         */
        @JvmStatic
        fun computeHeader(context: ComputedParamContext): String {
            val parameters = context.methodParameters()
            val fileLength = parameters[LENGTH_PARAMETER_INDEX].value().toString().toLong()
            val fileOffset = parameters[OFFSET_PARAMETER_INDEX].value().toString().toLong()
            require(fileOffset >= 0) { "Range offset must not be negative: $fileOffset" }
            require(fileLength > 0) { "Range length must be positive: $fileLength" }

            // Byte ranges are inclusive on both ends, hence the "- 1".
            val lastByte = fileOffset + fileLength - 1
            return "bytes=$fileOffset-$lastByte"
        }
    }
}

View File

@@ -0,0 +1,45 @@
package com.rak.util
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
import com.rak.model.cc.CCIndexSuccessResponse
import jakarta.ws.rs.Consumes
import jakarta.ws.rs.core.MediaType
import jakarta.ws.rs.core.MultivaluedMap
import jakarta.ws.rs.ext.MessageBodyReader
import jakarta.ws.rs.ext.Provider
import java.io.BufferedReader
import java.io.InputStream
import java.io.InputStreamReader
import java.lang.reflect.Type
/**
 * Message body reader that turns a newline-delimited JSON response into a single
 * [CCIndexSuccessResponse].
 *
 * Only the first non-blank line of the body is bound; any further lines are discarded.
 */
@Provider
@Consumes("text/x-ndjson") // Handles NDJSON content
class NDJsonReader : MessageBodyReader<CCIndexSuccessResponse> {

    private val objectMapper = jacksonObjectMapper().registerModule(JavaTimeModule())

    /**
     * Accepts exactly [CCIndexSuccessResponse] as the target type; the media type is not inspected.
     */
    override fun isReadable(
        type: Class<*>?,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?
    ): Boolean = type == CCIndexSuccessResponse::class.java

    /**
     * Reads the first non-blank line of [entityStream] and binds it with Jackson.
     *
     * The stream is decoded as UTF-8 rather than the platform default charset, and is
     * closed before returning.
     *
     * @throws IllegalArgumentException if the body contains no non-blank line
     */
    override fun readFrom(
        type: Class<CCIndexSuccessResponse>,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?,
        httpHeaders: MultivaluedMap<String, String>?,
        entityStream: InputStream
    ): CCIndexSuccessResponse {
        val firstRecord = entityStream.bufferedReader(Charsets.UTF_8).use { reader ->
            reader.lineSequence().firstOrNull { line -> line.isNotBlank() }
        }
        // An empty body used to reach Jackson as null and fail with an opaque message.
        requireNotNull(firstRecord) { "NDJSON response body did not contain any record" }
        return objectMapper.readValue(firstRecord, CCIndexSuccessResponse::class.java)
    }
}

View File

@@ -7,8 +7,7 @@ scraper:
- id: konami-official - id: konami-official
name: "Konami Official Database" name: "Konami Official Database"
domain: "yugioh-card.com" domain: "yugioh-card.com"
url-patterns: url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
targets: targets:
card: card:
root: root:
@@ -28,8 +27,7 @@ scraper:
- id: ygo-fandom - id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki" name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com" domain: "yugioh.fandom.com"
url-patterns: url-pattern: "https://yugioh.fandom.com/wiki/%s"
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
targets: targets:
set: set:
root: root: