Basic CommonCrawl integration
This commit is contained in:
@@ -23,8 +23,11 @@ dependencies {
|
|||||||
implementation("io.quarkus:quarkus-rest-jackson")
|
implementation("io.quarkus:quarkus-rest-jackson")
|
||||||
implementation("io.quarkus:quarkus-kotlin")
|
implementation("io.quarkus:quarkus-kotlin")
|
||||||
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
|
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
|
||||||
implementation("org.jsoup:jsoup:1.20.1")
|
|
||||||
implementation("io.quarkus:quarkus-arc")
|
implementation("io.quarkus:quarkus-arc")
|
||||||
|
implementation("org.jsoup:jsoup:1.20.1")
|
||||||
|
implementation("org.netpreserve.commons:webarchive-commons:2.0.1")
|
||||||
|
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.19.0")
|
||||||
|
implementation("com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.19.1")
|
||||||
testImplementation("io.quarkus:quarkus-junit5")
|
testImplementation("io.quarkus:quarkus-junit5")
|
||||||
testImplementation("io.rest-assured:rest-assured")
|
testImplementation("io.rest-assured:rest-assured")
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,8 +11,8 @@ interface ProviderConfig {
|
|||||||
fun getName(): String
|
fun getName(): String
|
||||||
@WithName("domain")
|
@WithName("domain")
|
||||||
fun getDomain(): String
|
fun getDomain(): String
|
||||||
@WithName("url-patterns")
|
@WithName("url-pattern")
|
||||||
fun getUrlPatterns(): Optional<MutableSet<String>>
|
fun getUrlPattern(): String
|
||||||
@WithName("targets")
|
@WithName("targets")
|
||||||
fun getTargets(): TargetsConfig
|
fun getTargets(): TargetsConfig
|
||||||
|
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
package com.rak.controller
|
package com.rak.controller
|
||||||
|
|
||||||
import com.rak.model.card.Card
|
import com.rak.model.card.Card
|
||||||
|
import com.rak.model.cc.CCIndexSuccessResponse
|
||||||
import com.rak.model.set.CardSet
|
import com.rak.model.set.CardSet
|
||||||
import com.rak.model.set.RegionalSet
|
import com.rak.model.set.RegionalSet
|
||||||
|
import com.rak.service.CommonCrawlService
|
||||||
import com.rak.service.ScrapeService
|
import com.rak.service.ScrapeService
|
||||||
import jakarta.ws.rs.Consumes
|
import jakarta.ws.rs.Consumes
|
||||||
import jakarta.ws.rs.GET
|
import jakarta.ws.rs.GET
|
||||||
@@ -16,10 +18,9 @@ import org.jboss.resteasy.reactive.RestQuery
|
|||||||
@Path("/api")
|
@Path("/api")
|
||||||
class ScrapeController(
|
class ScrapeController(
|
||||||
private val scrapeService: ScrapeService,
|
private val scrapeService: ScrapeService,
|
||||||
|
private val commonCrawlService: CommonCrawlService
|
||||||
) {
|
) {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@GET
|
@GET
|
||||||
@Path("/{provider}/set")
|
@Path("/{provider}/set")
|
||||||
@Produces(MediaType.APPLICATION_JSON)
|
@Produces(MediaType.APPLICATION_JSON)
|
||||||
|
|||||||
5
src/main/kotlin/com/rak/model/cc/CCIndexErrorResponse.kt
Normal file
5
src/main/kotlin/com/rak/model/cc/CCIndexErrorResponse.kt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
package com.rak.model.cc
|
||||||
|
|
||||||
|
/**
 * Error payload consisting of a single human-readable [message].
 *
 * NOTE(review): presumably mirrors the JSON error body returned by the CommonCrawl
 * index endpoint; confirm against a real error response.
 */
data class CCIndexErrorResponse(
    val message: String,
)
|
||||||
22
src/main/kotlin/com/rak/model/cc/CCIndexSuccessResponse.kt
Normal file
22
src/main/kotlin/com/rak/model/cc/CCIndexSuccessResponse.kt
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
package com.rak.model.cc
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.annotation.JsonProperty
|
||||||
|
import java.time.Instant
|
||||||
|
|
||||||
|
/**
 * One capture record returned by a CommonCrawl index query.
 *
 * The JSON names `urlkey`, `mime-detected` and `filename` are mapped onto the
 * camelCase properties [urlKey], [mimeDetected] and [fileName]. [fileName],
 * [offset] and [length] identify the archived record inside a WARC file.
 *
 * JSON fields that are not modelled here are ignored instead of failing the whole
 * deserialization, and [languages] / [encoding] fall back to an empty string when
 * the record does not carry them.
 *
 * NOTE(review): the index appears to deliver `timestamp` as a compact digit string
 * rather than ISO-8601; confirm that the resulting [Instant] is only used for
 * relative ordering, or add a dedicated deserializer.
 */
@com.fasterxml.jackson.annotation.JsonIgnoreProperties(ignoreUnknown = true)
data class CCIndexSuccessResponse(
    @JsonProperty("urlkey")
    val urlKey: String,
    val timestamp: Instant,
    val url: String,
    val mime: String,
    @JsonProperty("mime-detected")
    val mimeDetected: String,
    val status: String,
    val digest: String,
    val length: Int,
    val offset: Int,
    @JsonProperty("filename")
    val fileName: String,
    // Optional in practice (e.g. non-HTML or redirect captures); default keeps the type non-null.
    val languages: String = "",
    val encoding: String = "",
)
|
||||||
9
src/main/kotlin/com/rak/model/cc/CCIndices.kt
Normal file
9
src/main/kotlin/com/rak/model/cc/CCIndices.kt
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
package com.rak.model.cc
|
||||||
|
|
||||||
|
/**
 * CommonCrawl crawl indices that can be queried, listed newest first.
 *
 * @property indexName the crawl identifier as used in the index endpoint path
 *   (e.g. `CC-MAIN-2024-46`).
 */
enum class CCIndices(val indexName: String) {
    CC_2025_21("CC-MAIN-2025-21"),
    // Previously pointed at "CC-MAIN-2024-05", which contradicted the constant name.
    CC_2025_05("CC-MAIN-2025-05"),
    CC_2024_46("CC-MAIN-2024-46"),
    CC_2024_26("CC-MAIN-2024-26"),
    CC_2023_50("CC-MAIN-2023-50");
}
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
package com.rak.model.exception
|
||||||
|
|
||||||
|
/**
 * Signals that a requested scrape target could not be located.
 *
 * @param message human-readable description of what was not found
 * @param cause optional underlying failure, kept so the original stack trace is not lost
 */
class TargetNotFoundException(message: String, cause: Throwable? = null) : RuntimeException(message, cause)
|
||||||
@@ -3,7 +3,7 @@ package com.rak.model.set
|
|||||||
import kotlin.collections.Set
|
import kotlin.collections.Set
|
||||||
|
|
||||||
data class CardSet(
|
data class CardSet(
|
||||||
val name: String,
|
var name: String,
|
||||||
val regionalSets: Set<RegionalSet>
|
val regionalSets: Set<RegionalSet>
|
||||||
) {
|
) {
|
||||||
companion object {
|
companion object {
|
||||||
|
|||||||
92
src/main/kotlin/com/rak/service/CommonCrawlService.kt
Normal file
92
src/main/kotlin/com/rak/service/CommonCrawlService.kt
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
package com.rak.service
|
||||||
|
|
||||||
|
import com.rak.model.cc.CCIndexSuccessResponse
|
||||||
|
import com.rak.model.cc.CCIndices
|
||||||
|
import com.rak.service.client.CommonCrawlRestClient
|
||||||
|
import io.netty.buffer.ByteBufInputStream
|
||||||
|
import io.quarkus.logging.Log
|
||||||
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
|
import org.archive.format.http.HttpResponseParser
|
||||||
|
import org.archive.io.warc.WARCReaderFactory
|
||||||
|
import org.eclipse.microprofile.rest.client.inject.RestClient
|
||||||
|
import org.jsoup.helper.DataUtil
|
||||||
|
import org.jsoup.nodes.Document
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Queries the CommonCrawl index for captures of a URL and loads the archived page
 * of a capture as a jsoup [Document].
 */
@ApplicationScoped
class CommonCrawlService(
    @RestClient
    private val commonCrawlRestClient: CommonCrawlRestClient
) {

    /**
     * Looks up [url] in the single crawl index [CCIndices.CC_2024_46].
     *
     * @return the capture record returned by the index
     */
    fun queryIndex(
        url: String
    ): CCIndexSuccessResponse =
        commonCrawlRestClient.queryIndex(
            INDEX_QUERY_URL,
            url,
            CCIndices.CC_2024_46.indexName
        )

    /**
     * Looks up [url] in every crawl listed in [CCIndices].
     *
     * A crawl whose query fails with a [RuntimeException] is logged (including the
     * exception) and skipped, so the result only contains successful lookups.
     *
     * @return the capture records in [CCIndices] declaration order; may be empty
     */
    fun queryAllCrawlIndices(
        url: String
    ): List<CCIndexSuccessResponse> =
        CCIndices.entries.mapNotNull { crawl ->
            try {
                commonCrawlRestClient.queryIndex(
                    INDEX_QUERY_URL,
                    url,
                    crawl.indexName
                )
            } catch (ex: RuntimeException) {
                Log.warn("Error occurred querying crawl '${crawl.indexName}' for URL $url", ex)
                null
            }
        }

    /**
     * Downloads the WARC record described by [ccIndexSuccessResponse] and parses the
     * HTTP response body of its first record as HTML.
     *
     * @param ccIndexSuccessResponse capture record supplying file name, length and offset
     * @param baseUri base URI handed to jsoup for resolving relative links
     * @return the parsed document, or `null` when the archive contains no record
     */
    fun getDocument(
        ccIndexSuccessResponse: CCIndexSuccessResponse,
        baseUri: String
    ): Document? {
        val archiveStream: ByteBufInputStream = commonCrawlRestClient.getWarcArchive(
            DATA_URL,
            ccIndexSuccessResponse.fileName,
            ccIndexSuccessResponse.length,
            ccIndexSuccessResponse.offset
        )

        // `use` closes the downloaded stream once parsing finished or failed.
        return archiveStream.use { stream ->
            // The reader is identified by the record's own file name instead of a
            // hard-coded sample archive name.
            WARCReaderFactory.get(ccIndexSuccessResponse.fileName, stream, true)
                .firstOrNull()
                ?.let { record ->
                    val httpResponse = HttpResponseParser().parse(record.buffered())
                    // NOTE(review): the charset is fixed to UTF-8; the record's declared
                    // encoding is not consulted.
                    DataUtil.load(httpResponse.buffered(), "UTF-8", baseUri)
                }
        }
    }

    companion object {
        private const val INDEX_QUERY_URL: String = "http://index.commoncrawl.org"
        private const val DATA_URL: String = "http://data.commoncrawl.org"
    }
}
|
||||||
@@ -1,22 +1,30 @@
|
|||||||
package com.rak.service
|
package com.rak.service
|
||||||
|
|
||||||
|
import com.rak.config.model.ProviderConfig
|
||||||
import com.rak.model.card.Card
|
import com.rak.model.card.Card
|
||||||
import com.rak.model.exception.NotImplementedException
|
import com.rak.model.exception.NotImplementedException
|
||||||
|
import com.rak.model.exception.TargetNotFoundException
|
||||||
import com.rak.model.set.CardSet
|
import com.rak.model.set.CardSet
|
||||||
import com.rak.model.set.RegionalSet
|
import com.rak.model.set.RegionalSet
|
||||||
import com.rak.service.extract.RegionalSetExtractionService
|
import com.rak.service.extract.RegionalSetExtractionService
|
||||||
import com.rak.service.extract.SetExtractionService
|
import com.rak.service.extract.SetExtractionService
|
||||||
|
import io.quarkus.logging.Log
|
||||||
import jakarta.enterprise.context.ApplicationScoped
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
import org.jsoup.Jsoup
|
import org.jsoup.Jsoup
|
||||||
import org.jsoup.nodes.Document
|
import org.jsoup.nodes.Document
|
||||||
|
import java.lang.Exception
|
||||||
|
|
||||||
@ApplicationScoped
|
@ApplicationScoped
|
||||||
class ScrapeService(
|
class ScrapeService(
|
||||||
private val sourceService: SourceService,
|
private val sourceService: SourceService,
|
||||||
private val setExtractionService: SetExtractionService,
|
private val setExtractionService: SetExtractionService,
|
||||||
private val regionalSetExtractionService: RegionalSetExtractionService
|
private val regionalSetExtractionService: RegionalSetExtractionService,
|
||||||
|
private val commonCrawlService: CommonCrawlService
|
||||||
) {
|
) {
|
||||||
|
|
||||||
|
/**
 * Builds the URL for [targetName] by substituting it into this provider's
 * URL pattern via [String.format].
 */
fun ProviderConfig.buildUrl(targetName: String): String =
    getUrlPattern().format(targetName)
|
||||||
|
|
||||||
fun scrapeSet(
|
fun scrapeSet(
|
||||||
provider: String,
|
provider: String,
|
||||||
@@ -25,9 +33,39 @@ class ScrapeService(
|
|||||||
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
|
val source = sourceService.getSourceById(provider) ?: throw IllegalArgumentException("Provider $provider not found")
|
||||||
|
|
||||||
val path: String = normalizePath(setName)
|
val path: String = normalizePath(setName)
|
||||||
val document: Document = Jsoup.connect("https://${source.getDomain()}/$path").get()
|
val url = source.buildUrl(path)
|
||||||
|
val ccIndexResponses = commonCrawlService.queryAllCrawlIndices(url).sortedBy { it.timestamp }
|
||||||
|
|
||||||
return setExtractionService.extract(document, source, source.getTargets().getSetConfig().get())
|
var document: Document? = null
|
||||||
|
|
||||||
|
for (indexResponse in ccIndexResponses) {
|
||||||
|
document = commonCrawlService.getDocument(
|
||||||
|
indexResponse,
|
||||||
|
source.getDomain()
|
||||||
|
)
|
||||||
|
|
||||||
|
if (document != null) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (document == null) {
|
||||||
|
// Fallback to Jsoup directly
|
||||||
|
try {
|
||||||
|
document = Jsoup.connect(url).get()
|
||||||
|
} catch(ex: Exception) {
|
||||||
|
Log.warn("Error occurred during Jsoup query")
|
||||||
|
throw TargetNotFoundException("Could not find '$setName' for Provider '$provider'")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return setExtractionService.extract(
|
||||||
|
document,
|
||||||
|
source,
|
||||||
|
source.getTargets().getSetConfig().get()
|
||||||
|
).apply {
|
||||||
|
name = setName
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fun scrapeRegionalSet(
|
fun scrapeRegionalSet(
|
||||||
|
|||||||
@@ -0,0 +1,49 @@
|
|||||||
|
package com.rak.service.client
|
||||||
|
|
||||||
|
import com.rak.util.NDJsonReader
|
||||||
|
import com.rak.model.cc.CCIndexSuccessResponse
|
||||||
|
import io.netty.buffer.ByteBufInputStream
|
||||||
|
import io.quarkus.rest.client.reactive.ClientQueryParam
|
||||||
|
import io.quarkus.rest.client.reactive.NotBody
|
||||||
|
import io.quarkus.rest.client.reactive.Url
|
||||||
|
import jakarta.ws.rs.Consumes
|
||||||
|
import jakarta.ws.rs.GET
|
||||||
|
import jakarta.ws.rs.Path
|
||||||
|
import jakarta.ws.rs.PathParam
|
||||||
|
import jakarta.ws.rs.QueryParam
|
||||||
|
import org.eclipse.microprofile.rest.client.annotation.ClientHeaderParam
|
||||||
|
import org.eclipse.microprofile.rest.client.annotation.RegisterProvider
|
||||||
|
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
|
||||||
|
|
||||||
|
/**
 * REST client for the CommonCrawl index and data endpoints.
 *
 * Every method receives its own base URL through the `@Url` parameter, so the
 * `baseUri` registered on the interface is presumably just a placeholder.
 * Responses of [queryIndex] are decoded by [NDJsonReader].
 */
@RegisterRestClient(baseUri = "whatever")
@RegisterProvider(NDJsonReader::class)
interface CommonCrawlRestClient {

    /**
     * Issues `GET {baseUrl}/{indexName}-index?output=json&url={queryUrl}`.
     *
     * @param baseUrl base URL of the index server
     * @param queryUrl URL to look up, sent as the `url` query parameter
     * @param indexName crawl identifier substituted into the path
     */
    @GET
    @Path("/{index}-index")
    @Consumes("text/x-ndjson")
    @ClientQueryParam(name = "output", value = ["json"])
    fun queryIndex(
        @Url baseUrl: String,
        @QueryParam("url") queryUrl: String,
        @PathParam("index") indexName: String,
    ): CCIndexSuccessResponse

    /**
     * Issues `GET {baseUrl}/{fileName}` with a computed `Range` header.
     *
     * The header value is produced by `com.rak.util.HttpUtil.computeHeader`, which
     * reads the method parameters at positions 2 and 3; [fileLength] and
     * [fileOffset] must therefore stay the third and fourth parameters.
     *
     * @param baseUrl base URL of the data server
     * @param fileName archive file path substituted into the request path
     * @param fileLength number of bytes to fetch (not sent as body)
     * @param fileOffset byte offset to start at (not sent as body)
     */
    @GET
    @Path("/{fileName}")
    @ClientHeaderParam(name = "Range", value = ["{com.rak.util.HttpUtil.computeHeader}"])
    fun getWarcArchive(
        @Url baseUrl: String,
        @PathParam("fileName") fileName: String,
        @NotBody fileLength: Int,
        @NotBody fileOffset: Int,
    ): ByteBufInputStream
}
|
||||||
23
src/main/kotlin/com/rak/util/HttpUtil.kt
Normal file
23
src/main/kotlin/com/rak/util/HttpUtil.kt
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
package com.rak.util
|
||||||
|
|
||||||
|
import io.quarkus.rest.client.reactive.ComputedParamContext
|
||||||
|
|
||||||
|
/**
 * Holder for computed HTTP header values referenced from REST client annotations.
 */
class HttpUtil {

    companion object {

        private const val HEADER_FORMAT_STRING: String = "bytes=%d-%d"

        /**
         * Computes a `Range` header value of the form `bytes=<first>-<last>`.
         *
         * The length is read from method parameter index 2 and the offset from
         * index 3 of the invoked client method. The last byte position is
         * `offset + length - 1`, calculated in [Long] so that large offsets cannot
         * overflow.
         *
         * @param context invocation context exposing the client method's parameters
         * @return the header value, e.g. `bytes=100-149` for offset 100 and length 50
         * @throws IndexOutOfBoundsException if the method has fewer than four parameters
         * @throws NumberFormatException if either parameter is not an integer
         */
        @JvmStatic
        fun computeHeader(context: ComputedParamContext): String {
            val (lengthParam, offsetParam) = context.methodParameters().subList(2, 4)

            val length = lengthParam.value().toString().toLong()
            val offset = offsetParam.value().toString().toLong()

            return HEADER_FORMAT_STRING.format(offset, offset + length - 1)
        }
    }
}
|
||||||
45
src/main/kotlin/com/rak/util/NDJsonReader.kt
Normal file
45
src/main/kotlin/com/rak/util/NDJsonReader.kt
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
package com.rak.util
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.datatype.jsr310.JSR310Module
|
||||||
|
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
|
||||||
|
import com.fasterxml.jackson.module.kotlin.jacksonObjectMapper
|
||||||
|
import com.rak.model.cc.CCIndexSuccessResponse
|
||||||
|
import jakarta.ws.rs.Consumes
|
||||||
|
import jakarta.ws.rs.core.MediaType
|
||||||
|
import jakarta.ws.rs.core.MultivaluedMap
|
||||||
|
import jakarta.ws.rs.ext.MessageBodyReader
|
||||||
|
import jakarta.ws.rs.ext.Provider
|
||||||
|
import java.io.BufferedReader
|
||||||
|
import java.io.InputStream
|
||||||
|
import java.io.InputStreamReader
|
||||||
|
import java.lang.reflect.Type
|
||||||
|
|
||||||
|
/**
 * [MessageBodyReader] that turns a `text/x-ndjson` response into a single
 * [CCIndexSuccessResponse] by decoding only the first non-blank line of the body.
 * Any further lines are ignored.
 */
@Provider
@Consumes("text/x-ndjson") // Handles NDJSON content
class NDJsonReader : MessageBodyReader<CCIndexSuccessResponse> {

    private val objectMapper = jacksonObjectMapper().registerModule(JavaTimeModule())

    /** Accepts exactly [CCIndexSuccessResponse]; the media type is not inspected. */
    override fun isReadable(
        type: Class<*>?,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?
    ): Boolean = type == CCIndexSuccessResponse::class.java

    /**
     * Reads [entityStream] as UTF-8 text and deserializes its first non-blank line.
     *
     * @throws IllegalArgumentException if the body contains no non-blank line
     */
    override fun readFrom(
        type: Class<CCIndexSuccessResponse>,
        genericType: Type?,
        annotations: Array<out Annotation>?,
        mediaType: MediaType?,
        httpHeaders: MultivaluedMap<String, String>?,
        entityStream: InputStream
    ): CCIndexSuccessResponse =
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        entityStream.bufferedReader(Charsets.UTF_8).use { reader ->
            val firstRecord = requireNotNull(reader.lineSequence().firstOrNull { it.isNotBlank() }) {
                "NDJSON response body is empty"
            }
            objectMapper.readValue(firstRecord, CCIndexSuccessResponse::class.java)
        }
}
|
||||||
@@ -7,8 +7,7 @@ scraper:
|
|||||||
- id: konami-official
|
- id: konami-official
|
||||||
name: "Konami Official Database"
|
name: "Konami Official Database"
|
||||||
domain: "yugioh-card.com"
|
domain: "yugioh-card.com"
|
||||||
url-patterns:
|
url-pattern: "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||||
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
|
||||||
targets:
|
targets:
|
||||||
card:
|
card:
|
||||||
root:
|
root:
|
||||||
@@ -28,8 +27,7 @@ scraper:
|
|||||||
- id: ygo-fandom
|
- id: ygo-fandom
|
||||||
name: "Yu-Gi-Oh Fandom Wiki"
|
name: "Yu-Gi-Oh Fandom Wiki"
|
||||||
domain: "yugioh.fandom.com"
|
domain: "yugioh.fandom.com"
|
||||||
url-patterns:
|
url-pattern: "https://yugioh.fandom.com/wiki/%s"
|
||||||
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
|
|
||||||
targets:
|
targets:
|
||||||
set:
|
set:
|
||||||
root:
|
root:
|
||||||
|
|||||||
Reference in New Issue
Block a user