This commit is contained in:
rak
2025-05-25 18:58:10 +02:00
commit ac35d7f8d9
27 changed files with 1048 additions and 0 deletions

View File

@@ -0,0 +1,98 @@
####
# This Dockerfile is used in order to build a container that runs the Quarkus application in JVM mode
#
# Before building the container image run:
#
# ./gradlew build
#
# Then, build the image with:
#
# docker build -f src/main/docker/Dockerfile.jvm -t quarkus/jsoup-scraper-jvm .
#
# Then run the container using:
#
# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper-jvm
#
# If you want to include the debug port in your docker image,
# you will have to expose the debug port (5005 by default) like this: EXPOSE 8080 5005.
# Additionally, you will have to set -e JAVA_DEBUG=true and -e JAVA_DEBUG_PORT=*:5005
# when running the container.
#
# Then run the container using:
#
# docker run -i --rm -p 8080:8080 -p 5005:5005 -e JAVA_DEBUG=true -e JAVA_DEBUG_PORT="*:5005" quarkus/jsoup-scraper-jvm
#
# This image uses the `run-java.sh` script to run the application.
# This script computes the command line to execute your Java application, and
# includes memory/GC tuning.
# You can configure the behavior using the following environment properties:
# - JAVA_OPTS: JVM options passed to the `java` command (example: "-verbose:class") - Be aware that this will override
# the default JVM options, use `JAVA_OPTS_APPEND` to append options
# - JAVA_OPTS_APPEND: User specified Java options to be appended to generated options
# in JAVA_OPTS (example: "-Dsome.property=foo")
# - JAVA_MAX_MEM_RATIO: Is used when no `-Xmx` option is given in JAVA_OPTS. This is
# used to calculate a default maximal heap memory based on a container's restriction.
# If used in a container without any memory constraints for the container then this
# option has no effect. If there is a memory constraint then `-Xmx` is set to a ratio
# of the container available memory as set here. The default is `50` which means 50%
# of the available memory is used as an upper boundary. You can skip this mechanism by
# setting this value to `0` in which case no `-Xmx` option is added.
# - JAVA_INITIAL_MEM_RATIO: Is used when no `-Xms` option is given in JAVA_OPTS. This
# is used to calculate a default initial heap memory based on the maximum heap memory.
# If used in a container without any memory constraints for the container then this
# option has no effect. If there is a memory constraint then `-Xms` is set to a ratio
# of the `-Xmx` memory as set here. The default is `25` which means 25% of the `-Xmx`
# is used as the initial heap size. You can skip this mechanism by setting this value
# to `0` in which case no `-Xms` option is added (example: "25")
# - JAVA_MAX_INITIAL_MEM: Is used when no `-Xms` option is given in JAVA_OPTS.
# This is used to calculate the maximum value of the initial heap memory. If used in
# a container without any memory constraints for the container then this option has
# no effect. If there is a memory constraint then `-Xms` is limited to the value set
# here. The default is 4096MB which means the calculated value of `-Xms` never will
# be greater than 4096MB. The value of this variable is expressed in MB (example: "4096")
# - JAVA_DIAGNOSTICS: Set this to get some diagnostics information to standard output
# when things are happening. This option, if set to true, will set
# `-XX:+UnlockDiagnosticVMOptions`. Disabled by default (example: "true").
# - JAVA_DEBUG: If set, remote debugging will be switched on. Disabled by default (example:
# "true").
# - JAVA_DEBUG_PORT: Port used for remote debugging. Defaults to 5005 (example: "8787").
# - CONTAINER_CORE_LIMIT: A calculated core limit as described in
# https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt. (example: "2")
# - CONTAINER_MAX_MEMORY: Memory limit given to the container (example: "1024").
# - GC_MIN_HEAP_FREE_RATIO: Minimum percentage of heap free after GC to avoid expansion.
# (example: "20")
# - GC_MAX_HEAP_FREE_RATIO: Maximum percentage of heap free after GC to avoid shrinking.
# (example: "40")
# - GC_TIME_RATIO: Specifies the ratio of the time spent outside the garbage collection.
# (example: "4")
# - GC_ADAPTIVE_SIZE_POLICY_WEIGHT: The weighting given to the current GC time versus
# previous GC times. (example: "90")
# - GC_METASPACE_SIZE: The initial metaspace size. (example: "20")
# - GC_MAX_METASPACE_SIZE: The maximum metaspace size. (example: "100")
# - GC_CONTAINER_OPTIONS: Specify Java GC to use. The value of this variable should
# contain the necessary JRE command-line options to specify the required GC, which
# will override the default of `-XX:+UseParallelGC` (example: -XX:+UseG1GC).
# - HTTPS_PROXY: The location of the https proxy. (example: "myuser@127.0.0.1:8080")
# - HTTP_PROXY: The location of the http proxy. (example: "myuser@127.0.0.1:8080")
# - NO_PROXY: A comma-separated list of hosts, IP addresses or domains that can be
# accessed directly. (example: "foo.example.com,bar.example.com")
#
###
# Base image, per its name: Red Hat UBI 9 with OpenJDK 21.
FROM registry.access.redhat.com/ubi9/openjdk-21:1.21
ENV LANGUAGE='en_US:en'
# We make four distinct layers so if there are application changes the library layers can be re-used
# (the rarely changing lib/ directory is copied first, the application classes afterwards).
COPY --chown=185 build/quarkus-app/lib/ /deployments/lib/
COPY --chown=185 build/quarkus-app/*.jar /deployments/
COPY --chown=185 build/quarkus-app/app/ /deployments/app/
COPY --chown=185 build/quarkus-app/quarkus/ /deployments/quarkus/
EXPOSE 8080
# Run as the same numeric user (185) that owns the files copied above.
USER 185
# Appended to the generated JVM options (see JAVA_OPTS_APPEND in the header): sets quarkus.http.host
# to 0.0.0.0 and selects the JBoss LogManager as the java.util.logging manager.
ENV JAVA_OPTS_APPEND="-Dquarkus.http.host=0.0.0.0 -Djava.util.logging.manager=org.jboss.logmanager.LogManager"
# NOTE(review): presumably the jar that run-java.sh launches - confirm against the base image docs.
ENV JAVA_APP_JAR="/deployments/quarkus-run.jar"
ENTRYPOINT [ "/opt/jboss/container/java/run/run-java.sh" ]

View File

@@ -0,0 +1,94 @@
####
# This Dockerfile is used in order to build a container that runs the Quarkus application in JVM mode
#
# Before building the container image run:
#
# ./gradlew build -Dquarkus.package.jar.type=legacy-jar
#
# Then, build the image with:
#
# docker build -f src/main/docker/Dockerfile.legacy-jar -t quarkus/jsoup-scraper-legacy-jar .
#
# Then run the container using:
#
# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper-legacy-jar
#
# If you want to include the debug port in your docker image,
# you will have to expose the debug port (5005 by default) like this: EXPOSE 8080 5005.
# Additionally, you will have to set -e JAVA_DEBUG=true and -e JAVA_DEBUG_PORT=*:5005
# when running the container.
#
# Then run the container using:
#
# docker run -i --rm -p 8080:8080 -p 5005:5005 -e JAVA_DEBUG=true -e JAVA_DEBUG_PORT="*:5005" quarkus/jsoup-scraper-legacy-jar
#
# This image uses the `run-java.sh` script to run the application.
# This script computes the command line to execute your Java application, and
# includes memory/GC tuning.
# You can configure the behavior using the following environment properties:
# - JAVA_OPTS: JVM options passed to the `java` command (example: "-verbose:class") - Be aware that this will override
# the default JVM options, use `JAVA_OPTS_APPEND` to append options
# - JAVA_OPTS_APPEND: User specified Java options to be appended to generated options
# in JAVA_OPTS (example: "-Dsome.property=foo")
# - JAVA_MAX_MEM_RATIO: Is used when no `-Xmx` option is given in JAVA_OPTS. This is
# used to calculate a default maximal heap memory based on a container's restriction.
# If used in a container without any memory constraints for the container then this
# option has no effect. If there is a memory constraint then `-Xmx` is set to a ratio
# of the container available memory as set here. The default is `50` which means 50%
# of the available memory is used as an upper boundary. You can skip this mechanism by
# setting this value to `0` in which case no `-Xmx` option is added.
# - JAVA_INITIAL_MEM_RATIO: Is used when no `-Xms` option is given in JAVA_OPTS. This
# is used to calculate a default initial heap memory based on the maximum heap memory.
# If used in a container without any memory constraints for the container then this
# option has no effect. If there is a memory constraint then `-Xms` is set to a ratio
# of the `-Xmx` memory as set here. The default is `25` which means 25% of the `-Xmx`
# is used as the initial heap size. You can skip this mechanism by setting this value
# to `0` in which case no `-Xms` option is added (example: "25")
# - JAVA_MAX_INITIAL_MEM: Is used when no `-Xms` option is given in JAVA_OPTS.
# This is used to calculate the maximum value of the initial heap memory. If used in
# a container without any memory constraints for the container then this option has
# no effect. If there is a memory constraint then `-Xms` is limited to the value set
# here. The default is 4096MB which means the calculated value of `-Xms` never will
# be greater than 4096MB. The value of this variable is expressed in MB (example: "4096")
# - JAVA_DIAGNOSTICS: Set this to get some diagnostics information to standard output
# when things are happening. This option, if set to true, will set
# `-XX:+UnlockDiagnosticVMOptions`. Disabled by default (example: "true").
# - JAVA_DEBUG: If set, remote debugging will be switched on. Disabled by default (example:
# "true").
# - JAVA_DEBUG_PORT: Port used for remote debugging. Defaults to 5005 (example: "8787").
# - CONTAINER_CORE_LIMIT: A calculated core limit as described in
# https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt. (example: "2")
# - CONTAINER_MAX_MEMORY: Memory limit given to the container (example: "1024").
# - GC_MIN_HEAP_FREE_RATIO: Minimum percentage of heap free after GC to avoid expansion.
# (example: "20")
# - GC_MAX_HEAP_FREE_RATIO: Maximum percentage of heap free after GC to avoid shrinking.
# (example: "40")
# - GC_TIME_RATIO: Specifies the ratio of the time spent outside the garbage collection.
# (example: "4")
# - GC_ADAPTIVE_SIZE_POLICY_WEIGHT: The weighting given to the current GC time versus
# previous GC times. (example: "90")
# - GC_METASPACE_SIZE: The initial metaspace size. (example: "20")
# - GC_MAX_METASPACE_SIZE: The maximum metaspace size. (example: "100")
# - GC_CONTAINER_OPTIONS: Specify Java GC to use. The value of this variable should
# contain the necessary JRE command-line options to specify the required GC, which
# will override the default of `-XX:+UseParallelGC` (example: -XX:+UseG1GC).
# - HTTPS_PROXY: The location of the https proxy. (example: "myuser@127.0.0.1:8080")
# - HTTP_PROXY: The location of the http proxy. (example: "myuser@127.0.0.1:8080")
# - NO_PROXY: A comma-separated list of hosts, IP addresses or domains that can be
# accessed directly. (example: "foo.example.com,bar.example.com")
#
###
# Base image, per its name: Red Hat UBI 9 with OpenJDK 21.
FROM registry.access.redhat.com/ubi9/openjdk-21:1.21
ENV LANGUAGE='en_US:en'
# Legacy-jar layout: dependency jars go to /deployments/lib/, and the single *-runner.jar is
# renamed to quarkus-run.jar so that JAVA_APP_JAR below points at it.
# NOTE(review): unlike Dockerfile.jvm these COPY instructions do not pass --chown=185 - confirm
# that the files only need to be readable by the runtime user.
COPY build/lib/* /deployments/lib/
COPY build/*-runner.jar /deployments/quarkus-run.jar
EXPOSE 8080
# Run as the non-root numeric user 185.
USER 185
# Appended to the generated JVM options (see JAVA_OPTS_APPEND in the header): sets quarkus.http.host
# to 0.0.0.0 and selects the JBoss LogManager as the java.util.logging manager.
ENV JAVA_OPTS_APPEND="-Dquarkus.http.host=0.0.0.0 -Djava.util.logging.manager=org.jboss.logmanager.LogManager"
# NOTE(review): presumably the jar that run-java.sh launches - confirm against the base image docs.
ENV JAVA_APP_JAR="/deployments/quarkus-run.jar"
ENTRYPOINT [ "/opt/jboss/container/java/run/run-java.sh" ]

View File

@@ -0,0 +1,29 @@
####
# This Dockerfile is used in order to build a container that runs the Quarkus application in native (no JVM) mode.
#
# Before building the container image run:
#
# ./gradlew build -Dquarkus.native.enabled=true
#
# Then, build the image with:
#
# docker build -f src/main/docker/Dockerfile.native -t quarkus/jsoup-scraper .
#
# Then run the container using:
#
# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper
#
# The `registry.access.redhat.com/ubi8/ubi-minimal:8.10` base image is based on UBI 8.
# To use UBI 9, switch to a `registry.access.redhat.com/ubi9/ubi-minimal` tag.
###
# Minimal UBI 8 base image (per the image name); the native executable needs no JVM.
FROM registry.access.redhat.com/ubi8/ubi-minimal:8.10
WORKDIR /work/
# Hand /work to user 1001 / group root and give the group read, write and directory-traverse rights.
# NOTE(review): the first `chown 1001 /work` looks redundant with the final `chown 1001:root /work`.
RUN chown 1001 /work \
&& chmod "g+rwX" /work \
&& chown 1001:root /work
# Copy the native runner produced by the build as /work/application, owned by 1001:root, mode 0755.
COPY --chown=1001:root --chmod=0755 build/*-runner /work/application
EXPOSE 8080
# Drop to the non-root user that owns /work.
USER 1001
# Relative path resolves against WORKDIR; the argument sets quarkus.http.host to 0.0.0.0.
ENTRYPOINT ["./application", "-Dquarkus.http.host=0.0.0.0"]

View File

@@ -0,0 +1,32 @@
####
# This Dockerfile is used in order to build a container that runs the Quarkus application in native (no JVM) mode.
# It uses a micro base image, tuned for Quarkus native executables.
# It reduces the size of the resulting container image.
# Check https://quarkus.io/guides/quarkus-runtime-base-image for further information about this image.
#
# Before building the container image run:
#
# ./gradlew build -Dquarkus.native.enabled=true
#
# Then, build the image with:
#
# docker build -f src/main/docker/Dockerfile.native-micro -t quarkus/jsoup-scraper .
#
# Then run the container using:
#
# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper
#
# The `quay.io/quarkus/quarkus-micro-image:2.0` base image is based on UBI 8.
# To use UBI 9, switch to `quay.io/quarkus/ubi9-quarkus-micro-image:2.0`.
###
# Quarkus micro base image (see the guide linked in the header).
FROM quay.io/quarkus/quarkus-micro-image:2.0
WORKDIR /work/
# Hand /work to user 1001 / group root and give the group read, write and directory-traverse rights.
# NOTE(review): the first `chown 1001 /work` looks redundant with the final `chown 1001:root /work`.
RUN chown 1001 /work \
&& chmod "g+rwX" /work \
&& chown 1001:root /work
# Copy the native runner produced by the build as /work/application, owned by 1001:root, mode 0755.
COPY --chown=1001:root --chmod=0755 build/*-runner /work/application
EXPOSE 8080
# Drop to the non-root user that owns /work.
USER 1001
# Relative path resolves against WORKDIR; the argument sets quarkus.http.host to 0.0.0.0.
ENTRYPOINT ["./application", "-Dquarkus.http.host=0.0.0.0"]

View File

@@ -0,0 +1,50 @@
package com.rak.config
import io.smallrye.config.ConfigMapping
import java.util.*
/**
 * Typed view of the configuration below the `scraper` prefix.
 *
 * Each nested interface describes one level of the configuration tree; every
 * function is a read accessor for the configuration entry of the same name.
 */
@ConfigMapping(prefix = "scraper")
interface SourcesConfiguration {
/** All configured scrape sources. */
fun sources(): MutableList<SourceConfig>
/** One scrape source: identity, host and the selectors used on its pages. */
interface SourceConfig {
/** Identifier of the source. */
fun id(): String
/** Display name of the source. */
fun name(): String
/** Host name of the source. */
fun domain(): String
/** Optional list of URL patterns for this source. */
fun urlPatterns(): Optional<MutableList<String>>
/** Selector definitions, grouped by the model they extract. */
fun selectors(): Selectors
/** Selector groups per model; each group is optional. */
interface Selectors {
/** Selectors for card data, if configured. */
fun card(): Optional<CardDefinition>
/** Selectors for regional-set data, if configured. */
fun regionalSet(): Optional<RegionalSetDefinition>
/** Common part of a model definition: an optional root selector. */
interface AbstractModelDefinition {
/** Optional selector of the element the model's steps start from. */
fun root(): Optional<String>
}
/** Selectors for the fields of a regional set. */
interface RegionalSetDefinition : AbstractModelDefinition {
fun id(): SelectorDefinition
fun language(): SelectorDefinition
fun regionKey(): SelectorDefinition
}
/**
 * Selectors for the fields of a card.
 *
 * NOTE(review): unlike [RegionalSetDefinition] this does not extend
 * [AbstractModelDefinition], so a card has no `root()` - confirm this is intended.
 */
interface CardDefinition {
fun name(): SelectorDefinition
fun attack(): SelectorDefinition
fun effect(): SelectorDefinition
}
/** A field selector expressed as a collection of steps. */
interface SelectorDefinition {
/**
 * The steps of this selector.
 *
 * NOTE(review): a `Set` does not express ordering, yet steps look like an ordered
 * chain - confirm the mapped implementation preserves declaration order.
 */
fun steps(): Set<StepDefinition>
}
/** A single selection step: the selector language and the expression. */
interface StepDefinition {
fun type(): String // e.g. css or xpath
/** The selector expression, interpreted according to [type]. */
fun value(): String
}
}
}
}

View File

@@ -0,0 +1,56 @@
package com.rak.controller
import com.rak.config.SourcesConfiguration
import com.rak.service.ScrapeService
import io.quarkus.logging.Log
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.Produces
import jakarta.ws.rs.core.MediaType
import org.jboss.resteasy.reactive.RestQuery
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
/**
 * Demo endpoint that scrapes the regional-set id from a page of a configured source.
 *
 * The source is looked up in [SourcesConfiguration]; the actual selector evaluation is
 * delegated to [ScrapeService].
 */
@Path("/hello")
class ExampleResource(
    private val sourcesConfiguration: SourcesConfiguration,
    private val scrapeService: ScrapeService
) {

    /**
     * Fetches `https://<source domain>/<path>` and extracts the regional-set id from it.
     *
     * @param provider id of a configured source, compared case-insensitively
     * @param path page path below the source's domain; it is trimmed and spaces are replaced by `_`
     * @return the extracted text, or `"whoomp whoomp"` when the selector steps yield nothing
     * @throws IllegalArgumentException if the provider is unknown or has no regional-set
     *         selectors (or no root selector) configured
     * @throws IllegalStateException if the fetched page contains no element matching the root selector
     */
    @GET
    @Produces(MediaType.TEXT_PLAIN)
    fun hello(
        @RestQuery
        provider: String,
        @RestQuery
        path: String
    ): String {
        val source = sourcesConfiguration
            .sources()
            .firstOrNull { it.id().equals(provider, ignoreCase = true) }
            ?: throw IllegalArgumentException("Provider $provider not found")

        // Validate the configuration before touching the network, so a source without
        // regional-set selectors fails fast with a descriptive message instead of a bare
        // NoSuchElementException from Optional.get().
        val regionalSetSelector = source.selectors().regionalSet().orElseThrow {
            IllegalArgumentException("Provider $provider has no regional-set selectors configured")
        }
        val rootSelector = regionalSetSelector.root().orElseThrow {
            IllegalArgumentException("Provider $provider has no regional-set root selector configured")
        }

        val newPath: String = path
            .trim()
            .replace(" ", "_")
        Log.info(newPath)

        val url = "https://${source.domain()}/$newPath"
        val doc: Document = Jsoup.connect(url).get()

        // selectFirst returns null when nothing matches; report that explicitly rather than via `!!`.
        val regionalSetRoot = doc.selectFirst(rootSelector)
            ?: throw IllegalStateException("No element matching '$rootSelector' found at $url")

        return scrapeService.extractTextFromRootBySteps(regionalSetRoot, regionalSetSelector.id().steps())
            ?: "whoomp whoomp"
    }
}

View File

@@ -0,0 +1,4 @@
package com.rak.model.scrape
/**
 * Common base type for scrapers.
 *
 * Declares no members yet; it only anchors the scraper class hierarchy.
 */
abstract class AbstractScraper

View File

@@ -0,0 +1,6 @@
package com.rak.model.scrape
/**
 * Concrete [AbstractScraper] subtype.
 *
 * NOTE(review): presumably meant to be backed by jsoup, judging by the name - it has no
 * members yet.
 */
class JsoupScraper : AbstractScraper()

View File

@@ -0,0 +1,5 @@
package com.rak.model.scrape
/** Value object describing a scrape job by its target [url]. */
data class ScrapeJob(val url: String)

View File

@@ -0,0 +1,6 @@
package com.rak.model.scrape.selector
/** The selector kinds known to the scraper: CSS selectors and XPath expressions. */
enum class Selector { CSS, XPATH }

View File

@@ -0,0 +1,29 @@
package com.rak.service
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
/**
 * REST client interface registered with base URI `https://stage.code.quarkus.io/api`.
 *
 * To use it via injection:
 *
 * ```kotlin
 * @Inject
 * @RestClient
 * lateinit var myRemoteService: MyRemoteService
 *
 * fun doSomething() {
 * val restClientExtensions = myRemoteService.getExtensionsById("io.quarkus:quarkus-rest-client")
 * }
 * ```
 *
 * NOTE(review): none of the other sources visible in this commit reference this client;
 * it looks like generated sample code - confirm whether it is still needed.
 */
@RegisterRestClient(baseUri = "https://stage.code.quarkus.io/api")
interface MyRemoteService {
/**
 * Issues `GET /extensions` with [id] sent as the `id` query parameter.
 *
 * @param id value of the `id` query parameter
 * @return the response mapped to a set of [Extension]s
 */
@GET
@Path("/extensions")
fun getExtensionsById(@QueryParam("id") id: String): Set<Extension>
/** One extension entry as returned by [getExtensionsById]. */
data class Extension(val id: String, val name: String, val shortName: String, val keywords: List<String>)
}

View File

@@ -0,0 +1,68 @@
package com.rak.service
import com.rak.config.SourcesConfiguration
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import java.util.concurrent.LinkedBlockingQueue
/**
 * Walks a chain of configured selector steps through a jsoup element tree and
 * extracts the text the chain ends on.
 */
@ApplicationScoped
class ScrapeService {

    /**
     * Applies [steps] one after another, starting from a copy of [root], and returns the text found.
     *
     * Step handling:
     * - an XPath step (type `xpath`, compared case-insensitively) whose expression ends in
     *   `text()` is terminal: the text of the first matching text node is returned
     *   immediately and any remaining steps are ignored;
     * - any other XPath step moves to the first matching element;
     * - every other step type is treated as a CSS selector and moves to the first match.
     *
     * When all steps resolve without hitting a `text()` step, the text of the element reached
     * by the last step is returned ("last step defaults to text()"). Previously this case
     * always produced `null`, which made CSS-only chains unusable.
     *
     * Steps are applied in the iteration order of [steps].
     *
     * @param root element the first step is evaluated against; it is not modified
     * @param steps selector steps to apply
     * @return the extracted text, or `null` if [steps] is empty or any step matches nothing
     */
    fun extractTextFromRootBySteps(
        root: Element,
        steps: Set<SourcesConfiguration.SourceConfig.Selectors.StepDefinition>
    ): String? {
        // No steps means nothing was asked for; keep returning null as before.
        if (steps.isEmpty()) {
            return null
        }

        // Work on a copy of the root, as the original implementation did.
        // NOTE(review): presumably this detaches the subtree so that absolute XPath
        // expressions such as `//li` stay confined to it - confirm.
        var currentElement: Element = root.clone()

        for (step in steps) {
            val selector = step.value()
            val nextElement: Element? = if (step.type().equals(XPATH_STEP_TYPE, ignoreCase = true)) {
                if (TEXT_NODE_MATCHER.matches(selector)) {
                    // firstOrNull: an XPath that matches no text node yields null instead of
                    // throwing NoSuchElementException.
                    return currentElement
                        .selectXpath(selector, TextNode::class.java)
                        .firstOrNull()
                        ?.text()
                }
                evaluateXpath(currentElement, selector)
            } else {
                evaluateCssSelector(currentElement, selector)
            }
            // A step without a match ends the walk.
            currentElement = nextElement ?: return null
        }

        return currentElement.text()
    }

    companion object {
        /** Step type that selects XPath evaluation; every other type is handled as CSS. */
        private const val XPATH_STEP_TYPE = "xpath"

        /** Matches expressions that end in `text()`, i.e. that target a text node. */
        private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")

        /** Returns the first element matching [xpath] relative to [element], or `null`. */
        private fun evaluateXpath(element: Element, xpath: String): Element? {
            return element.selectXpath(xpath).first()
        }

        /** Returns the first element matching [cssSelector] below [element], or `null`. */
        private fun evaluateCssSelector(element: Element, cssSelector: String): Element? {
            return element.selectFirst(cssSelector)
        }
    }
}

View File

@@ -0,0 +1 @@
# Load sources.yml as an additional configuration location.
# NOTE(review): presumably this is the YAML file that defines the `scraper.sources` entries - confirm.
quarkus.config.locations=sources.yml

View File

@@ -0,0 +1,42 @@
# Scrape source definitions, read through the `scraper` config prefix (see SourcesConfiguration).
scraper:
sources:
# --- Source: Konami official database ---
# NOTE(review): this source defines no `regional-set` selectors, so anything that requires
# them (e.g. the /hello endpoint) cannot serve it - confirm this is intended.
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
url-patterns:
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
selectors:
# NOTE(review): `card` has `name` and `attack` but no `effect`, while
# CardDefinition.effect() is not Optional - confirm the mapping accepts this.
card:
name:
steps:
- type: "css"
value: "h1.product-title"
- type: "xpath"
value: "//h1[@itemprop='name']"
attack:
steps:
- type: "css"
value: ".atk-value"
# --- Source: Yu-Gi-Oh Fandom wiki ---
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com"
url-patterns:
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
selectors:
regional-set:
# Root element the regional-set steps below are evaluated against.
root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id:
steps:
- type: "xpath"
value: "//li/text()"
# NOTE(review): the `language` steps select an attribute (`@title`), not an element or
# text node - confirm the XPath evaluation in ScrapeService supports that.
language:
steps:
- type: "xpath"
value: "//li/abbr/@title"
- type: "xpath"
value: "//abbr/@title"
region-key:
steps:
- type: "xpath"
value: "//li/abbr/text()"