This commit is contained in:
rak
2025-05-25 18:58:10 +02:00
commit ac35d7f8d9
27 changed files with 1048 additions and 0 deletions

5
.dockerignore Normal file
View File

@@ -0,0 +1,5 @@
*
!build/*-runner
!build/*-runner.jar
!build/lib/*
!build/quarkus-app/*

41
.gitignore vendored Normal file
View File

@@ -0,0 +1,41 @@
# Gradle
.gradle/
build/
# Eclipse
.project
.classpath
.settings/
bin/
# IntelliJ
.idea
*.ipr
*.iml
*.iws
# NetBeans
nb-configuration.xml
# Visual Studio Code
.vscode
.factorypath
# OSX
.DS_Store
# Vim
*.swp
*.swo
# patch
*.orig
*.rej
# Local environment
.env
# Plugin directory
/.quarkus/cli/plugins/
# TLS Certificates
.certs/

78
README.md Normal file
View File

@@ -0,0 +1,78 @@
# jsoup-scraper
This project uses Quarkus, the Supersonic Subatomic Java Framework.
If you want to learn more about Quarkus, please visit its website: <https://quarkus.io/>.
## Running the application in dev mode
You can run your application in dev mode that enables live coding using:
```shell script
./gradlew quarkusDev
```
> **_NOTE:_** Quarkus now ships with a Dev UI, which is available in dev mode only at <http://localhost:8080/q/dev/>.
## Packaging and running the application
The application can be packaged using:
```shell script
./gradlew build
```
It produces the `quarkus-run.jar` file in the `build/quarkus-app/` directory.
Be aware that its not an _über-jar_ as the dependencies are copied into the `build/quarkus-app/lib/` directory.
The application is now runnable using `java -jar build/quarkus-app/quarkus-run.jar`.
If you want to build an _über-jar_, execute the following command:
```shell script
./gradlew build -Dquarkus.package.jar.type=uber-jar
```
The application, packaged as an _über-jar_, is now runnable using `java -jar build/*-runner.jar`.
## Creating a native executable
You can create a native executable using:
```shell script
./gradlew build -Dquarkus.native.enabled=true
```
Or, if you don't have GraalVM installed, you can run the native executable build in a container using:
```shell script
./gradlew build -Dquarkus.native.enabled=true -Dquarkus.native.container-build=true
```
You can then execute your native executable with: `./build/jsoup-scraper-1.0-SNAPSHOT-runner`
If you want to learn more about building native executables, please consult <https://quarkus.io/guides/gradle-tooling>.
## Related Guides
- REST ([guide](https://quarkus.io/guides/rest)): A Jakarta REST implementation utilizing build time processing and
Vert.x. This extension is not compatible with the quarkus-resteasy extension, or any of the extensions that depend on
it.
- REST Client ([guide](https://quarkus.io/guides/rest-client)): Call REST services
- REST Jackson ([guide](https://quarkus.io/guides/rest#json-serialisation)): Jackson serialization support for Quarkus
REST. This extension is not compatible with the quarkus-resteasy extension, or any of the extensions that depend on it
- Kotlin ([guide](https://quarkus.io/guides/kotlin)): Write your services in Kotlin
## Provided Code
### REST Client
Invoke different services through REST with JSON
[Related guide section...](https://quarkus.io/guides/rest-client)
### REST
Easily start your REST Web Services
[Related guide section...](https://quarkus.io/guides/getting-started-reactive#reactive-jax-rs-resources)

55
build.gradle.kts Normal file
View File

@@ -0,0 +1,55 @@
plugins {
kotlin("jvm") version "2.1.20"
kotlin("plugin.allopen") version "2.1.20"
id("io.quarkus")
}
repositories {
mavenCentral()
mavenLocal()
}
val quarkusPlatformGroupId: String by project
val quarkusPlatformArtifactId: String by project
val quarkusPlatformVersion: String by project
dependencies {
implementation("io.quarkus:quarkus-config-yaml")
implementation(enforcedPlatform("${quarkusPlatformGroupId}:${quarkusPlatformArtifactId}:${quarkusPlatformVersion}"))
implementation("io.quarkus:quarkus-rest")
implementation("io.quarkus:quarkus-rest-client-jackson")
implementation("io.quarkus:quarkus-rest-client")
implementation("io.quarkus:quarkus-rest-client-kotlin-serialization")
implementation("io.quarkus:quarkus-rest-jackson")
implementation("io.quarkus:quarkus-kotlin")
implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8")
implementation("org.jsoup:jsoup:1.20.1")
implementation("io.quarkus:quarkus-arc")
testImplementation("io.quarkus:quarkus-junit5")
testImplementation("io.rest-assured:rest-assured")
}
group = "com.rak"
version = "1.0-SNAPSHOT"
java {
sourceCompatibility = JavaVersion.VERSION_21
targetCompatibility = JavaVersion.VERSION_21
}
tasks.withType<Test> {
systemProperty("java.util.logging.manager", "org.jboss.logmanager.LogManager")
}
allOpen {
annotation("jakarta.ws.rs.Path")
annotation("jakarta.enterprise.context.ApplicationScoped")
annotation("jakarta.persistence.Entity")
annotation("io.quarkus.test.junit.QuarkusTest")
}
kotlin {
compilerOptions {
jvmTarget = org.jetbrains.kotlin.gradle.dsl.JvmTarget.JVM_21
javaParameters = true
}
}

8
design.md Normal file
View File

@@ -0,0 +1,8 @@
# DEX Scraper
---
## Roadmap
- Static sites + Jsoup
- CommonCrawl integraton
- Explore Playwright

8
gradle.properties Normal file
View File

@@ -0,0 +1,8 @@
# Gradle properties
# Gradle properties
quarkusPluginId=io.quarkus
quarkusPluginVersion=3.22.3
quarkusPlatformGroupId=io.quarkus.platform
quarkusPlatformArtifactId=quarkus-bom
quarkusPlatformVersion=3.22.3

BIN
gradle/wrapper/gradle-wrapper.jar vendored Normal file

Binary file not shown.

View File

@@ -0,0 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.13-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

185
gradlew vendored Executable file
View File

@@ -0,0 +1,185 @@
#!/usr/bin/env sh
#
# Copyright 2015 the original author or authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
warn () {
echo "$*"
}
die () {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=`expr $i + 1`
done
case $i in
0) set -- ;;
1) set -- "$args0" ;;
2) set -- "$args0" "$args1" ;;
3) set -- "$args0" "$args1" "$args2" ;;
4) set -- "$args0" "$args1" "$args2" "$args3" ;;
5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=`save "$@"`
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
exec "$JAVACMD" "$@"

104
gradlew.bat vendored Normal file
View File

@@ -0,0 +1,104 @@
@rem
@rem Copyright 2015 the original author or authors.
@rem
@rem Licensed under the Apache License, Version 2.0 (the "License");
@rem you may not use this file except in compliance with the License.
@rem You may obtain a copy of the License at
@rem
@rem https://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Resolve any "." and ".." in APP_HOME to make it shorter.
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto init
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:init
@rem Get command-line arguments, handling Windows variants
if not "%OS%" == "Windows_NT" goto win9xME_args
:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2
:win9xME_args_slurp
if "x%~1" == "x" goto execute
set CMD_LINE_ARGS=%*
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega

13
settings.gradle.kts Normal file
View File

@@ -0,0 +1,13 @@
pluginManagement {
val quarkusPluginVersion: String by settings
val quarkusPluginId: String by settings
repositories {
mavenCentral()
gradlePluginPortal()
mavenLocal()
}
plugins {
id(quarkusPluginId) version quarkusPluginVersion
}
}
rootProject.name = "jsoup-scraper"

View File

@@ -0,0 +1,98 @@
####
# This Dockerfile is used in order to build a container that runs the Quarkus application in JVM mode
#
# Before building the container image run:
#
# ./gradlew build
#
# Then, build the image with:
#
# docker build -f src/main/docker/Dockerfile.jvm -t quarkus/jsoup-scraper-jvm .
#
# Then run the container using:
#
# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper-jvm
#
# If you want to include the debug port into your docker image
# you will have to expose the debug port (default 5005 being the default) like this : EXPOSE 8080 5005.
# Additionally you will have to set -e JAVA_DEBUG=true and -e JAVA_DEBUG_PORT=*:5005
# when running the container
#
# Then run the container using :
#
# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper-jvm
#
# This image uses the `run-java.sh` script to run the application.
# This scripts computes the command line to execute your Java application, and
# includes memory/GC tuning.
# You can configure the behavior using the following environment properties:
# - JAVA_OPTS: JVM options passed to the `java` command (example: "-verbose:class") - Be aware that this will override
# the default JVM options, use `JAVA_OPTS_APPEND` to append options
# - JAVA_OPTS_APPEND: User specified Java options to be appended to generated options
# in JAVA_OPTS (example: "-Dsome.property=foo")
# - JAVA_MAX_MEM_RATIO: Is used when no `-Xmx` option is given in JAVA_OPTS. This is
# used to calculate a default maximal heap memory based on a containers restriction.
# If used in a container without any memory constraints for the container then this
# option has no effect. If there is a memory constraint then `-Xmx` is set to a ratio
# of the container available memory as set here. The default is `50` which means 50%
# of the available memory is used as an upper boundary. You can skip this mechanism by
# setting this value to `0` in which case no `-Xmx` option is added.
# - JAVA_INITIAL_MEM_RATIO: Is used when no `-Xms` option is given in JAVA_OPTS. This
# is used to calculate a default initial heap memory based on the maximum heap memory.
# If used in a container without any memory constraints for the container then this
# option has no effect. If there is a memory constraint then `-Xms` is set to a ratio
# of the `-Xmx` memory as set here. The default is `25` which means 25% of the `-Xmx`
# is used as the initial heap size. You can skip this mechanism by setting this value
# to `0` in which case no `-Xms` option is added (example: "25")
# - JAVA_MAX_INITIAL_MEM: Is used when no `-Xms` option is given in JAVA_OPTS.
# This is used to calculate the maximum value of the initial heap memory. If used in
# a container without any memory constraints for the container then this option has
# no effect. If there is a memory constraint then `-Xms` is limited to the value set
# here. The default is 4096MB which means the calculated value of `-Xms` never will
# be greater than 4096MB. The value of this variable is expressed in MB (example: "4096")
# - JAVA_DIAGNOSTICS: Set this to get some diagnostics information to standard output
# when things are happening. This option, if set to true, will set
# `-XX:+UnlockDiagnosticVMOptions`. Disabled by default (example: "true").
# - JAVA_DEBUG: If set remote debugging will be switched on. Disabled by default (example:
# true").
# - JAVA_DEBUG_PORT: Port used for remote debugging. Defaults to 5005 (example: "8787").
# - CONTAINER_CORE_LIMIT: A calculated core limit as described in
# https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt. (example: "2")
# - CONTAINER_MAX_MEMORY: Memory limit given to the container (example: "1024").
# - GC_MIN_HEAP_FREE_RATIO: Minimum percentage of heap free after GC to avoid expansion.
# (example: "20")
# - GC_MAX_HEAP_FREE_RATIO: Maximum percentage of heap free after GC to avoid shrinking.
# (example: "40")
# - GC_TIME_RATIO: Specifies the ratio of the time spent outside the garbage collection.
# (example: "4")
# - GC_ADAPTIVE_SIZE_POLICY_WEIGHT: The weighting given to the current GC time versus
# previous GC times. (example: "90")
# - GC_METASPACE_SIZE: The initial metaspace size. (example: "20")
# - GC_MAX_METASPACE_SIZE: The maximum metaspace size. (example: "100")
# - GC_CONTAINER_OPTIONS: Specify Java GC to use. The value of this variable should
# contain the necessary JRE command-line options to specify the required GC, which
# will override the default of `-XX:+UseParallelGC` (example: -XX:+UseG1GC).
# - HTTPS_PROXY: The location of the https proxy. (example: "myuser@127.0.0.1:8080")
# - HTTP_PROXY: The location of the http proxy. (example: "myuser@127.0.0.1:8080")
# - NO_PROXY: A comma separated lists of hosts, IP addresses or domains that can be
# accessed directly. (example: "foo.example.com,bar.example.com")
#
###
FROM registry.access.redhat.com/ubi9/openjdk-21:1.21
ENV LANGUAGE='en_US:en'
# We make four distinct layers so if there are application changes the library layers can be re-used
COPY --chown=185 build/quarkus-app/lib/ /deployments/lib/
COPY --chown=185 build/quarkus-app/*.jar /deployments/
COPY --chown=185 build/quarkus-app/app/ /deployments/app/
COPY --chown=185 build/quarkus-app/quarkus/ /deployments/quarkus/
EXPOSE 8080
USER 185
ENV JAVA_OPTS_APPEND="-Dquarkus.http.host=0.0.0.0 -Djava.util.logging.manager=org.jboss.logmanager.LogManager"
ENV JAVA_APP_JAR="/deployments/quarkus-run.jar"
ENTRYPOINT [ "/opt/jboss/container/java/run/run-java.sh" ]

View File

@@ -0,0 +1,94 @@
####
# This Dockerfile is used in order to build a container that runs the Quarkus application in JVM mode
#
# Before building the container image run:
#
# ./gradlew build -Dquarkus.package.jar.type=legacy-jar
#
# Then, build the image with:
#
# docker build -f src/main/docker/Dockerfile.legacy-jar -t quarkus/jsoup-scraper-legacy-jar .
#
# Then run the container using:
#
# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper-legacy-jar
#
# If you want to include the debug port into your docker image
# you will have to expose the debug port (default 5005 being the default) like this : EXPOSE 8080 5005.
# Additionally you will have to set -e JAVA_DEBUG=true and -e JAVA_DEBUG_PORT=*:5005
# when running the container
#
# Then run the container using :
#
# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper-legacy-jar
#
# This image uses the `run-java.sh` script to run the application.
# This scripts computes the command line to execute your Java application, and
# includes memory/GC tuning.
# You can configure the behavior using the following environment properties:
# - JAVA_OPTS: JVM options passed to the `java` command (example: "-verbose:class") - Be aware that this will override
# the default JVM options, use `JAVA_OPTS_APPEND` to append options
# - JAVA_OPTS_APPEND: User specified Java options to be appended to generated options
# in JAVA_OPTS (example: "-Dsome.property=foo")
# - JAVA_MAX_MEM_RATIO: Is used when no `-Xmx` option is given in JAVA_OPTS. This is
# used to calculate a default maximal heap memory based on a containers restriction.
# If used in a container without any memory constraints for the container then this
# option has no effect. If there is a memory constraint then `-Xmx` is set to a ratio
# of the container available memory as set here. The default is `50` which means 50%
# of the available memory is used as an upper boundary. You can skip this mechanism by
# setting this value to `0` in which case no `-Xmx` option is added.
# - JAVA_INITIAL_MEM_RATIO: Is used when no `-Xms` option is given in JAVA_OPTS. This
# is used to calculate a default initial heap memory based on the maximum heap memory.
# If used in a container without any memory constraints for the container then this
# option has no effect. If there is a memory constraint then `-Xms` is set to a ratio
# of the `-Xmx` memory as set here. The default is `25` which means 25% of the `-Xmx`
# is used as the initial heap size. You can skip this mechanism by setting this value
# to `0` in which case no `-Xms` option is added (example: "25")
# - JAVA_MAX_INITIAL_MEM: Is used when no `-Xms` option is given in JAVA_OPTS.
# This is used to calculate the maximum value of the initial heap memory. If used in
# a container without any memory constraints for the container then this option has
# no effect. If there is a memory constraint then `-Xms` is limited to the value set
# here. The default is 4096MB which means the calculated value of `-Xms` never will
# be greater than 4096MB. The value of this variable is expressed in MB (example: "4096")
# - JAVA_DIAGNOSTICS: Set this to get some diagnostics information to standard output
# when things are happening. This option, if set to true, will set
# `-XX:+UnlockDiagnosticVMOptions`. Disabled by default (example: "true").
# - JAVA_DEBUG: If set remote debugging will be switched on. Disabled by default (example:
# true").
# - JAVA_DEBUG_PORT: Port used for remote debugging. Defaults to 5005 (example: "8787").
# - CONTAINER_CORE_LIMIT: A calculated core limit as described in
# https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt. (example: "2")
# - CONTAINER_MAX_MEMORY: Memory limit given to the container (example: "1024").
# - GC_MIN_HEAP_FREE_RATIO: Minimum percentage of heap free after GC to avoid expansion.
# (example: "20")
# - GC_MAX_HEAP_FREE_RATIO: Maximum percentage of heap free after GC to avoid shrinking.
# (example: "40")
# - GC_TIME_RATIO: Specifies the ratio of the time spent outside the garbage collection.
# (example: "4")
# - GC_ADAPTIVE_SIZE_POLICY_WEIGHT: The weighting given to the current GC time versus
# previous GC times. (example: "90")
# - GC_METASPACE_SIZE: The initial metaspace size. (example: "20")
# - GC_MAX_METASPACE_SIZE: The maximum metaspace size. (example: "100")
# - GC_CONTAINER_OPTIONS: Specify Java GC to use. The value of this variable should
# contain the necessary JRE command-line options to specify the required GC, which
# will override the default of `-XX:+UseParallelGC` (example: -XX:+UseG1GC).
# - HTTPS_PROXY: The location of the https proxy. (example: "myuser@127.0.0.1:8080")
# - HTTP_PROXY: The location of the http proxy. (example: "myuser@127.0.0.1:8080")
# - NO_PROXY: A comma separated lists of hosts, IP addresses or domains that can be
# accessed directly. (example: "foo.example.com,bar.example.com")
#
###
FROM registry.access.redhat.com/ubi9/openjdk-21:1.21
ENV LANGUAGE='en_US:en'
COPY build/lib/* /deployments/lib/
COPY build/*-runner.jar /deployments/quarkus-run.jar
EXPOSE 8080
USER 185
ENV JAVA_OPTS_APPEND="-Dquarkus.http.host=0.0.0.0 -Djava.util.logging.manager=org.jboss.logmanager.LogManager"
ENV JAVA_APP_JAR="/deployments/quarkus-run.jar"
ENTRYPOINT [ "/opt/jboss/container/java/run/run-java.sh" ]

View File

@@ -0,0 +1,29 @@
####
# This Dockerfile is used in order to build a container that runs the Quarkus application in native (no JVM) mode.
#
# Before building the container image run:
#
# ./gradlew build -Dquarkus.native.enabled=true
#
# Then, build the image with:
#
# docker build -f src/main/docker/Dockerfile.native -t quarkus/jsoup-scraper .
#
# Then run the container using:
#
# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper
#
# The ` registry.access.redhat.com/ubi8/ubi-minimal:8.10` base image is based on UBI 9.
# To use UBI 8, switch to `quay.io/ubi8/ubi-minimal:8.10`.
###
FROM registry.access.redhat.com/ubi8/ubi-minimal:8.10
WORKDIR /work/
RUN chown 1001 /work \
&& chmod "g+rwX" /work \
&& chown 1001:root /work
COPY --chown=1001:root --chmod=0755 build/*-runner /work/application
EXPOSE 8080
USER 1001
ENTRYPOINT ["./application", "-Dquarkus.http.host=0.0.0.0"]

View File

@@ -0,0 +1,32 @@
####
# This Dockerfile is used in order to build a container that runs the Quarkus application in native (no JVM) mode.
# It uses a micro base image, tuned for Quarkus native executables.
# It reduces the size of the resulting container image.
# Check https://quarkus.io/guides/quarkus-runtime-base-image for further information about this image.
#
# Before building the container image run:
#
# ./gradlew build -Dquarkus.native.enabled=true
#
# Then, build the image with:
#
# docker build -f src/main/docker/Dockerfile.native-micro -t quarkus/jsoup-scraper .
#
# Then run the container using:
#
# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper
#
# The `quay.io/quarkus/quarkus-micro-image:2.0` base image is based on UBI 9.
# To use UBI 8, switch to `quay.io/quarkus/quarkus-micro-image:2.0`.
###
FROM quay.io/quarkus/quarkus-micro-image:2.0
WORKDIR /work/
RUN chown 1001 /work \
&& chmod "g+rwX" /work \
&& chown 1001:root /work
COPY --chown=1001:root --chmod=0755 build/*-runner /work/application
EXPOSE 8080
USER 1001
ENTRYPOINT ["./application", "-Dquarkus.http.host=0.0.0.0"]

View File

@@ -0,0 +1,50 @@
package com.rak.config
import io.smallrye.config.ConfigMapping
import java.util.*
@ConfigMapping(prefix = "scraper")
interface SourcesConfiguration {
fun sources(): MutableList<SourceConfig>
interface SourceConfig {
fun id(): String
fun name(): String
fun domain(): String
fun urlPatterns(): Optional<MutableList<String>>
fun selectors(): Selectors
interface Selectors {
fun card(): Optional<CardDefinition>
fun regionalSet(): Optional<RegionalSetDefinition>
interface AbstractModelDefinition {
fun root(): Optional<String>
}
interface RegionalSetDefinition : AbstractModelDefinition {
fun id(): SelectorDefinition
fun language(): SelectorDefinition
fun regionKey(): SelectorDefinition
}
interface CardDefinition {
fun name(): SelectorDefinition
fun attack(): SelectorDefinition
fun effect(): SelectorDefinition
}
interface SelectorDefinition {
fun steps(): Set<StepDefinition>
}
interface StepDefinition {
fun type(): String // e.g. css or xpath
fun value(): String
}
}
}
}

View File

@@ -0,0 +1,56 @@
package com.rak.controller
import com.rak.config.SourcesConfiguration
import com.rak.service.ScrapeService
import io.quarkus.logging.Log
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.Produces
import jakarta.ws.rs.core.MediaType
import org.jboss.resteasy.reactive.RestQuery
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
@Path("/hello")
class ExampleResource(
private val sourcesConfiguration: SourcesConfiguration,
private val scrapeService: ScrapeService
) {
companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("text\\(\\)$")
}
@GET
@Produces(MediaType.TEXT_PLAIN)
fun hello(
@RestQuery
provider: String,
@RestQuery
path: String
): String {
val sources = sourcesConfiguration
.sources()
.filter {
it.id().equals(provider, ignoreCase = true)
}
val source = sources.firstOrNull() ?: throw IllegalArgumentException("Provider $provider not found")
val newPath: String = path
.trim()
.replace(" ", "_")
Log.info(newPath)
val doc: Document = Jsoup.connect("https://${source.domain()}/$newPath").get()
val regionalSetSelector = source.selectors().regionalSet().get()
val regionalSetRoot = doc.selectFirst(regionalSetSelector.root().get())!!
return scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().id().steps()) ?: "whoomp whoomp"
}
}

View File

@@ -0,0 +1,4 @@
package com.rak.model.scrape
abstract class AbstractScraper{
}

View File

@@ -0,0 +1,6 @@
package com.rak.model.scrape
class JsoupScraper : AbstractScraper() {
}

View File

@@ -0,0 +1,5 @@
package com.rak.model.scrape
data class ScrapeJob(
val url: String,
)

View File

@@ -0,0 +1,6 @@
package com.rak.model.scrape.selector
enum class Selector {
CSS,
XPATH
}

View File

@@ -0,0 +1,29 @@
package com.rak.service
import jakarta.ws.rs.GET
import jakarta.ws.rs.Path
import jakarta.ws.rs.QueryParam
import org.eclipse.microprofile.rest.client.inject.RegisterRestClient
/**
* To use it via injection.
*
* ```kotlin
* @Inject
* @RestClient
* lateinit var myRemoteService: MyRemoteService
*
* fun doSomething() {
* val restClientExtensions = myRemoteService.getExtensionsById("io.quarkus:quarkus-rest-client")
* }
* ```
*/
@RegisterRestClient(baseUri = "https://stage.code.quarkus.io/api")
interface MyRemoteService {
@GET
@Path("/extensions")
fun getExtensionsById(@QueryParam("id") id: String): Set<Extension>
data class Extension(val id: String, val name: String, val shortName: String, val keywords: List<String>)
}

View File

@@ -0,0 +1,68 @@
package com.rak.service
import com.rak.config.SourcesConfiguration
import jakarta.enterprise.context.ApplicationScoped
import org.jsoup.nodes.Element
import org.jsoup.nodes.TextNode
import java.util.concurrent.LinkedBlockingQueue
@ApplicationScoped
class ScrapeService {
companion object {
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
private fun evaluateXpath(element: Element, xpath: String): Element? {
return element.selectXpath(xpath).first()
}
private fun evaluateCssSelector(element: Element, cssSelector: String): Element? {
return null
}
// XPath
// - text()
// - last step (default to text())
// CSS
// - last step???
private fun untilText(): String? {
return null
}
}
fun extractTextFromRootBySteps(
root: Element,
steps: Set<SourcesConfiguration.SourceConfig.Selectors.StepDefinition>
): String? {
var currentElement: Element? = root.clone()
val stepsAsQueue = LinkedBlockingQueue(
steps
)
while (stepsAsQueue.isNotEmpty()) {
val step = stepsAsQueue.take()
val stepTargetsTextNode: Boolean = TEXT_NODE_MATCHER.matches(step.value())
if (currentElement == null) {
return null
}
currentElement = if (step.type() == "xpath") {
if (stepTargetsTextNode) {
return currentElement.selectXpath(step.value(), TextNode::class.java).first().text()
}
else {
currentElement.selectXpath(step.value()).first()
}
}
else {
currentElement.selectFirst(step.value())
}
}
return null
}
}

View File

@@ -0,0 +1 @@
quarkus.config.locations=sources.yml

View File

@@ -0,0 +1,42 @@
scraper:
sources:
- id: konami-official
name: "Konami Official Database"
domain: "yugioh-card.com"
url-patterns:
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
selectors:
card:
name:
steps:
- type: "css"
value: "h1.product-title"
- type: "xpath"
value: "//h1[@itemprop='name']"
attack:
steps:
- type: "css"
value: ".atk-value"
- id: ygo-fandom
name: "Yu-Gi-Oh Fandom Wiki"
domain: "yugioh.fandom.com"
url-patterns:
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
selectors:
regional-set:
root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
id:
steps:
- type: "xpath"
value: "//li/text()"
language:
steps:
- type: "xpath"
value: "//li/abbr/@title"
- type: "xpath"
value: "//abbr/@title"
region-key:
steps:
- type: "xpath"
value: "//li/abbr/text()"

View File

@@ -0,0 +1,6 @@
package com.rak
import io.quarkus.test.junit.QuarkusIntegrationTest
@QuarkusIntegrationTest
class ExampleResourceIT : ExampleResourceTest()

View File

@@ -0,0 +1,20 @@
package com.rak
import io.quarkus.test.junit.QuarkusTest
import io.restassured.RestAssured.given
import org.hamcrest.CoreMatchers.`is`
import org.junit.jupiter.api.Test
@QuarkusTest
class ExampleResourceTest {
@Test
fun testHelloEndpoint() {
given()
.`when`().get("/hello")
.then()
.statusCode(200)
.body(`is`("Hello from Quarkus REST"))
}
}