commit ac35d7f8d915468659e4e0d0f21586432f60f54c Author: rak Date: Sun May 25 18:58:10 2025 +0200 Init diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..4361d2f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +* +!build/*-runner +!build/*-runner.jar +!build/lib/* +!build/quarkus-app/* \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba4fbcc --- /dev/null +++ b/.gitignore @@ -0,0 +1,41 @@ +# Gradle +.gradle/ +build/ + +# Eclipse +.project +.classpath +.settings/ +bin/ + +# IntelliJ +.idea +*.ipr +*.iml +*.iws + +# NetBeans +nb-configuration.xml + +# Visual Studio Code +.vscode +.factorypath + +# OSX +.DS_Store + +# Vim +*.swp +*.swo + +# patch +*.orig +*.rej + +# Local environment +.env + +# Plugin directory +/.quarkus/cli/plugins/ +# TLS Certificates +.certs/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..22a1860 --- /dev/null +++ b/README.md @@ -0,0 +1,78 @@ +# jsoup-scraper + +This project uses Quarkus, the Supersonic Subatomic Java Framework. + +If you want to learn more about Quarkus, please visit its website: https://quarkus.io/. + +## Running the application in dev mode + +You can run your application in dev mode that enables live coding using: + +```shell script +./gradlew quarkusDev +``` + +> **_NOTE:_** Quarkus now ships with a Dev UI, which is available in dev mode only at http://localhost:8080/q/dev/. + +## Packaging and running the application + +The application can be packaged using: + +```shell script +./gradlew build +``` + +It produces the `quarkus-run.jar` file in the `build/quarkus-app/` directory. +Be aware that it’s not an _über-jar_ as the dependencies are copied into the `build/quarkus-app/lib/` directory. + +The application is now runnable using `java -jar build/quarkus-app/quarkus-run.jar`. 
+ +If you want to build an _über-jar_, execute the following command: + +```shell script +./gradlew build -Dquarkus.package.jar.type=uber-jar +``` + +The application, packaged as an _über-jar_, is now runnable using `java -jar build/*-runner.jar`. + +## Creating a native executable + +You can create a native executable using: + +```shell script +./gradlew build -Dquarkus.native.enabled=true +``` + +Or, if you don't have GraalVM installed, you can run the native executable build in a container using: + +```shell script +./gradlew build -Dquarkus.native.enabled=true -Dquarkus.native.container-build=true +``` + +You can then execute your native executable with: `./build/jsoup-scraper-1.0-SNAPSHOT-runner` + +If you want to learn more about building native executables, please consult https://quarkus.io/guides/gradle-tooling. + +## Related Guides + +- REST ([guide](https://quarkus.io/guides/rest)): A Jakarta REST implementation utilizing build time processing and + Vert.x. This extension is not compatible with the quarkus-resteasy extension, or any of the extensions that depend on + it. +- REST Client ([guide](https://quarkus.io/guides/rest-client)): Call REST services +- REST Jackson ([guide](https://quarkus.io/guides/rest#json-serialisation)): Jackson serialization support for Quarkus + REST. 
This extension is not compatible with the quarkus-resteasy extension, or any of the extensions that depend on it +- Kotlin ([guide](https://quarkus.io/guides/kotlin)): Write your services in Kotlin + +## Provided Code + +### REST Client + +Invoke different services through REST with JSON + +[Related guide section...](https://quarkus.io/guides/rest-client) + +### REST + +Easily start your REST Web Services + +[Related guide section...](https://quarkus.io/guides/getting-started-reactive#reactive-jax-rs-resources) diff --git a/build.gradle.kts b/build.gradle.kts new file mode 100644 index 0000000..38e29b5 --- /dev/null +++ b/build.gradle.kts @@ -0,0 +1,55 @@ +plugins { + kotlin("jvm") version "2.1.20" + kotlin("plugin.allopen") version "2.1.20" + id("io.quarkus") +} + +repositories { + mavenCentral() + mavenLocal() +} + +val quarkusPlatformGroupId: String by project +val quarkusPlatformArtifactId: String by project +val quarkusPlatformVersion: String by project + +dependencies { + implementation("io.quarkus:quarkus-config-yaml") + implementation(enforcedPlatform("${quarkusPlatformGroupId}:${quarkusPlatformArtifactId}:${quarkusPlatformVersion}")) + implementation("io.quarkus:quarkus-rest") + implementation("io.quarkus:quarkus-rest-client-jackson") + implementation("io.quarkus:quarkus-rest-client") + implementation("io.quarkus:quarkus-rest-client-kotlin-serialization") + implementation("io.quarkus:quarkus-rest-jackson") + implementation("io.quarkus:quarkus-kotlin") + implementation("org.jetbrains.kotlin:kotlin-stdlib-jdk8") + implementation("org.jsoup:jsoup:1.20.1") + implementation("io.quarkus:quarkus-arc") + testImplementation("io.quarkus:quarkus-junit5") + testImplementation("io.rest-assured:rest-assured") +} + +group = "com.rak" +version = "1.0-SNAPSHOT" + +java { + sourceCompatibility = JavaVersion.VERSION_21 + targetCompatibility = JavaVersion.VERSION_21 +} + +tasks.withType { + systemProperty("java.util.logging.manager", "org.jboss.logmanager.LogManager") +} 
+allOpen { + annotation("jakarta.ws.rs.Path") + annotation("jakarta.enterprise.context.ApplicationScoped") + annotation("jakarta.persistence.Entity") + annotation("io.quarkus.test.junit.QuarkusTest") +} + +kotlin { + compilerOptions { + jvmTarget = org.jetbrains.kotlin.gradle.dsl.JvmTarget.JVM_21 + javaParameters = true + } +} diff --git a/design.md b/design.md new file mode 100644 index 0000000..85065d5 --- /dev/null +++ b/design.md @@ -0,0 +1,8 @@ +# DEX Scraper + +--- + +## Roadmap +- Static sites + Jsoup +- CommonCrawl integration +- Explore Playwright \ No newline at end of file diff --git a/gradle.properties b/gradle.properties new file mode 100644 index 0000000..6c96d73 --- /dev/null +++ b/gradle.properties @@ -0,0 +1,8 @@ +# Gradle properties + +# Gradle properties +quarkusPluginId=io.quarkus +quarkusPluginVersion=3.22.3 +quarkusPlatformGroupId=io.quarkus.platform +quarkusPlatformArtifactId=quarkus-bom +quarkusPlatformVersion=3.22.3 diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000..62d4c05 Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000..2733ed5 --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,5 @@ +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-8.13-bin.zip +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew new file mode 100755 index 0000000..fbd7c51 --- /dev/null +++ b/gradlew @@ -0,0 +1,185 @@ +#!/usr/bin/env sh + +# +# Copyright 2015 the original author or authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! 
-x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? -ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin or MSYS, switch paths to Windows format before running java +if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c 
"^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=`expr $i + 1` + done + case $i in + 0) set -- ;; + 1) set -- "$args0" ;; + 2) set -- "$args0" "$args1" ;; + 3) set -- "$args0" "$args1" "$args2" ;; + 4) set -- "$args0" "$args1" "$args2" "$args3" ;; + 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; + esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=`save "$@"` + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100644 index 0000000..5093609 --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,104 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. +@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. 
+set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/settings.gradle.kts b/settings.gradle.kts new file mode 100644 index 0000000..193fce4 --- /dev/null +++ b/settings.gradle.kts @@ -0,0 +1,13 @@ +pluginManagement { + val quarkusPluginVersion: String by settings + val quarkusPluginId: String by settings + repositories { + mavenCentral() + gradlePluginPortal() + mavenLocal() + } + plugins { + id(quarkusPluginId) version quarkusPluginVersion + } +} +rootProject.name = "jsoup-scraper" diff --git a/src/main/docker/Dockerfile.jvm b/src/main/docker/Dockerfile.jvm new file mode 100644 index 0000000..4d1ff69 --- /dev/null +++ b/src/main/docker/Dockerfile.jvm @@ -0,0 +1,98 @@ +#### +# This Dockerfile is used in order to build a container that runs the Quarkus application in JVM mode +# +# Before building the container image run: +# +# ./gradlew build +# +# Then, build the image with: +# +# docker build -f src/main/docker/Dockerfile.jvm -t quarkus/jsoup-scraper-jvm . +# +# Then run the container using: +# +# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper-jvm +# +# If you want to include the debug port into your docker image +# you will have to expose the debug port (default 5005 being the default) like this : EXPOSE 8080 5005. 
+# Additionally you will have to set -e JAVA_DEBUG=true and -e JAVA_DEBUG_PORT=*:5005 +# when running the container +# +# Then run the container using : +# +# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper-jvm +# +# This image uses the `run-java.sh` script to run the application. +# This scripts computes the command line to execute your Java application, and +# includes memory/GC tuning. +# You can configure the behavior using the following environment properties: +# - JAVA_OPTS: JVM options passed to the `java` command (example: "-verbose:class") - Be aware that this will override +# the default JVM options, use `JAVA_OPTS_APPEND` to append options +# - JAVA_OPTS_APPEND: User specified Java options to be appended to generated options +# in JAVA_OPTS (example: "-Dsome.property=foo") +# - JAVA_MAX_MEM_RATIO: Is used when no `-Xmx` option is given in JAVA_OPTS. This is +# used to calculate a default maximal heap memory based on a containers restriction. +# If used in a container without any memory constraints for the container then this +# option has no effect. If there is a memory constraint then `-Xmx` is set to a ratio +# of the container available memory as set here. The default is `50` which means 50% +# of the available memory is used as an upper boundary. You can skip this mechanism by +# setting this value to `0` in which case no `-Xmx` option is added. +# - JAVA_INITIAL_MEM_RATIO: Is used when no `-Xms` option is given in JAVA_OPTS. This +# is used to calculate a default initial heap memory based on the maximum heap memory. +# If used in a container without any memory constraints for the container then this +# option has no effect. If there is a memory constraint then `-Xms` is set to a ratio +# of the `-Xmx` memory as set here. The default is `25` which means 25% of the `-Xmx` +# is used as the initial heap size. 
You can skip this mechanism by setting this value +# to `0` in which case no `-Xms` option is added (example: "25") +# - JAVA_MAX_INITIAL_MEM: Is used when no `-Xms` option is given in JAVA_OPTS. +# This is used to calculate the maximum value of the initial heap memory. If used in +# a container without any memory constraints for the container then this option has +# no effect. If there is a memory constraint then `-Xms` is limited to the value set +# here. The default is 4096MB which means the calculated value of `-Xms` never will +# be greater than 4096MB. The value of this variable is expressed in MB (example: "4096") +# - JAVA_DIAGNOSTICS: Set this to get some diagnostics information to standard output +# when things are happening. This option, if set to true, will set +# `-XX:+UnlockDiagnosticVMOptions`. Disabled by default (example: "true"). +# - JAVA_DEBUG: If set remote debugging will be switched on. Disabled by default (example: +# true"). +# - JAVA_DEBUG_PORT: Port used for remote debugging. Defaults to 5005 (example: "8787"). +# - CONTAINER_CORE_LIMIT: A calculated core limit as described in +# https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt. (example: "2") +# - CONTAINER_MAX_MEMORY: Memory limit given to the container (example: "1024"). +# - GC_MIN_HEAP_FREE_RATIO: Minimum percentage of heap free after GC to avoid expansion. +# (example: "20") +# - GC_MAX_HEAP_FREE_RATIO: Maximum percentage of heap free after GC to avoid shrinking. +# (example: "40") +# - GC_TIME_RATIO: Specifies the ratio of the time spent outside the garbage collection. +# (example: "4") +# - GC_ADAPTIVE_SIZE_POLICY_WEIGHT: The weighting given to the current GC time versus +# previous GC times. (example: "90") +# - GC_METASPACE_SIZE: The initial metaspace size. (example: "20") +# - GC_MAX_METASPACE_SIZE: The maximum metaspace size. (example: "100") +# - GC_CONTAINER_OPTIONS: Specify Java GC to use. 
The value of this variable should +# contain the necessary JRE command-line options to specify the required GC, which +# will override the default of `-XX:+UseParallelGC` (example: -XX:+UseG1GC). +# - HTTPS_PROXY: The location of the https proxy. (example: "myuser@127.0.0.1:8080") +# - HTTP_PROXY: The location of the http proxy. (example: "myuser@127.0.0.1:8080") +# - NO_PROXY: A comma separated lists of hosts, IP addresses or domains that can be +# accessed directly. (example: "foo.example.com,bar.example.com") +# +### +FROM registry.access.redhat.com/ubi9/openjdk-21:1.21 + +ENV LANGUAGE='en_US:en' + + +# We make four distinct layers so if there are application changes the library layers can be re-used +COPY --chown=185 build/quarkus-app/lib/ /deployments/lib/ +COPY --chown=185 build/quarkus-app/*.jar /deployments/ +COPY --chown=185 build/quarkus-app/app/ /deployments/app/ +COPY --chown=185 build/quarkus-app/quarkus/ /deployments/quarkus/ + +EXPOSE 8080 +USER 185 +ENV JAVA_OPTS_APPEND="-Dquarkus.http.host=0.0.0.0 -Djava.util.logging.manager=org.jboss.logmanager.LogManager" +ENV JAVA_APP_JAR="/deployments/quarkus-run.jar" + +ENTRYPOINT [ "/opt/jboss/container/java/run/run-java.sh" ] + diff --git a/src/main/docker/Dockerfile.legacy-jar b/src/main/docker/Dockerfile.legacy-jar new file mode 100644 index 0000000..2e682f8 --- /dev/null +++ b/src/main/docker/Dockerfile.legacy-jar @@ -0,0 +1,94 @@ +#### +# This Dockerfile is used in order to build a container that runs the Quarkus application in JVM mode +# +# Before building the container image run: +# +# ./gradlew build -Dquarkus.package.jar.type=legacy-jar +# +# Then, build the image with: +# +# docker build -f src/main/docker/Dockerfile.legacy-jar -t quarkus/jsoup-scraper-legacy-jar . 
+# +# Then run the container using: +# +# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper-legacy-jar +# +# If you want to include the debug port into your docker image +# you will have to expose the debug port (default 5005 being the default) like this : EXPOSE 8080 5005. +# Additionally you will have to set -e JAVA_DEBUG=true and -e JAVA_DEBUG_PORT=*:5005 +# when running the container +# +# Then run the container using : +# +# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper-legacy-jar +# +# This image uses the `run-java.sh` script to run the application. +# This scripts computes the command line to execute your Java application, and +# includes memory/GC tuning. +# You can configure the behavior using the following environment properties: +# - JAVA_OPTS: JVM options passed to the `java` command (example: "-verbose:class") - Be aware that this will override +# the default JVM options, use `JAVA_OPTS_APPEND` to append options +# - JAVA_OPTS_APPEND: User specified Java options to be appended to generated options +# in JAVA_OPTS (example: "-Dsome.property=foo") +# - JAVA_MAX_MEM_RATIO: Is used when no `-Xmx` option is given in JAVA_OPTS. This is +# used to calculate a default maximal heap memory based on a containers restriction. +# If used in a container without any memory constraints for the container then this +# option has no effect. If there is a memory constraint then `-Xmx` is set to a ratio +# of the container available memory as set here. The default is `50` which means 50% +# of the available memory is used as an upper boundary. You can skip this mechanism by +# setting this value to `0` in which case no `-Xmx` option is added. +# - JAVA_INITIAL_MEM_RATIO: Is used when no `-Xms` option is given in JAVA_OPTS. This +# is used to calculate a default initial heap memory based on the maximum heap memory. +# If used in a container without any memory constraints for the container then this +# option has no effect. 
If there is a memory constraint then `-Xms` is set to a ratio +# of the `-Xmx` memory as set here. The default is `25` which means 25% of the `-Xmx` +# is used as the initial heap size. You can skip this mechanism by setting this value +# to `0` in which case no `-Xms` option is added (example: "25") +# - JAVA_MAX_INITIAL_MEM: Is used when no `-Xms` option is given in JAVA_OPTS. +# This is used to calculate the maximum value of the initial heap memory. If used in +# a container without any memory constraints for the container then this option has +# no effect. If there is a memory constraint then `-Xms` is limited to the value set +# here. The default is 4096MB which means the calculated value of `-Xms` never will +# be greater than 4096MB. The value of this variable is expressed in MB (example: "4096") +# - JAVA_DIAGNOSTICS: Set this to get some diagnostics information to standard output +# when things are happening. This option, if set to true, will set +# `-XX:+UnlockDiagnosticVMOptions`. Disabled by default (example: "true"). +# - JAVA_DEBUG: If set remote debugging will be switched on. Disabled by default (example: +# true"). +# - JAVA_DEBUG_PORT: Port used for remote debugging. Defaults to 5005 (example: "8787"). +# - CONTAINER_CORE_LIMIT: A calculated core limit as described in +# https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt. (example: "2") +# - CONTAINER_MAX_MEMORY: Memory limit given to the container (example: "1024"). +# - GC_MIN_HEAP_FREE_RATIO: Minimum percentage of heap free after GC to avoid expansion. +# (example: "20") +# - GC_MAX_HEAP_FREE_RATIO: Maximum percentage of heap free after GC to avoid shrinking. +# (example: "40") +# - GC_TIME_RATIO: Specifies the ratio of the time spent outside the garbage collection. +# (example: "4") +# - GC_ADAPTIVE_SIZE_POLICY_WEIGHT: The weighting given to the current GC time versus +# previous GC times. (example: "90") +# - GC_METASPACE_SIZE: The initial metaspace size. 
(example: "20") +# - GC_MAX_METASPACE_SIZE: The maximum metaspace size. (example: "100") +# - GC_CONTAINER_OPTIONS: Specify Java GC to use. The value of this variable should +# contain the necessary JRE command-line options to specify the required GC, which +# will override the default of `-XX:+UseParallelGC` (example: -XX:+UseG1GC). +# - HTTPS_PROXY: The location of the https proxy. (example: "myuser@127.0.0.1:8080") +# - HTTP_PROXY: The location of the http proxy. (example: "myuser@127.0.0.1:8080") +# - NO_PROXY: A comma separated lists of hosts, IP addresses or domains that can be +# accessed directly. (example: "foo.example.com,bar.example.com") +# +### +FROM registry.access.redhat.com/ubi9/openjdk-21:1.21 + +ENV LANGUAGE='en_US:en' + + +COPY build/lib/* /deployments/lib/ +COPY build/*-runner.jar /deployments/quarkus-run.jar + +EXPOSE 8080 +USER 185 +ENV JAVA_OPTS_APPEND="-Dquarkus.http.host=0.0.0.0 -Djava.util.logging.manager=org.jboss.logmanager.LogManager" +ENV JAVA_APP_JAR="/deployments/quarkus-run.jar" + +ENTRYPOINT [ "/opt/jboss/container/java/run/run-java.sh" ] diff --git a/src/main/docker/Dockerfile.native b/src/main/docker/Dockerfile.native new file mode 100644 index 0000000..2155783 --- /dev/null +++ b/src/main/docker/Dockerfile.native @@ -0,0 +1,29 @@ +#### +# This Dockerfile is used in order to build a container that runs the Quarkus application in native (no JVM) mode. +# +# Before building the container image run: +# +# ./gradlew build -Dquarkus.native.enabled=true +# +# Then, build the image with: +# +# docker build -f src/main/docker/Dockerfile.native -t quarkus/jsoup-scraper . +# +# Then run the container using: +# +# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper +# +# The `registry.access.redhat.com/ubi8/ubi-minimal:8.10` base image is based on UBI 8. +# To use UBI 9, switch to `registry.access.redhat.com/ubi9/ubi-minimal:9.5`. 
+### +FROM registry.access.redhat.com/ubi8/ubi-minimal:8.10 +WORKDIR /work/ +RUN chown 1001 /work \ + && chmod "g+rwX" /work \ + && chown 1001:root /work +COPY --chown=1001:root --chmod=0755 build/*-runner /work/application + +EXPOSE 8080 +USER 1001 + +ENTRYPOINT ["./application", "-Dquarkus.http.host=0.0.0.0"] diff --git a/src/main/docker/Dockerfile.native-micro b/src/main/docker/Dockerfile.native-micro new file mode 100644 index 0000000..dc8a88e --- /dev/null +++ b/src/main/docker/Dockerfile.native-micro @@ -0,0 +1,32 @@ +#### +# This Dockerfile is used in order to build a container that runs the Quarkus application in native (no JVM) mode. +# It uses a micro base image, tuned for Quarkus native executables. +# It reduces the size of the resulting container image. +# Check https://quarkus.io/guides/quarkus-runtime-base-image for further information about this image. +# +# Before building the container image run: +# +# ./gradlew build -Dquarkus.native.enabled=true +# +# Then, build the image with: +# +# docker build -f src/main/docker/Dockerfile.native-micro -t quarkus/jsoup-scraper . +# +# Then run the container using: +# +# docker run -i --rm -p 8080:8080 quarkus/jsoup-scraper +# +# The `quay.io/quarkus/quarkus-micro-image:2.0` base image is based on UBI 8. +# To use UBI 9, switch to `quay.io/quarkus/ubi9-quarkus-micro-image:2.0`. 
+###
+FROM quay.io/quarkus/quarkus-micro-image:2.0
+WORKDIR /work/
+RUN chown 1001 /work \
+    && chmod "g+rwX" /work \
+    && chown 1001:root /work
+COPY --chown=1001:root --chmod=0755 build/*-runner /work/application
+
+EXPOSE 8080
+USER 1001
+
+ENTRYPOINT ["./application", "-Dquarkus.http.host=0.0.0.0"]
diff --git a/src/main/kotlin/com/rak/config/SourcesConfiguration.kt b/src/main/kotlin/com/rak/config/SourcesConfiguration.kt
new file mode 100644
index 0000000..9d1767d
--- /dev/null
+++ b/src/main/kotlin/com/rak/config/SourcesConfiguration.kt
@@ -0,0 +1,65 @@
+package com.rak.config
+
+import io.smallrye.config.ConfigMapping
+import java.util.*
+
+
+/**
+ * Typed view of the configuration found under the `scraper` prefix.
+ */
+@ConfigMapping(prefix = "scraper")
+interface SourcesConfiguration {
+
+    /** Every configured scrape source. */
+    fun sources(): MutableList<SourceConfig>
+
+    /** A single source: identity, domain, optional URL patterns and selectors. */
+    interface SourceConfig {
+        fun id(): String
+        fun name(): String
+        fun domain(): String
+        fun urlPatterns(): Optional<List<String>>
+        fun selectors(): Selectors
+
+        /** Selector groups per model; each group is optional for a source. */
+        interface Selectors {
+            fun card(): Optional<CardDefinition>
+            fun regionalSet(): Optional<RegionalSetDefinition>
+
+            /** Base for model definitions that may declare a root selector. */
+            interface AbstractModelDefinition {
+                fun root(): Optional<String>
+            }
+
+            /** Selectors for the fields of a regional set. */
+            interface RegionalSetDefinition : AbstractModelDefinition {
+                fun id(): SelectorDefinition
+                fun language(): SelectorDefinition
+                fun regionKey(): SelectorDefinition
+            }
+
+            /** Selectors for the fields of a card. */
+            interface CardDefinition {
+                fun name(): SelectorDefinition
+                fun attack(): SelectorDefinition
+                fun effect(): SelectorDefinition
+            }
+
+            /**
+             * The steps that lead to one value.
+             * NOTE(review): steps are consumed in iteration order, which a Set does not promise;
+             * confirm the configured order is preserved.
+             */
+            interface SelectorDefinition {
+                fun steps(): Set<StepDefinition>
+            }
+
+            /** One selector expression and the selector language it uses. */
+            interface StepDefinition {
+                fun type(): String // e.g. css or xpath
+                fun value(): String
+            }
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/kotlin/com/rak/controller/ExampleResource.kt b/src/main/kotlin/com/rak/controller/ExampleResource.kt
new file mode 100644
index 0000000..59d91ad
--- /dev/null
+++ b/src/main/kotlin/com/rak/controller/ExampleResource.kt
@@ -0,0 +1,69 @@
+package com.rak.controller
+
+import com.rak.config.SourcesConfiguration
+import com.rak.service.ScrapeService
+import io.quarkus.logging.Log
+import jakarta.ws.rs.GET
+import jakarta.ws.rs.Path
+import jakarta.ws.rs.Produces
+import jakarta.ws.rs.core.MediaType
+import org.jboss.resteasy.reactive.RestQuery
+import org.jsoup.Jsoup
+import org.jsoup.nodes.Document
+import org.jsoup.nodes.Element
+import org.jsoup.nodes.TextNode
+
+
+/**
+ * Demo endpoint that scrapes the regional-set id from a page of a configured source.
+ */
+@Path("/hello")
+class ExampleResource(
+    private val sourcesConfiguration: SourcesConfiguration,
+    private val scrapeService: ScrapeService
+) {
+
+    /**
+     * Downloads `https://<source domain>/<path>` and returns the text that the source's
+     * regional-set `id` steps resolve to, starting at the regional-set root element.
+     *
+     * @param provider id of a configured source, matched case-insensitively
+     * @param path page path; surrounding whitespace is trimmed and spaces become underscores
+     * @return the extracted text, or `whoomp whoomp` when the steps resolve to nothing
+     * @throws IllegalArgumentException if no source is configured for [provider]
+     * @throws IllegalStateException if the source lacks a regional-set definition or root
+     * selector, or the root selector matches nothing in the page
+     */
+    @GET
+    @Produces(MediaType.TEXT_PLAIN)
+    fun hello(
+        @RestQuery
+        provider: String,
+        @RestQuery
+        path: String
+    ): String {
+        val source = sourcesConfiguration
+            .sources()
+            .firstOrNull { it.id().equals(provider, ignoreCase = true) }
+            ?: throw IllegalArgumentException("Provider $provider not found")
+
+        val newPath: String = path
+            .trim()
+            .replace(" ", "_")
+
+        Log.info(newPath)
+
+        val doc: Document = Jsoup.connect("https://${source.domain()}/$newPath").get()
+
+        // Fail with a descriptive message instead of a bare Optional.get()/!! crash.
+        val regionalSetSelector = source.selectors().regionalSet()
+            .orElseThrow { IllegalStateException("Provider $provider has no regional-set selectors") }
+        val rootSelector = regionalSetSelector.root()
+            .orElseThrow { IllegalStateException("Provider $provider has no regional-set root selector") }
+        val regionalSetRoot = doc.selectFirst(rootSelector)
+            ?: throw IllegalStateException("Root selector '$rootSelector' matched nothing in /$newPath")
+
+        return scrapeService.extractTextFromRootBySteps(regionalSetRoot, regionalSetSelector.id().steps())
+            ?: "whoomp whoomp"
+    }
+}
\ No newline at end of file
diff --git a/src/main/kotlin/com/rak/model/scrape/AbstractScraper.kt b/src/main/kotlin/com/rak/model/scrape/AbstractScraper.kt
new file mode 100644
index 0000000..9f4fc12
--- /dev/null
+++ b/src/main/kotlin/com/rak/model/scrape/AbstractScraper.kt
@@ -0,0 +1,5 @@
+package com.rak.model.scrape
+
+/** Base type for scraper implementations; it declares no members yet. */
+abstract class AbstractScraper{
+}
\ No newline at end of file
diff --git a/src/main/kotlin/com/rak/model/scrape/JsoupScraper.kt b/src/main/kotlin/com/rak/model/scrape/JsoupScraper.kt
new file mode 100644
index 0000000..9cf4df6
--- /dev/null
+++ b/src/main/kotlin/com/rak/model/scrape/JsoupScraper.kt
@@ -0,0 +1,7 @@
+package com.rak.model.scrape
+
+/** Empty [AbstractScraper] subclass; presumably the jsoup-backed implementation (TODO confirm). */
+class JsoupScraper : AbstractScraper() {
+
+
+}
\ No newline at end of file
diff --git a/src/main/kotlin/com/rak/model/scrape/ScrapeJob.kt b/src/main/kotlin/com/rak/model/scrape/ScrapeJob.kt
new file mode 100644
index 0000000..51aa56f
--- /dev/null
+++ b/src/main/kotlin/com/rak/model/scrape/ScrapeJob.kt
@@ -0,0 +1,6 @@
+package com.rak.model.scrape
+
+/** Value object carrying the [url] of a scrape job. */
+data class ScrapeJob(
+    val url: String,
+)
\ No newline at end of file
diff --git a/src/main/kotlin/com/rak/model/scrape/selector/Selector.kt b/src/main/kotlin/com/rak/model/scrape/selector/Selector.kt
new file mode 100644
index 0000000..5a11624
--- /dev/null
+++ b/src/main/kotlin/com/rak/model/scrape/selector/Selector.kt
@@ -0,0 +1,7 @@
+package com.rak.model.scrape.selector
+
+/** The selector languages: CSS and XPath. */
+enum class Selector {
+    CSS,
+    XPATH
+}
\ No newline at end of file
diff --git a/src/main/kotlin/com/rak/service/MyRemoteService.kt b/src/main/kotlin/com/rak/service/MyRemoteService.kt
new file mode 100644
index 0000000..c79aae0
--- /dev/null
+++ b/src/main/kotlin/com/rak/service/MyRemoteService.kt
@@ -0,0 +1,29 @@
+package com.rak.service
+
+import jakarta.ws.rs.GET
+import jakarta.ws.rs.Path
+import jakarta.ws.rs.QueryParam
+import
org.eclipse.microprofile.rest.client.inject.RegisterRestClient
+
+/**
+ * To use it via injection.
+ *
+ * ```kotlin
+ * @Inject
+ * @RestClient
+ * lateinit var myRemoteService: MyRemoteService
+ *
+ * fun doSomething() {
+ *     val restClientExtensions = myRemoteService.getExtensionsById("io.quarkus:quarkus-rest-client")
+ * }
+ * ```
+ */
+@RegisterRestClient(baseUri = "https://stage.code.quarkus.io/api")
+interface MyRemoteService {
+
+    @GET
+    @Path("/extensions")
+    fun getExtensionsById(@QueryParam("id") id: String): Set<Extension>
+
+    data class Extension(val id: String, val name: String, val shortName: String, val keywords: List<String>)
+}
\ No newline at end of file
diff --git a/src/main/kotlin/com/rak/service/ScrapeService.kt b/src/main/kotlin/com/rak/service/ScrapeService.kt
new file mode 100644
index 0000000..6b3192f
--- /dev/null
+++ b/src/main/kotlin/com/rak/service/ScrapeService.kt
@@ -0,0 +1,77 @@
+package com.rak.service
+
+import com.rak.config.SourcesConfiguration
+import jakarta.enterprise.context.ApplicationScoped
+import org.jsoup.nodes.Element
+import org.jsoup.nodes.TextNode
+import java.util.concurrent.LinkedBlockingQueue
+
+/**
+ * Resolves configured selector steps against jsoup elements.
+ */
+@ApplicationScoped
+class ScrapeService {
+
+    companion object {
+        /** Matches expressions that end in `text()`, i.e. that target a text node. */
+        private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
+
+        /** Returns the first element matched by [xpath] starting at [element], or `null`. */
+        private fun evaluateXpath(element: Element, xpath: String): Element? {
+            return element.selectXpath(xpath).first()
+        }
+
+        /** Returns the first element matched by [cssSelector] starting at [element], or `null`. */
+        private fun evaluateCssSelector(element: Element, cssSelector: String): Element? {
+            return element.selectFirst(cssSelector)
+        }
+    }
+
+
+    /**
+     * Applies [steps] one after another, starting from a copy of [root], and returns the text
+     * the chain ends at.
+     *
+     * An XPath step ending in `text()` returns the first matching text node right away. When
+     * the steps run out on an element, that element's text is returned.
+     *
+     * @param root element the first step is evaluated against; it is not modified
+     * @param steps selector steps of type `xpath`; any other type is treated as a CSS selector
+     * @return the resolved text, or `null` when [steps] is empty or a step matches nothing
+     */
+    fun extractTextFromRootBySteps(
+        root: Element,
+        steps: Set<SourcesConfiguration.SourceConfig.Selectors.StepDefinition>
+    ): String? {
+        if (steps.isEmpty()) {
+            return null
+        }
+
+        // NOTE(review): the clone is detached from the page; presumably this keeps `//…` XPath
+        // steps scoped to the root subtree. Confirm before removing it.
+        var currentElement: Element = root.clone()
+
+        for (step in steps) {
+            val isXpath = step.type() == "xpath"
+
+            if (isXpath && TEXT_NODE_MATCHER.matches(step.value())) {
+                // firstOrNull: an empty match yields null rather than NoSuchElementException.
+                return currentElement
+                    .selectXpath(step.value(), TextNode::class.java)
+                    .firstOrNull()
+                    ?.text()
+            }
+
+            val nextElement: Element? = if (isXpath) {
+                evaluateXpath(currentElement, step.value())
+            } else {
+                evaluateCssSelector(currentElement, step.value())
+            }
+            currentElement = nextElement ?: return null
+        }
+
+        // Default to the element's text when the last step did not select a text node.
+        return currentElement.text()
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties
new file mode 100644
index 0000000..d794293
--- /dev/null
+++ b/src/main/resources/application.properties
@@ -0,0 +1 @@
+quarkus.config.locations=sources.yml
\ No newline at end of file
diff --git a/src/main/resources/sources.yml b/src/main/resources/sources.yml
new file mode 100644
index 0000000..6f12816
--- /dev/null
+++ b/src/main/resources/sources.yml
@@ -0,0 +1,42 @@
+scraper:
+  sources:
+    - id: konami-official
+      name: "Konami Official Database"
+      domain: "yugioh-card.com"
+      url-patterns:
+        - "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
+      selectors:
+        card:
+          name:
+            steps:
+              - type: "css"
+                value: "h1.product-title"
+              - type: "xpath"
+                value: "//h1[@itemprop='name']"
+          attack:
+            steps:
+              - type: "css"
+                value: ".atk-value"
+
+    - id: ygo-fandom
+      name: "Yu-Gi-Oh Fandom Wiki"
+      domain: "yugioh.fandom.com"
+      url-patterns:
+        - "^https://yugioh\\.fandom\\.com/wiki/.*$"
+      selectors:
+        regional-set:
+          root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
+          id:
+            steps:
+              - type: "xpath"
+                value: "//li/text()"
+          language:
+            steps:
+              - type: "xpath"
+                value: "//li/abbr/@title"
+              - type: "xpath"
+                value: "//abbr/@title"
+          region-key:
+            steps:
+              - type: "xpath"
+                value: "//li/abbr/text()"
\ No newline at end of file
diff --git
a/src/native-test/kotlin/com/rak/ExampleResourceIT.kt b/src/native-test/kotlin/com/rak/ExampleResourceIT.kt
new file mode 100644
index 0000000..2b203ec
--- /dev/null
+++ b/src/native-test/kotlin/com/rak/ExampleResourceIT.kt
@@ -0,0 +1,6 @@
+package com.rak
+
+import io.quarkus.test.junit.QuarkusIntegrationTest
+
+@QuarkusIntegrationTest
+class ExampleResourceIT : ExampleResourceTest()
diff --git a/src/test/kotlin/com/rak/ExampleResourceTest.kt b/src/test/kotlin/com/rak/ExampleResourceTest.kt
new file mode 100644
index 0000000..d363f8e
--- /dev/null
+++ b/src/test/kotlin/com/rak/ExampleResourceTest.kt
@@ -0,0 +1,27 @@
+package com.rak
+
+import io.quarkus.test.junit.QuarkusTest
+import io.restassured.RestAssured.given
+import org.hamcrest.CoreMatchers.`is`
+import org.junit.jupiter.api.Test
+
+@QuarkusTest
+class ExampleResourceTest {
+
+    /**
+     * `/hello` needs a `provider` and a `path` and never answers with the scaffold greeting, so
+     * the generated assertion could not pass. An unknown provider is checked instead because the
+     * lookup fails before any page is fetched, which keeps the test offline.
+     */
+    @Test
+    fun testHelloEndpoint() {
+        given()
+            .queryParam("provider", "no-such-provider")
+            .queryParam("path", "anything")
+            .`when`().get("/hello")
+            .then()
+            // The resource throws IllegalArgumentException, which is not mapped to a 4xx status.
+            .statusCode(500)
+    }
+
+}
\ No newline at end of file