diff --git a/src/main/resources/sources.yml b/sources.yml similarity index 96% rename from src/main/resources/sources.yml rename to sources.yml index 6f12816..45ba8b3 100644 --- a/src/main/resources/sources.yml +++ b/sources.yml @@ -33,7 +33,7 @@ scraper: language: steps: - type: "xpath" - value: "//li/abbr/@title" + value: "//li/abbr" - type: "xpath" value: "//abbr/@title" region-key: diff --git a/src/main/java/com/rak/model/config/TestConverter.java b/src/main/java/com/rak/model/config/TestConverter.java new file mode 100644 index 0000000..0651144 --- /dev/null +++ b/src/main/java/com/rak/model/config/TestConverter.java @@ -0,0 +1,14 @@ +package com.rak.model.config; + +import org.eclipse.microprofile.config.spi.Converter; + +public class TestConverter { + + public static class ActualConverter implements Converter { + @Override + public Attribute convert(final String value) throws IllegalArgumentException, NullPointerException { + return null; + } + } + +} diff --git a/src/main/kotlin/com/rak/config/SourcesConfiguration.kt b/src/main/kotlin/com/rak/config/SourcesConfiguration.kt index 9d1767d..d775966 100644 --- a/src/main/kotlin/com/rak/config/SourcesConfiguration.kt +++ b/src/main/kotlin/com/rak/config/SourcesConfiguration.kt @@ -13,7 +13,7 @@ interface SourcesConfiguration { fun id(): String fun name(): String fun domain(): String - fun urlPatterns(): Optional> + fun urlPatterns(): Optional> fun selectors(): Selectors interface Selectors { diff --git a/src/main/kotlin/com/rak/controller/ExampleResource.kt b/src/main/kotlin/com/rak/controller/ExampleResource.kt index 59d91ad..0b9f060 100644 --- a/src/main/kotlin/com/rak/controller/ExampleResource.kt +++ b/src/main/kotlin/com/rak/controller/ExampleResource.kt @@ -31,7 +31,7 @@ class ExampleResource( provider: String, @RestQuery path: String - ): String { + ): Map { val sources = sourcesConfiguration .sources() .filter { @@ -44,13 +44,19 @@ class ExampleResource( .trim() .replace(" ", "_") - Log.info(newPath) - val doc: Document = Jsoup.connect("https://${source.domain()}/$newPath").get() val regionalSetSelector = source.selectors().regionalSet().get() val regionalSetRoot = doc.selectFirst(regionalSetSelector.root().get())!! - return scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().id().steps()) ?: "whoomp whoomp" + val setId: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().id().steps()) + val setLanguage: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().language().steps()) + val setKey: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().regionKey().steps()) + + return mapOf( + Pair("id", setId ?: "N/A"), + Pair("language", setLanguage ?: "N/A"), + Pair("key", setKey ?: "N/A"), + ) } } \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/XPathTarget.kt b/src/main/kotlin/com/rak/model/XPathTarget.kt new file mode 100644 index 0000000..157e040 --- /dev/null +++ b/src/main/kotlin/com/rak/model/XPathTarget.kt @@ -0,0 +1,7 @@ +package com.rak.model + +enum class XPathTarget { + TEXT, + ATTRIBUTE, + ELEMENT +} \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/config/Attribute.kt b/src/main/kotlin/com/rak/model/config/Attribute.kt new file mode 100644 index 0000000..072e681 --- /dev/null +++ b/src/main/kotlin/com/rak/model/config/Attribute.kt @@ -0,0 +1,5 @@ +package com.rak.model.config + +data class Attribute( + val steps: Set +) \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/config/CardConfigModel.kt b/src/main/kotlin/com/rak/model/config/CardConfigModel.kt new file mode 100644 index 0000000..573bce5 --- /dev/null +++ b/src/main/kotlin/com/rak/model/config/CardConfigModel.kt @@ -0,0 +1,8 @@ +package com.rak.model.config + +data class CardConfigModel( + override val root: String, + val name: Attribute, + val attack: Attribute, + val effect: Attribute, +) : ConfigModel \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/config/ConfigModel.kt b/src/main/kotlin/com/rak/model/config/ConfigModel.kt new file mode 100644 index 0000000..eabe196 --- /dev/null +++ b/src/main/kotlin/com/rak/model/config/ConfigModel.kt @@ -0,0 +1,5 @@ +package com.rak.model.config + +interface ConfigModel { + val root: String +} diff --git a/src/main/kotlin/com/rak/model/config/RegionalSetConfigModel.kt b/src/main/kotlin/com/rak/model/config/RegionalSetConfigModel.kt new file mode 100644 index 0000000..42693d5 --- /dev/null +++ b/src/main/kotlin/com/rak/model/config/RegionalSetConfigModel.kt @@ -0,0 +1,8 @@ +package com.rak.model.config + +data class RegionalSetConfigModel( + override val root: String, + val id: Attribute, + val language: Attribute, + val regionKey: Attribute +) : ConfigModel \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/config/SourceConfiguration.kt b/src/main/kotlin/com/rak/model/config/SourceConfiguration.kt new file mode 100644 index 0000000..d470489 --- /dev/null +++ b/src/main/kotlin/com/rak/model/config/SourceConfiguration.kt @@ -0,0 +1,8 @@ +package com.rak.model.config + +data class SourceConfiguration( + val id: String, + val name: String, + val urlPatterns: Set, + val selectors: Set +) \ No newline at end of file diff --git a/src/main/kotlin/com/rak/model/config/Step.kt b/src/main/kotlin/com/rak/model/config/Step.kt new file mode 100644 index 0000000..679087d --- /dev/null +++ b/src/main/kotlin/com/rak/model/config/Step.kt @@ -0,0 +1,11 @@ +package com.rak.model.config + +data class Step( + val type: Type, + val value: String +) { + enum class Type { + XPATH, + CSS + } +} \ No newline at end of file diff --git a/src/main/kotlin/com/rak/service/ScrapeService.kt b/src/main/kotlin/com/rak/service/ScrapeService.kt index 6b3192f..a7375d3 100644 --- a/src/main/kotlin/com/rak/service/ScrapeService.kt +++ b/src/main/kotlin/com/rak/service/ScrapeService.kt @@ -1,68 +1,40 @@ package com.rak.service import com.rak.config.SourcesConfiguration +import com.rak.model.XPathTarget +import com.rak.util.XPathUtil import jakarta.enterprise.context.ApplicationScoped import org.jsoup.nodes.Element import org.jsoup.nodes.TextNode +import org.jsoup.select.Evaluator import java.util.concurrent.LinkedBlockingQueue @ApplicationScoped class ScrapeService { - companion object { - private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$") - - private fun evaluateXpath(element: Element, xpath: String): Element? { - return element.selectXpath(xpath).first() - } - - private fun evaluateCssSelector(element: Element, cssSelector: String): Element? { - return null - } - // XPath - // - text() - // - last step (default to text()) - // CSS - // - last step??? - private fun untilText(): String? { - - - return null - } - } - - fun extractTextFromRootBySteps( root: Element, steps: Set ): String? { var currentElement: Element? = root.clone() - val stepsAsQueue = LinkedBlockingQueue( - steps - ) - - while (stepsAsQueue.isNotEmpty()) { - val step = stepsAsQueue.take() - val stepTargetsTextNode: Boolean = TEXT_NODE_MATCHER.matches(step.value()) + var result: String? = null + for (index in 0 until steps.size) { + val currentStep = steps.elementAtOrNull(index) ?: return null if (currentElement == null) { - return null + throw IllegalStateException() } - currentElement = if (step.type() == "xpath") { - if (stepTargetsTextNode) { - return currentElement.selectXpath(step.value(), TextNode::class.java).first().text() - } - else { - currentElement.selectXpath(step.value()).first() - } + if (index == steps.size - 1) { + result = XPathUtil.extractResult(currentElement, currentStep.value()) } else { - currentElement.selectFirst(step.value()) + currentElement = XPathUtil.getNextElement(currentElement, currentStep.value()) } } - return null + + return result } } \ No newline at end of file diff --git a/src/main/kotlin/com/rak/service/SourceService.kt b/src/main/kotlin/com/rak/service/SourceService.kt new file mode 100644 index 0000000..870a528 --- /dev/null +++ b/src/main/kotlin/com/rak/service/SourceService.kt @@ -0,0 +1,31 @@ +package com.rak.service + +import com.rak.config.SourcesConfiguration +import com.rak.model.config.SourceConfiguration +import jakarta.annotation.PostConstruct +import jakarta.enterprise.context.ApplicationScoped + +@ApplicationScoped +class SourceService ( + private val sourcesConfiguration: SourcesConfiguration +) { + + private val sources: MutableSet = mutableSetOf() + + @PostConstruct + fun init() { + sourcesConfiguration + .sources() + .forEach { source -> + val config = SourceConfiguration( + source.id(), + source.name(), + source.urlPatterns().orElse(mutableSetOf()), + setOf() + ) + + sources.add(config) + } + } + +} \ No newline at end of file diff --git a/src/main/kotlin/com/rak/util/XPathUtil.kt b/src/main/kotlin/com/rak/util/XPathUtil.kt new file mode 100644 index 0000000..adb663b --- /dev/null +++ b/src/main/kotlin/com/rak/util/XPathUtil.kt @@ -0,0 +1,54 @@ +package com.rak.util + +import com.rak.model.XPathTarget +import org.jsoup.nodes.Element +import org.jsoup.nodes.TextNode + +class XPathUtil private constructor() { + + companion object { + private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$") + private val ATTRIBUTE_MATCHER: Regex = Regex("^//[/a-z]*@([a-z]*)$") + + private fun extractTextFromAttribute(root: Element, xpath: String): String? { + val groupMatcher = ATTRIBUTE_MATCHER.matchEntire(xpath) + val attributeName = groupMatcher?.groupValues[1] ?: return null + val attributeValue = root.attr(attributeName) + + return attributeValue.ifBlank { + null + } + } + + private fun extractTextFromNode(root: Element, xpath: String): String? { + return root + .selectXpath(xpath, TextNode::class.java) + .firstOrNull()?.text() + } + + fun getNextElement(root: Element, path: String): Element? { + return root.selectXpath(path).firstOrNull() + } + + fun extractResult(root: Element, path: String): String? { + return when (getXPathTargetFromPath(path)) { + XPathTarget.TEXT -> extractTextFromNode(root, path) + XPathTarget.ATTRIBUTE -> extractTextFromAttribute(root, path) + else -> null + } + } + + fun getXPathTargetFromPath(path: String): XPathTarget { + return if (TEXT_NODE_MATCHER.matches(path)) { + XPathTarget.TEXT + } else if (ATTRIBUTE_MATCHER.matches(path)) { + XPathTarget.ATTRIBUTE + } else { + XPathTarget.ELEMENT + } + } + } + + + +} \ No newline at end of file diff --git a/src/main/resources/META-INF/services/org.eclipse.microprofile.config.spi.Converter b/src/main/resources/META-INF/services/org.eclipse.microprofile.config.spi.Converter new file mode 100644 index 0000000..301a868 --- /dev/null +++ b/src/main/resources/META-INF/services/org.eclipse.microprofile.config.spi.Converter @@ -0,0 +1 @@ +com.rak.model.config.TestConverter$ActualConverter \ No newline at end of file diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties deleted file mode 100644 index d794293..0000000 --- a/src/main/resources/application.properties +++ /dev/null @@ -1 +0,0 @@ -quarkus.config.locations=sources.yml \ No newline at end of file diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml new file mode 100644 index 0000000..064a9fb --- /dev/null +++ b/src/main/resources/application.yml @@ -0,0 +1,44 @@ +scraper: + sources: + - id: konami-official + name: "Konami Official Database" + domain: "yugioh-card.com" + url-patterns: + - "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$" + selectors: + card: + name: + steps: + - type: "css" + value: "h1.product-title" + - type: "xpath" + value: "//h1[@itemprop='name']" + attack: + steps: + - type: "css" + value: ".atk-value" + + - id: ygo-fandom + name: "Yu-Gi-Oh Fandom Wiki" + domain: "yugioh.fandom.com" + url-patterns: + - "^https://yugioh\\.fandom\\.com/wiki/.*$" + selectors: + regional-set: + root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li" + id: + steps: + - type: "xpath" + value: "//li/text()" + language: + steps: + - type: "xpath" + value: "//li/abbr" + - type: "xpath" + value: "//abbr/@title" + region-key: + steps: + - type: "xpath" + value: "//li/abbr/text()" + testing: + waaa: test \ No newline at end of file