Trying out model
This commit is contained in:
@@ -33,7 +33,7 @@ scraper:
|
|||||||
language:
|
language:
|
||||||
steps:
|
steps:
|
||||||
- type: "xpath"
|
- type: "xpath"
|
||||||
value: "//li/abbr/@title"
|
value: "//li/abbr"
|
||||||
- type: "xpath"
|
- type: "xpath"
|
||||||
value: "//abbr/@title"
|
value: "//abbr/@title"
|
||||||
region-key:
|
region-key:
|
||||||
14
src/main/java/com/rak/model/config/TestConverter.java
Normal file
14
src/main/java/com/rak/model/config/TestConverter.java
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
package com.rak.model.config;
|
||||||
|
|
||||||
|
import org.eclipse.microprofile.config.spi.Converter;
|
||||||
|
|
||||||
|
public class TestConverter {
|
||||||
|
|
||||||
|
public static class ActualConverter implements Converter<Attribute> {
|
||||||
|
@Override
|
||||||
|
public Attribute convert(final String value) throws IllegalArgumentException, NullPointerException {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@@ -13,7 +13,7 @@ interface SourcesConfiguration {
|
|||||||
fun id(): String
|
fun id(): String
|
||||||
fun name(): String
|
fun name(): String
|
||||||
fun domain(): String
|
fun domain(): String
|
||||||
fun urlPatterns(): Optional<MutableList<String>>
|
fun urlPatterns(): Optional<MutableSet<String>>
|
||||||
fun selectors(): Selectors
|
fun selectors(): Selectors
|
||||||
|
|
||||||
interface Selectors {
|
interface Selectors {
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ class ExampleResource(
|
|||||||
provider: String,
|
provider: String,
|
||||||
@RestQuery
|
@RestQuery
|
||||||
path: String
|
path: String
|
||||||
): String {
|
): Map<String, String> {
|
||||||
val sources = sourcesConfiguration
|
val sources = sourcesConfiguration
|
||||||
.sources()
|
.sources()
|
||||||
.filter {
|
.filter {
|
||||||
@@ -44,13 +44,19 @@ class ExampleResource(
|
|||||||
.trim()
|
.trim()
|
||||||
.replace(" ", "_")
|
.replace(" ", "_")
|
||||||
|
|
||||||
Log.info(newPath)
|
|
||||||
|
|
||||||
val doc: Document = Jsoup.connect("https://${source.domain()}/$newPath").get()
|
val doc: Document = Jsoup.connect("https://${source.domain()}/$newPath").get()
|
||||||
|
|
||||||
val regionalSetSelector = source.selectors().regionalSet().get()
|
val regionalSetSelector = source.selectors().regionalSet().get()
|
||||||
val regionalSetRoot = doc.selectFirst(regionalSetSelector.root().get())!!
|
val regionalSetRoot = doc.selectFirst(regionalSetSelector.root().get())!!
|
||||||
|
|
||||||
return scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().id().steps()) ?: "whoomp whoomp"
|
val setId: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().id().steps())
|
||||||
|
val setLanguage: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().language().steps())
|
||||||
|
val setKey: String? = scrapeService.extractTextFromRootBySteps(regionalSetRoot, source.selectors().regionalSet().get().regionKey().steps())
|
||||||
|
|
||||||
|
return mapOf(
|
||||||
|
Pair("id", setId ?: "N/A"),
|
||||||
|
Pair("language", setLanguage ?: "N/A"),
|
||||||
|
Pair("key", setKey ?: "N/A"),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
7
src/main/kotlin/com/rak/model/XPathTarget.kt
Normal file
7
src/main/kotlin/com/rak/model/XPathTarget.kt
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
package com.rak.model
|
||||||
|
|
||||||
|
/**
 * The kind of node an XPath expression is aimed at.
 */
enum class XPathTarget {
    /** The expression selects a text node. */
    TEXT,

    /** The expression selects an attribute. */
    ATTRIBUTE,

    /** The expression selects an element. */
    ELEMENT
}
|
||||||
5
src/main/kotlin/com/rak/model/config/Attribute.kt
Normal file
5
src/main/kotlin/com/rak/model/config/Attribute.kt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
package com.rak.model.config
|
||||||
|
|
||||||
|
/**
 * Configuration of a single scraped attribute.
 *
 * @property steps the selector steps configured for this attribute
 */
data class Attribute(
    val steps: Set<Step>,
)
|
||||||
8
src/main/kotlin/com/rak/model/config/CardConfigModel.kt
Normal file
8
src/main/kotlin/com/rak/model/config/CardConfigModel.kt
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
package com.rak.model.config
|
||||||
|
|
||||||
|
/**
 * [ConfigModel] describing how the fields of a card are configured.
 *
 * @property root the root value required by [ConfigModel]
 * @property name configuration of the card's name attribute
 * @property attack configuration of the card's attack attribute
 * @property effect configuration of the card's effect attribute
 */
data class CardConfigModel(
    override val root: String,
    val name: Attribute,
    val attack: Attribute,
    val effect: Attribute,
) : ConfigModel
|
||||||
5
src/main/kotlin/com/rak/model/config/ConfigModel.kt
Normal file
5
src/main/kotlin/com/rak/model/config/ConfigModel.kt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
package com.rak.model.config
|
||||||
|
|
||||||
|
/**
 * Common contract of all selector configuration models.
 */
interface ConfigModel {
    /** Root value of the model; presumably the selector of its root element (TODO confirm). */
    val root: String
}
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
package com.rak.model.config
|
||||||
|
|
||||||
|
/**
 * [ConfigModel] describing how the fields of a regional set are configured.
 *
 * @property root the root value required by [ConfigModel]
 * @property id configuration of the set's id attribute
 * @property language configuration of the set's language attribute
 * @property regionKey configuration of the set's region key attribute
 */
data class RegionalSetConfigModel(
    override val root: String,
    val id: Attribute,
    val language: Attribute,
    val regionKey: Attribute,
) : ConfigModel
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
package com.rak.model.config
|
||||||
|
|
||||||
|
/**
 * Immutable description of one scraping source.
 *
 * @property id identifier of the source
 * @property name name of the source
 * @property urlPatterns URL patterns associated with the source
 * @property selectors the selector models configured for the source
 */
data class SourceConfiguration(
    val id: String,
    val name: String,
    val urlPatterns: Set<String>,
    val selectors: Set<ConfigModel>,
)
|
||||||
11
src/main/kotlin/com/rak/model/config/Step.kt
Normal file
11
src/main/kotlin/com/rak/model/config/Step.kt
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
package com.rak.model.config
|
||||||
|
|
||||||
|
/**
 * A single selector step.
 *
 * @property type the selector language of [value]
 * @property value the selector expression itself
 */
data class Step(
    val type: Type,
    val value: String,
) {
    /** Selector languages a [Step] can be written in. */
    enum class Type {
        /** The step value is an XPath expression. */
        XPATH,

        /** The step value is a CSS selector. */
        CSS
    }
}
|
||||||
@@ -1,68 +1,40 @@
|
|||||||
package com.rak.service
|
package com.rak.service
|
||||||
|
|
||||||
import com.rak.config.SourcesConfiguration
|
import com.rak.config.SourcesConfiguration
|
||||||
|
import com.rak.model.XPathTarget
|
||||||
|
import com.rak.util.XPathUtil
|
||||||
import jakarta.enterprise.context.ApplicationScoped
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
import org.jsoup.nodes.Element
|
import org.jsoup.nodes.Element
|
||||||
import org.jsoup.nodes.TextNode
|
import org.jsoup.nodes.TextNode
|
||||||
|
import org.jsoup.select.Evaluator
|
||||||
import java.util.concurrent.LinkedBlockingQueue
|
import java.util.concurrent.LinkedBlockingQueue
|
||||||
|
|
||||||
@ApplicationScoped
|
@ApplicationScoped
|
||||||
class ScrapeService {
|
class ScrapeService {
|
||||||
|
|
||||||
companion object {
|
|
||||||
private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")
|
|
||||||
|
|
||||||
private fun evaluateXpath(element: Element, xpath: String): Element? {
|
|
||||||
return element.selectXpath(xpath).first()
|
|
||||||
}
|
|
||||||
|
|
||||||
private fun evaluateCssSelector(element: Element, cssSelector: String): Element? {
|
|
||||||
return null
|
|
||||||
}
|
|
||||||
// XPath
|
|
||||||
// - text()
|
|
||||||
// - last step (default to text())
|
|
||||||
// CSS
|
|
||||||
// - last step???
|
|
||||||
private fun untilText(): String? {
|
|
||||||
|
|
||||||
|
|
||||||
return null
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
fun extractTextFromRootBySteps(
|
fun extractTextFromRootBySteps(
|
||||||
root: Element,
|
root: Element,
|
||||||
steps: Set<SourcesConfiguration.SourceConfig.Selectors.StepDefinition>
|
steps: Set<SourcesConfiguration.SourceConfig.Selectors.StepDefinition>
|
||||||
): String? {
|
): String? {
|
||||||
var currentElement: Element? = root.clone()
|
var currentElement: Element? = root.clone()
|
||||||
val stepsAsQueue = LinkedBlockingQueue(
|
var result: String? = null
|
||||||
steps
|
|
||||||
)
|
|
||||||
|
|
||||||
while (stepsAsQueue.isNotEmpty()) {
|
|
||||||
val step = stepsAsQueue.take()
|
|
||||||
val stepTargetsTextNode: Boolean = TEXT_NODE_MATCHER.matches(step.value())
|
|
||||||
|
|
||||||
|
for (index in 0 until steps.size) {
|
||||||
|
val currentStep = steps.elementAtOrNull(index) ?: return null
|
||||||
if (currentElement == null) {
|
if (currentElement == null) {
|
||||||
return null
|
throw IllegalStateException()
|
||||||
}
|
}
|
||||||
|
|
||||||
currentElement = if (step.type() == "xpath") {
|
if (index == steps.size - 1) {
|
||||||
if (stepTargetsTextNode) {
|
result = XPathUtil.extractResult(currentElement, currentStep.value())
|
||||||
return currentElement.selectXpath(step.value(), TextNode::class.java).first().text()
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
currentElement.selectXpath(step.value()).first()
|
currentElement = XPathUtil.getNextElement(currentElement, currentStep.value())
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
currentElement.selectFirst(step.value())
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return null
|
|
||||||
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
31
src/main/kotlin/com/rak/service/SourceService.kt
Normal file
31
src/main/kotlin/com/rak/service/SourceService.kt
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
package com.rak.service
|
||||||
|
|
||||||
|
import com.rak.config.SourcesConfiguration
|
||||||
|
import com.rak.model.config.SourceConfiguration
|
||||||
|
import jakarta.annotation.PostConstruct
|
||||||
|
import jakarta.enterprise.context.ApplicationScoped
|
||||||
|
|
||||||
|
/**
 * Application-scoped bean that turns the injected [SourcesConfiguration] into
 * [SourceConfiguration] models once the bean has been constructed.
 */
@ApplicationScoped
class SourceService(
    private val sourcesConfiguration: SourcesConfiguration,
) {

    /** Models built by [init]; nothing in this class reads them yet. */
    private val sources: MutableSet<SourceConfiguration> = mutableSetOf()

    /**
     * Builds one [SourceConfiguration] per configured source and stores it in [sources].
     *
     * A source without URL patterns gets an empty set, and the selectors are always left empty.
     */
    @PostConstruct
    fun init() {
        for (source in sourcesConfiguration.sources()) {
            val patterns = source.urlPatterns().orElse(mutableSetOf())
            sources.add(
                SourceConfiguration(
                    id = source.id(),
                    name = source.name(),
                    urlPatterns = patterns,
                    selectors = setOf(),
                )
            )
        }
    }

}
|
||||||
54
src/main/kotlin/com/rak/util/XPathUtil.kt
Normal file
54
src/main/kotlin/com/rak/util/XPathUtil.kt
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
package com.rak.util
|
||||||
|
|
||||||
|
import com.rak.model.XPathTarget
|
||||||
|
import org.jsoup.nodes.Element
|
||||||
|
import org.jsoup.nodes.TextNode
|
||||||
|
|
||||||
|
/**
 * Static helpers for evaluating XPath selector steps against jsoup [Element]s.
 *
 * The class cannot be instantiated; everything lives on the companion object.
 */
class XPathUtil private constructor() {

    companion object {
        /** Matches any path whose final step is `text()`. */
        private val TEXT_NODE_MATCHER: Regex = Regex("^.*text\\(\\)$")

        /**
         * Matches a `//`-rooted path that ends in an attribute step and captures the attribute name.
         *
         * The element part may contain digits, upper-case letters and predicates
         * (`//h1/@class`, `//div[@id='x']/@data-id`); the attribute name must be non-empty.
         */
        private val ATTRIBUTE_MATCHER: Regex = Regex("^//.*@([A-Za-z_][A-Za-z0-9_.:-]*)$")

        /**
         * Reads the attribute named by the trailing `@name` step of [xpath] from [root].
         *
         * Only the attribute name is taken from [xpath]; the element part of the path is not
         * evaluated, so the attribute is looked up on [root] itself.
         * NOTE(review): callers presumably navigate to the owning element first - confirm.
         *
         * @return the attribute value, or `null` when [xpath] is not an attribute path or the
         * value is blank
         */
        private fun extractTextFromAttribute(root: Element, xpath: String): String? {
            // Safe calls all the way down: matchEntire() yields null for non-attribute paths.
            val attributeName = ATTRIBUTE_MATCHER.matchEntire(xpath)?.groupValues?.get(1) ?: return null

            return root.attr(attributeName).ifBlank { null }
        }

        /**
         * Evaluates [xpath] against [root] for text nodes.
         *
         * @return the text of the first matching text node, or `null` when nothing matches
         */
        private fun extractTextFromNode(root: Element, xpath: String): String? =
            root.selectXpath(xpath, TextNode::class.java).firstOrNull()?.text()

        /**
         * Evaluates [path] against [root] for elements.
         *
         * @return the first matching element, or `null` when nothing matches
         */
        fun getNextElement(root: Element, path: String): Element? =
            root.selectXpath(path).firstOrNull()

        /**
         * Extracts a string from [root] according to what [path] targets.
         *
         * @return the text node content for text paths, the attribute value for attribute paths,
         * and `null` for element paths or when nothing could be extracted
         */
        fun extractResult(root: Element, path: String): String? =
            when (getXPathTargetFromPath(path)) {
                XPathTarget.TEXT -> extractTextFromNode(root, path)
                XPathTarget.ATTRIBUTE -> extractTextFromAttribute(root, path)
                // Element paths carry no extraction rule, so there is no textual result.
                XPathTarget.ELEMENT -> null
            }

        /**
         * Classifies [path] by its final step.
         *
         * The `text()` check runs first; any path that is neither a text nor an attribute path
         * is treated as an element path.
         */
        fun getXPathTargetFromPath(path: String): XPathTarget =
            when {
                TEXT_NODE_MATCHER.matches(path) -> XPathTarget.TEXT
                ATTRIBUTE_MATCHER.matches(path) -> XPathTarget.ATTRIBUTE
                else -> XPathTarget.ELEMENT
            }
    }

}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
com.rak.model.config.TestConverter$ActualConverter
|
||||||
@@ -1 +0,0 @@
|
|||||||
quarkus.config.locations=sources.yml
|
|
||||||
44
src/main/resources/application.yml
Normal file
44
src/main/resources/application.yml
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
scraper:
|
||||||
|
sources:
|
||||||
|
- id: konami-official
|
||||||
|
name: "Konami Official Database"
|
||||||
|
domain: "yugioh-card.com"
|
||||||
|
url-patterns:
|
||||||
|
- "^https://www\\.yugioh-card\\.com/[a-z]{2}/products/.*$"
|
||||||
|
selectors:
|
||||||
|
card:
|
||||||
|
name:
|
||||||
|
steps:
|
||||||
|
- type: "css"
|
||||||
|
value: "h1.product-title"
|
||||||
|
- type: "xpath"
|
||||||
|
value: "//h1[@itemprop='name']"
|
||||||
|
attack:
|
||||||
|
steps:
|
||||||
|
- type: "css"
|
||||||
|
value: ".atk-value"
|
||||||
|
|
||||||
|
- id: ygo-fandom
|
||||||
|
name: "Yu-Gi-Oh Fandom Wiki"
|
||||||
|
domain: "yugioh.fandom.com"
|
||||||
|
url-patterns:
|
||||||
|
- "^https://yugioh\\.fandom\\.com/wiki/.*$"
|
||||||
|
selectors:
|
||||||
|
regional-set:
|
||||||
|
root: "h3:contains(Prefix(es)) + div > ul:nth-child(1) > li"
|
||||||
|
id:
|
||||||
|
steps:
|
||||||
|
- type: "xpath"
|
||||||
|
value: "//li/text()"
|
||||||
|
language:
|
||||||
|
steps:
|
||||||
|
- type: "xpath"
|
||||||
|
value: "//li/abbr"
|
||||||
|
- type: "xpath"
|
||||||
|
value: "//abbr/@title"
|
||||||
|
region-key:
|
||||||
|
steps:
|
||||||
|
- type: "xpath"
|
||||||
|
value: "//li/abbr/text()"
|
||||||
|
testing:
|
||||||
|
waaa: test
|
||||||
Reference in New Issue
Block a user