Merge pull request #1097 from bushixuanqi/master

新增专用规则切分类,用于解决原来各种可能的切分错误。默认规则新增类jsonPath的索引选择写法,支持单索引与区间混合,支持负数,支持[!index…]排除写法。
pull/1101/head
kunfei 3 years ago committed by GitHub
commit 610b4ac536
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 164
      app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSonPath.kt
  2. 507
      app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSoup.kt
  3. 71
      app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByXPath.kt
  4. 515
      app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt

@ -1,78 +1,77 @@
package io.legado.app.model.analyzeRule
import android.text.TextUtils
import androidx.annotation.Keep
import com.jayway.jsonpath.JsonPath
import com.jayway.jsonpath.ReadContext
import io.legado.app.utils.splitNotBlank
import java.util.*
import java.util.regex.Pattern
@Suppress("RegExpRedundantEscape")
@Keep
class AnalyzeByJSonPath(json: Any) {
companion object {
private val jsonRulePattern = Pattern.compile("(?<=\\{)\\$\\..+?(?=\\})")
fun parse(json: Any): ReadContext {
return when (json) {
is ReadContext -> json
is String -> JsonPath.parse(json)
else -> JsonPath.parse(json)
is String -> JsonPath.parse(json) //JsonPath.parse<String>(json)
else -> JsonPath.parse(json) //JsonPath.parse<Any>(json)
}
}
}
private var ctx: ReadContext = parse(json)
/**
* 改进解析方法
* 解决阅读&&||与jsonPath支持的&&||之间的冲突
* 解决{$.rule}形式规则可能匹配错误的问题旧规则用正则解析内容含}的json文本时用规则中的字段去匹配这种内容会匹配错误.现改用平衡嵌套方法解决这个问题
* */
fun getString(rule: String): String? {
if (TextUtils.isEmpty(rule)) return null
var result = ""
val rules: Array<String>
val elementsType: String
if (rule.contains("&&")) {
rules = rule.splitNotBlank("&&")
elementsType = "&"
} else {
rules = rule.splitNotBlank("||")
elementsType = "|"
}
if (rule.isEmpty()) return null
var result: String
val ruleAnalyzes = RuleAnalyzer(rule)
val rules = ruleAnalyzes.splitRule("&&","||")
if (rules.size == 1) {
if (!rule.contains("{$.")) {
ruleAnalyzes.reSetPos() //将pos重置为0,复用解析器
result = ruleAnalyzes.innerRule("{$."){ getString(it) } //替换所有{$.rule...}
if (result.isEmpty()) { //st为空,表明无成功替换的内嵌规则
try {
val ob = ctx.read<Any>(rule)
result =
if (ob is List<*>) {
val builder = StringBuilder()
for (o in ob) {
builder.append(o).append("\n")
}
builder.toString().replace("\\n$".toRegex(), "")
} else {
ob.toString()
result =(if (ob is List<*>) {
val builder = StringBuilder()
for (o in ob) {
builder.append(o).append("\n")
}
builder.deleteCharAt(builder.lastIndex) //删除末尾赘余换行
builder
} else ob).toString()
} catch (ignored: Exception) {
}
return result
} else {
result = rule
val matcher = jsonRulePattern.matcher(rule)
while (matcher.find()) {
result = result.replace(
String.format("{%s}", matcher.group()),
getString(matcher.group())!!
)
}
return result
}
return result
} else {
val textList = arrayListOf<String>()
for (rl in rules) {
val temp = getString(rl)
if (!temp.isNullOrEmpty()) {
textList.add(temp)
if (elementsType == "|") {
if (ruleAnalyzes.elementsType == "||") {
break
}
}
@ -83,59 +82,48 @@ class AnalyzeByJSonPath(json: Any) {
internal fun getStringList(rule: String): List<String> {
val result = ArrayList<String>()
if (TextUtils.isEmpty(rule)) return result
val rules: Array<String>
val elementsType: String
when {
rule.contains("&&") -> {
rules = rule.splitNotBlank("&&")
elementsType = "&"
}
rule.contains("%%") -> {
rules = rule.splitNotBlank("%%")
elementsType = "%"
}
else -> {
rules = rule.splitNotBlank("||")
elementsType = "|"
}
}
if (rule.isEmpty()) return result
val ruleAnalyzes = RuleAnalyzer(rule)
val rules = ruleAnalyzes.splitRule("&&","||","%%")
if (rules.size == 1) {
if (!rule.contains("{$.")) {
ruleAnalyzes.reSetPos() //将pos重置为0,复用解析器
val st = ruleAnalyzes.innerRule("{$."){ getString(it) } //替换所有{$.rule...}
if (st.isEmpty()) { //st为空,表明无成功替换的内嵌规则
try {
val obj = ctx.read<Any>(rule) ?: return result
val obj = ctx.read<Any>(rule) //kotlin的Any型返回值不包含null ,删除赘余 ?: return result
if (obj is List<*>) {
for (o in obj)
result.add(o.toString())
} else {
result.add(obj.toString())
}
for (o in obj) result.add(o.toString())
} else result.add(obj.toString())
} catch (ignored: Exception) {
}
return result
} else {
val matcher = jsonRulePattern.matcher(rule)
while (matcher.find()) {
val stringList = getStringList(matcher.group())
for (s in stringList) {
result.add(rule.replace(String.format("{%s}", matcher.group()), s))
}
}
return result
}
}else result.add(st)
return result
} else {
val results = ArrayList<List<String>>()
for (rl in rules) {
val temp = getStringList(rl)
if (temp.isNotEmpty()) {
results.add(temp)
if (temp.isNotEmpty() && elementsType == "|") {
if (temp.isNotEmpty() && ruleAnalyzes.elementsType == "||") {
break
}
}
}
if (results.size > 0) {
if ("%" == elementsType) {
if ("%%" == ruleAnalyzes.elementsType) {
for (i in results[0].indices) {
for (temp in results) {
if (i < temp.size) {
@ -159,23 +147,9 @@ class AnalyzeByJSonPath(json: Any) {
internal fun getList(rule: String): ArrayList<Any>? {
val result = ArrayList<Any>()
if (TextUtils.isEmpty(rule)) return result
val elementsType: String
val rules: Array<String>
when {
rule.contains("&&") -> {
rules = rule.splitNotBlank("&&")
elementsType = "&"
}
rule.contains("%%") -> {
rules = rule.splitNotBlank("%%")
elementsType = "%"
}
else -> {
rules = rule.splitNotBlank("||")
elementsType = "|"
}
}
if (rule.isEmpty()) return result
val ruleAnalyzes = RuleAnalyzer(rule)
val rules = ruleAnalyzes.splitRule("&&","||","%%")
if (rules.size == 1) {
ctx.let {
try {
@ -191,13 +165,13 @@ class AnalyzeByJSonPath(json: Any) {
val temp = getList(rl)
if (temp != null && temp.isNotEmpty()) {
results.add(temp)
if (temp.isNotEmpty() && elementsType == "|") {
if (temp.isNotEmpty() && ruleAnalyzes.elementsType == "||") {
break
}
}
}
if (results.size > 0) {
if ("%" == elementsType) {
if ("%%" == ruleAnalyzes.elementsType) {
for (i in 0 until results[0].size) {
for (temp in results) {
if (i < temp.size) {

@ -1,9 +1,7 @@
package io.legado.app.model.analyzeRule
import android.text.TextUtils.isEmpty
import android.text.TextUtils.join
import androidx.annotation.Keep
import io.legado.app.utils.splitNotBlank
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import org.jsoup.select.Collector
@ -19,6 +17,9 @@ import java.util.*
@Keep
class AnalyzeByJSoup(doc: Any) {
companion object {
/**
* "class", "id", "tag", "text", "children"
*/
val validKeys = arrayOf("class", "id", "tag", "text", "children")
fun parse(doc: Any): Element {
@ -36,67 +37,44 @@ class AnalyzeByJSoup(doc: Any) {
/**
* 获取列表
*/
internal fun getElements(rule: String): Elements {
return getElements(element, rule)
}
internal fun getElements(rule: String) = getElements(element, rule)
/**
* 合并内容列表,得到内容
*/
internal fun getString(ruleStr: String): String? {
if (isEmpty(ruleStr)) {
return null
}
val textS = getStringList(ruleStr)
return if (textS.isEmpty()) {
null
} else {
textS.joinToString("\n")
}
}
internal fun getString(ruleStr: String) =
if(ruleStr.isEmpty()) null
else getStringList(ruleStr).takeIf { it.isNotEmpty() }?.joinToString("\n")
/**
* 获取一个字符串
*/
internal fun getString0(ruleStr: String): String {
val urlList = getStringList(ruleStr)
return if (urlList.isNotEmpty()) {
urlList[0]
} else ""
}
internal fun getString0(ruleStr: String) = getStringList(ruleStr).let{ if ( it.isEmpty() ) "" else it[0] }
/**
* 获取所有内容列表
*/
internal fun getStringList(ruleStr: String): List<String> {
val textS = ArrayList<String>()
if (isEmpty(ruleStr)) {
return textS
}
if (ruleStr.isEmpty()) return textS
//拆分规则
val sourceRule = SourceRule(ruleStr)
if (isEmpty(sourceRule.elementsRule)) {
if (sourceRule.elementsRule.isEmpty()) {
textS.add(element.data() ?: "")
} else {
val elementsType: String
val ruleStrS: Array<String>
when {
sourceRule.elementsRule.contains("&&") -> {
elementsType = "&"
ruleStrS = sourceRule.elementsRule.splitNotBlank("&&")
}
sourceRule.elementsRule.contains("%%") -> {
elementsType = "%"
ruleStrS = sourceRule.elementsRule.splitNotBlank("%%")
}
else -> {
elementsType = "|"
ruleStrS = sourceRule.elementsRule.splitNotBlank("||")
}
}
val ruleAnalyzes = RuleAnalyzer(sourceRule.elementsRule)
val ruleStrS = ruleAnalyzes.splitRule("&&","||" ,"%%")
val results = ArrayList<List<String>>()
for (ruleStrX in ruleStrS) {
val temp: List<String>? =
if (sourceRule.isCss) {
val lastIndex = ruleStrX.lastIndexOf('@')
@ -107,15 +85,17 @@ class AnalyzeByJSoup(doc: Any) {
} else {
getResultList(ruleStrX)
}
if (!temp.isNullOrEmpty()) {
results.add(temp)
if (results.isNotEmpty() && elementsType == "|") {
break
}
results.add(temp) //!temp.isNullOrEmpty()时,results.isNotEmpty()为true
if (ruleAnalyzes.elementsType == "||") break
}
}
if (results.size > 0) {
if ("%" == elementsType) {
if ("%%" == ruleAnalyzes.elementsType) {
for (i in results[0].indices) {
for (temp in results) {
if (i < temp.size) {
@ -137,47 +117,56 @@ class AnalyzeByJSoup(doc: Any) {
* 获取Elements
*/
private fun getElements(temp: Element?, rule: String): Elements {
if (temp == null || rule.isEmpty()) return Elements()
val elements = Elements()
if (temp == null || isEmpty(rule)) {
return elements
}
val sourceRule = SourceRule(rule)
val elementsType: String
val ruleStrS: Array<String>
when {
sourceRule.elementsRule.contains("&&") -> {
elementsType = "&"
ruleStrS = sourceRule.elementsRule.splitNotBlank("&&")
}
sourceRule.elementsRule.contains("%%") -> {
elementsType = "%"
ruleStrS = sourceRule.elementsRule.splitNotBlank("%%")
}
else -> {
elementsType = "|"
ruleStrS = sourceRule.elementsRule.splitNotBlank("||")
}
}
val ruleAnalyzes = RuleAnalyzer(sourceRule.elementsRule)
val ruleStrS = ruleAnalyzes.splitRule("&&","||","%%")
val elementsList = ArrayList<Elements>()
if (sourceRule.isCss) {
for (ruleStr in ruleStrS) {
val tempS = temp.select(ruleStr)
elementsList.add(tempS)
if (tempS.size > 0 && elementsType == "|") {
if (tempS.size > 0 && ruleAnalyzes.elementsType == "||") {
break
}
}
} else {
for (ruleStr in ruleStrS) {
val tempS = getElementsSingle(temp, ruleStr)
elementsList.add(tempS)
if (tempS.size > 0 && elementsType == "|") {
//将原getElementsSingle函数调用的函数的部分代码内联过来,方便简化getElementsSingle函数
val rsRule = RuleAnalyzer(ruleStr)
if( rsRule.peek() =='@' || rsRule.peek() < '!' ) rsRule.advance() // 修剪当前规则之前的"@"或者空白符
val rs = rsRule.splitRule("@")
val el = if (rs.size > 1) {
val el = Elements()
el.add(temp)
for (rl in rs) {
val es = Elements()
for (et in el) {
es.addAll(getElements(et, rl))
}
el.clear()
el.addAll(es)
}
el
}else getElementsSingle(temp,ruleStr)
elementsList.add(el)
if (el.size > 0 && ruleAnalyzes.elementsType == "||") {
break
}
}
}
if (elementsList.size > 0) {
if ("%" == elementsType) {
if ("%%" == ruleAnalyzes.elementsType) {
for (i in 0 until elementsList[0].size) {
for (es in elementsList) {
if (i < es.size) {
@ -194,134 +183,162 @@ class AnalyzeByJSoup(doc: Any) {
return elements
}
private fun filterElements(elements: Elements, rules: Array<String>?): Elements {
if (rules == null || rules.size < 2) return elements
val result = Elements()
for (element in elements) {
var isOk = false
when (rules[0]) {
"class" -> isOk = element.getElementsByClass(rules[1]).size > 0
"id" -> isOk = element.getElementById(rules[1]) != null
"tag" -> isOk = element.getElementsByTag(rules[1]).size > 0
"text" -> isOk = element.getElementsContainingOwnText(rules[1]).size > 0
/**
* 1.支持阅读原有写法':'分隔索引!.表示筛选方式索引可为负数
*
* 例如 tag.div.-1:10:2 tag.div!0:3
*
* 2. 支持与jsonPath类似的[]索引写法
*
* 格式形如 [it,it] [!it,it] 其中[!开头表示筛选方式为排除it为单个索引或区间
*
* 区间格式为 start:end start:end:step其中start为0可省略end为-1可省略
*
* 索引区间两端及间隔都支持负数
*
* 例如 tag.div[-1, 3:-2:-10, 2]
*
* 特殊用法 tag.div[-1:0] 可在任意地方让列表反向
*
* */
fun findIndexSet( rule:String ): IndexSet {
val indexSet = IndexSet()
val rus = rule.trim{ it <= ' '}
var len = rus.length
var curInt: Int? //当前数字
var curMinus = false //当前数字是否为负
val curList = mutableListOf<Int?>() //当前数字区间
var l = "" //暂存数字字符串
val head = rus[rus.length-1] == ']' //是否为常规索引写法
if(head){ //常规索引写法[index...]
len-- //跳过尾部']'
while (len-- > 0) { //逆向遍历,至少有一位前置字符,如 [
var rl = rus[len]
if (rl == ' ') continue //跳过空格
if (rl in '0'..'9') l += rl //将数值累接入临时字串中,遇到分界符才取出
else if (rl == '-') curMinus = true
else {
curInt = if (l.isEmpty()) null else if (curMinus) -l.toInt() else l.toInt() //当前数字
when (rl) {
':' -> curList.add(curInt) //区间右端或区间间隔
else -> {
//为保证查找顺序,区间和单个索引都添加到同一集合
if(curList.isEmpty())indexSet.indexs.add(curInt!!)
else{
//列表最后压入的是区间右端,若列表有两位则最先压入的是间隔
indexSet.indexs.add( Triple(curInt, curList.last(), if(curList.size == 2) curList.first() else 1) )
curList.clear() //重置临时列表,避免影响到下个区间的处理
}
if(rl == '!'){
indexSet.split='!'
do{ rl = rus[--len] } while (len > 0 && rl == ' ')//跳过所有空格
}
if(rl == '[') return indexSet.apply {
beforeRule = rus.substring(0, len)
} //遇到索引边界,返回结果
if(rl != ',') break //非索引结构,跳出
}
}
l = "" //清空
curMinus = false //重置
}
}
if (isOk) {
result.add(element)
} else while (len --> 1) { //阅读原本写法,逆向遍历,至少两位前置字符,如 p.
val rl = rus[len]
if (rl == ' ') continue //跳过空格
if (rl in '0'..'9') l += rl //将数值累接入临时字串中,遇到分界符才取出
else if (rl == '-') curMinus = true
else {
if(rl == '!' || rl == '.' || rl == ':') { //分隔符或起始符
indexSet.indexDefault.add(if (curMinus) -l.toInt() else l.toInt()) // 当前数字追加到列表
if (rl != ':') return indexSet.apply { //rl == '!' || rl == '.'
split = rl
beforeRule = rus.substring(0, len)
}
}else break //非索引结构,跳出循环
l = "" //清空
curMinus = false //重置
}
}
return result
return indexSet.apply{
split = ' '
beforeRule = rus } //非索引格式
}
/**
* 获取Elements按照一个规则
*/
private fun getElementsSingle(temp: Element, rule: String): Elements {
val elements = Elements()
try {
val rs = rule.trim { it <= ' ' }.splitNotBlank("@")
if (rs.size > 1) {
elements.add(temp)
for (rl in rs) {
val es = Elements()
for (et in elements) {
es.addAll(getElements(et, rl))
}
elements.clear()
elements.addAll(es)
}
} else {
val rulePcx = rule.split("!")
val rulePc = rulePcx[0].trim { it <= ' ' }.split(">")
val rules = rulePc[0].trim { it <= ' ' }.split(".")
var filterRules: Array<String>? = null
var needFilterElements = rulePc.size > 1 && !isEmpty(rulePc[1].trim { it <= ' ' })
if (needFilterElements) {
filterRules = rulePc[1].trim { it <= ' ' }.split(".").toTypedArray()
filterRules[0] = filterRules[0].trim { it <= ' ' }
if (filterRules.size < 2
|| !validKeys.contains(filterRules[0])
|| filterRules[1].trim { it <= ' ' }.isEmpty()
) {
needFilterElements = false
}
filterRules[1] = filterRules[1].trim { it <= ' ' }
}
when (rules[0]) {
"children" -> {
var children = temp.children()
if (needFilterElements)
children = filterElements(children, filterRules)
elements.addAll(children)
}
"class" -> {
var elementsByClass = temp.getElementsByClass(rules[1])
if (rules.size == 3 && rules[2].isNotEmpty()) {
val index = Integer.parseInt(rules[2])
if (index < 0) {
elements.add(elementsByClass[elementsByClass.size + index])
} else {
elements.add(elementsByClass[index])
}
} else {
if (needFilterElements)
elementsByClass = filterElements(elementsByClass, filterRules)
elements.addAll(elementsByClass)
}
}
"tag" -> {
var elementsByTag = temp.getElementsByTag(rules[1])
if (rules.size == 3 && rules[2].isNotEmpty()) {
val index = Integer.parseInt(rules[2])
if (index < 0) {
elements.add(elementsByTag[elementsByTag.size + index])
} else {
elements.add(elementsByTag[index])
}
} else {
if (needFilterElements)
elementsByTag = filterElements(elementsByTag, filterRules)
elements.addAll(elementsByTag)
}
}
"id" -> {
var elementsById = Collector.collect(Evaluator.Id(rules[1]), temp)
if (rules.size == 3 && rules[2].isNotEmpty()) {
val index = Integer.parseInt(rules[2])
if (index < 0) {
elements.add(elementsById[elementsById.size + index])
} else {
elements.add(elementsById[index])
}
} else {
if (needFilterElements)
elementsById = filterElements(elementsById, filterRules)
elements.addAll(elementsById)
}
}
"text" -> {
var elementsByText = temp.getElementsContainingOwnText(rules[1])
if (needFilterElements)
elementsByText = filterElements(elementsByText, filterRules)
elements.addAll(elementsByText)
}
else -> elements.addAll(temp.select(rulePcx[0]))
}
if (rulePcx.size > 1) {
val rulePcs = rulePcx[1].splitNotBlank(":")
for (pc in rulePcs) {
val pcInt = Integer.parseInt(pc)
if (pcInt < 0 && elements.size + pcInt >= 0) {
elements[elements.size + pcInt] = null
} else if (Integer.parseInt(pc) < elements.size) {
elements[Integer.parseInt(pc)] = null
}
}
val es = Elements()
es.add(null)
elements.removeAll(es)
}
}
} catch (ignore: Exception) {
var elements = Elements()
val fi = findIndexSet(rule) //执行索引列表处理器
val (filterType,ruleStr) = fi //获取操作类型及非索引部分的规则字串
// val rulePc = rulePcx[0].trim { it <= ' ' }.split(">")
// jsoup中,当前节点是参与选择的,tag.div 与 tag.div@tag.div 结果相同
// 此处">"效果和“@”完全相同,且容易让人误解成选择子节点,实际并不是。以后不允许这种无意义的写法
val rules = ruleStr.split(".")
elements.addAll(
when (rules[0]) {
"children" -> temp.children()
"class" -> temp.getElementsByClass(rules[1])
"tag" -> temp.getElementsByTag(rules[1])
"id" -> Collector.collect(Evaluator.Id(rules[1]), temp)
"text" -> temp.getElementsContainingOwnText(rules[1])
else -> temp.select(ruleStr)
} )
val indexSet = fi.getIndexs(elements.size) //传入元素数量,处理负数索引及索引越界问题,生成可用索引集合。
if(filterType == '!'){ //排除
for (pcInt in indexSet) elements[pcInt] = null
elements.removeAll(listOf(null)) //测试过,这样就行
}else if(filterType == '.'){ //选择
val es = Elements()
for (pcInt in indexSet) es.add(elements[pcInt])
elements = es
}
return elements
@ -331,13 +348,21 @@ class AnalyzeByJSoup(doc: Any) {
* 获取内容列表
*/
private fun getResultList(ruleStr: String): List<String>? {
if (isEmpty(ruleStr)) {
return null
}
if (ruleStr.isEmpty()) return null
var elements = Elements()
elements.add(element)
val rules = ruleStr.splitNotBlank("@")
for (i in 0 until rules.size - 1) {
val rule = RuleAnalyzer(ruleStr) //创建解析
while( rule.peek() =='@' || rule.peek() < '!' ) rule.advance() // 修剪当前规则之前的"@"或者空白符
val rules = rule.splitRule("@") // 切割成列表
val last = rules.size - 1
for (i in 0 until last) {
val es = Elements()
for (elt in elements) {
es.addAll(getElementsSingle(elt, rules[i]))
@ -345,9 +370,7 @@ class AnalyzeByJSoup(doc: Any) {
elements.clear()
elements = es
}
return if (elements.isEmpty()) {
null
} else getResultLast(elements, rules[rules.size - 1])
return if (elements.isEmpty()) null else getResultLast(elements, rules[last])
}
/**
@ -365,7 +388,7 @@ class AnalyzeByJSoup(doc: Any) {
val contentEs = element.textNodes()
for (item in contentEs) {
val temp = item.text().trim { it <= ' ' }
if (!isEmpty(temp)) {
if (temp.isNotEmpty()) {
tn.add(temp)
}
}
@ -382,10 +405,12 @@ class AnalyzeByJSoup(doc: Any) {
}
"all" -> textS.add(elements.outerHtml())
else -> for (element in elements) {
val url = element.attr(lastRule)
if (!isEmpty(url) && !textS.contains(url)) {
textS.add(url)
}
if(url.isEmpty() || textS.contains(url)) break
textS.add(url)
}
}
} catch (e: Exception) {
@ -395,17 +420,75 @@ class AnalyzeByJSoup(doc: Any) {
return textS
}
internal inner class SourceRule(ruleStr: String) {
var isCss = false
var elementsRule: String
data class IndexSet(var split:Char = '.',
var beforeRule:String = "",
val indexDefault:MutableList<Int> = mutableListOf(),
val indexs:MutableList<Any> = mutableListOf()){
fun getIndexs(len:Int): MutableSet<Int> {
val indexSet = mutableSetOf<Int>()
val lastIndexs = (indexDefault.size - 1).takeIf { it !=-1 } ?: indexs.size -1
if(indexs.isEmpty())for (ix in lastIndexs downTo 0 ){ //indexs为空,表明是非[]式索引,集合是逆向遍历插入的,所以这里也逆向遍历,好还原顺序
val it = indexDefault[ix]
if(it in 0 until len) indexSet.add(it) //将正数不越界的索引添加到集合
else if(it < 0 && len >= -it) indexSet.add(it + len) //将负数不越界的索引添加到集合
}else for (ix in lastIndexs downTo 0 ){ //indexs不空,表明是[]式索引,集合是逆向遍历插入的,所以这里也逆向遍历,好还原顺序
if(indexs[ix] is Triple<*, *, *>){ //区间
val (startx, endx, stepx) = indexs[ix] as Triple<Int?, Int?, Int> //还原储存时的类型
val start = if (startx == null) 0 //左端省略表示0
else if (startx >= 0) if (startx < len) startx else len - 1 //右端越界,设置为最大索引
else if (-startx <= len) len + startx /* 将负索引转正 */ else 0 //左端越界,设置为最小索引
val end = if (endx == null) len - 1 //右端省略表示 len - 1
else if (endx >= 0) if (endx < len) endx else len - 1 //右端越界,设置为最大索引
else if (-endx <= len) len + endx /* 将负索引转正 */ else 0 //左端越界,设置为最小索引
if (start == end || stepx >= len) { //两端相同,区间里只有一个数。或间隔过大,区间实际上仅有首位
indexSet.add(start)
continue
}
val step = if (stepx > 0) stepx else if (-stepx < len) stepx + len else 1 //最小正数间隔为1
//将区间展开到集合中,允许列表反向。
indexSet.addAll(if (end > start) start..end step step else start downTo end step step)
}else{//单个索引
val it = indexs[ix] as Int //还原储存时的类型
if(it in 0 until len) indexSet.add(it) //将正数不越界的索引添加到集合
else if(it < 0 && len >= -it) indexSet.add(it + len) //将负数不越界的索引添加到集合
}
init {
if (ruleStr.startsWith("@CSS:", true)) {
isCss = true
elementsRule = ruleStr.substring(5).trim { it <= ' ' }
} else {
elementsRule = ruleStr
}
return indexSet
}
}
internal inner class SourceRule(ruleStr: String) {
var isCss = false
var elementsRule: String = if (ruleStr.startsWith("@CSS:", true)) {
isCss = true
ruleStr.substring(5).trim { it <= ' ' }
} else {
ruleStr
}
}

@ -2,7 +2,6 @@ package io.legado.app.model.analyzeRule
import android.text.TextUtils
import androidx.annotation.Keep
import io.legado.app.utils.splitNotBlank
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
@ -27,10 +26,10 @@ class AnalyzeByXPath(doc: Any) {
private fun strToJXDocument(html: String): JXDocument {
var html1 = html
if (html1.endsWith("</td>")) {
html1 = String.format("<tr>%s</tr>", html1)
html1 = "<tr>${html1}</tr>"
}
if (html1.endsWith("</tr>") || html1.endsWith("</tbody>")) {
html1 = String.format("<table>%s</table>", html1)
html1 = "<table>${html1}</table>"
}
return JXDocument.create(html1)
}
@ -45,26 +44,13 @@ class AnalyzeByXPath(doc: Any) {
}
internal fun getElements(xPath: String): List<JXNode>? {
if (TextUtils.isEmpty(xPath)) {
return null
}
if(xPath.isEmpty()) return null
val jxNodes = ArrayList<JXNode>()
val elementsType: String
val rules: Array<String>
when {
xPath.contains("&&") -> {
rules = xPath.splitNotBlank("&&")
elementsType = "&"
}
xPath.contains("%%") -> {
rules = xPath.splitNotBlank("%%")
elementsType = "%"
}
else -> {
rules = xPath.splitNotBlank("||")
elementsType = "|"
}
}
val ruleAnalyzes = RuleAnalyzer(xPath)
val rules = ruleAnalyzes.splitRule("&&","||","%%")
if (rules.size == 1) {
return getResult(rules[0])
} else {
@ -73,13 +59,13 @@ class AnalyzeByXPath(doc: Any) {
val temp = getElements(rl)
if (temp != null && temp.isNotEmpty()) {
results.add(temp)
if (temp.isNotEmpty() && elementsType == "|") {
if (temp.isNotEmpty() && ruleAnalyzes.elementsType == "||") {
break
}
}
}
if (results.size > 0) {
if ("%" == elementsType) {
if ("%%" == ruleAnalyzes.elementsType) {
for (i in results[0].indices) {
for (temp in results) {
if (i < temp.size) {
@ -98,23 +84,11 @@ class AnalyzeByXPath(doc: Any) {
}
internal fun getStringList(xPath: String): List<String> {
val result = ArrayList<String>()
val elementsType: String
val rules: Array<String>
when {
xPath.contains("&&") -> {
rules = xPath.splitNotBlank("&&")
elementsType = "&"
}
xPath.contains("%%") -> {
rules = xPath.splitNotBlank("%%")
elementsType = "%"
}
else -> {
rules = xPath.splitNotBlank("||")
elementsType = "|"
}
}
val ruleAnalyzes = RuleAnalyzer(xPath)
val rules = ruleAnalyzes.splitRule("&&","||","%%")
if (rules.size == 1) {
getResult(xPath)?.map {
result.add(it.asString())
@ -126,13 +100,13 @@ class AnalyzeByXPath(doc: Any) {
val temp = getStringList(rl)
if (temp.isNotEmpty()) {
results.add(temp)
if (temp.isNotEmpty() && elementsType == "|") {
if (temp.isNotEmpty() && ruleAnalyzes.elementsType == "||") {
break
}
}
}
if (results.size > 0) {
if ("%" == elementsType) {
if ("%%" == ruleAnalyzes.elementsType) {
for (i in results[0].indices) {
for (temp in results) {
if (i < temp.size) {
@ -151,15 +125,8 @@ class AnalyzeByXPath(doc: Any) {
}
fun getString(rule: String): String? {
val rules: Array<String>
val elementsType: String
if (rule.contains("&&")) {
rules = rule.splitNotBlank("&&")
elementsType = "&"
} else {
rules = rule.splitNotBlank("||")
elementsType = "|"
}
val ruleAnalyzes = RuleAnalyzer(rule)
val rules = ruleAnalyzes.splitRule("&&","||")
if (rules.size == 1) {
getResult(rule)?.let {
return TextUtils.join("\n", it)
@ -171,7 +138,7 @@ class AnalyzeByXPath(doc: Any) {
val temp = getString(rl)
if (!temp.isNullOrEmpty()) {
textList.add(temp)
if (elementsType == "|") {
if (ruleAnalyzes.elementsType == "||") {
break
}
}

@ -0,0 +1,515 @@
package io.legado.app.model.analyzeRule
//通用的规则切分处理
class RuleAnalyzer(data: String) {
/** Text being analysed. */
private var queue: String = data

/** Cursor: index of the next character to be processed. */
private var pos = 0

/** Start index of the field currently being pulled. */
private var start = 0

/** End index of the field currently being pulled; initially the length of the text. */
private var end = queue.length

/** Length of the separator most recently matched by the String overload of matchesAny. */
private var step = 0

/** Separator text recorded by splitRule once a separator is found; empty until then. */
var elementsType = ""
/**
 * Returns the text between `start` and `pos` with its delimiters stripped.
 *
 * @param stepStart number of characters to skip after `start` (length of the opening delimiter)
 * @param stepEnd number of characters to drop before `pos` (length of the closing delimiter)
 */
fun currBalancedString(stepStart: Int = 1, stepEnd: Int = 1): String =
    queue.substring(start + stepStart, pos - stepEnd)
/** Moves the cursor back to index 0 so the analyzer can be reused on the same text. */
fun reSetPos() {
    pos = 0
}
/** Returns the text between `start` and the cursor, i.e. the field pulled so far. */
fun currString(): String = queue.substring(start, pos)
/**
 * Returns everything from the cursor to the end of the text.
 *
 * Side effects: `start` becomes the old cursor position and the cursor moves to the end.
 */
fun remainingString(): String {
    val from = pos
    start = from
    pos = queue.length
    return queue.substring(from)
}
/**
 * Moves the cursor backwards.
 *
 * @param num number of characters to step back; 0 (the default) rewinds the cursor to `start`
 */
fun back(num: Int = 0) {
    pos = if (num == 0) start else pos - num
}
/**
 * Moves the cursor forwards. No bounds check is performed.
 *
 * @param num number of characters to skip (default 1)
 */
fun advance(num: Int = 1) {
    pos = pos + num
}
/**
 * Whether the cursor sits exactly at the end of the text.
 *
 * @return true when no characters remain after the cursor
 */
val isEmpty: Boolean
    get() = pos == queue.length
/**
 * Looks at the character under the cursor without moving the cursor.
 *
 * @return the current character, or the NUL character (code 0) when nothing remains
 */
fun peek(): Char = if (isEmpty) '\u0000' else queue[pos]
/**
 * Consumes one character: returns the character under the cursor and advances the cursor by one.
 *
 * The cursor is advanced before the character is read, so it moves even when the read
 * fails because the cursor was already at the end of the text.
 *
 * @return the character that was under the cursor
 */
fun consume(): Char {
    val index = pos++
    return queue[index]
}
/**
 * Tests whether the text at the cursor begins with [seq], ignoring case.
 *
 * @param seq the string to look for at the cursor
 * @return true when the text at the cursor matches [seq]
 */
fun matches(seq: String): Boolean =
    queue.regionMatches(pos, seq, 0, seq.length, ignoreCase = true)
/**
 * Tests whether the character under the cursor equals any of the given characters.
 *
 * @param seq candidate characters
 * @return true when a character remains and it is one of [seq]; false otherwise
 */
fun matchesAny(vararg seq: Char): Boolean = !isEmpty && queue[pos] in seq
/**
 * Tests whether the text at the cursor begins with any of the given strings, ignoring case.
 *
 * Candidates are tried in order; on the first hit its length is stored in `step`.
 *
 * @param seq candidate strings
 * @return true when one candidate matches at the cursor, false when none does
 */
fun matchesAny(vararg seq: String): Boolean {
    val hit = seq.firstOrNull { matches(it) } ?: return false
    step = hit.length
    return true
}
/**
 * Advances the cursor to the next occurrence of [seq] (case-sensitive) without consuming it.
 *
 * The cursor is left untouched when [seq] does not occur at or after it.
 *
 * @param seq the delimiter to search for
 * @param setStartPos when true, `start` is first set to the current cursor position
 * @return true when [seq] was found
 */
fun consumeTo(seq: String, setStartPos: Boolean = true): Boolean {
    if (setStartPos) start = pos
    val offset = queue.indexOf(seq, pos)
    if (offset == -1) return false
    pos = offset
    return true
}
/**
 * Advances the cursor one character at a time until [f] reports a match or the text runs out.
 * The matched text itself is not consumed.
 *
 * @param setStartPos when true, `start` is first set to the current cursor position
 * @param f predicate evaluated at each cursor position; returning true stops the scan
 * @return true when the scan stopped on a match, false when the end of the text was reached
 */
fun consumeToAny(setStartPos: Boolean = true, f: () -> Boolean): Boolean {
    if (setStartPos) start = pos
    while (true) {
        if (isEmpty) return false
        if (f()) return true
        pos++
    }
}
/**
 * Pulls one balanced chunk of JavaScript starting at the cursor.
 *
 * Author's note: as long as the JS is syntactically valid it does not need to avoid any of the
 * app's rule keywords.
 *
 * Characters are consumed until the nesting depth reported by [f] returns to zero. Nesting
 * characters are ignored while the scanner is inside a quoted string, a template string, a
 * regex literal or a comment.
 *
 * Side effects: `start` is set to the cursor position on entry; when the text runs out while
 * still nested, `start` is moved to the cursor so the function returns false.
 *
 * NOTE(review): consume() is called without an end-of-input check after '/', '*' and '\\';
 * presumably input ending on one of those can throw - confirm.
 *
 * @param f classifies a character: true opens a nesting level, false closes one, null is neutral.
 *          The default treats '{' as opening and '}' as closing.
 * @return true when a chunk was consumed and the loop ended without positive nesting depth
 */
fun chompJsBalanced(f: ((Char) -> Boolean?) = {
if ( it == '{' )true // opens one nesting level
else if ( it == '}') false // closes one nesting level
else null
} ): Boolean {
start = pos
var depth = 0 // nesting depth
var bracketsDepth = 0 // nesting depth of []
var inSingleQuote = false // inside a single-quoted string
var inDoubleQuote = false // inside a double-quoted string
var inOtherQuote = false // inside a backtick (template) string
var regex = false // inside a regex literal
var commit = false // inside a single-line comment
var commits = false // inside a multi-line comment
do {
if (isEmpty) break
var c = consume()
if (c != '\\') { // not an escape character
if (c == '\'' && !commits && !commit && !regex && !inDoubleQuote && !inOtherQuote) inSingleQuote = !inSingleQuote // syntactically meaningful single quote
else if (c == '"' && !commits && !commit && !regex && !inSingleQuote && !inOtherQuote) inDoubleQuote = !inDoubleQuote // syntactically meaningful double quote
else if (c == '`' && !commits && !commit && !regex && !inSingleQuote && !inDoubleQuote) inOtherQuote = !inOtherQuote // syntactically meaningful backtick
else if (c == '/' && !commits && !commit && !regex && !inSingleQuote && !inDoubleQuote && !inOtherQuote) { // start of a comment or of a regex
c = consume()
when(c){
'/'->commit=true // start of a single-line comment
'*'->commits=true // start of a multi-line comment
else ->regex=true // start of a regex literal
}
}
else if(commits && c == '*') { // possible end of a multi-line comment
c = consume()
if(c == '/')commits = false
}
else if(regex && c == '/') { // end of the regex, or [] balancing
// NOTE(review): this branch is only entered when c == '/', so the two bracket branches
// below can never run and bracketsDepth stays 0 - confirm the intended condition.
if(c == '/')regex = false// end of the regex literal
// Intent: keep [(] or [)] inside a regex legal when open is ( and close is ), by balancing
// the [] pair, which is balanced in every rule.
// Note: [(], [)], [{] and [}] are all legal in a regex, so only [] has to be balanced.
else if ( c == '[' )bracketsDepth++ // opens one [] level
else if ( c== ']') bracketsDepth-- // closes one [] level
}
if (commits || commit || regex || inSingleQuote || inDoubleQuote || inOtherQuote) continue // still inside a syntax unit: go to the next iteration
val fn = f(c)
if (fn == null) continue
if (fn) depth++ else depth-- // open or close a nesting level
}else { // escape character
var next = consume() // pull the escaped character
if(commit && next == 'n') commit = false // a backslash followed by 'n' ends a single-line comment
else if (!commits && !commit && next == '\\') {
consume() // backslash followed by backslash: in doubly escaped text "\\" is the escape character itself; outside comments this can only be inside a string or a regex
next = consume() // pull the next character: in doubly escaped text a structure like \\/ is the real escape
if(next == '\\')consume() // if it is another escape character pull once more, because they come in pairs (\\\\)
}
}
} while (depth > 0 || bracketsDepth >0) // keep pulling until the chunk is balanced
if(depth > 0 || bracketsDepth >0) start = pos
return pos > start
}
/**
 * Pulls one balanced group out of a (doubly escaped) rule string, starting at the cursor.
 *
 * [open]/[close] form the primary nesting pair. While the primary depth is zero, [f] may
 * classify further characters as opening (true) or closing (false) a secondary pair.
 * Nesting characters inside single- or double-quoted text are ignored, and characters
 * following the escape character are skipped.
 *
 * Side effect: `start` is set to the cursor position on entry; the cursor ends after the
 * last consumed character.
 *
 * Input that ends in the middle of an escape sequence is treated as exhausted input (and is
 * therefore reported through the return value) instead of throwing an index error.
 *
 * @param open character that opens a primary nesting level
 * @param close character that closes a primary nesting level
 * @param f optional classifier for a secondary nesting pair; null result means "neutral"
 * @return true when scanning stopped with no nesting level left open, false when the text
 *         ran out while a level was still open
 */
fun chompRuleBalanced(open: Char = '[', close: Char = ']', f: ((Char) -> Boolean?)? = null): Boolean {
    start = pos
    var depth = 0 // primary nesting depth
    var otherDepth = 0 // secondary nesting depth
    var inSingleQuote = false
    var inDoubleQuote = false

    // Consumes one character, yielding NUL instead of throwing once the text is exhausted.
    fun takeOrNul(): Char = if (isEmpty) '\u0000' else consume()

    do {
        if (isEmpty) break
        val c = consume()
        if (c != ESC) {
            if (c == '\'' && !inDoubleQuote) inSingleQuote = !inSingleQuote
            else if (c == '"' && !inSingleQuote) inDoubleQuote = !inDoubleQuote
            if (inSingleQuote || inDoubleQuote) continue // still inside quoted text
            if (c == open) depth++
            else if (c == close) depth--
            else if (depth == 0 && f != null) {
                // Secondary pairs only count while no primary group is open.
                val fn = f(c) ?: continue
                if (fn) otherDepth++ else otherDepth--
            }
        } else {
            // Escape handling: skip the escaped character (\/, \", \' ...). A doubled escape
            // character skips further, because in doubly escaped text they come in pairs.
            if (takeOrNul() == ESC) {
                takeOrNul()
                if (takeOrNul() == ESC) takeOrNul()
            }
        }
    } while (depth > 0 || otherDepth > 0)
    return depth <= 0 && otherDepth <= 0 // true = balanced, false = a level is still open
}
/**
 * Splits the rule text on a combination separator, ignoring separators that sit inside a
 * balanced `[...]` or `(...)` group (for example a JSONPath filter containing "&&" or "||").
 *
 * The text is scanned once from the cursor. Every `[` or `(` met outside a group is skipped as
 * a whole balanced group via chompRuleBalanced. The first separator from [split] found outside
 * a group is stored in [elementsType]; from then on only that separator splits the text.
 *
 * As before, the first piece always starts at index 0 of the text, blank pieces are kept, and
 * when no separator occurs anywhere ahead of the cursor the whole text is returned without
 * inspecting brackets.
 *
 * @param split candidate separators, e.g. "&&", "||", "%%"
 * @return the pieces in order, or a single-element array holding the whole text when nothing
 *         was split
 * @throws Error when a `[` or `(` group is not balanced
 */
fun splitRule(vararg split: String): Array<String> {
    val separators = split.filter { it.isNotEmpty() }.toTypedArray()

    // Fast path: no separator ahead of the cursor, so the whole text is one rule.
    if (separators.none { queue.indexOf(it, pos, ignoreCase = true) != -1 }) {
        start = pos
        pos = queue.length
        return arrayOf(queue)
    }

    val pieces = ArrayList<String>()
    var separator = "" // first separator found; the only one honoured afterwards
    var pieceStart = 0
    while (pos < queue.length) {
        val c = queue[pos]
        if (c == '[' || c == '(') {
            // Skip the whole group so separators inside it are never treated as split points.
            val groupStart = pos
            val close = if (c == '[') ']' else ')'
            if (!chompRuleBalanced(c, close)) throw Error(queue.substring(0, groupStart) + "后未平衡")
            continue
        }
        val atSeparator = if (separator.isEmpty()) matchesAny(*separators) else matches(separator)
        if (!atSeparator) {
            pos++
            continue
        }
        if (separator.isEmpty()) {
            separator = queue.substring(pos, pos + step)
            elementsType = separator
        }
        pieces += queue.substring(pieceStart, pos)
        pos += separator.length // skip the separator
        pieceStart = pos
    }
    start = pieceStart
    if (pieces.isEmpty()) return arrayOf(queue) // every separator was inside a group
    pieces += queue.substring(pieceStart)
    return pieces.toTypedArray()
}
/**
 * Second splitting stage: continues splitting after [elementsType] has already been set,
 * searching only for that separator.
 *
 * Relies on state left by the caller: `start` is the beginning of the piece being assembled,
 * the cursor is where scanning resumes, and `step` is the separator length.
 *
 * NOTE(review): the bracket scan below starts at the separator position, so a bracket found
 * by it presumably always lies after the separator; brackets between `start` and the
 * separator are not examined - confirm this is intended.
 *
 * @param rules pieces collected so far
 * @return [rules] extended with the remaining pieces
 * @throws Error when a `[` or `(` group is not balanced
 */
@JvmName("splitRuleNext")
private tailrec fun splitRule(rules:Array<String>): Array<String>{ // second stage: elementsType is non-empty (set by the first stage), so search for it directly
if (!consumeTo(elementsType,false)) return rules + queue.substring(start) // the search does not begin at the piece start, so `start` keeps its previous value
end = pos
val st = if( consumeToAny(false){ matchesAny( '(','[' ) } )pos else -1 // look for a filter ('(' or '[') from the separator onwards
pos = end
if(st == -1) {
var rule = rules + queue.substring(start, pos) // append the piece that ends at this separator
pos += step // skip the separator
while (consumeTo(elementsType)) { // keep splitting and appending pieces
rule += queue.substring(start, pos)
pos += step // skip the separator
}
rule += queue.substring(start) // append the remaining text as the last piece
return rule
}
val rule = if(st > pos ){// the separator comes before the filter, so it is not inside it: append the pieces that precede the filter
var rule = rules + queue.substring(start, pos) // append the piece that ends at this separator
pos += step // skip the separator
while (pos < st && consumeTo(elementsType)) { // keep splitting and appending pieces
rule += queue.substring(start, pos)
pos += step // skip the separator
}
rule
}else rules
pos = st // move the cursor to the filter
val next = if(queue[pos] == '[' ) ']' else ')' // closing character of the balanced group
val start0 = start // remember where the current piece starts
if(!chompRuleBalanced(queue[pos],next)) throw Error(queue.substring(0, start)+"后未平衡") // pull one filter; throw when the balance check returns false
start = start0 // the filter start is not the piece start, so restore it
return splitRule(rule) // recurse
}
/**
 * Replaces every embedded rule in the text, scanning from the cursor.
 *
 * Each occurrence of [inner] is expanded to a balanced group (`[]` as primary pair, `{}` as
 * secondary pair). The group's content, minus [startStep] leading and [endStep] trailing
 * characters, is passed to [fr]; a non-null result replaces the whole group. When the group
 * is unbalanced or [fr] returns null, the marker is kept as plain text and scanning resumes
 * right after it.
 *
 * @param inner opening marker of an embedded rule, e.g. "{$." or "{{"
 * @param startStep leading characters of the group that are not part of the rule
 *                  (1 for "{$.", because only "{" is not part of the rule)
 * @param endStep trailing characters of the group that are not part of the rule (2 for "}}")
 * @param fr resolver invoked with the embedded rule; null means "could not resolve"
 * @return the text with all resolved groups substituted, or "" when nothing was substituted
 */
fun innerRule(inner: String, startStep: Int = 1, endStep: Int = 1, fr: (String) -> String?): String {
    val out = StringBuilder()
    var copyFrom = pos // first character not yet copied to the output
    var replaced = false // explicit flag: cursor bookkeeping cannot tell success from failure
    while (!isEmpty && consumeTo(inner)) {
        val markerPos = pos
        val balanced = chompRuleBalanced {
            when (it) {
                '{' -> true
                '}' -> false
                else -> null
            }
        }
        val value = if (balanced) fr(currBalancedString(startStep, endStep)) else null
        if (value == null) {
            // Not a usable embedded rule: keep it as literal text and continue after the marker.
            pos = markerPos + inner.length
            continue
        }
        out.append(queue.substring(copyFrom, markerPos)).append(value)
        copyFrom = pos
        replaced = true
    }
    if (!replaced) return ""
    start = copyFrom
    pos = queue.length
    return out.append(queue.substring(copyFrom)).toString()
}
// /**
// * 匹配并返回标签中的属性键字串(字母、数字、-、_、:)
// * @return 属性键字串
// */
// fun consumeAttributeKey(start:Int = pos): String {
// while (!isEmpty && (Character.isLetterOrDigit(queue[pos]) || matchesAny('-', '_', ':'))) pos++
// return queue.substring(start, pos)
// }
// fun splitRule(query:String,item:String = "other",listItem:String = "allInOne"):String{
//
// val cuurItem = item //当前项类型,list->列表项 mulu->章节列表项 url->链接项 search->搜索链接项 find发现链接列表项 other->其他项
// val cuurList = listItem//当前界面总列表项类型,allInOne,json,xml,kotin,java
// var Reverse = false //是否反转列表
//
// consumeWhitespace() //消耗开头空白
// var fisrt = consume() //拉出并消费首字符
//
// when(item){
// "search" ->
// "find" ->
// "mulu" -> if(fisrt == '-'){
// Reverse=true //开启反转
// consumeWhitespace() //拉出所有空白符
// fisrt = consume() //首字符后移
// }
// else ->
//
// }
//
// return query
// }
companion object {
/**
* Escape character.
*/
private const val ESC = '\\'
/**
* Opening delimiters shared by all rule types of the app:
* "##","@@","{{","{[","<js>", "@js:"
*/
val splitList =arrayOf("##","@@","{{","{[","<js>", "@js:")
/**
* Separator between name and link in a discovery entry:
* "::"
*/
const val splitListFaXian = "::"
/**
* Leading character specific to table-of-contents rules:
* "-"
*/
const val splitListMulu = "-"
/**
* Leading character of the "all in one" mode whose result is an element list:
* "+"
*/
const val splitListTongYi = "+"
/**
* Separators combining same-kind rules whose result is an element list:
* "||","&&","%%"
*/
val splitListReSplit = arrayOf("||","&&","%%")
/**
* Terminator of a js script:
* "</js>"
*/
const val splitListEndJS = "</js>"
/**
* Terminator of embedded js:
* "}}"
*/
const val splitListEndInnerJS = "}}"
/**
* Terminator of an embedded rule:
* "]}"
*/
const val splitListEndInnerRule = "]}"
/**
* '[', ']', '(', ')','{','}'
*/
val splitListPublic = charArrayOf('[', ']', '(', ')','{','}')
/**
* '*',"/","//",":","::","@","|","@xpath:"
*/
val splitListXpath = arrayOf("*","/","//",":","::","@","|","@xpath:")
/**
* '*','$',".","..", "@json:"
* NOTE(review): this array mixes Char and String elements - confirm that is intended.
*/
val splitListJson = arrayOf('*','$',".","..", "@json:")
/**
* '*',"+","~",".",",","|","@","@css:",":"
* NOTE(review): this array mixes Char and String elements - confirm that is intended.
*/
val splitListCss = arrayOf('*',"+","~",".",",","|","@","@css:",":")
/**
* "-",".","!","@","@@"
*/
val splitListDefault = arrayOf("-",".","!","@","@@")
}
}
Loading…
Cancel
Save