diff --git a/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSonPath.kt b/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSonPath.kt index 62bde5a88..e9abb23b2 100644 --- a/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSonPath.kt +++ b/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSonPath.kt @@ -1,78 +1,77 @@ package io.legado.app.model.analyzeRule -import android.text.TextUtils import androidx.annotation.Keep import com.jayway.jsonpath.JsonPath import com.jayway.jsonpath.ReadContext -import io.legado.app.utils.splitNotBlank import java.util.* -import java.util.regex.Pattern @Suppress("RegExpRedundantEscape") @Keep class AnalyzeByJSonPath(json: Any) { companion object { - private val jsonRulePattern = Pattern.compile("(?<=\\{)\\$\\..+?(?=\\})") fun parse(json: Any): ReadContext { return when (json) { is ReadContext -> json - is String -> JsonPath.parse(json) - else -> JsonPath.parse(json) + is String -> JsonPath.parse(json) //JsonPath.parse(json) + else -> JsonPath.parse(json) //JsonPath.parse(json) } } } private var ctx: ReadContext = parse(json) + /** + * 改进解析方法 + * 解决阅读”&&“、”||“与jsonPath支持的”&&“、”||“之间的冲突 + * 解决{$.rule}形式规则可能匹配错误的问题,旧规则正则解析内容含‘}’的json文本,用规则中的字段去匹配这种内容时,会匹配错误.现改用平衡嵌套方法解决这个问题 + * */ fun getString(rule: String): String? 
{ - if (TextUtils.isEmpty(rule)) return null - var result = "" - val rules: Array - val elementsType: String - if (rule.contains("&&")) { - rules = rule.splitNotBlank("&&") - elementsType = "&" - } else { - rules = rule.splitNotBlank("||") - elementsType = "|" - } + if (rule.isEmpty()) return null + var result: String + val ruleAnalyzes = RuleAnalyzer(rule) + val rules = ruleAnalyzes.splitRule("&&","||") + if (rules.size == 1) { - if (!rule.contains("{$.")) { + + ruleAnalyzes.reSetPos() //将pos重置为0,复用解析器 + + result = ruleAnalyzes.innerRule("{$."){ getString(it) } //替换所有{$.rule...} + + if (result.isEmpty()) { //st为空,表明无成功替换的内嵌规则 + try { + val ob = ctx.read(rule) - result = - if (ob is List<*>) { - val builder = StringBuilder() - for (o in ob) { - builder.append(o).append("\n") - } - builder.toString().replace("\\n$".toRegex(), "") - } else { - ob.toString() + + result =(if (ob is List<*>) { + + val builder = StringBuilder() + for (o in ob) { + builder.append(o).append("\n") } + + builder.deleteCharAt(builder.lastIndex) //删除末尾赘余换行 + + builder + + } else ob).toString() + } catch (ignored: Exception) { } - return result - } else { - result = rule - val matcher = jsonRulePattern.matcher(rule) - while (matcher.find()) { - result = result.replace( - String.format("{%s}", matcher.group()), - getString(matcher.group())!! 
- ) - } - return result + } + + return result + } else { val textList = arrayListOf() for (rl in rules) { val temp = getString(rl) if (!temp.isNullOrEmpty()) { textList.add(temp) - if (elementsType == "|") { + if (ruleAnalyzes.elementsType == "||") { break } } @@ -83,59 +82,48 @@ class AnalyzeByJSonPath(json: Any) { internal fun getStringList(rule: String): List { val result = ArrayList() - if (TextUtils.isEmpty(rule)) return result - val rules: Array - val elementsType: String - when { - rule.contains("&&") -> { - rules = rule.splitNotBlank("&&") - elementsType = "&" - } - rule.contains("%%") -> { - rules = rule.splitNotBlank("%%") - elementsType = "%" - } - else -> { - rules = rule.splitNotBlank("||") - elementsType = "|" - } - } + if (rule.isEmpty()) return result + val ruleAnalyzes = RuleAnalyzer(rule) + val rules = ruleAnalyzes.splitRule("&&","||","%%") + if (rules.size == 1) { - if (!rule.contains("{$.")) { + + ruleAnalyzes.reSetPos() //将pos重置为0,复用解析器 + + val st = ruleAnalyzes.innerRule("{$."){ getString(it) } //替换所有{$.rule...} + + if (st.isEmpty()) { //st为空,表明无成功替换的内嵌规则 + try { - val obj = ctx.read(rule) ?: return result + + val obj = ctx.read(rule) //kotlin的Any型返回值不包含null ,删除赘余 ?: return result + if (obj is List<*>) { - for (o in obj) - result.add(o.toString()) - } else { - result.add(obj.toString()) - } + + for (o in obj) result.add(o.toString()) + + } else result.add(obj.toString()) + } catch (ignored: Exception) { } - return result - } else { - val matcher = jsonRulePattern.matcher(rule) - while (matcher.find()) { - val stringList = getStringList(matcher.group()) - for (s in stringList) { - result.add(rule.replace(String.format("{%s}", matcher.group()), s)) - } - } - return result - } + + }else result.add(st) + + return result + } else { val results = ArrayList>() for (rl in rules) { val temp = getStringList(rl) if (temp.isNotEmpty()) { results.add(temp) - if (temp.isNotEmpty() && elementsType == "|") { + if (temp.isNotEmpty() && 
ruleAnalyzes.elementsType == "||") { break } } } if (results.size > 0) { - if ("%" == elementsType) { + if ("%%" == ruleAnalyzes.elementsType) { for (i in results[0].indices) { for (temp in results) { if (i < temp.size) { @@ -159,23 +147,9 @@ class AnalyzeByJSonPath(json: Any) { internal fun getList(rule: String): ArrayList? { val result = ArrayList() - if (TextUtils.isEmpty(rule)) return result - val elementsType: String - val rules: Array - when { - rule.contains("&&") -> { - rules = rule.splitNotBlank("&&") - elementsType = "&" - } - rule.contains("%%") -> { - rules = rule.splitNotBlank("%%") - elementsType = "%" - } - else -> { - rules = rule.splitNotBlank("||") - elementsType = "|" - } - } + if (rule.isEmpty()) return result + val ruleAnalyzes = RuleAnalyzer(rule) + val rules = ruleAnalyzes.splitRule("&&","||","%%") if (rules.size == 1) { ctx.let { try { @@ -191,13 +165,13 @@ class AnalyzeByJSonPath(json: Any) { val temp = getList(rl) if (temp != null && temp.isNotEmpty()) { results.add(temp) - if (temp.isNotEmpty() && elementsType == "|") { + if (temp.isNotEmpty() && ruleAnalyzes.elementsType == "||") { break } } } if (results.size > 0) { - if ("%" == elementsType) { + if ("%%" == ruleAnalyzes.elementsType) { for (i in 0 until results[0].size) { for (temp in results) { if (i < temp.size) { diff --git a/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSoup.kt b/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSoup.kt index be68320e9..126e37497 100644 --- a/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSoup.kt +++ b/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSoup.kt @@ -1,9 +1,7 @@ package io.legado.app.model.analyzeRule -import android.text.TextUtils.isEmpty import android.text.TextUtils.join import androidx.annotation.Keep -import io.legado.app.utils.splitNotBlank import org.jsoup.Jsoup import org.jsoup.nodes.Element import org.jsoup.select.Collector @@ -19,6 +17,9 @@ import java.util.* @Keep class 
AnalyzeByJSoup(doc: Any) { companion object { + /** + * "class", "id", "tag", "text", "children" + */ val validKeys = arrayOf("class", "id", "tag", "text", "children") fun parse(doc: Any): Element { @@ -36,67 +37,44 @@ class AnalyzeByJSoup(doc: Any) { /** * 获取列表 */ - internal fun getElements(rule: String): Elements { - return getElements(element, rule) - } + internal fun getElements(rule: String) = getElements(element, rule) /** * 合并内容列表,得到内容 */ - internal fun getString(ruleStr: String): String? { - if (isEmpty(ruleStr)) { - return null - } - val textS = getStringList(ruleStr) - return if (textS.isEmpty()) { - null - } else { - textS.joinToString("\n") - } - - } + internal fun getString(ruleStr: String) = + if(ruleStr.isEmpty()) null + else getStringList(ruleStr).takeIf { it.isNotEmpty() }?.joinToString("\n") /** * 获取一个字符串 */ - internal fun getString0(ruleStr: String): String { - val urlList = getStringList(ruleStr) - return if (urlList.isNotEmpty()) { - urlList[0] - } else "" - } + internal fun getString0(ruleStr: String) = getStringList(ruleStr).let{ if ( it.isEmpty() ) "" else it[0] } /** * 获取所有内容列表 */ internal fun getStringList(ruleStr: String): List { + val textS = ArrayList() - if (isEmpty(ruleStr)) { - return textS - } + + if (ruleStr.isEmpty()) return textS + //拆分规则 val sourceRule = SourceRule(ruleStr) - if (isEmpty(sourceRule.elementsRule)) { + + if (sourceRule.elementsRule.isEmpty()) { + textS.add(element.data() ?: "") + } else { - val elementsType: String - val ruleStrS: Array - when { - sourceRule.elementsRule.contains("&&") -> { - elementsType = "&" - ruleStrS = sourceRule.elementsRule.splitNotBlank("&&") - } - sourceRule.elementsRule.contains("%%") -> { - elementsType = "%" - ruleStrS = sourceRule.elementsRule.splitNotBlank("%%") - } - else -> { - elementsType = "|" - ruleStrS = sourceRule.elementsRule.splitNotBlank("||") - } - } + + val ruleAnalyzes = RuleAnalyzer(sourceRule.elementsRule) + val ruleStrS = ruleAnalyzes.splitRule("&&","||" ,"%%") + val 
results = ArrayList>() for (ruleStrX in ruleStrS) { + val temp: List? = if (sourceRule.isCss) { val lastIndex = ruleStrX.lastIndexOf('@') @@ -107,15 +85,17 @@ class AnalyzeByJSoup(doc: Any) { } else { getResultList(ruleStrX) } + if (!temp.isNullOrEmpty()) { - results.add(temp) - if (results.isNotEmpty() && elementsType == "|") { - break - } + + results.add(temp) //!temp.isNullOrEmpty()时,results.isNotEmpty()为true + + if (ruleAnalyzes.elementsType == "||") break + } } if (results.size > 0) { - if ("%" == elementsType) { + if ("%%" == ruleAnalyzes.elementsType) { for (i in results[0].indices) { for (temp in results) { if (i < temp.size) { @@ -137,47 +117,56 @@ class AnalyzeByJSoup(doc: Any) { * 获取Elements */ private fun getElements(temp: Element?, rule: String): Elements { + + if (temp == null || rule.isEmpty()) return Elements() + val elements = Elements() - if (temp == null || isEmpty(rule)) { - return elements - } + val sourceRule = SourceRule(rule) - val elementsType: String - val ruleStrS: Array - when { - sourceRule.elementsRule.contains("&&") -> { - elementsType = "&" - ruleStrS = sourceRule.elementsRule.splitNotBlank("&&") - } - sourceRule.elementsRule.contains("%%") -> { - elementsType = "%" - ruleStrS = sourceRule.elementsRule.splitNotBlank("%%") - } - else -> { - elementsType = "|" - ruleStrS = sourceRule.elementsRule.splitNotBlank("||") - } - } + val ruleAnalyzes = RuleAnalyzer(sourceRule.elementsRule) + val ruleStrS = ruleAnalyzes.splitRule("&&","||","%%") + val elementsList = ArrayList() if (sourceRule.isCss) { for (ruleStr in ruleStrS) { val tempS = temp.select(ruleStr) elementsList.add(tempS) - if (tempS.size > 0 && elementsType == "|") { + if (tempS.size > 0 && ruleAnalyzes.elementsType == "||") { break } } } else { for (ruleStr in ruleStrS) { - val tempS = getElementsSingle(temp, ruleStr) - elementsList.add(tempS) - if (tempS.size > 0 && elementsType == "|") { + //将原getElementsSingle函数调用的函数的部分代码内联过来,方便简化getElementsSingle函数 + + val rsRule = 
RuleAnalyzer(ruleStr) + + if( rsRule.peek() =='@' || rsRule.peek() < '!' ) rsRule.advance() // 修剪当前规则之前的"@"或者空白符 + + val rs = rsRule.splitRule("@") + + val el = if (rs.size > 1) { + val el = Elements() + el.add(temp) + for (rl in rs) { + val es = Elements() + for (et in el) { + es.addAll(getElements(et, rl)) + } + el.clear() + el.addAll(es) + } + el + }else getElementsSingle(temp,ruleStr) + + elementsList.add(el) + if (el.size > 0 && ruleAnalyzes.elementsType == "||") { break } } } if (elementsList.size > 0) { - if ("%" == elementsType) { + if ("%%" == ruleAnalyzes.elementsType) { for (i in 0 until elementsList[0].size) { for (es in elementsList) { if (i < es.size) { @@ -194,134 +183,154 @@ class AnalyzeByJSoup(doc: Any) { return elements } - private fun filterElements(elements: Elements, rules: Array?): Elements { - if (rules == null || rules.size < 2) return elements - val result = Elements() - for (element in elements) { - var isOk = false - when (rules[0]) { - "class" -> isOk = element.getElementsByClass(rules[1]).size > 0 - "id" -> isOk = element.getElementById(rules[1]) != null - "tag" -> isOk = element.getElementsByTag(rules[1]).size > 0 - "text" -> isOk = element.getElementsContainingOwnText(rules[1]).size > 0 - } - if (isOk) { - result.add(element) + /** + * '.'开头表示选择元素,或'!'开头排除那些元素。两者都支持以索引列表按顺序指定元素列表 + * ':'分隔不同索引或区间 + * 区间格式为 start~end+step,其中start为0可省略,end为-1可省略。 + * 索引,区间两端及间隔都支持负数 + * 例如 tag.div.-1:3~-2+-10:2 + * 特殊用法 tag.div.-1~0 可在任意地方让列表反向 + * */ + fun findIndexSet( rule:String ): IndexSet { + + val indexSet = IndexSet() + + val rus = rule.trim{ it <= ' '} + + var last = rus.length + var step = 0 //区间步长,为0表示没设置区间 + var curInt: Int //当前数字 + var end = 0 //暂存区间结束数字 + + var range = false //true表示当前在区间开头,false表示当前在区间结尾 + var curMinus = false //当前数字是否为负 + var curEndMinus = false //当前区间右端数字是否为负 + var curStepMinus = false //当前区间间隔数字是否为负 + + var l = "" //暂存数字字符串 + + while (last --> 1 ){ //逆向遍历,至少有两位前置字符,如 p. 
+ + val rl = rus[last] + if(rl == ' ' )continue //跳过空格 + if( rl in '0'..'9') l+= rl //将数值累接入临时字串中,遇到分界符才取出 + else if(rl == '-') curMinus = true + else if( rl in arrayOf('+','~','!','.',':')) { //分界符号 '+','~','!','.',':' + + when ( rl ) { + + '+' ->{ + curStepMinus = curMinus + step = l.toInt() //区间间隔数 + } + + '~' -> { + range = true + curEndMinus = curMinus + + if (l.isEmpty()) { + end = -1 //省略区间右端,设置为-1 + continue + } else end = l.toInt() + } + + else -> { + + curInt = if(l.isEmpty()) 0 /* 省略区间左端,设置为0 */ else if(curMinus) - l.toInt() else l.toInt() //区间左端数,省略则为最左边 + + indexSet.indexs.add( //压入以下值,为保证查找顺序,区间和单个索引都添加到同一集合 + + if ( range ) { + + range = false //重置 + + if (curEndMinus) { + end = -end + curEndMinus = false //重置 + } + + //没设置间隔时,间隔为1。将区间的三项数据压入,在获取到元素数量后再计算负数索引,并展开区间 + if( step == 0 ) Triple(curInt, end, 1) + + else { + + if (curStepMinus) { + step = -step + curStepMinus = false //重置 + } + + val stepx = step + step = 0 //重置 + + //将区间的三项数据压入,在获取到元素数量后再计算负数索引,并展开区间 + Triple(curInt, end, stepx) + + } + + }else curInt //压入单个索引,在获取到元素数量后再计算负数索引 + + ) + + if( rl == '!' || rl == '.' ) return indexSet.apply{ + split = rl + beforeRule = rus.substring(0, last) + } + } + } + l = "" //清空 + curMinus = false //重置 } + + else break + } - return result + + return indexSet.apply{ beforeRule = rus } //非索引格式 } /** * 获取Elements按照一个规则 */ private fun getElementsSingle(temp: Element, rule: String): Elements { - val elements = Elements() - try { - val rs = rule.trim { it <= ' ' }.splitNotBlank("@") - if (rs.size > 1) { - elements.add(temp) - for (rl in rs) { - val es = Elements() - for (et in elements) { - es.addAll(getElements(et, rl)) - } - elements.clear() - elements.addAll(es) - } - } else { - val rulePcx = rule.split("!") - val rulePc = rulePcx[0].trim { it <= ' ' }.split(">") - val rules = rulePc[0].trim { it <= ' ' }.split(".") - var filterRules: Array? 
= null - var needFilterElements = rulePc.size > 1 && !isEmpty(rulePc[1].trim { it <= ' ' }) - if (needFilterElements) { - filterRules = rulePc[1].trim { it <= ' ' }.split(".").toTypedArray() - filterRules[0] = filterRules[0].trim { it <= ' ' } - if (filterRules.size < 2 - || !validKeys.contains(filterRules[0]) - || filterRules[1].trim { it <= ' ' }.isEmpty() - ) { - needFilterElements = false - } - filterRules[1] = filterRules[1].trim { it <= ' ' } - } - when (rules[0]) { - "children" -> { - var children = temp.children() - if (needFilterElements) - children = filterElements(children, filterRules) - elements.addAll(children) - } - "class" -> { - var elementsByClass = temp.getElementsByClass(rules[1]) - if (rules.size == 3 && rules[2].isNotEmpty()) { - val index = Integer.parseInt(rules[2]) - if (index < 0) { - elements.add(elementsByClass[elementsByClass.size + index]) - } else { - elements.add(elementsByClass[index]) - } - } else { - if (needFilterElements) - elementsByClass = filterElements(elementsByClass, filterRules) - elements.addAll(elementsByClass) - } - } - "tag" -> { - var elementsByTag = temp.getElementsByTag(rules[1]) - if (rules.size == 3 && rules[2].isNotEmpty()) { - val index = Integer.parseInt(rules[2]) - if (index < 0) { - elements.add(elementsByTag[elementsByTag.size + index]) - } else { - elements.add(elementsByTag[index]) - } - } else { - if (needFilterElements) - elementsByTag = filterElements(elementsByTag, filterRules) - elements.addAll(elementsByTag) - } - } - "id" -> { - var elementsById = Collector.collect(Evaluator.Id(rules[1]), temp) - if (rules.size == 3 && rules[2].isNotEmpty()) { - val index = Integer.parseInt(rules[2]) - if (index < 0) { - elements.add(elementsById[elementsById.size + index]) - } else { - elements.add(elementsById[index]) - } - } else { - if (needFilterElements) - elementsById = filterElements(elementsById, filterRules) - elements.addAll(elementsById) - } - } - "text" -> { - var elementsByText = 
temp.getElementsContainingOwnText(rules[1]) - if (needFilterElements) - elementsByText = filterElements(elementsByText, filterRules) - elements.addAll(elementsByText) - } - else -> elements.addAll(temp.select(rulePcx[0])) - } - if (rulePcx.size > 1) { - val rulePcs = rulePcx[1].splitNotBlank(":") - for (pc in rulePcs) { - val pcInt = Integer.parseInt(pc) - if (pcInt < 0 && elements.size + pcInt >= 0) { - elements[elements.size + pcInt] = null - } else if (Integer.parseInt(pc) < elements.size) { - elements[Integer.parseInt(pc)] = null - } - } - val es = Elements() - es.add(null) - elements.removeAll(es) - } - } - } catch (ignore: Exception) { + + var elements = Elements() + + val fi = findIndexSet(rule) //执行索引列表处理器 + + val (filterType,ruleStr) = fi //获取操作类型及非索引部分的规则字串 + +// val rulePc = rulePcx[0].trim { it <= ' ' }.split(">") +// jsoup中,当前节点是参与选择的,tag.div 与 tag.div@tag.div 结果相同 +// 此处">"效果和“@”完全相同,且容易让人误解成选择子节点,实际并不是。以后不允许这种无意义的写法 + + val rules = ruleStr.split(".") + + elements.addAll( + when (rules[0]) { + "children" -> temp.children() + "class" -> temp.getElementsByClass(rules[1]) + "tag" -> temp.getElementsByTag(rules[1]) + "id" -> Collector.collect(Evaluator.Id(rules[1]), temp) + "text" -> temp.getElementsContainingOwnText(rules[1]) + else -> temp.select(ruleStr) + } ) + + val indexSet = fi.getIndexs(elements.size) //传入元素数量,处理负数索引及索引越界问题,生成可用索引集合。 + + if(filterType == '!'){ //排除 + + for (pcInt in indexSet) elements[pcInt] = null + + elements.removeAll( Elements().apply { add(null) } ) + + }else if(filterType == '.'){ //选择 + + val es = Elements() + + for (pcInt in indexSet) es.add(elements[pcInt]) + + elements = es + } return elements @@ -331,13 +340,21 @@ class AnalyzeByJSoup(doc: Any) { * 获取内容列表 */ private fun getResultList(ruleStr: String): List? 
{ - if (isEmpty(ruleStr)) { - return null - } + + if (ruleStr.isEmpty()) return null + var elements = Elements() + elements.add(element) - val rules = ruleStr.splitNotBlank("@") - for (i in 0 until rules.size - 1) { + + val rule = RuleAnalyzer(ruleStr) //创建解析 + + while( rule.peek() =='@' || rule.peek() < '!' ) rule.advance() // 修剪当前规则之前的"@"或者空白符 + + val rules = rule.splitRule("@") // 切割成列表 + + val last = rules.size - 1 + for (i in 0 until last) { val es = Elements() for (elt in elements) { es.addAll(getElementsSingle(elt, rules[i])) @@ -345,9 +362,7 @@ class AnalyzeByJSoup(doc: Any) { elements.clear() elements = es } - return if (elements.isEmpty()) { - null - } else getResultLast(elements, rules[rules.size - 1]) + return if (elements.isEmpty()) null else getResultLast(elements, rules[last]) } /** @@ -365,7 +380,7 @@ class AnalyzeByJSoup(doc: Any) { val contentEs = element.textNodes() for (item in contentEs) { val temp = item.text().trim { it <= ' ' } - if (!isEmpty(temp)) { + if (temp.isNotEmpty()) { tn.add(temp) } } @@ -382,10 +397,12 @@ class AnalyzeByJSoup(doc: Any) { } "all" -> textS.add(elements.outerHtml()) else -> for (element in elements) { + val url = element.attr(lastRule) - if (!isEmpty(url) && !textS.contains(url)) { - textS.add(url) - } + + if(url.isEmpty() || textS.contains(url)) break + + textS.add(url) } } } catch (e: Exception) { @@ -395,17 +412,67 @@ class AnalyzeByJSoup(doc: Any) { return textS } - internal inner class SourceRule(ruleStr: String) { - var isCss = false - var elementsRule: String + data class IndexSet(var split:Char = ' ', + var beforeRule:String = "", + val indexs:MutableList = mutableListOf()){ + + fun getIndexs(len:Int): MutableSet { + + val indexSet = mutableSetOf() + + val lastIndexs = indexs.size - 1 + + for (ix in lastIndexs downTo 0 ){ //逆向遍历,还原顺序 + + if(indexs[ix] is Triple<*, *, *>){ //区间 + + var (start, end, step) = indexs[ix] as Triple //还原储存时的类型 + + if (start >= 0) { + if (start >= len) start = len - 1 //右端越界,设置为最大索引 
+ } else start = if (-start <= len) len + start /* 将负索引转正 */ else 0 //左端越界,设置为最小索引 + + if (end >= 0) { + if (end >= len) end = len - 1 //右端越界,设置为最大索引 + } else end = if (-end <= len) len + end /* 将负索引转正 */ else 0 //左端越界,设置为最小索引 + + if (start == end || step >= len) { //两端相同,区间里只有一个数。或间隔过大,区间实际上仅有首位 + + indexSet.add(start) + continue + + } + + step = if (step > 0) step else if (-step < len) step + len else 1 //最小正数间隔为1 + + //将区间展开到集合中,允许列表反向。 + indexSet.addAll(if (end > start) start..end step step else start downTo end step step) + + }else{//单个索引 + + val it = indexs[ix] as Int //还原储存时的类型 + + if(it in 0 until len) indexSet.add(it) //将正数不越界的索引添加到集合 + else if(it < 0 && len >= -it) indexSet.add(it + len) //将负数不越界的索引添加到集合 + + } - init { - if (ruleStr.startsWith("@CSS:", true)) { - isCss = true - elementsRule = ruleStr.substring(5).trim { it <= ' ' } - } else { - elementsRule = ruleStr } + + return indexSet + + } + + } + + + internal inner class SourceRule(ruleStr: String) { + var isCss = false + var elementsRule: String = if (ruleStr.startsWith("@CSS:", true)) { + isCss = true + ruleStr.substring(5).trim { it <= ' ' } + } else { + ruleStr } } diff --git a/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByXPath.kt b/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByXPath.kt index 5954ccdf2..dff178b46 100644 --- a/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByXPath.kt +++ b/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByXPath.kt @@ -2,7 +2,6 @@ package io.legado.app.model.analyzeRule import android.text.TextUtils import androidx.annotation.Keep -import io.legado.app.utils.splitNotBlank import org.jsoup.nodes.Document import org.jsoup.nodes.Element import org.jsoup.select.Elements @@ -27,10 +26,10 @@ class AnalyzeByXPath(doc: Any) { private fun strToJXDocument(html: String): JXDocument { var html1 = html if (html1.endsWith("")) { - html1 = String.format("%s", html1) + html1 = "${html1}" } if (html1.endsWith("") || html1.endsWith("")) 
{ - html1 = String.format("%s
", html1) + html1 = "${html1}
" } return JXDocument.create(html1) } @@ -45,26 +44,13 @@ class AnalyzeByXPath(doc: Any) { } internal fun getElements(xPath: String): List? { - if (TextUtils.isEmpty(xPath)) { - return null - } + + if(xPath.isEmpty()) return null + val jxNodes = ArrayList() - val elementsType: String - val rules: Array - when { - xPath.contains("&&") -> { - rules = xPath.splitNotBlank("&&") - elementsType = "&" - } - xPath.contains("%%") -> { - rules = xPath.splitNotBlank("%%") - elementsType = "%" - } - else -> { - rules = xPath.splitNotBlank("||") - elementsType = "|" - } - } + val ruleAnalyzes = RuleAnalyzer(xPath) + val rules = ruleAnalyzes.splitRule("&&","||","%%") + if (rules.size == 1) { return getResult(rules[0]) } else { @@ -73,13 +59,13 @@ class AnalyzeByXPath(doc: Any) { val temp = getElements(rl) if (temp != null && temp.isNotEmpty()) { results.add(temp) - if (temp.isNotEmpty() && elementsType == "|") { + if (temp.isNotEmpty() && ruleAnalyzes.elementsType == "||") { break } } } if (results.size > 0) { - if ("%" == elementsType) { + if ("%%" == ruleAnalyzes.elementsType) { for (i in results[0].indices) { for (temp in results) { if (i < temp.size) { @@ -98,23 +84,11 @@ class AnalyzeByXPath(doc: Any) { } internal fun getStringList(xPath: String): List { + val result = ArrayList() - val elementsType: String - val rules: Array - when { - xPath.contains("&&") -> { - rules = xPath.splitNotBlank("&&") - elementsType = "&" - } - xPath.contains("%%") -> { - rules = xPath.splitNotBlank("%%") - elementsType = "%" - } - else -> { - rules = xPath.splitNotBlank("||") - elementsType = "|" - } - } + val ruleAnalyzes = RuleAnalyzer(xPath) + val rules = ruleAnalyzes.splitRule("&&","||","%%") + if (rules.size == 1) { getResult(xPath)?.map { result.add(it.asString()) @@ -126,13 +100,13 @@ class AnalyzeByXPath(doc: Any) { val temp = getStringList(rl) if (temp.isNotEmpty()) { results.add(temp) - if (temp.isNotEmpty() && elementsType == "|") { + if (temp.isNotEmpty() && 
ruleAnalyzes.elementsType == "||") { break } } } if (results.size > 0) { - if ("%" == elementsType) { + if ("%%" == ruleAnalyzes.elementsType) { for (i in results[0].indices) { for (temp in results) { if (i < temp.size) { @@ -151,15 +125,8 @@ class AnalyzeByXPath(doc: Any) { } fun getString(rule: String): String? { - val rules: Array - val elementsType: String - if (rule.contains("&&")) { - rules = rule.splitNotBlank("&&") - elementsType = "&" - } else { - rules = rule.splitNotBlank("||") - elementsType = "|" - } + val ruleAnalyzes = RuleAnalyzer(rule) + val rules = ruleAnalyzes.splitRule("&&","||") if (rules.size == 1) { getResult(rule)?.let { return TextUtils.join("\n", it) @@ -171,7 +138,7 @@ class AnalyzeByXPath(doc: Any) { val temp = getString(rl) if (!temp.isNullOrEmpty()) { textList.add(temp) - if (elementsType == "|") { + if (ruleAnalyzes.elementsType == "||") { break } } diff --git a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt new file mode 100644 index 000000000..73a861acf --- /dev/null +++ b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt @@ -0,0 +1,515 @@ +package io.legado.app.model.analyzeRule + +//通用的规则切分处理 + +class RuleAnalyzer(data: String) { + + private var queue: String = data //被处理字符串 + private var pos = 0 //处理到的位置 + + private var start = 0 //每次处理字段的开始 + private var end:Int = queue.length //每次处理字段的终点 + private var step:Int = 0 //分割字符的长度 + + var elementsType = "" + + //当前平衡字段 + fun currBalancedString( stepStart:Int = 1 , stepEnd:Int = 1): String { //stepStart平衡字符的起始分隔字串长度,stepEnd平衡字符的结束分隔字串长度 + return queue.substring(start+stepStart,pos-stepEnd) //当前平衡字段 + } + + //将pos重置为0,方便复用 + fun reSetPos() { + pos = 0 + } + + //当前拉取字段 + fun currString(): String { + return queue.substring(start,pos) //当前拉取到的字段 + } + + //剩余字串 + fun remainingString(): String { + start = pos + pos = queue.length + return queue.substring(start) + } + + /** + * pos位置回退 + */ 
+ fun back(num :Int = 0) { + if(num == 0)pos = start //回退 + else pos -= num + } + + /** + * pos位置后移 + */ + fun advance(num :Int = 1) { + pos+=num + } + + /** + * 是否已无剩余字符? + * @return 若剩余字串中已无字符则返回true + */ + val isEmpty: Boolean + get() = queue.length - pos == 0 //是否处理到最后 + + /** + * 检索并返回首字符,但pos不变 + * @return 首字符:若为空则为 0 号字符 + */ + fun peek(): Char { //检索首字符 + return if (isEmpty) 0.toChar() else queue[pos] + } + + /** + * 消耗剩余字串中一个字符。 + * @return 返回剩余字串中的下个字符。 + */ + fun consume(): Char { + return queue[pos++] + } + + /** + * 字串与剩余字串是否匹配,不区分大小写 + * @param seq 字符串被检查 + * @return 若下字符串匹配返回 true + */ + fun matches(seq: String): Boolean { + return queue.regionMatches(pos, seq, 0, seq.length, ignoreCase = true) + } + + /** + * 测试下个字符是否与序列中相应位置的字符相等。 + * @param seq :被检查的字符列表 + * @return 相等就为 true ,不相等则为 false + */ + fun matchesAny(vararg seq: Char): Boolean { + if (isEmpty) return false + for (c in seq) { + if (queue[pos] == c) { + return true + } + } + return false + } + + /** + * 测试下个字符(串)是否与参数列表里的序列存在匹配。 不区分大小写。 + * @param seq :被不区分大小写检查的字符串列表 + * @return 只要匹配就为 true ,没有匹配则为 false + */ + fun matchesAny(vararg seq: String): Boolean { + for (s in seq) { + if (matches(s)) { + step = s.length + return true + } + } + return false + } + + /** + * 从剩余字串中拉出一个字符串,直到但不包括匹配序列,或剩余字串用完。 + * @param seq :分隔字符 **区分大小写** + * @return 是否找到相应字段。 + */ + fun consumeTo(seq: String,setStartPos:Boolean = true): Boolean { + if(setStartPos)start = pos //将处理到的位置设置为规则起点 + val offset = queue.indexOf(seq, pos) + return if (offset != -1) { + pos = offset + true + } else false + } + + /** + * 从剩余字串中拉出一个字符串,直到但不包括匹配序列(匹配参数列表中一项即为匹配),或剩余字串用完。 + * @param f 消费函数,返回true表示消费,fasle表示不消费 + * @param setStartPos 设置开始消费位置 + * @return 消耗的字符串 + */ + fun consumeToAny(setStartPos:Boolean = true, f:()->Boolean,): Boolean { + if(setStartPos)start = pos //将处理到的位置设置为规则起点 + while (!isEmpty && !f()) { + pos++ + } + return !isEmpty + } + + //其中js只要符合语法,就不用避开任何阅读关键字,自由发挥 + fun chompJsBalanced(f: ((Char) -> Boolean?) 
= { + if ( it == '{' )true //开始嵌套一层 + else if ( it == '}') false //闭合一层嵌套 + else null + } ): Boolean { + start = pos + var depth = 0 //嵌套深度 + var bracketsDepth = 0 //[]嵌套深度 + + var inSingleQuote = false //单引号 + var inDoubleQuote = false //双引号 + var inOtherQuote = false //js原始字串分隔字符 + var regex = false //正则 + var commit = false //单行注释 + var commits = false //多行注释 + + do { + if (isEmpty) break + var c = consume() + if (c != '\\') { //非转义字符 + if (c == '\'' && !commits && !commit && !regex && !inDoubleQuote && !inOtherQuote) inSingleQuote = !inSingleQuote //匹配具有语法功能的单引号 + else if (c == '"' && !commits && !commit && !regex && !inSingleQuote && !inOtherQuote) inDoubleQuote = !inDoubleQuote //匹配具有语法功能的双引号 + else if (c == '`' && !commits && !commit && !regex && !inSingleQuote && !inDoubleQuote) inOtherQuote = !inOtherQuote //匹配具有语法功能的'`' + else if (c == '/' && !commits && !commit && !regex && !inSingleQuote && !inDoubleQuote && !inOtherQuote) { //匹配注释或正则起点 + c = consume() + when(c){ + '/'->commit=true //匹配单行注释起点 + '*'->commits=true //匹配多行注释起点 + else ->regex=true //匹配正则起点 + } + } + else if(commits && c == '*') { //匹配多行注释终点 + c = consume() + if(c == '/')commits = false + } + else if(regex && c == '/') { //正则的终点或[]平衡 + + if(c == '/')regex = false//匹配正则终点 + + //为了保证当open为( 且 close 为 )时,正则中[(]或[)]的合法性。故对[]这对在任何规则中都平衡的成对符号做匹配。 + // 注:正则里[(]、[)]、[{]、[}]都是合法的,所以只有[]必须平衡。 + + else if ( c == '[' )bracketsDepth++ //开始嵌套一层[] + else if ( c== ']') bracketsDepth-- //闭合一层嵌套[] + + } + + if (commits || commit || regex || inSingleQuote || inDoubleQuote || inOtherQuote) continue //语法单元未匹配结束,直接进入下个循环 + + val fn = f(c) + if (fn == null) continue + if (fn) depth++ else depth-- //嵌套或者闭合 + + }else { //转义字符 + var next = consume() //拉出被转义字符 + if(commit && next == 'n') commit = false //匹配单行注释终点。当前为\,下个为n,表示换行 + else if (!commits && !commit && next == '\\') { + consume() //当前为\,下个为\,双重转义中"\\"表示转义字符本身,根据if条件"\\"字串不在注释中,则只能在字串或正则中 + next = consume() //拉出下个字符,因为在双重转义的字串或正则中,类似于 \\/ 这样的结构才是转义结构 + if(next 
== '\\') consume() // the escape character again: pull out its partner too, because under double escaping escape characters come in pairs, i.e. \\\\
                }
            }
        } while (depth > 0 || bracketsDepth > 0) // keep pulling until the whole JS-syntax field has been consumed

        if (depth > 0 || bracketsDepth > 0) start = pos

        return pos > start
    }

    /**
     * Pulls one balanced rule group out of a doubly-escaped string.
     *
     * Sets `start` to the current `pos`, then consumes characters until both nesting counters
     * are back at zero or the input is exhausted. Characters inside single or double quotes are
     * not counted. A backslash consumes the character after it; see the escape branch for the
     * double-escape handling.
     *
     * @param open character that opens one level of the default nesting (default `'['`)
     * @param close character that closes one level of the default nesting (default `']'`)
     * @param f optional classifier for characters other than [open]/[close]; it is consulted only
     * while the default nesting depth is 0. `true` opens one extra level, `false` closes one,
     * `null` means the character takes no part in balancing.
     * @return `true` when neither nesting counter is above zero at loop exit (the group is
     * balanced), `false` when the input ran out while still nested.
     */
    fun chompRuleBalanced(open: Char = '[', close: Char = ']', f: ((Char) -> Boolean?)? = null): Boolean {
        start = pos
        var depth = 0 // nesting depth of the open/close pair
        var otherDepth = 0 // nesting depth of the additional symmetric symbols reported by f

        var inSingleQuote = false // currently inside '...'
        var inDoubleQuote = false // currently inside "..."

        do {
            if (isEmpty) break
            val c = consume()
            if (c != ESC) { // ordinary, non-escape character
                if (c == '\'' && !inDoubleQuote) inSingleQuote = !inSingleQuote // single quote acting as syntax
                else if (c == '"' && !inSingleQuote) inDoubleQuote = !inDoubleQuote // double quote acting as syntax

                if (inSingleQuote || inDoubleQuote) continue // quoted unit not finished: jump to the loop condition

                if (c == open) depth++ // one level deeper
                else if (c == close) depth-- // one level closed
                else if (depth == 0 && f != null) {
                    // Other symbols are balanced only when the default nesting is fully closed;
                    // inside an open/close group they do not need to balance.
                    val fn = f(c)
                    if (fn == null) continue
                    if (fn) otherDepth++ else otherDepth--
                }

            } else { // escape character
                // NOTE(review): none of the consume() calls below checks isEmpty first; if consume()
                // fails at end of input, a rule ending in a backslash would fail here — confirm.
                var next = consume() // pull out the escaped character; covers \/, \", \' and the like
                if (next == ESC) {
                    consume() // current is \ and next is \: under double escaping "\\" stands for the escape character itself, so we are inside a string or regex
                    next = consume() // pull out the following character: in a doubly-escaped string or regex a structure such as \\/ is the real escape
                    if (next == ESC) consume() // an escape character again: pull out its partner (they come in pairs, i.e. \\\\)
                }
            }
        } while (depth > 0 || otherDepth > 0) // keep going until one balanced string has been pulled out

        return !(depth > 0 || otherDepth > 0) // true = balanced, false = still nested when the input ended
    }

    /**
     * Splits the rule held in `queue` on the first separator from [split] that is found,
     * skipping separators that sit inside a `(...)` or `[...]` filter.
     *
     * Original author's intent: do the split without regular expressions and without slicing
     * until a piece is final, so that JsonPath's own "&&" / "||" inside filters, and "&&", "||"
     * or "%%" inside regexes or strings, are not mistaken for rule combinators.
     *
     * This is the first-stage match: `elementsType` is still unset on entry and is assigned the
     * separator that was found. Later pieces are handled by the private `splitRule(rules)` overload.
     *
     * NOTE(review): the code reads `pos` as the position of the match and `step` as the length of
     * the matched separator after `consumeToAny` succeeds; those helpers are not visible here — confirm.
     *
     * @param split candidate separator strings
     * @return the rule pieces in order; a single-element array holding the whole `queue` when no
     * separator is found
     * @throws Error when a filter opened by `(` or `[` is never closed
     */
    tailrec fun splitRule(vararg split: String): Array<String> { // first-stage match, elementsType not set yet

        if (!consumeToAny { matchesAny(*split) }) return arrayOf(queue) // no separator found

        end = pos
        val st = if (consumeToAny(false) { matchesAny('(', '[') }) pos else -1 // look for a filter
        pos = end

        if (st == -1) {

            var rule = arrayOf(queue.substring(0, pos)) // first piece goes into the array

            pos += step // skip the separator
            elementsType = queue.substring(pos - step, pos) // remember which combinator is in use

            while (consumeToAny { matchesAny(*split) }) { // cut the remaining pieces one by one
                rule += queue.substring(start, pos)
                pos += step // skip the separator
            }
            rule += queue.substring(start) // whatever is left becomes the last piece
            return rule
        }

        val rule = if (st > pos) { // the separator comes before the filter, so it is not inside it: push every piece that precedes the filter

            var rule = arrayOf(queue.substring(0, pos)) // first piece goes into the array

            pos += step // skip the separator
            elementsType = queue.substring(pos - step, pos) // remember which combinator is in use

            while (pos < st && consumeToAny { matchesAny(*split) }) {
                rule += queue.substring(start, pos) // cut the pieces one by one
                pos += step // skip the separator
            }
            rule
        } else null

        pos = st // move to the filter
        val next = if (queue[pos] == '[') ']' else ')' // closing character of the balanced group

        return if (rule == null) { // no piece produced yet: the first-stage match is not finished

            if (!chompRuleBalanced(queue[pos], next)) throw Error(queue.substring(0, start) + "后未平衡") // pull out one filter; fail if it is unbalanced
            splitRule(*split) // recurse into the first-stage match

        } else {

            val start0 = start // remember where the current piece begins
            if (!chompRuleBalanced(queue[pos], next)) throw Error(queue.substring(0, start) + "后未平衡") // pull out one filter; fail if it is unbalanced
            start = start0 // the filter's start is not the piece's start, so restore it
            splitRule(rule) // first piece is done, the current one is not: continue with the second-stage match

        }

    }

    /**
     * Second-stage match: continues splitting after the first piece has been found.
     *
     * `elementsType` was assigned by the first stage, so the search uses that single separator
     * via `consumeTo` instead of trying every candidate.
     *
     * @param rules pieces collected so far
     * @return [rules] followed by the remaining pieces
     * @throws Error when a filter opened by `(` or `[` is never closed
     */
    @JvmName("splitRuleNext")
    private tailrec fun splitRule(rules: Array<String>): Array<String> {

        // The search does not begin at the start of the current piece; `start` keeps its previous value.
        if (!consumeTo(elementsType, false)) return rules + queue.substring(start)

        end = pos
        val st = if (consumeToAny(false) { matchesAny('(', '[') }) pos else -1 // look for a filter
        pos = end

        if (st == -1) {
            var rule = rules + queue.substring(start, pos) // push the piece that was just delimited
            pos += step // skip the separator
            while (consumeTo(elementsType)) { // cut the remaining pieces one by one
                rule += queue.substring(start, pos)
                pos += step // skip the separator
            }
            rule += queue.substring(start) // whatever is left becomes the last piece
            return rule
        }

        val rule = if (st > pos) { // the separator comes before the filter, so it is not inside it: push every piece that precedes the filter
            var rule = rules + queue.substring(start, pos) // push the piece that was just delimited
            pos += step // skip the separator
            while (pos < st && consumeTo(elementsType)) { // cut the pieces one by one
                rule += queue.substring(start, pos)
                pos += step // skip the separator
            }
            rule
        } else rules

        pos = st // move to the filter
        val next = if (queue[pos] == '[') ']' else ')' // closing character of the balanced group

        val start0 = start // remember where the current piece begins
        if (!chompRuleBalanced(queue[pos], next)) throw Error(queue.substring(0, start) + "后未平衡") // pull out one filter; chompRuleBalanced returns false when it is unbalanced
        start = start0 // the filter is balanced, but its start is not the piece's start, so restore it

        return splitRule(rule) // recurse

    }


    /**
     * Replaces embedded rules inside the rule string.
     *
     * Each occurrence of [inner] is expanded to a balanced group (`[]` as the default nesting,
     * `{}` as the additional nesting) and handed to [fr]. The text before the group and the
     * value returned by [fr] are appended to the result.
     *
     * @param inner opening marker, e.g. `{$.` or `{{`
     * @param startStep number of leading characters that are not part of the rule itself, e.g. 1
     * for `{$.` because the `{` does not belong to the rule
     * @param endStep number of trailing characters that are not part of the rule itself, e.g. 2 for `}}`
     * @param fr resolver called with each embedded rule; returning `null` marks the group as not replaced
     * @return the rule with embedded rules replaced, or an empty string when nothing was replaced
     */
    fun innerRule(inner: String, startStep: Int = 1, endStep: Int = 1, fr: (String) -> String?): String {

        val start0 = pos // position before any matching

        val st = StringBuilder()

        // Per the original author's note: consumeTo returns true on success and moves pos forward,
        // otherwise it returns false and isEmpty is true.
        while (!isEmpty && consumeTo(inner)) {

            val start1 = start // start of the segment before the group is pulled out

            if (chompRuleBalanced { // pull out a group balanced on [] (default) and {} (additional)
                    when (it) {
                        '{' -> true
                        '}' -> false
                        else -> null
                    }
                }) {
                val frv = fr(currBalancedString(startStep, endStep))
                if (frv != null) {

                    // Text preceding the embedded rule, then the resolved value; appended directly
                    // instead of building a temporary concatenated string first.
                    st.append(queue, start1, start).append(frv)
                    continue // replaced successfully, look for the next embedded rule

                }
            }

            start = start1 // the group was unbalanced or unresolved: restore the segment start
            // inner was just ordinary text: rewind and continue matching after it.
            // NOTE(review): pos is derived from the restored start, not from the position where inner
            // was found; it lands just past inner only if the two coincide — confirm against consumeTo.
            pos = start + inner.length

        }

        // If the position before matching equals the current start, no embedded rule was replaced:
        // return an empty string. Otherwise return the replaced text.
        return if (start0 == start) "" else {
            st.append(remainingString()) // append the rest of the rule
            st.toString()
        }
    }

//    /**
//     * Matches and returns an attribute-key string inside a tag (letters, digits, -, _, :)
//     * @return the attribute-key string
//     */
//    fun consumeAttributeKey(start:Int = pos): String {
//        while (!isEmpty && (Character.isLetterOrDigit(queue[pos]) || matchesAny('-', '_', ':'))) pos++
//        return queue.substring(start, pos)
//    }

//    fun splitRule(query:String,item:String = "other",listItem:String = "allInOne"):String{
//
//        val cuurItem = item // current item type: list -> list item, mulu -> chapter-list item, url -> link item, search -> search-link item, find -> discovery-link list item,
//                              other -> any other item
//        val cuurList = listItem // type of the list shown on the current screen: allInOne, json, xml, kotlin, java
//        var Reverse = false // whether the list is reversed
//
//        consumeWhitespace() // consume leading whitespace
//        var fisrt = consume() // pull out and consume the first character
//
//        when(item){
//            "search" ->
//            "find" ->
//            "mulu" -> if(fisrt == '-'){
//                Reverse=true // switch reversing on
//                consumeWhitespace() // pull out all whitespace
//                fisrt = consume() // move past the first character
//            }
//            else ->
//
//        }
//
//        return query
//    }

    companion object {
        /** The escape character. */
        private const val ESC: Char = '\\'

        /**
         * Opening markers of the separator strings shared by all rule kinds.
         *
         * NOTE(review): the fifth entry is an empty string; it may be a marker whose text was
         * lost somewhere upstream — confirm the intended value.
         */
        val splitList: Array<String> = arrayOf("##", "@@", "{{", "{[", "", "@js:")

        /** Separator between name and link in a discovery entry: `::`. */
        const val splitListFaXian: String = "::"

        /** Leading character specific to table-of-contents rules: `-`. */
        const val splitListMulu: String = "-"

        /** Leading character of the all-in-one mode whose result is an element list: `+`. */
        const val splitListTongYi: String = "+"

        /** Combinators that join same-kind rules whose result is an element list. */
        val splitListReSplit: Array<String> = arrayOf("||", "&&", "%%")

        /**
         * End string of a JS script.
         *
         * NOTE(review): the value is an empty string, which cannot delimit anything; it may have
         * lost its text upstream — confirm the intended value.
         */
        const val splitListEndJS: String = ""

        /** End string of inline JS: `}}`. */
        const val splitListEndInnerJS: String = "}}"

        /** End string of an embedded rule: `]}`. */
        const val splitListEndInnerRule: String = "]}"

        /** The bracket characters: square, round and curly, each as an opening/closing pair. */
        val splitListPublic: CharArray = charArrayOf('[', ']', '(', ')', '{', '}')

        /** Marker strings grouped under the XPath name. */
        val splitListXpath: Array<String> = arrayOf("*", "/", "//", ":", "::", "@", "|", "@xpath:")

        /**
         * Markers grouped under the JSON name.
         *
         * NOTE(review): the first two entries are Char literals while the rest are Strings, so
         * this is not an array of String like `splitListXpath` — confirm that is intended.
         */
        val splitListJson = arrayOf('*', '$', ".", "..", "@json:")

        /**
         * Markers grouped under the CSS name.
         *
         * NOTE(review): the first entry is a Char literal while the rest are Strings — confirm
         * that is intended.
         */
        val splitListCss = arrayOf('*', "+", "~", ".", ",", "|", "@", "@css:", ":")

        /** Marker strings grouped under the default name. */
        val splitListDefault: Array<String> = arrayOf("-", ".", "!", "@", "@@")

    }
}