From 08d53a974caa67c1d4bae02649d3fc0f1488d582 Mon Sep 17 00:00:00 2001 From: bushixuanqi <57338301+bushixuanqi@users.noreply.github.com> Date: Tue, 6 Jul 2021 00:47:14 +0800 Subject: [PATCH 1/6] Update RuleAnalyzer.kt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 原来的规则还有点问题,有些情况下会跳过不分隔规则。 原因是拿一个变量记录了两个状态,有时忘记回退切换状态,导致队列位置处理混乱。 我重新梳理了一遍,更改了所有所有相关规则,新增了类成员变量保存另一种状态。 然后测试了我自己写的那几个书源,和其它用大量采用“||”、“&&”、“%%”分隔规则的书源,总算搞定。 --- .../app/model/analyzeRule/RuleAnalyzer.kt | 243 ++++++++++-------- 1 file changed, 141 insertions(+), 102 deletions(-) diff --git a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt index e2f08660f..a2b78b251 100644 --- a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt +++ b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt @@ -6,8 +6,10 @@ class RuleAnalyzer(data: String) { private var queue: String = data //被处理字符串 private var pos = 0 //处理到的位置 + private var rule = arrayOf() private var start = 0 //每次处理字段的开始 + private var startX = 0 //规则的开始 private var step:Int = 0 //分割字符的长度 var elementsType = "" @@ -17,6 +19,11 @@ class RuleAnalyzer(data: String) { return queue.substring(start+stepStart,pos-stepEnd) //当前平衡字段 } + + fun trim(){ // 修剪当前规则之前的"@"或者空白符 + while (queue[pos] == '@' || queue[pos] < '!') pos++ + } + //将pos重置为0,方便复用 fun reSetPos() { pos = 0 @@ -34,21 +41,6 @@ class RuleAnalyzer(data: String) { return queue.substring(start) } - /** - * pos位置回退 - */ - fun back(num :Int = 0) { - if(num == 0)pos = start //回退 - else pos -= num - } - - /** - * pos位置后移 - */ - fun advance(num :Int = 1) { - pos+=num - } - /** * 是否已无剩余字符? 
* @return 若剩余字串中已无字符则返回true @@ -109,9 +101,8 @@ class RuleAnalyzer(data: String) { while (pos != queue.length) { for (s in seq) { - if (matches(s)) { + if (queue.regionMatches(pos, s, 0, s.length)) { step = s.length //间隔数 - start = this.pos //匹配成功, 设置规则下次起始位置 this.pos = pos //匹配成功, 同步处理位置到类 return true //匹配就返回 true } @@ -163,7 +154,7 @@ class RuleAnalyzer(data: String) { var commits = false //多行注释 do { - if (isEmpty) break + if (pos == queue.length) break var c = queue[pos++] if (c != '\\') { //非转义字符 if (c == '\'' && !commits && !commit && !regex && !inDoubleQuote && !inOtherQuote) inSingleQuote = !inSingleQuote //匹配具有语法功能的单引号 @@ -192,22 +183,23 @@ class RuleAnalyzer(data: String) { ']' -> bracketsDepth-- //闭合一层嵌套[] } - } + }else if(c == '\n') commit = false if (commits || commit || regex || inSingleQuote || inDoubleQuote || inOtherQuote) continue //语法单元未匹配结束,直接进入下个循环 val fn = f(c) ?: continue if (fn) depth++ else depth-- //嵌套或者闭合 - }else { //转义字符 - var next = queue[pos++] //拉出被转义字符 - if(commit && next == 'n') commit = false //匹配单行注释终点。当前为\,下个为n,表示换行 - else if (!commits && !commit && next == '\\') { - queue[pos++] //当前为\,下个为\,双重转义中"\\"表示转义字符本身,根据if条件"\\"字串不在注释中,则只能在字串或正则中 - next = queue[pos++] //拉出下个字符,因为在双重转义的字串或正则中,类似于 \\/ 这样的结构才是转义结构 - if(next == '\\')queue[pos++] //若为转义字符则继续拉出,因为双重转义中转义字符成对存在,即 \\\\ - } - } + }else pos++ +// { //转义字符 +// var next = queue[pos++] //拉出被转义字符 +// if(commit && next == 'n') commit = false //匹配单行注释终点。当前为\,下个为n,表示换行 +// else if (!commits && !commit && next == '\\') { +// queue[pos++] //当前为\,下个为\,双重转义中"\\"表示转义字符本身,根据if条件"\\"字串不在注释中,则只能在字串或正则中 +// next = queue[pos++] //拉出下个字符,因为在双重转义的字串或正则中,类似于 \\/ 这样的结构才是转义结构 +// if(next == '\\')queue[pos++] //若为转义字符则继续拉出,因为双重转义中转义字符成对存在,即 \\\\ +// } +// } } while (depth > 0 || bracketsDepth >0) //拉出全部符合js语法的字段 return if(depth > 0 || bracketsDepth > 0) false else { @@ -230,7 +222,7 @@ class RuleAnalyzer(data: String) { var inDoubleQuote = false //双引号 do { - if (isEmpty) break + if (pos == queue.length) 
break val c = queue[pos++] if (c != ESC) { //非转义字符 if (c == '\'' && !inDoubleQuote) inSingleQuote = !inSingleQuote //匹配具有语法功能的单引号 @@ -245,14 +237,15 @@ class RuleAnalyzer(data: String) { if (fn) otherDepth++ else otherDepth-- } - }else { //转义字符 - var next = queue[pos++] //拉出被转义字符,匹配\/、\"、\'等 - if (next == ESC) { - queue[pos++] //当前为\,下个为\,双重转义中"\\"表示转义字符本身,根据语法特征当前字段在字串或正则中 - next = queue[pos++] //拉出下个字符,因为在双重转义的字串或正则中,类似于 \\/ 这样的结构才是转义结构 - if(next == ESC)queue[pos++] //若为转义字符则继续拉出,因为双重转义中转义字符成对存在,即 \\\\ - } - } + }else pos++ +// { //转义字符 +// var next = queue[pos++] //拉出被转义字符,匹配\/、\"、\'等 +// if (next == ESC) { +// queue[pos++] //当前为\,下个为\,双重转义中"\\"表示转义字符本身,根据语法特征当前字段在字串或正则中 +// next = queue[pos++] //拉出下个字符,因为在双重转义的字串或正则中,类似于 \\/ 这样的结构才是转义结构 +// if(next == ESC)queue[pos++] //若为转义字符则继续拉出,因为双重转义中转义字符成对存在,即 \\\\ +// } +// } } while (depth > 0 || otherDepth > 0) //拉出一个平衡字串 return if(depth > 0 || otherDepth > 0) false else { @@ -269,96 +262,142 @@ class RuleAnalyzer(data: String) { if(split.size == 1) { elementsType = split[0] //设置分割字串 - step = elementsType.length //设置分隔符长度 - return splitRule(arrayOf()) //仅一个分隔字串时,直接二段解析更快 - }else if (!consumeToAny(* split)) return arrayOf(queue) //未找到分隔符 + return if(!consumeTo(elementsType)) { + rule += queue.substring(startX) + rule + }else { + step = elementsType.length //设置分隔符长度 + splitRule() + } //递归匹配 + }else if (!consumeToAny(* split)) { //未找到分隔符 + rule += queue.substring(startX) + return rule + } - val st = findToAny( '[','(' ) //查找筛选器 + val end = pos //记录分隔位置 + pos = start //重回开始,启动另一种查找 - if(st == -1) { + do{ + val st = findToAny('[', '(') //查找筛选器位置 - var rule = arrayOf(queue.substring(0, pos)) //压入分隔的首段规则到数组 + if (st == -1) { - elementsType = queue.substring(pos, pos + step) //设置组合类型 - pos += step //跳过分隔符 + rule = arrayOf(queue.substring(startX, end)) //压入分隔的首段规则到数组 - while (consumeToAny(* split)) { //循环切分规则压入数组 - rule += queue.substring(start, pos) - pos += step //跳过分隔符 - } + elementsType = queue.substring(end, end + step) 
//设置组合类型 + pos = end + step //跳过分隔符 + + while (consumeTo(elementsType)) { //循环切分规则压入数组 + rule += queue.substring(start, pos) + pos += step //跳过分隔符 + } - rule += queue.substring(pos) //将剩余字段压入数组末尾 + rule += queue.substring(pos) //将剩余字段压入数组末尾 - return rule - } + return rule + } + + if (st > end) { //先匹配到st1pos,表明分隔字串不在选择器中,将选择器前分隔字串分隔的字段依次压入数组 - val rule = if(st >pos ){ //先匹配到st1pos,表明分隔字串不在选择器中,将选择器前分隔字串分隔的字段依次压入数组 + rule = arrayOf(queue.substring(startX, end)) //压入分隔的首段规则到数组 - var rule = arrayOf(queue.substring(0, pos)) //压入分隔的首段规则到数组 + elementsType = queue.substring(end, end + step) //设置组合类型 + pos = end + step //跳过分隔符 - elementsType = queue.substring(pos, pos + step) //设置组合类型 - pos += step //跳过分隔符 + while (consumeTo(elementsType) && pos < st) { //循环切分规则压入数组 + rule += queue.substring(start, pos) + pos += step //跳过分隔符 + } - while (consumeToAny( * split ) && pos < st ) { //循环切分规则压入数组 - rule += queue.substring(start, pos) - pos += step //跳过分隔符 + return if(pos > st) { + startX = start + splitRule() //首段已匹配,但当前段匹配未完成,调用二段匹配 + } + else { //执行到此,证明后面再无分隔字符 + rule += queue.substring(pos) //将剩余字段压入数组末尾 + rule + } } - rule + pos = st //位置推移到筛选器处 + val next = if (queue[pos] == '[') ']' else ')' //平衡组末尾字符 - }else null + if (!chompRuleBalanced(queue[pos], next)) throw Error( + queue.substring( + 0, + start + ) + "后未平衡" + ) //拉出一个筛选器,不平衡则报错 - pos = st //位置推移到筛选器处 - val next = if(queue[pos] == '[' ) ']' else ')' //平衡组末尾字符 + }while( end > pos ) - return if (rule == null) { //rule为空,首段未匹配完成 + start = pos //设置开始查找筛选器位置的起始位置 - if(!chompRuleBalanced(queue[pos],next)) throw Error(queue.substring(0, start)+"后未平衡") //拉出一个筛选器,不平衡则报错 - splitRule(* split) //递归调用首段匹配 + return splitRule(* split) //递归调用首段匹配 + } - } else { + @JvmName("splitRuleNext") + private tailrec fun splitRule(): Array{ //二段匹配被调用,elementsType非空(已在首段赋值),直接按elementsType查找,比首段采用的方式更快 - if(!chompRuleBalanced(queue[pos],next)) throw Error(queue.substring(0, start)+"后未平衡") //拉出一个筛选器,不平衡则报错 - splitRule(rule) 
//首段已匹配,但当前段匹配未完成,调用二段匹配 + val end = pos //记录分隔位置 + pos = start //重回开始,启动另一种查找 - } + do{ + val st = findToAny('[', '(') //查找筛选器位置 - } + if (st == -1) { - @JvmName("splitRuleNext") - private tailrec fun splitRule(rules:Array): Array{ //二段匹配被调用,elementsType非空(已在首段赋值),直接按elementsType查找,比首段采用的方式更快 + rule += arrayOf(queue.substring(startX, end)) //压入分隔的首段规则到数组 + pos = end + step //跳过分隔符 - if (!consumeTo(elementsType,false)) return rules + queue.substring(start) //此处consumeTo(...)开始位置不是规则的开始位置,start沿用上次设置 + while (consumeTo(elementsType)) { //循环切分规则压入数组 + rule += queue.substring(start, pos) + pos += step //跳过分隔符 + } - val st = findToAny( '[','(' ) //查找筛选器 + rule += queue.substring(pos) //将剩余字段压入数组末尾 - if(st == -1) { - var rule = rules + queue.substring(start, pos) //压入本次分隔的首段规则到数组 - pos += step //跳过分隔符 - while (consumeTo(elementsType)) { //循环切分规则压入数组 - rule += queue.substring(start, pos) - pos += step //跳过分隔符 + return rule } - rule += queue.substring(pos) //将剩余字段压入数组末尾 - return rule - } - val rule = if(st > pos ){//先匹配到st1pos,表明分隔字串不在选择器中,将选择器前分隔字串分隔的字段依次压入数组 - var rule = rules + queue.substring(start, pos) //压入本次分隔的首段规则到数组 - pos += step //跳过分隔符 - while (consumeTo(elementsType) && pos < st) { //循环切分规则压入数组 - rule += queue.substring(start, pos) - pos += step //跳过分隔符 + if (st > end) { //先匹配到st1pos,表明分隔字串不在选择器中,将选择器前分隔字串分隔的字段依次压入数组 + + rule += arrayOf(queue.substring(startX, end)) //压入分隔的首段规则到数组 + pos = end + step //跳过分隔符 + + while (consumeTo(elementsType) && pos < st) { //循环切分规则压入数组 + rule += queue.substring(start, pos) + pos += step //跳过分隔符 + } + + return if(pos > st) { + startX = start + splitRule() //首段已匹配,但当前段匹配未完成,调用二段匹配 + } + else { //执行到此,证明后面再无分隔字符 + rule += queue.substring(pos) //将剩余字段压入数组末尾 + rule + } } - rule - }else rules - pos = st //位置推移到筛选器处 - val next = if(queue[pos] == '[' ) ']' else ')' //平衡组末尾字符 + pos = st //位置推移到筛选器处 + val next = if (queue[pos] == '[') ']' else ')' //平衡组末尾字符 + + if (!chompRuleBalanced(queue[pos], next)) throw Error( + queue.substring( + 
0, + start + ) + "后未平衡" + ) //拉出一个筛选器,不平衡则报错 - if(!chompRuleBalanced(queue[pos],next)) throw Error(queue.substring(0, start)+"后未平衡") //拉出一个筛选器,不平衡时返回true,表示未平衡 + }while( end > pos ) - return splitRule(rule) //递归匹配 + start = pos //设置开始查找筛选器位置的起始位置 + + return if(!consumeTo(elementsType)) { + rule += queue.substring(startX) + rule + }else splitRule() //递归匹配 } @@ -380,12 +419,12 @@ class RuleAnalyzer(data: String) { while (!isEmpty && consumeTo(inner)) { //拉取成功返回true,ruleAnalyzes里的字符序列索引变量pos后移相应位置,否则返回false,且isEmpty为true val posPre = pos //记录上次结束位置 if (chompRuleBalanced {//拉出一个以[]为默认嵌套、以{}为补充嵌套的平衡字段 - when (it) { - '{' -> true - '}' -> false - else -> null - } - }) { + when (it) { + '{' -> true + '}' -> false + else -> null + } + }) { val frv= fr(currBalancedString(startStep,endStep)) if(frv != null) { From 2f38448c29c7d49ddabae64404ddc54a1680aa81 Mon Sep 17 00:00:00 2001 From: bushixuanqi <57338301+bushixuanqi@users.noreply.github.com> Date: Tue, 6 Jul 2021 00:55:14 +0800 Subject: [PATCH 2/6] Update AnalyzeByJSoup.kt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 将AnalyzeByJSoup中,getElementsSingle,findIndexSet,getIndexs三个函数统一整合进 data class ElementsSingle,因为它们是个整体。 --- .../app/model/analyzeRule/AnalyzeByJSoup.kt | 348 +++++++++--------- 1 file changed, 171 insertions(+), 177 deletions(-) diff --git a/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSoup.kt b/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSoup.kt index a182dc7f2..6d59f8133 100644 --- a/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSoup.kt +++ b/app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeByJSoup.kt @@ -137,11 +137,10 @@ class AnalyzeByJSoup(doc: Any) { } } else { for (ruleStr in ruleStrS) { - //将原getElementsSingle函数调用的函数的部分代码内联过来,方便简化getElementsSingle函数 val rsRule = RuleAnalyzer(ruleStr) - if( rsRule.peek() =='@' || rsRule.peek() < '!' 
) rsRule.advance() // 修剪当前规则之前的"@"或者空白符 + rsRule.trim() // 修剪当前规则之前的"@"或者空白符 val rs = rsRule.splitRule("@") @@ -157,7 +156,7 @@ class AnalyzeByJSoup(doc: Any) { el.addAll(es) } el - }else getElementsSingle(temp,ruleStr) + }else ElementsSingle().getElementsSingle(temp,ruleStr) elementsList.add(el) if (el.size > 0 && ruleAnalyzes.elementsType == "||") { @@ -183,173 +182,6 @@ class AnalyzeByJSoup(doc: Any) { return elements } - /** - * 1.支持阅读原有写法,':'分隔索引,!或.表示筛选方式,索引可为负数 - * - * 例如 tag.div.-1:10:2 或 tag.div!0:3 - * - * 2. 支持与jsonPath类似的[]索引写法 - * - * 格式形如 [it,it,。。。] 或 [!it,it,。。。] 其中[!开头表示筛选方式为排除,it为单个索引或区间。 - * - * 区间格式为 start:end 或 start:end:step,其中start为0可省略,end为-1可省略。 - * - * 索引,区间两端及间隔都支持负数 - * - * 例如 tag.div[-1, 3:-2:-10, 2] - * - * 特殊用法 tag.div[-1:0] 可在任意地方让列表反向 - * - * */ - - fun findIndexSet( rule:String ): IndexSet { - - val indexSet = IndexSet() - val rus = rule.trim{ it <= ' '} - - var len = rus.length - var curInt: Int? //当前数字 - var curMinus = false //当前数字是否为负 - val curList = mutableListOf() //当前数字区间 - var l = "" //暂存数字字符串 - - val head = rus.last() == ']' //是否为常规索引写法 - - if(head){ //常规索引写法[index...] - - len-- //跳过尾部']' - - while (len-- >= 0) { //逆向遍历,可以无前置规则 - - var rl = rus[len] - if (rl == ' ') continue //跳过空格 - - if (rl in '0'..'9') l += rl //将数值累接入临时字串中,遇到分界符才取出 - else if (rl == '-') curMinus = true - else { - - curInt = if (l.isEmpty()) null else if (curMinus) -l.toInt() else l.toInt() //当前数字 - - when (rl) { - - ':' -> curList.add(curInt) //区间右端或区间间隔 - - else -> { - - //为保证查找顺序,区间和单个索引都添加到同一集合 - if(curList.isEmpty()) { - - if(curInt == null) break //是jsoup选择器而非索引列表,跳出 - - indexSet.indexs.add(curInt) - } - else{ - - //列表最后压入的是区间右端,若列表有两位则最先压入的是间隔 - indexSet.indexs.add( Triple(curInt, curList.last(), if(curList.size == 2) curList.first() else 1) ) - - curList.clear() //重置临时列表,避免影响到下个区间的处理 - - } - - if(rl == '!'){ - indexSet.split='!' 
- do{ rl = rus[--len] } while (len > 0 && rl == ' ')//跳过所有空格 - } - - if(rl == '[') return indexSet.apply { - beforeRule = rus.substring(0, len) - } //遇到索引边界,返回结果 - - if(rl != ',') break //非索引结构,跳出 - - } - } - - l = "" //清空 - curMinus = false //重置 - } - } - } else while (len-- >= 0) { //阅读原本写法,逆向遍历,可以无前置规则 - - val rl = rus[len] - if (rl == ' ') continue //跳过空格 - - if (rl in '0'..'9') l += rl //将数值累接入临时字串中,遇到分界符才取出 - else if (rl == '-') curMinus = true - else { - - if(rl == '!' || rl == '.' || rl == ':') { //分隔符或起始符 - - indexSet.indexDefault.add(if (curMinus) -l.toInt() else l.toInt()) // 当前数字追加到列表 - - if (rl != ':') return indexSet.apply { //rl == '!' || rl == '.' - split = rl - beforeRule = rus.substring(0, len) - } - - }else break //非索引结构,跳出循环 - - l = "" //清空 - curMinus = false //重置 - } - - } - - return indexSet.apply{ - split = ' ' - beforeRule = rus } //非索引格式 - } - - /** - * 获取Elements按照一个规则 - */ - private fun getElementsSingle(temp: Element, rule: String): Elements { - - var elements = Elements() - - val fi = findIndexSet(rule) //执行索引列表处理器 - - val (filterType,ruleStr) = fi //获取操作类型及非索引部分的规则字串 - -// val rulePc = rulePcx[0].trim { it <= ' ' }.split(">") -// jsoup中,当前节点是参与选择的,tag.div 与 tag.div@tag.div 结果相同 -// 此处">"效果和“@”完全相同,且容易让人误解成选择子节点,实际并不是。以后不允许这种无意义的写法 - - val rules = ruleStr.split(".") - - elements.addAll( - if(ruleStr.isEmpty()) temp.children() //允许索引直接作为根元素,此时前置规则为空,效果与children相同 - else when (rules[0]) { - "children" -> temp.children() //允许索引直接作为根元素,此时前置规则为空,效果与children相同 - "class" -> temp.getElementsByClass(rules[1]) - "tag" -> temp.getElementsByTag(rules[1]) - "id" -> Collector.collect(Evaluator.Id(rules[1]), temp) - "text" -> temp.getElementsContainingOwnText(rules[1]) - else -> temp.select(ruleStr) - } ) - - val indexSet = fi.getIndexs(elements.size) //传入元素数量,处理负数索引及索引越界问题,生成可用索引集合。 - - if(filterType == '!'){ //排除 - - for (pcInt in indexSet) elements[pcInt] = null - - elements.removeAll(listOf(null)) //测试过,这样就行 - - }else if(filterType == '.'){ //选择 - 
- val es = Elements() - - for (pcInt in indexSet) es.add(elements[pcInt]) - - elements = es - - } - - return elements - } - /** * 获取内容列表 */ @@ -363,7 +195,7 @@ class AnalyzeByJSoup(doc: Any) { val rule = RuleAnalyzer(ruleStr) //创建解析 - while( rule.peek() =='@' || rule.peek() < '!' ) rule.advance() // 修剪当前规则之前的"@"或者空白符 + rule.trim() //修建前置赘余符号 val rules = rule.splitRule("@") // 切割成列表 @@ -371,7 +203,7 @@ class AnalyzeByJSoup(doc: Any) { for (i in 0 until last) { val es = Elements() for (elt in elements) { - es.addAll(getElementsSingle(elt, rules[i])) + es.addAll(ElementsSingle().getElementsSingle(elt, rules[i])) } elements.clear() elements = es @@ -426,12 +258,174 @@ class AnalyzeByJSoup(doc: Any) { return textS } - data class IndexSet(var split:Char = '.', - var beforeRule:String = "", - val indexDefault:MutableList = mutableListOf(), - val indexs:MutableList = mutableListOf()){ + data class ElementsSingle(var split:Char = '.', + var beforeRule:String = "", + val indexDefault:MutableList = mutableListOf(), + val indexs:MutableList = mutableListOf()){ + + /** + * 获取Elements按照一个规则 + */ + fun getElementsSingle(temp: Element, rule: String): Elements { + + var elements = Elements() + + findIndexSet(rule) //执行索引列表处理器 + + val rules = beforeRule.split(".") + + elements.addAll( + if(beforeRule.isEmpty()) temp.children() //允许索引直接作为根元素,此时前置规则为空,效果与children相同 + else when (rules[0]) { + "children" -> temp.children() //允许索引直接作为根元素,此时前置规则为空,效果与children相同 + "class" -> temp.getElementsByClass(rules[1]) + "tag" -> temp.getElementsByTag(rules[1]) + "id" -> Collector.collect(Evaluator.Id(rules[1]), temp) + "text" -> temp.getElementsContainingOwnText(rules[1]) + else -> temp.select(beforeRule) + } ) + + val indexSet = getIndexs(elements.size) //传入元素数量,处理负数索引及索引越界问题,生成可用索引集合。 + + if(split == '!'){ //排除 + + for (pcInt in indexSet) elements[pcInt] = null + + elements.removeAll(listOf(null)) //测试过,这样就行 + + }else if(split == '.'){ //选择 + + val es = Elements() + + for (pcInt in indexSet) 
es.add(elements[pcInt]) + + elements = es + + } + + return elements + + } + + /** + * 1.支持阅读原有写法,':'分隔索引,!或.表示筛选方式,索引可为负数 + * + * 例如 tag.div.-1:10:2 或 tag.div!0:3 + * + * 2. 支持与jsonPath类似的[]索引写法 + * + * 格式形如 [it,it,。。。] 或 [!it,it,。。。] 其中[!开头表示筛选方式为排除,it为单个索引或区间。 + * + * 区间格式为 start:end 或 start:end:step,其中start为0可省略,end为-1可省略。 + * + * 索引,区间两端及间隔都支持负数 + * + * 例如 tag.div[-1, 3:-2:-10, 2] + * + * 特殊用法 tag.div[-1:0] 可在任意地方让列表反向 + * + * */ + fun findIndexSet( rule:String ): ElementsSingle { + + val rus = rule.trim{ it <= ' '} + + var len = rus.length + var curInt: Int? //当前数字 + var curMinus = false //当前数字是否为负 + val curList = mutableListOf() //当前数字区间 + var l = "" //暂存数字字符串 + + val head = rus.last() == ']' //是否为常规索引写法 + + if(head){ //常规索引写法[index...] + + len-- //跳过尾部']' + + while (len-- >= 0) { //逆向遍历,可以无前置规则 + + var rl = rus[len] + if (rl == ' ') continue //跳过空格 + + if (rl in '0'..'9') l += rl //将数值累接入临时字串中,遇到分界符才取出 + else if (rl == '-') curMinus = true + else { + + curInt = if (l.isEmpty()) null else if (curMinus) -l.toInt() else l.toInt() //当前数字 + + when (rl) { + + ':' -> curList.add(curInt) //区间右端或区间间隔 + + else -> { + + //为保证查找顺序,区间和单个索引都添加到同一集合 + if(curList.isEmpty()) { + + if(curInt == null) break //是jsoup选择器而非索引列表,跳出 + + indexs.add(curInt) + } + else{ + + //列表最后压入的是区间右端,若列表有两位则最先压入的是间隔 + indexs.add( Triple(curInt, curList.last(), if(curList.size == 2) curList.first() else 1) ) + + curList.clear() //重置临时列表,避免影响到下个区间的处理 + + } + + if(rl == '!'){ + split='!' + do{ rl = rus[--len] } while (len > 0 && rl == ' ')//跳过所有空格 + } + + if(rl == '[') { + beforeRule = rus.substring(0, len) //遇到索引边界,返回结果 + return this + } + + if(rl != ',') break //非索引结构,跳出 + + } + } + + l = "" //清空 + curMinus = false //重置 + } + } + } else while (len-- >= 0) { //阅读原本写法,逆向遍历,可以无前置规则 + + val rl = rus[len] + if (rl == ' ') continue //跳过空格 + + if (rl in '0'..'9') l += rl //将数值累接入临时字串中,遇到分界符才取出 + else if (rl == '-') curMinus = true + else { + + if(rl == '!' || rl == '.' 
|| rl == ':') { //分隔符或起始符 + + indexDefault.add(if (curMinus) -l.toInt() else l.toInt()) // 当前数字追加到列表 + + if (rl != ':'){ //rl == '!' || rl == '.' + split = rl + beforeRule = rus.substring(0, len) + return this + } + + }else break //非索引结构,跳出循环 + + l = "" //清空 + curMinus = false //重置 + } + + } + + split = ' ' + beforeRule = rus + return this //非索引格式 + } - fun getIndexs(len:Int): MutableSet { + private fun getIndexs(len:Int): MutableSet { val indexSet = mutableSetOf() From d408265fe2e2c695201239d8d7d80d7614cc179d Mon Sep 17 00:00:00 2001 From: bushixuanqi <57338301+bushixuanqi@users.noreply.github.com> Date: Tue, 6 Jul 2021 01:07:37 +0800 Subject: [PATCH 3/6] Update RuleAnalyzer.kt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 原来的规则还有点问题,有些情况下会跳过不分隔规则。 原因是拿一个变量记录了两个状态,有时忘记回退切换状态,导致队列位置处理混乱。 我重新梳理了一遍,更改了所有所有相关规则,新增了类成员变量保存另一种状态。 然后测试了我自己写的那几个书源,和其它用大量采用“||”、“&&”、“%%”分隔规则的书源,总算搞定。 --- .../main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt index a2b78b251..e6a86bde0 100644 --- a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt +++ b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt @@ -1,12 +1,11 @@ package io.legado.app.model.analyzeRule //通用的规则切分处理 - class RuleAnalyzer(data: String) { private var queue: String = data //被处理字符串 private var pos = 0 //处理到的位置 - private var rule = arrayOf() + private var rule = arrayOf() //规则列表 private var start = 0 //每次处理字段的开始 private var startX = 0 //规则的开始 From ff1398eae9932d33a82e166ab95dbf927c16d1b9 Mon Sep 17 00:00:00 2001 From: bushixuanqi <57338301+bushixuanqi@users.noreply.github.com> Date: Tue, 6 Jul 2021 02:05:26 +0800 Subject: [PATCH 4/6] Update RuleAnalyzer.kt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
将平衡组分成采用实体字符的规则平衡组(jsoup、xpath),和允许转义字符的代码平衡组(json、JavaScript),分别对应两种情况,经测试完美解决过去遗留的问题。 --- .../app/model/analyzeRule/RuleAnalyzer.kt | 79 ++++++++++--------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt index e6a86bde0..f0017abf1 100644 --- a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt +++ b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt @@ -1,11 +1,12 @@ package io.legado.app.model.analyzeRule //通用的规则切分处理 + class RuleAnalyzer(data: String) { private var queue: String = data //被处理字符串 private var pos = 0 //处理到的位置 - private var rule = arrayOf() //规则列表 + private var rule = arrayOf() private var start = 0 //每次处理字段的开始 private var startX = 0 //规则的开始 @@ -47,14 +48,6 @@ class RuleAnalyzer(data: String) { val isEmpty: Boolean get() = queue.length - pos == 0 //是否处理到最后 - /** - * 检索并返回首字符,但pos不变 - * @return 首字符:若为空则为 0 号字符 - */ - fun peek(): Char { //检索首字符 - return if (isEmpty) 0.toChar() else queue[pos] - } - /** * 消耗剩余字串中一个字符。 * @return 返回剩余字串中的下个字符。 @@ -190,15 +183,7 @@ class RuleAnalyzer(data: String) { if (fn) depth++ else depth-- //嵌套或者闭合 }else pos++ -// { //转义字符 -// var next = queue[pos++] //拉出被转义字符 -// if(commit && next == 'n') commit = false //匹配单行注释终点。当前为\,下个为n,表示换行 -// else if (!commits && !commit && next == '\\') { -// queue[pos++] //当前为\,下个为\,双重转义中"\\"表示转义字符本身,根据if条件"\\"字串不在注释中,则只能在字串或正则中 -// next = queue[pos++] //拉出下个字符,因为在双重转义的字串或正则中,类似于 \\/ 这样的结构才是转义结构 -// if(next == '\\')queue[pos++] //若为转义字符则继续拉出,因为双重转义中转义字符成对存在,即 \\\\ -// } -// } + } while (depth > 0 || bracketsDepth >0) //拉出全部符合js语法的字段 return if(depth > 0 || bracketsDepth > 0) false else { @@ -208,9 +193,9 @@ class RuleAnalyzer(data: String) { } /** - * 在双重转义字串中拉出一个规则平衡组 + * 拉出一个代码平衡组,存在转义文本,没有实体字符,通常以{}作为模块 */ - fun chompRuleBalanced(open: Char = '[', close: Char = ']',f: ((Char) ->Boolean?)? 
= null ): Boolean { + fun chompCodeBalanced(open: Char = '[', close: Char = ']'): Boolean { var pos = pos //声明临时变量记录匹配位置,匹配成功后才同步到类的pos @@ -231,20 +216,14 @@ class RuleAnalyzer(data: String) { if ( c == open )depth++ //开始嵌套一层 else if ( c== close) depth-- //闭合一层嵌套 - else if(depth == 0 && f != null) { //处于默认嵌套中的非默认字符不需要平衡,仅depth为0时默认嵌套全部闭合,此字符才进行嵌套 - val fn = f(c) ?: continue - if (fn) otherDepth++ else otherDepth-- + else if(depth == 0 ) { + //处于默认嵌套中的非默认字符不需要平衡,仅depth为0时默认嵌套全部闭合,此字符才进行嵌套 + if(c == '{')otherDepth++ + else if(c == '}')otherDepth-- } }else pos++ -// { //转义字符 -// var next = queue[pos++] //拉出被转义字符,匹配\/、\"、\'等 -// if (next == ESC) { -// queue[pos++] //当前为\,下个为\,双重转义中"\\"表示转义字符本身,根据语法特征当前字段在字串或正则中 -// next = queue[pos++] //拉出下个字符,因为在双重转义的字串或正则中,类似于 \\/ 这样的结构才是转义结构 -// if(next == ESC)queue[pos++] //若为转义字符则继续拉出,因为双重转义中转义字符成对存在,即 \\\\ -// } -// } + } while (depth > 0 || otherDepth > 0) //拉出一个平衡字串 return if(depth > 0 || otherDepth > 0) false else { @@ -253,6 +232,36 @@ class RuleAnalyzer(data: String) { } } + /** + * 拉出一个规则平衡组,没有转义文本,有实体字符,通常以[]作为选择器 + */ + fun chompRuleBalanced(open: Char = '[', close: Char = ']'): Boolean { + + var pos = pos //声明临时变量记录匹配位置,匹配成功后才同步到类的pos + var depth = 0 //嵌套深度 + var inSingleQuote = false //单引号 + var inDoubleQuote = false //双引号 + + do { + if (pos == queue.length) break + val c = queue[pos++] + + if (c == '\'' && !inDoubleQuote) inSingleQuote = !inSingleQuote //匹配具有语法功能的单引号 + else if (c == '"' && !inSingleQuote) inDoubleQuote = !inDoubleQuote //匹配具有语法功能的双引号 + + if (inSingleQuote || inDoubleQuote) continue //语法单元未匹配结束,直接进入下个循环 + + if ( c == open )depth++ //开始嵌套一层 + else if ( c== close) depth-- //闭合一层嵌套 + + } while (depth > 0 ) //拉出一个平衡字串 + + return if(depth > 0) false else { + this.pos = pos //同步位置 + true + } + } + /** * 不用正则,不到最后不切片也不用中间变量存储,只在序列中标记当前查找字段的开头结尾,到返回时才切片,高效快速准确切割规则 * 解决jsonPath自带的"&&"和"||"与阅读的规则冲突,以及规则正则或字符串中包含"&&"、"||"、"%%"、"@"导致的冲突 @@ -417,13 +426,7 @@ class RuleAnalyzer(data: String) { while (!isEmpty && 
consumeTo(inner)) { //拉取成功返回true,ruleAnalyzes里的字符序列索引变量pos后移相应位置,否则返回false,且isEmpty为true val posPre = pos //记录上次结束位置 - if (chompRuleBalanced {//拉出一个以[]为默认嵌套、以{}为补充嵌套的平衡字段 - when (it) { - '{' -> true - '}' -> false - else -> null - } - }) { + if (chompCodeBalanced()) { val frv= fr(currBalancedString(startStep,endStep)) if(frv != null) { From 93194909b92677fcae998662ccb3a54cdb4949ac Mon Sep 17 00:00:00 2001 From: bushixuanqi <57338301+bushixuanqi@users.noreply.github.com> Date: Tue, 6 Jul 2021 02:13:10 +0800 Subject: [PATCH 5/6] Update RuleAnalyzer.kt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 调整下参数指向 --- .../io/legado/app/model/analyzeRule/RuleAnalyzer.kt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt index f0017abf1..39c52cb79 100644 --- a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt +++ b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt @@ -195,7 +195,7 @@ class RuleAnalyzer(data: String) { /** * 拉出一个代码平衡组,存在转义文本,没有实体字符,通常以{}作为模块 */ - fun chompCodeBalanced(open: Char = '[', close: Char = ']'): Boolean { + fun chompCodeBalanced(open: Char = '{', close: Char = '}'): Boolean { var pos = pos //声明临时变量记录匹配位置,匹配成功后才同步到类的pos @@ -214,12 +214,12 @@ class RuleAnalyzer(data: String) { if (inSingleQuote || inDoubleQuote) continue //语法单元未匹配结束,直接进入下个循环 - if ( c == open )depth++ //开始嵌套一层 - else if ( c== close) depth-- //闭合一层嵌套 + if ( c == '[' )depth++ //开始嵌套一层 + else if ( c== ']') depth-- //闭合一层嵌套 else if(depth == 0 ) { //处于默认嵌套中的非默认字符不需要平衡,仅depth为0时默认嵌套全部闭合,此字符才进行嵌套 - if(c == '{')otherDepth++ - else if(c == '}')otherDepth-- + if(c == open)otherDepth++ + else if(c == close)otherDepth-- } }else pos++ From 5ba01ddab9fca30d5df1b21e5c5c615fc4447e3c Mon Sep 17 00:00:00 2001 From: bushixuanqi <57338301+bushixuanqi@users.noreply.github.com> 
Date: Tue, 6 Jul 2021 07:02:37 +0800 Subject: [PATCH 6/6] Update RuleAnalyzer.kt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 解决采用新平衡组后,{$.rule}内嵌规则起始位置未记录,导致未将$.当作规则一部分的问题 --- .../main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt index 39c52cb79..5167b526f 100644 --- a/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt +++ b/app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt @@ -16,7 +16,7 @@ class RuleAnalyzer(data: String) { //当前平衡字段 fun currBalancedString( stepStart:Int = 1 , stepEnd:Int = 1): String { //stepStart平衡字符的起始分隔字串长度,stepEnd平衡字符的结束分隔字串长度 - return queue.substring(start+stepStart,pos-stepEnd) //当前平衡字段 + return queue.substring(startX+stepStart,pos-stepEnd) //当前平衡字段 } @@ -227,6 +227,7 @@ class RuleAnalyzer(data: String) { } while (depth > 0 || otherDepth > 0) //拉出一个平衡字串 return if(depth > 0 || otherDepth > 0) false else { + startX = this.pos //内嵌规则起始 this.pos = pos //同步位置 true } @@ -429,7 +430,6 @@ class RuleAnalyzer(data: String) { if (chompCodeBalanced()) { val frv= fr(currBalancedString(startStep,endStep)) if(frv != null) { - st.append(queue.substring(start,posPre)+frv) //压入内嵌规则前的内容,及内嵌规则解析得到的字符串 continue //获取内容成功,继续选择下个内嵌规则