优化链接分割规则,避免 ,{参数} 的字符串中也存在 ,{ 时导致规则切错

修复<str0,str2,...{{js}}>这种页数列表写法中,js部分内含 < 或 > 就会切割错误的问题

简化图片格式化操作
pull/1114/head
bushixuanqi 3 years ago
parent 4a4208c031
commit c06df68404
  1. 5
      app/src/main/java/io/legado/app/constant/AppPattern.kt
  2. 13
      app/src/main/java/io/legado/app/data/entities/BookChapter.kt
  3. 12
      app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeRule.kt
  4. 86
      app/src/main/java/io/legado/app/model/analyzeRule/AnalyzeUrl.kt
  5. 115
      app/src/main/java/io/legado/app/model/analyzeRule/RuleAnalyzer.kt
  6. 3
      app/src/main/java/io/legado/app/ui/book/read/page/provider/ChapterProvider.kt
  7. 34
      app/src/main/java/io/legado/app/utils/HtmlFormatter.kt

@ -7,9 +7,10 @@ object AppPattern {
val JS_PATTERN: Pattern =
Pattern.compile("(<js>[\\w\\W]*?</js>|@js:[\\w\\W]*$)", Pattern.CASE_INSENSITIVE)
val EXP_PATTERN: Pattern = Pattern.compile("\\{\\{([\\w\\W]*?)\\}\\}")
//图片有data-开头的数据属性时优先用数据属性作为src,没有数据属性时才匹配src
//非格式化时,不需要原来那么复杂的正则表达式
val imgPattern: Pattern =
Pattern.compile("<img(?:(?![^>]*data-)[^>]*src|[^>]*data-)[^=]*= *\"([^\"{]+(?:\\{(?:[^{}]|\\{[^{}]*\\})*\\})?)\"[^>]*>", Pattern.CASE_INSENSITIVE)
Pattern.compile("<img[^>]*src *= *\"([^\"{]+(?:\\{(?:[^{}]|\\{[^{}]*\\})*\\})?)\"[^>]*>", Pattern.CASE_INSENSITIVE)
val nameRegex = Regex("\\s+作\\s*者.*|\\s+\\S+\\s+著")
val authorRegex = Regex("^.*?作\\s*者[::\\s]*|\\s+著")

@ -8,7 +8,6 @@ import androidx.room.Index
import io.legado.app.model.analyzeRule.AnalyzeUrl
import io.legado.app.utils.GSON
import io.legado.app.utils.MD5Utils
import io.legado.app.utils.NetworkUtils
import io.legado.app.utils.fromJsonObject
import kotlinx.parcelize.IgnoredOnParcel
import kotlinx.parcelize.Parcelize
@ -63,14 +62,10 @@ data class BookChapter(
return false
}
/**
 * Builds the chapter URL with its link part resolved against [baseUrl].
 *
 * The stored [url] may carry a trailing option block introduced by a comma that is
 * followed by `{` (matched by [AnalyzeUrl.splitUrlRegex]). Only the link part in front
 * of that separator is passed to [NetworkUtils.getAbsoluteURL]; the option block is
 * re-attached unchanged after a single comma.
 *
 * @return the resolved link, followed by `,` and the option block when one is present
 */
fun getAbsoluteURL(): String {
    // limit = 2: cut only at the first separator, so an option block that itself
    // contains ",{" is kept whole instead of being truncated at its inner match.
    val parts = url.split(AnalyzeUrl.splitUrlRegex, 2)
    val absoluteUrl = NetworkUtils.getAbsoluteURL(baseUrl, parts[0])
    return if (parts.size > 1) "$absoluteUrl,${parts[1]}" else absoluteUrl
}
/**
 * Returns [url] with its optional trailing option block joined by exactly one comma.
 *
 * The option block is whatever follows the first match of [AnalyzeUrl.splitUrlRegex]
 * (a comma that is followed by `{`). Whitespace captured by the separator match is
 * dropped. When no such separator exists, [url] is returned unchanged.
 *
 * NOTE(review): the link part is not resolved against baseUrl here - confirm that
 * callers do not rely on an absolute URL.
 *
 * @return the normalized `link,{options}` string, or [url] itself when it has no option block
 */
fun getAbsoluteURL(): String {
    // Cheap pre-check: without any comma there cannot be an option block.
    if (url.indexOf(',') == -1) return url
    // split(regex, 1) never splits (limit is the maximum number of substrings), so the
    // separator is located explicitly; a comma that is not followed by '{' is not a separator.
    val separator = AnalyzeUrl.splitUrlRegex.find(url) ?: return url
    val link = url.substring(0, separator.range.first)
    // The match consumes the comma and surrounding whitespace; the lookahead leaves '{' in place.
    val option = url.substring(separator.range.last + 1)
    return "$link,$option"
}
fun getFileName(): String = String.format("%05d-%s.nb", index, MD5Utils.md5Encode16(title))

@ -25,7 +25,9 @@ import kotlin.collections.HashMap
@Keep
@Suppress("unused", "RegExpRedundantEscape")
class AnalyzeRule(val ruleData: RuleDataInterface) : JsExtensions {
var book: BaseBook? = null
var book: BaseBook? = if (ruleData is BaseBook) ruleData else null
var chapter: BookChapter? = null
var nextChapterUrl: String? = null
var content: Any? = null
@ -42,15 +44,9 @@ class AnalyzeRule(val ruleData: RuleDataInterface) : JsExtensions {
private var objectChangedJS = false
private var objectChangedJP = false
init {
if (ruleData is BaseBook) {
book = ruleData
}
}
@JvmOverloads
fun setContent(content: Any?, baseUrl: String? = null): AnalyzeRule {
if (content == null) throw AssertionError("Content cannot be null")
if (content == null) throw AssertionError("内容不可空(Content cannot be null)")
this.content = content
setBaseUrl(baseUrl)
isJSON = content.toString().isJson()

@ -1,13 +1,11 @@
package io.legado.app.model.analyzeRule
import android.annotation.SuppressLint
import android.text.TextUtils
import androidx.annotation.Keep
import com.bumptech.glide.load.model.GlideUrl
import com.bumptech.glide.load.model.LazyHeaders
import io.legado.app.constant.AppConst.SCRIPT_ENGINE
import io.legado.app.constant.AppConst.UA_NAME
import io.legado.app.constant.AppPattern.EXP_PATTERN
import io.legado.app.constant.AppPattern.JS_PATTERN
import io.legado.app.data.entities.BaseBook
import io.legado.app.data.entities.BookChapter
@ -41,7 +39,7 @@ class AnalyzeUrl(
headerMapF: Map<String, String>? = null
) : JsExtensions {
companion object {
val splitUrlRegex = Regex(",\\s*(?=\\{)")
val splitUrlRegex = Regex("\\s*,\\s*(?=\\{)")
private val pagePattern = Pattern.compile("<(.*?)>")
}
@ -82,7 +80,7 @@ class AnalyzeUrl(
if (jsMatcher.start() > start) {
tmp =
ruleUrl.substring(start, jsMatcher.start()).replace("\n", "").trim { it <= ' ' }
if (!TextUtils.isEmpty(tmp)) {
if (tmp.isNotEmpty()) {
ruleList.add(tmp)
}
}
@ -91,7 +89,7 @@ class AnalyzeUrl(
}
if (ruleUrl.length > start) {
tmp = ruleUrl.substring(start).replace("\n", "").trim { it <= ' ' }
if (!TextUtils.isEmpty(tmp)) {
if (tmp.isNotEmpty()) {
ruleList.add(tmp)
}
}
@ -114,23 +112,12 @@ class AnalyzeUrl(
/**
* 替换关键字,页数,JS
*/
private fun replaceKeyPageJs() {
//page
page?.let {
val matcher = pagePattern.matcher(ruleUrl)
while (matcher.find()) {
val pages = matcher.group(1)!!.split(",")
ruleUrl = if (page <= pages.size) {
ruleUrl.replace(matcher.group(), pages[page - 1].trim { it <= ' ' })
} else {
ruleUrl.replace(matcher.group(), pages.last().trim { it <= ' ' })
}
}
}
private fun replaceKeyPageJs() { //先替换内嵌规则再替换页数规则,避免内嵌规则中存在大于小于号时,规则被切错
//js
if (ruleUrl.contains("{{") && ruleUrl.contains("}}")) {
var jsEval: Any
val sb = StringBuffer()
val analyze = RuleAnalyzer(ruleUrl) //创建解析
val bindings = SimpleBindings()
bindings["java"] = this
bindings["cookie"] = CookieStore
@ -141,21 +128,28 @@ class AnalyzeUrl(
bindings["speakText"] = speakText
bindings["speakSpeed"] = speakSpeed
bindings["book"] = book
val expMatcher = EXP_PATTERN.matcher(ruleUrl)
while (expMatcher.find()) {
jsEval = expMatcher.group(1)?.let {
SCRIPT_ENGINE.eval(it, bindings)
} ?: ""
if (jsEval is String) {
expMatcher.appendReplacement(sb, jsEval)
} else if (jsEval is Double && jsEval % 1.0 == 0.0) {
expMatcher.appendReplacement(sb, String.format("%.0f", jsEval))
//替换所有内嵌{{js}}
val url = analyze.innerRule("{{",2,2){
when(val jsEval = SCRIPT_ENGINE.eval(it, bindings)){
is String -> jsEval
jsEval is Double && jsEval % 1.0 == 0.0 -> String.format("%.0f", jsEval)
else -> jsEval.toString()
}
}
if(url.isNotEmpty())ruleUrl = url
}
//page
page?.let {
val matcher = pagePattern.matcher(ruleUrl)
while (matcher.find()) {
val pages = matcher.group(1)!!.split(",")
ruleUrl = if (page < pages.size) { //pages[pages.size - 1]等同于pages.last()
ruleUrl.replace(matcher.group(), pages[page - 1].trim { it <= ' ' })
} else {
expMatcher.appendReplacement(sb, jsEval.toString())
ruleUrl.replace(matcher.group(), pages.last().trim { it <= ' ' })
}
}
expMatcher.appendTail(sb)
ruleUrl = sb.toString()
}
}
@ -163,15 +157,20 @@ class AnalyzeUrl(
* 处理URL
*/
private fun initUrl() {
var urlArray = ruleUrl.split(splitUrlRegex, 2)
url = NetworkUtils.getAbsoluteURL(baseUrl, urlArray[0])
urlHasQuery = urlArray[0]
val hasQuery = ruleUrl.indexOf(',') != -1
urlHasQuery = if(hasQuery) ruleUrl.split(splitUrlRegex, 1)[0] else ruleUrl
url = NetworkUtils.getAbsoluteURL(baseUrl,urlHasQuery )
NetworkUtils.getBaseUrl(url)?.let {
baseUrl = it
}
if (urlArray.size > 1) {
val option = GSON.fromJsonObject<UrlOption>(urlArray[1])
option?.let { _ ->
if(hasQuery) {
GSON.fromJsonObject<UrlOption>(ruleUrl.substring(urlHasQuery.length))?.let { option ->
option.method?.let {
if (it.equals("POST", true)) method = RequestMethod.POST
}
@ -201,16 +200,17 @@ class AnalyzeUrl(
retry = option.retry
}
}
headerMap[UA_NAME] ?: let {
headerMap[UA_NAME] = AppConfig.userAgent
}
when (method) {
RequestMethod.GET -> {
if (!useWebView) {
urlArray = url.split("?")
url = urlArray[0]
if (urlArray.size > 1) {
analyzeFields(urlArray[1])
val pos = url.indexOf('?')
if(pos != -1) {
analyzeFields(url.substring(pos + 1))
url = url.substring(0,pos)
}
}
}
@ -233,7 +233,7 @@ class AnalyzeUrl(
for (query in queryS) {
val queryM = query.splitNotBlank("=")
val value = if (queryM.size > 1) queryM[1] else ""
if (TextUtils.isEmpty(charset)) {
if (charset.isNullOrEmpty()) {
if (NetworkUtils.hasUrlEncoded(value)) {
fieldMap[queryM[0]] = value
} else {

@ -12,6 +12,7 @@ class RuleAnalyzer(data: String, code: Boolean = false) {
private var rule = arrayOf<String>() //分割出的规则列表
private var step: Int = 0 //分割字符的长度
var elementsType = "" //当前分割字符串
var innerType = true //是否为内嵌{{}}
//设置平衡组函数,json或JavaScript时设置成chompCodeBalanced,否则为chompRuleBalanced
val chompBalanced = if (code) ::chompCodeBalanced else ::chompRuleBalanced
@ -31,9 +32,14 @@ class RuleAnalyzer(data: String, code: Boolean = false) {
startX = 0
}
/** Returns the substring of `queue` that starts at the current `pos`, i.e. the part not yet consumed. */
fun Remained(): String = queue.substring(pos)
/**
* 从剩余字串中拉出一个字符串直到但不包括匹配序列或剩余字串用完
* @param seq 分隔字符 **区分大小写**
* 从剩余字串中拉出一个字符串直到但不包括匹配序列
* @param seq 查找的字符串 **区分大小写**
* @return 是否找到相应字段
*/
fun consumeTo(seq: String): Boolean {
@ -45,6 +51,20 @@ class RuleAnalyzer(data: String, code: Boolean = false) {
} else false
}
/**
 * Pulls text from the remaining input up to, but not including, the next occurrence of [seq].
 *
 * The rule start marker is first moved to the current position. On a hit the cursor is
 * advanced to the beginning of the match; on a miss the cursor stays where it was.
 *
 * @param seq the text to look for (**case-sensitive**)
 * @return the text in front of the match, or an empty string when [seq] is not found
 */
fun consumeToString(seq: String): String {
    start = pos // the position processed so far becomes the rule start
    val hit = queue.indexOf(seq, pos)
    if (hit == -1) return ""
    pos = hit
    return queue.substring(start, hit)
}
/**
* 从剩余字串中拉出一个字符串直到但不包括匹配序列匹配参数列表中一项即为匹配或剩余字串用完
* @param seq 匹配字符串序列
@ -91,50 +111,61 @@ class RuleAnalyzer(data: String, code: Boolean = false) {
}
//其中js只要符合语法,就不用避开任何阅读关键字,自由发挥
fun chompJsBalanced(
f: ((Char) -> Boolean?) = {
when (it) {
'{' -> true //开始嵌套一层
'}' -> false //闭合一层嵌套
else -> null
}
}
): Boolean {
var pos = pos //声明变量记录临时处理位置
var depth = 0 //嵌套深度
fun chompJsBalanced(innerType:Boolean = true,startPos:Int = pos): String {
var pos = startPos //声明变量记录临时处理位置
var bracketsDepth = 0 //[]嵌套深度
var inSingleQuote = false //单引号
var inDoubleQuote = false //双引号
var inOtherQuote = false //js原始字串分隔字符
var regex = false //正则
var commit = false //单行注释
var commits = false //多行注释
var inRegex = false //正则
var inCommit = false //单行注释
var inCommits = false //多行注释
val start:String
val end:String
val endChar:Char
if(innerType){
start = "{{"
end = "}}"
endChar = '}'
}else{
start = "<js>"
end = "</js>"
endChar = '<'
}
pos += start.length //跳过起始字符串
do {
if (pos == queue.length) break
var c = queue[pos++]
if (c != '\\') { //非转义字符
if (c == '\'' && !commits && !commit && !regex && !inDoubleQuote && !inOtherQuote) inSingleQuote =
if (c == '\'' && !inCommits && !inCommit && !inRegex && !inDoubleQuote && !inOtherQuote) inSingleQuote =
!inSingleQuote //匹配具有语法功能的单引号
else if (c == '"' && !commits && !commit && !regex && !inSingleQuote && !inOtherQuote) inDoubleQuote =
else if (c == '"' && !inCommits && !inCommit && !inRegex && !inSingleQuote && !inOtherQuote) inDoubleQuote =
!inDoubleQuote //匹配具有语法功能的双引号
else if (c == '`' && !commits && !commit && !regex && !inSingleQuote && !inDoubleQuote) inOtherQuote =
else if (c == '`' && !inCommits && !inCommit && !inRegex && !inSingleQuote && !inDoubleQuote) inOtherQuote =
!inOtherQuote //匹配具有语法功能的'`'
else if (c == '/' && !commits && !commit && !regex && !inSingleQuote && !inDoubleQuote && !inOtherQuote) { //匹配注释或正则起点
else if (c == '/' && !inCommits && !inCommit && !inRegex && !inSingleQuote && !inDoubleQuote && !inOtherQuote) { //匹配注释或正则起点
c = queue[pos++]
when (c) {
'/' -> commit = true //匹配单行注释起点
'*' -> commits = true //匹配多行注释起点
else -> regex = true //匹配正则起点
'/' -> inCommit = true //匹配单行注释起点
'*' -> inCommits = true //匹配多行注释起点
else -> inRegex = true //匹配正则起点
}
} else if (commits && c == '*') { //匹配多行注释终点
c = queue[pos++]
if (c == '/') commits = false
} else if (regex && c == '/') { //正则的终点或[]平衡
} else if (inCommits) { //匹配多行注释终点
pos = queue.indexOf("*/", pos) //跳过多行注释
if(pos == -1)break //没有终点,语法出错,跳出
continue
} else if (inRegex) { //正则的终点或[]平衡
when (c) {
'/' -> regex = false//匹配正则终点
'/' -> inRegex = false//匹配正则终点
//为了保证当open为( 且 close 为 )时,正则中[(]或[)]的合法性。故对[]这对在任何规则中都平衡的成对符号做匹配。
// 注:正则里[(]、[)]、[{]、[}]都是合法的,所以只有[]必须平衡。
@ -142,21 +173,20 @@ class RuleAnalyzer(data: String, code: Boolean = false) {
']' -> bracketsDepth-- //闭合一层嵌套[]
}
} else if (c == '\n') commit = false
} else if (c == '\n') inCommit = false //单行注释终点
if (commits || commit || regex || inSingleQuote || inDoubleQuote || inOtherQuote) continue //语法单元未匹配结束,直接进入下个循环
val fn = f(c) ?: continue
if (fn) depth++ else depth-- //嵌套或者闭合
if (inCommits || inCommit || inRegex || inSingleQuote || inDoubleQuote || inOtherQuote) continue //语法单元未匹配结束,直接进入下个循环
if( c == endChar && queue.regionMatches(pos, end, 0, end.length)) {
this.pos = pos
return queue.substring(startPos + start.length, pos - end.length) //匹配到终点,返回结果
}
} else pos++
} while (depth > 0 || bracketsDepth > 0) //拉出全部符合js语法的字段
} while (bracketsDepth > 0) //拉出全部符合js语法的字段
return ""
return if (depth > 0 || bracketsDepth > 0) false else {
this.pos = pos //同步位置
true
}
}
/**
@ -392,7 +422,7 @@ class RuleAnalyzer(data: String, code: Boolean = false) {
val posPre = pos //记录consumeTo匹配位置
if (chompCodeBalanced('{', '}')) {
val frv = fr(queue.substring(posPre + startStep, pos - endStep))
if (frv != null) {
if (!frv.isNullOrEmpty()) {
st.append(queue.substring(startX, posPre) + frv) //压入内嵌规则前的内容,及内嵌规则解析得到的字符串
startX = pos //记录下次规则起点
continue //获取内容成功,继续选择下个内嵌规则
@ -412,11 +442,16 @@ class RuleAnalyzer(data: String, code: Boolean = false) {
*/
private const val ESC = '\\'
/**
* "<js>"
*/
private const val JSSTART = "<js>"
/**
* 阅读共有分隔字串起始部分
* "##","@@","{{","{[","<js>", "@js:"
*/
val splitList = arrayOf("##", "@@", "{{", "{[", "<js>", "@js:")
val splitList2 = arrayOf("##", "@@", "{{", "{[")
/**
* 发现名称-链接分隔字串

@ -12,7 +12,6 @@ import io.legado.app.data.entities.Book
import io.legado.app.data.entities.BookChapter
import io.legado.app.help.AppConfig
import io.legado.app.help.ReadBookConfig
import io.legado.app.model.analyzeRule.AnalyzeUrl
import io.legado.app.ui.book.read.page.entities.TextChapter
import io.legado.app.ui.book.read.page.entities.TextChar
import io.legado.app.ui.book.read.page.entities.TextLine
@ -150,7 +149,7 @@ object ChapterProvider {
return TextChapter(
bookChapter.index, bookChapter.title,
bookChapter.getAbsoluteURL().split(AnalyzeUrl.splitUrlRegex)[0],
bookChapter.getAbsoluteURL().split(',',limit = 1)[0], //bookChapter.getAbsoluteURL已经处理过,直接按','就行
textPages, chapterSize
)
}

@ -1,8 +1,9 @@
package io.legado.app.utils
import io.legado.app.constant.AppPattern
import io.legado.app.constant.AppPattern.imgPattern
import io.legado.app.model.analyzeRule.AnalyzeUrl
import java.net.URL
import java.util.regex.Pattern
object HtmlFormatter {
private val wrapHtmlRegex = "</?(?:div|p|br|hr|h\\d|article|dd|dl)[^>]*>".toRegex()
@ -14,7 +15,7 @@ object HtmlFormatter {
return html.replace(wrapHtmlRegex, "\n")
.replace(otherRegex, "")
.replace("\\s*\\n+\\s*".toRegex(), "\n  ")
.replace("^[\\n\\s]+".toRegex(), "  ")
.replace("^[\\n\\s]*".toRegex(), "  ")
.replace("[\\n\\s]+$".toRegex(), "")
}
@ -24,17 +25,32 @@ object HtmlFormatter {
html ?: return ""
val keepImgHtml = formatKeepImg(html)
val sb = StringBuffer()
val matcher = AppPattern.imgPattern.matcher(keepImgHtml)
//图片有data-开头的数据属性时优先用数据属性作为src,没有数据属性时才匹配src
val hasData = keepImgHtml.matches("<img[^>]*data-".toRegex())
val imgPatternX = if(hasData) Pattern.compile("<img[^>]*data-[^=]*= *\"([^\"])\"[^>]*>", Pattern.CASE_INSENSITIVE) else imgPattern
val matcher = imgPatternX.matcher(keepImgHtml)
var appendPos = 0
while (matcher.find()) {
val urlArray = matcher.group(1)!!.split(AnalyzeUrl.splitUrlRegex)
var url = NetworkUtils.getAbsoluteURL(redirectUrl, urlArray[0])
if (urlArray.size > 1) {
url = "$url,${urlArray[1]}"
}
var url = matcher.group(1)!!
val param:String
url = NetworkUtils.getAbsoluteURL(redirectUrl, if(url.indexOf(',') != -1) {
val absoluteUrl = url.split(AnalyzeUrl.splitUrlRegex, 1)[0]
param = url.substring(absoluteUrl.length)
absoluteUrl
} else {
param = ""
url
})
sb.append(keepImgHtml.substring(appendPos, matcher.start()))
sb.append("<img src=\"$url\" >")
sb.append("<img src=\"${url+param}\" >")
appendPos = matcher.end()
}
if (appendPos < keepImgHtml.length) {
sb.append(keepImgHtml.substring(appendPos, keepImgHtml.length))

Loading…
Cancel
Save