优化epub解析

pull/883/head
gedoor 4 years ago
parent 28eb666af5
commit 62fa68cfb3
  1. 1
      app/src/main/assets/updateLog.md
  2. 25
      app/src/main/java/io/legado/app/data/AppDatabase.kt
  3. 32
      app/src/main/java/io/legado/app/data/dao/EpubChapterDao.kt
  4. 25
      app/src/main/java/io/legado/app/data/entities/Book.kt
  5. 2
      app/src/main/java/io/legado/app/data/entities/BookChapter.kt
  6. 23
      app/src/main/java/io/legado/app/data/entities/EpubChapter.kt
  7. 4
      app/src/main/java/io/legado/app/help/DefaultData.kt
  8. 2
      app/src/main/java/io/legado/app/help/storage/Backup.kt
  9. 2
      app/src/main/java/io/legado/app/help/storage/Restore.kt
  10. 4
      app/src/main/java/io/legado/app/model/localBook/AnalyzeTxtFile.kt
  11. 178
      app/src/main/java/io/legado/app/model/localBook/EPUBFile.kt
  12. 8
      app/src/main/java/io/legado/app/ui/book/read/config/TocRegexDialog.kt
  13. 6
      app/src/main/java/io/legado/app/ui/book/read/config/TocRegexViewModel.kt

@ -7,6 +7,7 @@
* 阅读页面停留10分钟之后自动备份进度
* 添加了针对中文的断行排版处理-by hoodie13, 需要在阅读界面设置里手动开启
* 添加朗读快捷方式
* 优化Epub解析 by hoodie13
**2021/02/26**
* 添加反转内容功能

@ -22,8 +22,8 @@ val appDb by lazy {
ReplaceRule::class, SearchBook::class, SearchKeyword::class, Cookie::class,
RssSource::class, Bookmark::class, RssArticle::class, RssReadRecord::class,
RssStar::class, TxtTocRule::class, ReadRecord::class, HttpTTS::class, Cache::class,
RuleSub::class],
version = 29,
RuleSub::class, EpubChapter::class],
version = 30,
exportSchema = true
)
abstract class AppDatabase : RoomDatabase() {
@ -40,11 +40,12 @@ abstract class AppDatabase : RoomDatabase() {
abstract val rssArticleDao: RssArticleDao
abstract val rssStarDao: RssStarDao
abstract val cookieDao: CookieDao
abstract val txtTocRule: TxtTocRuleDao
abstract val txtTocRuleDao: TxtTocRuleDao
abstract val readRecordDao: ReadRecordDao
abstract val httpTTSDao: HttpTTSDao
abstract val cacheDao: CacheDao
abstract val ruleSubDao: RuleSubDao
abstract val epubChapterDao: EpubChapterDao
companion object {
@ -58,7 +59,7 @@ abstract class AppDatabase : RoomDatabase() {
migration_14_15, migration_15_17, migration_17_18, migration_18_19,
migration_19_20, migration_20_21, migration_21_22, migration_22_23,
migration_23_24, migration_24_25, migration_25_26, migration_26_27,
migration_27_28, migration_28_29
migration_27_28, migration_28_29, migration_29_30
)
.allowMainThreadQueries()
.addCallback(dbCallback)
@ -261,6 +262,22 @@ abstract class AppDatabase : RoomDatabase() {
database.execSQL("ALTER TABLE rssSources ADD sourceComment TEXT")
}
}
/**
 * Schema migration from version 29 to version 30.
 *
 * Adds the `startFragmentId` and `endFragmentId` TEXT columns to `chapters`, then creates the
 * `epubChapters` table and its two indexes. Statements run in the order listed below.
 */
private val migration_29_30 = object : Migration(29, 30) {
    override fun migrate(database: SupportSQLiteDatabase) {
        // The raw string is kept flush-left on purpose so the SQL text stays byte-for-byte the same.
        val createEpubChapters =
            """
CREATE TABLE IF NOT EXISTS `epubChapters`
(`bookUrl` TEXT NOT NULL, `href` TEXT NOT NULL, `parentHref` TEXT,
PRIMARY KEY(`bookUrl`, `href`), FOREIGN KEY(`bookUrl`) REFERENCES `books`(`bookUrl`) ON UPDATE NO ACTION ON DELETE CASCADE )
"""
        val statements = listOf(
            "ALTER TABLE chapters ADD `startFragmentId` TEXT",
            "ALTER TABLE chapters ADD `endFragmentId` TEXT",
            createEpubChapters,
            "CREATE INDEX IF NOT EXISTS `index_epubChapters_bookUrl` ON `epubChapters` (`bookUrl`)",
            "CREATE UNIQUE INDEX IF NOT EXISTS `index_epubChapters_bookUrl_href` ON `epubChapters` (`bookUrl`, `href`)"
        )
        for (sql in statements) {
            database.execSQL(sql)
        }
    }
}
}
}

@ -0,0 +1,32 @@
package io.legado.app.data.dao
import androidx.room.*
import io.legado.app.data.entities.EpubChapter
/**
 * Room DAO for the `epubChapters` table.
 *
 * Every function here is a plain (non-suspending) call.
 */
@Dao
interface EpubChapterDao {
/** Every row currently stored in `epubChapters`. */
@get:Query("select * from epubChapters")
val all: List<EpubChapter>
/** Number of rows whose `bookUrl` equals [bookUrl]. */
@Query("select count(*) from epubChapters Where bookUrl = :bookUrl")
fun getCnt(bookUrl: String): Int
/** Rows of the book [bookUrl] whose `parentHref` equals [parentHref]. */
@Query("select * from epubChapters Where bookUrl = :bookUrl and parentHref = :parentHref ")
fun get(bookUrl: String, parentHref: String): List<EpubChapter>?
/** Inserts the given rows; a row that conflicts with an existing one replaces it. */
@Insert(onConflict = OnConflictStrategy.REPLACE)
fun insert(vararg chapter: EpubChapter)
/** Deletes every row of the table. */
@Query("delete from epubChapters")
fun clear()
/** Deletes all rows whose `bookUrl` equals [bookUrl] (despite the name, the key is the book URL). */
@Query("delete from epubChapters Where bookUrl = :bookUrl")
fun deleteByName(bookUrl: String)
/** Deletes the given rows. */
@Delete
fun delete(vararg chapter: EpubChapter)
/** Updates the given rows. */
@Update
fun update(vararg chapter: EpubChapter)
}

@ -146,6 +146,23 @@ data class Book(
config().pageAnim = pageAnim
}
/** Returns the `delParagraph` value stored in this book's read config. */
fun getDelParagraph(): Int = config().delParagraph
/** Stores [num] as the `delParagraph` value of this book's read config. */
fun setDelParagraph(num: Int) {
    config().also { readConfig -> readConfig.delParagraph = num }
}
/**
 * Toggles the bits of [tag] in the read config's `delTag` mask: when every bit of [tag] is
 * already set they are cleared, otherwise they are all set.
 */
fun setDelTag(tag: Long) {
    val readConfig = config()
    val current = readConfig.delTag
    val alreadySet = (current and tag) == tag
    readConfig.delTag = when {
        alreadySet -> current and tag.inv()
        else -> current or tag
    }
}
/** Returns true when every bit of [tag] is set in the read config's `delTag` mask. */
fun getDelTag(tag: Long): Boolean {
    val masked = config().delTag and tag
    return masked == tag
}
fun getFolderName(): String {
//防止书名过长,只取9位
var folderName = name.replace(AppPattern.fileNameRegex, "")
@ -207,11 +224,19 @@ data class Book(
}
}
// Bit-flag values for ReadConfig.delTag; getDelTag/setDelTag test and toggle them with bitwise and/or.
companion object {
// Flag checked by the EPUB reader before it removes h1..h6 elements from chapter content.
const val hTag = 2L
// Flag checked by the EPUB reader before it strips <ruby> markup from chapter content.
const val rubyTag = 4L
// Flag checked by the EPUB reader before it removes <img> elements from chapter content.
const val imgTag = 8L
}
/**
 * Per-book reading configuration (made Parcelable through @Parcelize).
 */
@Parcelize
data class ReadConfig(
// NOTE(review): -1 presumably means "no per-book page animation override" — confirm against callers.
var pageAnim: Int = -1,
// NOTE(review): presumably toggles re-segmentation of the text — confirm against callers.
var reSegment: Boolean = false,
var useReplaceRule: Boolean = AppConfig.replaceEnableDefault, // apply the purification/replace rules to the body text
var delParagraph: Int = 0,// strip the beginning of paragraphs
var delTag: Long = 0L// tags to strip, stored as a bit mask
) : Parcelable
class Converters {

@ -37,6 +37,8 @@ data class BookChapter(
var tag: String? = null, //
var start: Long? = null, // 章节起始位置
var end: Long? = null, // 章节终止位置
var startFragmentId: String? = null, //EPUB书籍当前章节的fragmentId
var endFragmentId: String? = null, //EPUB书籍下一章节的fragmentId
var variable: String? = null //变量
) : Parcelable {

@ -0,0 +1,23 @@
package io.legado.app.data.entities
import androidx.room.Entity
import androidx.room.ForeignKey
import androidx.room.Index
/**
 * Row of the `epubChapters` table.
 *
 * Links the resource [href] of the book [bookUrl] to the chapter resource [parentHref] it
 * belongs to. Rows are removed together with their book (cascading foreign key on `bookUrl`).
 */
@Entity(
    tableName = "epubChapters",
    primaryKeys = ["bookUrl", "href"],
    indices = [
        Index(value = ["bookUrl"]),
        Index(value = ["bookUrl", "href"], unique = true)
    ],
    foreignKeys = [
        ForeignKey(
            entity = Book::class,
            parentColumns = ["bookUrl"],
            childColumns = ["bookUrl"],
            onDelete = ForeignKey.CASCADE
        )
    ]
)
data class EpubChapter(
    var bookUrl: String = "",
    var href: String = "",
    var parentHref: String? = null,
)

@ -61,8 +61,8 @@ object DefaultData {
}
fun importDefaultTocRules() {
appDb.txtTocRule.deleteDefault()
appDb.txtTocRule.insert(*txtTocRules.toTypedArray())
appDb.txtTocRuleDao.deleteDefault()
appDb.txtTocRuleDao.insert(*txtTocRules.toTypedArray())
}
fun importDefaultRssSources() {

@ -67,7 +67,7 @@ object Backup {
writeListToJson(appDb.readRecordDao.all, "readRecord.json", backupPath)
writeListToJson(appDb.searchKeywordDao.all, "searchHistory.json", backupPath)
writeListToJson(appDb.ruleSubDao.all, "sourceSub.json", backupPath)
writeListToJson(appDb.txtTocRule.all, DefaultData.txtTocRuleFileName, backupPath)
writeListToJson(appDb.txtTocRuleDao.all, DefaultData.txtTocRuleFileName, backupPath)
writeListToJson(appDb.httpTTSDao.all, DefaultData.httpTtsFileName, backupPath)
GSON.toJson(ReadBookConfig.configList).let {
FileUtils.createFileIfNotExist(backupPath + File.separator + ReadBookConfig.configFileName)

@ -141,7 +141,7 @@ object Restore {
appDb.ruleSubDao.insert(*it.toTypedArray())
}
fileToListT<TxtTocRule>(path, DefaultData.txtTocRuleFileName)?.let {
appDb.txtTocRule.insert(*it.toTypedArray())
appDb.txtTocRuleDao.insert(*it.toTypedArray())
}
fileToListT<HttpTTS>(path, DefaultData.httpTtsFileName)?.let {
appDb.httpTTSDao.insert(*it.toTypedArray())

@ -277,10 +277,10 @@ class AnalyzeTxtFile {
}
private fun getTocRules(): List<TxtTocRule> {
var rules = appDb.txtTocRule.enabled
var rules = appDb.txtTocRuleDao.enabled
if (rules.isEmpty()) {
rules = DefaultData.txtTocRules.apply {
appDb.txtTocRule.insert(*this.toTypedArray())
appDb.txtTocRuleDao.insert(*this.toTypedArray())
}.filter {
it.enable
}

@ -4,7 +4,9 @@ import android.graphics.Bitmap
import android.graphics.BitmapFactory
import android.net.Uri
import android.text.TextUtils
import io.legado.app.data.appDb
import io.legado.app.data.entities.BookChapter
import io.legado.app.data.entities.EpubChapter
import io.legado.app.utils.*
import net.sf.jazzlib.ZipEntry
import net.sf.jazzlib.ZipInputStream
@ -23,7 +25,7 @@ import java.io.InputStream
import java.nio.charset.Charset
import java.util.*
class EPUBFile(val book: io.legado.app.data.entities.Book) {
class EPUBFile(var book: io.legado.app.data.entities.Book) {
companion object {
private var eFile: EPUBFile? = null
@ -34,6 +36,7 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
eFile = EPUBFile(book)
return eFile!!
}
eFile?.book = book
return eFile!!
}
@ -54,6 +57,11 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
): InputStream? {
return getEFile(book).getImage(href)
}
/** Delegates to the instance-level `getBookInfo()` of the [EPUBFile] returned by `getEFile(book)`. */
@Synchronized
fun getBookInfo(book: io.legado.app.data.entities.Book) {
    val epubFile = getEFile(book)
    epubFile.getBookInfo()
}
}
private var epubBook: Book? = null
@ -77,6 +85,7 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
)
}
if (!File(book.coverUrl!!).exists()) {
/*部分书籍DRM处理后,封面获取异常,待优化*/
epubBook!!.coverImage?.inputStream?.use {
val cover = BitmapFactory.decodeStream(it)
val out = FileOutputStream(FileUtils.createFileIfNotExist(book.coverUrl!!))
@ -102,10 +111,8 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
zipEntry = inZip.nextEntry
if ((zipEntry == null) || zipEntry.isDirectory || zipEntry == ZipEntry("<error>")) continue
val resource = ResourceUtil.createResource(zipEntry, inZip)
if (resource.mediaType == MediatypeService.XHTML) {
resource.inputEncoding = "UTF-8";
}
if (zipEntry.name.endsWith("opf")) {
if (resource.mediaType == MediatypeService.XHTML) resource.inputEncoding = "UTF-8";
if (zipEntry.name.endsWith(".opf")) {
/*掌上书苑有很多自制书OPF的nameSpace格式不标准,强制修复成正确的格式*/
val newS = String(resource.data).replace(
"\\smlns=\"http://www.idpf.org/2007/opf\"".toRegex(),
@ -123,13 +130,61 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
}
/**
 * Builds the HTML text of [chapter].
 *
 * The chapter's own resource is rendered first. The text of every html file recorded in
 * `epubChapters` as a child of `chapter.url` is then appended, separated by a line break.
 *
 * Fixes over the previous version:
 *  - removes the leftover `epubBook?.let { ... }` prologue, which was never closed and declared
 *    locals that nothing read;
 *  - no longer concatenates nullable strings, which could embed the literal text "null".
 *
 * @return the combined text, or null when the chapter's own text is unavailable.
 */
private fun getContent(chapter: BookChapter): String? {
    // Text of the chapter's own resource; without it there is nothing to append to.
    val ownText = getChildChapter(chapter, chapter.url) ?: return null
    // A chapter may be spread over several html files; append their text in stored order.
    val childTexts = appDb.epubChapterDao.get(book.bookUrl, chapter.url)
        .orEmpty()
        .mapNotNull { child -> getChildChapter(chapter, child.href) }
    return (listOf(ownText) + childTexts).joinToString("\n")
}
/**
 * Renders the resource [href] as formatted HTML text for [chapter].
 *
 * When [href] is the chapter's own resource, the body is first trimmed to the span between the
 * chapter's `startFragmentId` and `endFragmentId`. Heading, image and ruby markup is then
 * stripped according to the book's `delTag` flags, and `script`/`style` elements are always
 * removed.
 *
 * Fix over the previous version: a stale `return elements.outerHtml().htmlFormat()` sat before
 * the ruby handling and made it unreachable; the function now has a single exit after it.
 *
 * @return the formatted text, or null when no epub is loaded.
 */
private fun getChildChapter(chapter: BookChapter, href: String): String? {
    val eBook = epubBook ?: return null
    val body = Jsoup.parse(String(eBook.resources.getByHref(href).data, mCharset)).body()
    if (chapter.url == href) {
        val startFragmentId = chapter.startFragmentId
        val endFragmentId = chapter.endFragmentId
        // One resource may hold several chapters; the fragment ids bound the current one.
        // Note: this adds noticeably to load time, so callers may want to cache the result.
        if (!startFragmentId.isNullOrBlank()) {
            body.getElementById(startFragmentId)?.previousElementSiblings()?.remove()
        }
        if (!endFragmentId.isNullOrBlank() && endFragmentId != startFragmentId) {
            body.getElementById(endFragmentId)?.nextElementSiblings()?.remove()
        }
    }
    // Optionally drop h1..h6, since some books repeat the chapter title in the body.
    if (book.getDelTag(io.legado.app.data.entities.Book.hTag)) {
        for (level in 1..6) {
            body.getElementsByTag("h$level").remove()
        }
    }
    // Optionally drop images.
    if (book.getDelTag(io.legado.app.data.entities.Book.imgTag)) {
        body.getElementsByTag("img").remove()
    }
    val elements = body.children()
    elements.select("script").remove()
    elements.select("style").remove()
    var html = elements.outerHtml()
    // Optionally collapse <ruby> annotations to their single base character.
    if (book.getDelTag(io.legado.app.data.entities.Book.rubyTag)) {
        html = html.replace("<ruby>\\s?([\\u4e00-\\u9fa5])\\s?.*?</ruby>".toRegex(), "$1")
    }
    return html.htmlFormat()
}
@ -139,11 +194,13 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
return epubBook?.resources?.getByHref(abHref)?.inputStream
}
private fun getChapterList(): ArrayList<BookChapter> {
val chapterList = ArrayList<BookChapter>()
epubBook?.let { eBook ->
val metadata = eBook.metadata
book.name = metadata.firstTitle
private fun getBookInfo() {
if (epubBook == null) {
eFile = null
book.intro = "书籍导入异常"
} else {
val metadata = epubBook!!.metadata
book.name = book.originName
if (metadata.authors.size > 0) {
val author =
metadata.authors[0].toString().replace("^, |, $".toRegex(), "")
@ -152,7 +209,12 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
if (metadata.descriptions.size > 0) {
book.intro = Jsoup.parse(metadata.descriptions[0]).text()
}
}
}
private fun getChapterList(): ArrayList<BookChapter> {
val chapterList = ArrayList<BookChapter>()
epubBook?.let { eBook ->
val refs = eBook.tableOfContents.tocReferences
if (refs == null || refs.isEmpty()) {
val spineReferences = eBook.spine.spineReferences
@ -167,7 +229,7 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
val doc =
Jsoup.parse(String(resource.data, mCharset))
val elements = doc.getElementsByTag("title")
if (elements.size > 0) {
if (elements != null && elements.size > 0) {
title = elements[0].text()
}
} catch (e: IOException) {
@ -187,10 +249,12 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
i++
}
} else {
parseFirstPage(chapterList, refs)
parseMenu(chapterList, refs, 0)
for (i in chapterList.indices) {
chapterList[i].index = i
}
getChildChapter(chapterList)
}
}
book.latestChapterTitle = chapterList.lastOrNull()?.title
@ -198,6 +262,82 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
return chapterList
}
/*获取当前章节的子章节。部分书籍一个章节包含多个html文件,(一些精排书籍,每一章节正文前的标题、标题封面、引言等都会有独立html)*/
/*需在读取常规章节列表后调用,遍历书籍全内容,根据href检索原不包含在章节内的html归属父章节*/
/**
 * Records which html resources belong to which chapter and stores the result in `epubChapters`.
 *
 * Walks the book's contents in order alongside [chapterList]. A resource that starts one or
 * more chapters becomes the current parent; every following html resource that is not itself a
 * chapter is stored as a child of that parent. Existing rows for this book are replaced.
 *
 * Fix over the previous version: chapter URLs may carry a `#fragment`, which never equals a
 * plain resource href, so the chapter cursor could stall on the first such chapter. Chapters
 * are now matched on the part of the URL before `#`, and all consecutive chapters that share
 * one resource are consumed together. The parent key is the full URL of the last of those
 * chapters, which is the value `getContent` looks children up by.
 *
 * Must be called after the regular chapter list has been built.
 */
private fun getChildChapter(chapterList: ArrayList<BookChapter>) {
    val eBook = epubBook ?: return
    val children = ArrayList<EpubChapter>()
    var chapterCursor = 0
    var parentHref: String? = null
    for (content in eBook.contents.orEmpty()) {
        val startsChapter = chapterCursor < chapterList.size &&
            chapterList[chapterCursor].url.substringBefore('#') == content.href
        if (startsChapter) {
            // Several chapters may point into the same resource through different fragments.
            while (chapterCursor < chapterList.size &&
                chapterList[chapterCursor].url.substringBefore('#') == content.href
            ) {
                parentHref = chapterList[chapterCursor].url
                chapterCursor++
            }
        } else if (!parentHref.isNullOrBlank() && content.mediaType.toString().contains("htm")) {
            children.add(
                EpubChapter(
                    bookUrl = book.bookUrl,
                    href = content.href,
                    parentHref = parentHref
                )
            )
        }
    }
    appDb.epubChapterDao.deleteByName(book.bookUrl)
    if (children.isNotEmpty()) appDb.epubChapterDao.insert(*children.toTypedArray())
}
/*获取书籍起始页内容。部分书籍第一章之前存在封面,引言,扉页等内容*/
/*title获取不同书籍风格杂乱,格式化处理待优化*/
private var durIndex = 0
/**
 * Adds the pages that precede the first table-of-contents entry (cover, preface, title page...)
 * to [chapterList] as chapters.
 *
 * Html resources are taken from the book's contents in order until the resource of the first
 * entry in [refs] is reached. A page's title comes from the resource title, then from the
 * document's `<title>` element, then falls back to "--卷首--". `durIndex` is reset to 0 and
 * incremented once per added page, and each page's `startFragmentId` becomes the previous
 * page's `endFragmentId`.
 *
 * Fixes over the previous version:
 *  - a non-html resource hit `continue` without advancing the index, looping forever;
 *  - an empty [refs] list is now ignored instead of failing on `refs[0]`;
 *  - the stop test compares the first entry's href without its `#fragment`, so a
 *    fragment-qualified first entry still ends the scan at its resource.
 *
 * @param chapterList list that receives the leading pages.
 * @param refs top-level table-of-contents entries; nothing is added when null or empty.
 */
private fun parseFirstPage(
    chapterList: ArrayList<BookChapter>,
    refs: List<TOCReference>?
) {
    val eBook = epubBook ?: return
    val contents = eBook.contents ?: return
    if (refs == null) return
    durIndex = 0
    if (refs.isEmpty()) return
    // Resource hrefs never carry a fragment, so drop it from the first entry before comparing.
    val firstChapterHref = refs[0].completeHref.substringBefore('#')
    for (content in contents) {
        if (!content.mediaType.toString().contains("htm")) continue
        // Stop once the resource of the first real chapter is reached.
        if (firstChapterHref == content.href) break
        val title = content.title.takeUnless { it.isNullOrEmpty() }
            ?: Jsoup.parse(String(eBook.resources.getByHref(content.href).data, mCharset))
                .getElementsByTag("title")
                .firstOrNull()
                ?.text()
            ?: "--卷首--"
        val chapter = BookChapter()
        chapter.bookUrl = book.bookUrl
        chapter.title = title
        chapter.url = content.href
        chapter.startFragmentId =
            if (content.href.contains("#")) content.href.substringAfter("#") else null
        if (durIndex > 0) {
            chapterList[durIndex - 1].endFragmentId = chapter.startFragmentId
        }
        chapterList.add(chapter)
        durIndex++
    }
}
private fun parseMenu(
chapterList: ArrayList<BookChapter>,
refs: List<TOCReference>?,
@ -210,7 +350,13 @@ class EPUBFile(val book: io.legado.app.data.entities.Book) {
chapter.bookUrl = book.bookUrl
chapter.title = ref.title
chapter.url = ref.completeHref
chapter.startFragmentId = ref.fragmentId
if (durIndex > 0) {
val preIndex = durIndex - 1
chapterList[preIndex].endFragmentId = chapter.startFragmentId
}
chapterList.add(chapter)
durIndex++
}
if (ref.children != null && ref.children.isNotEmpty()) {
parseMenu(chapterList, ref.children, level + 1)

@ -95,7 +95,7 @@ class TocRegexDialog : BaseDialogFragment(), Toolbar.OnMenuItemClickListener {
private fun initData() {
tocRegexLiveData?.removeObservers(viewLifecycleOwner)
tocRegexLiveData = appDb.txtTocRule.observeAll()
tocRegexLiveData = appDb.txtTocRuleDao.observeAll()
tocRegexLiveData?.observe(viewLifecycleOwner, { tocRules ->
initSelectedName(tocRules)
adapter.setItems(tocRules)
@ -225,7 +225,7 @@ class TocRegexDialog : BaseDialogFragment(), Toolbar.OnMenuItemClickListener {
getItem(holder.layoutPosition)?.let {
it.enable = isChecked
launch(IO) {
appDb.txtTocRule.update(it)
appDb.txtTocRuleDao.update(it)
}
}
}
@ -236,7 +236,7 @@ class TocRegexDialog : BaseDialogFragment(), Toolbar.OnMenuItemClickListener {
ivDelete.setOnClickListener {
getItem(holder.layoutPosition)?.let { item ->
launch(IO) {
appDb.txtTocRule.delete(item)
appDb.txtTocRuleDao.delete(item)
}
}
}
@ -258,7 +258,7 @@ class TocRegexDialog : BaseDialogFragment(), Toolbar.OnMenuItemClickListener {
item.serialNumber = index + 1
}
launch(IO) {
appDb.txtTocRule.update(*getItems().toTypedArray())
appDb.txtTocRuleDao.update(*getItems().toTypedArray())
}
}
isMoved = false

@ -15,9 +15,9 @@ class TocRegexViewModel(application: Application) : BaseViewModel(application) {
fun saveRule(rule: TxtTocRule) {
execute {
if (rule.serialNumber < 0) {
rule.serialNumber = appDb.txtTocRule.lastOrderNum + 1
rule.serialNumber = appDb.txtTocRuleDao.lastOrderNum + 1
}
appDb.txtTocRule.insert(rule)
appDb.txtTocRuleDao.insert(rule)
}
}
@ -31,7 +31,7 @@ class TocRegexViewModel(application: Application) : BaseViewModel(application) {
execute {
RxHttp.get(url).toText("utf-8").await().let { json ->
GSON.fromJsonArray<TxtTocRule>(json)?.let {
appDb.txtTocRule.insert(*it.toTypedArray())
appDb.txtTocRuleDao.insert(*it.toTypedArray())
}
}
}.onSuccess {

Loading…
Cancel
Save