最近一直在写一个手机端的小说阅读器,想了想还是写一个系列的博客记录一下踩到的坑吧。
首先,既然是小说阅读器,当然少不了智能分章的功能,话不多说,直接上代码。

import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.List; public class NovelParser{ private String path; private String charset; private String name; private List<TitleInfo> titleList; public static final int MAX_PARSE_NUMBER = 200; public NovelParser(String path,String charset){ this.path = path; this.charset = charset; titleList = new ArrayList<>(); int index = path.lastIndexOf("\\"); name = index == -1 ? path : path.substring(index + 1,path.lastIndexOf(".")); } //execute only once public void parseTitleInfo(){ long time = System.currentTimeMillis(); int count = 0; BufferedReader reader = null; InputStreamReader inputStreamReader = null; FileInputStream inputStream = null; try { inputStream = new FileInputStream(path); inputStreamReader = new InputStreamReader(inputStream,charset); reader = new BufferedReader(inputStreamReader); String line; //之所以设置这个变量是因为有的TXT文档会在一章的开头将标题重复一遍,造成一章内容被解析成两章 //所以设置一个最小行数,两个章节之间的行数差距最小为5 int number = 5; //因为一般的TXT文档开头都会有一些介绍性信息,这些不能被归到第一章中,所以单独新建一个章节保存起来 TitleInfo titleInfo = new TitleInfo(); titleInfo.setTitle(name); titleInfo.setIndex(0); titleInfo.setStartLength(0); titleList.add(titleInfo); System.out.println("书籍开始章节 : " + titleInfo.toString()); StringBuilder builder = new StringBuilder(); int parseLength = 0; while ((line = reader.readLine()) != null){ line = line.trim(); if(line.equals("")){ parseLength += 2;//这里的+2是因为要加上换行的长度 continue; } if(line.trim().length() < 4){ if(number >= 5 && TitleMatches.isExtra(line)) {//如果是额外章节 count++; parseLength += builder.toString().getBytes(charset).length; builder.delete(0,builder.length()); titleInfo = new TitleInfo(count, line, parseLength); titleList.add(titleInfo); number = 0; System.out.println("检测到额外章节" + titleInfo.toString()); } }else{ if(number >= 5 && 
TitleMatches.isZhang(line)){//如果是正文章节 count++; parseLength += builder.toString().getBytes(charset).length; builder.delete(0,builder.length()); titleInfo = new TitleInfo(count,line, parseLength); titleList.add(titleInfo); number = 0; System.out.println("检测到新章节" + titleInfo.toString()); } } builder.append(line); parseLength += 2; number++; if(number >= MAX_PARSE_NUMBER){ //为了避免某个文档一直没有匹配到新章节而不停的向StringBuilder中添加内容,导致Android内存溢出,这里对StringBuilder的大小进行了一定的限制 //即解析的行数达到一定的数目之后,即使没有匹配到新章节也将StringBuilder清空,同时更新parseLength。 //注意:这个数目的设定会影响到解析的时间,请谨慎设置!!!! parseLength += builder.toString().getBytes(charset).length; builder.delete(0,builder.length()); number = 5; } } } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if(inputStream != null){ try { inputStream.close(); } catch (IOException e) { e.printStackTrace(); } } if(inputStreamReader != null){ try { inputStreamReader.close(); } catch (IOException e) { e.printStackTrace(); } } if(reader != null){ try { reader.close(); } catch (IOException e) { e.printStackTrace(); } } System.out.println("执行完毕,耗时 : " + (System.currentTimeMillis() - time) + ",检测到" + titleList.size() + "章"); } } }
NovelParser类就是主要的工作类了。解析的原理很简单,就是用BufferedReader从文本文档中一行一行地读取内容,然后用正则来判断这一行是否是新章节的开始。主要的部分都有注释,下面是用来存储章节信息的TitleInfo类:

/**
 * Holds the position of one chapter inside a text file: its ordinal, its heading text
 * and the byte offset at which it starts.
 */
public class TitleInfo {
    /** Ordinal of the chapter within the book. */
    private int index;
    /** Heading text of the chapter. */
    private String title;
    /**
     * Byte offset at which the chapter starts; meant for seeking with RandomAccessFile
     * when jumping to or parsing a single chapter. Stored as {@code long} so it agrees
     * with the getter's return type.
     */
    private long startLength;

    /** Creates an empty entry; populate it through the setters. */
    public TitleInfo() {
    }

    /**
     * @param index       ordinal of the chapter
     * @param title       heading text
     * @param startLength byte offset of the chapter start
     */
    public TitleInfo(int index, String title, long startLength) {
        this.index = index;
        this.title = title;
        this.startLength = startLength;
    }

    /** @return ordinal of the chapter */
    public int getIndex() {
        return index;
    }

    /** @return heading text of the chapter */
    public String getTitle() {
        return title;
    }

    /** @return byte offset of the chapter start */
    public long getStartLength() {
        return startLength;
    }

    /** @param index ordinal of the chapter */
    public void setIndex(int index) {
        this.index = index;
    }

    /** @param title heading text of the chapter */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param startLength byte offset of the chapter start */
    public void setStartLength(long startLength) {
        this.startLength = startLength;
    }

    /** @return all three fields in the form {@code [index = i,title = t,startLength = n]} */
    @Override
    public String toString() {
        return "[index = " + index + ",title = " + title + ",startLength = " + startLength + "]";
    }
}
以及用来匹配新章节的TitleMatches类:

import java.util.regex.Pattern;

/**
 * Decides whether a single line of text is a chapter heading.
 *
 * <p>Rules:
 * <ol>
 *   <li>A numbered chapter starts with "第" and contains at least one keyword from
 *       {@link #key}; the text between "第" and that keyword must match {@link #p}.</li>
 *   <li>An extra section is at most 3 characters long (after trimming) and satisfies at
 *       least one of:
 *       <ul>
 *         <li>it is at most 2 characters long and its first or second character is a
 *             keyword from {@link #extra_key} (e.g. 序, 序言, 序章, 魔序);</li>
 *         <li>it starts with an entry of {@link #extra_key_start}
 *             (e.g. 前言, 附录1, 后记1).</li>
 *       </ul>
 *   </li>
 * </ol>
 */
public class TitleMatches {
    /** Unit keywords for numbered chapters, in decreasing priority. */
    public static final String[] key = {"部","卷","章","节","集","回","幕","计"};
    /** Accepts a non-empty run of ASCII digits and Chinese numerals. */
    public static final Pattern p = Pattern.compile("^[0-9零一二三四五六七八九十百千]+$");

    /**
     * Tests whether {@code line} is a numbered chapter heading such as "第十二章 ...".
     *
     * <p>Each keyword is tried in priority order; a keyword whose preceding text is not a
     * pure number is skipped instead of ending the search, so a keyword that merely
     * appears later in the title does not hide the real one.
     *
     * @param line trimmed line of text; {@code null} yields {@code false}
     * @return {@code true} if some keyword is preceded by "第" plus a number
     */
    public static boolean isZhang(String line) {
        if (line == null || !line.startsWith("第")) {
            return false;
        }
        for (String unit : key) {
            int unitIndex = line.indexOf(unit);
            // Only the first occurrence can qualify: any later one would have the
            // keyword itself inside the candidate number.
            if (unitIndex > 0 && p.matcher(line.substring(1, unitIndex)).matches()) {
                return true;
            }
        }
        return false;
    }

    /** Keywords that mark a preface-like section when they are the 1st or 2nd character. */
    public static final String[] extra_key = {"序"};
    /** Prefixes that mark an extra section. */
    public static final String[] extra_key_start = {"前言","后记","楔子","附录","外传"};

    /**
     * Tests whether {@code line} is an extra (unnumbered) section heading.
     *
     * <p>Both rules are evaluated independently, so a line that contains an
     * {@link #extra_key} keyword in a non-qualifying position can still match through a
     * prefix.
     *
     * @param line trimmed line of text; {@code null} yields {@code false}
     * @return {@code true} if the line is at most 3 characters and satisfies either rule
     */
    public static boolean isExtra(String line) {
        if (line == null || line.length() > 3) {
            return false;
        }
        if (line.length() <= 2) {
            for (String keyword : extra_key) {
                int position = line.indexOf(keyword);
                if (position == 0 || position == 1) {
                    return true;
                }
            }
        }
        for (String prefix : extra_key_start) {
            if (line.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }
}
这几个类都添加完成之后就万事俱备了,需要注意的地方在代码中都有注释。下面是测试代码,用来测试的小说是《希灵帝国》,大小是15.7MB。

/**
 * Demo entry point: parses a sample GBK-encoded novel and prints the detected chapters.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String novelPath = "D:\\Users\\Excalibur\\Desktop\\希灵帝国.txt";
    String encoding = "GBK";
    new NovelParser(novelPath, encoding).parseTitleInfo();
}
以下是运行截图:
去掉输出语句之后如下:
至此,小说的智能分章就已经实现了,可以在各个章节之间自由跳转而不会导致阅读器卡顿了。