使用 Lucene 進行全文檢索:中文分詞與搜索結果高亮顯示
中文分詞需要引入中文分詞器的 jar 包,我們從 Maven 中獲取:
<!-- lucene中文分詞器 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>5.3.1</version>
</dependency>
下面是分詞和建立索引的示例:
package LuceneTest.LuceneTest;
import java.nio.file.Paths;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;
/**
 * Builds a small Lucene index of Chinese city descriptions, analysed with
 * {@link SmartChineseAnalyzer}.
 *
 * <p>Run {@link #main(String[])} to (re)build the index on disk.
 */
public class IndexChina {

    // Sample data used to populate the index; the three arrays are parallel.
    private final Integer[] ids = {1, 2, 3}; // document identifiers
    private final String[] citys = {"上海", "南京", "青島"};
    private final String[] descs = {
        "上海是個繁華的城市。",
        "南京是一個有文化的城市。",
        "青島是一個美麗的城市。"
    };

    /**
     * Writes one document per sample city into the index at {@code indexDir}.
     *
     * <p>This method takes a parameter, so it is deliberately not annotated
     * with JUnit's {@code @Test}; invoke it from {@link #main(String[])} or
     * from a no-argument test method.
     *
     * @param indexDir file-system path of the directory that holds the index
     * @throws Exception if the directory cannot be opened or a document
     *         cannot be written
     */
    public void index(String indexDir) throws Exception {
        // try-with-resources guarantees the writer (and then the directory)
        // is closed even when addDocument throws; closing the writer is what
        // flushes the documents to the index.
        // NOTE(review): no OpenMode is set, so re-running presumably appends
        // to an existing index rather than replacing it - confirm if
        // duplicate hits matter.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexWriter writer = getWriter(dir)) {
            for (int i = 0; i < ids.length; i++) {
                Document doc = new Document();
                doc.add(new IntField("id", ids[i], Store.YES));
                doc.add(new StringField("city", citys[i], Store.YES));
                doc.add(new TextField("desc", descs[i], Store.YES));
                writer.addDocument(doc);
            }
        }
    }

    /**
     * Creates an {@link IndexWriter} on {@code dir} configured with the
     * Chinese analyzer.
     *
     * @param dir directory the writer will write the index into
     * @return a new writer; the caller is responsible for closing it
     * @throws Exception if the writer cannot be created
     */
    private IndexWriter getWriter(Directory dir) throws Exception {
        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); // Chinese word segmentation
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        return new IndexWriter(dir, config);
    }

    public static void main(String[] args) throws Exception {
        new IndexChina().index("D:\\lucene2");
    }
}
新建的查詢
package LuceneTest.LuceneTest;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Runs a query against the "desc" field of the index, using the same Chinese
 * analyzer that was used at indexing time, and prints the matching documents.
 */
public class SearcherChina {

    /**
     * Searches the index at {@code indexDir} for {@code q} and prints the hit
     * count, the elapsed search time, and the "city" and "desc" fields of up
     * to ten hits.
     *
     * @param indexDir file-system path of the directory that holds the index
     * @param q query string, parsed against the "desc" field
     * @throws Exception if the index cannot be opened, the query cannot be
     *         parsed, or the search fails
     */
    public static void search(String indexDir, String q) throws Exception {
        // try-with-resources closes the reader, directory and analyzer even
        // when parsing or searching throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexReader reader = DirectoryReader.open(dir);
             SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) {
            IndexSearcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser("desc", analyzer);
            Query query = parser.parse(q);

            long startTime = System.currentTimeMillis();
            TopDocs docs = searcher.search(query, 10); // keep at most the top 10 hits
            long endTime = System.currentTimeMillis();

            System.out.println("匹配" + q + "共耗時" + (endTime - startTime) + "毫秒");
            System.out.println("查詢到" + docs.totalHits + "條記錄");

            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                // scoreDoc.doc is the document id used to load the stored fields.
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println(doc.get("city"));
                System.out.println(doc.get("desc"));
            }
        }
    }

    public static void main(String[] args) {
        String indexDir = "D:\\lucene2";
        String q = "上海繁華"; // text to search for
        try {
            search(indexDir, q);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
搜索結果的高亮顯示
引入jar文件
<!-- lucene高亮顯示 -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>5.3.1</version>
</dependency>
新建查詢並將查詢的結果高亮
package LuceneTest.LuceneTest;
import java.io.StringReader;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Runs a query against the "desc" field of the index and prints each hit
 * together with an HTML-highlighted fragment of its description.
 */
public class SearcherChina {

    /**
     * Searches the index at {@code indexDir} for {@code q} and prints the hit
     * count, the elapsed search time, and, for up to ten hits, the "city" and
     * "desc" fields followed by a highlighted fragment of "desc".
     *
     * @param indexDir file-system path of the directory that holds the index
     * @param q query string, parsed against the "desc" field
     * @throws Exception if the index cannot be opened, the query cannot be
     *         parsed, or searching/highlighting fails
     */
    public static void search(String indexDir, String q) throws Exception {
        // try-with-resources closes the reader, directory and analyzer even
        // when parsing, searching or highlighting throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexReader reader = DirectoryReader.open(dir);
             SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) {
            IndexSearcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser("desc", analyzer);
            Query query = parser.parse(q);

            long startTime = System.currentTimeMillis();
            TopDocs docs = searcher.search(query, 10); // keep at most the top 10 hits
            long endTime = System.currentTimeMillis();

            System.out.println("匹配" + q + "共耗時" + (endTime - startTime) + "毫秒");
            System.out.println("查詢到" + docs.totalHits + "條記錄");

            // Highlighting setup: matched terms are wrapped in the given tags.
            // Without arguments SimpleHTMLFormatter falls back to plain bold tags.
            SimpleHTMLFormatter simpleHTMLFormatter =
                    new SimpleHTMLFormatter("<b><font color=red>", "</font></b>");
            QueryScorer scorer = new QueryScorer(query);
            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
            highlighter.setTextFragmenter(fragmenter);

            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                // scoreDoc.doc is the document id used to load the stored fields.
                Document doc = searcher.doc(scoreDoc.doc);
                String desc = doc.get("desc");
                System.out.println(doc.get("city"));
                System.out.println(desc);

                if (desc != null) {
                    TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));
                    String summary = highlighter.getBestFragment(tokenStream, desc);
                    // getBestFragment yields null when nothing in the text
                    // matches; show the plain description instead of "null".
                    System.out.println(summary != null ? summary : desc);
                }
            }
        }
    }

    public static void main(String[] args) {
        String indexDir = "D:\\lucene2";
        String q = "南京文化"; // text to search for
        try {
            search(indexDir, q);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}