使用Lucene全文檢索並使用中文版和高亮顯示


使用Lucene全文檢索並使用中文版和高亮顯示

中文分詞需要引入中文分詞器的jar包,咱們從maven中獲取

	<!-- lucene中文分詞器 -->
	<dependency>
	    <groupId>org.apache.lucene</groupId>
	    <artifactId>lucene-analyzers-smartcn</artifactId>
	    <version>5.3.1</version>
	</dependency>

下面是分詞和索引的示例

	package LuceneTest.LuceneTest;

	import java.nio.file.Paths;
	
	import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field.Store;
	import org.apache.lucene.document.IntField;
	import org.apache.lucene.document.StringField;
	import org.apache.lucene.document.TextField;
	import org.apache.lucene.index.IndexWriter;
	import org.apache.lucene.index.IndexWriterConfig;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;
	import org.junit.Test;
	
	/**
	 * Builds a small demo Lucene index of Chinese city descriptions, tokenized with
	 * {@link SmartChineseAnalyzer}.
	 */
	public class IndexChina {

	    /** Directory the index is written to; assigned by {@link #index(String)}. */
	    private Directory dir;

	    // Sample data: ids[i], citys[i] and descs[i] together describe one document.
	    private Integer ids[] = {1, 2, 3}; // document identifiers
	    private String citys[] = {"上海", "南京", "青島"};
	    private String descs[] = {
	        "上海是個繁華的城市。",
	        "南京是一個有文化的城市。",
	        "青島是一個美麗的城市。"
	    };

	    /**
	     * Writes one document per sample city into the index located at {@code indexDir}.
	     *
	     * <p>This method is deliberately not annotated with {@code @Test}: JUnit 4 only
	     * accepts test methods without parameters, so annotating a method that takes
	     * {@code indexDir} makes the whole class fail validation. Run it through
	     * {@link #main(String[])} instead.
	     *
	     * <p>NOTE: the writer uses the default {@code IndexWriterConfig} open mode, which
	     * appends to an existing index, so running this twice against the same directory
	     * stores every document twice.
	     *
	     * @param indexDir file-system path of the index directory
	     * @throws Exception if the directory cannot be opened or a document cannot be written
	     */
	    public void index(String indexDir) throws Exception {
	        // try-with-resources releases the write lock and the directory even when
	        // adding a document fails part-way through.
	        try (Directory opened = FSDirectory.open(Paths.get(indexDir))) {
	            dir = opened; // getWriter() reads this field
	            try (IndexWriter writer = getWriter()) {
	                for (int i = 0; i < ids.length; i++) {
	                    Document doc = new Document();
	                    doc.add(new IntField("id", ids[i], Store.YES));
	                    // StringField: indexed as a single token, not analyzed
	                    doc.add(new StringField("city", citys[i], Store.YES));
	                    // TextField: run through the analyzer so it is full-text searchable
	                    doc.add(new TextField("desc", descs[i], Store.YES));
	                    writer.addDocument(doc);
	                }
	            } // closing the writer is what actually commits the documents to the index
	        }
	    }

	    /**
	     * Creates an {@link IndexWriter} on {@link #dir} configured with the Chinese analyzer.
	     *
	     * @return a new writer; the caller is responsible for closing it
	     * @throws Exception if the writer cannot be created
	     */
	    private IndexWriter getWriter() throws Exception {
	        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); // Chinese word segmentation
	        IndexWriterConfig config = new IndexWriterConfig(analyzer); // plug the Chinese analyzer into the writer config
	        return new IndexWriter(dir, config);
	    }

	    /** Builds the demo index under {@code D:\lucene2}. */
	    public static void main(String[] args) throws Exception {
	        new IndexChina().index("D:\\lucene2");
	    }
	}

新建的查詢

	package LuceneTest.LuceneTest;

	import java.nio.file.Paths;
	
	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
	import org.apache.lucene.analysis.standard.StandardAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.queryparser.classic.QueryParser;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.TopDocs;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;
	
	/**
	 * Runs a full-text query against the {@code desc} field of the demo index,
	 * parsing the query text with {@link SmartChineseAnalyzer}.
	 */
	public class SearcherChina {

	    /**
	     * Searches the index at {@code indexDir} for {@code q} and prints the timing,
	     * the hit count, and the {@code city} and {@code desc} fields of up to 10 hits.
	     *
	     * @param indexDir file-system path of the index directory
	     * @param q        query string, parsed with the classic {@link QueryParser} syntax
	     * @throws Exception if the index cannot be opened or the query cannot be parsed
	     */
	    public static void search(String indexDir, String q) throws Exception {
	        // try-with-resources closes the analyzer, reader and directory (in that order)
	        // even when parsing or searching throws.
	        try (Directory dir = FSDirectory.open(Paths.get(indexDir)); // where the index lives
	             IndexReader reader = DirectoryReader.open(dir);
	             SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) { // Chinese word segmentation
	            IndexSearcher searcher = new IndexSearcher(reader);
	            QueryParser parser = new QueryParser("desc", analyzer); // "desc" is the default search field
	            Query query = parser.parse(q);

	            long startTime = System.currentTimeMillis(); // search start
	            TopDocs docs = searcher.search(query, 10); // keep the 10 best hits
	            long endTime = System.currentTimeMillis(); // search end
	            System.out.println("匹配" + q + "共耗時" + (endTime - startTime) + "毫秒");
	            System.out.println("查詢到" + docs.totalHits + "條記錄");

	            for (ScoreDoc scoreDoc : docs.scoreDocs) {
	                // scoreDoc.doc is the internal document id used to load the stored fields
	                Document doc = searcher.doc(scoreDoc.doc);
	                System.out.println(doc.get("city"));
	                System.out.println(doc.get("desc"));
	            }
	        }
	    }

	    /** Searches the demo index under {@code D:\lucene2} for a sample phrase. */
	    public static void main(String[] args) {
	        String indexDir = "D:\\lucene2";
	        String q = "上海繁華"; // text to search for
	        try {
	            search(indexDir, q);
	        } catch (Exception e) {
	            e.printStackTrace();
	        }
	    }
	}

搜索結果的高亮顯示

引入jar文件

	 <!-- lucene高亮顯示 -->
		<dependency>
		    <groupId>org.apache.lucene</groupId>
		    <artifactId>lucene-highlighter</artifactId>
		    <version>5.3.1</version>
		</dependency>

新建查詢並將查詢的結果高亮

	package LuceneTest.LuceneTest;

	import java.io.StringReader;
	import java.nio.file.Paths;
	
	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
	import org.apache.lucene.analysis.standard.StandardAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.queryparser.classic.QueryParser;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.TopDocs;
	import org.apache.lucene.search.highlight.Fragmenter;
	import org.apache.lucene.search.highlight.Highlighter;
	import org.apache.lucene.search.highlight.QueryScorer;
	import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
	import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;
	
	/**
	 * Runs a full-text query against the {@code desc} field of the demo index and
	 * prints each hit together with an HTML-highlighted excerpt of the matching text.
	 */
	public class SearcherChina {
	
	    /**
	     * Searches the index at {@code indexDir} for {@code q} and prints the timing,
	     * the hit count, and for up to 10 hits the {@code city} field, the {@code desc}
	     * field, and the best highlighted fragment of {@code desc}.
	     *
	     * @param indexDir file-system path of the index directory
	     * @param q        query string, parsed with the classic {@link QueryParser} syntax
	     * @throws Exception if the index cannot be opened, the query cannot be parsed,
	     *                   or highlighting fails
	     */
	    public static void search(String indexDir, String q) throws Exception {
	        // try-with-resources closes the analyzer, reader and directory (in that order)
	        // even when parsing, searching or highlighting throws.
	        try (Directory dir = FSDirectory.open(Paths.get(indexDir)); // where the index lives
	             IndexReader reader = DirectoryReader.open(dir);
	             SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer()) { // Chinese word segmentation
	            IndexSearcher searcher = new IndexSearcher(reader);
	            QueryParser parser = new QueryParser("desc", analyzer); // "desc" is the default search field
	            Query query = parser.parse(q);
	
	            long startTime = System.currentTimeMillis(); // search start
	            TopDocs docs = searcher.search(query, 10); // keep the 10 best hits
	            long endTime = System.currentTimeMillis(); // search end
	            System.out.println("匹配" + q + "共耗時" + (endTime - startTime) + "毫秒");
	            System.out.println("查詢到" + docs.totalHits + "條記錄");
	
	            // Highlighter setup. Without arguments SimpleHTMLFormatter wraps matches in <B>...</B>.
	            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color=red>", "</font></b>");
	            QueryScorer scorer = new QueryScorer(query); // scores text fragments by the query terms they contain
	            Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); // splits the text into candidate fragments
	            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
	            highlighter.setTextFragmenter(fragmenter);
	
	            for (ScoreDoc scoreDoc : docs.scoreDocs) {
	                // scoreDoc.doc is the internal document id used to load the stored fields
	                Document doc = searcher.doc(scoreDoc.doc);
	                System.out.println(doc.get("city"));
	                String desc = doc.get("desc");
	                System.out.println(desc);
	
	                if (desc != null) {
	                    // Re-analyze the stored text; the highlighter consumes this stream.
	                    TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));
	                    String summary = highlighter.getBestFragment(tokenStream, desc);
	                    // getBestFragment returns null when no query term occurs in the text;
	                    // fall back to the plain description instead of printing "null".
	                    System.out.println(summary != null ? summary : desc);
	                }
	            }
	        }
	    }
	
	    /** Searches the demo index under {@code D:\lucene2} for a sample phrase. */
	    public static void main(String[] args) {
	        String indexDir = "D:\\lucene2";
	        String q = "南京文化"; // text to search for
	        try {
	            search(indexDir, q);
	        } catch (Exception e) {
	            e.printStackTrace();
	        }
	    }
	}


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM