This article uses a short code example to demonstrate searching an index with TermQuery and FuzzyLikeThisQuery, and shows how to highlight the matched keywords in the search results (a very useful feature in practice).
import java.io.File;
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyLikeThisQuery; // from the contrib lucene-queries module
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class Indexer
{
    /**
     * @param args
     * @throws IOException
     * @throws LockObtainFailedException
     * @throws CorruptIndexException
     * @throws InvalidTokenOffsetsException
     */
    public static void main(String[] args) throws CorruptIndexException,
            LockObtainFailedException, IOException, InvalidTokenOffsetsException
    {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);

        Directory indexDir = new RAMDirectory();

        /**
         * 1. Indexing...
         */
        IndexWriter writer = new IndexWriter(indexDir, config);

        File docs = new File("D:\\files");

        if (docs.exists() && docs.isDirectory())
        {
            File[] files = docs.listFiles();

            if (files != null && files.length > 0)
            {
                for (File file : files)
                {
                    // Field.Index.NO: not indexed; combined with Store.YES it is typically used for
                    //   fields that are only stored, never searched.
                    // Field.Index.ANALYZED: analyzed (tokenized) and indexed.
                    // Field.Index.NOT_ANALYZED: indexed but not analyzed; no analyzer touches the value,
                    //   so it can only be matched by an exact whole-term search. Suitable for unique
                    //   fields such as an ID that is later used to update the index.
                    Document doc = new Document();
                    doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.NO));
                    doc.add(new Field("id", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                    doc.add(new Field("name", file.getName(), Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

                    // Note: File.getTotalSpace() returns the size of the whole partition, not of the file,
                    // which is why every document in the output below shows the same "size" value.
                    doc.add(new Field("size", file.getTotalSpace() + "b", Field.Store.YES, Field.Index.NO));

                    writer.addDocument(doc);
                }

                writer.commit();
            }
        }

        writer.close(true);

        /**
         * 2. List indexed files...
         */
        IndexReader reader = IndexReader.open(indexDir);
        IndexSearcher searcher = new IndexSearcher(reader);

        System.out.println("Max doc:" + searcher.maxDoc());
        System.out.println("List files below....");

        Document doc = null;
        for (int i = 0; i < searcher.maxDoc(); i++)
        {
            doc = searcher.doc(i);
            System.out.println("Doc " + i + " Name: " + doc.get("name") + ", Path: " + doc.get("path") + ", Size: " + doc.get("size"));
        }
        System.out.println("===================================================================================");

        /**
         * 3. Searching...
         */
        String id = "we";
        // Query queryId = new TermQuery(new Term("id", id)) would return nothing here, because the
        // "id" field is NOT_ANALYZED and only matches the exact term, e.g. id = "We are young.txt".
        Query queryId = new TermQuery(new Term("name", id));
        TopDocs hitsForId = searcher.search(queryId, null, 100);
        if (hitsForId != null && hitsForId.totalHits > 0)
        {
            System.out.println("Searched " + hitsForId.totalHits + " docs for id " + id + "...");

            for (int j = 0; j < hitsForId.scoreDocs.length; j++)
            {
                System.out.println("Score doc for id " + j + " is " + hitsForId.scoreDocs[j].toString());
            }
        }
        System.out.println("===================================================================================");

        String keyword = "we are yy";
        FuzzyLikeThisQuery fuzzyLikeThisQuery = new FuzzyLikeThisQuery(100, analyzer);
        fuzzyLikeThisQuery.addTerms(keyword, "name", 0.8F, 0);

        // FuzzyLikeThisQuery is not part of Lucene core; it lives in the contrib queries module.
        // By default QueryScorer's internal WeightedSpanTermExtractor cannot handle it, and
        // getBestFragment would return null. Calling rewrite() produces a query that
        // WeightedSpanTermExtractor does recognize, so the matched keywords can be highlighted.
        Query query = fuzzyLikeThisQuery.rewrite(reader);

        // Wrap matched keywords for highlighting. If the content itself already contains
        // <span></span>, the highlighted output may be garbled.
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span>", "</span>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

        TopDocs hits = searcher.search(fuzzyLikeThisQuery, null, 100);

        if (hits != null && hits.totalHits > 0)
        {
            System.out.println("Searched " + hits.totalHits + " docs for keyword " + keyword + "...");

            ScoreDoc[] sDocs = hits.scoreDocs;

            Document docMatched = null;
            for (int j = 0; j < sDocs.length; j++)
            {
                System.out.println("Score doc " + j + " is " + sDocs[j].toString());

                docMatched = searcher.doc(sDocs[j].doc);

                TokenStream tokenStream = analyzer.tokenStream("name", new StringReader(docMatched.get("name")));
                String str = highlighter.getBestFragment(tokenStream, docMatched.get("name"));

                System.out.println("Score doc " + j + " highlighted to: " + str);
            }
        }

        reader.close();
        indexDir.close();
    }
}
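The "id" versus "name" distinction noted in the comments can be seen directly with a few TermQuery lookups. The following is only a minimal sketch, assuming it runs inside the same main method with the searcher built above: the NOT_ANALYZED "id" field stores the whole file name as a single unmodified term, while the ANALYZED "name" field was tokenized and lowercased by StandardAnalyzer at index time.

// Minimal sketch, continuing from the searcher above (not a separate program).
// The NOT_ANALYZED "id" field only matches the exact stored term.
TopDocs none = searcher.search(new TermQuery(new Term("id", "we")), 10);
TopDocs exact = searcher.search(new TermQuery(new Term("id", "We are young.txt")), 10);
// The ANALYZED "name" field matches the lowercased token "we" produced by StandardAnalyzer.
TopDocs byToken = searcher.search(new TermQuery(new Term("name", "we")), 10);
System.out.println(none.totalHits + " / " + exact.totalHits + " / " + byToken.totalHits); // expected: 0 / 1 / 1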
The output is as follows:
Max doc:13
List files below....
Doc 0 Name: ab.txt, Path: D:\files\ab.txt, Size: 104857595904b
Doc 1 Name: abc.txt, Path: D:\files\abc.txt, Size: 104857595904b
Doc 2 Name: M_1.txt, Path: D:\files\M_1.txt, Size: 104857595904b
Doc 3 Name: M_11.txt, Path: D:\files\M_11.txt, Size: 104857595904b
Doc 4 Name: We are young.txt, Path: D:\files\We are young.txt, Size: 104857595904b
Doc 5 Name: 什么是微博.txt, Path: D:\files\什么是微博.txt, Size: 104857595904b
Doc 6 Name: 喝水不忘挖井人.txt, Path: D:\files\喝水不忘挖井人.txt, Size: 104857595904b
Doc 7 Name: 天蒼蒼野茫茫.txt, Path: D:\files\天蒼蒼野茫茫.txt, Size: 104857595904b
Doc 8 Name: 怎么使用lucene.txt, Path: D:\files\怎么使用lucene.txt, Size: 104857595904b
Doc 9 Name: 神馬是一種馬嗎.txt, Path: D:\files\神馬是一種馬嗎.txt, Size: 104857595904b
Doc 10 Name: 蒼井.txt, Path: D:\files\蒼井.txt, Size: 104857595904b
Doc 11 Name: 蒼白 - 副本.txt, Path: D:\files\蒼白 - 副本.txt, Size: 104857595904b
Doc 12 Name: 蒼白.txt, Path: D:\files\蒼白.txt, Size: 104857595904b
===================================================================================
Searched 1 docs for id we...
Score doc for id 0 is doc=4 score=1.7948763 shardIndex=-1
===================================================================================
Searched 1 docs for keyword we are yy...
Score doc 0 is doc=4 score=0.625 shardIndex=-1
Score doc 0 highlighted to: <span>We</span> are young.txt

Only "We" is wrapped in the highlight tags: "are" is in StandardAnalyzer's default English stop word set and is dropped both at index time and when FuzzyLikeThisQuery analyzes the keyword, and at a minimum similarity of 0.8 the two-character term "yy" matches no indexed term, so the rewritten query effectively contains only the term name:we.
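The rewrite() call made before constructing the QueryScorer is what makes that last highlighted line possible. As a quick check (again just a sketch, reusing the searcher, analyzer and simpleHTMLFormatter from the code above), scoring against the un-rewritten FuzzyLikeThisQuery produces no fragment:

// Sketch: highlighting directly with the un-rewritten FuzzyLikeThisQuery.
// WeightedSpanTermExtractor cannot extract terms from it, so getBestFragment returns null.
Highlighter rawHighlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(fuzzyLikeThisQuery));
String rawFragment = rawHighlighter.getBestFragment(
        analyzer.tokenStream("name", new StringReader("We are young.txt")),
        "We are young.txt");
System.out.println(rawFragment); // expected: null
// With the rewritten query, as in the code above, the same call returns
// "<span>We</span> are young.txt".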