Lucene synonym search is built on the token information recorded by PositionIncrementAttribute and CharTermAttribute: each synonym is emitted as an extra token at the same position as the original term, i.e. with a position increment of 0. The Lucene version used here is 4.8.0.
1. First, implement the synonym filter:
package lucene_index;

import java.io.IOException;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

public class MySameFiter extends TokenFilter {

    private Stack<String> stack = null;                   // synonyms pending for the current token
    private CharTermAttribute cta = null;                 // term text of the current token
    private PositionIncrementAttribute position = null;   // position increment of the current token
    private AttributeSource.State current;                // captured state of the original token
    private Map<String, String[]> map;                    // synonym table: word -> synonyms

    protected MySameFiter(TokenStream input, Map<String, String[]> map) {
        super(input);
        stack = new Stack<>();
        cta = input.addAttribute(CharTermAttribute.class);
        position = input.addAttribute(PositionIncrementAttribute.class);
        this.map = map;
    }

    @Override
    public boolean incrementToken() throws IOException {
        // emit any pending synonyms at the same position as the original token
        if (!stack.isEmpty()) {
            String word = stack.pop();
            restoreState(current);
            cta.setEmpty();
            cta.append(word);
            position.setPositionIncrement(0); // increment 0 = same position as the original term
            return true;
        }
        // advance to the next token of the underlying stream
        if (!input.incrementToken()) {
            return false;
        }
        // if the current term has synonyms, capture its state so the
        // synonyms can be restored onto it in the following calls
        if (getSameWords(cta.toString())) {
            current = captureState();
        }
        return true;
    }

    private boolean getSameWords(String word) {
        String[] arr = map.get(word);
        if (arr != null) {
            for (String w : arr) {
                stack.push(w);
            }
            return true;
        }
        return false;
    }
}
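To see what the filter produces, you can run it over a simple tokenizer and print each term together with its position increment; synonyms show up with an increment of 0. A minimal sketch (the synonym entry and sample text here are made up for illustration):

package lucene_index;

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class SynonymDemo {
    public static void main(String[] args) throws Exception {
        // hypothetical synonym entry: 建行 -> 建設銀行, 中國建設銀行
        Map<String, String[]> map = new HashMap<String, String[]>();
        map.put("建行", new String[] { "建設銀行", "中國建設銀行" });
        TokenStream ts = new MySameFiter(
                new WhitespaceTokenizer(Version.LUCENE_48, new StringReader("我 愛 建行")), map);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute pos = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // the synonyms of 建行 print with posIncr=0, i.e. at the same position
            System.out.println(term + " (posIncr=" + pos.getPositionIncrement() + ")");
        }
        ts.end();
        ts.close();
    }
}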
2. The custom analyzer:
package lucene_index;

import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;

public class StopWrodsAnalyse extends Analyzer {

    private Map<String, String[]> map = new HashMap<String, String[]>();

    public StopWrodsAnalyse(Map<String, String[]> map) {
        this.map = map;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // IK does the tokenizing; the custom synonym filter is chained behind it
        Tokenizer source = new IKTokenizer(reader, false);
        TokenStream stream = new MySameFiter(source, map);
        return new TokenStreamComponents(source, stream);
    }
}
3. The test:

package lucene_index;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class MainTest {

    public static void main(String[] args) throws IOException, ParseException {
        // load the synonym table: the first column is the headword, the rest are synonyms
        LineIterator it = FileUtils.lineIterator(
                new File("E://searchwork_custom//data_index//ConfigFile//ExpansionWord.csv"), "gbk");
        Map<String, String[]> map = new HashMap<String, String[]>();
        while (it.hasNext()) {
            String word = it.nextLine();
            String[] wordArr = word.replace("-,", "").trim().split(",");
            if (map.containsKey(wordArr[0])) {
                continue;
            }
            map.put(wordArr[0], wordArr);
        }
        // index one document per headword, analyzed with the synonym-aware analyzer
        Analyzer analyzer = new StopWrodsAnalyse(map);
        Directory directory = FSDirectory.open(new File("E:\\luceneindex"));
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        IndexWriter writer = new IndexWriter(directory, config);
        Collection<Document> coll = new ArrayList<Document>();
        for (Map.Entry<String, String[]> entry : map.entrySet()) {
            Document doc = new Document();
            doc.add(new TextField("name", entry.getKey(), Store.YES));
            coll.add(doc);
        }
        writer.addDocuments(coll);
        writer.commit();
        writer.close();
        IndexSearcher searcher = new IndexSearcher(
                DirectoryReader.open(FSDirectory.open(new File("E:\\luceneindex"))));
        search(searcher);
    }

    public static void search(IndexSearcher searcher) throws IOException {
        Query q = new TermQuery(new Term("name", "中國建設銀行"));
        System.out.println(q);
        TopDocs topDocs = searcher.search(q, 10);
        ScoreDoc[] docs = topDocs.scoreDocs;
        for (int i = 0; i < docs.length; i++) {
            Document d = searcher.doc(docs[i].doc);
            System.out.println(d.get("name"));
        }
    }
}
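For reference, the loader above expects each line of ExpansionWord.csv to hold a headword followed by its synonyms, comma-separated (any "-," marker is stripped first). A hypothetical sample in that layout:

建設銀行,建行,中國建設銀行
工商銀行,工行,中國工商銀行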
Test result: because the synonyms are indexed at the same position, searching for 建行, 建設銀行, or 中國建設銀行 hits the same documents; any one of the three terms matches a document indexed under any of the others.
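Note that the raw TermQuery above matches only the literal term. To run the query text through the same analysis chain at search time, Lucene's classic QueryParser can be used instead; a minimal sketch of a method that could be added to MainTest (searchParsed is a hypothetical helper name):

import org.apache.lucene.queryparser.classic.QueryParser;

public static void searchParsed(IndexSearcher searcher, Analyzer analyzer)
        throws IOException, ParseException {
    // the query string is analyzed with IK + the synonym filter, like at index time
    QueryParser parser = new QueryParser(Version.LUCENE_48, "name", analyzer);
    Query q = parser.parse("中國建設銀行");
    TopDocs topDocs = searcher.search(q, 10);
    for (ScoreDoc sd : topDocs.scoreDocs) {
        System.out.println(searcher.doc(sd.doc).get("name"));
    }
}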
