Lucene 5 學習-各種分詞器簡用(中文分詞,標準分詞,簡單分詞,停用詞分詞,空格分詞)


//lucene5兼容的mmseg4j.jar包下載地址: http://download.csdn.net/detail/u012720534/9259621
 
package lucene5;
 
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
 
public class AnalyzerUtils {
      
       public static void main(String[] args) {
            String str = "hello, i'm a boy, and i like play basketball" ;
            String ztr = "你好,我是一個男孩,我喜歡打籃球" ;
            Analyzer a = new StandardAnalyzer() ;      //標准分詞器
            Analyzer b = new SimpleAnalyzer() ;        //簡單分詞器
            Analyzer c = new StopAnalyzer() ;          //停用詞分詞器
            Analyzer d = new WhitespaceAnalyzer() ; //空格分詞器
            Analyzer analyzer = new MMSegAnalyzer() ; //中文分詞器
             display(str,a) ;
            System. out .println( "-----------------------------" );
             display(str,b) ;
            System. out .println( "-----------------------------" );
             display(str,c) ;
            System. out .println( "-----------------------------" );
             display(str,d) ;
            System. out .println( "-----------------------------" );
             display(ztr,analyzer) ;
      }
      
       public static void display(String str, Analyzer a) {
            TokenStream stream = null ;
             try {
                  stream = a.tokenStream( "renyi" , new StringReader(str)) ;
                  PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute. class ) ;  //保存位置
                  OffsetAttribute oa = stream.addAttribute(OffsetAttribute. class ) ; //保存辭與詞之間偏移量
                  CharTermAttribute cta = stream.addAttribute(CharTermAttribute. class ) ; //保存響應詞匯
                  TypeAttribute ta = stream.addAttribute(TypeAttribute. class ) ; //保存類型
                   //在lucene 4 以上  要加入reset 和  end方法
                  stream.reset() ;
                   while (stream.incrementToken()) {
                        System. out .println(pia.getPositionIncrement() + ":[" + cta.toString() + "]:" + oa.startOffset() + "->" + oa.endOffset() + ":" + ta.type());
                  }
                  stream.end() ;
            } catch (IOException e) {
                  e.printStackTrace();
            }
      }
}
 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM