lucene5学习-各种分词器简用(中文分词,标准分词,简单分词,停用分词,空格分词)


//lucene5兼容的mmseg4j.jar包下载地址: http://download.csdn.net/detail/u012720534/9259621
 
package lucene5;
 
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
 
public class AnalyzerUtils {
      
       public static void main(String[] args) {
            String str = "hello, i'm a boy, and i like play basketball" ;
            String ztr = "你好,我是一个男孩,我喜欢打篮球" ;
            Analyzer a = new StandardAnalyzer() ;      //标准分词器
            Analyzer b = new SimpleAnalyzer() ;        //简单分词器
            Analyzer c = new StopAnalyzer() ;          //停用词分词器
            Analyzer d = new WhitespaceAnalyzer() ; //空格分词器
            Analyzer analyzer = new MMSegAnalyzer() ; //中文分词器
             display(str,a) ;
            System. out .println( "-----------------------------" );
             display(str,b) ;
            System. out .println( "-----------------------------" );
             display(str,c) ;
            System. out .println( "-----------------------------" );
             display(str,d) ;
            System. out .println( "-----------------------------" );
             display(ztr,analyzer) ;
      }
      
       public static void display(String str, Analyzer a) {
            TokenStream stream = null ;
             try {
                  stream = a.tokenStream( "renyi" , new StringReader(str)) ;
                  PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute. class ) ;  //保存位置
                  OffsetAttribute oa = stream.addAttribute(OffsetAttribute. class ) ; //保存辞与词之间偏移量
                  CharTermAttribute cta = stream.addAttribute(CharTermAttribute. class ) ; //保存响应词汇
                  TypeAttribute ta = stream.addAttribute(TypeAttribute. class ) ; //保存类型
                   //在lucene 4 以上  要加入reset 和  end方法
                  stream.reset() ;
                   while (stream.incrementToken()) {
                        System. out .println(pia.getPositionIncrement() + ":[" + cta.toString() + "]:" + oa.startOffset() + "->" + oa.endOffset() + ":" + ta.type());
                  }
                  stream.end() ;
            } catch (IOException e) {
                  e.printStackTrace();
            }
      }
}
 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM