Lucene 5 學習－各種分詞器簡用（中文分詞、標準分詞、簡單分詞、停用分詞、空格分詞）

本文轉載自查看原文 2015-11-11 09:38 2080 標准分詞/ （中文分詞/ 空格分詞）/ 停用分詞/ 各種分詞器簡用/ 簡單分詞/ lucene5學習

// 與 Lucene 5 兼容的 mmseg4j.jar 包下載地址： http://download.csdn.net/detail/u012720534/9259621

package lucene5;

import java.io.IOException;

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.core.SimpleAnalyzer;

import org.apache.lucene.analysis.core.StopAnalyzer;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;

public class AnalyzerUtils {

public static void main(String[] args) {

String str = "hello, i'm a boy, and i like play basketball" ;

String ztr = "你好，我是一個男孩，我喜歡打籃球" ;

Analyzer a = new StandardAnalyzer() ; //標准分詞器

Analyzer b = new SimpleAnalyzer() ; //簡單分詞器

Analyzer c = new StopAnalyzer() ; //停用詞分詞器

Analyzer d = new WhitespaceAnalyzer() ; //空格分詞器

Analyzer analyzer = new MMSegAnalyzer() ; //中文分詞器

display(str,a) ;

System. out .println( "-----------------------------" );

display(str,b) ;

System. out .println( "-----------------------------" );

display(str,c) ;

System. out .println( "-----------------------------" );

display(str,d) ;

System. out .println( "-----------------------------" );

display(ztr,analyzer) ;

}

public static void display(String str, Analyzer a) {

TokenStream stream = null ;

try {

stream = a.tokenStream( "renyi" , new StringReader(str)) ;

PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute. class ) ; //保存位置

OffsetAttribute oa = stream.addAttribute(OffsetAttribute. class ) ; //保存辭與詞之間偏移量

CharTermAttribute cta = stream.addAttribute(CharTermAttribute. class ) ; //保存響應詞匯

TypeAttribute ta = stream.addAttribute(TypeAttribute. class ) ; //保存類型

//在lucene 4 以上要加入reset 和 end方法

stream.reset() ;

while (stream.incrementToken()) {

System. out .println(pia.getPositionIncrement() + ":[" + cta.toString() + "]:" + oa.startOffset() + "->" + oa.endOffset() + ":" + ta.type());

}

stream.end() ;

} catch (IOException e) {

e.printStackTrace();

}

免責聲明！

本站轉載的文章僅供個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找 Lucene的中文分詞器IKAnalyzer Lucene 自定義分詞器 es學習(三)：分詞器介紹以及中文分詞器ik的安裝與使用 30.IK中文分詞器的安裝和簡單使用有哪些較好的中文分詞器 ElasticSearch中文分詞器-IK分詞器的使用 Elasticsearch的索引模塊（正排索引、倒排索引、索引分析模塊Analyzer、索引和搜索、停用詞、中文分詞器） Apache Lucene(全文檢索引擎)—分詞器 Lucene IK分詞器集成，詞典擴展 solr配置中文分詞器