// Download URL for a Lucene 5-compatible mmseg4j jar: http://download.csdn.net/detail/u012720534/9259621
package
lucene5;
import
java.io.IOException;
import
java.io.StringReader;
import
org.apache.lucene.analysis.Analyzer;
import
org.apache.lucene.analysis.TokenStream;
import
org.apache.lucene.analysis.core.SimpleAnalyzer;
import
org.apache.lucene.analysis.core.StopAnalyzer;
import
org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import
org.apache.lucene.analysis.standard.StandardAnalyzer;
import
org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import
org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import
org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import
org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import
com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
/**
 * Small demo that runs the same text through several Lucene analyzers and
 * prints every token each analyzer produces, together with its position
 * increment, character offsets and token type.
 */
public class AnalyzerUtils {

    /** Line printed between the outputs of two consecutive analyzers. */
    private static final String SEPARATOR = "-----------------------------";

    /**
     * Tokenizes an English sample with the standard, simple, stop and
     * whitespace analyzers, then a Chinese sample with the MMSeg analyzer,
     * printing the tokens of each run.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String str = "hello, i'm a boy, and i like play basketball";
        String ztr = "你好,我是一個男孩,我喜歡打籃球";

        // Analyzers are Closeable; try-with-resources releases them once the demo is done.
        // All of them are still created up front, in the same order as before.
        try (Analyzer a = new StandardAnalyzer();        // standard analyzer
             Analyzer b = new SimpleAnalyzer();          // simple analyzer
             Analyzer c = new StopAnalyzer();            // stop-word analyzer
             Analyzer d = new WhitespaceAnalyzer();      // whitespace analyzer
             Analyzer analyzer = new MMSegAnalyzer()) {  // Chinese analyzer
            display(str, a);
            printSeparator();
            display(str, b);
            printSeparator();
            display(str, c);
            printSeparator();
            display(str, d);
            printSeparator();
            display(ztr, analyzer);
        }
    }

    /**
     * Tokenizes {@code str} with the given analyzer and prints one line per
     * token in the form
     * {@code positionIncrement:[term]:startOffset->endOffset:type}.
     *
     * <p>An {@link IOException} raised while tokenizing is not propagated;
     * its stack trace is printed instead.
     *
     * @param str the text to tokenize
     * @param a   the analyzer used to produce the token stream
     */
    public static void display(String str, Analyzer a) {
        // The stream must be closed after use: otherwise its resources leak and the
        // analyzer refuses to hand out another stream for the same field.
        try (TokenStream stream = a.tokenStream("renyi", new StringReader(str))) {
            // Position increment relative to the previous token.
            PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
            // Start/end character offsets of the token.
            OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
            // The term text itself.
            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
            // The token type.
            TypeAttribute ta = stream.addAttribute(TypeAttribute.class);

            // Since Lucene 4, reset() must be called before consuming and end() afterwards.
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(pia.getPositionIncrement()
                        + ":[" + cta.toString() + "]:"
                        + oa.startOffset() + "->" + oa.endOffset()
                        + ":" + ta.type());
            }
            stream.end();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Prints the separator line between two analyzer outputs. */
    private static void printSeparator() {
        System.out.println(SEPARATOR);
    }
}