本文引自 https://blog.csdn.net/caideb/article/details/81632154
cnblogs 的排版好看很多,所以在這裡建一篇分享博客。
-----------------------------------------------------------------------------------------------
擴展字典中的詞會被篩選出來(作為完整的詞切分),擴展停止詞字典中的詞會被過濾掉
1.沒有加入擴展字典、停止詞字典時的用法
1) ik分詞器
[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725

{ "tokens" : [ { "token" : "自古", "start_offset" : 0, "end_offset" : 2, "type" : "CN_WORD", "position" : 0 }, { "token" : "刀", "start_offset" : 2, "end_offset" : 3, "type" : "CN_WORD", "position" : 1 }, { "token" : "扇", "start_offset" : 3, "end_offset" : 4, "type" : "CN_WORD", "position" : 2 }, { "token" : "過", "start_offset" : 4, "end_offset" : 5, "type" : "CN_CHAR", "position" : 3 }, { "token" : "背", "start_offset" : 5, "end_offset" : 6, "type" : "CN_WORD", "position" : 4 }, { "token" : "刺", "start_offset" : 6, "end_offset" : 7, "type" : "CN_CHAR", "position" : 5 } ] }
2) ik_smart分詞器
[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_smart","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725

{ "tokens" : [ { "token" : "自古", "start_offset" : 0, "end_offset" : 2, "type" : "CN_WORD", "position" : 0 }, { "token" : "刀", "start_offset" : 2, "end_offset" : 3, "type" : "CN_WORD", "position" : 1 }, { "token" : "扇", "start_offset" : 3, "end_offset" : 4, "type" : "CN_WORD", "position" : 2 }, { "token" : "過", "start_offset" : 4, "end_offset" : 5, "type" : "CN_CHAR", "position" : 3 }, { "token" : "背", "start_offset" : 5, "end_offset" : 6, "type" : "CN_WORD", "position" : 4 }, { "token" : "刺", "start_offset" : 6, "end_offset" : 7, "type" : "CN_CHAR", "position" : 5 } ] }
3) ik_max_word分詞器
[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_max_word","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725

{ "tokens" : [ { "token" : "自古", "start_offset" : 0, "end_offset" : 2, "type" : "CN_WORD", "position" : 0 }, { "token" : "刀", "start_offset" : 2, "end_offset" : 3, "type" : "CN_WORD", "position" : 1 }, { "token" : "扇", "start_offset" : 3, "end_offset" : 4, "type" : "CN_WORD", "position" : 2 }, { "token" : "過", "start_offset" : 4, "end_offset" : 5, "type" : "CN_CHAR", "position" : 3 }, { "token" : "背", "start_offset" : 5, "end_offset" : 6, "type" : "CN_WORD", "position" : 4 }, { "token" : "刺", "start_offset" : 6, "end_offset" : 7, "type" : "CN_CHAR", "position" : 5 } ] }
2.加入自定義字典
擴展字典:用於創建分詞的字典
停止字典:用於過濾的字典,也就是說,該字典中的單詞或字符串都會從分詞結果中被過濾掉
test.dic
刀扇
背刺
teststop.dic
自古
過
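/analysis-ik/config/IKAnalyzer.cfg.xml
在該配置文件中,將 test.dic 加入 ext_dict(擴展字典)項、teststop.dic 加入 ext_stopwords(擴展停止詞字典)項,多個字典之間以分號分隔;修改後需重啟 Elasticsearch 才會生效。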
1) ik分詞器
[root@localhost config]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 728

{ "tokens" : [ { "token" : "刀扇", "start_offset" : 2, "end_offset" : 4, "type" : "CN_WORD", "position" : 0 }, { "token" : "刀", "start_offset" : 2, "end_offset" : 3, "type" : "CN_WORD", "position" : 1 }, { "token" : "扇", "start_offset" : 3, "end_offset" : 4, "type" : "CN_WORD", "position" : 2 }, { "token" : "背刺", "start_offset" : 5, "end_offset" : 7, "type" : "CN_WORD", "position" : 3 }, { "token" : "背", "start_offset" : 5, "end_offset" : 6, "type" : "CN_WORD", "position" : 4 }, { "token" : "刺", "start_offset" : 6, "end_offset" : 7, "type" : "CN_CHAR", "position" : 5 } ] }
2) ik_smart分詞器
[root@localhost config]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_smart","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 260

{ "tokens" : [ { "token" : "刀扇", "start_offset" : 2, "end_offset" : 4, "type" : "CN_WORD", "position" : 0 }, { "token" : "背刺", "start_offset" : 5, "end_offset" : 7, "type" : "CN_WORD", "position" : 1 } ] }
3) ik_max_word分詞器
[root@localhost config]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_max_word","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 728

{ "tokens" : [ { "token" : "刀扇", "start_offset" : 2, "end_offset" : 4, "type" : "CN_WORD", "position" : 0 }, { "token" : "刀", "start_offset" : 2, "end_offset" : 3, "type" : "CN_WORD", "position" : 1 }, { "token" : "扇", "start_offset" : 3, "end_offset" : 4, "type" : "CN_WORD", "position" : 2 }, { "token" : "背刺", "start_offset" : 5, "end_offset" : 7, "type" : "CN_WORD", "position" : 3 }, { "token" : "背", "start_offset" : 5, "end_offset" : 6, "type" : "CN_WORD", "position" : 4 }, { "token" : "刺", "start_offset" : 6, "end_offset" : 7, "type" : "CN_CHAR", "position" : 5 } ] }