elasticsearch ik分詞插件的擴展字典和擴展停止詞字典用法


本文引自 https://blog.csdn.net/caideb/article/details/81632154

cnblog的排版好看很多,所以在這裡建一篇分享博客。

-----------------------------------------------------------------------------------------------

 

擴展字典中的詞會被當作完整的詞切分出來,擴展停止詞字典中的詞則會從分詞結果中被過濾掉

1.沒有加入擴展字典和擴展停止詞字典時的分詞結果

1) ik分詞器

[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725
{
  "tokens" : [ {
    "token" : "自古",
    "start_offset" : 0,
    "end_offset" : 2,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "刀",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "CN_WORD",
    "position" : 1
  }, {
    "token" : "扇",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 2
  }, {
    "token" : "過",
    "start_offset" : 4,
    "end_offset" : 5,
    "type" : "CN_CHAR",
    "position" : 3
  }, {
    "token" : "背",
    "start_offset" : 5,
    "end_offset" : 6,
    "type" : "CN_WORD",
    "position" : 4
  }, {
    "token" : "刺",
    "start_offset" : 6,
    "end_offset" : 7,
    "type" : "CN_CHAR",
    "position" : 5
  } ]
}

 

2) ik_smart分詞器

[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_smart","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725
{
  "tokens" : [ {
    "token" : "自古",
    "start_offset" : 0,
    "end_offset" : 2,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "刀",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "CN_WORD",
    "position" : 1
  }, {
    "token" : "扇",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 2
  }, {
    "token" : "過",
    "start_offset" : 4,
    "end_offset" : 5,
    "type" : "CN_CHAR",
    "position" : 3
  }, {
    "token" : "背",
    "start_offset" : 5,
    "end_offset" : 6,
    "type" : "CN_WORD",
    "position" : 4
  }, {
    "token" : "刺",
    "start_offset" : 6,
    "end_offset" : 7,
    "type" : "CN_CHAR",
    "position" : 5
  } ]
}

 

3) ik_max_word分詞器

[root@localhost custom]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_max_word","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 725
{
  "tokens" : [ {
    "token" : "自古",
    "start_offset" : 0,
    "end_offset" : 2,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "刀",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "CN_WORD",
    "position" : 1
  }, {
    "token" : "扇",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 2
  }, {
    "token" : "過",
    "start_offset" : 4,
    "end_offset" : 5,
    "type" : "CN_CHAR",
    "position" : 3
  }, {
    "token" : "背",
    "start_offset" : 5,
    "end_offset" : 6,
    "type" : "CN_WORD",
    "position" : 4
  }, {
    "token" : "刺",
    "start_offset" : 6,
    "end_offset" : 7,
    "type" : "CN_CHAR",
    "position" : 5
  } ]
}

 

2.加入自定義字典

擴展字典:用於創建分詞的字典

停止詞字典:用於過濾的字典,也就是說,該字典中的單詞或字符串都會從分詞結果中被過濾掉

test.dic

刀扇
背刺

 

teststop.dic

自古
過

 

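/analysis-ik/config/IKAnalyzer.cfg.xml

在該配置文件中,用 ext_dict 指定擴展字典、用 ext_stopwords 指定擴展停止詞字典,例如(字典路徑相對於 config 目錄,請依實際存放位置調整;修改後需重啟 elasticsearch 才會生效):

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer 擴展配置</comment>
    <!-- 擴展字典 -->
    <entry key="ext_dict">custom/test.dic</entry>
    <!-- 擴展停止詞字典 -->
    <entry key="ext_stopwords">custom/teststop.dic</entry>
</properties>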

 

1) ik分詞器

[root@localhost config]# curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 728
{
  "tokens" : [ {
    "token" : "刀扇",
    "start_offset" : 2,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "刀",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "CN_WORD",
    "position" : 1
  }, {
    "token" : "扇",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 2
  }, {
    "token" : "背刺",
    "start_offset" : 5,
    "end_offset" : 7,
    "type" : "CN_WORD",
    "position" : 3
  }, {
    "token" : "背",
    "start_offset" : 5,
    "end_offset" : 6,
    "type" : "CN_WORD",
    "position" : 4
  }, {
    "token" : "刺",
    "start_offset" : 6,
    "end_offset" : 7,
    "type" : "CN_CHAR",
    "position" : 5
  } ]
}

 

2) ik_smart分詞器

[root@localhost config]#  curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_smart","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 260
{
  "tokens" : [ {
    "token" : "刀扇",
    "start_offset" : 2,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "背刺",
    "start_offset" : 5,
    "end_offset" : 7,
    "type" : "CN_WORD",
    "position" : 1
  } ]
}

 

3) ik_max_word分詞器

[root@localhost config]#  curl -i -X GET -H 'Content-type:application/json' -d '{"analyzer":"ik_max_word","text":"自古刀扇過背刺"}' http://192.168.0.110:9200/_analyze?pretty
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 728
{
  "tokens" : [ {
    "token" : "刀扇",
    "start_offset" : 2,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 0
  }, {
    "token" : "刀",
    "start_offset" : 2,
    "end_offset" : 3,
    "type" : "CN_WORD",
    "position" : 1
  }, {
    "token" : "扇",
    "start_offset" : 3,
    "end_offset" : 4,
    "type" : "CN_WORD",
    "position" : 2
  }, {
    "token" : "背刺",
    "start_offset" : 5,
    "end_offset" : 7,
    "type" : "CN_WORD",
    "position" : 3
  }, {
    "token" : "背",
    "start_offset" : 5,
    "end_offset" : 6,
    "type" : "CN_WORD",
    "position" : 4
  }, {
    "token" : "刺",
    "start_offset" : 6,
    "end_offset" : 7,
    "type" : "CN_CHAR",
    "position" : 5
  } ]
}

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM