ik中文分詞器及拼音分詞器試用


安裝

./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.6.4/elasticsearch-analysis-ik-5.6.4.zip
./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v5.6.4/elasticsearch-analysis-pinyin-5.6.4.zip

安裝後需要重啟elasticsearch服務

查看當前已安裝插件

GET _cat/plugins

結果
node01 analysis-ik     5.6.4
node01 analysis-pinyin 5.6.4

測試中文分詞器,支持ik_max_word和ik_smart兩種方式

GET _analyze
{
  "analyzer":"ik_max_word",
  "text":"中華人民共和國國歌"
}
結果
{
  "tokens": [
    {
      "token": "中華人民共和國",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "中華人民",
      "start_offset": 0,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "中華",
      "start_offset": 0,
      "end_offset": 2,
      "type": "CN_WORD",
      "position": 2
    },
    {
      "token": "華人",
      "start_offset": 1,
      "end_offset": 3,
      "type": "CN_WORD",
      "position": 3
    },
    {
      "token": "人民共和國",
      "start_offset": 2,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 4
    },
    {
      "token": "人民",
      "start_offset": 2,
      "end_offset": 4,
      "type": "CN_WORD",
      "position": 5
    },
    {
      "token": "共和國",
      "start_offset": 4,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 6
    },
    {
      "token": "共和",
      "start_offset": 4,
      "end_offset": 6,
      "type": "CN_WORD",
      "position": 7
    },
    {
      "token": "國",
      "start_offset": 6,
      "end_offset": 7,
      "type": "CN_CHAR",
      "position": 8
    },
    {
      "token": "國歌",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 9
    }
  ]
}
使用ik_smart,則會盡可能少的返回詞語:
{
  "tokens": [
    {
      "token": "中華人民共和國",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "國歌",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 1
    }
  ]
}

ik分詞器支持自定義詞庫

vi config/IKAnalyzer.cfg.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer 擴展配置</comment>
    <!--用戶可以在這裡配置自己的擴展字典 -->
    <entry key="ext_dict">zhouls.dic</entry>
    <!--用戶可以在這裡配置自己的擴展停止詞字典-->
    <entry key="ext_stopwords"></entry>
    <!--用戶可以在這裡配置遠程擴展字典 -->
    <!-- <entry key="remote_ext_dict">words_location</entry> -->
    <!--用戶可以在這裡配置遠程擴展停止詞字典-->
    <!-- <entry key="remote_ext_stopwords">words_location</entry> -->
</properties>

#配置完成需要重啟服務

簡單測試拼音分詞

PUT test08
{
  "index": {
    "analysis": {
      "analyzer": {
        "pinyin_analyzer": {
          "tokenizer": "my_pinyin",
          "filter": "word_delimiter"
        }
      },
      "tokenizer": {
        "my_pinyin": {
          "type": "pinyin",
          "first_letter": "none",
          "padding_char": " "
        }
      }
    }
  }
}

GET test08/_analyze
{
  "text":"劉德華",
  "analyzer":"pinyin_analyzer"
}
結果
{
  "tokens": [
    {
      "token": "liu",
      "start_offset": 0,
      "end_offset": 1,
      "type": "word",
      "position": 0
    },
    {
      "token": "ldh",
      "start_offset": 0,
      "end_offset": 3,
      "type": "word",
      "position": 0
    },
    {
      "token": "de",
      "start_offset": 1,
      "end_offset": 2,
      "type": "word",
      "position": 1
    },
    {
      "token": "hua",
      "start_offset": 2,
      "end_offset": 3,
      "type": "word",
      "position": 2
    }
  ]
}

同時支持中文和拼音的分詞器

PUT test06
{
  "settings":{
    "number_of_shards":"1",
    "index.refresh_interval":"15s",
    "index":{
      "analysis":{
        "analyzer":{
           "ik_pinyin_analyzer":{
            "type":"custom",
            "tokenizer":"ik_smart",
            "filter":"pinyin_filter"
          }
        },
        "filter":{
          "pinyin_filter":{
            "type":"pinyin",
            "keep_first_letter": false
          }
        }
      }
    }
  },
  "mappings": {
    "doc":{
      "properties": {
        "name":{
          "type": "text",
          "analyzer": "ik_pinyin_analyzer"
        }
      }
    }
  }
}

POST test06/_analyze
{
  "analyzer": "ik_pinyin_analyzer",
  "text":"中華人民共和國國歌"
}
結果
{
  "tokens": [
    {
      "token": "zhong",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 0
    },
    {
      "token": "hua",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 1
    },
    {
      "token": "ren",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 2
    },
    {
      "token": "min",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 3
    },
    {
      "token": "gong",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 4
    },
    {
      "token": "he",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 5
    },
    {
      "token": "guo",
      "start_offset": 0,
      "end_offset": 7,
      "type": "CN_WORD",
      "position": 6
    },
    {
      "token": "guo",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 7
    },
    {
      "token": "ge",
      "start_offset": 7,
      "end_offset": 9,
      "type": "CN_WORD",
      "position": 8
    }
  ]
}

 

參考文檔:

https://blog.csdn.net/u013905744/article/details/80935846

https://www.cnblogs.com/xing901022/p/5910139.html

https://blog.csdn.net/qq_28018283/article/details/80396937

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM