elasticsearch-cn-out-of-box
======
https://github.com/hangxin1940/elasticsearch-cn-out-of-box

An out-of-the-box Elasticsearch distribution that bundles a set of practical plugins and ready-made configuration.
- elasticsearch 1.4.2
- servicewrapper 0.90
Site plugins:
- oob
- bigdesk 2.5.0
- head
- kopf 1.2.5
- segmentspy
- inquisitor
- paramedic
- hq
Word-segmentation (analysis) plugins:
- analysis-smartcn 2.3.1
- analysis-mmseg 1.2.2
- analysis-ik 1.2.9
- analysis-stconvert 1.3.0
- analysis-pinyin 1.2.2
- analysis-ansj 1.x.1
- analysis-string2int 1.3.0
- analysis-combo 1.5.1
Other plugins:
- jetty oob-1.4.2
- mapper-attachments 2.4.1
The inquisitor plugin is extended with, among other things, a preview of the custom analyzers.
Usage
------
Open the plugin in a browser: http://localhost:9200/_plugin/oob
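To check that the node itself is up before exploring the plugins, a couple of plain REST calls are enough (a minimal sketch; assumes the default ports from the config below):

```sh
# Node banner: name, version, and HTTP 200 mean the node is running
curl 'http://localhost:9200/?pretty'

# Cluster health for the "cn-out-of-box" cluster
curl 'http://localhost:9200/_cluster/health?pretty'
```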
The bundled elasticsearch.yml:
# Cluster name
cluster.name: "cn-out-of-box"
# Node name
node.name: "node1"
# Whether this node is eligible to become master
node.master: true
# Whether this node stores index data
node.data: true
# Default number of shards per index
index.number_of_shards: 3
# Default number of replicas per index
index.number_of_replicas: 1
# Path for temporary files
#path.work: "/tmp/elasticsearch"
# Path for log files
#path.logs: "/var/log/elasticsearch/logs"
# TCP transport port
transport.tcp.port: 9300
# Whether to compress TCP transport traffic
transport.tcp.compress: true
# HTTP port
http.port: 9200
# Whether to enable the HTTP service
#http.enabled: true
# Whether to enable multicast node discovery
discovery.zen.ping.multicast.enabled: true
# Slow-query log thresholds
#index.search.slowlog.threshold.query.warn: 10s
#index.search.slowlog.threshold.query.info: 5s
#index.search.slowlog.threshold.query.debug: 2s
#index.search.slowlog.threshold.query.trace: 500ms
#index.search.slowlog.threshold.fetch.warn: 1s
#index.search.slowlog.threshold.fetch.info: 800ms
#index.search.slowlog.threshold.fetch.debug: 500ms
#index.search.slowlog.threshold.fetch.trace: 200ms
# Serve HTTP through the jetty plugin
http.type: com.sonian.elasticsearch.http.jetty.JettyHttpServerTransport
# sonian.elasticsearch.http.jetty:
# ==== enable HTTPS
#ssl_port: 9443
#config: jetty.xml,jetty-ssl.xml,jetty-gzip.xml
#keystore_password: "OBF:1nc01vuz1w8f1w1c1rbu1rac1w261w9b1vub1ndq"
# ==== enable user authentication
#config: jetty.xml,jetty-hash-auth.xml,jetty-restrict-all.xml
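# With the hash-auth and restrict-all configs enabled, every request must
# authenticate; the credentials live in jetty's realm.properties (sketch;
# "admin:admin" below is a placeholder, not a shipped default):
#   curl -k --user admin:admin 'https://localhost:9443/_cluster/health?pretty'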
# Index settings
index:
  # Analysis settings
  analysis:
    # Tokenizer definitions
    tokenizer:
      index_ansj_token:
        type: ansj_index_token
        is_name: false
        is_num: false
        is_quantifier: false
      query_ansj_token:
        type: ansj_query_token
        is_name: false
        is_num: false
        is_quantifier: false
      # ======== analysis-pinyin ========
      # Full pinyin
      my_pinyin:
        type: pinyin
        first_letter: prefix
        padding_char: ' '
      # Pinyin first letters only
      pinyin_first_letter:
        type: pinyin
        first_letter: only
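      # To preview any tokenizer defined here, the _analyze API can be used
      # (a sketch; assumes the node is running with this file and an index
      # "test" exists, e.g. created with `curl -XPUT localhost:9200/test`):
      #   curl 'localhost:9200/test/_analyze?tokenizer=my_pinyin&pretty' -d '中國'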
      # ======== analysis-mmseg ========
      # Simple forward matching
      # example: 一個勁兒的說話
      #   一個
      #   一個勁
      #   一個勁兒
      #   一個勁兒的
      mmseg_simple:
        type: mmseg
        seg_type: simple
      # Finds all candidate "chunks of three words"
      # and disambiguates with four rules (maximum matching, largest average word length, smallest variance of word lengths, largest sum of log frequencies of single-character words)
      # example: 研究生命起源
      #   研_究_生
      #   研_究_生命
      #   研究生_命_起源
      #   研究_生命_起源
      mmseg_complex:
        type: mmseg
        seg_type: complex
      # Max-word segmentation built on the complex mode
      # example: 中國人民銀行
      #   中國|人民|銀行
      mmseg_maxword:
        type: mmseg
        seg_type: max_word
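      # Preview (same assumptions as the my_pinyin example above); this should
      # reproduce the max_word output shown in the comment:
      #   curl 'localhost:9200/test/_analyze?tokenizer=mmseg_maxword&pretty' -d '中國人民銀行'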
      # ======== analysis-stconvert ========
      # Simplified-to-Traditional conversion, output Traditional only
      s2t_convert:
        type: stconvert
        delimiter: ","
        convert_type: s2t
      # Traditional-to-Simplified conversion, output Simplified only
      t2s_convert:
        type: stconvert
        delimiter: ","
        convert_type: t2s
      # Simplified-to-Traditional conversion, output both Traditional and Simplified
      s2t_keep_both_convert:
        type: stconvert
        delimiter: ","
        keep_both: 'true'
        convert_type: s2t
      # Traditional-to-Simplified conversion, output both Simplified and Traditional
      t2s_keep_both_convert:
        type: stconvert
        delimiter: ","
        keep_both: 'true'
        convert_type: t2s
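      # Preview: converting Simplified input to Traditional (same assumptions
      # as the examples above):
      #   curl 'localhost:9200/test/_analyze?tokenizer=s2t_convert&pretty' -d '中国'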
      # ======== analysis-pattern ========
      # Regex: split on semicolons
      semicolon_spliter:
        type: pattern
        pattern: ";"
      # Regex: split on '%'
      pct_spliter:
        type: pattern
        pattern: "[%]+"
      # ======== analysis-nGram ========
      # Tokens of 1 to 2 characters
      ngram_1_to_2:
        type: nGram
        min_gram: 1
        max_gram: 2
      # Tokens of 1 to 3 characters
      ngram_1_to_3:
        type: nGram
        min_gram: 1
        max_gram: 3
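      # Preview: ngram_1_to_2 emits every 1- and 2-character substring
      # (same assumptions as the examples above):
      #   curl 'localhost:9200/test/_analyze?tokenizer=ngram_1_to_2&pretty' -d '銀行'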
    # Token filter definitions
    filter:
      # ======== ngram filter ========
      ngram_min_3:
        max_gram: 10
        min_gram: 3
        type: nGram
      ngram_min_2:
        max_gram: 10
        min_gram: 2
        type: nGram
      ngram_min_1:
        max_gram: 10
        min_gram: 1
        type: nGram
      # ======== length filter ========
      min2_length:
        min: 2
        max: 4
        type: length
      min3_length:
        min: 3
        max: 4
        type: length
      # ======== string2int filter ========
      #my_string2int:
      #  type: string2int
      #  redis_server: 127.0.0.1
      #  redis_port: 6379
      #  redis_key: index1_type2_name2
      # ======== pinyin filter ========
      pinyin_first_letter:
        type: pinyin
        first_letter: only
    # Analyzer definitions
    analyzer:
      lowercase_keyword:
        type: custom
        filter:
          - lowercase
        tokenizer: standard
      lowercase_keyword_ngram_min_size1:
        type: custom
        filter:
          - lowercase
          - stop
          - trim
          - unique
        tokenizer: nGram
      lowercase_keyword_ngram_min_size2:
        type: custom
        filter:
          - lowercase
          - min2_length
          - stop
          - trim
          - unique
        tokenizer: nGram
      lowercase_keyword_ngram_min_size3:
        type: custom
        filter:
          - lowercase
          - min3_length
          - stop
          - trim
          - unique
        tokenizer: ngram_1_to_3
      lowercase_keyword_ngram:
        type: custom
        filter:
          - lowercase
          - stop
          - trim
          - unique
        tokenizer: ngram_1_to_3
      lowercase_keyword_without_standard:
        type: custom
        filter:
          - lowercase
        tokenizer: keyword
      lowercase_whitespace:
        type: custom
        filter:
          - lowercase
        tokenizer: whitespace
      # ======== ik ========
      # The ik analyzer
      ik:
        alias:
          - ik_analyzer
        type: org.elasticsearch.index.analysis.IkAnalyzerProvider
      # ik finest-grained segmentation
      ik_max_word:
        type: ik
        use_smart: false
      # ik smart segmentation
      ik_smart:
        type: ik
        use_smart: true
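      # The two ik modes differ visibly on the same input: ik_max_word emits
      # every dictionary word it finds, ik_smart emits one non-overlapping
      # segmentation (sketch, same assumptions as the examples above):
      #   curl 'localhost:9200/test/_analyze?analyzer=ik_max_word&pretty' -d '中华人民共和国'
      #   curl 'localhost:9200/test/_analyze?analyzer=ik_smart&pretty' -d '中华人民共和国'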
      # ======== mmseg ========
      # The mmseg analyzer
      mmseg:
        alias:
          - mmseg_analyzer
        type: org.elasticsearch.index.analysis.MMsegAnalyzerProvider
      mmseg_maxword:
        type: custom
        filter:
          - lowercase
        tokenizer: mmseg_maxword
      mmseg_complex:
        type: custom
        filter:
          - lowercase
        tokenizer: mmseg_complex
      mmseg_simple:
        type: custom
        filter:
          - lowercase
        tokenizer: mmseg_simple
      # ======== regex ========
      comma_spliter:
        type: pattern
        pattern: "[,|\\s]+"
      pct_spliter:
        type: pattern
        pattern: "[%]+"
      custom_snowball_analyzer:
        type: snowball
        language: English
      simple_english_analyzer:
        type: custom
        tokenizer: whitespace
        filter:
          - standard
          - lowercase
          - snowball
      edge_ngram:
        type: custom
        tokenizer: edgeNGram
        filter:
          - lowercase
      # ======== pinyin analysis ========
      pinyin_ngram_analyzer:
        type: custom
        tokenizer: my_pinyin
        filter:
          - lowercase
          - nGram
          - trim
          - unique
      # ======== pinyin first-letter tokenization ========
      pinyin_first_letter_analyzer:
        type: custom
        tokenizer: pinyin_first_letter
        filter:
          - standard
          - lowercase
      # ======== pinyin first-letter tokenization with filtering ========
      pinyin_first_letter_keyword_analyzer:
        alias:
          - pinyin_first_letter_analyzer_keyword
        type: custom
        tokenizer: keyword
        filter:
          - pinyin_first_letter
          - lowercase
      # ======== Simplified/Traditional conversion ========
      stconvert:
        alias:
          - st_analyzer
        type: org.elasticsearch.index.analysis.STConvertAnalyzerProvider
      s2t_convert:
        type: stconvert
        delimiter: ","
        convert_type: s2t
      t2s_convert:
        type: stconvert
        delimiter: ","
        convert_type: t2s
      s2t_keep_both_convert:
        type: stconvert
        delimiter: ","
        keep_both: 'true'
        convert_type: s2t
      t2s_keep_both_convert:
        type: stconvert
        delimiter: ","
        keep_both: 'true'
        convert_type: t2s
      #string2int:
      #  type: org.elasticsearch.index.analysis.String2IntAnalyzerProvider
      #  redis_server: 127.0.0.1
      #  redis_port: 6379
      #  redis_key: index1_type1_name1
      #custom_string2int:
      #  type: custom
      #  tokenizer: whitespace
      #  filter:
      #    - string2int
      #    - lowercase
      # Path analysis
      path_analyzer:
        type: custom
        tokenizer: path_hierarchy
      # ======== ansj ========
      index_ansj:
        alias:
          - ansj_index_analyzer
        type: ansj_index
        user_path: ansj/user
        ambiguity: ansj/ambiguity.dic
        stop_path: ansj/stopLibrary.dic
        #is_name: false
        #is_num: true
        #is_quantifier: true
        redis: false
        #pool:
        #  maxactive: 20
        #  maxidle: 10
        #  maxwait: 100
        #  testonborrow: true
        #ip: 127.0.0.1:6379
        #channel: ansj_term
      query_ansj:
        alias:
          - ansj_query_analyzer
        type: ansj_query
        user_path: ansj/user
        ambiguity: ansj/ambiguity.dic
        stop_path: ansj/stopLibrary.dic
        #is_name: false
        #is_num: true
        #is_quantifier: true
        redis: false
        #pool:
        #  maxactive: 20
        #  maxidle: 10
        #  maxwait: 100
        #  testonborrow: true
        #ip: 127.0.0.1:6379
        #channel: ansj_term
      uax_url_email:
        tokenizer: uax_url_email
        filter: [standard, lowercase, stop]
      # ======== combo ========
      combo:
        type: combo
        sub_analyzers:
          - ansj_index
          - ik_smart
          - mmseg_complex
          - uax_url_email
          - s2t_convert
          - t2s_convert
          - smartcn
          - simple_english_analyzer
# Default analyzer
index.analysis.analyzer.default.type: combo
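# Because combo is the default analyzer, analyzing text without naming an
# analyzer should return the union of all sub-analyzers' token streams
# (sketch, same assumptions as the examples above):
#   curl 'localhost:9200/test/_analyze?pretty' -d '中國人民銀行'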
# Thread pool settings
threadpool:
  index:
    type: fixed
    size: 30
    queue: -1
    reject_policy: caller
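Once the node is running, the effective thread pool settings can be compared against this file with the cat API (available in Elasticsearch 1.x; assumes the default HTTP port above):

```sh
# One line per node, with active, queued, and rejected task counts per pool
curl 'http://localhost:9200/_cat/thread_pool?v'
```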