本篇介紹的是基於Elasticsearch實現搜索推薦詞,其中需要用到Elasticsearch的pinyin插件以及ik分詞插件,代碼的實現這里提供了java跟C#的版本方便大家參考。
1.實現的結果
①當搜索【qiy】的時候,能匹配企業、祈願等
②當搜索【qi業】的時候,只能匹配到企業;如果沒有企業,將使用模糊查詢,匹配祈願。
③當搜索【q業】的時候結果同②。
④當搜索【企y】或【企ye】的時候結果同②。
⑤當搜索【qy】的時候,能匹配企業、祈願等。
2.實現的邏輯
中文匹配前綴==》全拼匹配前綴==》拼音首字母匹配前綴==》拼音模糊匹配前綴
優先級從左到右,當前面三個有結果的時候不建議用模糊匹配,這樣結果更加精確。比如需要獲取8個推薦詞,先獲取中文的,如果足夠8個將不再獲取之后的匹配結果。但是當模糊匹配之前已經存在匹配結果了,即使數量沒有達到8個,也不再繼續獲取模糊匹配結果。
3.插件准備
ik分詞插件安裝相對簡單,網上教程也多,這里不做介紹。這里講解下pinyin插件,官方版本的拼音插件不支持中文,處理結果只有拼音的,這樣會出現同音字匹配,結果不准確。
這里感謝小伙伴分享的拼音插件修改方法:https://www.cnblogs.com/danvid/p/10691547.html。
按照里面的操作處理后的插件將實現:
企業畫報:{"qi","企","ye","業","hua","畫","bao","報"}
拼音插件的各項具體屬性參考:https://blog.csdn.net/a1148233614/article/details/80280024,里面有詳細介紹。
4.Elasticsearch創建index
這里使用的ES版本為7.0.1,不再支持mapping type(映射類型,即創建index時不需要再指定type名稱),創建代碼如下:
PUT /suggest_tset
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"prefix_pinyin_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"prefix_pinyin"
]
},
"full_pinyin_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"full_pinyin"
]
},
"like_pinyin_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"like_pinyin"
]
}
},
"filter": {
"_pattern": {
"type": "pattern_capture",
"preserve_original": true,
"patterns": [
"([0-9])",
"([a-z])"
]
},
"prefix_pinyin": {
"type": "pinyin",
"keep_first_letter": "true",
"keep_full_pinyin": "false",
"none_chinese_pinyin_tokenize": "false",
"keep_separate_chinese": "true",
"keep_original": "false"
},
"full_pinyin": {
"type": "pinyin",
"keep_first_letter": "false",
"keep_full_pinyin": "true",
"keep_original": "false",
"keep_separate_chinese": "true",
"keep_none_chinese_in_first_letter": "false"
},
"like_pinyin": {
"type": "pinyin",
"keep_first_letter": "true",
"keep_full_pinyin": "true",
"keep_joined_full_pinyin": "false",
"keep_original": "false",
"keep_separate_chinese": "false",
"keep_none_chinese_in_first_letter": "false"
}
}
}
},
"mappings": {
"dynamic": "false",
"properties": {
"kwsuggest": {
"fields": {
"suggestText": {
"type": "completion",
"analyzer": "standard",
"preserve_separators": "false",
"preserve_position_increments": "true",
"max_input_length": 50
},
"prefix_pinyin": {
"type": "completion",
"analyzer": "prefix_pinyin_analyzer",
"search_analyzer": "standard",
"preserve_separators": "false"
},
"full_pinyin": {
"type": "completion",
"analyzer": "full_pinyin_analyzer",
"search_analyzer": "standard",
"preserve_separators": "false"
},
"like_pinyin": {
"type": "completion",
"analyzer": "like_pinyin_analyzer",
"preserve_separators": "false"
}
},
"type": "text"
}
}
}
}
這里插入幾條測試數據
POST _bulk/?refresh=true
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "企業規划"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "祈願設計 這是啥呢"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "懸崖的圖片 美景"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "縣衙地址 那里呢"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "懸崖風景圖"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "起夜的風光 真的美"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "起夜第二個詞 測試使用"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "需要一半留下一半打一字謎"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "許亞為"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "許雅非測試"}
{ "index" : { "_index" : "suggest_tset", "_type" : "_doc" } }
{ "kwsuggest": "徐楊是誰"}
下面為測試的查詢語句
GET /suggest_tset/_search
{
"suggest": {
"suggestText": {
"prefix": "qi業",
"completion": {
"field": "kwsuggest.suggestText",
"skip_duplicates": true
}
},
"full_pinyin": {
"prefix": "qi業",
"completion": {
"field": "kwsuggest.full_pinyin",
"skip_duplicates": true
}
},
"prefix_pinyin": {
"prefix": "qi業",
"completion": {
"field": "kwsuggest.prefix_pinyin",
"skip_duplicates": true
}
},
"like_pinyin": {
"prefix": "qi業",
"completion": {
"field": "kwsuggest.like_pinyin",
"skip_duplicates": true,
"fuzzy": {
"fuzziness": 1
}
}
}
}
}
當輸入查詢條件為【qiy】的時候,結果為:
{
"took" : 17,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"full_pinyin" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "起夜的風光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜的風光 真的美"
}
},
{
"text" : "起夜第二個詞 測試使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜第二個詞 測試使用"
}
}
]
}
],
"like_pinyin" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "企業規划",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9TgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "企業規划"
}
},
{
"text" : "祈願設計 這是啥呢",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "祈願設計 這是啥呢"
}
},
{
"text" : "起夜的風光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜的風光 真的美"
}
},
{
"text" : "起夜第二個詞 測試使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜第二個詞 測試使用"
}
}
]
}
],
"prefix_pinyin" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
],
"suggestText" : [
{
"text" : "qiy",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
]
}
}
輸入【qi業】的查詢結果為
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"full_pinyin" : [
{
"text" : "qi業",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "企業規划",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9TgnlHMBSEyTxFiDO4lU",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "企業規划"
}
}
]
}
],
"like_pinyin" : [
{
"text" : "qi業",
"offset" : 0,
"length" : 3,
"options" : [
{
"text" : "企業規划",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9TgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "企業規划"
}
},
{
"text" : "祈願設計 這是啥呢",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "9jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "祈願設計 這是啥呢"
}
},
{
"text" : "起夜的風光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜的風光 真的美"
}
},
{
"text" : "起夜第二個詞 測試使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜第二個詞 測試使用"
}
}
]
}
],
"prefix_pinyin" : [
{
"text" : "qi業",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
],
"suggestText" : [
{
"text" : "qi業",
"offset" : 0,
"length" : 3,
"options" : [ ]
}
]
}
}
輸入【qy】的結果為
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"suggest" : {
"full_pinyin" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [ ]
}
],
"like_pinyin" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [
{
"text" : "起夜的風光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜的風光 真的美"
}
},
{
"text" : "起夜第二個詞 測試使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 2.0,
"_source" : {
"kwsuggest" : "起夜第二個詞 測試使用"
}
}
]
}
],
"prefix_pinyin" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [
{
"text" : "起夜的風光 真的美",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "-jgnlHMBSEyTxFiDO4lU",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜的風光 真的美"
}
},
{
"text" : "起夜第二個詞 測試使用",
"_index" : "suggest_tset",
"_type" : "_doc",
"_id" : "aDg3lHMBSEyTxFiDXprV",
"_score" : 1.0,
"_source" : {
"kwsuggest" : "起夜第二個詞 測試使用"
}
}
]
}
],
"suggestText" : [
{
"text" : "qy",
"offset" : 0,
"length" : 2,
"options" : [ ]
}
]
}
}
5.java版本代碼
這里使用elasticsearch-rest-high-level-client
application.yml添加配置
# ES配置
elasticsearch:
  ipAddress: [127.0.0.1:9200]
添加配置類
/**
 * Spring configuration that builds the Elasticsearch REST clients from the
 * {@code elasticsearch.ipAddress} property (a list of {@code host:port} entries).
 */
@Component
@Configuration
@ConfigurationProperties(prefix = "elasticsearch")
@Data
public class ElasticsearchRestClientConfig {
    private Logger logger = LoggerFactory.getLogger(getClass());

    /** Number of parts expected after splitting an address on ':' (host and port). */
    private static final int ADDRESS_LENGTH = 2;
    private static final String HTTP_SCHEME = "http";

    /**
     * Node addresses; host and port are separated by a colon.
     */
    public String[] ipAddress;

    /**
     * Builds a {@link RestClientBuilder} for every well-formed entry of {@link #ipAddress}.
     * Malformed entries are logged and skipped.
     *
     * @return a builder configured with the parsed hosts
     * @throws IllegalStateException if {@code ipAddress} was not configured at all
     */
    @Bean
    public RestClientBuilder restClientBuilder() {
        if (ipAddress == null) {
            // Fail with a message that names the missing property instead of an anonymous NPE.
            throw new IllegalStateException("Property 'elasticsearch.ipAddress' is not configured");
        }
        HttpHost[] hosts = Arrays.stream(ipAddress)
                .map(this::makeHttpHost)
                .filter(Objects::nonNull)
                .toArray(HttpHost[]::new);
        logger.debug("hosts:{}", Arrays.toString(hosts));
        return RestClient.builder(hosts);
    }

    /**
     * Wraps the low-level builder in a high-level client.
     *
     * @param restClientBuilder the builder produced by {@link #restClientBuilder()}
     * @return the high-level REST client bean
     */
    @Bean(name = "highLevelClient")
    public RestHighLevelClient highLevelClient(@Autowired RestClientBuilder restClientBuilder) {
        return new RestHighLevelClient(restClientBuilder);
    }

    /**
     * Parses one {@code host:port} entry.
     *
     * @param s the raw address string
     * @return the parsed host, or {@code null} when the entry is blank, does not have
     *         exactly two colon-separated parts, or has a non-numeric port
     */
    private HttpHost makeHttpHost(String s) {
        // An explicit check replaces the former assert, which is a no-op unless the JVM runs with -ea.
        if (s == null || s.trim().isEmpty()) {
            logger.warn("Ignoring blank Elasticsearch address entry");
            return null;
        }
        String[] address = s.trim().split(":");
        if (address.length != ADDRESS_LENGTH) {
            logger.warn("Ignoring malformed Elasticsearch address '{}', expected host:port", s);
            return null;
        }
        try {
            String ip = address[0].trim();
            int port = Integer.parseInt(address[1].trim());
            return new HttpHost(ip, port, HTTP_SCHEME);
        } catch (NumberFormatException e) {
            // Treat a bad port like any other malformed entry: report it and skip it.
            logger.warn("Ignoring Elasticsearch address '{}', port is not a number", s);
            return null;
        }
    }
}
實現的代碼:
/**
 * Keyword suggestion service backed by four Elasticsearch completion suggesters
 * on the {@code suggest_tset} index.
 */
@Service
public class KwSuggestService implements IKwSuggest {
    /** Maximum number of suggestions returned to the caller. */
    private static final int MAX_SUGGESTIONS = 5;
    /** Position (1-based) of the last exact stage; the stage after it is the fuzzy one. */
    private static final int LAST_EXACT_STAGE = 3;
    /** Inputs longer than this are not passed on to the fuzzy stage. */
    private static final int MAX_FUZZY_INPUT_LENGTH = 3;

    @Autowired
    RestHighLevelClient highLevelClient;

    /**
     * Returns up to {@value #MAX_SUGGESTIONS} distinct suggestions for {@code kw}.
     * Stages are consulted in order: Chinese prefix, full pinyin, pinyin first letters,
     * fuzzy pinyin. The fuzzy stage is used only when the first three stages found
     * nothing and the input is at most {@value #MAX_FUZZY_INPUT_LENGTH} characters long.
     *
     * @param kw the text typed by the user
     * @return the suggestions in priority order; empty when {@code kw} is null/empty,
     *         nothing matches, or the request fails with an {@link IOException}
     */
    @Override
    public List<String> GetKwSuggestList(String kw){
        List<String> result = new ArrayList<>();
        if (kw == null || kw.isEmpty()) {
            return result;
        }

        SearchRequest searchRequest = new SearchRequest("suggest_tset");
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        SuggestBuilder suggestBuilder = new SuggestBuilder();
        suggestBuilder.addSuggestion("suggestText", SuggestBuilders.completionSuggestion("kwsuggest.suggestText").prefix(kw).skipDuplicates(true).size(MAX_SUGGESTIONS));
        suggestBuilder.addSuggestion("full_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.full_pinyin").prefix(kw).skipDuplicates(true).size(MAX_SUGGESTIONS));
        suggestBuilder.addSuggestion("prefix_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.prefix_pinyin").prefix(kw).skipDuplicates(true).size(MAX_SUGGESTIONS));
        suggestBuilder.addSuggestion("like_pinyin", SuggestBuilders.completionSuggestion("kwsuggest.like_pinyin").prefix(kw, Fuzziness.fromEdits(1)).skipDuplicates(true).size(MAX_SUGGESTIONS));
        sourceBuilder.suggest(suggestBuilder);
        sourceBuilder.timeout(new TimeValue(10, TimeUnit.SECONDS));
        searchRequest.source(sourceBuilder);

        // Priority order of the stages; the last entry is the fuzzy one.
        List<String> suggestionList = Arrays.asList("suggestText", "full_pinyin", "prefix_pinyin", "like_pinyin");
        try {
            SearchResponse response = highLevelClient.search(searchRequest, RequestOptions.DEFAULT);
            Suggest suggestions = response.getSuggest();
            if (suggestions == null) {
                return result;
            }
            int stage = 0;
            for (String suggestionType : suggestionList) {
                // The counter must advance on every pass, otherwise the stage-3 rules below never fire.
                stage++;
                CompletionSuggestion completionSuggestion = suggestions.getSuggestion(suggestionType);
                if (completionSuggestion != null) {
                    for (CompletionSuggestion.Entry entry : completionSuggestion.getEntries()) {
                        for (CompletionSuggestion.Entry.Option option : entry) {
                            Object value = option.getHit().getSourceAsMap().get("kwsuggest");
                            if (value == null) {
                                continue;
                            }
                            String suggestText = value.toString();
                            // The same document can match several stages; keep its first occurrence only.
                            if (!result.contains(suggestText)) {
                                result.add(suggestText);
                            }
                        }
                    }
                }
                // Enough suggestions collected: stop consulting lower-priority stages.
                if (result.size() >= MAX_SUGGESTIONS) {
                    break;
                }
                if (stage == LAST_EXACT_STAGE) {
                    // Exact stages produced results: fuzzy matching would only add noise.
                    if (!result.isEmpty()) {
                        break;
                    }
                    // Nothing matched exactly and the input is long: fuzzy matches are too imprecise.
                    if (kw.length() > MAX_FUZZY_INPUT_LENGTH) {
                        break;
                    }
                }
            }
            if (result.size() > MAX_SUGGESTIONS) {
                return new ArrayList<>(result.subList(0, MAX_SUGGESTIONS));
            }
            return result;
        } catch (IOException e) {
            e.printStackTrace();
            return new ArrayList<>();
        }
    }
}
6.C#代碼實現
C#使用的是NEST
public partial class ElasticFactory
{
    /// <summary>
    /// Returns up to five keyword suggestions for <c>request.q</c> using four completion
    /// suggesters on the <c>suggest_tset</c> index, consulted in priority order:
    /// Chinese prefix, full pinyin, pinyin first letters, fuzzy pinyin.
    /// </summary>
    /// <param name="request">The suggestion request; <c>q</c> is the text typed by the user.</param>
    /// <returns>
    /// A response whose <c>code</c> is 1 on success (with <c>data.items</c> and <c>data.num</c> filled in)
    /// and 0 on failure (with <c>msg</c> set). An empty <c>q</c> yields the default, untouched response.
    /// </returns>
    public ExternalServiceResponse<KeywordsSuggestResponseDataEntity> GetKeywordsSuggest(ElasticKeywordsSuggestRequest request)
    {
        var result = new ExternalServiceResponse<KeywordsSuggestResponseDataEntity>();
        try
        {
            if (string.IsNullOrEmpty(request.q)) return result;

            // The array must be created with its element; assigning into a zero-length array throws.
            var nodes = new[] { new Uri("http://127.0.0.1:9200") };
            var pool = new StaticConnectionPool(nodes);
            var settings = new ConnectionSettings(pool).DefaultIndex("suggest_tset");
            var client = new ElasticClient(settings);

            // Suggester names in priority order; the last one is the fuzzy stage.
            string[] keys = new[] { "suggestText", "full_pinyin", "prefix_pinyin", "like_pinyin" };
            SearchDescriptor<object> search = new SearchDescriptor<object>();
            search
                .Source(r => r
                    .Includes(f => f
                        // Must be the field read back in AddSuggestItems, otherwise it is filtered out of _source.
                        .Fields("kwsuggest")
                    )
                )
                .Suggest(s => s
                    .Completion(keys[0], c => c.Field("kwsuggest.suggestText").Prefix(request.q).SkipDuplicates())
                    .Completion(keys[1], c => c.Field("kwsuggest.full_pinyin").Prefix(request.q).SkipDuplicates())
                    .Completion(keys[2], c => c.Field("kwsuggest.prefix_pinyin").Prefix(request.q).SkipDuplicates())
                    .Completion(keys[3], c => c.Field("kwsuggest.like_pinyin").Prefix(request.q).SkipDuplicates().Fuzzy(m => m.Fuzziness(Fuzziness.EditDistance(1)))));

            var esResult = client.Search<dynamic>(s => search);
            // NEST reports failures through IsValid rather than by returning null.
            if (esResult != null && esResult.IsValid)
            {
                result.code = 1;
                result.data = new KeywordsSuggestResponseDataEntity();
                // 1. Take the Chinese prefix matches first.
                // 2. If fewer than 5, add full-pinyin matches.
                // 3. If still fewer than 5, add pinyin first-letter matches.
                // 4. Only when all of the above found nothing, fall back to fuzzy matching.
                if (esResult.Suggest != null)
                {
                    result.data.items = new List<KeywordsSuggestResponseItemEntity>();
                    int index = 1;
                    foreach (var key in keys)
                    {
                        AddSuggestItems(esResult.Suggest, key, result.data.items);
                        // Stages 1-3: once 5 items are collected, trim and return.
                        if (index >= 1 && index <= 3 && result.data.items.Count >= 5)
                        {
                            result.data.items = result.data.items.Take(5).ToList();
                            break;
                        }
                        // After stage 3 with at least one exact match: skip fuzzy matching, it is imprecise.
                        if (index == 3 && result.data.items.Count > 0)
                        {
                            break;
                        }
                        // Nothing matched and the input is longer than 3 characters: fuzzy results would be too poor.
                        if (index == 3 && request.q.Length > 3)
                        {
                            break;
                        }
                        index++;
                    }
                    result.data.num = result.data.items.Count;
                }
                else
                {
                    result.data.num = 0;
                }
            }
            else
            {
                result.code = 0;
                result.msg = "查詢失敗";
            }
        }
        catch (Exception ex)
        {
            result.code = 0;
            result.msg = ex.Message;
        }
        return result;
    }

    /// <summary>
    /// Appends the <c>kwsuggest</c> value of every option of the suggester named
    /// <paramref name="key"/> to <paramref name="items"/>, skipping values already present.
    /// </summary>
    /// <param name="suggest">The suggest section of the search response.</param>
    /// <param name="key">Name of the suggester whose options are read.</param>
    /// <param name="items">Target list; modified in place.</param>
    private void AddSuggestItems(ISuggestDictionary<dynamic> suggest, string key, List<KeywordsSuggestResponseItemEntity> items)
    {
        if (suggest == null || !suggest.ContainsKey(key))
        {
            return;
        }
        var entries = suggest[key];
        if (entries == null || entries.Length == 0 || entries[0].Options == null)
        {
            return;
        }
        foreach (var hit in entries[0].Options)
        {
            if (hit.Source == null)
            {
                continue;
            }
            string kwSource = hit.Source["kwsuggest"];
            // Do not add a keyword that is already in the list.
            if (items.Any(m => m.kw == kwSource))
            {
                continue;
            }
            items.Add(new KeywordsSuggestResponseItemEntity() { kw = kwSource });
        }
    }
}
