Due to work, I have been busy with various company projects recently (most of them Spring Cloud based microservice projects), so it has been a while since I shared a summary of my latest technical study. In fact I have kept digging into the internals of Spring, Spring Boot and Spring Cloud, while also following the progress and latest features of .NET Core. I subscribe to related columns on GeekTime (極客時間) and read them whenever I have free time after work, and I have bought a number of paper books as well. For roughly the past year I have been absorbing and borrowing the best of others through WeChat technical accounts (covering .NET, Java, algorithms, front-end and so on), GeekTime and technical books, so as to keep improving my own skills. As the saying goes, learning is like rowing upstream: not to advance is to fall back. Learn while working and apply what you learn back at work; writing articles to share is a form of summarizing, and also the best way to "review the old and learn the new".
Enough of the preamble; let's get straight to the topic of this article: a general-purpose search utility class built on Lucene.Net, SearchEngineUtil. For what Lucene is, see Baidu Baike; the key point is that Lucene provides a full-text search engine architecture with a complete query engine and indexing engine, and Lucene.NET is its implementation for C# and the .NET runtime, official site: http://lucenenet.apache.org/ . I won't repeat the basic usage here, since the official site and many online articles cover it. However, the native Lucene.Net API is fairly complex and not very convenient to use, so I wrapped the common operations (add, delete, update and query, including paged query) in a way that keeps flexibility while making Lucene.Net noticeably simpler to work with. The code itself is not complicated; the complete SearchEngineUtil is listed below:
using Lucene.Net.Analysis.PanGu;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using Lucene.Net.Store;
using NLog;
using PanGu;
using PanGu.HighLight;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
namespace CN.Zuowenjun.Blog.Common
{
/// <summary>
/// Lucene search engine utility class
/// Author:zuowenjun
/// </summary>
public class SearchEngineUtil
{
/// <summary>
/// Create and add an index record
/// </summary>
/// <typeparam name="TIndex"></typeparam>
/// <param name="indexDir"></param>
/// <param name="indexData"></param>
/// <param name="setDocFiledsAction"></param>
public static void AddIndex<TIndex>(string indexDir, TIndex indexData, Action<Document, TIndex> setDocFiledsAction)
{
//Create the index directory if it does not exist
if (!System.IO.Directory.Exists(indexDir))
{
System.IO.Directory.CreateDirectory(indexDir);
}
FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
bool isUpdate = IndexReader.IndexExists(directory);
if (isUpdate)
{
//If the index directory is locked (e.g. the program exited abnormally during indexing), unlock it first
if (IndexWriter.IsLocked(directory))
{
IndexWriter.Unlock(directory);
}
}
using (IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, IndexWriter.MaxFieldLength.UNLIMITED))
{
Document document = new Document();
setDocFiledsAction(document, indexData);
writer.AddDocument(document);
writer.Optimize();//Optimize the index
}
}
/// <summary>
/// Delete an index record
/// </summary>
/// <param name="indexDir"></param>
/// <param name="keyFiledName"></param>
/// <param name="keyFileValue"></param>
public static void DeleteIndex(string indexDir, string keyFiledName, object keyFileValue)
{
FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
if (!IndexReader.IndexExists(directory))
{
return;
}
using (IndexWriter iw = new IndexWriter(directory, new PanGuAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED))
{
iw.DeleteDocuments(new Term(keyFiledName, keyFileValue.ToString()));
iw.Optimize();//Deleted documents are not removed from disk immediately; a .del file is generated instead, and Optimize must be called to purge them. Before purging, UndeleteAll can restore them.
}
}
/// <summary>
/// Update an index record
/// </summary>
/// <param name="indexDir"></param>
/// <param name="keyFiledName"></param>
/// <param name="keyFileValue"></param>
/// <param name="doc"></param>
public static void UpdateIndex(string indexDir, string keyFiledName, object keyFileValue, Document doc)
{
FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
if (!IndexReader.IndexExists(directory))
{
return;
}
using (IndexWriter iw = new IndexWriter(directory, new PanGuAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED))
{
iw.UpdateDocument(new Term(keyFiledName, keyFileValue.ToString()), doc);
iw.Optimize();
}
}
/// <summary>
/// Check whether the specified index document exists
/// </summary>
/// <param name="indexDir"></param>
/// <param name="keyFiledName"></param>
/// <param name="keyFileValue"></param>
/// <returns></returns>
public static bool ExistsDocument(string indexDir, string keyFiledName, object keyFileValue)
{
FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NativeFSLockFactory());
if (!IndexReader.IndexExists(directory))
{
return false;
}
using (var reader = IndexReader.Open(directory, true))
{
return reader.DocFreq(new Term(keyFiledName, keyFileValue.ToString())) > 0;
}
}
/// <summary>
/// Query the records matched in the index
/// </summary>
/// <typeparam name="TResult"></typeparam>
/// <param name="indexDir"></param>
/// <param name="buildQueryAction"></param>
/// <param name="getSortFieldsFunc"></param>
/// <param name="buildResultFunc"></param>
/// <param name="topCount"></param>
/// <param name="needHighlight"></param>
/// <returns></returns>
public static List<TResult> SearchIndex<TResult>(string indexDir, Func<BooleanQuery, IDictionary<string, string>> buildQueryAction,
Func<IEnumerable<SortField>> getSortFieldsFunc, Func<Document, TResult> buildResultFunc, bool needHighlight = true, int topCount = 0)
{
FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NoLockFactory());
if (!IndexReader.IndexExists(directory))
{
return new List<TResult>();
}
IndexReader reader = IndexReader.Open(directory, true);
IndexSearcher searcher = new IndexSearcher(reader);
BooleanQuery bQuery = new BooleanQuery();
var keyWords = buildQueryAction(bQuery);
Sort sort = null;
var sortFields = getSortFieldsFunc();
if (sortFields != null)
{
sort = new Sort();
sort.SetSort(sortFields.ToArray());
}
topCount = topCount > 0 ? topCount : int.MaxValue;//When no TOP value is specified, use the maximum value to fetch everything
TopDocs resultDocs = null;
if (sort != null)
{
resultDocs = searcher.Search(bQuery, null, topCount, sort);
}
else
{
resultDocs = searcher.Search(bQuery, null, topCount);
}
if (topCount > resultDocs.TotalHits)
{
topCount = resultDocs.TotalHits;
}
Dictionary<string, PropertyInfo> highlightProps = null;
List<TResult> results = new List<TResult>();
if (resultDocs != null)
{
for (int i = 0; i < topCount; i++)
{
Document doc = searcher.Doc(resultDocs.ScoreDocs[i].Doc);
var model = buildResultFunc(doc);
if (needHighlight)
{
model = SetHighlighter(keyWords, model, ref highlightProps);
}
results.Add(model);
}
}
return results;
}
/// <summary>
/// Query the matched index records with paging
/// </summary>
/// <typeparam name="TResult"></typeparam>
/// <param name="indexDir"></param>
/// <param name="buildQueryAction"></param>
/// <param name="getSortFieldsFunc"></param>
/// <param name="buildResultFunc"></param>
/// <param name="pageSize"></param>
/// <param name="page"></param>
/// <param name="totalCount"></param>
/// <param name="needHighlight"></param>
/// <returns></returns>
public static List<TResult> SearchIndexByPage<TResult>(string indexDir, Func<BooleanQuery, IDictionary<string, string>> buildQueryAction,
Func<IEnumerable<SortField>> getSortFieldsFunc, Func<Document, TResult> buildResultFunc, int pageSize, int page, out int totalCount, bool needHighlight = true)
{
FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexDir), new NoLockFactory());
if (!IndexReader.IndexExists(directory))
{
totalCount = 0;
return new List<TResult>();
}
IndexReader reader = IndexReader.Open(directory, true);
IndexSearcher searcher = new IndexSearcher(reader);
BooleanQuery bQuery = new BooleanQuery();
var keyWords = buildQueryAction(bQuery);
Sort sort = null;
var sortFields = getSortFieldsFunc();
if (sortFields != null)
{
sort = new Sort();
sort.SetSort(sortFields.ToArray());
}
TopScoreDocCollector docCollector = TopScoreDocCollector.Create(1, true);
searcher.Search(bQuery, docCollector);
totalCount = docCollector.TotalHits;
if (totalCount <= 0) return null;
TopDocs resultDocs = sort != null ? searcher.Search(bQuery, null, pageSize * page, sort) : searcher.Search(bQuery, null, pageSize * page);//Fall back to relevance ordering when no sort fields are supplied
Dictionary<string, PropertyInfo> highlightProps = null;
List<TResult> results = new List<TResult>();
int indexStart = (page - 1) * pageSize;
int indexEnd = indexStart + pageSize;
if (totalCount < indexEnd) indexEnd = totalCount;
if (resultDocs != null)
{
for (int i = indexStart; i < indexEnd; i++)
{
Document doc = searcher.Doc(resultDocs.ScoreDocs[i].Doc);
var model = buildResultFunc(doc);
if (needHighlight)
{
model = SetHighlighter(keyWords, model, ref highlightProps);
}
results.Add(model);
}
}
return results;
}
/// <summary>
/// Apply highlighting to the result
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="dicKeywords"></param>
/// <param name="model"></param>
/// <param name="props"></param>
/// <returns></returns>
private static T SetHighlighter<T>(IDictionary<string, string> dicKeywords, T model, ref Dictionary<string, PropertyInfo> props)
{
SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new Segment());
highlighter.FragmentSize = 250;
Type modelType = typeof(T);
foreach (var item in dicKeywords)
{
if (!string.IsNullOrWhiteSpace(item.Value))
{
if (props == null)
{
props = new Dictionary<string, PropertyInfo>();
}
if (!props.ContainsKey(item.Key))
{
props[item.Key] = modelType.GetProperty(item.Key, BindingFlags.IgnoreCase | BindingFlags.Public | BindingFlags.Instance);
}
var modelProp = props[item.Key];
if (modelProp.PropertyType == typeof(string))
{
string newValue = highlighter.GetBestFragment(item.Value, modelProp.GetValue(model).ToString());
if (!string.IsNullOrEmpty(newValue))
{
modelProp.SetValue(model, newValue);
}
}
}
}
return model;
}
/// <summary>
/// Split the keyword into space-separated terms, each boosted according to its PanGu rank
/// </summary>
/// <param name="keyword"></param>
/// <returns></returns>
public static string GetKeyWordsSplitBySpace(string keyword)
{
PanGuTokenizer ktTokenizer = new PanGuTokenizer();
StringBuilder result = new StringBuilder();
ICollection<WordInfo> words = ktTokenizer.SegmentToWordInfos(keyword);
foreach (WordInfo word in words)
{
if (word == null)
{
continue;
}
result.AppendFormat("{0}^{1}.0 ", word.Word, (int)Math.Pow(3, word.Rank));
}
return result.ToString().Trim();
}
/// <summary>
/// [Helper] Create a PanGu query object
/// </summary>
/// <param name="field"></param>
/// <param name="keyword"></param>
/// <returns></returns>
public static Query CreatePanGuQuery(string field, string keyword, bool needSplit = true)
{
if (needSplit)
{
keyword = GetKeyWordsSplitBySpace(keyword);
}
QueryParser parse = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, field, new PanGuAnalyzer());
parse.DefaultOperator = QueryParser.Operator.OR;
Query query = parse.Parse(keyword);
return query;
}
/// <summary>
/// [Helper] Create a PanGu multi-field query object
/// </summary>
/// <param name="keyword"></param>
/// <param name="fields"></param>
/// <returns></returns>
public static Query CreatePanGuMultiFieldQuery(string keyword, bool needSplit, params string[] fields)
{
if (needSplit)
{
keyword = GetKeyWordsSplitBySpace(keyword);
}
QueryParser parse = new MultiFieldQueryParser(Lucene.Net.Util.Version.LUCENE_30, fields, new PanGuAnalyzer());
parse.DefaultOperator = QueryParser.Operator.OR;
Query query = parse.Parse(keyword);
return query;
}
}
}
Besides the Lucene.Net NuGet package, the PanGu word segmenter and its related components (the PanGu, Lucene.Net.Analysis.PanGu and PanGu.HighLight namespaces used above) are referenced separately, because in most cases our content contains Chinese text. I won't go through the code in detail, as the comments make it fairly clear. Here are some practical usages:
Create an index:
SearchEngineUtil.AddIndex(GetSearchIndexDir(), post, (doc, data) => BuildPostSearchDocument(data, doc));
private Document BuildPostSearchDocument(Post post, Document doc = null)
{
if (doc == null)
{
doc = new Document();//Create the Document
}
doc.Add(new Field("Id", post.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.Add(new Field("Title", post.Title, Field.Store.YES, Field.Index.ANALYZED));
doc.Add(new Field("Summary", post.Summary, Field.Store.YES, Field.Index.ANALYZED));
doc.Add(new Field("CreateTime", post.CreateTime.ToString("yyyy/MM/dd HH:mm"), Field.Store.YES, Field.Index.NO));
doc.Add(new Field("Author", post.IsOriginal ? (post.Creator ?? userQueryService.FindByName(post.CreateBy)).NickName : post.SourceBy, Field.Store.YES, Field.Index.NO));
return doc;
}
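Incidentally, if you need to build (or rebuild) the index for all existing posts, a plain loop over AddIndex is enough. Below is a minimal sketch, assuming a hypothetical postQueryService that can return all Post entities; note that each AddIndex call opens the index writer, adds a single document and calls Optimize, so for a large batch you may prefer to extend AddIndex to accept a collection:
// Hypothetical batch rebuild: index every post one by one
foreach (var postItem in postQueryService.GetAll())
{
    SearchEngineUtil.AddIndex(GetSearchIndexDir(), postItem, (doc, data) => BuildPostSearchDocument(data, doc));
}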
Delete an index:
SearchEngineUtil.DeleteIndex(GetSearchIndexDir(), "Id", post.Id);
Update an index:
SearchEngineUtil.UpdateIndex(GetSearchIndexDir(), "Id", post.Id, BuildPostSearchDocument(post));
Paged query:
var keyword = SearchEngineUtil.GetKeyWordsSplitBySpace("夢在旅途 中國夢");
var searchResult = SearchEngineUtil.SearchIndexByPage(indexDir, (bQuery) =>
{
var query = SearchEngineUtil.CreatePanGuMultiFieldQuery(keyword, false, "Title", "Summary");
bQuery.Add(query, Occur.SHOULD);
return new Dictionary<string, string> {
{ "Title",keyword},{"Summary",keyword}
};
}, () =>
{
return new[] { new SortField("Id", SortField.INT, true) };
}, doc =>
{
return new PostSearchInfoDto
{
Id = doc.Get("Id"),
Title = doc.Get("Title"),
Summary = doc.Get("Summary"),
Author = doc.Get("Author"),
CreateTime = doc.Get("CreateTime")
};
}, pageSize, pageNo, out totalCount);
Other usages, such as checking whether a specified document exists in the index or querying matching index documents without paging, are not shown above; if you are interested, you can copy the class into your own project and try them out.
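For completeness, here is a minimal sketch of those two calls, reusing the indexDir, keyword and post variables and the PostSearchInfoDto type from the snippets above:
// Check whether the post is already indexed, keyed by its Id field
bool exists = SearchEngineUtil.ExistsDocument(indexDir, "Id", post.Id);
// Non-paged query: take the top 20 matches on Title/Summary, with highlighting enabled
var topResults = SearchEngineUtil.SearchIndex(indexDir, (bQuery) =>
{
    var query = SearchEngineUtil.CreatePanGuMultiFieldQuery(keyword, false, "Title", "Summary");
    bQuery.Add(query, Occur.SHOULD);
    // the returned dictionary tells SetHighlighter which result properties to highlight with which keywords
    return new Dictionary<string, string> { { "Title", keyword }, { "Summary", keyword } };
}, () => new[] { new SortField("Id", SortField.INT, true) }, doc => new PostSearchInfoDto
{
    Id = doc.Get("Id"),
    Title = doc.Get("Title"),
    Summary = doc.Get("Summary"),
    Author = doc.Get("Author"),
    CreateTime = doc.Get("CreateTime")
}, needHighlight: true, topCount: 20);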
Here is how the search scenario looks in my own project (a complete rewrite of my blog, still under development):

Finally, note that Lucene is not a complete full-text search engine in itself, but understanding it does help when learning Elasticsearch or Solr; in real production projects these days, the higher-level Elasticsearch or Solr is usually used instead.
(I actually wrote the code in this article quite early this year; I am only sharing it now.)
I like wrapping commonly used components; for example, I have previously written:
MongoDbCsharpHelper, a CRUD helper class based on the official MongoDB C# driver
A reusable RabbitMQ ConnectionPool based on the RabbitMQ.Client component
