大多數情況下,我們的搜索一般用的是sql的模糊搜索,但是這個模糊搜索,總是不夠精確,而且總達不到我們的要求,於是乎,偶專門上網找了一些資料,研究了一下,現在比較流行的Lucene.net,感覺還蠻不錯的,搜索效果也蠻好的,再配合盤古分詞,感覺超酷.
嗯,我就動手嘗試了一下類似百度的多模塊搜索,感覺蠻好玩的.
網上一般都只做了一個搜索,借鑒他們的代碼,我在這里專門設計怎么做多個模塊搜索,做完可以考慮,怎么把這些內容整合搜索!
網上介紹lucene.net也蠻多了,這里就說一下,我做lucene.net的心得,lucene.net一般做出一個搜索比較容易,但是涉及到多個索引比較麻煩,這里我就專門看了一些資料寫了一個多模塊的搜索,正在看怎么把多種索引組合起來,能夠綜合性搜索,主要實現了2個模塊,新聞和工藝知識,
進入正題:
做站內搜索主要涉及以下幾個技術:
多線程技術,
Lucene.net,
盤古分詞.
Lucene.net實際上就是把數據建立一個索引庫保存起來,然后以后就像翻書一樣搜索.
盤古分詞就是專門為了把一段話分詞,比如李明去吃飯.盤古分詞就會拆分出關鍵詞,李明 吃飯,這樣,就可以到索引庫去查找這兩個詞.
多線程為了保證讓系統自動去索引我們寫入的文章或者工藝知識等內容,每次做增刪改系統就自動去更新索引庫.
這里主要涉及以下幾個dll,
其中還包括盤古分詞的高亮顯示和盤古分詞的配置文件.
主要代碼:
IndexJobItem類,
這個類定義了Lucene.net索引任務的子任務信息
1: using System;
2: using System.Data;
3: using System.Configuration;
4: using System.Linq;
5: using System.Web;
6: using System.Web.Security;
7: using System.Web.UI;
8: using System.Web.UI.HtmlControls;
9: using System.Web.UI.WebControls;
10: using System.Web.UI.WebControls.WebParts;
11: using System.Xml.Linq;
12:
/// <summary>
/// Describes a single pending index operation: which record (<see cref="Id"/>)
/// must be added to or deleted from the index.
/// </summary>
public class IndexJobItem
{
    /// <summary>
    /// Kind of index operation to perform.
    /// </summary>
    public enum JobType
    {
        Delete, Add
    }

    /// <summary>Operation to perform for this job.</summary>
    public JobType ItemType { get; set; }

    /// <summary>Secondary identifier; it is part of equality and of <see cref="ToString"/>.</summary>
    public long ThreadId { get; set; }

    /// <summary>Primary key of the record the job refers to.</summary>
    public int Id { get; set; }

    /// <summary>
    /// Two jobs are equal when they describe the same operation on the same record.
    /// <see cref="Id"/> takes part in the comparison so that jobs for different
    /// records are never treated as duplicates (e.g. by <c>List.Remove</c>).
    /// </summary>
    /// <param name="obj">Object to compare with.</param>
    /// <returns><c>true</c> when <paramref name="obj"/> is an equal <see cref="IndexJobItem"/>.</returns>
    public override bool Equals(object obj)
    {
        IndexJobItem item = obj as IndexJobItem;
        if (item == null)
        {
            return false;
        }
        return this.ItemType == item.ItemType
            && this.ThreadId == item.ThreadId
            && this.Id == item.Id;
    }

    /// <summary>
    /// Hash code built from exactly the members compared in <see cref="Equals"/>,
    /// so equal jobs always produce equal hash codes.
    /// </summary>
    public override int GetHashCode()
    {
        unchecked
        {
            int hash = 17;
            hash = hash * 31 + ItemType.GetHashCode();
            hash = hash * 31 + ThreadId.GetHashCode();
            hash = hash * 31 + Id.GetHashCode();
            return hash;
        }
    }

    /// <summary>
    /// Returns "<c>ItemType:ThreadId</c>".
    /// </summary>
    public override string ToString()
    {
        return ItemType + ":" + ThreadId;
    }

    public IndexJobItem()
    {
    }
}
IndexManager類
這個類用專門的線程進行索引操作
1: using System;
2: using System.Data;
3: using System.Configuration;
4: using System.Linq;
5: using System.Web;
6: using System.Web.Security;
7: using System.Web.UI;
8: using System.Web.UI.HtmlControls;
9: using System.Web.UI.WebControls;
10: using System.Web.UI.WebControls.WebParts;
11: using System.Xml.Linq;
12: using log4net;
13: using System.Web.Hosting;
14: using Lucene.Net.Store;
15: using Lucene.Net.Index;
16: using System.IO;
17: using Lucene.Net.Analysis.PanGu;
18: using System.Net;
19: using czcraft.BLL;
20: using mshtml;
21: using czcraft;
22: using System.Text;
23: using Lucene.Net.Documents;
24: using System.Text.RegularExpressions;
25: using Quartz.Collection;
26: using System.Collections.Generic;
27: using System.Threading;
28:
/// <summary>
/// Singleton that owns all writes to the Lucene index directories. Callers queue
/// add/delete jobs; a single background thread periodically drains the queue and
/// applies the jobs, so only one writer ever touches an index at a time.
/// Obtain the instance through <see cref="Instance"/> or <see cref="GetInstance"/>.
/// </summary>
public class IndexManager
{
    // Singleton instance.
    public readonly static IndexManager Instance = new IndexManager();

    /// <summary>
    /// Set by <see cref="Stop"/>; read by the scan thread. Guarded by <c>syncRoot</c>.
    /// </summary>
    private bool IsStopped;

    /// <summary>
    /// Searchable modules; each one has its own index directory named after the member.
    /// </summary>
    public enum JobSearchType
    {
        Product, News, Knowledge
    }

    /// <summary>
    /// Module that subsequently queued jobs belong to.
    /// NOTE(review): this is shared state on a singleton, so two concurrent callers
    /// of <see cref="GetInstance"/> can still overwrite each other's value between
    /// GetInstance and AddJob/RemoveJob; fixing that needs an API that passes the
    /// module together with the job.
    /// </summary>
    public JobSearchType jobSearchType { get; set; }

    /// <summary>Guards <c>jobs</c>, <c>IsStopped</c> and <c>scanThread</c>.</summary>
    private readonly object syncRoot = new object();

    /// <summary>
    /// Pending jobs grouped by the module that was selected when each job was queued,
    /// so a job is always written to the index it was meant for.
    /// </summary>
    private readonly Dictionary<JobSearchType, List<IndexJobItem>> jobs = new Dictionary<JobSearchType, List<IndexJobItem>>();

    /// <summary>The running scan thread, or null when none is running.</summary>
    private Thread scanThread;

    private static ILog log = LogManager.GetLogger(typeof(IndexManager));

    /// <summary>
    /// Private: every index modification must go through this one instance.
    /// </summary>
    private IndexManager()
    {
    }

    /// <summary>
    /// Starts the background scan thread. Calling it while a scan thread is already
    /// running only clears the stop flag; it never starts a second writer thread.
    /// </summary>
    public void Start()
    {
        lock (syncRoot)
        {
            IsStopped = false;
            if (scanThread != null && scanThread.IsAlive)
            {
                return;
            }
            scanThread = new Thread(ScanThread);
            // Background thread: must not keep the process alive on shutdown.
            scanThread.IsBackground = true;
            scanThread.Start();
        }
    }

    /// <summary>
    /// Asks the scan thread to exit after its current iteration.
    /// </summary>
    public void Stop()
    {
        lock (syncRoot)
        {
            IsStopped = true;
        }
    }

    /// <summary>
    /// Scan loop: sleeps, drains the pending jobs and indexes them module by module.
    /// </summary>
    private void ScanThread()
    {
        while (true)
        {
            // The exit decision and the clearing of scanThread happen atomically,
            // so a concurrent Start() either keeps this thread running or sees null
            // and creates a new one.
            lock (syncRoot)
            {
                if (IsStopped)
                {
                    scanThread = null;
                    return;
                }
            }

            // Sleep 5 seconds to accumulate as many jobs as possible.
            Thread.Sleep(5000);

            Dictionary<JobSearchType, List<IndexJobItem>> pending = TakePendingJobs();
            if (pending.Count <= 0)
            {
                log.Debug("沒有任務,繼續線程等待");
                Thread.Sleep(10 * 1000);
                continue;
            }

            foreach (KeyValuePair<JobSearchType, List<IndexJobItem>> batch in pending)
            {
                try
                {
                    IndexBatch(batch.Key, batch.Value);
                }
                catch (Exception ex)
                {
                    // An unhandled exception here would terminate the worker
                    // (and, on a background thread, the whole process).
                    log.Error("索引庫" + batch.Key + "處理失敗", ex);
                }
            }
        }
    }

    /// <summary>
    /// Atomically removes and returns all pending jobs.
    /// </summary>
    private Dictionary<JobSearchType, List<IndexJobItem>> TakePendingJobs()
    {
        lock (syncRoot)
        {
            Dictionary<JobSearchType, List<IndexJobItem>> taken = new Dictionary<JobSearchType, List<IndexJobItem>>(jobs);
            jobs.Clear();
            return taken;
        }
    }

    /// <summary>
    /// Opens the index directory of one module, applies a batch of jobs and closes
    /// the writer again. The writer is opened and closed per batch because closing
    /// is what commits the written data to the index.
    /// </summary>
    /// <param name="searchType">Module whose index is updated.</param>
    /// <param name="batch">Jobs to apply, in queue order.</param>
    private void IndexBatch(JobSearchType searchType, List<IndexJobItem> batch)
    {
        string indexPath = System.IO.Path.Combine(HostingEnvironment.ApplicationPhysicalPath, ConfigurationManager.AppSettings["path"] + @"\" + searchType.ToString());
        FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
        try
        {
            bool isUpdate = IndexReader.IndexExists(directory);
            log.Debug("索引庫是否存在:" + isUpdate);
            // A stale lock (e.g. the process died while indexing) must be released first.
            if (isUpdate && IndexWriter.IsLocked(directory))
            {
                log.Debug("開始解鎖索引庫");
                IndexWriter.Unlock(directory);
                log.Debug("解鎖庫完成");
            }

            IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
            try
            {
                ProcessJob(writer, searchType, batch);
            }
            finally
            {
                // Always close, otherwise the write lock stays on the directory.
                writer.Close();
            }
        }
        finally
        {
            directory.Close();
        }
        log.Debug("全部索引完畢");
    }

    /// <summary>
    /// Applies every job of a batch to the open writer.
    /// </summary>
    /// <param name="writer">Open writer for the module's index.</param>
    /// <param name="searchType">Module the jobs belong to.</param>
    /// <param name="batch">Jobs to apply.</param>
    private void ProcessJob(IndexWriter writer, JobSearchType searchType, List<IndexJobItem> batch)
    {
        foreach (IndexJobItem job in batch)
        {
            try
            {
                // Delete any existing document for this id first; this both performs
                // "Delete" jobs and avoids duplicates for "Add" jobs.
                writer.DeleteDocuments(new Term("number", job.Id.ToString()));

                if (job.ItemType != IndexJobItem.JobType.Add)
                {
                    continue;
                }

                Document document;
                string TypeName;
                switch (searchType)
                {
                    case JobSearchType.Knowledge:
                        document = AddDocumentBycraftknowledge(job);
                        TypeName = "工藝知識";
                        break;
                    case JobSearchType.News:
                        document = AddDocumentByNews(job);
                        TypeName = "新聞";
                        break;
                    case JobSearchType.Product:
                        document = AddDocumentByProduct(job);
                        TypeName = "商品";
                        break;
                    default:
                        log.Debug("未設置JobSearchType屬性,無法索引");
                        return;
                }

                // null means there is nothing to index (record gone, or the
                // module has no document builder yet).
                if (document == null)
                {
                    log.Debug("索引" + TypeName + ":" + job.Id + "已跳過,沒有可索引的內容");
                    continue;
                }

                writer.AddDocument(document);
                log.Debug("索引" + TypeName + ":" + job.Id + "完成!");
            }
            catch (Exception ex)
            {
                // One bad record must not discard the rest of the batch.
                log.Error("索引任務" + job.Id + "處理失敗", ex);
            }
        }
    }

    /// <summary>
    /// Builds the index document for a product. Not implemented yet.
    /// </summary>
    /// <param name="job">Job identifying the product.</param>
    /// <returns>Always null, which makes the caller skip the job.</returns>
    public Document AddDocumentByProduct(IndexJobItem job)
    {
        return null;
    }

    /// <summary>
    /// Builds the index document for a news article.
    /// </summary>
    /// <param name="job">Job identifying the article.</param>
    /// <returns>The document, or null when the article no longer exists.</returns>
    public Document AddDocumentByNews(IndexJobItem job)
    {
        var news = new newsBLL().Get(job.Id);
        // The record may have been deleted right after it was queued.
        if (news == null)
        {
            return null;
        }
        return BuildArticleDocument(job.Id, news.Title, news.Content, news.ArticleHtmlUrl);
    }

    /// <summary>
    /// Builds the index document for a craft-knowledge article.
    /// </summary>
    /// <param name="job">Job identifying the article.</param>
    /// <returns>The document, or null when the article no longer exists.</returns>
    public Document AddDocumentBycraftknowledge(IndexJobItem job)
    {
        var craftknowledge = new craftknowledgeBLL().Get(job.Id);
        // The record may have been deleted right after it was queued.
        if (craftknowledge == null)
        {
            return null;
        }
        return BuildArticleDocument(job.Id, craftknowledge.Title, craftknowledge.Content, craftknowledge.ArticleHtmlUrl);
    }

    /// <summary>
    /// Creates the Lucene document shared by all article-like modules.
    /// </summary>
    /// <param name="id">Record id, stored un-analyzed in the "number" field.</param>
    /// <param name="title">Article title (stored and analyzed).</param>
    /// <param name="content">Article HTML; tags are stripped before indexing.</param>
    /// <param name="articleHtmlUrl">URL of the article page (stored, not analyzed).</param>
    private static Document BuildArticleDocument(int id, string title, string content, string articleHtmlUrl)
    {
        // Strip the markup so that only the text is indexed.
        string body = Common.Tools.HtmlToTxt(content);
        Document document = new Document();
        document.Add(new Field("number", id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.Add(new Field("ArticleHtmlUrl", articleHtmlUrl, Field.Store.YES, Field.Index.NOT_ANALYZED));
        // The following fields are tokenized and searchable.
        document.Add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
        document.Add(new Field("body", body, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));
        return document;
    }

    /// <summary>
    /// Queues an "add/refresh" job for the currently selected module.
    /// </summary>
    /// <param name="Id">Id of the record to index.</param>
    public void AddJob(int Id)
    {
        IndexJobItem job = new IndexJobItem();
        job.Id = Id;
        job.ItemType = IndexJobItem.JobType.Add;
        log.Debug(Id + "加入到任務列表中");
        Enqueue(job);
    }

    /// <summary>
    /// Queues a "delete" job for the currently selected module.
    /// </summary>
    /// <param name="Id">Id of the record to remove from the index.</param>
    public void RemoveJob(int Id)
    {
        IndexJobItem job = new IndexJobItem();
        job.Id = Id;
        job.ItemType = IndexJobItem.JobType.Delete;
        log.Debug(Id + "加入刪除任務列表");
        Enqueue(job);
    }

    /// <summary>
    /// Adds a job to the queue of the module selected at this moment.
    /// </summary>
    private void Enqueue(IndexJobItem job)
    {
        lock (syncRoot)
        {
            JobSearchType searchType = jobSearchType;
            List<IndexJobItem> queue;
            if (!jobs.TryGetValue(searchType, out queue))
            {
                queue = new List<IndexJobItem>();
                jobs[searchType] = queue;
            }
            queue.Add(job);
        }
    }

    /// <summary>
    /// Selects the module for subsequently queued jobs and returns the singleton.
    /// </summary>
    /// <param name="jobType">Module the next jobs belong to.</param>
    /// <returns>The singleton instance.</returns>
    public static IndexManager GetInstance(JobSearchType jobType)
    {
        Instance.jobSearchType = jobType;
        return Instance;
    }
}
接下來在全局配置文件中開啟線程
在webconfig配置索引目錄
在這里一個很重要的問題就是,我的商品,工藝知識,和新聞是根據一個枚舉來判斷到底是給哪個進行索引,
索引的目錄也是動態的,根據枚舉判斷的
索引的目錄
搜索BLL
1: using System;
2: using System.Collections.Generic;
3: using System.Linq;
4: using System.Web;
5: using Lucene.Net.Store;
6: using System.IO;
7: using Lucene.Net.Index;
8: using Lucene.Net.Analysis.PanGu;
9: using System.Net;
10: using Lucene.Net.Documents;
11: using log4net;
12: using Lucene.Net.Search;
13: using System.Text;
14: using mshtml;
15: using PanGu;
16: using System.Xml.Linq;
17: using System.Text.RegularExpressions;
18: using czcraft.BLL;
19: using czcraft.Model;
20: using System.Collections;
21: using System.Web.Hosting;
22: using System.Configuration;
23:
namespace czcraft.BLL
{
    /// <summary>
    /// Full-text search over the per-module Lucene indexes.
    /// </summary>
    public partial class SearchBLL
    {
        private ILog logger = LogManager.GetLogger(typeof(SearchBLL));

        /// <summary>
        /// Searches the "body" field of one module's index for the given keywords.
        /// </summary>
        /// <param name="kw">Keywords entered by the user.</param>
        /// <param name="startIndex">Zero-based index of the first hit to return.</param>
        /// <param name="pageSize">Maximum number of hits to return.</param>
        /// <param name="totalCount">Receives the total number of hits.</param>
        /// <param name="Type">Module to search; its name selects the index directory.</param>
        /// <returns>The requested page of results (empty when nothing matches).</returns>
        public IEnumerable<SearchResult> Search(string kw, int startIndex, int pageSize, out int totalCount, SearchSum.searchType Type)
        {
            List<SearchResult> listResult = new List<SearchResult>();
            totalCount = 0;
            // Nothing to search for: an empty phrase query matches nothing anyway.
            if (string.IsNullOrEmpty(kw))
            {
                return listResult;
            }

            string indexPath = System.IO.Path.Combine(HostingEnvironment.ApplicationPhysicalPath, ConfigurationManager.AppSettings["path"] + @"\" + Type.ToString());
            FSDirectory directory = null;
            IndexReader reader = null;
            IndexSearcher searcher = null;
            try
            {
                directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
                reader = IndexReader.Open(directory, true);
                searcher = new IndexSearcher(reader);

                // Every word produced by the splitter becomes one term of the phrase.
                PhraseQuery query = new PhraseQuery();
                foreach (string word in CommonHelper.SplitWord(kw))
                {
                    query.Add(new Term("body", word));
                }
                // Allow the terms to be up to 50 positions apart.
                query.SetSlop(50);

                TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                searcher.Search(query, null, collector);
                totalCount = collector.GetTotalHits();
                ScoreDoc[] docs = collector.TopDocs(startIndex, pageSize).scoreDocs;

                for (int i = 0; i < docs.Length; i++)
                {
                    // A hit only carries Lucene's internal document number;
                    // the stored fields are loaded on demand to keep memory low.
                    Document doc = searcher.Doc(docs[i].doc);

                    SearchResult result = new SearchResult();
                    result.Number = doc.Get("number");
                    result.Title = doc.Get("title");
                    result.BodyPreview = Preview(doc.Get("body"), kw);
                    result.ArticleHtmlUrl = doc.Get("ArticleHtmlUrl");
                    listResult.Add(result);
                }
                return listResult;
            }
            finally
            {
                // Release the index file handles; a searcher built on a reader
                // does not close that reader itself.
                if (searcher != null)
                {
                    searcher.Close();
                }
                if (reader != null)
                {
                    reader.Close();
                }
                if (directory != null)
                {
                    directory.Close();
                }
            }
        }

        /// <summary>
        /// Produces the highlighted excerpt for one hit.
        /// </summary>
        /// <param name="body">Full text of the article.</param>
        /// <param name="keyword">Keywords to highlight.</param>
        /// <returns>Best matching fragment with the keywords wrapped in red font tags.</returns>
        private static string Preview(string body, string keyword)
        {
            // Formatter: prefix and suffix placed around each highlighted word.
            PanGu.HighLight.SimpleHTMLFormatter simpleHTMLFormatter =
                new PanGu.HighLight.SimpleHTMLFormatter("<font color=\"red\">", "</font>");
            // Highlighter built from the formatter and a PanGu segmenter.
            PanGu.HighLight.Highlighter highlighter =
                new PanGu.HighLight.Highlighter(simpleHTMLFormatter,
                new Segment());
            // Number of characters per excerpt fragment.
            highlighter.FragmentSize = 100;
            // Best matching fragment.
            String bodyPreview = highlighter.GetBestFragment(keyword, body);
            return bodyPreview;
        }
    }
}
搜索也根據搜索類別枚舉動態判斷搜索類別!
前端頁面設計:
<%@ Page Language="C#" MasterPageFile="~/Top_Down.master" AutoEventWireup="true"
    CodeFile="SearchKnowledge.aspx.cs" Inherits="Search_SearchKnowledge" Title="找找看-工藝知識" %>

<%-- Search page for the craft-knowledge module: search box, hot words, results, pager. --%>
<asp:Content ID="Content1" ContentPlaceHolderID="head" runat="Server">
    <link href="../css/baidu.css" rel="stylesheet" type="text/css" />
    <link href="../css/other.css" rel="stylesheet" type="text/css" />
    <link href="../css/ui-lightness/jquery-ui-1.8.2.custom.css" rel="stylesheet" type="text/css" />
    <link href="../css/Pager.css" rel="stylesheet" type="text/css" />
    <link href="../css/Search.css" rel="stylesheet" type="text/css" />
    <script src="../Admin/scripts/jquery-1.7.1.min.js" type="text/javascript"></script>

    <script src="../js/jquery-ui-1.8.2.custom.min.js" type="text/javascript"></script>

    <script type="text/javascript">

        $(function () {
            $("#kw").autocomplete(
            { source: "Data/SearchSuggestion.ashx",
                select: function (event, ui) { $("#kw").val(ui.item.value); $("#form1").submit(); }
            });
        });
    </script>


</asp:Content>
<asp:Content ID="Content2" ContentPlaceHolderID="ContentPlaceHolder1" runat="Server">
    <div class="content">
        <div class="left_side">
            <div class="logo_bottom">
            </div>
        </div>
        <div class="gjss_load">
            <h4>
                找找看</h4>
            <span>當前位置:<a href="#">首頁</a> > <a href="#">找找看</a></span>
        </div>
        <div class="gjss">
            <div class="gjss_top">
            </div>
            <div class="gjss_c">
                <table width="804">
                    <tr>
                        <td colspan="7" align="center">
                            <label id="lbNews" style="margin-left:260px" class="tab"><a href="SearchNews.aspx">新聞</a></label> <label id="lbKnowledge" style="margin-left:50px" class="tab"><a href="SearchKnowledge.aspx">工藝知識</a></label><label id="lbProduct" style="margin-left:50px" class="tab"><a href="#">商品</a></label>

                        </td>
                    </tr>
                    <tr>
                        <td class="style1">
                            <div id="m" align="center">
                                <div id="fm">
                                    <form name="form1">
                                    <span class="s_ipt_wr" style="float: left">
                                        <%-- The keyword comes straight from the request, so it is HTML-encoded
                                             and placed in a double-quoted attribute to prevent script injection. --%>
                                        <input id="kw" class="s_ipt" name="kw" maxlength="100" value="<%= Server.HtmlEncode(Request["kw"]) %>" />
                                    </span><span class="s_btn_wr">
                                        <input id="su" class="s_btn" onmouseout="this.className='s_btn'" onmousedown="this.className='s_btn s_btn_h'"
                                            value="找找看" type="submit" /></span></form>
                                </div>
                            </div>
                        </td>
                    </tr>
                    <tr>
                        <td colspan="7" align="center" class="style1">
                            <div style="text-align: center">
                                <ul id="hotwordsUL" class="hotWords">
                                    <asp:Repeater ID="repeaterHotWords" runat="server">
                                        <ItemTemplate>
                                            <%-- Hot words are past user input: URL-encode in the link, HTML-encode as text. --%>
                                            <li><a href='SearchKnowledge.aspx?kw=<%# Server.UrlEncode(Convert.ToString(Eval("KeyWord"))) %>'>
                                                <%# Server.HtmlEncode(Convert.ToString(Eval("KeyWord"))) %>
                                            </a></li>
                                        </ItemTemplate>
                                    </asp:Repeater>
                                </ul>
                            </div>
                        </td>
                    </tr>
                    <tr>
                        <td colspan="7" align="center">
                            <br />
                            <ul id="ulResult" class="hotWords">
                                <asp:Repeater EnableViewState="false" ID="repeaterResult" runat="server">
                                    <ItemTemplate>
                                        <li><span><%--<a href='../CraftKnowledge/ViewCraftKnowledge.aspx?KnowledgeId=<%#Eval("Number") %>'>--%>
                                            <a href='<%#Eval("ArticleHtmlUrl") %>'>
                                                <%#Eval("Title") %></a></span>
                                            <br />
                                            <%-- BodyPreview is emitted raw on purpose: it carries the highlight markup. --%>
                                            <span> <%#Eval("BodyPreview")%></span>
                                        </li>
                                    </ItemTemplate>
                                </asp:Repeater>
                            </ul>
                            <br />
                            <div class="pager">
                                <%=PageHtml%>
                            </div>

                        </td>
                    </tr>
                </table>
            </div>
        </div>
    </div>
</asp:Content>
后台代碼:
1: using System;
2: using System.Collections;
3: using System.Configuration;
4: using System.Data;
5: using System.Linq;
6: using System.Web;
7: using System.Web.Security;
8: using System.Web.UI;
9: using System.Web.UI.HtmlControls;
10: using System.Web.UI.WebControls;
11: using System.Web.UI.WebControls.WebParts;
12: using System.Xml.Linq;
13: using czcraft;
14: using czcraft.BLL;
15: using Common;
16: using czcraft.Model;
17: using System.Collections.Generic;
18:
/// <summary>
/// Code-behind of the craft-knowledge search page: loads the hot words, records
/// the query, runs the search and renders the pager.
/// </summary>
public partial class Search_SearchKnowledge : System.Web.UI.Page
{
    /// <summary>Number of results shown per page.</summary>
    private const int ResultsPerPage = 10;

    /// <summary>Rendered pager HTML, emitted by the page markup.</summary>
    public string PageHtml { get; private set; }

    protected void Page_Load(object sender, EventArgs e)
    {
        // Hot words are shown whether or not a search is performed.
        repeaterHotWords.DataSource = new SearchInfoBLL().GetHotWords(SearchSum.searchType.Knowledge);
        repeaterHotWords.DataBind();

        // No (valid) keyword: first visit, show only the search box.
        string kw = Request["kw"];
        if (!Tools.IsValidInput(ref kw, true) || string.IsNullOrEmpty(kw))
        {
            return;
        }

        // Record the query in the database.
        SearchInfo kwLog = new SearchInfo();
        kwLog.KeyWord = kw;
        kwLog.DateTime = DateTime.Now;
        kwLog.Ip = Request.UserHostAddress;
        // Store the numeric value of the enum member (explicit cast instead of GetHashCode).
        kwLog.SearchType = ((int)SearchSum.searchType.Knowledge).ToString();
        new SearchInfoBLL().AddNew(kwLog);

        var pager = new Common.RupengPager();
        pager.UrlFormat = "SearchKnowledge.aspx?pagenum={n}&kw=" + Server.UrlEncode(kw);
        pager.PageSize = ResultsPerPage;
        // Parse the requested page number.
        pager.TryParseCurrentPageIndex(Request["pagenum"]);
        int startRowIndex = (pager.CurrentPageIndex - 1) * pager.PageSize;

        // Use the pager's page size so the pager and the query can never disagree.
        int totalCount;
        IEnumerable<SearchResult> result = new SearchBLL().Search(kw, startRowIndex, pager.PageSize, out totalCount, SearchSum.searchType.Knowledge);
        pager.TotalCount = totalCount;
        PageHtml = pager.Render();

        repeaterResult.DataSource = result;
        repeaterResult.DataBind();
    }
}
我們還可以再做做當前熱點,這里就不詳細粘貼代碼了
效果圖:
這里復制了好多重復數據,不是程序問題………………
哈哈,一個站內搜索給網站增添了不少亮點,
這個搜索框當然是copy的百度的,哈哈