貝葉斯算法可以用來做拼寫檢查、文本分類、垃圾郵件過濾等工作,前面我們用貝葉斯做了文本分類,這次用它來做拼寫檢查,參考:How to Write a Spelling Corrector
拼寫檢查器的原理
給定一個單詞, 我們的任務是選擇和它最相似的拼寫正確的單詞.
對應的貝葉斯問題就是, 給定一個詞 w, 在所有正確的拼寫詞中, 我們想要找一個正確的詞 c, 使得對於 w 的條件概率最大, 也就是說:
argmax c P( c| w)
按照貝葉斯理論上面的式子等價於:
argmax c P( w| c) P( c) / P( w)
因為用戶可以輸錯任何詞, 因此對於任何 c 來講, 出現 w 的概率 P(w) 都是一樣的, 從而我們在上式中忽略它, 寫成:
argmax c P( w| c) P( c)
其中 P(w|c) 用編輯距離來近似(編輯距離越小, P(w|c) 越大), P(c) 是詞 c 本身出現的先驗概率; 因此求 argmax c P(w|c) P(c), 就是在編輯距離最小的候選詞中選出 P(c) 最大的詞
其中編輯距離:兩個詞之間的編輯距離定義為使用了幾次插入(在詞中插入一個單字母), 刪除(刪除一個單字母), 交換(交換相鄰兩個字母), 替換(把一個字母換成另一個)的操作從一個詞變到另一個詞.
一般情況下,編輯距離為2時已經可以覆蓋大部分情況
計算先驗概率P(c)
為了盡量覆蓋較多的詞語,首先從詞典中讀入常見的英文單詞

然后,從訓練語料(訓練語料在此下載 big.txt)訓練我們的詞典(語言模型,得到詞語概率,出現頻率越高的詞語越常見)

/// <summary>
/// Trains the word-frequency dictionary from a plain-text corpus.
/// Each line is lower-cased, apostrophes are replaced by spaces, and every run of
/// the letters a-z is counted as one occurrence of a word.
/// </summary>
/// <param name="trainingFile">Path of the corpus file to read.</param>
/// <param name="ht">
/// Dictionary updated in place: unseen words are added with a count of 1,
/// words already present have their count incremented.
/// </param>
public static void TrainDic(string trainingFile, Dictionary<string, int> ht)
{
    Regex regex = new Regex(@"[a-z]+"); // matches a single word

    // "using" releases the file handle even when reading or matching throws.
    using (StreamReader reader = new StreamReader(trainingFile))
    {
        string sLine; // holds the current line of the corpus
        while ((sLine = reader.ReadLine()) != null)
        {
            // Invariant lower-casing keeps the a-z pattern working regardless of the
            // current culture (e.g. the Turkish dotless i).
            sLine = sLine.ToLowerInvariant().Replace("'", " ");

            foreach (Match match in regex.Matches(sLine))
            {
                string word = match.Value;
                int seen;
                // A missing key leaves "seen" at 0, so one assignment covers both the
                // first sighting and every later one with a single lookup.
                ht.TryGetValue(word, out seen);
                ht[word] = seen + 1;
            }
        }
    }
}
為了復用,可以將訓練后的詞典保存起來
// Serialize the trained dictionary as one "word<TAB>count" entry per line
// and write it to the cache file so later runs can reload it.
StringBuilder serialized = new StringBuilder();
foreach (var entry in Dic)
{
    serialized.Append(entry.Key).Append("\t").Append(entry.Value).AppendLine();
}
File.WriteAllText(dicFile, serialized.ToString());
獲取建議詞語
我們定義優先級: 編輯距離為1的詞語優先於編輯距離為2的詞語
首先,找到編輯距離為1的詞語

/// <summary>
/// Generates every distinct string that is one edit away from <paramref name="word"/>.
/// The edits are, in generation order: deleting one character, swapping two adjacent
/// characters, replacing one character with a different letter a-z, and inserting one
/// letter a-z at any position.
/// </summary>
/// <param name="word">The word to derive candidates from.</param>
/// <returns>The candidates in generation order, without duplicates.</returns>
public static List<string> GetEdits1(string word)
{
    int n = word.Length;
    List<string> editsWords = new List<string>();
    // The set answers "already generated?" in constant time; the list alone would make
    // every check a linear scan. The list still carries the generation order.
    HashSet<string> seen = new HashSet<string>();
    string tempWord;

    // Deletion of one character.
    for (int i = 0; i < n; i++)
    {
        tempWord = word.Substring(0, i) + word.Substring(i + 1);
        if (seen.Add(tempWord))
        {
            editsWords.Add(tempWord);
        }
    }

    // Transposition of two adjacent characters.
    for (int i = 0; i < n - 1; i++)
    {
        tempWord = word.Substring(0, i) + word[i + 1] + word[i] + word.Substring(i + 2);
        if (seen.Add(tempWord))
        {
            editsWords.Add(tempWord);
        }
    }

    // Replacement of one character by a different letter.
    for (int i = 0; i < n; i++)
    {
        char current = word[i];
        for (char ch = 'a'; ch <= 'z'; ch++)
        {
            if (ch == current)
            {
                continue; // replacing a letter with itself is not an edit
            }

            tempWord = word.Substring(0, i) + ch + word.Substring(i + 1);
            if (seen.Add(tempWord))
            {
                editsWords.Add(tempWord);
            }
        }
    }

    // Insertion of one letter; i == n appends after the last character.
    for (int i = 0; i <= n; i++)
    {
        for (char ch = 'a'; ch <= 'z'; ch++)
        {
            tempWord = word.Substring(0, i) + ch + word.Substring(i);
            if (seen.Add(tempWord))
            {
                editsWords.Add(tempWord);
            }
        }
    }

    return editsWords;
}
如果編輯距離為1的詞語中沒有正確的詞語,則繼續尋找編輯距離為2的詞語;為了控制規模,只選取正確的詞語

/// <summary>
/// Collects the dictionary words that can be reached from <paramref name="word"/>
/// by applying the single-edit step (GetEdits1) twice.
/// </summary>
/// <remarks>
/// Only strings that are keys of Dic are returned, and each one appears once.
/// The result is no longer seeded with the raw distance-1 strings: those are not
/// checked against the dictionary, so they would surface as bogus candidates, and a
/// word reachable through several edit paths would otherwise be listed repeatedly.
/// The elapsed time in milliseconds is written to the console.
/// </remarks>
/// <param name="word">The (presumably misspelled) word.</param>
/// <returns>Distinct dictionary words in the order they were first reached; may be empty.</returns>
public static List<string> GetEdits2(string word)
{
    Stopwatch watch = new Stopwatch();
    watch.Start();

    List<string> result = new List<string>();
    HashSet<string> seen = new HashSet<string>();

    foreach (string edit in GetEdits1(word))
    {
        foreach (string candidate in GetEdits1(edit))
        {
            // seen.Add returns false for a word that was already collected.
            if (Dic.ContainsKey(candidate) && seen.Add(candidate))
            {
                result.Add(candidate);
            }
        }
    }

    watch.Stop();
    Console.WriteLine(watch.ElapsedMilliseconds);
    return result;
}
最后是獲取建議詞語的代碼,最后的結果按照概率大小倒排序,取前5個

/// <summary>
/// Returns up to five spelling suggestions for <paramref name="word"/>.
/// Dictionary words one edit away take priority; only when there are none are the
/// words from GetEdits2 considered. If neither step yields a dictionary word, the
/// input itself is returned as the only suggestion.
/// </summary>
/// <param name="word">The word to correct.</param>
/// <returns>At most five distinct candidates, most frequent first.</returns>
public static List<string> GetSuggestWords(string word)
{
    List<string> result = GetEdits1(word).Where(w => Dic.ContainsKey(w)).ToList();
    if (result.Count == 0)
    {
        // Keep only real words and drop repeats, so the five slots are never filled
        // with non-words or with the same word several times.
        result = GetEdits2(word).Where(w => Dic.ContainsKey(w)).Distinct().ToList();
        if (result.Count == 0)
        {
            result.Add(word);
        }
    }

    // Rank by prior probability (corpus frequency). The fallback word is not in the
    // dictionary and is given a weight of 1. Take(5) copes with shorter lists itself.
    return result
        .OrderByDescending(w =>
        {
            int frequency;
            return Dic.TryGetValue(w, out frequency) ? frequency : 1;
        })
        .Take(5)
        .ToList();
}
測試代碼

// Word -> occurrence count; filled by LoadDic() or by LoadUSDic() followed by TrainDic().
static Dictionary<string, int> Dic;
// Cache file holding the trained dictionary, one "word<TAB>count" pair per line.
static string dicFile = "dic.txt";
// Corpus used to train the counts when the cache file does not exist yet.
static string trainingFile = "training.txt";

/// <summary>
/// Loads the cached dictionary (or trains and caches it on the first run), then reads
/// words from the console until "exit" is entered or the input ends, reporting for each
/// word either that it is correct or the suggested corrections.
/// </summary>
static void Main(string[] args)
{
    if (File.Exists(dicFile))
    {
        Console.WriteLine("加載詞典中...");
        LoadDic();
        Console.WriteLine("加載詞典完成");
    }
    else
    {
        Console.WriteLine("訓練詞典中...");
        Dic = LoadUSDic();
        TrainDic(trainingFile, Dic);

        StringBuilder dicBuilder = new StringBuilder();
        foreach (var item in Dic)
        {
            dicBuilder.AppendLine(item.Key + "\t" + item.Value);
        }
        File.WriteAllText(dicFile, dicBuilder.ToString());
        Console.WriteLine("訓練完成...");
    }

    Console.WriteLine("請輸入詞語...");
    string inputWord = Console.ReadLine();
    // ReadLine returns null once the input stream ends (redirected input, Ctrl+Z/Ctrl+D);
    // treat that like "exit" instead of dereferencing null.
    while (inputWord != null && !inputWord.Equals("exit"))
    {
        if (Dic.ContainsKey(inputWord))
        {
            Console.WriteLine("你輸入的詞語 【" + inputWord + "】 是正確的!");
        }
        else
        {
            var suggestWords = GetSuggestWords(inputWord);
            Console.WriteLine("候選詞語: ");
            foreach (var word in suggestWords)
            {
                Console.WriteLine("\t\t\t " + word);
            }
        }
        Console.WriteLine("請輸入詞語....");
        inputWord = Console.ReadLine();
    }
}

/// <summary>
/// Loads the cached dictionary from dicFile into Dic.
/// Lines that are empty, do not consist of exactly two tab-separated fields, or whose
/// second field is not an integer are skipped; a repeated word keeps its last count.
/// </summary>
public static void LoadDic()
{
    Dic = new Dictionary<string, int>();
    foreach (string line in File.ReadAllLines(dicFile))
    {
        if (line == "")
        {
            continue;
        }

        string[] dicItem = line.Split('\t');
        int frequency;
        if (dicItem.Length == 2 && int.TryParse(dicItem[1], out frequency))
        {
            // The indexer (rather than Add) keeps a hand-edited file with a repeated
            // word from aborting the whole load.
            Dic[dicItem[0]] = frequency;
        }
    }
}
運行效果
完整代碼

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions;
using System.Diagnostics;

namespace SpellCheck
{
    /// <summary>
    /// Console spelling corrector: candidates are generated by edit distance and ranked
    /// by how often each word occurred in the training corpus.
    /// </summary>
    class Program
    {
        // Word -> occurrence count; filled by LoadDic() or by LoadUSDic() followed by TrainDic().
        static Dictionary<string, int> Dic;
        // Cache file holding the trained dictionary, one "word<TAB>count" pair per line.
        static string dicFile = "dic.txt";
        // Corpus used to train the counts when the cache file does not exist yet.
        static string trainingFile = "training.txt";

        /// <summary>
        /// Loads the cached dictionary (or trains and caches it on the first run), then
        /// reads words from the console until "exit" is entered or the input ends,
        /// reporting for each word either that it is correct or the suggested corrections.
        /// </summary>
        static void Main(string[] args)
        {
            if (File.Exists(dicFile))
            {
                Console.WriteLine("加載詞典中...");
                LoadDic();
                Console.WriteLine("加載詞典完成");
            }
            else
            {
                Console.WriteLine("訓練詞典中...");
                Dic = LoadUSDic();
                TrainDic(trainingFile, Dic);

                StringBuilder dicBuilder = new StringBuilder();
                foreach (var item in Dic)
                {
                    dicBuilder.AppendLine(item.Key + "\t" + item.Value);
                }
                File.WriteAllText(dicFile, dicBuilder.ToString());
                Console.WriteLine("訓練完成...");
            }

            Console.WriteLine("請輸入詞語...");
            string inputWord = Console.ReadLine();
            // ReadLine returns null once the input stream ends (redirected input,
            // Ctrl+Z/Ctrl+D); treat that like "exit" instead of dereferencing null.
            while (inputWord != null && !inputWord.Equals("exit"))
            {
                if (Dic.ContainsKey(inputWord))
                {
                    Console.WriteLine("你輸入的詞語 【" + inputWord + "】 是正確的!");
                }
                else
                {
                    var suggestWords = GetSuggestWords(inputWord);
                    Console.WriteLine("候選詞語: ");
                    foreach (var word in suggestWords)
                    {
                        Console.WriteLine("\t\t\t " + word);
                    }
                }
                Console.WriteLine("請輸入詞語....");
                inputWord = Console.ReadLine();
            }
        }

        /// <summary>
        /// Loads the cached dictionary from dicFile into Dic.
        /// Lines that are empty, do not consist of exactly two tab-separated fields, or
        /// whose second field is not an integer are skipped; a repeated word keeps its
        /// last count.
        /// </summary>
        public static void LoadDic()
        {
            Dic = new Dictionary<string, int>();
            foreach (string line in File.ReadAllLines(dicFile))
            {
                if (line == "")
                {
                    continue;
                }

                string[] dicItem = line.Split('\t');
                int frequency;
                if (dicItem.Length == 2 && int.TryParse(dicItem[1], out frequency))
                {
                    // The indexer (rather than Add) keeps a file with a repeated word
                    // from aborting the whole load.
                    Dic[dicItem[0]] = frequency;
                }
            }
        }

        /// <summary>
        /// Trains the word-frequency dictionary from a plain-text corpus.
        /// Each line is lower-cased, apostrophes are replaced by spaces, and every run
        /// of the letters a-z is counted as one occurrence of a word.
        /// </summary>
        /// <param name="trainingFile">Path of the corpus file to read.</param>
        /// <param name="ht">
        /// Dictionary updated in place: unseen words are added with a count of 1,
        /// words already present have their count incremented.
        /// </param>
        public static void TrainDic(string trainingFile, Dictionary<string, int> ht)
        {
            Regex regex = new Regex(@"[a-z]+"); // matches a single word

            // "using" releases the file handle even when reading or matching throws.
            using (StreamReader reader = new StreamReader(trainingFile))
            {
                string sLine; // holds the current line of the corpus
                while ((sLine = reader.ReadLine()) != null)
                {
                    // Invariant lower-casing keeps the a-z pattern working regardless
                    // of the current culture (e.g. the Turkish dotless i).
                    sLine = sLine.ToLowerInvariant().Replace("'", " ");

                    foreach (Match match in regex.Matches(sLine))
                    {
                        string word = match.Value;
                        int seen;
                        // A missing key leaves "seen" at 0, so one assignment covers
                        // both the first sighting and every later one.
                        ht.TryGetValue(word, out seen);
                        ht[word] = seen + 1;
                    }
                }
            }
        }

        /// <summary>
        /// Reads the base word list from the file "en-US.dic". Every non-empty line
        /// after the "[Words]" marker contributes the text before its first '/' as a
        /// word with an initial count of 1.
        /// </summary>
        /// <returns>A dictionary mapping each listed word to 1.</returns>
        public static Dictionary<string, int> LoadUSDic()
        {
            var dic = new Dictionary<string, int>();
            bool inWordsSection = false;

            // Both the stream and the reader are disposed even if reading throws.
            using (FileStream fs = new FileStream("en-US.dic", FileMode.Open, FileAccess.Read, FileShare.Read))
            using (StreamReader sr = new StreamReader(fs, Encoding.UTF8))
            {
                string rawLine;
                while ((rawLine = sr.ReadLine()) != null)
                {
                    string tempLine = rawLine.Trim();
                    if (tempLine.Length == 0)
                    {
                        continue;
                    }

                    if (tempLine == "[Words]")
                    {
                        inWordsSection = true;
                        continue;
                    }

                    if (inWordsSection)
                    {
                        // Split the entry into its parts; only the part before '/' is
                        // the word. The indexer tolerates a word that is listed twice,
                        // where Add would throw.
                        string[] parts = tempLine.Split('/');
                        dic[parts[0]] = 1;
                    }
                }
            }

            return dic;
        }

        /// <summary>
        /// Generates every distinct string that is one edit away from
        /// <paramref name="word"/>. The edits are, in generation order: deleting one
        /// character, swapping two adjacent characters, replacing one character with a
        /// different letter a-z, and inserting one letter a-z at any position.
        /// </summary>
        /// <param name="word">The word to derive candidates from.</param>
        /// <returns>The candidates in generation order, without duplicates.</returns>
        public static List<string> GetEdits1(string word)
        {
            int n = word.Length;
            List<string> editsWords = new List<string>();
            // The set answers "already generated?" in constant time; the list alone
            // would make every check a linear scan. The list carries the order.
            HashSet<string> seen = new HashSet<string>();
            string tempWord;

            // Deletion of one character.
            for (int i = 0; i < n; i++)
            {
                tempWord = word.Substring(0, i) + word.Substring(i + 1);
                if (seen.Add(tempWord))
                {
                    editsWords.Add(tempWord);
                }
            }

            // Transposition of two adjacent characters.
            for (int i = 0; i < n - 1; i++)
            {
                tempWord = word.Substring(0, i) + word[i + 1] + word[i] + word.Substring(i + 2);
                if (seen.Add(tempWord))
                {
                    editsWords.Add(tempWord);
                }
            }

            // Replacement of one character by a different letter.
            for (int i = 0; i < n; i++)
            {
                char current = word[i];
                for (char ch = 'a'; ch <= 'z'; ch++)
                {
                    if (ch == current)
                    {
                        continue; // replacing a letter with itself is not an edit
                    }

                    tempWord = word.Substring(0, i) + ch + word.Substring(i + 1);
                    if (seen.Add(tempWord))
                    {
                        editsWords.Add(tempWord);
                    }
                }
            }

            // Insertion of one letter; i == n appends after the last character.
            for (int i = 0; i <= n; i++)
            {
                for (char ch = 'a'; ch <= 'z'; ch++)
                {
                    tempWord = word.Substring(0, i) + ch + word.Substring(i);
                    if (seen.Add(tempWord))
                    {
                        editsWords.Add(tempWord);
                    }
                }
            }

            return editsWords;
        }

        /// <summary>
        /// Collects the dictionary words that can be reached from
        /// <paramref name="word"/> by applying the single-edit step twice.
        /// </summary>
        /// <remarks>
        /// Only strings that are keys of Dic are returned, and each one appears once.
        /// The result is not seeded with the raw distance-1 strings: those are not
        /// checked against the dictionary, so they would surface as bogus candidates,
        /// and a word reachable through several edit paths would otherwise be listed
        /// repeatedly. The elapsed time in milliseconds is written to the console.
        /// </remarks>
        /// <param name="word">The (presumably misspelled) word.</param>
        /// <returns>Distinct dictionary words in the order first reached; may be empty.</returns>
        public static List<string> GetEdits2(string word)
        {
            Stopwatch watch = new Stopwatch();
            watch.Start();

            List<string> result = new List<string>();
            HashSet<string> seen = new HashSet<string>();

            foreach (string edit in GetEdits1(word))
            {
                foreach (string candidate in GetEdits1(edit))
                {
                    // seen.Add returns false for a word that was already collected.
                    if (Dic.ContainsKey(candidate) && seen.Add(candidate))
                    {
                        result.Add(candidate);
                    }
                }
            }

            watch.Stop();
            Console.WriteLine(watch.ElapsedMilliseconds);
            return result;
        }

        /// <summary>
        /// Returns up to five spelling suggestions for <paramref name="word"/>.
        /// Dictionary words one edit away take priority; only when there are none are
        /// the words from GetEdits2 considered. If neither step yields a dictionary
        /// word, the input itself is returned as the only suggestion.
        /// </summary>
        /// <param name="word">The word to correct.</param>
        /// <returns>At most five distinct candidates, most frequent first.</returns>
        public static List<string> GetSuggestWords(string word)
        {
            List<string> result = GetEdits1(word).Where(w => Dic.ContainsKey(w)).ToList();
            if (result.Count == 0)
            {
                result = GetEdits2(word);
                if (result.Count == 0)
                {
                    result.Add(word);
                }
            }

            // Rank by prior probability (corpus frequency). The fallback word is not in
            // the dictionary and is given a weight of 1. Take(5) copes with shorter
            // lists by itself.
            return result
                .OrderByDescending(w =>
                {
                    int frequency;
                    return Dic.TryGetValue(w, out frequency) ? frequency : 1;
                })
                .Take(5)
                .ToList();
        }

        /// <summary>
        /// Orders words by ascending dictionary frequency; a word that is not in the
        /// dictionary is treated as having frequency 1.
        /// </summary>
        class WordCompare : IComparer<string>
        {
            public int Compare(string x, string y)
            {
                int hash1;
                if (!Dic.TryGetValue(x, out hash1))
                {
                    hash1 = 1;
                }

                int hash2;
                if (!Dic.TryGetValue(y, out hash2))
                {
                    hash2 = 1;
                }

                return hash1.CompareTo(hash2);
            }
        }
    }
}