原文:https://blog.csdn.net/u011966339/article/details/72832197
1.DFA算法簡介
DFA全稱為:Deterministic Finite Automaton,即確定有窮自動機。其特征為:有一個有限狀態集合和一些從一個狀態通向另一個狀態的邊,每條邊上標記有一個符號,其中一個狀態是初態,某些狀態是終態。但不同於不確定的有限自動機,DFA中不會有從同一狀態出發的兩條邊標志有相同的符號。
簡單點說就是,它是通過event和當前的state得到下一個state,即event+state=nextstate。理解為系統中有多個節點,通過傳遞進入的event,來確定走哪個路由至另一個節點,而節點是有限的。
2.實現代碼如下:
新建一個FilterHelper.cs類,放敏感詞的過濾統一處理方法
————————————————
using System;
using System.Collections.Generic;
using System.Text;
using Microsoft.VisualBasic;

namespace ConsoleApp1
{
    #region Illegal keyword filter, beta 1.1

    /// <summary>
    /// Illegal keyword filter. Characters that are neither Chinese ideographs,
    /// digits nor ASCII letters are skipped when they appear between the
    /// characters of a dictionary word, so "a-b c" is detected as "abc".
    /// </summary>
    /// <remarks>
    /// Both the dictionary words and the text are normalised with <c>ToDBC</c>
    /// (full-width to half-width, lower case) before they are compared.
    /// </remarks>
    public class FilterHelper
    {
        private string dictionaryPath = string.Empty;
        private string sourctText = string.Empty;

        /// <summary>
        /// In-memory dictionary: first character of a word -> the remainders
        /// ("tails") of every word starting with that character.
        /// </summary>
        private readonly Dictionary<char, WordGroup> lexicon = new Dictionary<char, WordGroup>();

        private readonly List<string> illegalWords = new List<string>();

        public FilterHelper() { }

        /// <param name="dictionaryPath">Path of the dictionary file (one word per line).</param>
        public FilterHelper(string dictionaryPath)
        {
            this.dictionaryPath = dictionaryPath;
        }

        /// <summary>
        /// Path of the dictionary file.
        /// </summary>
        public string DictionaryPath
        {
            get { return dictionaryPath; }
            set { dictionaryPath = value; }
        }

        /// <summary>
        /// Text to be checked.
        /// </summary>
        public string SourctText
        {
            get { return sourctText; }
            set { sourctText = value; }
        }

        /// <summary>
        /// Illegal words found by the most recent <see cref="Filter"/> call, as they
        /// appear in the original text (including any skipped separator characters).
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>
        /// Tells whether the character is a Chinese ideograph (range 4E00-9FA5).
        /// </summary>
        private static bool IsChinese(char character)
        {
            return character >= 0x4e00 && character <= 0x9fa5;
        }

        /// <summary>
        /// Tells whether the character is an ASCII digit.
        /// </summary>
        private static bool IsDigit(char character)
        {
            return character >= '0' && character <= '9';
        }

        /// <summary>
        /// Tells whether the character is an ASCII letter.
        /// </summary>
        private static bool IsLetter(char character)
        {
            return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
        }

        /// <summary>
        /// Tells whether the character may be skipped between two characters of a word,
        /// i.e. it is neither a Chinese ideograph, a digit nor a letter.
        /// </summary>
        private static bool IsIgnorable(char character)
        {
            return !IsChinese(character) && !IsDigit(character) && !IsLetter(character);
        }

        /// <summary>
        /// Converts a string to half-width (DBC case), lower-case form.
        /// </summary>
        /// <param name="input">Any non-null string.</param>
        /// <returns>The normalised string; it always has the same length as <paramref name="input"/>.</returns>
        /// <remarks>
        /// The full-width space is 12288 and the half-width space is 32. Every other
        /// half-width character (33-126) differs from its full-width counterpart
        /// (65281-65374) by 65248.
        /// </remarks>
        private static string ToDBC(string input)
        {
            char[] chars = input.ToCharArray();
            for (int i = 0; i < chars.Length; i++)
            {
                char c = chars[i];
                if (c == 12288)
                {
                    c = (char)32;
                }
                else if (c > 65280 && c < 65375)
                {
                    c = (char)(c - 65248);
                }

                // Converting char by char keeps positions aligned with the original
                // text, and the invariant culture keeps matching independent of the
                // current locale.
                chars[i] = char.ToLowerInvariant(c);
            }

            return new string(chars);
        }

        /// <summary>
        /// Loads the dictionary file into memory. Does nothing (and keeps whatever is
        /// already loaded) when no dictionary path is set.
        /// </summary>
        /// <remarks>
        /// The file is re-read on every call so that edits made on disk are picked up.
        /// I/O errors (for example a missing file) propagate to the caller.
        /// </remarks>
        private void LoadDictionary()
        {
            if (string.IsNullOrEmpty(dictionaryPath))
            {
                return;
            }

            // Read before clearing so a failed read leaves the previous dictionary intact.
            string[] lines = System.IO.File.ReadAllLines(dictionaryPath, Encoding.Default);

            lexicon.Clear();
            HashSet<string> seen = new HashSet<string>();
            foreach (string line in lines)
            {
                // Words could additionally be converted to Simplified Chinese with
                // Strings.StrConv(word, VbStrConv.SimplifiedChinese, 0); that requires
                // Microsoft.VisualBasic and is not supported on every system.
                string word = ToDBC(line).Trim();

                // Skip blank lines and duplicates. A whitespace-only line must not
                // turn the space character into an illegal word.
                if (word.Length == 0 || !seen.Add(word))
                {
                    continue;
                }

                WordGroup group;
                if (!lexicon.TryGetValue(word[0], out group))
                {
                    group = new WordGroup();
                    lexicon.Add(word[0], group);
                }

                group.Add(word.Substring(1));
            }
        }

        /// <summary>
        /// Tries to match the remainder of a dictionary word against the text.
        /// </summary>
        /// <param name="text">Normalised text.</param>
        /// <param name="start">Index right after the already matched first character.</param>
        /// <param name="tail">Dictionary word without its first character; may be empty.</param>
        /// <returns>
        /// The index right after the last matched character, or -1 when the word
        /// does not match completely.
        /// </returns>
        private static int MatchTail(string text, int start, string tail)
        {
            int pos = start;
            foreach (char expected in tail)
            {
                // Skip separators, but never skip a character that is itself the expected one.
                while (pos < text.Length && text[pos] != expected && IsIgnorable(text[pos]))
                {
                    pos++;
                }

                // Running out of text in the middle of a word is a miss, not a hit.
                if (pos >= text.Length || text[pos] != expected)
                {
                    return -1;
                }

                pos++;
            }

            return pos;
        }

        /// <summary>
        /// Finds every dictionary word in <see cref="SourctText"/> and masks it.
        /// </summary>
        /// <param name="replaceChar">Character written over every character of a match.</param>
        /// <returns>
        /// The text with each match (including separators skipped inside it) replaced by
        /// <paramref name="replaceChar"/>, or an empty string when the text is null or empty.
        /// </returns>
        /// <remarks>
        /// <see cref="IllegalWords"/> is reset on every call. When several dictionary words
        /// match at the same position, the longest match is used.
        /// </remarks>
        public string Filter(char replaceChar)
        {
            illegalWords.Clear();
            LoadDictionary();

            if (string.IsNullOrEmpty(sourctText))
            {
                return string.Empty;
            }

            // Normalise once; the result has the same length as the original text, so
            // indexes can be used on both.
            string normalized = ToDBC(sourctText);
            char[] output = sourctText.ToCharArray();

            int index = 0;
            while (index < normalized.Length)
            {
                int matchEnd = -1;
                WordGroup group;
                if (lexicon.TryGetValue(normalized[index], out group))
                {
                    for (int w = 0; w < group.Count(); w++)
                    {
                        int end = MatchTail(normalized, index + 1, group.GetWord(w));
                        if (end > matchEnd)
                        {
                            matchEnd = end;
                        }
                    }
                }

                if (matchEnd < 0)
                {
                    index++;
                    continue;
                }

                illegalWords.Add(sourctText.Substring(index, matchEnd - index));
                for (int k = index; k < matchEnd; k++)
                {
                    output[k] = replaceChar;
                }

                // Resume right after the match so matches never overlap.
                index = matchEnd;
            }

            return new string(output);
        }
    }

    /// <summary>
    /// Collection of words that share the same first character.
    /// </summary>
    class WordGroup
    {
        /// <summary>
        /// The stored words.
        /// </summary>
        private readonly List<string> groupList;

        public WordGroup()
        {
            groupList = new List<string>();
        }

        /// <summary>
        /// Adds a word.
        /// </summary>
        /// <param name="word">Word to store.</param>
        public void Add(string word)
        {
            groupList.Add(word);
        }

        /// <summary>
        /// Gets the number of stored words.
        /// </summary>
        /// <returns>The word count.</returns>
        public int Count()
        {
            return groupList.Count;
        }

        /// <summary>
        /// Gets a word by index.
        /// </summary>
        /// <param name="index">Zero-based index.</param>
        /// <returns>The word at <paramref name="index"/>.</returns>
        public string GetWord(int index)
        {
            return groupList[index];
        }
    }

    #endregion
}
使用:
/// <summary>
/// Console demo (written for .NET Core): filters a sample sentence and prints
/// the detected words followed by the masked text.
/// </summary>
static void Main(string[] args)
{
    // Strips the Windows Debug output suffix from the current directory, presumably so
    // the word list is looked up in the project folder; on any other layout the
    // directory is left untouched.
    string path = System.IO.Directory.GetCurrentDirectory().Replace("\\bin\\Debug\\netcoreapp3.1", "");

    // Text file holding the sensitive words.
    FilterHelper filter = new FilterHelper(System.IO.Path.Combine(path, "暴恐詞庫.txt"));
    filter.SourctText = "你個大推背";

    // Mask every sensitive word with '*'.
    string resultStr = filter.Filter('*');

    // IllegalWords holds every sensitive word that was found.
    foreach (string s in filter.IllegalWords)
    {
        Console.WriteLine(s);
    }

    Console.WriteLine(resultStr);
}
另附Demo的敏感詞下載:https://github.com/chason777777/mgck/archive/master.zip