C#敏感詞過濾算法實現

本文轉載自查看原文 2020-03-17 15:48 1272 敏感詞過濾

原文：https://blog.csdn.net/u011966339/article/details/72832197

1.DFA算法簡介
DFA全稱為：Deterministic Finite Automaton,即確定有窮自動機。其特征為：有一個有限狀態集合和一些從一個狀態通向另一個狀態的邊，每條邊上標記有一個符號，其中一個狀態是初態，某些狀態是終態。但不同於不確定的有限自動機，DFA中不會有從同一狀態出發的兩條邊標志有相同的符號。

簡單點說就是，它是通過event和當前的state得到下一個state，即event+state=nextstate。可以理解為系統中有多個節點，通過傳入的event來確定走哪條路由到達另一個節點，而節點的數量是有限的。

2.實現代碼如下:
新建一個FilterHelper.cs類,放敏感詞的過濾統一處理方法

————————————————

using Microsoft.VisualBasic;
using System;
using System.Collections.Generic;
using System.Text;

namespace ConsoleApp1
{
    #region 非法關鍵字過濾 bate 1.1
    /// <summary>
    /// Sensitive-word filter. Words are loaded from a text file (one word per line)
    /// and matched against <see cref="SourctText"/>; characters that are not CJK
    /// ideographs, ASCII digits or ASCII letters are skipped when they appear between
    /// the characters of a word, so "a-b" is still caught by the dictionary word "ab".
    /// </summary>
    /// <remarks>
    /// Both the dictionary and the inspected text are normalised with <c>ToDBC</c>
    /// (full-width to half-width, lower-cased) before comparison, so matching is
    /// case- and width-insensitive. A dictionary word that contains a skippable
    /// character after its first character can never match, because such characters
    /// are skipped in the text before each comparison.
    /// </remarks>
    public class FilterHelper
    {

        public FilterHelper() { }

        /// <summary>
        /// Creates a filter that reads its word list from <paramref name="dictionaryPath"/>.
        /// </summary>
        /// <param name="dictionaryPath">Path of the dictionary text file.</param>
        public FilterHelper(string dictionaryPath)
        {
            this.dictionaryPath = dictionaryPath;
        }

        private string dictionaryPath = string.Empty;
        /// <summary>
        /// Path of the dictionary file. The file is re-read on every <see cref="Filter"/> call.
        /// </summary>
        public string DictionaryPath
        {
            get { return dictionaryPath; }
            set { dictionaryPath = value; }
        }
        /// <summary>
        /// In-memory dictionary, indexed by the first character of each word.
        /// Sized char.MaxValue + 1 so that every possible char value is a valid index.
        /// </summary>
        private readonly WordGroup[] MEMORYLEXICON = new WordGroup[char.MaxValue + 1];

        private string sourctText = string.Empty;
        /// <summary>
        /// Text to inspect.
        /// </summary>
        public string SourctText
        {
            get { return sourctText; }
            set { sourctText = value; }
        }

        private readonly List<string> illegalWords = new List<string>();

        /// <summary>
        /// Matches found by the most recent <see cref="Filter"/> call, in order of
        /// appearance. Each entry is the original (un-normalised) text span that was
        /// masked, including any skipped characters inside it.
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>
        /// Returns true for characters in the CJK ideograph range U+4E00..U+9FA5.
        /// </summary>
        private static bool isCHS(char character)
        {
            return character >= 0x4e00 && character <= 0x9fa5;
        }

        /// <summary>
        /// Returns true for the ASCII digits '0'..'9'.
        /// </summary>
        private static bool isNum(char character)
        {
            return character >= '0' && character <= '9';
        }

        /// <summary>
        /// Returns true for the ASCII letters 'a'..'z' and 'A'..'Z'.
        /// </summary>
        private static bool isAlphabet(char character)
        {
            return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
        }

        /// <summary>
        /// Returns true for characters that may be skipped between the characters of a
        /// word, i.e. anything that is not a CJK ideograph, ASCII digit or ASCII letter.
        /// </summary>
        private static bool isIgnorable(char character)
        {
            return !isCHS(character) && !isNum(character) && !isAlphabet(character);
        }

        /// <summary>
        /// Converts full-width characters to half-width and lower-cases the result.
        /// </summary>
        /// <param name="input">Any non-null string.</param>
        /// <returns>The normalised string.</returns>
        /// <remarks>
        /// The full-width space is 12288 and the half-width space is 32; every other
        /// full-width character (65281-65374) is its half-width counterpart (33-126)
        /// plus 65248.
        /// </remarks>
        private static string ToDBC(string input)
        {
            char[] c = input.ToCharArray();
            for (int i = 0; i < c.Length; i++)
            {
                if (c[i] == 12288)
                {
                    c[i] = (char)32;
                    continue;
                }
                if (c[i] > 65280 && c[i] < 65375)
                {
                    c[i] = (char)(c[i] - 65248);
                }
            }
            // Invariant casing keeps the result independent of the current culture
            // (e.g. the Turkish dotless i), so the same dictionary matches everywhere.
            return new string(c).ToLowerInvariant();
        }

        /// <summary>
        /// Rebuilds the in-memory dictionary from <see cref="DictionaryPath"/>.
        /// Does nothing (and keeps the previous dictionary) when no path is set.
        /// </summary>
        private void LoadDictionary()
        {
            if (string.IsNullOrEmpty(DictionaryPath))
            {
                return;
            }

            // Read first so a failed read does not leave an emptied dictionary behind.
            string[] lines = System.IO.File.ReadAllLines(DictionaryPath, System.Text.Encoding.Default);
            Array.Clear(MEMORYLEXICON, 0, MEMORYLEXICON.Length);

            // Ordinal set: duplicates are detected with the same comparison that
            // matching uses, regardless of culture-specific sort order.
            HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
            foreach (string line in lines)
            {
                // Trim so stray whitespace around a word cannot make it unmatchable.
                string key = ToDBC(line).Trim();
                if (key.Length == 0 || !seen.Add(key))
                {
                    continue;
                }

                WordGroup group = MEMORYLEXICON[key[0]];
                if (group == null)
                {
                    group = new WordGroup();
                    MEMORYLEXICON[key[0]] = group;
                }
                // The first character is implied by the slot, so only the rest is stored.
                group.Add(key.Substring(1));
            }
        }

        /// <summary>
        /// Tries to match a dictionary word whose first character is already known to
        /// equal <c>text[start]</c>.
        /// </summary>
        /// <param name="text">Normalised text being inspected.</param>
        /// <param name="start">Index of the word's first character in <paramref name="text"/>.</param>
        /// <param name="rest">The word without its first character (may be empty).</param>
        /// <returns>
        /// The index just past the last matched character, or -1 when the word does not
        /// match completely.
        /// </returns>
        private static int MatchEnd(string text, int start, string rest)
        {
            int pos = start + 1;
            for (int i = 0; i < rest.Length; i++)
            {
                // Skip separators placed between the characters of the word.
                while (pos < text.Length && isIgnorable(text[pos]))
                {
                    pos++;
                }
                // Running out of text before the word is complete is a miss, not a hit.
                if (pos >= text.Length || text[pos] != rest[i])
                {
                    return -1;
                }
                pos++;
            }
            return pos;
        }

        /// <summary>
        /// Finds every dictionary word in <see cref="SourctText"/> and masks it.
        /// </summary>
        /// <param name="replaceChar">Character written over each character of a match.</param>
        /// <returns>
        /// The text with every matched span (including skipped characters inside it)
        /// replaced by <paramref name="replaceChar"/>; an empty string when
        /// <see cref="SourctText"/> is null or empty.
        /// </returns>
        /// <remarks>
        /// When several words match at the same position the longest span wins.
        /// <see cref="IllegalWords"/> is cleared at the start of every call.
        /// </remarks>
        public string Filter(char replaceChar)
        {
            LoadDictionary();
            illegalWords.Clear();
            if (string.IsNullOrEmpty(sourctText))
            {
                return string.Empty;
            }

            // Normalise once; ToDBC maps character by character, so indexes into the
            // normalised text and the original text line up.
            string normalized = ToDBC(sourctText);
            char[] masked = sourctText.ToCharArray();

            int index = 0;
            while (index < normalized.Length)
            {
                int end = -1;
                WordGroup group = MEMORYLEXICON[normalized[index]];
                if (group != null)
                {
                    for (int z = 0; z < group.Count(); z++)
                    {
                        int candidate = MatchEnd(normalized, index, group.GetWord(z));
                        if (candidate > end)
                        {
                            end = candidate;
                        }
                    }
                }

                if (end < 0)
                {
                    index++;
                    continue;
                }

                // Report the span as it appeared in the original text.
                illegalWords.Add(sourctText.Substring(index, end - index));
                for (int pos = index; pos < end; pos++)
                {
                    masked[pos] = replaceChar;
                }
                index = end;
            }
            return new string(masked);
        }
    }
    /// <summary>
    /// A group of dictionary words that share the same first character.
    /// Words are kept in the order they were added.
    /// </summary>
    class WordGroup
    {
        /// <summary>
        /// Backing list holding the words of this group.
        /// </summary>
        private readonly List<string> groupList = new List<string>();

        public WordGroup()
        {
        }

        /// <summary>
        /// Appends a word to the group.
        /// </summary>
        /// <param name="word">The word to store.</param>
        public void Add(string word) => groupList.Add(word);

        /// <summary>
        /// Number of words currently stored in the group.
        /// </summary>
        /// <returns>The word count.</returns>
        public int Count() => groupList.Count;

        /// <summary>
        /// Returns the word stored at the given position.
        /// </summary>
        /// <param name="index">Zero-based position of the word.</param>
        /// <returns>The word at <paramref name="index"/>.</returns>
        public string GetWord(int index) => groupList[index];
    }
 
    #endregion
}

使用：

  static void Main(string[] args)
        {
            // Demo for a .NET Core console application: remove the build-output
            // suffix from the working directory so the dictionary file is looked
            // up next to the project instead of inside bin\Debug.
            string projectDir = Directory.GetCurrentDirectory().Replace("\\bin\\Debug\\netcoreapp3.1","");
            string dictionaryFile = projectDir+"/暴恐詞庫.txt";   // text file holding the sensitive words

            FilterHelper filter = new FilterHelper(dictionaryFile)
            {
                SourctText = "你個大推背"
            };

            // Mask every sensitive word with '*'.
            string maskedText = filter.Filter('*');

            // IllegalWords exposes the sensitive words collected by the filter.
            var hits = filter.IllegalWords;
            for (int i = 0; i < hits.Count; i++)
            {
                Console.WriteLine(hits[i]);
            }

            Console.WriteLine(maskedText);
        }

另附Demo的敏感詞下載：https://github.com/chason777777/mgck/archive/master.zip

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 C# 實現敏感詞過濾 java實現敏感詞過濾（DFA算法）基於DFA算法實現的敏感詞過濾 Java實現敏感詞過濾 - DFA算法 Java實現敏感詞過濾 - DFA算法敏感詞過濾算法淺析敏感詞過濾算法(C++) Java實現敏感詞過濾 - DFA算法 DFA敏感詞過濾實現 PHP實現敏感詞過濾