超強敏感詞過濾算法第二版 可以忽略大小寫、全半角、簡繁體、特殊符號、HTML標簽干擾


 

上一篇 發一個高性能的敏感詞過濾算法 可以忽略大小寫、全半角、簡繁體、特殊符號干擾

改進主要有幾點:

  1.   用BitArray取代Dictionary,以空間換時間,性能進一步提升;大概會增加「6k × 詞庫字符數」的內存開銷,按1000個詞、平均長度為4計算,約30MB左右。
  2.   增加防HTML標籤干擾的過濾,特殊符號的範圍突破ASCII。
  3.   添加一個極限優化的unsafe類FilterKeyWordsFast 用指針取代原有的char[]數組 性能大概提高2-3倍左右。

目測已優化到極致了。

歡迎做各種對比測試。

 

    /// <summary>
    /// 敏感詞過濾 已忽略大小寫 全半角 簡繁體差異 特殊符號  html標簽 干擾
    /// </summary>
    public static class FilterKeyWords
    {

        private static readonly object LockObj = new object();
        private static FilterKeyWordsNode _root;
        private const string TraditionalChinese = "皚藹礙愛翺襖奧壩罷擺敗頒辦絆幫綁鎊謗剝飽寶報鮑輩貝鋇狽備憊繃筆畢斃閉邊編貶變辯辮鼈癟瀕濱賓擯餅撥缽鉑駁蔔補參蠶殘慚慘燦蒼艙倉滄廁側冊測層詫攙摻蟬饞讒纏鏟産闡顫場嘗長償腸廠暢鈔車徹塵陳襯撐稱懲誠騁癡遲馳恥齒熾沖蟲寵疇躊籌綢醜櫥廚鋤雛礎儲觸處傳瘡闖創錘純綽辭詞賜聰蔥囪從叢湊竄錯達帶貸擔單鄲撣膽憚誕彈當擋黨蕩檔搗島禱導盜燈鄧敵滌遞締點墊電澱釣調疊諜疊釘頂錠訂東動棟凍鬥犢獨讀賭鍍鍛斷緞兌隊對噸頓鈍奪鵝額訛惡餓兒爾餌貳發罰閥琺礬釩煩範販飯訪紡飛廢費紛墳奮憤糞豐楓鋒風瘋馮縫諷鳳膚輻撫輔賦複負訃婦縛該鈣蓋幹趕稈贛岡剛鋼綱崗臯鎬擱鴿閣鉻個給龔宮鞏貢鈎溝構購夠蠱顧剮關觀館慣貫廣規矽歸龜閨軌詭櫃貴劊輥滾鍋國過駭韓漢閡鶴賀橫轟鴻紅後壺護滬戶嘩華畫劃話懷壞歡環還緩換喚瘓煥渙黃謊揮輝毀賄穢會燴彙諱誨繪葷渾夥獲貨禍擊機積饑譏雞績緝極輯級擠幾薊劑濟計記際繼紀夾莢頰賈鉀價駕殲監堅箋間艱緘繭檢堿鹼揀撿簡儉減薦檻鑒踐賤見鍵艦劍餞漸濺澗漿蔣槳獎講醬膠澆驕嬌攪鉸矯僥腳餃繳絞轎較稭階節莖驚經頸靜鏡徑痙競淨糾廄舊駒舉據鋸懼劇鵑絹傑潔結誡屆緊錦僅謹進晉燼盡勁荊覺決訣絕鈞軍駿開凱顆殼課墾懇摳庫褲誇塊儈寬礦曠況虧巋窺饋潰擴闊蠟臘萊來賴藍欄攔籃闌蘭瀾讕攬覽懶纜爛濫撈勞澇樂鐳壘類淚籬離裏鯉禮麗厲勵礫曆瀝隸倆聯蓮連鐮憐漣簾斂臉鏈戀煉練糧涼兩輛諒療遼鐐獵臨鄰鱗凜賃齡鈴淩靈嶺領餾劉龍聾嚨籠壟攏隴樓婁摟簍蘆盧顱廬爐擄鹵虜魯賂祿錄陸驢呂鋁侶屢縷慮濾綠巒攣孿灤亂掄輪倫侖淪綸論蘿羅邏鑼籮騾駱絡媽瑪碼螞馬罵嗎買麥賣邁脈瞞饅蠻滿謾貓錨鉚貿麽黴沒鎂門悶們錳夢謎彌覓綿緬廟滅憫閩鳴銘謬謀畝鈉納難撓腦惱鬧餒膩攆撚釀鳥聶齧鑷鎳檸獰甯擰濘鈕紐膿濃農瘧諾歐鷗毆嘔漚盤龐賠噴鵬騙飄頻貧蘋憑評潑頗撲鋪樸譜臍齊騎豈啓氣棄訖牽扡釺鉛遷簽謙錢鉗潛淺譴塹槍嗆牆薔強搶鍬橋喬僑翹竅竊欽親輕氫傾頃請慶瓊窮趨區軀驅齲顴權勸卻鵲讓饒擾繞熱韌認紉榮絨軟銳閏潤灑薩鰓賽傘喪騷掃澀殺紗篩曬閃陝贍繕傷賞燒紹賒攝懾設紳審嬸腎滲聲繩勝聖師獅濕詩屍時蝕實識駛勢釋飾視試壽獸樞輸書贖屬術樹豎數帥雙誰稅順說碩爍絲飼聳慫頌訟誦擻蘇訴肅雖綏歲孫損筍縮瑣鎖獺撻擡攤貪癱灘壇譚談歎湯燙濤縧騰謄銻題體屜條貼鐵廳聽烴銅統頭圖塗團頹蛻脫鴕馱駝橢窪襪彎灣頑萬網韋違圍爲濰維葦偉僞緯謂衛溫聞紋穩問甕撾蝸渦窩嗚鎢烏誣無蕪吳塢霧務誤錫犧襲習銑戲細蝦轄峽俠狹廈鍁鮮纖鹹賢銜閑顯險現獻縣餡羨憲線廂鑲鄉詳響項蕭銷曉嘯蠍協挾攜脅諧寫瀉謝鋅釁興洶鏽繡虛噓須許緒續軒懸選癬絢學勳詢尋馴訓訊遜壓鴉鴨啞亞訝閹煙鹽嚴顔閻豔厭硯彥諺驗鴦楊揚瘍陽癢養樣瑤搖堯遙窯謠藥爺頁業葉醫銥頤遺儀彜蟻藝億憶義詣議誼譯異繹蔭陰銀飲櫻嬰鷹應纓瑩螢營熒蠅穎喲擁傭癰踴詠湧優憂郵鈾猶遊誘輿魚漁娛與嶼語籲禦獄譽預馭鴛淵轅園員圓緣遠願約躍鑰嶽粵悅閱雲鄖勻隕運蘊醞暈韻雜災載攢暫贊贓髒鑿棗竈責擇則澤賊贈紮劄軋鍘閘詐齋債氈盞斬輾嶄棧戰綻張漲帳賬脹趙蟄轍鍺這貞針偵診鎮陣掙睜猙幀鄭證織職執紙摯擲幟質鍾終種腫衆謅軸皺晝驟豬諸誅燭矚囑貯鑄築駐專磚轉賺樁莊裝妝壯狀錐贅墜綴諄濁茲資漬蹤綜總縱鄒詛組鑽緻鐘麼為隻兇準啟闆裡靂餘鍊洩";
        private const string SimplifiedChinese = "皚藹礙愛翱襖奧壩罷擺敗頒辦絆幫綁鎊謗剝飽寶報鮑輩貝鋇狽備憊綳筆畢斃閉邊編貶變辯辮鱉癟瀕濱賓擯餅撥缽鉑駁卜補參蠶殘慚慘燦蒼艙倉滄廁側冊測層詫攙摻蟬饞讒纏鏟產闡顫場嘗長償腸廠暢鈔車徹塵陳襯撐稱懲誠騁痴遲馳恥齒熾沖蟲寵疇躊籌綢丑櫥廚鋤雛礎儲觸處傳瘡闖創錘純綽辭詞賜聰蔥囪從叢湊竄錯達帶貸擔單鄲撣膽憚誕彈當擋黨盪檔搗島禱導盜燈鄧敵滌遞締點墊電淀釣調迭諜疊釘頂錠訂東動棟凍斗犢獨讀賭鍍鍛斷緞兌隊對噸頓鈍奪鵝額訛惡餓兒爾餌貳發罰閥琺礬釩煩范販飯訪紡飛廢費紛墳奮憤糞豐楓鋒風瘋馮縫諷鳳膚輻撫輔賦復負訃婦縛該鈣蓋干趕稈贛岡剛鋼綱崗皋鎬擱鴿閣鉻個給龔宮鞏貢鈎溝構購夠蠱顧剮關觀館慣貫廣規硅歸龜閨軌詭櫃貴劊輥滾鍋國過駭韓漢閡鶴賀橫轟鴻紅后壺護滬戶嘩華畫划話懷壞歡環還緩換喚瘓煥渙黃謊揮輝毀賄穢會燴匯諱誨繪葷渾伙獲貨禍擊機積飢譏雞績緝極輯級擠幾薊劑濟計記際繼紀夾莢頰賈鉀價駕殲監堅箋間艱緘繭檢鹼礆揀撿簡儉減薦檻鑒踐賤見鍵艦劍餞漸濺澗漿蔣槳獎講醬膠澆驕嬌攪鉸矯僥腳餃繳絞轎較秸階節莖驚經頸靜鏡徑痙競凈糾廄舊駒舉據鋸懼劇鵑絹傑潔結誡屆緊錦僅謹進晉燼盡勁荊覺決訣絕鈞軍駿開凱顆殼課墾懇摳庫褲誇塊儈寬礦曠況虧巋窺饋潰擴闊蠟臘萊來賴藍欄攔籃闌蘭瀾讕攬覽懶纜爛濫撈勞澇樂鐳壘類淚籬離里鯉禮麗厲勵礫歷瀝隸倆聯蓮連鐮憐漣簾斂臉鏈戀煉練糧涼兩輛諒療遼鐐獵臨鄰鱗凜賃齡鈴凌靈嶺領餾劉龍聾嚨籠壟攏隴樓婁摟簍蘆盧顱廬爐擄鹵虜魯賂祿錄陸驢呂鋁侶屢縷慮濾綠巒攣孿灤亂掄輪倫侖淪綸論蘿羅邏鑼籮騾駱絡媽瑪碼螞馬罵嗎買麥賣邁脈瞞饅蠻滿謾貓錨鉚貿么霉沒鎂門悶們錳夢謎彌覓綿緬廟滅憫閩鳴銘謬謀畝鈉納難撓腦惱鬧餒膩攆捻釀鳥聶嚙鑷鎳檸獰寧擰濘鈕紐膿濃農瘧諾歐鷗毆嘔漚盤龐賠噴鵬騙飄頻貧蘋憑評潑頗撲鋪朴譜臍齊騎豈啟氣棄訖牽扦釺鉛遷簽謙錢鉗潛淺譴塹槍嗆牆薔強搶鍬橋喬僑翹竅竊欽親輕氫傾頃請慶瓊窮趨區軀驅齲顴權勸卻鵲讓饒擾繞熱韌認紉榮絨軟銳閏潤灑薩鰓賽傘喪騷掃澀殺紗篩曬閃陝贍繕傷賞燒紹賒攝懾設紳審嬸腎滲聲繩勝聖師獅濕詩屍時蝕實識駛勢釋飾視試壽獸樞輸書贖屬術樹豎數帥雙誰稅順說碩爍絲飼聳慫頌訟誦擻蘇訴肅雖綏歲孫損筍縮瑣鎖獺撻抬攤貪癱灘壇譚談嘆湯燙濤絛騰謄銻題體屜條貼鐵廳聽烴銅統頭圖塗團頹蛻脫鴕馱駝橢窪襪彎灣頑萬網韋違圍為濰維葦偉偽緯謂衛溫聞紋穩問瓮撾蝸渦窩嗚鎢烏誣無蕪吳塢霧務誤錫犧襲習銑戲細蝦轄峽俠狹廈杴鮮纖咸賢銜閑顯險現獻縣餡羡憲線廂鑲鄉詳響項蕭銷曉嘯蠍協挾攜脅諧寫瀉謝鋅釁興洶銹綉虛噓須許緒續軒懸選癬絢學勛詢尋馴訓訊遜壓鴉鴨啞亞訝閹煙鹽嚴顏閻艷厭硯彥諺驗鴦楊揚瘍陽癢養樣瑤搖堯遙窯謠葯爺頁業葉醫銥頤遺儀彝蟻藝億憶義詣議誼譯異繹蔭陰銀飲櫻嬰鷹應纓瑩螢營熒蠅穎喲擁佣癰踴詠涌優憂郵鈾猶游誘輿魚漁娛與嶼語吁御獄譽預馭鴛淵轅園員圓緣遠願約躍鑰岳粵悅閱雲鄖勻隕運蘊醞暈韻雜災載攢暫贊贓臟鑿棗灶責擇則澤賊贈扎札軋鍘閘詐齋債氈盞斬輾嶄棧戰綻張漲帳賬脹趙蟄轍鍺這貞針偵診鎮陣掙睜猙幀鄭證織職執紙摯擲幟質鍾終種腫眾謅軸皺晝驟豬諸誅燭矚囑貯鑄築駐專磚轉賺樁庄裝妝壯狀錐贅墜綴諄濁茲資漬蹤綜總縱鄒詛組鑽致鍾么為只凶准啟板里靂余鏈泄";
        /// <summary>
        /// Character translation table built by pairing <c>TraditionalChinese</c> and
        /// <c>SimplifiedChinese</c> position by position.
        /// </summary>
        private static readonly Dictionary<char, char> TranslationChinese = BuildTranslationTable();

        /// <summary>
        /// Builds the translation table without failing on imperfect source data.
        /// </summary>
        /// <remarks>
        /// The traditional list contains at least one character twice ('疊'). Building the table
        /// with ToDictionary would throw ArgumentException on that repeated key inside the static
        /// initializer and make the whole class unusable, so entries are written through the
        /// indexer instead: when a character is listed more than once, the last pairing wins.
        /// Only positions present in both lists are paired, so a length mismatch cannot cause an
        /// out-of-range read.
        /// </remarks>
        /// <returns>A dictionary mapping each listed source character to its replacement.</returns>
        private static Dictionary<char, char> BuildTranslationTable()
        {
            var pairCount = TraditionalChinese.Length < SimplifiedChinese.Length
                ? TraditionalChinese.Length
                : SimplifiedChinese.Length;

            var table = new Dictionary<char, char>(pairCount);
            for (var i = 0; i < pairCount; i++)
            {
                table[TraditionalChinese[i]] = SimplifiedChinese[i];
            }
            return table;
        }
        private static string SkipList = " \t\r\n~!@#$%^&*()_+-=【】、{}|;':\",。、《》?αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ。,、;:?!…—·ˉ¨‘’“”々~‖∶"'`|〃〔〕〈〉《》「」『』.〖〗【】()[]{}ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑⒒⒓⒔⒕⒖⒗⒘⒙⒚⒛㈠㈡㈢㈣㈤㈥㈦㈧㈨㈩①②③④⑤⑥⑦⑧⑨⑩⑴⑵⑶⑷⑸⑹⑺⑻⑼⑽⑾⑿⒀⒁⒂⒃⒄⒅⒆⒇≈≡≠=≤≥<>≮≯∷±+-×÷/∫∮∝∞∧∨∑∏∪∩∈∵∴⊥∥∠⌒⊙≌∽√§№☆★○●◎◇◆□℃‰€■△▲※→←↑↓〓¤°#&@\︿_ ̄―♂♀┌┍┎┐┑┒┓─┄┈├┝┞┟┠┡┢┣│┆┊┬┭┮┯┰┱┲┳┼┽┾┿╀╁╂╃└┕┖┗┘┙┚┛━┅┉┤┥┦┧┨┩┪┫┃┇┋┴┵┶┷┸┹┺┻╋╊╉╈╇╆╅╄";
        private static BitArray SkipBitArray = new BitArray(char.MaxValue);
        /// <summary>
        /// Builds the keyword tree and the ignorable-character table. Must be called once before
        /// <c>Find</c> or <c>Replace</c>; once initialisation has completed, later calls return
        /// without doing anything.
        /// </summary>
        /// <param name="keyWords">Sensitive words to detect. Null or empty entries are ignored.</param>
        /// <exception cref="ArgumentNullException"><paramref name="keyWords"/> is null.</exception>
        public static void Init(string[] keyWords)
        {
            if (_root != null) return;
            if (keyWords == null) throw new ArgumentNullException("keyWords");

            lock (LockObj)
            {
                // Another caller may have completed initialisation while this one waited for the lock.
                if (_root != null) return;

                /* Register the ignorable characters in their normalised form. */
                foreach (var c in MapChar(SkipList))
                {
                    if (c < SkipBitArray.Length)
                        SkipBitArray[c] = true;
                }

                // The tree is assembled on a local root and only assigned to _root when complete,
                // so Find can never observe a half-built tree.
                var root = CreateNode(null);
                foreach (var keyWord in keyWords)
                {
                    if (string.IsNullOrEmpty(keyWord)) continue;

                    var word = new string(MapChar(keyWord));
                    var node = root;
                    for (var j = 0; j < word.Length; j++)
                    {
                        var t = word[j];
                        node.CharArray[t] = true;

                        if (node.Child == null)
                            node.Child = new Dictionary<char, FilterKeyWordsNode>();

                        FilterKeyWordsNode child;
                        if (!node.Child.TryGetValue(t, out child))
                        {
                            child = CreateNode(word.Substring(0, j + 1));
                            node.Child.Add(t, child);
                        }
                        node = child;
                    }

                    // The node reached by the last character spells a complete keyword.
                    node.IsEnd = true;
                }

                _root = root;
            }
        }

        /// <summary>
        /// Creates a tree node whose character bitmap has one slot for every possible char value.
        /// </summary>
        /// <param name="value">The normalised prefix spelled by the path to this node; null for the root.</param>
        private static FilterKeyWordsNode CreateNode(string value)
        {
            // char.MaxValue + 1 slots so that index U+FFFF is valid as well.
            return new FilterKeyWordsNode { CharArray = new BitArray(char.MaxValue + 1), Value = value };
        }

        /// <summary>
        /// Collects the distinct pieces of <paramref name="text"/> that matched a keyword.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <param name="keyWords">Matched substrings of <paramref name="text"/>, without repeats, in the order the position lookup yields them.</param>
        /// <returns>True when at least one match was found.</returns>
        public static bool Find(string text, out string[] keyWords)
        {
            var seen = new HashSet<string>();
            var hits = new List<string>();

            foreach (var hit in Find(text))
            {
                var word = text.Substring(hit.Key, hit.Value);
                if (seen.Add(word))
                {
                    hits.Add(word);
                }
            }

            keyWords = hits.ToArray();
            return keyWords.Length != 0;
        }

        /// <summary>
        /// Masks every matched span of <paramref name="text"/> with '*' characters.
        /// </summary>
        /// <param name="text">Text to filter. Null or empty text is returned unchanged.</param>
        /// <returns>A string of the same length in which each matched character is replaced by '*'.</returns>
        /// <exception cref="InvalidOperationException">Init has not been called.</exception>
        public static string Replace(string text)
        {
            // Find is called first so that a missing Init is reported even for null or empty text.
            var dic = Find(text);
            if (string.IsNullOrEmpty(text) || dic.Count == 0)
                return text;

            var list = text.ToCharArray();
            foreach (var i in dic)
            {
                for (var index = 0; index < i.Value; index++)
                {
                    list[index + i.Key] = '*';
                }
            }
            return new string(list);
        }

        /// <summary>
        /// Replaces every matched span with the text produced by <paramref name="replaceAction"/>.
        /// </summary>
        /// <param name="text">Text to filter. Null or empty text is returned unchanged.</param>
        /// <param name="replaceAction">Receives the matched substring, its start index and its length; returns the replacement. A null return removes the span.</param>
        /// <returns>The filtered text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="replaceAction"/> is null.</exception>
        /// <exception cref="InvalidOperationException">Init has not been called.</exception>
        public static string Replace(string text, ReplaceDelegate replaceAction)
        {
            if (replaceAction == null) throw new ArgumentNullException("replaceAction");

            var dic = Find(text);
            if (string.IsNullOrEmpty(text) || dic.Count == 0)
                return text;

            var result = new System.Text.StringBuilder(text.Length);
            var cursor = 0;

            // Dictionary enumeration order is not specified, so matches are sorted by position
            // before the output is assembled from left to right.
            foreach (var i in dic.OrderBy(p => p.Key))
            {
                var from = i.Key < cursor ? cursor : i.Key;
                var to = i.Key + i.Value;

                // A match that lies entirely inside text already replaced has nothing left to replace.
                if (to <= from) continue;

                result.Append(text, cursor, from - cursor);
                result.Append(replaceAction(text.Substring(from, to - from), from, to - from));
                cursor = to;
            }

            result.Append(text, cursor, text.Length - cursor);
            return result.ToString();
        }


        /// <summary>
        /// Locates keyword matches in <paramref name="src"/>.
        /// </summary>
        /// <remarks>
        /// The text is normalised with <c>MapChar</c> before matching. Ignorable characters and
        /// HTML-like tags between the characters of a keyword are stepped over. At each position
        /// the longest keyword wins, and scanning resumes after the match, so reported spans never
        /// overlap.
        /// </remarks>
        /// <param name="src">Text to scan; null or empty yields an empty result.</param>
        /// <returns>A dictionary mapping the start index of each match to its length, both measured in <paramref name="src"/>.</returns>
        /// <exception cref="InvalidOperationException">Init has not been called.</exception>
        public static Dictionary<int, int> Find(string src)
        {
            if (_root == null)
                throw new InvalidOperationException("未初始化");

            var findResult = new Dictionary<int, int>();
            if (string.IsNullOrEmpty(src))
                return findResult;

            var charList = MapChar(src);
            var length = charList.Length;
            var start = 0;
            while (start < length)
            {
                // Skip an HTML tag before trying to match.
                if (charList[start] == '<')
                {
                    start = SkipHtmlTag(charList, start);
                    if (start >= length)
                        break;
                }

                var matchEnd = MatchAt(charList, start);
                if (matchEnd > start)
                {
                    findResult.Add(start, matchEnd - start);
                    // Resume after the match; restarting inside it would report a
                    // one-character match a second time under the same key.
                    start = matchEnd;
                }
                else
                    start++;
            }

            return findResult;
        }

        /// <summary>
        /// Walks the keyword tree from <paramref name="start"/> and returns the index just past
        /// the longest keyword found there, or <paramref name="start"/> when nothing matches.
        /// </summary>
        private static int MatchAt(char[] chars, int start)
        {
            var node = _root;
            var pos = start;
            var matchEnd = start;

            while (pos < chars.Length && node.Child != null)
            {
                var c = chars[pos];
                FilterKeyWordsNode next;
                if (IsSet(node.CharArray, c) && node.Child.TryGetValue(c, out next))
                {
                    node = next;
                    pos++;
                    // Remember the latest complete keyword so that a longer candidate which
                    // fails later does not discard it.
                    if (node.IsEnd)
                        matchEnd = pos;
                    continue;
                }

                // Ignorable characters are stepped over only inside a partially matched keyword,
                // never before its first character and never directly after a complete keyword.
                if (pos == start || node.IsEnd || !IsSet(SkipBitArray, c))
                    break;

                pos = c == '<' ? SkipHtmlTag(chars, pos) : pos + 1;
            }

            return matchEnd;
        }

        /// <summary>
        /// Advances from the '&lt;' at <paramref name="open"/> to the closing '&gt;', to the first
        /// character above 128 (which ends the skip), or to the end of the text, and returns
        /// that index.
        /// </summary>
        private static int SkipHtmlTag(char[] chars, int open)
        {
            var pos = open + 1;
            while (pos < chars.Length && chars[pos] != '>' && chars[pos] <= 128)
                pos++;
            return pos;
        }

        /// <summary>
        /// Bounds-checked bitmap test; a character beyond the bitmap's length counts as not set.
        /// </summary>
        private static bool IsSet(BitArray bits, char c)
        {
            return c < bits.Length && bits[c];
        }

        // Lookup table indexed by character code; built on first use.
        private static char[] MapList;

        /// <summary>
        /// Normalises text for matching: characters listed in the translation table are replaced,
        /// full-width forms (U+FF01..U+FF5E) become their half-width counterparts, and 'A'..'Z'
        /// become lower case.
        /// </summary>
        /// <param name="src">Text to normalise.</param>
        /// <returns>A new array of the same length as <paramref name="src"/> holding the normalised characters.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="src"/> is null.</exception>
        public static char[] MapChar(string src)
        {
            if (src == null) throw new ArgumentNullException("src");

            var table = MapList;
            if (table == null)
            {
                // Fill the table completely before storing it in the shared field, so no caller
                // can read a partially filled table.
                table = BuildMapTable();
                MapList = table;
            }

            var charList = src.ToCharArray();
            for (int n = 0; n < charList.Length; n++)
            {
                charList[n] = table[charList[n]];
            }
            return charList;
        }

        /// <summary>
        /// Builds the normalisation table with one entry for every possible char value.
        /// </summary>
        private static char[] BuildMapTable()
        {
            // char.MaxValue + 1 entries, so that U+FFFF is a valid index too.
            var table = new char[char.MaxValue + 1];
            for (int i = 0; i < table.Length; i++)
            {
                var c = (char)i;
                /* translation-table lookup for the CJK range */
                if (c > 0x4E00 && c < 0x9FFF)
                {
                    char chinese;
                    if (TranslationChinese.TryGetValue(c, out chinese))
                        c = chinese;
                }
                else
                {
                    /* full-width => half-width */
                    if (c > 0xFF00 && c < 0xFF5F)
                        c = (char)(c - 0xFEE0);

                    /* upper case => lower case; runs after the width fold so full-width capitals are lowered too */
                    if (c > 0x40 && c < 0x5b)
                        c = (char)(c + 0x20);
                }
                table[i] = c;
            }
            return table;
        }


        /// <summary>
        /// Node of the sensitive-word tree built by Init.
        /// </summary>
        private class FilterKeyWordsNode
        {
            // Child nodes keyed by the next character; stays null until Init adds the first child.
            public Dictionary<char, FilterKeyWordsNode> Child;
            // Bitmap indexed by character code; Init sets the bit of every character that has a child here.
            public BitArray CharArray;
            // True when the prefix spelled by the path to this node is itself a keyword.
            public bool IsEnd;
            // The normalised prefix spelled by the path to this node (not set on the root).
            public string Value;
        }

        /// <summary>
        /// Callback used by the custom Replace overload to produce the replacement for one match.
        /// </summary>
        /// <param name="text">The matched string.</param>
        /// <param name="offset">Start position of the match.</param>
        /// <param name="length">Length of the matched string.</param>
        /// <returns>The replacement text.</returns>
        public delegate string ReplaceDelegate(string text, int offset, int length);


    }
FilterKeyWords

 

    /// <summary>
    /// 敏感詞過濾 已忽略大小寫 全半角 簡繁體差異 特殊符號  html標簽 干擾
    /// 指針優化版 性能提高3倍
    /// </summary>
    public static class FilterKeyWordsFast
    {

        private static readonly object LockObj = new object();
        private static FilterKeyWordsNode _root;
        private const string TraditionalChinese = "皚藹礙愛翺襖奧壩罷擺敗頒辦絆幫綁鎊謗剝飽寶報鮑輩貝鋇狽備憊繃筆畢斃閉邊編貶變辯辮鼈癟瀕濱賓擯餅撥缽鉑駁蔔補參蠶殘慚慘燦蒼艙倉滄廁側冊測層詫攙摻蟬饞讒纏鏟産闡顫場嘗長償腸廠暢鈔車徹塵陳襯撐稱懲誠騁癡遲馳恥齒熾沖蟲寵疇躊籌綢醜櫥廚鋤雛礎儲觸處傳瘡闖創錘純綽辭詞賜聰蔥囪從叢湊竄錯達帶貸擔單鄲撣膽憚誕彈當擋黨蕩檔搗島禱導盜燈鄧敵滌遞締點墊電澱釣調疊諜疊釘頂錠訂東動棟凍鬥犢獨讀賭鍍鍛斷緞兌隊對噸頓鈍奪鵝額訛惡餓兒爾餌貳發罰閥琺礬釩煩範販飯訪紡飛廢費紛墳奮憤糞豐楓鋒風瘋馮縫諷鳳膚輻撫輔賦複負訃婦縛該鈣蓋幹趕稈贛岡剛鋼綱崗臯鎬擱鴿閣鉻個給龔宮鞏貢鈎溝構購夠蠱顧剮關觀館慣貫廣規矽歸龜閨軌詭櫃貴劊輥滾鍋國過駭韓漢閡鶴賀橫轟鴻紅後壺護滬戶嘩華畫劃話懷壞歡環還緩換喚瘓煥渙黃謊揮輝毀賄穢會燴彙諱誨繪葷渾夥獲貨禍擊機積饑譏雞績緝極輯級擠幾薊劑濟計記際繼紀夾莢頰賈鉀價駕殲監堅箋間艱緘繭檢堿鹼揀撿簡儉減薦檻鑒踐賤見鍵艦劍餞漸濺澗漿蔣槳獎講醬膠澆驕嬌攪鉸矯僥腳餃繳絞轎較稭階節莖驚經頸靜鏡徑痙競淨糾廄舊駒舉據鋸懼劇鵑絹傑潔結誡屆緊錦僅謹進晉燼盡勁荊覺決訣絕鈞軍駿開凱顆殼課墾懇摳庫褲誇塊儈寬礦曠況虧巋窺饋潰擴闊蠟臘萊來賴藍欄攔籃闌蘭瀾讕攬覽懶纜爛濫撈勞澇樂鐳壘類淚籬離裏鯉禮麗厲勵礫曆瀝隸倆聯蓮連鐮憐漣簾斂臉鏈戀煉練糧涼兩輛諒療遼鐐獵臨鄰鱗凜賃齡鈴淩靈嶺領餾劉龍聾嚨籠壟攏隴樓婁摟簍蘆盧顱廬爐擄鹵虜魯賂祿錄陸驢呂鋁侶屢縷慮濾綠巒攣孿灤亂掄輪倫侖淪綸論蘿羅邏鑼籮騾駱絡媽瑪碼螞馬罵嗎買麥賣邁脈瞞饅蠻滿謾貓錨鉚貿麽黴沒鎂門悶們錳夢謎彌覓綿緬廟滅憫閩鳴銘謬謀畝鈉納難撓腦惱鬧餒膩攆撚釀鳥聶齧鑷鎳檸獰甯擰濘鈕紐膿濃農瘧諾歐鷗毆嘔漚盤龐賠噴鵬騙飄頻貧蘋憑評潑頗撲鋪樸譜臍齊騎豈啓氣棄訖牽扡釺鉛遷簽謙錢鉗潛淺譴塹槍嗆牆薔強搶鍬橋喬僑翹竅竊欽親輕氫傾頃請慶瓊窮趨區軀驅齲顴權勸卻鵲讓饒擾繞熱韌認紉榮絨軟銳閏潤灑薩鰓賽傘喪騷掃澀殺紗篩曬閃陝贍繕傷賞燒紹賒攝懾設紳審嬸腎滲聲繩勝聖師獅濕詩屍時蝕實識駛勢釋飾視試壽獸樞輸書贖屬術樹豎數帥雙誰稅順說碩爍絲飼聳慫頌訟誦擻蘇訴肅雖綏歲孫損筍縮瑣鎖獺撻擡攤貪癱灘壇譚談歎湯燙濤縧騰謄銻題體屜條貼鐵廳聽烴銅統頭圖塗團頹蛻脫鴕馱駝橢窪襪彎灣頑萬網韋違圍爲濰維葦偉僞緯謂衛溫聞紋穩問甕撾蝸渦窩嗚鎢烏誣無蕪吳塢霧務誤錫犧襲習銑戲細蝦轄峽俠狹廈鍁鮮纖鹹賢銜閑顯險現獻縣餡羨憲線廂鑲鄉詳響項蕭銷曉嘯蠍協挾攜脅諧寫瀉謝鋅釁興洶鏽繡虛噓須許緒續軒懸選癬絢學勳詢尋馴訓訊遜壓鴉鴨啞亞訝閹煙鹽嚴顔閻豔厭硯彥諺驗鴦楊揚瘍陽癢養樣瑤搖堯遙窯謠藥爺頁業葉醫銥頤遺儀彜蟻藝億憶義詣議誼譯異繹蔭陰銀飲櫻嬰鷹應纓瑩螢營熒蠅穎喲擁傭癰踴詠湧優憂郵鈾猶遊誘輿魚漁娛與嶼語籲禦獄譽預馭鴛淵轅園員圓緣遠願約躍鑰嶽粵悅閱雲鄖勻隕運蘊醞暈韻雜災載攢暫贊贓髒鑿棗竈責擇則澤賊贈紮劄軋鍘閘詐齋債氈盞斬輾嶄棧戰綻張漲帳賬脹趙蟄轍鍺這貞針偵診鎮陣掙睜猙幀鄭證織職執紙摯擲幟質鍾終種腫衆謅軸皺晝驟豬諸誅燭矚囑貯鑄築駐專磚轉賺樁莊裝妝壯狀錐贅墜綴諄濁茲資漬蹤綜總縱鄒詛組鑽緻鐘麼為隻兇準啟闆裡靂餘鍊洩";
        private const string SimplifiedChinese = "皚藹礙愛翱襖奧壩罷擺敗頒辦絆幫綁鎊謗剝飽寶報鮑輩貝鋇狽備憊綳筆畢斃閉邊編貶變辯辮鱉癟瀕濱賓擯餅撥缽鉑駁卜補參蠶殘慚慘燦蒼艙倉滄廁側冊測層詫攙摻蟬饞讒纏鏟產闡顫場嘗長償腸廠暢鈔車徹塵陳襯撐稱懲誠騁痴遲馳恥齒熾沖蟲寵疇躊籌綢丑櫥廚鋤雛礎儲觸處傳瘡闖創錘純綽辭詞賜聰蔥囪從叢湊竄錯達帶貸擔單鄲撣膽憚誕彈當擋黨盪檔搗島禱導盜燈鄧敵滌遞締點墊電淀釣調迭諜疊釘頂錠訂東動棟凍斗犢獨讀賭鍍鍛斷緞兌隊對噸頓鈍奪鵝額訛惡餓兒爾餌貳發罰閥琺礬釩煩范販飯訪紡飛廢費紛墳奮憤糞豐楓鋒風瘋馮縫諷鳳膚輻撫輔賦復負訃婦縛該鈣蓋干趕稈贛岡剛鋼綱崗皋鎬擱鴿閣鉻個給龔宮鞏貢鈎溝構購夠蠱顧剮關觀館慣貫廣規硅歸龜閨軌詭櫃貴劊輥滾鍋國過駭韓漢閡鶴賀橫轟鴻紅后壺護滬戶嘩華畫划話懷壞歡環還緩換喚瘓煥渙黃謊揮輝毀賄穢會燴匯諱誨繪葷渾伙獲貨禍擊機積飢譏雞績緝極輯級擠幾薊劑濟計記際繼紀夾莢頰賈鉀價駕殲監堅箋間艱緘繭檢鹼礆揀撿簡儉減薦檻鑒踐賤見鍵艦劍餞漸濺澗漿蔣槳獎講醬膠澆驕嬌攪鉸矯僥腳餃繳絞轎較秸階節莖驚經頸靜鏡徑痙競凈糾廄舊駒舉據鋸懼劇鵑絹傑潔結誡屆緊錦僅謹進晉燼盡勁荊覺決訣絕鈞軍駿開凱顆殼課墾懇摳庫褲誇塊儈寬礦曠況虧巋窺饋潰擴闊蠟臘萊來賴藍欄攔籃闌蘭瀾讕攬覽懶纜爛濫撈勞澇樂鐳壘類淚籬離里鯉禮麗厲勵礫歷瀝隸倆聯蓮連鐮憐漣簾斂臉鏈戀煉練糧涼兩輛諒療遼鐐獵臨鄰鱗凜賃齡鈴凌靈嶺領餾劉龍聾嚨籠壟攏隴樓婁摟簍蘆盧顱廬爐擄鹵虜魯賂祿錄陸驢呂鋁侶屢縷慮濾綠巒攣孿灤亂掄輪倫侖淪綸論蘿羅邏鑼籮騾駱絡媽瑪碼螞馬罵嗎買麥賣邁脈瞞饅蠻滿謾貓錨鉚貿么霉沒鎂門悶們錳夢謎彌覓綿緬廟滅憫閩鳴銘謬謀畝鈉納難撓腦惱鬧餒膩攆捻釀鳥聶嚙鑷鎳檸獰寧擰濘鈕紐膿濃農瘧諾歐鷗毆嘔漚盤龐賠噴鵬騙飄頻貧蘋憑評潑頗撲鋪朴譜臍齊騎豈啟氣棄訖牽扦釺鉛遷簽謙錢鉗潛淺譴塹槍嗆牆薔強搶鍬橋喬僑翹竅竊欽親輕氫傾頃請慶瓊窮趨區軀驅齲顴權勸卻鵲讓饒擾繞熱韌認紉榮絨軟銳閏潤灑薩鰓賽傘喪騷掃澀殺紗篩曬閃陝贍繕傷賞燒紹賒攝懾設紳審嬸腎滲聲繩勝聖師獅濕詩屍時蝕實識駛勢釋飾視試壽獸樞輸書贖屬術樹豎數帥雙誰稅順說碩爍絲飼聳慫頌訟誦擻蘇訴肅雖綏歲孫損筍縮瑣鎖獺撻抬攤貪癱灘壇譚談嘆湯燙濤絛騰謄銻題體屜條貼鐵廳聽烴銅統頭圖塗團頹蛻脫鴕馱駝橢窪襪彎灣頑萬網韋違圍為濰維葦偉偽緯謂衛溫聞紋穩問瓮撾蝸渦窩嗚鎢烏誣無蕪吳塢霧務誤錫犧襲習銑戲細蝦轄峽俠狹廈杴鮮纖咸賢銜閑顯險現獻縣餡羡憲線廂鑲鄉詳響項蕭銷曉嘯蠍協挾攜脅諧寫瀉謝鋅釁興洶銹綉虛噓須許緒續軒懸選癬絢學勛詢尋馴訓訊遜壓鴉鴨啞亞訝閹煙鹽嚴顏閻艷厭硯彥諺驗鴦楊揚瘍陽癢養樣瑤搖堯遙窯謠葯爺頁業葉醫銥頤遺儀彝蟻藝億憶義詣議誼譯異繹蔭陰銀飲櫻嬰鷹應纓瑩螢營熒蠅穎喲擁佣癰踴詠涌優憂郵鈾猶游誘輿魚漁娛與嶼語吁御獄譽預馭鴛淵轅園員圓緣遠願約躍鑰岳粵悅閱雲鄖勻隕運蘊醞暈韻雜災載攢暫贊贓臟鑿棗灶責擇則澤賊贈扎札軋鍘閘詐齋債氈盞斬輾嶄棧戰綻張漲帳賬脹趙蟄轍鍺這貞針偵診鎮陣掙睜猙幀鄭證織職執紙摯擲幟質鍾終種腫眾謅軸皺晝驟豬諸誅燭矚囑貯鑄築駐專磚轉賺樁庄裝妝壯狀錐贅墜綴諄濁茲資漬蹤綜總縱鄒詛組鑽致鍾么為只凶准啟板里靂余鏈泄";
        /// <summary>
        /// Character translation table built by pairing <c>TraditionalChinese</c> and
        /// <c>SimplifiedChinese</c> position by position.
        /// </summary>
        private static readonly Dictionary<char, char> TranslationChinese = BuildTranslationTable();

        /// <summary>
        /// Builds the translation table without failing on imperfect source data.
        /// </summary>
        /// <remarks>
        /// The traditional list contains at least one character twice ('疊'). Building the table
        /// with ToDictionary would throw ArgumentException on that repeated key inside the static
        /// initializer and make the whole class unusable, so entries are written through the
        /// indexer instead: when a character is listed more than once, the last pairing wins.
        /// Only positions present in both lists are paired, so a length mismatch cannot cause an
        /// out-of-range read.
        /// </remarks>
        /// <returns>A dictionary mapping each listed source character to its replacement.</returns>
        private static Dictionary<char, char> BuildTranslationTable()
        {
            var pairCount = TraditionalChinese.Length < SimplifiedChinese.Length
                ? TraditionalChinese.Length
                : SimplifiedChinese.Length;

            var table = new Dictionary<char, char>(pairCount);
            for (var i = 0; i < pairCount; i++)
            {
                table[TraditionalChinese[i]] = SimplifiedChinese[i];
            }
            return table;
        }
        private static string SkipList = " \t\r\n~!@#$%^&*()_+-=【】、{}|;':\",。、《》?αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ。,、;:?!…—·ˉ¨‘’“”々~‖∶"'`|〃〔〕〈〉《》「」『』.〖〗【】()[]{}ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑⒒⒓⒔⒕⒖⒗⒘⒙⒚⒛㈠㈡㈢㈣㈤㈥㈦㈧㈨㈩①②③④⑤⑥⑦⑧⑨⑩⑴⑵⑶⑷⑸⑹⑺⑻⑼⑽⑾⑿⒀⒁⒂⒃⒄⒅⒆⒇≈≡≠=≤≥<>≮≯∷±+-×÷/∫∮∝∞∧∨∑∏∪∩∈∵∴⊥∥∠⌒⊙≌∽√§№☆★○●◎◇◆□℃‰€■△▲※→←↑↓〓¤°#&@\︿_ ̄―♂♀┌┍┎┐┑┒┓─┄┈├┝┞┟┠┡┢┣│┆┊┬┭┮┯┰┱┲┳┼┽┾┿╀╁╂╃└┕┖┗┘┙┚┛━┅┉┤┥┦┧┨┩┪┫┃┇┋┴┵┶┷┸┹┺┻╋╊╉╈╇╆╅╄";
        private static BitArray SkipBitArray = new BitArray(char.MaxValue);
        /// <summary>
        /// Builds the keyword tree and the ignorable-character table. Must be called once before
        /// <c>Find</c> or <c>Replace</c>; once initialisation has completed, later calls return
        /// without doing anything.
        /// </summary>
        /// <param name="keyWords">Sensitive words to detect. Null or empty entries are ignored. The strings themselves are not modified.</param>
        /// <exception cref="ArgumentNullException"><paramref name="keyWords"/> is null.</exception>
        public static void Init(string[] keyWords)
        {
            if (_root != null) return;
            if (keyWords == null) throw new ArgumentNullException("keyWords");

            lock (LockObj)
            {
                // Another caller may have completed initialisation while this one waited for the lock.
                if (_root != null) return;

                /* Register the ignorable characters in their normalised form. */
                foreach (var c in NormalizedCopy(SkipList))
                {
                    if (c < SkipBitArray.Length)
                        SkipBitArray[c] = true;
                }

                // The tree is assembled on a local root and only assigned to _root when complete,
                // so Find can never observe a half-built tree.
                var root = CreateNode(null);
                foreach (var keyWord in keyWords)
                {
                    if (string.IsNullOrEmpty(keyWord)) continue;

                    var word = NormalizedCopy(keyWord);
                    var node = root;
                    for (var j = 0; j < word.Length; j++)
                    {
                        var t = word[j];
                        node.CharArray[t] = true;

                        if (node.Child == null)
                            node.Child = new Dictionary<char, FilterKeyWordsNode>();

                        FilterKeyWordsNode child;
                        if (!node.Child.TryGetValue(t, out child))
                        {
                            child = CreateNode(word.Substring(0, j + 1));
                            node.Child.Add(t, child);
                        }
                        node = child;
                    }

                    // The node reached by the last character spells a complete keyword.
                    node.IsEnd = true;
                }

                _root = root;
            }
        }

        /// <summary>
        /// Returns a normalised copy of <paramref name="text"/>.
        /// </summary>
        /// <remarks>
        /// MapChar rewrites the string it is given in place. Running it on a private copy keeps
        /// the caller's strings and the SkipList literal unchanged; string literals are interned,
        /// so rewriting one would alter every other use of the same literal.
        /// </remarks>
        private static string NormalizedCopy(string text)
        {
            var copy = new string(text.ToCharArray());
            MapChar(copy);
            return copy;
        }

        /// <summary>
        /// Creates a tree node whose character bitmap has one slot for every possible char value.
        /// </summary>
        /// <param name="value">The normalised prefix spelled by the path to this node; null for the root.</param>
        private static FilterKeyWordsNode CreateNode(string value)
        {
            // char.MaxValue + 1 slots so that index U+FFFF is valid as well.
            return new FilterKeyWordsNode { CharArray = new BitArray(char.MaxValue + 1), Value = value };
        }

        /// <summary>
        /// Collects the distinct pieces of <paramref name="text"/> that matched a keyword.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <param name="keyWords">Matched substrings of <paramref name="text"/>, without repeats, in the order the position lookup yields them.</param>
        /// <returns>True when at least one match was found.</returns>
        public static bool Find(string text, out string[] keyWords)
        {
            var seen = new HashSet<string>();
            var hits = new List<string>();

            foreach (var hit in Find(text))
            {
                var word = text.Substring(hit.Key, hit.Value);
                if (seen.Add(word))
                {
                    hits.Add(word);
                }
            }

            keyWords = hits.ToArray();
            return keyWords.Length != 0;
        }

        /// <summary>
        /// Masks every matched span of <paramref name="text"/> with '*' characters.
        /// </summary>
        /// <remarks>
        /// The input string is left untouched and the masked text is returned as a new string.
        /// Writing through a pointer into the caller's string would change every other reference
        /// to the same instance, including interned literals.
        /// </remarks>
        /// <param name="text">Text to filter. Null or empty text is returned unchanged.</param>
        /// <returns>A string of the same length in which each matched character is replaced by '*'.</returns>
        /// <exception cref="InvalidOperationException">Init has not been called.</exception>
        public static string Replace(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                // Still reports a missing Init for null or empty input.
                Find(text);
                return text;
            }

            // Scan a private copy so that normalisation never touches the caller's string.
            var dic = Find(new string(text.ToCharArray()));
            if (dic.Count == 0)
                return text;

            var masked = text.ToCharArray();
            foreach (var i in dic)
            {
                for (var index = 0; index < i.Value; index++)
                {
                    masked[i.Key + index] = '*';
                }
            }
            return new string(masked);
        }

        /// <summary>
        /// Replaces every matched span with the text produced by <paramref name="replaceAction"/>.
        /// </summary>
        /// <param name="text">Text to filter. Null or empty text is returned unchanged. The string itself is not modified.</param>
        /// <param name="replaceAction">Receives the matched substring, its start index and its length; returns the replacement. A null return removes the span.</param>
        /// <returns>The filtered text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="replaceAction"/> is null.</exception>
        /// <exception cref="InvalidOperationException">Init has not been called.</exception>
        public static string Replace(string text, ReplaceDelegate replaceAction)
        {
            if (replaceAction == null) throw new ArgumentNullException("replaceAction");

            if (string.IsNullOrEmpty(text))
            {
                // Still reports a missing Init for null or empty input.
                Find(text);
                return text;
            }

            // Scan a private copy so that normalisation never touches the caller's string and the
            // callback receives the text exactly as it was passed in.
            var dic = Find(new string(text.ToCharArray()));
            if (dic.Count == 0)
                return text;

            var result = new System.Text.StringBuilder(text.Length);
            var cursor = 0;

            // Dictionary enumeration order is not specified, so matches are sorted by position
            // before the output is assembled from left to right.
            foreach (var i in dic.OrderBy(p => p.Key))
            {
                var from = i.Key < cursor ? cursor : i.Key;
                var to = i.Key + i.Value;

                // A match that lies entirely inside text already replaced has nothing left to replace.
                if (to <= from) continue;

                result.Append(text, cursor, from - cursor);
                result.Append(replaceAction(text.Substring(from, to - from), from, to - from));
                cursor = to;
            }

            result.Append(text, cursor, text.Length - cursor);
            return result.ToString();
        }


        /// <summary>
        /// Locates keyword matches in <paramref name="src"/>.
        /// </summary>
        /// <remarks>
        /// Matching runs on a normalised private copy, so <paramref name="src"/> itself is never
        /// modified. Ignorable characters and HTML-like tags between the characters of a keyword
        /// are stepped over. At each position the longest keyword wins, and scanning resumes
        /// after the match, so reported spans never overlap.
        /// </remarks>
        /// <param name="src">Text to scan; null or empty yields an empty result.</param>
        /// <returns>A dictionary mapping the start index of each match to its length, both measured in <paramref name="src"/>.</returns>
        /// <exception cref="InvalidOperationException">Init has not been called.</exception>
        unsafe public static Dictionary<int, int> Find(string src)
        {
            if (_root == null)
                throw new InvalidOperationException("未初始化");

            var findResult = new Dictionary<int, int>();
            if (string.IsNullOrEmpty(src))
                return findResult;

            // MapChar rewrites its argument in place; give it a copy with identical positions.
            var normalized = new string(src.ToCharArray());
            MapChar(normalized);

            fixed (char* text = normalized)
            {
                char* end = text + normalized.Length;
                char* t1 = text;
                while (t1 < end)
                {
                    // Skip an HTML tag before trying to match.
                    if (*t1 == '<')
                    {
                        t1 = SkipHtmlTag(t1, end);
                        if (t1 >= end)
                            break;
                    }

                    char* matchEnd = MatchAt(t1, end);
                    if (matchEnd > t1)
                    {
                        findResult.Add((int)(t1 - text), (int)(matchEnd - t1));
                        t1 = matchEnd;
                    }
                    else
                        t1++;
                }
            }

            return findResult;
        }

        /// <summary>
        /// Walks the keyword tree from <paramref name="start"/> and returns the position just past
        /// the longest keyword found there, or <paramref name="start"/> when nothing matches.
        /// Never reads at or beyond <paramref name="end"/>.
        /// </summary>
        private static unsafe char* MatchAt(char* start, char* end)
        {
            var node = _root;
            char* pos = start;
            char* matchEnd = start;

            while (pos < end && node.Child != null)
            {
                char c = *pos;
                FilterKeyWordsNode next;
                if (IsSet(node.CharArray, c) && node.Child.TryGetValue(c, out next))
                {
                    node = next;
                    pos++;
                    // Remember the latest complete keyword so that a longer candidate which
                    // fails later does not discard it.
                    if (node.IsEnd)
                        matchEnd = pos;
                    continue;
                }

                // Ignorable characters are stepped over only inside a partially matched keyword,
                // never before its first character and never directly after a complete keyword.
                if (pos == start || node.IsEnd || !IsSet(SkipBitArray, c))
                    break;

                pos = c == '<' ? SkipHtmlTag(pos, end) : pos + 1;
            }

            return matchEnd;
        }

        /// <summary>
        /// Advances from the '&lt;' at <paramref name="open"/> to the closing '&gt;', to the first
        /// character of 128 or above (which ends the skip), or to <paramref name="end"/>, and
        /// returns that position.
        /// </summary>
        private static unsafe char* SkipHtmlTag(char* open, char* end)
        {
            char* pos = open + 1;
            while (pos < end && *pos != '>' && *pos < 128)
                pos++;
            return pos;
        }

        /// <summary>
        /// Bounds-checked bitmap test; a character beyond the bitmap's length counts as not set.
        /// </summary>
        private static bool IsSet(BitArray bits, char c)
        {
            return c < bits.Length && bits[c];
        }

        // Lookup table indexed by character code; built on first use.
        private static char[] MapList;

        /// <summary>
        /// Normalises <paramref name="src"/> IN PLACE: characters listed in the translation table
        /// are replaced, full-width forms (U+FF01..U+FF5E) become their half-width counterparts,
        /// and 'A'..'Z' become lower case.
        /// </summary>
        /// <remarks>
        /// The characters of the given string object are overwritten, so every reference to that
        /// instance sees the change. Pass a private copy unless that is intended.
        /// </remarks>
        /// <param name="src">String whose characters are rewritten.</param>
        /// <exception cref="ArgumentNullException"><paramref name="src"/> is null.</exception>
        unsafe public static void MapChar(string src)
        {
            if (src == null) throw new ArgumentNullException("src");

            var table = MapList;
            if (table == null)
            {
                // Fill the table completely before storing it in the shared field, so no caller
                // can read a partially filled table.
                table = BuildMapTable();
                MapList = table;
            }

            fixed (char* text = src, dic = table)
            {
                char* cursor = text;
                char* stop = text + src.Length;
                while (cursor < stop)
                {
                    // The table has an entry for every char value, so this read is always in range.
                    *cursor = dic[*cursor];
                    cursor++;
                }
            }
        }

        /// <summary>
        /// Builds the normalisation table with one entry for every possible char value.
        /// </summary>
        private static char[] BuildMapTable()
        {
            // char.MaxValue + 1 entries, so that U+FFFF is a valid index too.
            var table = new char[char.MaxValue + 1];
            for (int i = 0; i < table.Length; i++)
            {
                var c = (char)i;
                /* translation-table lookup for the CJK range */
                if (c > 0x4E00 && c < 0x9FFF)
                {
                    char chinese;
                    if (TranslationChinese.TryGetValue(c, out chinese))
                        c = chinese;
                }
                else
                {
                    /* full-width => half-width */
                    if (c > 0xFF00 && c < 0xFF5F)
                        c = (char)(c - 0xFEE0);

                    /* upper case => lower case; runs after the width fold so full-width capitals are lowered too */
                    if (c > 0x40 && c < 0x5b)
                        c = (char)(c + 0x20);
                }
                table[i] = c;
            }
            return table;
        }


        /// <summary>
        /// Node of the sensitive-word tree built by Init.
        /// </summary>
        private class FilterKeyWordsNode
        {
            // Child nodes keyed by the next character; stays null until Init adds the first child.
            public Dictionary<char, FilterKeyWordsNode> Child;
            // Bitmap indexed by character code; Init sets the bit of every character that has a child here.
            public BitArray CharArray;
            // True when the prefix spelled by the path to this node is itself a keyword.
            public bool IsEnd;
            // The prefix spelled by the path to this node (not set on the root).
            public string Value;
        }

        /// <summary>
        /// Callback used by the custom Replace overload to produce the replacement for one match.
        /// </summary>
        /// <param name="text">The matched string.</param>
        /// <param name="offset">Start position of the match.</param>
        /// <param name="length">Length of the matched string.</param>
        /// <returns>The replacement text.</returns>
        public delegate string ReplaceDelegate(string text, int offset, int length);


    }
FilterKeyWordsFast

 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM