關鍵詞匹配是比較常見的需求,如留言、彈幕及遊戲聊天中的敏感詞過濾,都需要對一段文字進行關鍵詞匹配。提取到關鍵詞後,再做進一步處理。
本類借助PHP高效的數組和mbstring擴展,來實現對中文關鍵詞的匹配。主要思想是以關鍵詞為key,構建字典數組,這樣便可以對每個關鍵詞實現常數級別的查找。
具體代碼如下:
/**
 * Dictionary-based keyword matcher for UTF-8 text.
 *
 * Keywords are stored as array keys so that testing whether a candidate
 * substring is a keyword is a single isset() lookup. Matching scans the text
 * from left to right and, at each position, tries the longest candidate
 * first (bounded by the longest keyword ever added).
 */
class WordMatcher
{
    /**
     * Registered keywords, stored as keys (the value is always 1).
     *
     * @var array
     */
    public $dict = [];

    /**
     * Length, in characters, of the longest keyword added so far.
     * It is not reduced by removeWord(), so it may be larger than necessary
     * after removals; that only costs extra lookups, never wrong results.
     *
     * @var int
     */
    public $wordMaxLen = 0;

    /**
     * @throws \RuntimeException when the mbstring extension is unavailable
     */
    public function __construct()
    {
        if (!extension_loaded('mbstring')) {
            // Library code must not terminate the whole process; let the caller decide.
            throw new \RuntimeException('extension mbstring is not loaded');
        }
    }

    /**
     * Registers a keyword.
     *
     * @param string $word keyword to add (UTF-8)
     */
    public function addWord($word)
    {
        $len = mb_strlen($word, 'utf-8');
        $this->wordMaxLen = max($this->wordMaxLen, $len);
        $this->dict[$word] = 1;
    }

    /**
     * Unregisters a keyword; unknown keywords are ignored.
     *
     * @param string $word keyword to remove
     */
    public function removeWord($word)
    {
        unset($this->dict[$word]);
    }

    /**
     * Collects the keywords found in $str, in order of appearance.
     *
     * At every scan position only the longest keyword starting there is
     * reported. With $matchAll = false the scan then jumps past the reported
     * keyword, so results never overlap. With $matchAll = true the scan always
     * advances by a single character, so keywords that start inside a
     * previously reported keyword are reported as well.
     *
     * @param string     $str      text to scan (UTF-8)
     * @param array|null $matched  output: found keywords are appended; a
     *                             non-array value is replaced by an empty array
     * @param bool       $matchAll whether to also report overlapping keywords
     */
    public function match($str, &$matched, $matchAll = false)
    {
        // Guarantee an array result even when nothing matches or the caller
        // passed an undefined variable.
        if (!is_array($matched)) {
            $matched = [];
        }

        $rest = (string) $str;

        // Iterative scan: the previous recursive form used one stack frame per
        // consumed position and could exhaust the stack on long inputs.
        while ($rest !== '') {
            // Longest possible candidate at this position; near the end of the
            // text it is shorter than wordMaxLen, so cap the loop accordingly
            // instead of looking up the same truncated string repeatedly.
            $window = mb_substr($rest, 0, $this->wordMaxLen, 'utf-8');
            $hitLen = 0;

            for ($len = mb_strlen($window, 'utf-8'); $len > 0; $len--) {
                $candidate = mb_substr($window, 0, $len, 'utf-8');
                if (isset($this->dict[$candidate])) {
                    $hitLen = $len;
                    $matched[] = $candidate;
                    break;
                }
            }

            // Skip the whole keyword only for non-overlapping matching.
            $step = (!$matchAll && $hitLen > 0) ? $hitLen : 1;
            $rest = mb_substr($rest, $step, null, 'utf-8');
        }
    }
}

// Usage example: $matched receives the keywords found in the sentence.
$matcher = new WordMatcher;
$matcher->addWord('PHP');
$matcher->addWord('語言');
$matcher->addWord('H');


$matcher->match('PHP是最好的語言', $matched);