JAVA敏感詞過濾


JAVA敏感詞過濾

一、初始化敏感詞庫

  1 import java.io.BufferedReader;
  2 import java.io.File;
  3 import java.io.FileInputStream;
  4 import java.io.InputStreamReader;
  5 import java.util.HashMap;
  6 import java.util.HashSet;
  7 import java.util.Iterator;
  8 import java.util.Map;
  9 import java.util.Set;
 10 
 11 /**
 12  * 初始化敏感詞庫,將敏感詞加入到HashMap中,構建DFA算法模型
 13  */
 14 public class SensitiveWordInit {
 15     private String ENCODING = "utf-8";    //字符編碼
 16     public HashMap sensitiveWordMap;
 17     public SensitiveWordInit(){
 18         super();
 19     }
 20 
 21     /**
 22      * 初始化
 23      */
 24     public Map initKeyWord(){
 25         try {
 26             //讀取敏感詞庫
 27             Set<String> keyWordSet = readSensitiveWordFile();
 28             //將敏感詞庫加入到HashMap中
 29             addSensitiveWordToHashMap(keyWordSet);
 30             //spring獲取application,然后application.setAttribute("sensitiveWordMap",sensitiveWordMap);
 31         } catch (Exception e) {
 32             e.printStackTrace();
 33         }
 34         return sensitiveWordMap;
 35     }
 36 
 37     /**
 38      * 讀取敏感詞庫,將敏感詞放入HashSet中,構建一個DFA算法模型:<br>
 39      * 中 = {
 40      *      isEnd = 0
 41      *      國 = {<br>
 42      *           isEnd = 1
 43      *           人 = {isEnd = 0
 44      *                民 = {isEnd = 1}
 45      *                }
 46      *           男  = {
 47      *                  isEnd = 0
 48      *                   人 = {
 49      *                        isEnd = 1
 50      *                       }
 51      *               }
 52      *           }
 53      *      }
 54      *  五 = {
 55      *      isEnd = 0
 56      *      星 = {
 57      *          isEnd = 0
 58      *          紅 = {
 59      *              isEnd = 0
 60      *              旗 = {
 61      *                   isEnd = 1
 62      *                  }
 63      *              }
 64      *          }
 65      *      }
 66      */
 67     private void addSensitiveWordToHashMap(Set<String> keyWordSet) {
 68         sensitiveWordMap = new HashMap(keyWordSet.size());     //初始化敏感詞容器,減少擴容操作
 69         String key = null;
 70         Map nowMap = null;
 71         Map<String, String> newWorMap = null;
 72         //迭代keyWordSet
 73         Iterator<String> iterator = keyWordSet.iterator();
 74         while(iterator.hasNext()){
 75             key = iterator.next();    //關鍵字
 76             nowMap = sensitiveWordMap;
 77             for(int i = 0 ; i < key.length() ; i++){
 78                 char keyChar = key.charAt(i);       //轉換成char型
 79                 Object wordMap = nowMap.get(keyChar);       //獲取
 80 
 81                 if(wordMap != null){        //如果存在該key,直接賦值
 82                     nowMap = (Map) wordMap;
 83                 }
 84                 else{     //不存在則,則構建一個map,同時將isEnd設置為0,因為他不是最后一個
 85                     newWorMap = new HashMap<String,String>();
 86                     newWorMap.put("isEnd", "0");     //不是最后一個
 87                     nowMap.put(keyChar, newWorMap);
 88                     nowMap = newWorMap;
 89                 }
 90 
 91                 if(i == key.length() - 1){
 92                     nowMap.put("isEnd", "1");    //最后一個
 93                 }
 94             }
 95         }
 96     }
 97 
 98     /**
 99      * 讀取敏感詞庫中的內容,將內容添加到set集合中
100      */
101     @SuppressWarnings("resource")
102     private Set<String> readSensitiveWordFile() throws Exception{
103         Set<String> set = null;
104         //https://github.com/heqiyoujing/config_file 詞庫地址
105         File file = new File("D:\\SensitiveWord.txt");    //讀取文件
106         InputStreamReader read = new InputStreamReader(new FileInputStream(file),ENCODING);
107         try {
108             if(file.isFile() && file.exists()){      //文件流是否存在
109                 set = new HashSet<String>();
110                 BufferedReader bufferedReader = new BufferedReader(read);
111                 String txt = null;
112                 while((txt = bufferedReader.readLine()) != null){    //讀取文件,將文件內容放入到set中
113                     set.add(txt);
114                 }
115             }
116             else{         //不存在拋出異常信息
117                 throw new Exception("敏感詞庫文件不存在");
118             }
119         } catch (Exception e) {
120             throw e;
121         }finally{
122             read.close();     //關閉文件流
123         }
124         return set;
125     }
126 }
View Code

二、檢查敏感詞並替換

  1 import java.util.HashSet;
  2 import java.util.Iterator;
  3 import java.util.Map;
  4 import java.util.Set;
  5 
  6 /**
  7  * 敏感詞過濾
  8  */
  9 public class SensitivewordFilter {
 10     private Map sensitiveWordMap = null;
 11     public static int minMatchTYpe = 1;      //最小匹配規則
 12     public static int maxMatchType = 2;      //最大匹配規則
 13     private static String replaceString = null;
 14     /**例如:敏感詞中含有中國人、中國
 15      * 最小匹配規則minMatchTYpe為1時,會匹配出**人,為2時,會匹配出***
 16      * */
 17     public static void main(String[] args) throws Exception{
 18         SensitivewordFilter filter = new SensitivewordFilter();
 19         System.out.println("敏感詞的數量:" + filter.sensitiveWordMap.size());
 20         String string = "dfa是面向三級裝配的設計(Design for assembly)的英文簡稱,是指在產品設計階段設計產品使得產品具有良好" +
 21                 "的可裝配性,確保裝配工序簡單、裝配效率高、裝配質量高、裝配不良率低和裝配成本低。面向裝配的設計通過一系" +
 22                 "列有利於裝配的設計指南例如簡化產品設計、減少零件數量等,女女並同裝配工程師一起合作,被逼簡化產品結構,近親使其便於" +
 23                 "裝配,為提高產品質量、縮短產品開發周期和降低產品成本奠定基礎";
 24         // ------獲取敏感詞---------
 25         Set<String> set = filter.getSensitiveWord(string, 1);
 26         System.out.println("含敏感詞的個數為:" + set.size() + "。包含:" + set);
 27         // ------------------------替換敏感字begin----------------------
 28         Iterator<String> iterator = set.iterator();
 29         String word = null;
 30         while (iterator.hasNext()) {
 31             word = iterator.next();
 32             /**
 33              * 得到word中敏感關鍵詞被替換后的字符串,例如:***
 34              * */
 35             getReplaceCharsS("*", word.length());
 36             /**
 37              * 將原字符串中的敏感關鍵詞替換成帶有replaceChar
 38              * 或全部為replaceChar的關鍵詞
 39              * */
 40             string = string.replaceAll(word, replaceString);
 41         }
 42         // ------------------------替換敏感字end----------------------
 43         System.out.println(string);
 44     }
 45 
 46     /**
 47      * 構造函數,初始化敏感詞庫
 48      */
 49     public SensitivewordFilter(){
 50         sensitiveWordMap = new SensitiveWordInit().initKeyWord();
 51     }
 52 
 53     /**
 54      * 判斷文字是否包含敏感字符
 55      * @param matchType  匹配規則&nbsp;1:最小匹配規則,2:最大匹配規則
 56      */
 57     public boolean isContaintSensitiveWord(String txt,int matchType){
 58         boolean flag = false;
 59         for(int i = 0 ; i < txt.length() ; i++){
 60             int matchFlag = this.CheckSensitiveWord(txt, i, matchType); //判斷是否包含敏感字符
 61             if(matchFlag > 0){    //大於0存在,返回true
 62                 flag = true;
 63             }
 64         }
 65         return flag;
 66     }
 67 
 68     /**
 69      * 獲取文字中的敏感詞
 70      * @param matchType 匹配規則&nbsp;1:最小匹配規則,2:最大匹配規則
 71      */
 72     public Set<String> getSensitiveWord(String txt , int matchType){
 73         Set<String> sensitiveWordList = new HashSet<String>();
 74 
 75         for(int i = 0 ; i < txt.length() ; i++){
 76             int length = CheckSensitiveWord(txt, i, matchType);    //判斷是否包含敏感字符
 77             if(length > 0){    //存在,加入list中
 78                 sensitiveWordList.add(txt.substring(i, i+length));
 79                 i = i + length - 1;    //減1的原因,是因為for會自增
 80             }
 81         }
 82 
 83         return sensitiveWordList;
 84     }
 85 
 86     /**
 87      * 替換敏感字字符,默認*
 88      */
 89     public String replaceSensitiveWord(String txt,int matchType,String replaceChar){
 90         String resultTxt = txt;
 91         Set<String> set = getSensitiveWord(txt, matchType);     //獲取所有的敏感詞
 92         Iterator<String> iterator = set.iterator();
 93         String word = null;
 94         String replaceString = null;
 95         while (iterator.hasNext()) {
 96             word = iterator.next();
 97             replaceString = getReplaceChars(replaceChar, word.length());
 98             resultTxt = resultTxt.replaceAll(word, replaceString);
 99         }
100 
101         return resultTxt;
102     }
103 
104     /**
105      * 獲取替換字符串
106      */
107     private String getReplaceChars(String replaceChar,int length){
108         String resultReplace = replaceChar;
109         for(int i = 1 ; i < length ; i++){
110             resultReplace += replaceChar;
111         }
112 
113         return resultReplace;
114     }
115 
116     /**
117      * 獲取替換字符串,無返回值
118      */
119     private static void getReplaceCharsS(String replaceChar,int length){
120         replaceString = "";
121         String resultReplace = replaceChar;
122         for(int i = 1 ; i < length ; i++){
123             resultReplace += replaceChar;
124         }
125         replaceString = resultReplace;
126     }
127 
128     /**
129      * 檢查文字中是否包含敏感字符,檢查規則如下:<br>
130      */
131     @SuppressWarnings({ "rawtypes"})
132     public int CheckSensitiveWord(String txt,int beginIndex,int matchType){
133         boolean  flag = false;    //敏感詞結束標識位:用於敏感詞只有1位的情況
134         int matchFlag = 0;     //匹配標識數默認為0
135         char word = 0;
136         Map nowMap = sensitiveWordMap;
137         for(int i = beginIndex; i < txt.length() ; i++){
138             word = txt.charAt(i);
139             nowMap = (Map) nowMap.get(word);     //獲取指定key
140             if(nowMap != null){     //存在,則判斷是否為最后一個
141                 matchFlag++;     //找到相應key,匹配標識+1
142                 if("1".equals(nowMap.get("isEnd"))){       //如果為最后一個匹配規則,結束循環,返回匹配標識數
143                     flag = true;       //結束標志位為true
144                     if(SensitivewordFilter.minMatchTYpe == matchType){    //最小規則,直接返回,最大規則還需繼續查找
145                         break;
146                     }
147                 }
148             }
149             else{     //不存在,直接返回
150                 break;
151             }
152         }
153         if(matchFlag < 2 || !flag){        //長度必須大於等於1,為詞
154             matchFlag = 0;
155         }
156         return matchFlag;
157     }
158 
159 }
View Code

三、運行結果

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM