Preface
Almost every website that publishes user-generated text is expected to filter out reactionary, unhealthy, or otherwise socially harmful terms before the content goes live. This article presents two ways to implement that kind of sensitive word filtering.
The first approach
- Create the sensitive word file: prepare a txt file that holds the words to be filtered and place it in the root of the resources directory (a sample of the format is shown below).
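For reference, the word list is just plain text with one term per line. A hypothetical censorwords.txt might look like the following (the entries are placeholders, not taken from any real word list):

badword1
badword2
another banned phrase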
The code is as follows:
package com.xxxx.service;
import lombok.Data;
import org.springframework.stereotype.Service;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* Sensitive word service
*
* @author
* @date
*/
@Data
@Service
public class SensitiveWordService {
private StringBuilder replaceAll;
/**
* Character encoding
* <P>
* Used when reading the sensitive word file
*/
private String encoding = "UTF-8";
/**
* Replacement string
* <P>
* The string used to mask sensitive words
*/
private String replceStr = "*";
/**
* Maximum length of a single replacement (size of the pre-built mask buffer)
*/
private int replceSize = 500;
/**
* Sensitive word file
* <P>
* Placed in the root of the resources directory
*/
private String fileName = "censorwords.txt";
private List<String> arrayList;
/**
* Sensitive words found in the input, with duplicates removed
*/
public Set<String> sensitiveWordSet;
/**
* Sensitive words found in the input, including duplicates, used to count occurrences
*/
public List<String> sensitiveWordList;
/**
* Remove sensitive words
*
* @param str the string to be filtered
*
* @return the filtered string
*/
public String removeSensitiveWord(String str){
// (re)load the word list from the configured file on every call, then filter
InitializationWork();
return filterInfo(str);
}
/**
* Filter the input
* <P>
* Replaces every sensitive word in the given text
*
* @param str the text to be filtered
*
* @return the filtered text
*/
public String filterInfo(String str) {
sensitiveWordSet = new HashSet<String>();
sensitiveWordList= new ArrayList<>();
StringBuilder buffer = new StringBuilder(str);
HashMap<Integer, Integer> hash = new HashMap<Integer, Integer>(arrayList.size());
String temp;
for(int x = 0; x < arrayList.size();x++) {
temp = arrayList.get(x);
int findIndexSize = 0;
for(int start = -1;(start=buffer.indexOf(temp,findIndexSize)) > -1;){
//System.out.println("###replace="+temp);
findIndexSize = start+temp.length();//continue searching after the match just found
Integer mapStart = hash.get(start);//end index already recorded for this start position
//update the map if this start position is new or the current match is longer
if(mapStart == null || findIndexSize > mapStart){
hash.put(start, findIndexSize);
//System.out.println("###sensitive word:"+buffer.substring(start, findIndexSize));
}
}
}
Collection<Integer> startIndexes = hash.keySet();
for(Integer startIndex : startIndexes){
Integer endIndex = hash.get(startIndex);
//extract the sensitive word and record it so occurrences can be counted
String sensitive = buffer.substring(startIndex, endIndex);
//System.out.println("###sensitive word:"+sensitive);
if (!sensitive.contains("*")) {//skip text that is already masked, then add the word to the collections
sensitiveWordSet.add(sensitive);
sensitiveWordList.add(sensitive);
}
buffer.replace(startIndex, endIndex, replaceAll.substring(0,endIndex-startIndex));
}
hash.clear();
return buffer.toString();
}
/**
* Initialize the sensitive word dictionary
*/
private void InitializationWork() {
replaceAll = new StringBuilder(replceSize);
for(int x=0;x < replceSize;x++)
{
replaceAll.append(replceStr);
}
//load the word list
arrayList = new ArrayList<String>();
InputStreamReader read = null;
BufferedReader bufferedReader = null;
try {
read = new InputStreamReader(SensitiveWordService.class.getClassLoader().getResourceAsStream(fileName),encoding);
bufferedReader = new BufferedReader(read);
for(String txt = null;(txt = bufferedReader.readLine()) != null;){
if(!arrayList.contains(txt))
arrayList.add(txt);
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}finally{
try {
if(null != bufferedReader)
bufferedReader.close();
} catch (IOException e) {
e.printStackTrace();
}
try {
if(null != read)
read.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* Test entry point
*
* @param args command-line arguments
*/
public static void main(String[] args){
long startNumber = System.currentTimeMillis();
SensitiveWordService sw = new SensitiveWordService("censorwords.txt");
sw.InitializationWork();
//System.out.println("Number of sensitive words: " + sw.getArrayList().size());
String str = "Hello there, this text contains sensitive words, please filter me";
System.out.println("Length of the input string: "+str.length());
str = sw.filterInfo(str);
long endNumber = System.currentTimeMillis();
//System.out.println("Number of distinct sensitive words in the text: " + sw.sensitiveWordSet.size() + ", namely: " + sw.sensitiveWordSet);
//System.out.println("Number of sensitive word occurrences in the text: " + sw.sensitiveWordList.size() + ", namely: " + sw.sensitiveWordList);
System.out.println("Total time: "+(endNumber-startNumber)+"ms");
System.out.println("Filtered string:\n"+str);
System.out.println("Filtered string length:\n"+str.length());
}
/**
* Constructor
* <P>
* The file must be on the classpath (under src or resources); the default file name is censorwords.txt
* @param fileName name of the word list file (including the extension)
*/
public SensitiveWordService(String fileName) {
this.fileName = fileName;
}
/**
* Constructor
*
* @param replceStr the string used to mask sensitive words
* @param replceSize initial capacity of the mask buffer
*/
public SensitiveWordService(String replceStr, int replceSize){
this.replceStr = replceStr;
this.replceSize = replceSize;
}
/**
* No-argument constructor
*/
public SensitiveWordService(){
}
}
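Because the class is annotated with @Service, it can be injected into any other Spring bean instead of being instantiated by hand. The sketch below is a hypothetical usage example (the CommentController class and the /comments endpoint are not part of the original article, and it assumes spring-boot-starter-web is on the classpath) showing how removeSensitiveWord could be called before user content is published:

package com.xxxx.controller;
import com.xxxx.service.SensitiveWordService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
/**
* Hypothetical controller demonstrating how the service could be used
*/
@RestController
public class CommentController {
@Autowired
private SensitiveWordService sensitiveWordService;
@PostMapping("/comments")
public String publish(@RequestParam String content) {
// mask sensitive words before the comment is stored or displayed
return sensitiveWordService.removeSensitiveWord(content);
}
}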
The second approach
package com.xxxx.filters;
import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* [Good matching accuracy, but relatively slow]
* Java keyword filtering: http://blog.csdn.net/linfssay/article/details/7599262
* @author ShengDecheng
*
*/
public class KeyWordFilter {
private static Pattern pattern = null;
private static int keywordsCount = 0;
// Build the regular expression string from keywords.properties
private static void initPattern() {
StringBuffer patternBuffer = new StringBuffer();
try {
InputStream in = KeyWordFilter.class.getClassLoader().getResourceAsStream("keywords.properties");
Properties property = new Properties();
property.load(in);
Enumeration<?> enu = property.propertyNames();
patternBuffer.append("(");
while (enu.hasMoreElements()) {
String scontent = (String) enu.nextElement();
patternBuffer.append(scontent + "|");
//System.out.println(scontent);
keywordsCount ++;
}
patternBuffer.deleteCharAt(patternBuffer.length() - 1);
patternBuffer.append(")");
//System.out.println(patternBuffer);
// On Unix, convert to UTF-8 if needed:
// pattern = Pattern.compile(new
// String(patternBuffer.toString().getBytes("ISO-8859-1"), "UTF-8"));
// On Windows, convert to GB2312 if needed:
// pattern = Pattern.compile(new String(patternBuffer.toString()
// .getBytes("ISO-8859-1"), "gb2312"));
// Convert the encoding here if necessary
pattern = Pattern.compile(patternBuffer.toString());
} catch (IOException ioEx) {
ioEx.printStackTrace();
}
}
private static String doFilter(String str) {
Matcher m = pattern.matcher(str);
// while (m.find()) {// find every substring that matches the pattern
// System.out.println("The result is here :" + m.group());
// }
// Choose the replacement; '*' is used here
str = m.replaceAll("*");
return str;
}
public static void main(String[] args) {
long startNumber = System.currentTimeMillis();
initPattern();
//String str = "我日,艹,fuck,你妹的 干啥呢";
System.out.println("Number of keywords: " + keywordsCount);
String str = "Hello there, this text contains sensitive words, please filter me";
System.out.println("Length of the input string: "+str.length());
str = doFilter(str);
//High-performance Java sensitive word / keyword filtering toolkit: http://blog.csdn.net/ranjio_z/article/details/6299834
//FilteredResult result = WordFilterUtil.filterText(str, '*');
long endNumber = System.currentTimeMillis();
System.out.println("總共耗時:"+(endNumber-startNumer)+"ms");
System.out.println("替換后的字符串為:\n"+str);
//System.out.println("替換后的字符串為:\n"+result.getFilteredContent());
//System.out.println("替換后的字符串為1:\n"+result.getOriginalContent());
//System.out.println("替換后的字符串為2:\n"+result.getBadWords());
}
}
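One caveat with this regex-based approach: the keywords are concatenated into the pattern as-is, so a keyword that contains a regex metacharacter (for example . or +) would change the meaning of the expression. Below is a minimal sketch of a safer pattern builder that escapes each keyword with Pattern.quote; the SafePatternBuilder class and its sample keywords are hypothetical and not part of the original code:

import java.util.Arrays;
import java.util.regex.Pattern;
/**
* Minimal sketch: build the alternation pattern with every keyword escaped,
* so regex metacharacters inside a keyword are matched literally.
*/
public class SafePatternBuilder {
public static Pattern build(Iterable<String> keywords) {
StringBuilder sb = new StringBuilder("(");
boolean first = true;
for (String keyword : keywords) {
if (!first) {
sb.append('|');
}
// Pattern.quote wraps the keyword in \Q...\E so it is treated as plain text
sb.append(Pattern.quote(keyword));
first = false;
}
sb.append(')');
return Pattern.compile(sb.toString());
}
public static void main(String[] args) {
Pattern p = build(Arrays.asList("bad.word", "c++"));
// both keywords are matched literally even though they contain . and +
System.out.println(p.matcher("text with bad.word and c++ inside").replaceAll("*"));
}
}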
The sensitive word file keywords.properties:
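The code above reads only the property names (the part to the left of =), so each keyword is written as a key on its own line. A hypothetical keywords.properties might look like this; the entries are placeholders:

# one keyword per line; only the key is used by initPattern()
badword1=
badword2=
forbidden-term=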