Preface: the TF-IDF part is borrowed from the code in this blog post:
http://www.cnblogs.com/ywl925/archive/2013/08/26/3275878.html
Because my work required it, I added feature dimensionality reduction based on information gain on top of it.
I will not repeat an introduction to TF-IDF here; the formulas are stated directly below.
TF formula:
tf(i, j) = n(i, j) / Σ_k n(k, j)
In the formula above, n(i, j) is the number of times the term appears in document d_j, and the denominator Σ_k n(k, j) is the total number of occurrences of all terms in document d_j.
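As a quick illustrative example (the numbers are made up): if a document contains 100 term occurrences in total and a given word accounts for 5 of them, its tf in that document is 5 / 100 = 0.05.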
IDF formula:
idf(i) = log( |D| / |{ j : t_i ∈ d_j }| )
- |D|: the total number of documents in the corpus
- |{ j : t_i ∈ d_j }|: the number of documents that contain the term t_i (i.e., the documents with n(i, j) ≠ 0). If the term does not appear in the corpus at all, this denominator would be zero, so in practice 1 + |{ j : t_i ∈ d_j }| is commonly used instead.
Then:
tfidf(i, j) = tf(i, j) × idf(i)
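Before the full program later in this post, here is a minimal, self-contained sketch of the two formulas on a toy in-memory corpus. It is only an illustration of the definitions above, not part of the original code; the class name TinyTfIdf, the toy documents, and all variable names are made up.

import java.util.*;

// Minimal sketch: tf, idf and tf-idf on a toy corpus (illustrative only).
public class TinyTfIdf {
    public static void main(String[] args) {
        List<List<String>> docs = Arrays.asList(
                Arrays.asList("bet", "casino", "bet"),
                Arrays.asList("news", "casino"),
                Arrays.asList("news", "politics"));

        // Document frequency: how many documents contain each word.
        Map<String, Integer> df = new HashMap<>();
        for (List<String> doc : docs) {
            for (String w : new HashSet<>(doc)) {
                df.merge(w, 1, Integer::sum);
            }
        }

        int docNum = docs.size();
        for (int i = 0; i < docNum; i++) {
            List<String> doc = docs.get(i);
            // Term frequency: count of the word divided by the document length.
            Map<String, Integer> counts = new HashMap<>();
            for (String w : doc) {
                counts.merge(w, 1, Integer::sum);
            }
            for (Map.Entry<String, Integer> e : counts.entrySet()) {
                double tf = (double) e.getValue() / doc.size();
                // idf = log(|D| / number of documents containing the word)
                double idf = Math.log((double) docNum / df.get(e.getKey()));
                System.out.printf("doc %d  %s  tfidf=%.4f%n", i, e.getKey(), tf * idf);
            }
        }
    }
}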
Information gain
Suppose a variable X can take n possible values, and the probability of taking the i-th value is Pi. The entropy of X is then defined as
H(X) = -Σ_{i=1..n} Pi · log2(Pi)
In other words, the more possible values X has, the more information X carries and the larger its entropy. For text classification or clustering, this means that the more the class (or cluster) membership of documents varies, the more information the class variable carries. The information gain that a feature T brings to the clustering C or classification C is therefore
IG(T)=H(C)-H(C|T)
H(C|T) covers two cases: the feature T appears, written t, and the feature T does not appear, written t'. Therefore
H(C|T) = P(t)H(C|t) + P(t')H(C|t')
This example is a text-classification task, so P(t) is the probability that the term appears across all classes, and H(C|t) is the entropy of the class distribution given that the term appears.
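A minimal sketch of this computation for a single term and two classes follows; it mirrors the entropy and gain logic used in the full program below, but the class name InfoGainSketch and all counts are hypothetical and chosen only to illustrate the formula.

// Minimal sketch: information gain of one term for a two-class corpus.
// The counts are hypothetical and only illustrate
// IG(T) = H(C) - P(t)H(C|t) - P(t')H(C|t').
public class InfoGainSketch {

    // Entropy of a discrete distribution given by raw counts.
    static double entropy(double[] counts, double total) {
        double h = 0.0;
        for (double c : counts) {
            if (c > 0.0) {
                h += -(c / total) * Math.log(c / total) / Math.log(2.0);
            }
        }
        return h;
    }

    public static void main(String[] args) {
        double[] classCnt = {125, 125};      // files per class (hypothetical)
        double[] docsWithTerm = {80, 5};     // files per class containing the term
        double total = classCnt[0] + classCnt[1];

        double[] docsWithoutTerm = {classCnt[0] - docsWithTerm[0],
                                    classCnt[1] - docsWithTerm[1]};
        double withTotal = docsWithTerm[0] + docsWithTerm[1];
        double withoutTotal = total - withTotal;

        double hC = entropy(classCnt, total);   // H(C)
        double pT = withTotal / total;          // P(t)
        double ig = hC - pT * entropy(docsWithTerm, withTotal)
                       - (1.0 - pT) * entropy(docsWithoutTerm, withoutTotal);
        System.out.println("IG(term) = " + ig);
    }
}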
The data in this example consists of two categories of harmful content that I collected myself: gambling and reactionary content (labelled "dubo" and "fangdong" in the code). Two selection strategies are provided: one keeps every term whose information gain exceeds a threshold, the other sorts the terms by information gain and keeps the top N as feature terms; a small sketch of both strategies follows.
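The two strategies can be sketched as follows over a map from term to information gain; the map contents, the threshold value, and the class name FeatureSelectSketch are placeholders for illustration only.

import java.util.*;
import java.util.stream.Collectors;

// Sketch of the two selection strategies over a term -> information-gain map.
public class FeatureSelectSketch {
    public static void main(String[] args) {
        Map<String, Double> infogains = new HashMap<>();
        infogains.put("casino", 0.42);
        infogains.put("news", 0.01);
        infogains.put("bet", 0.30);

        // Strategy 1: keep every term whose gain exceeds a threshold.
        double threshold = 0.05; // placeholder value
        Set<String> byThreshold = infogains.entrySet().stream()
                .filter(e -> e.getValue() > threshold)
                .map(Map.Entry::getKey)
                .collect(Collectors.toSet());

        // Strategy 2: sort by gain in descending order and keep the top N.
        int topN = 2; // placeholder value
        List<String> byTopN = infogains.entrySet().stream()
                .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
                .limit(topN)
                .map(Map.Entry::getKey)
                .collect(Collectors.toList());

        System.out.println("threshold: " + byThreshold);
        System.out.println("top-" + topN + ": " + byTopN);
    }
}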
Files involved
Stop-word list and word-segmentation jar: http://files.cnblogs.com/files/mansiisnam/%E6%96%87%E4%BB%B6.zip
The code is as follows:
package TIDF;

import java.io.*;
import java.util.*;

import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Word segmentation - TF-IDF - information gain
 * @author LJ
 *
 * @datetime 2015-6-15
 */
public class TestTfIdf {

    public static final String stopWordTable = "C:/Users/zzw/Desktop/sc_ot-tingyongzhongwen_hc/stopWordTable.txt"; // stop-word list
    private static ArrayList<String> FileList = new ArrayList<String>(); // file list

    // Recursively read all files under the given path and return the file list
    public static List<String> readDirs(String filepath)
            throws FileNotFoundException, IOException {
        try {
            File file = new File(filepath);
            if (!file.isDirectory()) {
                System.out.println("The input path is not a directory");
                System.out.println("filepath: " + file.getAbsolutePath());
            } else {
                String[] flist = file.list();
                for (int i = 0; i < flist.length; i++) {
                    File newfile = new File(filepath + "\\" + flist[i]);
                    if (!newfile.isDirectory()) {
                        FileList.add(newfile.getAbsolutePath());
                    } else if (newfile.isDirectory()) {
                        readDirs(filepath + "\\" + flist[i]);
                    }
                }
            }
        } catch (FileNotFoundException e) {
            System.out.println(e.getMessage());
        }
        return FileList;
    }

    // Read a file into a string (GBK encoding)
    public static String readFile(String file) throws FileNotFoundException,
            IOException {
        StringBuffer strSb = new StringBuffer();
        InputStreamReader inStrR = new InputStreamReader(new FileInputStream(
                file), "gbk");
        BufferedReader br = new BufferedReader(inStrR);
        String line = br.readLine();
        while (line != null) {
            strSb.append(line).append("\r\n");
            line = br.readLine();
        }
        br.close();
        return strSb.toString();
    }

    // Word segmentation and stop-word removal
    public static ArrayList<String> cutWords(String file) throws IOException {
        ArrayList<String> fenci = new ArrayList<String>();
        ArrayList<String> words = new ArrayList<String>();
        String text = TestTfIdf.readFile(file);
        IKAnalyzer analyzer = new IKAnalyzer();
        fenci = analyzer.split(text); // word segmentation (uses the jar provided with this post)
        BufferedReader StopWordFileBr = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(
                        stopWordTable))));
        // set holding the stop words
        Set<String> stopWordSet = new HashSet<String>();
        // initialise the stop-word set
        String stopWord = null;
        for (; (stopWord = StopWordFileBr.readLine()) != null;) {
            stopWordSet.add(stopWord);
        }
        StopWordFileBr.close();
        for (String word : fenci) {
            if (stopWordSet.contains(word)) {
                continue;
            }
            words.add(word);
        }
        System.out.println(words);
        return words;
    }

    // Count how many times each word occurs in one file
    public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords) {
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            if (resTF.get(word) == null) {
                resTF.put(word, 1);
                System.out.println(word);
            } else {
                resTF.put(word, resTF.get(word) + 1);
                System.out.println(word.toString());
            }
        }
        System.out.println(resTF);
        return resTF;
    }

    // Compute the tf value of every word in one file
    @SuppressWarnings("unchecked")
    public static HashMap<String, Float> tf(ArrayList<String> cutwords) {
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        int wordLen = cutwords.size();
        HashMap<String, Integer> intTF = TestTfIdf.normalTF(cutwords);
        Iterator iter = intTF.entrySet().iterator();
        while (iter.hasNext()) {
            Map.Entry entry = (Map.Entry) iter.next();
            resTF.put(entry.getKey().toString(), Float.parseFloat(entry
                    .getValue().toString()) / wordLen);
            System.out.println(entry.getKey().toString() + " = "
                    + Float.parseFloat(entry.getValue().toString()) / wordLen);
        }
        return resTF;
    }

    // Raw term counts for every file under a directory
    public static HashMap<String, HashMap<String, Integer>> normalTFAllFiles(
            String dirc) throws IOException {
        FileList.clear();
        HashMap<String, HashMap<String, Integer>> allNormalTF = new HashMap<String, HashMap<String, Integer>>();
        List<String> filelist = TestTfIdf.readDirs(dirc);
        for (String file : filelist) {
            HashMap<String, Integer> dict = new HashMap<String, Integer>();
            ArrayList<String> cutwords = TestTfIdf.cutWords(file);
            dict = TestTfIdf.normalTF(cutwords);
            allNormalTF.put(file, dict);
        }
        return allNormalTF;
    }

    // Return the tf values of all files under a directory
    public static HashMap<String, HashMap<String, Float>> tfAllFiles(String dirc)
            throws IOException {
        FileList.clear();
        HashMap<String, HashMap<String, Float>> allTF = new HashMap<String, HashMap<String, Float>>();
        List<String> filelist = TestTfIdf.readDirs(dirc);
        for (String file : filelist) {
            HashMap<String, Float> dict = new HashMap<String, Float>();
            ArrayList<String> cutwords = TestTfIdf.cutWords(file);
            dict = TestTfIdf.tf(cutwords);
            allTF.put(file, dict);
        }
        return allTF;
    }

    // Compute the idf of every word under a directory
    @SuppressWarnings("unchecked")
    public static HashMap<String, Float> idf(
            HashMap<String, HashMap<String, Float>> all_tf, String file)
            throws IOException {
        FileList.clear();
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        int docNum = readDirs(file).size();
        for (int i = 0; i < docNum; i++) {
            HashMap<String, Float> temp = all_tf.get(FileList.get(i));
            Iterator iter = temp.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry entry = (Map.Entry) iter.next();
                String word = entry.getKey().toString();
                if (dict.get(word) == null) {
                    dict.put(word, 1);
                } else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        // Write every word and the number of files containing it to a file
        StringBuilder sb1 = new StringBuilder();
        Iterator iter1 = dict.entrySet().iterator();
        while (iter1.hasNext()) {
            Map.Entry entry = (Map.Entry) iter1.next();
            if (entry.getKey().toString() != null) {
                sb1.append(entry.getKey().toString() + " "
                        + dict.get(entry.getKey()) + "\r\n");
            }
        }
        File filewriter = new File("E:/allCount.txt");
        FileWriter fw = new FileWriter(filewriter.getAbsoluteFile());
        BufferedWriter bb = new BufferedWriter(fw);
        bb.write(sb1.toString());
        bb.close();
        System.out.println(dict);
        // Compute idf
        System.out.println("IDF for every word is:");
        Iterator iter_dict = dict.entrySet().iterator();
        while (iter_dict.hasNext()) {
            Map.Entry entry = (Map.Entry) iter_dict.next();
            float value = (float) Math.log(docNum
                    / Float.parseFloat(entry.getValue().toString()));
            resIdf.put(entry.getKey().toString(), value);
            System.out.println(entry.getKey().toString() + " = " + value);
        }
        return resIdf;
    }

    // Return every word under a directory and the number of files containing it
    @SuppressWarnings("unchecked")
    public static HashMap<String, Integer> idf_dict(
            HashMap<String, HashMap<String, Float>> all_tf, String file)
            throws IOException {
        FileList.clear();
        HashMap<String, Integer> dict = new HashMap<String, Integer>();
        List<String> filelist = readDirs(file);
        int docNum = filelist.size();
        for (int i = 0; i < docNum; i++) {
            HashMap<String, Float> temp = all_tf.get(filelist.get(i));
            Iterator iter = temp.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry entry = (Map.Entry) iter.next();
                String word = entry.getKey().toString();
                if (dict.get(word) == null) {
                    dict.put(word, 1);
                } else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        System.out.println(dict);
        return dict;
    }

    // Compute TF-IDF values and display them
    @SuppressWarnings("unchecked")
    public static void tf_idf(HashMap<String, HashMap<String, Float>> all_tf,
            HashMap<String, Float> idfs, String file) throws IOException {
        HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();
        FileList.clear();
        int docNum = readDirs(file).size();
        for (int i = 0; i < docNum; i++) {
            String filepath = FileList.get(i);
            HashMap<String, Float> tfidf = new HashMap<String, Float>();
            HashMap<String, Float> temp = all_tf.get(filepath);
            Iterator iter = temp.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry entry = (Map.Entry) iter.next();
                String word = entry.getKey().toString();
                Float value = (float) Float.parseFloat(entry.getValue()
                        .toString()) * idfs.get(word);
                tfidf.put(word, value);
            }
            resTfIdf.put(filepath, tfidf);
        }
        System.out.println("TF-IDF for Every file is :");
        DisTfIdf(resTfIdf); // display TF-IDF
    }

    // Return the computed TF-IDF values
    @SuppressWarnings("unchecked")
    public static HashMap<String, HashMap<String, Float>> tf_idf_return(
            HashMap<String, HashMap<String, Float>> all_tf,
            HashMap<String, Float> idfs, String file) throws IOException {
        FileList.clear();
        HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();
        int docNum = readDirs(file).size();
        for (int i = 0; i < docNum; i++) {
            @SuppressWarnings("unused")
            HashMap<String, Float> tfidf_reduce = new HashMap<String, Float>();
            String filepath = FileList.get(i);
            HashMap<String, Float> tfidf = new HashMap<String, Float>();
            HashMap<String, Float> temp = all_tf.get(filepath);
            Iterator iter = temp.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry entry = (Map.Entry) iter.next();
                String word = entry.getKey().toString();
                Float value = (float) Float.parseFloat(entry.getValue()
                        .toString()) * idfs.get(word);
                tfidf.put(word, value);
            }
            resTfIdf.put(filepath, tfidf);
        }
        return resTfIdf;
    }

    // Print the TF-IDF values and write them to a file
    @SuppressWarnings("unchecked")
    public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf)
            throws IOException {
        StringBuilder stall = new StringBuilder();
        Iterator iter1 = tfidf.entrySet().iterator();
        while (iter1.hasNext()) {
            Map.Entry entrys = (Map.Entry) iter1.next();
            System.out.println("FileName: " + entrys.getKey().toString());
            System.out.print("{");
            HashMap<String, Float> temp = (HashMap<String, Float>) entrys
                    .getValue();
            Iterator iter2 = temp.entrySet().iterator();
            while (iter2.hasNext()) {
                Map.Entry entry = (Map.Entry) iter2.next();
                System.out.print(entry.getKey().toString() + " = "
                        + entry.getValue().toString() + ", ");
                stall.append(entrys.getKey().toString() + " "
                        + entry.getKey().toString() + " "
                        + entry.getValue().toString() + "\r\n");
            }
            System.out.println("}");
        }
        File filewriter = new File("E:/allTFIDF.txt");
        FileWriter fw = new FileWriter(filewriter.getAbsoluteFile());
        BufferedWriter bz = new BufferedWriter(fw);
        bz.write(stall.toString());
        bz.close();
    }

    // Entropy of a single distribution given by raw counts
    public static double Entropy(double[] p, double tot) {
        double entropy = 0.0;
        for (int i = 0; i < p.length; i++) {
            if (p[i] > 0.0) {
                entropy += -p[i] / tot * Math.log(p[i] / tot) / Math.log(2.0);
            }
        }
        return entropy;
    }

    // Feature reduction by information gain
    @SuppressWarnings("unchecked")
    private static void Total(int N,
            HashMap<String, HashMap<String, Float>> result,
            HashMap<String, Integer> idfs_dict_neg,
            HashMap<String, Integer> idfs_dict_pos, String file)
            throws IOException {
        FileList.clear();
        double[] classCnt = new double[N]; // files per class
        double totalCnt = 0.0; // total number of files
        for (int c = 0; c < N; c++) {
            classCnt[c] = 125; // number of files in each class
            totalCnt += classCnt[c];
        }
        int docNum = readDirs(file).size();
        int num = 0; // word index (before feature reduction)
        int numb = 0; // word index (after feature reduction)
        double totalEntroy = Entropy(classCnt, totalCnt); // overall entropy H(C)
        HashMap<String, Integer> count = new HashMap<String, Integer>(); // word -> index
        HashMap<String, Integer> countG = new HashMap<String, Integer>(); // word -> index after feature reduction
        HashMap<String, Double> countG1 = new HashMap<String, Double>(); // word -> information gain after feature reduction
        HashMap<String, Double> infogains = new HashMap<String, Double>(); // word -> information gain
        StringBuilder st = new StringBuilder(); // buffer: file name, word, information gain, TF-IDF
        StringBuilder ss = new StringBuilder(); // buffer: class label, word index, TF-IDF (before feature reduction)
        StringBuilder sr = new StringBuilder(); // buffer: class label, word index, TF-IDF (after feature reduction)
        for (int i = 0; i < docNum; i++) {
            String filepath = FileList.get(i);
            HashMap<String, Float> temp = result.get(filepath);
            Iterator iter = temp.entrySet().iterator();
            if (filepath.contains("dubo")) {
                ss.append(1 + " "); // label the gambling class as class 1
            } else if (filepath.contains("fangdong")) {
                ss.append(2 + " "); // label the reactionary class as class 2
            }
            while (iter.hasNext()) {
                Map.Entry entry = (Map.Entry) iter.next();
                String f = entry.getKey().toString();
                double[] featureCntWithF = new double[N]; // per-class number of files containing word F
                double[] featureCntWithoutF = new double[N]; // per-class number of files not containing word F
                double totalCntWithF = 0.0; // files over all classes that contain word F
                double totalCntWithoutF = 0.0; // files over all classes that do not contain word F
                for (int c = 0; c < N; c++) {
                    Iterator iter_dict = null;
                    switch (c) {
                    case 0:
                        iter_dict = idfs_dict_neg.entrySet().iterator();
                        break;
                    case 1:
                        iter_dict = idfs_dict_pos.entrySet().iterator();
                        break;
                    }
                    while (iter_dict.hasNext()) {
                        Map.Entry entry_neg = (Map.Entry) iter_dict.next();
                        if (f.equals(entry_neg.getKey().toString())) {
                            // the word appears in this class
                            featureCntWithF[c] = Double.parseDouble(entry_neg
                                    .getValue().toString()); // number of files in this class containing the word
                            break;
                        } else {
                            featureCntWithF[c] = 0.0;
                        }
                    }
                    featureCntWithoutF[c] = classCnt[c] - featureCntWithF[c]; // files without F = class total - files containing F
                    totalCntWithF += featureCntWithF[c];
                    totalCntWithoutF += featureCntWithoutF[c];
                }
                double entropyWithF = Entropy(featureCntWithF, totalCntWithF);
                double entropyWithoutF = Entropy(featureCntWithoutF,
                        totalCntWithoutF);
                double wf = totalCntWithF / totalCnt;
                double infoGain = totalEntroy - wf * entropyWithF
                        - (1.0 - wf) * entropyWithoutF; // information-gain formula
                infogains.put(f, infoGain);
                st.append(filepath + " " + f + " " + "InfoGain" + "=" + infoGain
                        + " " + "tfidf" + "=" + entry.getValue().toString()
                        + "\r\n"); // output format
                // Option 1: select features directly with a threshold, which
                // makes the second pass below unnecessary
                // if (infogains.get(f) > 0.004011587943125061) {
                // assign an index to word f
                if (count.get(f) == null) {
                    num++;
                    count.put(f, num);
                }
                ss.append(count.get(f) + ":" + entry.getValue() + " "); // output format
                // }
            }
            ss.append("\r\n");
        }
        File fileprepare = new File("E:/test.txt");
        FileWriter fz = new FileWriter(fileprepare.getAbsoluteFile());
        BufferedWriter bz = new BufferedWriter(fz);
        bz.write(ss.toString());
        bz.close();
        File filewriter = new File("E:/jieguo.txt");
        FileWriter fw = new FileWriter(filewriter.getAbsoluteFile());
        BufferedWriter bw = new BufferedWriter(fw);
        bw.write(st.toString());
        bw.close();
        // Option 2: sort the words by information gain in descending order and
        // keep a fixed number of top words as features
        ArrayList<Map.Entry<String, Double>> infoIds = new ArrayList<Map.Entry<String, Double>>(
                infogains.entrySet());
        Collections.sort(infoIds, new Comparator<Map.Entry<String, Double>>() {
            public int compare(Map.Entry<String, Double> o1,
                    Map.Entry<String, Double> o2) {
                if (o2.getValue() - o1.getValue() > 0) {
                    return 1; // descending order
                } else {
                    return -1;
                }
            }
        });
        // Keep the 2000 words with the highest information gain as feature words
        for (int c = 0; c < 2000; c++) {
            countG1.put(infoIds.get(c).getKey(), infoIds.get(c).getValue());
        }
        // Second pass over all files
        for (int i = 0; i < docNum; i++) {
            String filepath = FileList.get(i);
            HashMap<String, Float> temp = result.get(filepath);
            Iterator iter = temp.entrySet().iterator();
            if (filepath.contains("dubo")) {
                sr.append(1 + " ");
            } else if (filepath.contains("fangdong")) {
                sr.append(2 + " ");
            }
            while (iter.hasNext()) {
                Map.Entry entry = (Map.Entry) iter.next();
                String f = entry.getKey().toString();
                // keep only words that survived the feature reduction
                if (countG1.get(f) != null) {
                    // assign an index to the word
                    if (countG.get(f) == null) {
                        numb++;
                        countG.put(f, numb);
                    }
                    sr.append(countG.get(f) + ":" + entry.getValue() + " ");
                }
            }
            sr.append("\r\n");
        }
        File fileprepare1 = new File("E:/testt.txt");
        FileWriter fr = new FileWriter(fileprepare1.getAbsoluteFile());
        BufferedWriter br = new BufferedWriter(fr);
        br.write(sr.toString());
        br.close();
    }

    public static void main(String[] args) throws IOException {
        String file = "C:/Users/zzw/Desktop/項目管理/語料/test"; // root data path
        String file1 = "C:/Users/zzw/Desktop/項目管理/語料/test/賭博"; // class 1 data path (gambling)
        String file2 = "C:/Users/zzw/Desktop/項目管理/語料/test/反動"; // class 2 data path (reactionary)
        HashMap<String, HashMap<String, Float>> all_tf = tfAllFiles(file);
        HashMap<String, HashMap<String, Float>> all_tf_neg = tfAllFiles(file1); // tf values and paths of the files under file1
        HashMap<String, HashMap<String, Float>> all_tf_pos = tfAllFiles(file2); // tf values and paths of the files under file2
        System.out.println();
        HashMap<String, Integer> idfs_dict_neg = idf_dict(all_tf_neg, file1); // every word under file1 and the number of files containing it
        HashMap<String, Integer> idfs_dict_pos = idf_dict(all_tf_pos, file2); // every word under file2 and the number of files containing it
        HashMap<String, Float> idfs = idf(all_tf, file);
        System.out.println();
        tf_idf(all_tf, idfs, file);
        HashMap<String, HashMap<String, Float>> result = tf_idf_return(all_tf,
                idfs, file);
        int N = 2; // number of classes
        /*
         * Information-gain formula:
         * IG(T) = H(C) - H(C|T)
         * H(C|T) = P(t)H(C|t) + P(t')H(C|t')
         */
        Total(N, result, idfs_dict_neg, idfs_dict_pos, file); // feature reduction by information gain
    }
}
The resulting output files are as follows:
allCount.txt
allTFIDF.txt
test.txt
jieguo.txt
testt.txt
I am also a beginner, so if there are any problems, corrections are very welcome!