java實現文件單詞頻率統計

本文轉載自查看原文 2013-01-17 21:28 8863

思路：

1、將文件內容存入StringBuffer中。

2、利用split()函數（或如下方代碼所用的StringTokenizer）分割字符串，可按(“，”，“.”，“！”，“空格”，“回車”)分割，得到一組單詞。

3、遍歷數組，將其放入一個Map <String,Integer>中,key=單詞，value=單詞出現的次數。

4、如要求出文件中出現頻率最高的幾個單詞，則要對Map進行排序。

-----------------------------------------------------------------------------------------------------------------------------------------

以下程序統計一個文件中各單詞的出現次數，並按出現頻率由高到低輸出。主函數所在文件：FileWordCount.java

 
import java.io.BufferedReader;
 import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.*;
 
/**
 * Counts how often each word occurs in a text file and prints the words
 * ordered from most to least frequent, in three different output formats.
 *
 * <p>Original author: FLY (created 11-9-13).
 */
public class FileWordCount {
    /** Path of the file whose words are counted. */
    private static final String INPUT_PATH = "D:\\test.txt";

    /** Characters treated as word separators by the tokenizer. */
    private static final String DELIMITERS = ",.! \n";

    /** How many of the most frequent words the third output format prints. */
    private static final int TOP_N = 3;

    /**
     * Reads the input file, counts its words and prints the result.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        try {
            Map<String, Integer> frequencies = countWords(readText(INPUT_PATH));

            // WordEntity's natural ordering sorts the TreeSet by descending count.
            Set<WordEntity> ranking = new TreeSet<WordEntity>();
            for (Map.Entry<String, Integer> entry : frequencies.entrySet()) {
                ranking.add(new WordEntity(entry.getKey(), entry.getValue()));
            }

            // Format 1: build the output line by hand.
            System.out.println("輸出形式一：");
            for (WordEntity w : ranking) {
                System.out.println("單詞:" + w.getKey() + " 出現的次數為： " + w.getCount());
            }

            // Format 2: print the WordEntity directly; its toString() does the formatting.
            System.out.println("輸出形式二：");
            for (WordEntity w : ranking) {
                System.out.println(w);
            }

            // Format 3: print only the TOP_N most frequent words.
            System.out.println("輸出形式三：");
            int rank = 1;
            for (WordEntity w : ranking) {
                System.out.println("第" + rank + "名為單詞:" + w.getKey() + " 出現的次數為： "
                        + w.getCount());
                if (rank == TOP_N) {
                    break; // stop once TOP_N entries have been printed
                }
                rank++;
            }
        } catch (FileNotFoundException e) {
            System.out.println("文件未找到~！");
        } catch (IOException e) {
            System.out.println("文件讀異常~！");
        }
    }

    /**
     * Reads the whole file into a string, one '\n' after every line.
     *
     * <p>readLine() strips the line terminator, so a separator is appended
     * explicitly; otherwise the last word of a line would be glued to the
     * first word of the following line.
     *
     * @param path path of the file to read
     * @return the file content with lines separated by '\n'
     * @throws IOException if the file cannot be opened or read
     */
    private static String readText(String path) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append('\n');
            }
            return text.toString();
        } finally {
            reader.close(); // always release the file handle
        }
    }

    /**
     * Splits the text on the characters in DELIMITERS and counts each token.
     *
     * @param text the text to analyse
     * @return a map from word to its number of occurrences
     */
    private static Map<String, Integer> countWords(String text) {
        Map<String, Integer> frequencies = new HashMap<String, Integer>();
        StringTokenizer tokens = new StringTokenizer(text, DELIMITERS);
        while (tokens.hasMoreTokens()) {
            String word = tokens.nextToken();
            Integer previous = frequencies.get(word);
            frequencies.put(word, previous == null ? 1 : previous + 1);
        }
        return frequencies;
    }
}
 
 
 
WordEntity.java文件
 
/**
 * A word paired with the number of times it occurs.
 *
 * <p>The natural ordering sorts by descending count and breaks ties by the
 * word itself in ascending order, so a TreeSet of WordEntity iterates from
 * the most frequent word to the least frequent one.
 *
 * <p>Original author: FLY (created 11-9-13).
 */
public class WordEntity implements Comparable<WordEntity> {
    private final String key;
    private final Integer count;

    /**
     * @param key   the word
     * @param count how many times the word occurs
     */
    public WordEntity(String key, Integer count) {
        this.key = key;
        this.count = count;
    }

    /**
     * Orders by count, highest first, then by word.
     *
     * <p>The counts are compared with Integer.compareTo rather than by
     * subtraction, because a subtraction can overflow and flip the sign.
     * Swapping the receiver and the argument of the count comparison turns
     * the order into ascending.
     *
     * @param o the entity to compare with
     * @return a negative value, zero or a positive value if this entity sorts
     *         before, equal to or after {@code o}
     */
    @Override
    public int compareTo(WordEntity o) {
        int byCountDescending = o.count.compareTo(count);
        return byCountDescending != 0 ? byCountDescending : key.compareTo(o.key);
    }

    /** Two entities are equal when both word and count match, consistent with compareTo. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof WordEntity)) {
            return false;
        }
        WordEntity other = (WordEntity) obj;
        return key.equals(other.key) && count.equals(other.count);
    }

    @Override
    public int hashCode() {
        return 31 * key.hashCode() + count.hashCode();
    }

    /** @return the word followed by its occurrence count, ready for printing */
    @Override
    public String toString() {
        return key + " 出現的次數為：" + count;
    }

    /** @return the word */
    public String getKey() {
        return key;
    }

    /** @return the number of occurrences */
    public Integer getCount() {
        return count;
    }
}
 

 
 
輸出結果：
 
輸出形式一：
 單詞:is 出現的次數為： 4
 單詞:my 出現的次數為： 2
 單詞:very 出現的次數為： 2
 單詞:word 出現的次數為： 2
 單詞:a 出現的次數為： 1
 單詞:are 出現的次數為： 1
 單詞:boy 出現的次數為： 1
 單詞:english 出現的次數為： 1
 單詞:fool 出現的次數為： 1
 單詞:good 出現的次數為： 1
 單詞:hah 出現的次數為： 1
 單詞:hello 出現的次數為： 1
 單詞:hey 出現的次數為： 1
 單詞:i 出現的次數為： 1
 單詞:love 出現的次數為： 1
 單詞:mary 出現的次數為： 1
 單詞:much 出現的次數為： 1
 單詞:name 出現的次數為： 1
 單詞:ok 出現的次數為： 1
 單詞:poor 出現的次數為： 1
 單詞:so 出現的次數為： 1
 單詞:sophie 出現的次數為： 1
 單詞:you 出現的次數為： 1
 輸出形式二：
 is 出現的次數為：4
 my 出現的次數為：2
 very 出現的次數為：2
 word 出現的次數為：2
 a 出現的次數為：1
 are 出現的次數為：1
 boy 出現的次數為：1
 english 出現的次數為：1
 fool 出現的次數為：1
 good 出現的次數為：1
 hah 出現的次數為：1
 hello 出現的次數為：1
 hey 出現的次數為：1
 i 出現的次數為：1
 love 出現的次數為：1
 mary 出現的次數為：1
 much 出現的次數為：1
 name 出現的次數為：1
 ok 出現的次數為：1
 poor 出現的次數為：1
 so 出現的次數為：1
 sophie 出現的次數為：1
 you 出現的次數為：1
 輸出形式三：
 第1名為單詞:is 出現的次數為： 4
 第2名為單詞:my 出現的次數為： 2
 第3名為單詞:very 出現的次數為： 2

　　原文地址

方法二

//CounterWord.java
 
package com.xie.tencent;
 
import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.util.HashMap;
 
/**
  * 這個類用於統計某個單詞在所有文件中出現的次數。
  * @author centre
  *
  */
 public class CounterWord {
     FileReader fr;
     private HashMap<String, Integer> hMap=new HashMap<String, Integer>();;
     
  /**
   * Returns the word-count map held by this object.
   *
   * @return the internal map itself, not a copy
   */
  public HashMap<String, Integer> gethMap() {
    return this.hMap;
  }
  /**
   * Creates a counter that keeps a reference to the given reader.
   *
   * NOTE(review): java.io.FileReader is not among the imports listed for
   * this file - confirm it is imported before compiling.
   *
   * @author centre
   * @param f the FileReader to hold on to
   */
  public CounterWord(FileReader f) {
    this.fr = f;
  }
  /**
   * Opens the file at the given path, splits every line on commas and
   * records each resulting piece in the word-count map.
   *
   * <p>The reader is always closed once reading ends, whether it finished
   * normally or failed.
   *
   * NOTE(review): the file is decoded with the platform default charset -
   * confirm that this matches the encoding of the input files.
   *
   * @author centre
   * @param path absolute path of the file to read
   * @return {@code false} if the file was not found, {@code true} otherwise
   *         (a read error after opening is reported on the console but
   *         still yields {@code true})
   */
  public boolean openFile(String path){
    boolean opened = true;
    try {
      BufferedReader reader =
          new BufferedReader(new InputStreamReader(new FileInputStream(path)));
      try {
        String line;
        while ((line = reader.readLine()) != null) {
          String[] pieces = line.split(",");
          for (int i = 0; i < pieces.length; i++) {
            addWordCount(pieces[i]);
          }
        }
      } catch (IOException e) {
        System.out.println("文件讀取異常。");
        e.printStackTrace();
      } finally {
        try {
          reader.close(); // also closes the underlying FileInputStream
        } catch (IOException ignored) {
          // Nothing useful can be done if closing fails; the data was already processed.
        }
      }
    } catch (FileNotFoundException e) {
      System.out.println("沒有找到該文件:"+path);
      opened = false;
      e.printStackTrace();
    }
    return opened;
  }
  /**
   * Records one more occurrence of the given word in the map: a word seen
   * for the first time is stored with count 1, otherwise its stored count
   * is increased by one.
   *
   * @param word the word to record
   */
  private void addWordCount(String word) {
    int occurrences = 1;
    if (hMap.containsKey(word)) {
      // Already known: continue from the stored count.
      occurrences += hMap.get(word).intValue();
    }
    hMap.put(word, occurrences);
  }

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 用C語言實現了對英文文章中單詞頻率的統計，得到出現最多的前十個！ Java實現的詞頻統計【shell腳本實例】shell腳本統計單詞頻率、出現次數最多的n個單詞利用python實現簡單詞頻統計、構建詞雲 python簡單詞頻統計統計文件中單詞出現頻率最高的10個以及他們出現的次數 Java實驗--統計字母出現頻率及其單詞個數【學習筆記】C#中HashTable和快速排序的用法，從單詞頻率統計小程序寫起 Java實現的詞頻統計——功能改進利用多種方式來統計詞頻（單詞個數）