Java 對文件內容進行分詞統計


本文記錄了我在面試過程中感覺有用的問題,方便日後參考。



問題描述:
給定一個文本文件,按以下要求進行分詞統計:

時間限制:5000ms
內存限制:256MB

要求1:讀取文本信息(input.txt),設置分詞大小,輸出相應詞頻信息
要求2:統計一個單詞在文本中的出現頻率(一個單詞出現次數/總單詞數),排序輸出結果

文本內容大致如下:

程序實現:


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;

public class Participle {
	
	public static void solutionOne(){
		Scanner in = new Scanner(System.in);
		print("請輸入分詞大小: ");
		int inputPartiSize = 0;
		while(in.hasNextInt()){
			inputPartiSize = in.nextInt();
			break;
		}
		
		final Map<String,Integer> dictFreq = new HashMap<String,Integer>();
		final int partiSize = inputPartiSize;
		
		readInput(new LineSolution(){
		
			public void solveLine(String line){
				String[] lineDicts = lineParser(line, partiSize);
				if(lineDicts != null){
					for(int i=0; i<lineDicts.length; i++){
						String word = lineDicts[i];
						if(dictFreq.containsKey(word)){
							int num = dictFreq.get(word);
							dictFreq.put(word, ++num);
						}
						else {
							dictFreq.put(word, 1);
						}
					}
				}
			}
		});
		
		for(Map.Entry<String,Integer> entry : dictFreq.entrySet()){
			print(entry.getKey() + "\t times: " + entry.getValue() + '\n');
		}
	}
	
	public static void solutionTwo(){
		print("方案二:\n");
		
		final Map<String,Integer> singleDictFreq = new HashMap<String,Integer>();
		final Map<String,Integer> callResult = new HashMap<String,Integer>();
		callResult.put("sum", 0);
		
		readInput(new LineSolution(){
		
			public void solveLine(String line){
				String[] lineDicts = lineParser(line, 1);
				if(lineDicts != null){
					callResult.put("sum", callResult.get("sum") + lineDicts.length);
					
					for(int i=0; i<lineDicts.length; i++){
						String word = lineDicts[i];
						if(singleDictFreq.containsKey(word)){
							int num = singleDictFreq.get(word);
							singleDictFreq.put(word, ++num);
						}
						else {
							singleDictFreq.put(word, 1);
						}
					}
				}
			}
		});
		
		Map<String,Double> singleDictFreqCalc = new HashMap<String,Double>();
		
		int sum = callResult.get("sum");
		for(Map.Entry<String,Integer> entry : singleDictFreq.entrySet()){
			singleDictFreqCalc.put(entry.getKey(), divide(entry.getValue(), sum));
		}
		
		for(Map.Entry<String,Double> entry : singleDictFreqCalc.entrySet()){
			print(entry.getKey() + "\t frequency: " + entry.getValue() + '\n');
		}
	}
	
	private static String[] lineParser(String line, int scale){
		String[] lineDicts = null;
		if(line != null && !"".equals(line.trim())){
			String[] spliter = line.split("\\s+|,");
			List<String> container = new ArrayList<String>();
			
			for(int i=0; i<spliter.length; i += scale){
				StringBuilder phase = new StringBuilder("");
				
				for(int j=0; (i+j) < spliter.length && j<scale; j++){
					phase.append(spliter[i+j]).append(" ");
				} 
				
				String phaseStr = phase.toString().trim();
				if("".equals(phaseStr)){
					continue;
				}
				container.add(phaseStr);
			}
			lineDicts = new String[container.size()];
			container.toArray(lineDicts);
		}
		return lineDicts;
	} 
	
	private static void readInput(LineSolution solution){
		try{
			String dir = System.getProperty("java.class.path");
			FileReader fr = new FileReader(dir + "/input.txt");
			BufferedReader br = new BufferedReader(fr);
			
			String line = "";
			while(line != null){
				line = br.readLine();
				if(line == null){
					break;
				}
				
				solution.solveLine(line);
			}
			
		}catch(IOException e){
			e.printStackTrace();
		}
	}
	
	private static double divide(double divisor, double dividend){
		return new BigDecimal(divisor)
			.divide(new BigDecimal(dividend), 5, BigDecimal.ROUND_HALF_UP).doubleValue();
	}
	
	public static void main(String[] args){
		
		String Q1 = "1:讀取文本信息(input.txt),設置分詞大小,輸出相應詞頻信息";
		String Q2 = "2:統計一個單詞在文本中的出現頻率(一個單詞出現次數/總單詞數),排序輸出結果";
		
		print(Q1+'\n');
		print(Q2+'\n');
		print("\n");
		
		print("請輸入問題序號: ");
		Scanner in = new Scanner(System.in);
		
		while(in.hasNextInt()){
			int num = in.nextInt();
			if(num == 1){
				solutionOne();
				break;
			}
			else if(num == 2){
				solutionTwo();
				break;
			}
			else {
				print("\n請輸入有效問題序號: ");
			}
		}
		
	}
	
	private static void print(String str){
		System.out.print(str);
	}
}

/**
 * Callback that handles a text file one line at a time.
 *
 * <p>Implementations are handed to {@code Participle.readInput}, which calls
 * {@link #solveLine(String)} once for every line it reads from the input file.
 */
interface LineSolution {
	/**
	 * Processes a single line of text.
	 *
	 * @param line the line content as returned by {@code BufferedReader.readLine()};
	 *             {@code Participle.readInput} stops reading before it would pass {@code null}
	 */
	void solveLine(String line);
}

結果輸出:




免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM