MapReduce 默認使用 TextInputFormat 進行切片,其機制如下
(1)簡單地按照文件的內容長度進行切片 (2)切片大小,默認等於 Block 大小,可單獨設置 (3)切片時不考慮數據集整體,而是逐個針對每一個文件單獨切片 例如: (1)輸入數據有兩個文件: file1.txt 320M file2.txt 10M (2)經過 FileInputFormat(TextInputFormat 為其實現類)的切片機制運算後,形成的切片信息如下: file1.txt.split1--0~128M file1.txt.split2--128~256M file1.txt.split3--256~320M file2.txt.split1--0~10M
測試讀取數據的方式
輸入數據(中間為空格,末尾為換行符)
map 階段的 k-v
可以看出 k 為偏移量,v 為一行的值,即 TextInputFormat 按行讀取
以 WordCount 為例進行測試,測試切片數
測試數據,三個相同的文件
測試代碼
package com.mapreduce.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Word-count job used to experiment with {@link TextInputFormat}: the mapper prints every
 * key/value pair it receives, and the driver sets explicit minimum/maximum split sizes so the
 * resulting number of splits can be observed.
 */
public class WordCount {

    /** Hadoop installation directory used for the local (Windows) run. */
    private static final String HADOOP_HOME = "D:/DevelopTools/hadoop-2.9.2/";

    /** Input directory used when no paths are passed on the command line. */
    private static final String DEFAULT_INPUT_DIR = "D:\\tmp\\input2";

    /** Output directory used when no paths are passed on the command line. */
    private static final String DEFAULT_OUTPUT_DIR = "D:\\tmp\\456";

    /** Lower bound for the split size, in bytes. */
    private static final long MIN_SPLIT_BYTES = 1L;

    /** Upper bound for the split size, in bytes (128 MiB). */
    private static final long MAX_SPLIT_BYTES = 128L * 1024 * 1024;

    static {
        try {
            // Point Hadoop at the local installation (stands in for the HADOOP_HOME env variable).
            System.setProperty("hadoop.home.dir", HADOOP_HOME);
            // Initialise log4j with its basic configuration.
            BasicConfigurator.configure();
            // Load the native Hadoop library.
            System.load(HADOOP_HOME + "bin/hadoop.dll");
        } catch (UnsatisfiedLinkError e) {
            // Without the native library the job cannot run here; report and terminate.
            System.err.println("Native code library failed to load.\n" + e);
            System.exit(1);
        }
    }

    /**
     * Configures and runs the word-count job, then exits the JVM with status 0 on success and
     * 1 on failure.
     *
     * @param args optional {@code [inputDir, outputDir]}; when fewer than two values are
     *             supplied, the built-in sample directories are used instead
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        // Honour caller-supplied paths; previously they were always overwritten by the samples.
        final boolean pathsSupplied = args != null && args.length >= 2;
        final String inputDir = pathsSupplied ? args[0] : DEFAULT_INPUT_DIR;
        final String outputDir = pathsSupplied ? args[1] : DEFAULT_OUTPUT_DIR;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);

        job.setMapperClass(TokenizerMapper.class);
        // The reducer only sums counts, so it is reused as the combiner as well.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // TextInputFormat is already the default input format; it is set explicitly here
        // because the split-size bounds are configured right after it.
        job.setInputFormatClass(TextInputFormat.class);
        // These are static methods declared on FileInputFormat, so call them on that class.
        FileInputFormat.setMinInputSplitSize(job, MIN_SPLIT_BYTES);
        FileInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_BYTES);

        FileInputFormat.addInputPath(job, new Path(inputDir));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Splits each input value into whitespace-delimited tokens and emits {@code (token, 1)} for
     * every token. Each incoming key/value pair is also printed to standard output so the
     * records delivered by the input format can be inspected.
     */
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);

        /** Reused output key; overwritten for every token. */
        private final Text word = new Text();

        /**
         * Emits {@code (token, 1)} for every token found in {@code value}.
         *
         * @param key     key supplied by the input format
         * @param value   text of the current record
         * @param context sink for the emitted pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Show the incoming key/value pair.
            System.out.println(key + "\t" + value);

            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    /**
     * Sums all counts received for a key and emits {@code (key, total)}. Used both as the
     * combiner and as the reducer of the job.
     */
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /** Reused output value; overwritten for every key. */
        private final IntWritable result = new IntWritable();

        /**
         * Adds up {@code values} and writes the total for {@code key}.
         *
         * @param key     the word being counted
         * @param values  partial counts for that word
         * @param context sink for the emitted pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}