Learning Hadoop: A First MapReduce Program


Goals

Use this first MapReduce program to understand the flow of a MapReduce job's execution, with particular attention to distilling useful information out of what the program prints while it runs.

Before the Run

Program Code

The code is lifted essentially verbatim from "Hadoop: The Definitive Guide"; its goal is to find the highest recorded temperature for each year. The relevant code follows:

/** Bean wrapping one fixed-width NCDC weather record; the substring offsets in the constructor follow the NCDC format. */
// Note: StringUtils below is from Apache Commons Lang (e.g. import org.apache.commons.lang3.StringUtils).
public class NcdcWeather {
    private String USAF_station_id;
    private String WBAN_station_id;
    private String date;
    private String time;
    private String latitude;
    private String longitude;
    /** Elevation */
    private String elevation;
    /** Wind direction */
    private String wind_direction;
    private String wind_direction_quality_code;
    private String sky_ceiling_height;
    private String sky_ceiling_height_quality_code;
    private String visibility_distance;
    private String visibility_distance_quality_code;
    private String air_temperature;
    private String air_temperature_quality_code;
    private String dew_point_temperature;
    private String dew_point_temperature_quality_code;
    private String atmospheric_pressure;
    private String atmospheric_pressure_quality_code;

    public NcdcWeather(String rowData) {
        if (StringUtils.isEmpty(rowData) || rowData.length() < 105) {
            // Record too short to parse: every field stays null, so callers must tolerate that.
            return;
        }

        USAF_station_id = rowData.substring(4, 10);
        WBAN_station_id = rowData.substring(10, 15);
        date = rowData.substring(15, 23);
        time = rowData.substring(23, 27);
        latitude = rowData.substring(28, 34);
        longitude = rowData.substring(34, 41);
        elevation = rowData.substring(46, 51);
        wind_direction = rowData.substring(60, 63);
        wind_direction_quality_code = rowData.substring(63, 64);
        sky_ceiling_height = rowData.substring(70, 75);
        sky_ceiling_height_quality_code = rowData.substring(75, 76);
        visibility_distance = rowData.substring(78, 84);
        visibility_distance_quality_code = rowData.substring(84, 85);
        air_temperature = rowData.substring(87, 92);
        air_temperature_quality_code = rowData.substring(92, 93);
        dew_point_temperature = rowData.substring(93, 98);
        dew_point_temperature_quality_code = rowData.substring(98, 99);
        atmospheric_pressure = rowData.substring(99, 104);
        atmospheric_pressure_quality_code = rowData.substring(104, 105);
    }

    // Getters, setters, and a field-by-field toString() for all of the above follow here
    // in the full source; they are standard boilerplate and are omitted for brevity.
}
Weather Bean
public class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /** NCDC marks a missing temperature reading as 9999. */
    private static final int MISS_CODE = 9999;

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        NcdcWeather ncdcWeather = new NcdcWeather(line);
        String year = ncdcWeather.getDate().substring(0, 4);
        int temperature;
        // Strip a leading '+' before parsing (Integer.parseInt on Java 7+ accepts the sign anyway).
        if (ncdcWeather.getAir_temperature().startsWith("+")) {
            temperature = Integer.parseInt(ncdcWeather.getAir_temperature().substring(1));
        } else {
            temperature = Integer.parseInt(ncdcWeather.getAir_temperature());
        }

        // Emit only readings that are present and whose quality code marks them as trustworthy.
        if (temperature != MISS_CODE && ncdcWeather.getAir_temperature_quality_code().matches("[01459]")) {
            context.write(new Text(year), new IntWritable(temperature));
        }
    }
}
Mapper
public class MaxTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (IntWritable temp : values) {
            max = Math.max(max, temp.get());
        }

        context.write(key, new IntWritable(max));
    }
}
Reducer
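Before running anything on a cluster, the mapper and reducer can be sanity-checked in plain unit tests. Below is a sketch using Apache MRUnit; the project has since been retired to the Apache Attic, so treat the dependency as an assumption. The test record is a synthetic 105-character line that carries only the fields the mapper actually reads:

import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class MaxTemperatureTest {

    /** Builds a synthetic 105-character record holding only the fields the mapper reads. */
    private static String record(String date, String temperature, char qualityCode) {
        char[] chars = new char[105];
        Arrays.fill(chars, '0');
        date.getChars(0, date.length(), chars, 15);                // date at offsets 15-22
        temperature.getChars(0, temperature.length(), chars, 87);  // air temperature at offsets 87-91
        chars[92] = qualityCode;                                   // quality code at offset 92
        return new String(chars);
    }

    @Test
    public void mapperEmitsYearAndParsedTemperature() throws Exception {
        new MapDriver<LongWritable, Text, Text, IntWritable>()
                .withMapper(new MaxTemperatureMapper())
                .withInput(new LongWritable(0), new Text(record("19010101", "+0317", '1')))
                .withOutput(new Text("1901"), new IntWritable(317))
                .runTest();
    }

    @Test
    public void reducerPicksTheMaximum() throws Exception {
        new ReduceDriver<Text, IntWritable, Text, IntWritable>()
                .withReducer(new MaxTemperatureReducer())
                .withInput(new Text("1901"), Arrays.asList(new IntWritable(10), new IntWritable(5)))
                .withOutput(new Text("1901"), new IntWritable(10))
                .runTest();
    }
}

Finally, the driver class that wires the whole job together: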
public class MaxTemperature {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length != 2) {
            System.err.println("Usage: MaxTemperature <input path> <output path>");
            System.exit(-1);
        }

        Job job = Job.getInstance();
        job.setJarByClass(MaxTemperature.class);
        job.setJobName("Max Temperature");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MaxTemperatureMapper.class);
        job.setReducerClass(MaxTemperatureReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Data Preparation

Weather data for the years 1901 and 1902 was loaded into HDFS ahead of time, as the directory listing below shows:

(screenshot: HDFS directory listing of /ncdc/)
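For reference, the upload can be done with the ordinary HDFS shell. A sketch, assuming the two raw files sit in the current local directory and are named 1901 and 1902 (the actual names are only visible in the screenshot):

hdfs dfs -mkdir -p /ncdc
hdfs dfs -put 1901 1902 /ncdc/
hdfs dfs -ls /ncdc/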

Verification Run

Package the code above into a jar and push it to our Hadoop VM environment. To guard against program bugs, and to save execution time, we first validate the code on a small cluster with a small amount of data. For this I prepared a pseudo-distributed cluster in advance (pseudo-distributed meaning a fully distributed setup with only a single node).
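If the daemons are not already up, the stock scripts will start them. A sketch, assuming a standard Hadoop 3 layout with the sbin directory on the PATH:

start-dfs.sh
start-yarn.sh
jps    # expect NameNode, DataNode, SecondaryNameNode, ResourceManager, NodeManager

With the cluster up, we run the program there: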

yarn jar ~/max-temperature-1.0-SNAPSHOT-jar-with-dependencies.jar /ncdc/ /max_temperature_out/

No main class is given after the jar name, so yarn jar relies on the jar manifest's Main-Class entry pointing at MaxTemperature (presumably set up by the jar-with-dependencies assembly). The log printed by a successful run:

2019-09-10 16:19:00,367 INFO client.RMProxy: Connecting to ResourceManager at localhost/127.0.0.1:8032
2019-09-10 16:19:03,364 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2019-09-10 16:19:03,463 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoopuser/.staging/job_1568096329203_0001
2019-09-10 16:19:05,748 INFO input.FileInputFormat: Total input files to process : 2
2019-09-10 16:19:07,012 INFO mapreduce.JobSubmitter: number of splits:2
2019-09-10 16:19:07,677 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2019-09-10 16:19:08,909 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1568096329203_0001
2019-09-10 16:19:08,912 INFO mapreduce.JobSubmitter: Executing with tokens: []
2019-09-10 16:19:09,911 INFO conf.Configuration: resource-types.xml not found
2019-09-10 16:19:09,915 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2019-09-10 16:19:11,331 INFO impl.YarnClientImpl: Submitted application application_1568096329203_0001
2019-09-10 16:19:11,660 INFO mapreduce.Job: The url to track the job: http://slave1:8088/proxy/application_1568096329203_0001/
2019-09-10 16:19:11,661 INFO mapreduce.Job: Running job: job_1568096329203_0001
2019-09-10 16:20:09,966 INFO mapreduce.Job: Job job_1568096329203_0001 running in uber mode : false
2019-09-10 16:20:10,014 INFO mapreduce.Job:  map 0% reduce 0%
2019-09-10 16:21:22,515 INFO mapreduce.Job:  map 100% reduce 0%
2019-09-10 16:21:52,053 INFO mapreduce.Job:  map 100% reduce 100%
2019-09-10 16:21:54,192 INFO mapreduce.Job: Job job_1568096329203_0001 completed successfully
2019-09-10 16:21:55,534 INFO mapreduce.Job: Counters: 54
        File System Counters
                FILE: Number of bytes read=132380
                FILE: Number of bytes written=928635
                FILE: Number of read operations=0
                FILE: Number of large read operations=0
                FILE: Number of write operations=0
                HDFS: Number of bytes read=1628892
                HDFS: Number of bytes written=18
                HDFS: Number of read operations=11
                HDFS: Number of large read operations=0
                HDFS: Number of write operations=2
                HDFS: Number of bytes read erasure-coded=0
        Job Counters 
                Launched map tasks=2
                Launched reduce tasks=1
                Data-local map tasks=2
                Total time spent by all maps in occupied slots (ms)=134255
                Total time spent by all reduces in occupied slots (ms)=20841
                Total time spent by all map tasks (ms)=134255
                Total time spent by all reduce tasks (ms)=20841
                Total vcore-milliseconds taken by all map tasks=134255
                Total vcore-milliseconds taken by all reduce tasks=20841
                Total megabyte-milliseconds taken by all map tasks=137477120
                Total megabyte-milliseconds taken by all reduce tasks=21341184
        Map-Reduce Framework
                Map input records=12035
                Map output records=12034
                Map output bytes=108306
                Map output materialized bytes=132386
                Input split bytes=200
                Combine input records=0
                Combine output records=0
                Reduce input groups=2
                Reduce shuffle bytes=132386
                Reduce input records=12034
                Reduce output records=2
                Spilled Records=24068
                Shuffled Maps =2
                Failed Shuffles=0
                Merged Map outputs=2
                GC time elapsed (ms)=1606
                CPU time spent (ms)=7900
                Physical memory (bytes) snapshot=470282240
                Virtual memory (bytes) snapshot=8201203712
                Total committed heap usage (bytes)=261046272
                Peak Map Physical memory (bytes)=184123392
                Peak Map Virtual memory (bytes)=2731495424
                Peak Reduce Physical memory (bytes)=102227968
                Peak Reduce Virtual memory (bytes)=2738212864
        Shuffle Errors
                BAD_ID=0
                CONNECTION=0
                IO_ERROR=0
                WRONG_LENGTH=0
                WRONG_MAP=0
                WRONG_REDUCE=0
        File Input Format Counters 
                Bytes Read=1628692
        File Output Format Counters 
                Bytes Written=18

Now let's work through the log line by line and see what it tells us:

2019-09-10 16:19:00,367 INFO client.RMProxy: Connecting to ResourceManager at localhost/127.0.0.1:8032
# Emm, first I need to find the ResourceManager

2019-09-10 16:19:03,364 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
# Oh-oh, the way the job is launched is outdated; the recommendation is to implement the Tool interface and run through ToolRunner (sketched below)

2019-09-10 16:19:03,463 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoopuser/.staging/job_1568096329203_0001
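Since the warning comes up here, this is what the recommended shape looks like. A minimal sketch only: the standard org.apache.hadoop.util.Tool and ToolRunner classes are assumed, MaxTemperatureDriver is a name introduced here, and the combiner line is an optional extra that is valid because max is associative:

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MaxTemperatureDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input path> <output path>%n",
                    getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }

        // getConf() carries any -D options that ToolRunner parsed off the command line.
        Job job = Job.getInstance(getConf(), "Max Temperature");
        job.setJarByClass(getClass());

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MaxTemperatureMapper.class);
        // Optional: max is associative, so the reducer doubles as a combiner
        // and shrinks the data shuffled to the reduce side.
        job.setCombinerClass(MaxTemperatureReducer.class);
        job.setReducerClass(MaxTemperatureReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new MaxTemperatureDriver(), args));
    }
}

Launching through ToolRunner also gives the job the generic options (-D property=value, -files, and so on) for free.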
2019-09-10 16:19:05,748 INFO input.FileInputFormat: Total input files to process : 2
# Number of input files: 2

2019-09-10 16:19:07,012 INFO mapreduce.JobSubmitter: number of splits:2
# Number of splits: 2 (surprisingly, exactly the same as the number of input blocks~)

2019-09-10 16:19:07,677 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2019-09-10 16:19:08,909 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1568096329203_0001
# Tokens are submitted for the job, and we finally see the job naming scheme: roughly job_<timestamp>_<4-digit incrementing sequence number>

2019-09-10 16:19:08,912 INFO mapreduce.JobSubmitter: Executing with tokens: []
2019-09-10 16:19:09,911 INFO conf.Configuration: resource-types.xml not found
2019-09-10 16:19:09,915 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
# resource-types.xml declares optional custom resource types, so its absence is normal on a default install
2019-09-10 16:19:11,331 INFO impl.YarnClientImpl: Submitted application application_1568096329203_0001
# Looks like the application has been submitted to YARN

2019-09-10 16:19:11,660 INFO mapreduce.Job: The url to track the job: http://slave1:8088/proxy/application_1568096329203_0001/
# Em, and we get a URL for tracking the job's progress

2019-09-10 16:19:11,661 INFO mapreduce.Job: Running job: job_1568096329203_0001
2019-09-10 16:20:09,966 INFO mapreduce.Job: Job job_1568096329203_0001 running in uber mode : false
# The job does not meet the conditions for an uber job, so it runs in non-uber mode (an uber job runs all of its tasks inside the ApplicationMaster's JVM; the thresholds live in the mapreduce.job.ubertask.* settings)

2019-09-10 16:20:10,014 INFO mapreduce.Job:  map 0% reduce 0%
2019-09-10 16:21:22,515 INFO mapreduce.Job:  map 100% reduce 0%
2019-09-10 16:21:52,053 INFO mapreduce.Job:  map 100% reduce 100%
2019-09-10 16:21:54,192 INFO mapreduce.Job: Job job_1568096329203_0001 completed successfully
# Em, the job ran to completion

2019-09-10 16:21:55,534 INFO mapreduce.Job: Counters: 54
# 54 job counters follow; they are the same counter block already shown in the full log above, so they are not repeated here. A few are worth a second look: "HDFS: Number of bytes written=18" matches the size of the final output file, "Reduce output records=2" is one record per year, and "HDFS: Number of bytes read" (1628892) exceeds the File Input Format Counters' "Bytes Read" (1628692) by exactly "Input split bytes=200".

Next, let's see what actually landed in the output directory we specified:

(screenshot: directory listing of the job output)
As the listing shows, the output directory contains two files: an empty file named _SUCCESS, whose presence marks the job as having completed successfully, and part-r-00000, which holds the output of the reduce task. Looking inside part-r-00000:

(screenshot: contents of part-r-00000)

The job duly reports maximum temperatures of 317 for 1901 and 244 for 1902 (NCDC encodes temperature in tenths of a degree Celsius, so 31.7 °C and 24.4 °C).
We could recompute the maxima by non-MapReduce means to verify that the job's arithmetic is right, but since that is not our focus and the program logic is simple, we will take the result on trust; for the curious, a quick cross-check is sketched below.
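Here is what such a cross-check could look like: a plain single-JVM program that reads straight from HDFS through the client API and applies exactly the same filtering rules as the mapper. This is a convenience sketch, not part of the original job; MaxTemperatureCheck is a name introduced here, and the /ncdc path matches the run above:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MaxTemperatureCheck {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        Map<String, Integer> maxByYear = new HashMap<>();
        for (FileStatus status : fs.listStatus(new Path("/ncdc/"))) {
            if (status.isDirectory()) continue;
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(status.getPath())))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    NcdcWeather w = new NcdcWeather(line);
                    if (w.getDate() == null) continue; // record was too short to parse
                    int t = Integer.parseInt(w.getAir_temperature().replace("+", ""));
                    if (t == 9999 || !w.getAir_temperature_quality_code().matches("[01459]")) continue;
                    maxByYear.merge(w.getDate().substring(0, 4), t, Math::max);
                }
            }
        }
        maxByYear.forEach((year, max) -> System.out.println(year + "\t" + max));
    }
}

Run with the Hadoop client libraries on the classpath, it should print the same two pairs (1901 317, 1902 244) as part-r-00000.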

Hypotheses

Based on the input, the output, and the logs printed during execution, we make the following hypotheses:
Hypothesis 1: the number of splits equals the number of input blocks.
Hypothesis 2: the File Input Format Counters value equals the total number of bytes in the input files.
Hypothesis 3: the File Output Format Counters value equals the total number of bytes written to the output directory.

Testing the Hypotheses

To test the hypotheses, we run the same code in the fully distributed environment. The input files prepared there are as follows:

(screenshot: HDFS listing of /ncdc/raw/)

That is, this cluster holds data files for 44 years, 3,137,231,401 bytes in total, stored as 61 HDFS blocks.
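The byte and block totals come from HDFS's fsck tool; for reference, the check can be reproduced with:

hdfs fsck /ncdc/raw/ -files -blocks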

Now we submit the job:

yarn jar ~/max-temperature-1.0-SNAPSHOT-jar-with-dependencies.jar /ncdc/raw/ /max_out

Submitting and running the job was not smooth sailing; I hit quite a few problems along the way, most typically java.net.NoRouteToHostException. The troubleshooting of that error is recorded as Problem 1 in《Hadoop學習問題記錄之基礎篇》.

With those problems resolved, the log of the successful run is as follows:

[hadoop_user@master hadoop-3.2.0]$ yarn jar ~/max-temperature-1.0-SNAPSHOT-jar-with-dependencies.jar /ncdc/raw/ /max_out
2019-09-16 11:48:03,916 INFO client.RMProxy: Connecting to ResourceManager at master/192.168.212.132:8032
2019-09-16 11:48:05,846 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
2019-09-16 11:48:05,937 INFO mapreduce.JobResourceUploader: Disabling Erasure Coding for path: /tmp/hadoop-yarn/staging/hadoop_user/.staging/job_1568605271327_0002
2019-09-16 11:48:07,075 INFO input.FileInputFormat: Total input files to process : 44
2019-09-16 11:48:08,397 INFO mapreduce.JobSubmitter: number of splits:58
2019-09-16 11:48:08,638 INFO Configuration.deprecation: yarn.resourcemanager.system-metrics-publisher.enabled is deprecated. Instead, use yarn.system-metrics-publisher.enabled
2019-09-16 11:48:09,646 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1568605271327_0002
2019-09-16 11:48:09,648 INFO mapreduce.JobSubmitter: Executing with tokens: []
2019-09-16 11:48:10,350 INFO conf.Configuration: resource-types.xml not found
2019-09-16 11:48:10,351 INFO resource.ResourceUtils: Unable to find 'resource-types.xml'.
2019-09-16 11:48:11,402 INFO impl.YarnClientImpl: Submitted application application_1568605271327_0002
2019-09-16 11:48:11,595 INFO mapreduce.Job: The url to track the job: http://master:8088/proxy/application_1568605271327_0002/
2019-09-16 11:48:11,596 INFO mapreduce.Job: Running job: job_1568605271327_0002
2019-09-16 11:48:44,279 INFO mapreduce.Job: Job job_1568605271327_0002 running in uber mode : false
2019-09-16 11:48:44,300 INFO mapreduce.Job:  map 0% reduce 0%
2019-09-16 11:55:25,431 INFO mapreduce.Job:  map 1% reduce 0%
2019-09-16 11:55:36,166 INFO mapreduce.Job:  map 2% reduce 0%
2019-09-16 11:55:37,287 INFO mapreduce.Job:  map 3% reduce 0%
2019-09-16 11:55:38,501 INFO mapreduce.Job:  map 4% reduce 0%
2019-09-16 11:55:49,480 INFO mapreduce.Job:  map 5% reduce 0%
2019-09-16 11:55:55,289 INFO mapreduce.Job:  map 6% reduce 0%
2019-09-16 11:56:00,062 INFO mapreduce.Job:  map 7% reduce 0%
2019-09-16 11:56:11,488 INFO mapreduce.Job:  map 8% reduce 0%
2019-09-16 11:56:25,560 INFO mapreduce.Job:  map 9% reduce 0%
2019-09-16 11:56:31,260 INFO mapreduce.Job:  map 10% reduce 0%
2019-09-16 11:56:48,742 INFO mapreduce.Job:  map 11% reduce 0%
2019-09-16 11:56:51,329 INFO mapreduce.Job:  map 12% reduce 0%
2019-09-16 11:57:07,586 INFO mapreduce.Job:  map 13% reduce 0%
2019-09-16 11:57:12,254 INFO mapreduce.Job:  map 14% reduce 0%
2019-09-16 11:57:19,353 INFO mapreduce.Job:  map 15% reduce 0%
2019-09-16 11:57:29,968 INFO mapreduce.Job:  map 16% reduce 0%
2019-09-16 11:57:41,148 INFO mapreduce.Job:  map 17% reduce 0%
2019-09-16 11:57:50,065 INFO mapreduce.Job:  map 18% reduce 0%
2019-09-16 11:58:01,423 INFO mapreduce.Job:  map 19% reduce 0%
2019-09-16 11:58:11,850 INFO mapreduce.Job:  map 20% reduce 0%
2019-09-16 11:58:24,556 INFO mapreduce.Job:  map 21% reduce 0%
2019-09-16 11:58:34,826 INFO mapreduce.Job:  map 22% reduce 0%
2019-09-16 11:58:49,540 INFO mapreduce.Job:  map 23% reduce 0%
2019-09-16 11:58:59,619 INFO mapreduce.Job:  map 24% reduce 0%
2019-09-16 11:59:14,612 INFO mapreduce.Job:  map 25% reduce 0%
2019-09-16 11:59:28,346 INFO mapreduce.Job:  map 26% reduce 0%
2019-09-16 11:59:38,732 INFO mapreduce.Job:  map 27% reduce 0%
2019-09-16 11:59:49,123 INFO mapreduce.Job:  map 28% reduce 0%
2019-09-16 12:00:47,007 INFO mapreduce.Job:  map 30% reduce 0%
2019-09-16 12:00:52,384 INFO mapreduce.Job:  map 33% reduce 0%
2019-09-16 12:00:54,900 INFO mapreduce.Job:  map 34% reduce 0%
2019-09-16 12:01:09,491 INFO mapreduce.Job:  map 35% reduce 0%
2019-09-16 12:01:20,103 INFO mapreduce.Job:  map 36% reduce 0%
2019-09-16 12:01:24,594 INFO mapreduce.Job:  map 38% reduce 0%
2019-09-16 12:01:29,124 INFO mapreduce.Job:  map 41% reduce 0%
2019-09-16 12:01:51,511 INFO mapreduce.Job:  map 42% reduce 0%
2019-09-16 12:02:01,443 INFO mapreduce.Job:  map 43% reduce 0%
2019-09-16 12:02:10,151 INFO mapreduce.Job:  map 43% reduce 5%
2019-09-16 12:02:51,950 INFO mapreduce.Job:  map 44% reduce 5%
2019-09-16 12:02:56,771 INFO mapreduce.Job:  map 44% reduce 6%
2019-09-16 12:03:05,455 INFO mapreduce.Job:  map 45% reduce 6%
2019-09-16 12:03:29,194 INFO mapreduce.Job:  map 45% reduce 7%
2019-09-16 12:04:10,366 INFO mapreduce.Job:  map 47% reduce 7%
2019-09-16 12:04:12,997 INFO mapreduce.Job:  map 47% reduce 8%
2019-09-16 12:04:21,667 INFO mapreduce.Job:  map 47% reduce 9%
2019-09-16 12:05:11,082 INFO mapreduce.Job:  map 47% reduce 10%
2019-09-16 12:06:12,053 INFO mapreduce.Job:  map 48% reduce 10%
2019-09-16 12:06:32,845 INFO mapreduce.Job:  map 50% reduce 10%
2019-09-16 12:07:04,607 INFO mapreduce.Job:  map 50% reduce 11%
2019-09-16 12:07:13,138 INFO mapreduce.Job:  map 52% reduce 11%
2019-09-16 12:07:14,418 INFO mapreduce.Job:  map 56% reduce 11%
2019-09-16 12:07:16,825 INFO mapreduce.Job:  map 58% reduce 11%
2019-09-16 12:07:22,626 INFO mapreduce.Job:  map 59% reduce 11%
2019-09-16 12:07:26,273 INFO mapreduce.Job:  map 60% reduce 11%
2019-09-16 12:07:27,522 INFO mapreduce.Job:  map 62% reduce 11%
2019-09-16 12:07:29,943 INFO mapreduce.Job:  map 63% reduce 11%
2019-09-16 12:07:32,402 INFO mapreduce.Job:  map 64% reduce 11%
2019-09-16 12:07:36,126 INFO mapreduce.Job:  map 65% reduce 11%
2019-09-16 12:07:37,321 INFO mapreduce.Job:  map 66% reduce 11%
2019-09-16 12:07:41,297 INFO mapreduce.Job:  map 67% reduce 11%
2019-09-16 12:07:49,753 INFO mapreduce.Job:  map 67% reduce 12%
2019-09-16 12:07:55,465 INFO mapreduce.Job:  map 68% reduce 12%
2019-09-16 12:07:56,617 INFO mapreduce.Job:  map 69% reduce 12%
2019-09-16 12:08:03,315 INFO mapreduce.Job:  map 70% reduce 12%
2019-09-16 12:08:22,525 INFO mapreduce.Job:  map 70% reduce 13%
2019-09-16 12:08:39,289 INFO mapreduce.Job:  map 71% reduce 13%
2019-09-16 12:08:52,345 INFO mapreduce.Job:  map 72% reduce 13%
2019-09-16 12:08:55,031 INFO mapreduce.Job:  map 73% reduce 13%
2019-09-16 12:09:08,315 INFO mapreduce.Job:  map 74% reduce 13%
2019-09-16 12:09:15,337 INFO mapreduce.Job:  map 76% reduce 13%
2019-09-16 12:09:43,442 INFO mapreduce.Job:  map 77% reduce 14%
2019-09-16 12:09:52,962 INFO mapreduce.Job:  map 78% reduce 14%
2019-09-16 12:09:58,723 INFO mapreduce.Job:  map 79% reduce 14%
2019-09-16 12:10:07,494 INFO mapreduce.Job:  map 80% reduce 14%
2019-09-16 12:10:12,875 INFO mapreduce.Job:  map 80% reduce 16%
2019-09-16 12:10:20,189 INFO mapreduce.Job:  map 81% reduce 16%
2019-09-16 12:10:22,588 INFO mapreduce.Job:  map 81% reduce 17%
2019-09-16 12:10:27,180 INFO mapreduce.Job:  map 82% reduce 17%
2019-09-16 12:10:57,845 INFO mapreduce.Job:  map 83% reduce 17%
2019-09-16 12:11:06,005 INFO mapreduce.Job:  map 83% reduce 18%
2019-09-16 12:11:10,925 INFO mapreduce.Job:  map 84% reduce 18%
2019-09-16 12:11:16,869 INFO mapreduce.Job:  map 86% reduce 18%
2019-09-16 12:11:20,148 INFO mapreduce.Job:  map 86% reduce 22%
2019-09-16 12:11:29,748 INFO mapreduce.Job:  map 86% reduce 24%
2019-09-16 12:12:08,999 INFO mapreduce.Job:  map 86% reduce 26%
2019-09-16 12:12:34,681 INFO mapreduce.Job:  map 88% reduce 26%
2019-09-16 12:12:35,741 INFO mapreduce.Job:  map 89% reduce 26%
2019-09-16 12:12:37,876 INFO mapreduce.Job:  map 91% reduce 26%
2019-09-16 12:12:41,116 INFO mapreduce.Job:  map 92% reduce 27%
2019-09-16 12:12:44,267 INFO mapreduce.Job:  map 93% reduce 27%
2019-09-16 12:12:45,337 INFO mapreduce.Job:  map 94% reduce 27%
2019-09-16 12:12:46,373 INFO mapreduce.Job:  map 96% reduce 27%
2019-09-16 12:12:47,431 INFO mapreduce.Job:  map 96% reduce 28%
2019-09-16 12:12:56,285 INFO mapreduce.Job:  map 97% reduce 28%
2019-09-16 12:12:59,417 INFO mapreduce.Job:  map 97% reduce 30%
2019-09-16 12:13:18,013 INFO mapreduce.Job:  map 97% reduce 31%
2019-09-16 12:13:25,418 INFO mapreduce.Job:  map 98% reduce 31%
2019-09-16 12:13:27,577 INFO mapreduce.Job:  map 100% reduce 31%
2019-09-16 12:13:45,273 INFO mapreduce.Job:  map 100% reduce 39%
2019-09-16 12:13:51,482 INFO mapreduce.Job:  map 100% reduce 55%
2019-09-16 12:13:57,756 INFO mapreduce.Job:  map 100% reduce 71%
2019-09-16 12:14:03,989 INFO mapreduce.Job:  map 100% reduce 91%
2019-09-16 12:14:08,345 INFO mapreduce.Job:  map 100% reduce 100%
2019-09-16 12:14:12,633 INFO mapreduce.Job: Job job_1568605271327_0002 completed successfully
2019-09-16 12:14:13,176 INFO mapreduce.Job: Counters: 56
        File System Counters
                FILE: Number of bytes read=217765235
                FILE: Number of bytes written=448594463
                FILE: Number of read operations=0
                FILE: Number of large read operations=0
                FILE: Number of write operations=0
                HDFS: Number of bytes read=3137294603
                HDFS: Number of bytes written=396
                HDFS: Number of read operations=179
                HDFS: Number of large read operations=0
                HDFS: Number of write operations=2
                HDFS: Number of bytes read erasure-coded=0
        Job Counters 
                Killed map tasks=14
                Launched map tasks=72
                Launched reduce tasks=1
                Data-local map tasks=70
                Rack-local map tasks=2
                Total time spent by all maps in occupied slots (ms)=28632455
                Total time spent by all reduces in occupied slots (ms)=1004700
                Total time spent by all map tasks (ms)=28632455
                Total time spent by all reduce tasks (ms)=1004700
                Total vcore-milliseconds taken by all map tasks=28632455
                Total vcore-milliseconds taken by all reduce tasks=1004700
                Total megabyte-milliseconds taken by all map tasks=29319633920
                Total megabyte-milliseconds taken by all reduce tasks=1028812800
        Map-Reduce Framework
                Map input records=20796941
                Map output records=19796839
                Map output bytes=178171551
                Map output materialized bytes=217765577
                Input split bytes=5858
                Combine input records=0
                Combine output records=0
                Reduce input groups=44
                Reduce shuffle bytes=217765577
                Reduce input records=19796839
                Reduce output records=44
                Spilled Records=39593678
                Shuffled Maps =58
                Failed Shuffles=0
                Merged Map outputs=58
                GC time elapsed (ms)=1507499
                CPU time spent (ms)=1468390
                Physical memory (bytes) snapshot=6290808832
                Virtual memory (bytes) snapshot=161167908864
                Total committed heap usage (bytes)=7517712384
                Peak Map Physical memory (bytes)=200282112
                Peak Map Virtual memory (bytes)=2736029696
                Peak Reduce Physical memory (bytes)=419164160
                Peak Reduce Virtual memory (bytes)=2737905664
        Shuffle Errors
                BAD_ID=0
                CONNECTION=0
                IO_ERROR=0
                WRONG_LENGTH=0
                WRONG_MAP=0
                WRONG_REDUCE=0
        File Input Format Counters 
                Bytes Read=3137288745
        File Output Format Counters 
                Bytes Written=396

From this log we can conclude:
Hypothesis 1 is wrong: the number of input splits is 58, which does not equal the 61 input blocks (a likely explanation is sketched right after this list).
Hypothesis 2 is wrong: the File Input Format Counters value does not always equal the actual input size; here the job counted 3137288745 bytes read, slightly more than the 3137231401 bytes reported by fsck. A plausible cause with text input is that each map task's record reader reads past the end of its split to finish the last line (and skips a partial first line), so some bytes get read, and counted, more than once.
Hypothesis 3 is correct: the File Output Format Counters value equals the total size of the part-r-* files actually written.
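On hypothesis 1, the sketch below paraphrases what Hadoop's FileInputFormat.getSplits does for a splittable file (the split-size formula and the 1.1 slop factor are from the Hadoop source; the wrapper class is mine, for illustration only). The key point: a file tail shorter than 10% of the split size is folded into the previous split, so a file can occupy more blocks than it yields splits.

class SplitMath {
    // splitSize = max(minSize, min(maxSize, blockSize)); with the default minSize = 1 and
    // maxSize = Long.MAX_VALUE this is simply the HDFS block size (128 MB by default).
    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    // Number of splits generated for one splittable file of the given length.
    static int countSplits(long fileLength, long splitSize) {
        final double SPLIT_SLOP = 1.1; // tails under 10% of splitSize merge into the previous split
        int splits = 0;
        long bytesRemaining = fileLength;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
            splits++;
            bytesRemaining -= splitSize;
        }
        if (bytesRemaining != 0) {
            splits++; // the final split may carry up to 1.1 * splitSize bytes
        }
        return splits;
    }
}

For example, a 132 MB file with a 128 MB block size occupies two blocks but yields a single split, because 132 / 128 ≈ 1.03 < 1.1. Three such tail-merges across our 44 files would account exactly for the gap between 61 blocks and 58 splits.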

Conclusions

  1. The number of splits does not necessarily equal the number of input blocks.
  2. The File Input Format Counters value does not always equal the total input size reported by fsck.
  3. The File Output Format Counters value equals the total size of all part-r-* files actually written.

ps1: The conclusions above were drawn for text input only; whether other InputFormat types break them is a question I will leave open for now.

ps2: The question of how the split count is determined deserves a write-up of its own, so I will not go further into it here.

