1. Environment Setup
Download: http://mirror.bit.edu.cn/apache/hadoop/common/hadoop-2.7.2/hadoop-2.7.2.tar.gz
Extract: after extracting, edit etc/hadoop/hadoop-env.sh and set JAVA_HOME. My JAVA_HOME (you can check it with cat /etc/profile) is /user/java/latest.
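For reference, a minimal sketch of that change in etc/hadoop/hadoop-env.sh, using the JDK path from my machine (adjust to wherever your JDK actually lives):

# etc/hadoop/hadoop-env.sh
# Point Hadoop at the local JDK explicitly; the inherited ${JAVA_HOME}
# may not be visible when the daemons are started over ssh.
export JAVA_HOME=/user/java/latest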
2. Hadoop Single-Node Cluster
See the official documentation: http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html
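For the pseudo-distributed setup used below, that guide boils down to two small config changes before formatting HDFS and starting the daemons. A minimal sketch with the values taken straight from the guide (localhost and port 9000 are the guide's defaults, not requirements):

etc/hadoop/core-site.xml:
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
</configuration>

etc/hadoop/hdfs-site.xml:
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>1</value>
    </property>
</configuration>

# then format the filesystem and start HDFS (also from the guide)
bin/hdfs namenode -format
sbin/start-dfs.sh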
3. WordCount Example
a. Maven configuration (pom.xml)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>my.hadoopstudy</groupId>
    <artifactId>hadoopstudy</artifactId>
    <packaging>jar</packaging>
    <version>1.0-SNAPSHOT</version>
    <name>hadoopstudy</name>
    <url>http://maven.apache.org</url>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.5.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.5.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.5.1</version>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.8.0_65</version>
            <scope>system</scope>
            <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
</project>
b. Mapper code:
public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}
c. Reducer code:
public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
d. The complete Java code:
package my.hadoopstudy.mapreduce;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    // Mapper: tokenizes each input line and emits (word, 1) for every token.
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    // Reducer (also used as the combiner): sums the counts for each word.
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // args[0] = HDFS input path, args[1] = HDFS output path (must not exist yet)
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
e. Packaging: run mvn package in the project directory, then copy the jar from target into a study directory you create under the Hadoop installation directory.
f. Run: bin/hadoop jar study/hadoopstudy-1.0-SNAPSHOT.jar my.hadoopstudy.mapreduce.WordCount /user/wangke/wordcount/input /user/wangke/wordcount/output
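The run above assumes the input directory already exists in HDFS and the output directory does not. A minimal sketch of preparing them (uploading Hadoop's own config files as sample input is just my example; any text files work):

# create the input directory in HDFS and upload some text to count
bin/hdfs dfs -mkdir -p /user/wangke/wordcount/input
bin/hdfs dfs -put etc/hadoop/*.xml /user/wangke/wordcount/input
# the job fails if the output directory already exists, so clear any old one
bin/hdfs dfs -rm -r -f /user/wangke/wordcount/output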
4. Problems Encountered and Solutions:
a. Remember to set JAVA_HOME.
b. Modify the relevant XML configuration files as described in the official documentation from step 2.
c. The second time I went through the official pseudo-distributed setup, the job failed with an error along the lines of "there are 0 datanode(s) running and no node(s) are excluded in this operation".
Solution: run sbin/stop-all.sh, delete the datanode's current directory (rm -r /tmp/hadoop-admin/dfs/data/current), then redo the pseudo-distributed setup and the error goes away; the sequence is sketched below.
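A sketch of that recovery sequence, assuming the same /tmp/hadoop-admin data directory as above (jps is only there to confirm the DataNode process actually came back):

# stop all daemons, clear the stale datanode metadata, then bring HDFS back up
sbin/stop-all.sh
rm -r /tmp/hadoop-admin/dfs/data/current
sbin/start-dfs.sh
# NameNode and DataNode should both appear in the process list now
jps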
d. When running the jar in pseudo-distributed mode, the job could not connect to the ResourceManager and kept retrying. The cause was that YARN had not been started (I had assumed YARN was not needed at all for a local run, but the ResourceManager port 8032 only opens once YARN is started).
Solution: modify the yarn-site.xml configuration:
<configuration>
    <!-- Site specific YARN configuration properties -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address</name>
        <value>127.0.0.1:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address</name>
        <value>127.0.0.1:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>127.0.0.1:8031</value>
    </property>
</configuration>
After running sbin/start-yarn.sh the job executed successfully, and bin/hdfs dfs -cat /user/wangke/wordcount/output/part-r-00000 confirmed the results were correct.
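For reference, a quick sanity check that YARN actually came up before rerunning the job (jps and the default web UI port 8088 are standard, nothing here is specific to this setup):

# ResourceManager and NodeManager should both be listed
jps
# the ResourceManager web UI is reachable at http://localhost:8088 by default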