For Hadoop and Java installation and configuration, see: https://www.cnblogs.com/lxc1910/p/11734477.html
1. Create a new Java project:
Select a suitable JDK, as shown in the figure.
Name the project WordCount.
2. Add the WordCount class file:
Add a new Java class named WordCount under src, with the following code:
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

    // Map class: splits each input line into words
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // map(): break the line into tokens and emit <word, 1> for each
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken()); // store the token in the Text object
                context.write(word, one);  // collect <key, value>
            }
        }
    }

    // Reduce class: sums the values that share the same key
    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        // reduce(): iterate over all values for one key and add them up
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result); // emit the output pair <key, value>
        }
    }

    public static void main(String[] args) throws Exception {
        // set up the configuration for the job
        Configuration conf = new Configuration();
        // parse generic Hadoop options off the command line
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "word count"); // create a new user-defined Job
        job.setJarByClass(WordCount.class);         // jar that carries the job classes
        job.setMapperClass(TokenizerMapper.class);  // Mapper class
        job.setCombinerClass(IntSumReducer.class);  // Combiner class
        job.setReducerClass(IntSumReducer.class);   // Reducer class
        job.setOutputKeyClass(Text.class);          // output key type
        job.setOutputValueClass(IntWritable.class); // output value type
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); // output path
        // submit the job and wait for it to finish
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
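To make the data flow concrete: for a line such as "Hello World Hello" (my own example), TokenizerMapper emits <Hello, 1>, <World, 1>, <Hello, 1>; after the shuffle, IntSumReducer receives <Hello, [1, 1]> and <World, [1]> and writes <Hello, 2> and <World, 1>. Setting IntSumReducer as the combiner runs this same summation on each mapper's local output first, which cuts down the data shuffled across the network.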
3. Add the dependency libraries:
Click File -> Project Structure -> Modules, select the Dependencies tab, click the plus sign, and add the Hadoop dependency libraries:
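If you are unsure which libraries these are: for a typical Hadoop 2.x installation, adding the jar directories below from the Hadoop install is enough for WordCount (the exact paths are an assumption on my part, since they vary with the Hadoop version and layout):

share/hadoop/common and share/hadoop/common/lib
share/hadoop/hdfs
share/hadoop/mapreduce
share/hadoop/yarn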
4. Build the JAR package:
Click File -> Project Structure -> Artifacts, click the plus sign -> JAR -> From modules with dependencies,
and choose the WordCount class as the Main Class.
Now build the JAR:
Click Build -> Build Artifacts -> Build. When the build finishes, you will find a new output directory containing the JAR.
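If you prefer not to use the IDE, the JAR can also be built from the command line. This is a minimal sketch, assuming WordCount.java is in the current directory and the hadoop command is on your PATH:

# compile against the jars reported by the Hadoop installation itself
javac -classpath "$(hadoop classpath)" WordCount.java
# package the class and its inner classes, setting WordCount as the entry point
jar cfe WordCount.jar WordCount WordCount*.class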
5. Run the JAR on the Hadoop system:
I previously installed a pseudo-distributed Hadoop system under the hadoop user, so first copy the JAR package into the hadoop user's home directory.
Start the Hadoop services (from the sbin folder of the Hadoop installation directory):
./start-all.sh
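You can verify that the daemons started with jps (shipped with the JDK); for a pseudo-distributed setup you would typically expect to see NameNode, DataNode, SecondaryNameNode, ResourceManager, and NodeManager, though the exact list depends on your configuration:

jps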
Create a test-in folder in HDFS and put two files, file1.txt and file2.txt, into it:
hadoop fs -mkdir test-in
hadoop fs -put file1.txt file2.txt test-in/
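If you want to reproduce the run exactly, the two local input files can be created beforehand like this (their contents are my assumption; any text will do):

echo "Hello World Bye World" > file1.txt
echo "Hello Hadoop Goodbye Hadoop" > file2.txt

Note that the relative path test-in resolves to /user/<current user>/test-in in HDFS.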
Run the JAR:
hadoop jar WordCount.jar test-in test-out
Because the main class was set when the JAR was generated, you do not need to append WordCount after WordCount.jar.
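If no main class had been set in the manifest, the class name would be given explicitly instead (WordCount is in the default package here, so no package prefix is needed):

hadoop jar WordCount.jar WordCount test-in test-out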
Also note that the test-out folder must not already exist in HDFS before you run the JAR; FileOutputFormat refuses to overwrite an existing output directory and the job fails immediately.
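If a previous run left one behind, delete it first:

hadoop fs -rm -r test-out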
6. View the results
You can check the status of the Hadoop system at http://localhost:50070/ (the NameNode web UI on Hadoop 2.x; on Hadoop 3.x it moved to port 9870).
Click Utilities -> Browse the file system to browse HDFS:
You can see the output files under the test-out folder; inspect the result with the command:

hadoop fs -cat test-out/part-r-00000
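With the sample input files assumed above, the output would look like the following, one word per line with a tab-separated count, sorted by key:

Bye	1
Goodbye	1
Hadoop	2
Hello	2
World	2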
7. References
https://blog.csdn.net/chaoping315/article/details/78904970