Setting Up a MapReduce Development Environment (Eclipse\MyEclipse + Maven)

A few words up front

For more detail, be sure to read these companion posts first:

HBase Development Environment Setup (Eclipse\MyEclipse + Maven)

ZooKeeper Project Development Environment Setup (Eclipse\MyEclipse + Maven)

Hive Project Development Environment Setup (Eclipse\MyEclipse + Maven)

I trust that anyone reading this blog already has some grounding in the basics. I wrote plenty of introductory posts earlier on, so feel free to go back and brush up there first.

 

 

Step 1: File -> New -> Project -> Maven Project

Step 2: Choose the directory in which the myMapreduce project you are about to create will live.

Step 3: Select the Maven archetype (the standard maven-archetype-quickstart is fine for a plain jar project).

Step 4: Fill in the project coordinates yourself; this post uses Group Id zhouls.bigdata and Artifact Id myMapreduce, matching the pom.xml below.

Step 5: Change the JDK

Omitted here; it is straightforward.

Step 6: Edit the pom.xml configuration file

The dependency entries come from the official Maven repository search for MapReduce:

URL: http://www.mvnrepository.com/search?q=mapreduce

Since my Hadoop version is hadoop-2.6.0, the dependencies below are pinned to version 2.6.0.

Reference: http://blog.csdn.net/e421083458/article/details/45792111

1. hadoop-common, version 2.6.0 (org.apache.hadoop:hadoop-common)

2. hadoop-mapreduce-client-core, version 2.6.0 (org.apache.hadoop:hadoop-mapreduce-client-core)

That is all for now; you can add more dependencies yourself later if you need them.

The final pom.xml is:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>zhouls.bigdata</groupId>
  <artifactId>myMapreduce</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>myMapreduce</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.6.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.6.0</version>
    </dependency>
  </dependencies>
</project>

 

Of course, this is only a bare-bones starting point; you can add or remove dependencies later on your own.

Step 7: Here, a couple of simple example programs are used to demonstrate some basic MapReduce functionality.

The first class is MapReduceTestCase.java:

package zhouls.bigdata.myMapreduce;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapReduceTestCase
{
    // Mapper: split each input line into tokens and emit (word, 1) for every token.
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>
    {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException
        {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens())
            {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    // Reducer (also used as the combiner): sum the counts for each word.
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable>
    {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException
        {
            int sum = 0;
            for (IntWritable val : values)
            {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(MapReduceTestCase.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("hdfs://HadoopMaster:9000/djt.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://HadoopMaster:9000/word-count"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
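
As a side note, if you want to smoke-test this WordCount driver straight from Eclipse without a cluster, one option is to force Hadoop's local job runner and local paths. The sketch below is my own addition, not part of the original walkthrough: the class name and the input/output paths are placeholders, and depending on your setup you may also need the hadoop-mapreduce-client-common and hadoop-mapreduce-client-jobclient artifacts on the classpath for local execution.

package zhouls.bigdata.myMapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical local-mode driver for a quick smoke test; reuses the mapper and
// reducer defined in MapReduceTestCase above.
public class LocalWordCountRunner
{
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");           // use the local file system
        conf.set("mapreduce.framework.name", "local");  // run with the local job runner
        Job job = Job.getInstance(conf, "word count (local)");
        job.setJarByClass(MapReduceTestCase.class);
        job.setMapperClass(MapReduceTestCase.TokenizerMapper.class);
        job.setCombinerClass(MapReduceTestCase.IntSumReducer.class);
        job.setReducerClass(MapReduceTestCase.IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path("input/djt.txt"));       // placeholder local input file
        FileOutputFormat.setOutputPath(job, new Path("output/word-count")); // must not exist yet
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With a text file at input/djt.txt, the word counts end up in output/word-count/part-r-00000.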

 

 

 

Alternatively, here is a second example, SalaryCount.java, which works out the salary range for each bracket of work experience from sample job-posting data:

package zhouls.bigdata.myMapreduce;

import java.io.IOException;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Salary statistics for Hadoop engineers based on sample job postings:
 * compute the salary range for each bracket of work experience.
 */
public class SalaryCount extends Configured implements Tool
{
    public static class SalaryMapper extends Mapper<LongWritable, Text, Text, Text>
    {
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException
        {
            // Sample input lines (company, experience, salary, city, job title):
            //   Meituan     3-5 years  15-30k  Beijing   senior Hadoop engineer...
            //   Beixinyuan  3-5 years  15-20k  Beijing   senior Java engineer (Hadoop...)
            //   Mogujie     3-5 years  10-24k  Hangzhou  Hadoop developer

            // Step 1: turn the input line into a String.
            String line = value.toString();

            // Step 2: split the line on whitespace.
            String[] record = line.split("\\s+");

            // Emit (experience, salary) as the map output, e.g. key = record[1] = "3-5 years",
            // value = record[2] = "15-30k", and send it to the reduce side.
            if (record.length >= 3) // the salary sits in the third column, so we need at least 3 fields
            {
                context.write(new Text(record[1]), new Text(record[2]));
            }
        }
    }

    public static class SalaryReducer extends Reducer<Text, Text, Text, Text>
    {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException
        {
            int low = 0;   // lowest salary seen
            int high = 0;  // highest salary seen
            int count = 1;

            // For one experience bracket (the key), walk its salary values,
            // split each one and track the overall minimum and maximum.
            for (Text value : values)
            {
                String[] arr = value.toString().split("-"); // e.g. "15-30k" -> ["15", "30k"]
                int l = filterSalary(arr[0]); // 15
                int h = filterSalary(arr[1]); // 30
                if (count == 1 || l < low)
                {
                    low = l;
                }
                if (count == 1 || h > high)
                {
                    high = h;
                }
                count++;
            }
            context.write(key, new Text(low + "-" + high + "k")); // e.g. 10-30k
        }
    }

    // Strip every non-digit character, because values such as "30k" carry a trailing "k".
    public static int filterSalary(String salary)
    {
        String sal = Pattern.compile("[^0-9]").matcher(salary).replaceAll("");
        return Integer.parseInt(sal);
    }

    public int run(String[] args) throws Exception
    {
        // Step 1: read the configuration.
        Configuration conf = new Configuration();

        // Step 2: delete the output directory if it already exists.
        Path out = new Path(args[1]);
        FileSystem hdfs = out.getFileSystem(conf);
        if (hdfs.isDirectory(out))
        {
            hdfs.delete(out, true);
        }

        // Step 3: build the job object and set the main class.
        Job job = Job.getInstance(conf, "SalaryCount");
        job.setJarByClass(SalaryCount.class);

        // Step 4: set the input and output paths.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Step 5: set the Mapper and Reducer.
        job.setMapperClass(SalaryMapper.class);
        job.setReducerClass(SalaryReducer.class);

        // Step 6: set the output key and value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Step 7: submit the job and wait for it to finish.
        job.waitForCompletion(true);

        return 0;
    }

    /**
     * @param args input path and output path; they can also be set under
     *             Arguments in Eclipse's Run Configurations, e.g.:
     *             hdfs://HadoopMaster:9000/salary.txt
     *             hdfs://HadoopMaster:9000/salary/out
     */
    public static void main(String[] args) throws Exception
    {
        // Hard-coded input and output paths.
        String[] args0 =
        {
            "hdfs://HadoopMaster:9000/salary/",
            "hdfs://HadoopMaster:9000/salary/out"
        };
        // First argument: the configuration; second: the job class SalaryCount;
        // third: the array holding the input and output paths.
        int ec = ToolRunner.run(new Configuration(), new SalaryCount(), args0);
        System.exit(ec);
    }
}
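
Since the pom above already pulls in JUnit (3.8.1, test scope), the salary-parsing helper can also be checked locally without a cluster. The test class below is a small sketch of my own (the class name is made up, not from the original post); it calls SalaryCount.filterSalary directly.

package zhouls.bigdata.myMapreduce;

import junit.framework.TestCase;

// Hypothetical local test: verifies that filterSalary strips the trailing "k"/"K"
// and any other non-digit characters before parsing the number.
public class SalaryCountTest extends TestCase
{
    public void testFilterSalary()
    {
        assertEquals(15, SalaryCount.filterSalary("15"));
        assertEquals(30, SalaryCount.filterSalary("30k"));
        assertEquals(24, SalaryCount.filterSalary("24K"));
    }
}

Put it under src/test/java/zhouls/bigdata/myMapreduce and run it with mvn test or from Eclipse's JUnit runner.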

