傳參關鍵代碼:
// Read parameters from the config file; this MUST happen before the Job is created,
// otherwise the tasks will not see the values.
conf.addResource("hadoop-bigdata.xml"); keepUrl=conf.get("KeepUrlString",""); filterUrl=conf.get("FilterUrlString",""); conf.set("FilterUrl", filterUrl); conf.set("KeepUrl", keepUrl);
// Read the parameters back on the task side (inside the mapper).
String fstr=context.getConfiguration().get("FilterUrl");
String kstr=context.getConfiguration().get("KeepUrl");
package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that keeps URLs matching a "keep" pattern while dropping the
 * ones that also match a "filter" pattern. Both pattern strings are read from
 * hadoop-bigdata.xml in the driver and handed to the tasks through the job
 * Configuration (keys "KeepUrl" / "FilterUrl").
 */
public class FilterUrl {

    public static class FilterUrlMap extends Mapper<Object, Text, Text, Text> {

        private static final Text word = new Text();
        // Compiled once per task in setup() instead of once per map() call.
        private Pattern filter;
        private Pattern keep;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // The patterns arrive as '|'-separated substrings; '.' is escaped
            // so it matches literally (".js" must not match e.g. "xjs").
            String fstr = context.getConfiguration().get("FilterUrl", "");
            String kstr = context.getConfiguration().get("KeepUrl", "");
            filter = Pattern.compile(fstr.replace(".", "\\."));
            keep = Pattern.compile(kstr.replace(".", "\\."));
        }

        /**
         * Emits every URL that matches the keep pattern and does not match the
         * filter pattern. The input value may contain several newline-separated
         * URLs; they are lower-cased before matching.
         */
        @Override
        public void map(Object key, Text values, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(values.toString(), "\n");
            while (itr.hasMoreTokens()) {
                String url = itr.nextToken().toLowerCase();
                // BUG FIX: match state must be evaluated per URL. The original
                // declared kflag/fflag once per map() call without resetting
                // them, so one filtered URL suppressed every later URL of the
                // record, and one kept URL let later non-matching URLs through.
                Matcher mkeep = keep.matcher(url);
                if (mkeep.find() && !filter.matcher(url).find()) {
                    word.set(url);
                    context.write(word, new Text(""));
                }
            }
        }
    }

    /** Deduplicates: one output line per distinct URL key. */
    public static class FilterUrlReduce extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, new Text(""));
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("please input two args:<in> <out>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        // Read parameters from the config file and copy them under the keys
        // the mapper reads. This MUST happen before the Job is created,
        // otherwise the tasks will not see the values.
        conf.addResource("hadoop-bigdata.xml");
        conf.set("KeepUrl", conf.get("KeepUrlString", ""));
        conf.set("FilterUrl", conf.get("FilterUrlString", ""));

        Job job = new Job(conf, "filter url");
        job.setJarByClass(FilterUrl.class);
        job.setMapperClass(FilterUrlMap.class);
        job.setReducerClass(FilterUrlReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
需要從配置文件獲取的參數:
<?xml version="1.0"?> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> <configuration> <property> <!-- URL substrings to keep (whitelist, '|'-separated) --> <name>KeepUrlString</name> <value>anjueke.com|soufun.com</value> </property> <property> <!-- URL substrings to filter out (blacklist, '|'-separated) --> <name>FilterUrlString</name> <value>.js|.jpg|.jpeg|.gif|.png|.css|error.html</value> </property> </configuration>