Result文件數據說明:
Ip:106.39.41.166,(城市)
Date:10/Nov/2016:00:01:02 +0800,(日期)
Day:10,(天數)
Traffic: 54 ,(流量)
Type: video,(類型:視頻video或文章article)
Id: 8701(視頻或者文章的id)
測試要求:
1、 數據清洗:按照以下要求進行數據清洗,並將清洗后的數據導入hive數據庫中。
兩階段數據清洗:
(1)第一階段:把需要的信息從原始日志中提取出來
ip: 199.30.25.88
time: 10/Nov/2016:00:01:03 +0800
traffic: 62
文章: article/11325
視頻: video/3235
(2)第二階段:根據提取出來的信息做精細化操作
ip--->城市 city(IP)
date--> time:2016-11-10 00:01:03
day: 10
traffic:62
type:article/video
id:11325
(3)hive數據庫表結構:
create table data( ip string, time string , day string, traffic bigint,
type string, id string )
2、數據處理:
·統計最受歡迎的視頻/文章的Top10訪問次數 (video/article)
·按照地市統計最受歡迎的Top10課程 (ip)
·按照流量統計最受歡迎的Top10課程 (traffic)
3、數據可視化:將統計結果導入MySQL數據庫中,通過圖形化展示的方式展現出來。
階段一:
/**
 * MapReduce experiment - data cleaning - stage one.
 * Gao Zewei, 19.11.20
 */
package classtest3;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Stage-one cleaning job: reads comma-separated access-log records, trims
 * every field, rewrites the timestamp into {@code yyyy-MM-dd HH:mm:ss} and
 * emits the six fields as one tab-separated line.
 *
 * <p>Input record format:
 * <pre>
 * Ip,Date,Day,Traffic,Type,Id
 * 106.39.41.166,10/Nov/2016:00:01:02 +0800,10,54,video,8701
 * </pre>
 */
public class DataClean {

    /** HDFS location of the raw input file. */
    static String INPUT_PATH = "hdfs://192.168.57.128:9000/testhdfs1026/run/input/DataClean.txt";

    /** HDFS directory the cleaned records are written to. */
    static String OUTPUT_PATH = "hdfs://192.168.57.128:9000/testhdfs1026/run/output/DataClean";

    /**
     * Timestamp pattern of the raw log (e.g. {@code 10/Nov/2016:00:01:02}).
     * SimpleDateFormat is not thread-safe; do not share it across threads.
     */
    public static final SimpleDateFormat FORMAT =
            new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);

    /**
     * Timestamp pattern of the cleaned output (e.g. {@code 2016-11-10 00:01:02}).
     * SimpleDateFormat is not thread-safe; do not share it across threads.
     */
    public static final SimpleDateFormat dateformat1 =
            new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    /** Number of leading fields taken from every record: ip, date, day, traffic, type, id. */
    private static final int FIELD_COUNT = 6;

    /** Counter group / name under which skipped input lines are reported. */
    private static final String COUNTER_GROUP = "DataClean";
    private static final String MALFORMED_COUNTER = "MALFORMED_LINES";

    /**
     * Parses one raw log line into its six cleaned fields.
     *
     * @param line one comma-separated record
     * @return {@code {ip, time, day, traffic, type, id}}; every field is trimmed and
     *         {@code time} is reformatted with {@link #dateformat1}
     * @throws IllegalArgumentException if {@code line} is null, has fewer than six
     *         fields, or its timestamp does not match {@link #FORMAT}
     */
    public static String[] parse(String line) {
        if (line == null) {
            throw new IllegalArgumentException("line must not be null");
        }
        // Split once instead of once per field.
        String[] raw = line.split(",");
        if (raw.length < FIELD_COUNT) {
            throw new IllegalArgumentException(
                    "expected at least " + FIELD_COUNT + " comma-separated fields but found "
                            + raw.length + ": " + line);
        }
        String[] cleaned = new String[FIELD_COUNT];
        for (int i = 0; i < FIELD_COUNT; i++) {
            cleaned[i] = raw[i].trim();
        }
        cleaned[1] = normalizeTime(cleaned[1]);
        return cleaned;
    }

    /**
     * Converts a raw timestamp such as {@code 10/Nov/2016:00:01:02 +0800} into
     * {@code 2016-11-10 00:01:02}. Everything from the first space on (the zone
     * offset) is dropped; a timestamp without a zone suffix is accepted as-is.
     */
    private static String normalizeTime(String rawTime) {
        int zoneStart = rawTime.indexOf(' ');
        String stamp = zoneStart < 0 ? rawTime : rawTime.substring(0, zoneStart);
        return dateformat1.format(parseDateFormat(stamp));
    }

    /**
     * Parses a string in the {@link #FORMAT} pattern into a {@link Date}.
     *
     * @throws IllegalArgumentException if the string does not match the pattern
     */
    private static Date parseDateFormat(String string) {
        try {
            return FORMAT.parse(string);
        } catch (ParseException e) {
            // Surface the failure to the caller instead of returning null,
            // which would only fail later with a NullPointerException.
            throw new IllegalArgumentException("unparseable timestamp: " + string, e);
        }
    }

    /**
     * Mapper: extracts the required fields from each raw log line and emits the
     * cleaned record as the key with no value.
     */
    public static class Map extends Mapper<LongWritable, Text, Text, NullWritable> {

        /** Output key object, reused for every record. */
        public static Text word = new Text();

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] arr;
            try {
                arr = parse(value.toString());
            } catch (IllegalArgumentException e) {
                // A malformed line must not abort the whole job: count it and move on.
                context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
                return;
            }
            // Every field, including the last one, is followed by a tab; the
            // trailing tab is kept so the output format stays unchanged.
            StringBuilder out = new StringBuilder();
            for (String field : arr) {
                out.append(field).append('\t');
            }
            word.set(out.toString());
            context.write(word, NullWritable.get());
        }
    }

    /**
     * Reducer: writes each key once, so identical cleaned lines collapse into
     * a single output record.
     */
    public static class Reduce extends Reducer<Text, NullWritable, Text, NullWritable> {

        @Override
        public void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    /**
     * Configures and runs the job against {@link #INPUT_PATH} / {@link #OUTPUT_PATH},
     * then exits with status 0 on success and 1 on failure.
     */
    public static void main(String[] args) throws Exception {
        Path outputpath = new Path(OUTPUT_PATH);
        Configuration conf = new Configuration();
        System.out.println("Start");

        Job job = Job.getInstance(conf);
        job.setJarByClass(DataClean.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.addInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, outputpath);

        boolean flag = job.waitForCompletion(true);
        System.out.println(flag);
        System.exit(flag ? 0 : 1);
    }
}
知識點1:SimpleDateFormat的用法
SimpleDateFormat用於格式化時間
實例::::::::::::::::::::::::::::::::::::::::::::::::::::
import java.util.Date;
import java.text.SimpleDateFormat;
/**
 * Demo: prints the current time twice, first in the default
 * {@code Date.toString()} form and then formatted with a
 * {@code SimpleDateFormat} pattern.
 */
public class SimpleDateFormat1 {
    public static void main(String[] args) {
        final Date now = new Date();

        // Line 1: default textual form of the Date.
        System.out.println(now.toString());

        // Line 2: the same instant rendered with an explicit pattern.
        final SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        final String formatted = formatter.format(now);
        System.out.println(formatted);
    }
}
::::::::::::::::::::::::::::::::::::::::::::::::::::::
輸出結果:
Tue Nov 19 18:55:29 CST 2019
2019-11-19 18:55:29
知識點2:lastIndexOf()方法和indexOf()方法比較
lastIndexOf()方法,返回子字符串最后出現的位置。沒有找到,則返回 -1。
如:"ABCDABCD".lastIndexOf("BC"); 返回5
"ABCDABCD".lastIndexOf("DE"); 返回-1
indexOf()方法,返回子字符串第一次出現的位置。沒有找到,則返回 -1。
如:"ABCDABCD".indexOf("BC"); 返回1
"ABCDABCD".indexOf("B"); 返回1
"ABCDABCD".indexOf("DE"); 返回-1
導入Hive語句:
hive數據庫的操作:
hive> create table if not exists data(
    > dip string,
    > dtime string,
    > dday string,
    > dtraffic bigint,
    > dtype string,
    > did string)
    > row format delimited fields terminated by ',' lines terminated by '\n';
[root@localhost 桌面]# hadoop fs -get hdfs://localhost:9000/testhdfs1026/run/input/DataClean.txt /usr/local
hive> load data local inpath '/usr/local/DataClean.txt' into table data;
hive> select * from data limit 3;