Part 3: Hive Custom Functions (UDF) and Transform
Opening tip:
A quick way to connect with beeline:
./beeline -u jdbc:hive2://hadoop1:10000 -n hadoop
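Once connected, any simple statement confirms the session is live, for example:
show databases;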
1. Custom functions (UDF)
When Hive's built-in functions cannot meet your business needs, you can turn to user-defined functions (UDF: user-defined function).
A UDF operates on a single data row and produces a single row as output, much like the built-in math and string functions.
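The built-in functions illustrate this single-row-in, single-row-out behavior. A quick example (this assumes a Hive version that supports SELECT without a FROM clause):
select upper('hive'), length('hive'), abs(-3);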
2. Development example
2.1 Raw data format
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}
{"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}
{"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}
{"movie":"919","rate":"4","timeStamp":"978301368","uid":"1"}
{"movie":"595","rate":"5","timeStamp":"978824268","uid":"1"}
{"movie":"938","rate":"4","timeStamp":"978301752","uid":"1"}
2.2 Create the data table
create table t_rating (line string)
row format delimited;
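Note the design choice: the table has a single string column because each JSON record is loaded as one whole line; the UDF will split it into fields later. The schema can be confirmed with:
desc t_rating;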
2.3 Load the data
load data local inpath '/home/hadoop/rating.json' into table t_rating;
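A quick check that the load succeeded:
select * from t_rating limit 3;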
2.4 Develop the UDF program
package cn.itcast.hive;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.codehaus.jackson.map.ObjectMapper;

/**
 * @author ntjr
 * Parses a line of JSON rating data.
 */
public class PaserJson extends UDF {
    private ObjectMapper mapper = new ObjectMapper();

    // Called once per row: parse the JSON line into a RatingBean and
    // return its fields as a tab-separated string.
    public String evaluate(String line) {
        try {
            RatingBean ratingBean = mapper.readValue(line, RatingBean.class);
            return ratingBean.toString();
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Return an empty string for rows that fail to parse.
        return "";
    }
}
This UDF parses the JSON data in each row of the t_rating table. It depends on the RatingBean helper class below:
package cn.itcast.hive;

public class RatingBean {
    private String movie;
    private String rate;
    private String timeStamp;
    private String uid;

    public String getMovie() { return movie; }
    public void setMovie(String movie) { this.movie = movie; }
    public String getRate() { return rate; }
    public void setRate(String rate) { this.rate = rate; }
    public String getTimeStamp() { return timeStamp; }
    public void setTimeStamp(String timeStamp) { this.timeStamp = timeStamp; }
    public String getUid() { return uid; }
    public void setUid(String uid) { this.uid = uid; }

    @Override
    public String toString() {
        // Tab-separated so the HiveQL split() call can break it back into columns.
        return movie + "\t" + rate + "\t" + timeStamp + "\t" + uid;
    }
}
2.5 Package the UDF program into a JAR and add it to Hive
add JAR /home/hadoop/udf.jar;
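To confirm the JAR is registered in the current session, Hive's standard listing command can be used:
list jars;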
2.6 Create a temporary function linked to the developed UDF
create temporary function paseJson as 'cn.itcast.hive.PaserJson';
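A quick test of the new function; each row should come back as a tab-separated string such as 1193 5 978300760 1:
select paseJson(line) from t_rating limit 3;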
2.7 Create the full-column table t_rating02 (it holds the result of converting the single-column JSON table t_rating into a multi-column table)
create table t_rating02 as
select split(paseJson(line),'\t')[0] as movieid,
       split(paseJson(line),'\t')[1] as rate,
       split(paseJson(line),'\t')[2] as timestring,
       split(paseJson(line),'\t')[3] as uid
from t_rating;
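To verify the schema and contents of the new table:
desc t_rating02;
select * from t_rating02 limit 3;
Note that paseJson(line) is evaluated once per selected expression here; a subquery could parse each line just once, but the straightforward form above matches the original walkthrough.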
At this point the conversion into the multi-column table t_rating02 is complete.
3. Use Transform to convert the timestring field of t_rating02 into a day of the week.
3.1 Sample rows in t_rating02 (tab-separated, taken from the data above):
1193	5	978300760	1
661	3	978302109	1
914	3	978301968	1
3.2 Write the weekday_mapper.py script to process the timestring field of t_rating02
#!/usr/bin/env python
# Python 2 script: Hive streams each row to stdin as tab-separated fields.
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    # Convert the Unix timestamp to an ISO weekday (1 = Monday .. 7 = Sunday).
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])
3.3 Upload the weekday_mapper.py script (this requires Python to be installed on the machine)
add FILE weekday_mapper.py;
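Before materializing a table, the pipeline can be smoke-tested with a bounded query using the same transform clause (an optional check, not part of the original walkthrough):
SELECT TRANSFORM (movieid, rate, timestring, uid)
USING 'python weekday_mapper.py'
AS (movieid, rating, weekday, userid)
FROM t_rating02 LIMIT 5;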
3.4 Create a new table t_rating_date to store the data processed by the script
create table t_rating_date as
SELECT TRANSFORM (movieid, rate, timestring, uid)
USING 'python weekday_mapper.py'
AS (movieid, rating, weekday, userid)
FROM t_rating02;
3.5 View the t_rating_date table
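A couple of inspection queries (illustrative, not from the original): the first shows raw rows, the second counts ratings per weekday to confirm the conversion worked end to end.
select * from t_rating_date limit 5;
select weekday, count(*) as rating_cnt from t_rating_date group by weekday;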

At this point, the JSON data has been fully converted into a structured table.
