UDTF (User-Defined Table-Generating Functions) take one row in and emit multiple rows out, e.g. lateral view explode().
Implementation:
1) Extend org.apache.hadoop.hive.ql.udf.generic.GenericUDTF
2) Override the initialize, process, and close methods
Hive first calls initialize, which returns metadata describing the UDTF's output rows (the number of columns, their types, and their names). Once initialization is complete, process is called for each input row to handle the arguments, and results are emitted via forward(). Finally, close is called so the function can release any resources it holds.
Example
Requirement: use a custom UDTF to produce the list of per-day time segments between two timestamps.
package com.sjck.hive.udf;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
public class DateMap extends GenericUDTF {

    @Override
    public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        if (args.length != 2) {
            throw new UDFArgumentLengthException("DateMap takes exactly two arguments");
        }
        // Declare the output schema: two string columns per emitted row
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        fieldNames.add("begin_date"); // output column names
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        fieldNames.add("end_date");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        try {
            String begin = String.valueOf(args[0]);
            String end = String.valueOf(args[1]);
            SimpleDateFormat timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
            // Number of calendar-day boundaries crossed between the two timestamps.
            // Divide before casting to int: casting the raw millisecond difference
            // first would overflow for ranges longer than about 24 days.
            int days = (int) ((dateFormat.parse(end).getTime() - dateFormat.parse(begin).getTime())
                    / (24L * 60 * 60 * 1000));
            Date startDate = timeFormat.parse(begin);
            Date endDate = timeFormat.parse(end);
            Calendar ecalendar = Calendar.getInstance();
            ecalendar.setTime(startDate);
            Date d1 = startDate;
            Date d2;
            if (days == 0) {
                // begin and end fall on the same day: a single segment
                d2 = endDate;
            } else {
                // first segment ends at the following midnight
                ecalendar.add(Calendar.DATE, 1);
                ecalendar.set(Calendar.HOUR_OF_DAY, 0);
                ecalendar.set(Calendar.MINUTE, 0);
                ecalendar.set(Calendar.SECOND, 0);
                d2 = ecalendar.getTime();
            }
            String[][] datas = new String[days + 1][2];
            datas[0][0] = timeFormat.format(d1);
            datas[0][1] = timeFormat.format(d2);
            // Each following segment spans one full day, except the last,
            // which is clamped to the overall end time.
            for (int i = 1; i < days + 1; i++) {
                d1 = d2;
                ecalendar.add(Calendar.DATE, 1);
                d2 = ecalendar.getTime();
                if (d2.after(endDate)) {
                    d2 = endDate;
                }
                datas[i][0] = timeFormat.format(d1);
                datas[i][1] = timeFormat.format(d2);
            }
            // Emit one output row per segment
            for (int i = 0; i < datas.length; i++) {
                forward(datas[i]);
            }
        } catch (ParseException e) {
            // Surface parse failures to Hive instead of swallowing them
            throw new HiveException("DateMap could not parse its arguments", e);
        }
    }

    @Override
    public void close() throws HiveException {
        // nothing to clean up
    }
}
Package the code into a jar and upload it to the server; in my case I uploaded it to HDFS:
hadoop fs -put hive-udf.jar /user/hive/udf
Register the function:
create function datemap AS 'com.sjck.hive.udf.DateMap' using jar 'hdfs://nameservice1/user/hive/udf/hive-udf.jar';
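For quick testing in a single session, a sketch of the session-scoped alternative: add the jar on the fly and register a temporary function (the function disappears when the session ends, and your Hive version must support HDFS paths in add jar; otherwise use a local path):
add jar hdfs://nameservice1/user/hive/udf/hive-udf.jar;
create temporary function datemap as 'com.sjck.hive.udf.DateMap';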
Usage
Option 1: call it directly in the SELECT clause
First, a look at the table involved and one sample row:
select * from bst_bas_driver_info_work_time where id = '2440780' -- inspect the chosen row

Use the datemap function to expand the span between the start time and the end time into per-day segments:
select datemap(date_format(t.work_start_time, 'yyyy-MM-dd HH:mm:ss'),date_format(t.work_end_time, 'yyyy-MM-dd HH:mm:ss')) as (begin_date,end_date) from bst_bas_driver_info_work_time t
where id='2440780'

Notes:
1) Other columns cannot be selected alongside the UDTF; for example, adding t.id makes this query fail:
select t.id, datemap(date_format(t.work_start_time, 'yyyy-MM-dd HH:mm:ss'), date_format(t.work_end_time, 'yyyy-MM-dd HH:mm:ss')) as (begin_date, end_date) from bst_bas_driver_info_work_time t where id = '2440780'
2) It cannot be nested:
select datemap(datemap(xx,xxx),datemap(xx,xxx)) from bst_bas_driver_info_work_time
3) It cannot be combined with group by / cluster by / distribute by / sort by:
select datemap(xx, xxx) as (begin_date, end_date) from bst_bas_driver_info_work_time group by begin_date, end_date
Option 2: use it together with lateral view
select work_start_time as start_date,
       work_end_time as end_date,
       t.mid_start_date,
       t.mid_end_date
from bst_bas_driver_info_work_time
lateral view datemap(date_format(work_start_time, 'yyyy-MM-dd HH:mm:ss'),
                     date_format(work_end_time, 'yyyy-MM-dd HH:mm:ss')) t as mid_start_date, mid_end_date
where id in ('2440780')
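As a hypothetical follow-up (not from the original post), the per-day segments make aggregation straightforward, for example summing worked hours per calendar day; unix_timestamp parses the generated strings with its default yyyy-MM-dd HH:mm:ss format:
select to_date(t.mid_start_date) as work_day,
       sum(unix_timestamp(t.mid_end_date) - unix_timestamp(t.mid_start_date)) / 3600.0 as worked_hours
from bst_bas_driver_info_work_time
lateral view datemap(date_format(work_start_time, 'yyyy-MM-dd HH:mm:ss'),
                     date_format(work_end_time, 'yyyy-MM-dd HH:mm:ss')) t as mid_start_date, mid_end_date
where id in ('2440780')
group by to_date(t.mid_start_date)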

In fact, the same result can also be achieved with posexplode.
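The sketch below is my reconstruction of the idea, assuming the same table and columns (the post linked below has the full version): space(n) produces a string of n spaces, split on ' ' turns it into n + 1 elements, and posexplode numbers them, so each position can be added to the start date with date_add:
select t.id,
       date_add(to_date(t.work_start_time), d.pos) as mid_date
from bst_bas_driver_info_work_time t
lateral view posexplode(split(space(datediff(to_date(t.work_end_time), to_date(t.work_start_time))), ' ')) d as pos, val
where t.id = '2440780'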

The effect is the same, except that the intermediate logic (turning each day into an exact begin/end segment) has to be handled separately; for that code see my other post: https://www.cnblogs.com/kopao/p/13750818.html
The focus here is on implementing the custom UDTF.
