1. WordCount in Hive
- explode: one-to-many; splits one row and returns multiple rows
- posexplode: one-to-many; splits one row, returns multiple rows, and also generates the corresponding index values
- aggregate functions: many-to-one; aggregate multiple rows and return a single row (a minimal sketch follows below)
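For contrast with explode, here is a minimal aggregate sketch; it uses the words table created later in this section, so run it after the table exists.
// many rows in, one row out
select count(*) from words;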
explode:
(1) Split the elements of an array into multiple rows
Example:
hive> select explode(array(1,2,3)) from XXX;
(2) Split the entries of a map into multiple rows
Example:
hive> select explode(map('k1','v1','k2','v2')) from lxw1234;
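// Result (explode on a map returns two columns, key and value)
k1 v1
k2 v2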
posexplode:
Splits elements into multiple rows and also produces a column of corresponding index values
Example:
hive> select posexplode(array(6,7,8)) from XXX;
Result:
0 6
1 7
2 8
// Create the table
create table words(
words string
)row format delimited fields terminated by '|';
// Data to load (a load statement follows the sample lines)
hello,java,hello,java,scala,python
hbase,hadoop,hadoop,hdfs,hive,hive
hbase,hadoop,hadoop,hdfs,hive,hive
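A load statement in the same pattern the note uses later would bring the file in; the local path here is an assumed example.
load data local inpath '/usr/local/soft/data/words.txt' into table words;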
// Split the data (this runs on the map side)
select split(words,",") from words;
// Result
["hello","java","hello","java","scala","python"] // each bracketed list is one row holding a single array-typed column, not separate rows
["hbase","hadoop","hadoop","hdfs","hive","hive"]
["hbase","hadoop","hadoop","hdfs","hive","hive"]
// Use explode to turn one row into multiple rows
select explode(split(words,",")) from words;
// Result
hello
java
hello
java
scala
python
hbase
hadoop
hadoop
hdfs
...
// Implement the word count
select word,count(*) from (select explode(split(words,',')) as word from words) t1 group by t1.word;
// Result
hadoop 4
hbase 2
hdfs 2
hello 2
hive 4
java 2
python 1
scala 1
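The same word count can also be written with lateral view explode instead of a subquery; this is an equivalent formulation, shown for comparison with section 3 below.
select w.word,count(*) from words lateral view explode(split(words,',')) w as word group by w.word;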
2. Hive user-defined functions (UDF)
(1) UDF: one value in, one value out (one-to-one)
- Create a Maven project and add the dependency
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.2.1</version>
</dependency>
If errors persist after adding the dependency, fix them as follows:
Close IDEA, locate the project's .idea directory on disk, delete it, then reopen IDEA.
- Write the code: extend org.apache.hadoop.hive.ql.exec.UDF and implement an evaluate method; put your own logic inside evaluate
// Requirement: convert the Chinese class numerals in the students table to digits
package com.shujia.Hive.UDF;
import org.apache.hadoop.hive.ql.exec.UDF;

public class MyUDF extends UDF {
    // A custom UDF extends the UDF class and implements an evaluate method
    public String evaluate(String clazz) {
        // replace each Chinese numeral with its digit
        String resStr = clazz.replace("一", "1");
        resStr = resStr.replace("二", "2");
        resStr = resStr.replace("三", "3");
        resStr = resStr.replace("四", "4");
        resStr = resStr.replace("五", "5");
        resStr = resStr.replace("六", "6");
        return resStr;
    }
}
- Package it into a jar and upload it to the Linux VM
- In the hive shell, run
add jar <path>
to add the jar as a resource to the Hive session
add jar /usr/local/soft/jars/Hive-1.0.jar;
- Register a temporary function from the jar resource; my_udf is our custom function name, and the name after as is the fully qualified class name
create temporary function my_udf as 'com.shujia.Hive.UDF.MyUDF';
- Use the function in SQL to process the data
select clazz,my_udf(clazz) from students limit 10;
文科六班 文科6班
文科六班 文科6班
理科六班 理科6班
理科三班 理科3班
理科五班 理科5班
理科二班 理科2班
文科六班 文科6班
理科六班 理科6班
理科一班 理科1班
理科六班 理科6班
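Note that a temporary function only lives for the current session. Hive 0.13+ can also register a permanent function straight from a jar kept on HDFS; the HDFS path below is an assumed example.
create function my_udf as 'com.shujia.Hive.UDF.MyUDF' using jar 'hdfs:///jars/Hive-1.0.jar';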
(2) UDTF: one row in, multiple rows out (one-to-many)
Case 1
"key1:value1,key2:value2,key3:value3"
key1 value1
key2 value2
key3 value3
Method 1: split + explode
1. First use split to turn the string into an array
select split("key1:value1,key2:value2,key3:value3",",");
// Result
["key1:value1","key2:value2","key3:value3"]
2. Then use explode to turn the array into multiple rows
select explode(split("key1:value1,key2:value2,key3:value3",","));
// Result
key1:value1
key2:value2
key3:value3
3. Split the step-2 result again on the colon and take the first and second elements by index
select split(t1.kv,":")[0] as key,split(t1.kv,":")[1] as value
from
(select explode(split("key1:value1,key2:value2,key3:value3",",")) as kv) t1;
// Result
key1 value1
key2 value2
key3 value3
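The first split can also be avoided entirely: Hive's built-in str_to_map parses the string straight into a map, and explode then emits the key/value rows (same result as above).
select explode(str_to_map("key1:value1,key2:value2,key3:value3",",",":"));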
Method 2: a custom UDTF (write the code)
- Write the UDTF code
package com.shujia.Hive.UDF;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;

public class MyUDTF extends GenericUDTF {
    // initialize runs once when the UDTF is invoked; it declares the output schema
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        // the next two lines define the first output column
        fieldNames.add("col1");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        // the second output column
        fieldNames.add("col2");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        // input looks like "key1:value1,key2:value2,key3:value3"
        for (Object arg : args) {
            String[] kvSplit = arg.toString().split(",");
            for (String kv : kvSplit) {
                String[] splits = kv.split(":"); // split on the colon
                String key = splits[0];          // the first element is the key
                String value = splits[1];        // the second element is the value
                ArrayList<String> kvList = new ArrayList<>();
                kvList.add(key);
                kvList.add(value);
                forward(kvList); // emit one output row
            }
        }
    }

    @Override
    public void close() throws HiveException {
    }
}
- Package it into a jar and upload it to the Linux VM
- In the hive shell, run
add jar <path>
to add the jar as a resource to the Hive session
add jar /usr/local/soft/jars/Hive-1.0.jar;
- Register a temporary function from the jar resource; my_udtf is our custom function name, and the name after as is the fully qualified class name
create temporary function my_udtf as 'com.shujia.Hive.UDF.MyUDTF';
- Use the function in SQL to process the data
select my_udtf("key1:value1,key2:value2,key3:value3");
Result
key1 value1
key2 value2
key3 value3
Case 2
Fields: id,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12 (13 columns in total)
Data:
a,1,2,3,4,5,6,7,8,9,10,11,12
b,11,12,13,14,15,16,17,18,19,20,21,22
c,21,22,23,24,25,26,27,28,29,30,31,32
Requirement: convert the data into 3 columns: id, hours, value
For example, the 12 value columns become 12 rows (the id field is not converted)
a,1,2,3,4,5,6,7,8,9,10,11,12
a,0時,1
a,2時,2
a,4時,3
a,6時,4
a,8時,5
......
a,22時,12
colx corresponds to (x-1)*2時, i.e. hour (x-1)*2
// Create the table
create table udtfData(
id string
,col1 string
,col2 string
,col3 string
,col4 string
,col5 string
,col6 string
,col7 string
,col8 string
,col9 string
,col10 string
,col11 string
,col12 string
)row format delimited fields terminated by ',';
// Load the data
load data local inpath '/usr/local/soft/data/udtfData.txt' into table udtfData;
Write the code:
package com.shujia.Hive.UDF;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;

public class MyUDTF2 extends GenericUDTF {
    // implement initialize to declare the two output columns
    @Override
    public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
        ArrayList<String> fieldNames = new ArrayList<String>();
        ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
        // the next two lines define the first output column
        fieldNames.add("hours");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        // the second output column
        fieldNames.add("value");
        fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
        return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
    }

    @Override
    public void process(Object[] args) throws HiveException {
        int hours = 0;
        // iterate over the arguments (col1..col12); each one becomes an output row
        for (Object arg : args) {
            String value = arg.toString();
            ArrayList<String> hoursValueList = new ArrayList<>();
            hoursValueList.add(hours + "時");
            hoursValueList.add(value);
            forward(hoursValueList);
            hours = hours + 2; // col1 -> 0時, col2 -> 2時, ...
        }
    }

    @Override
    public void close() throws HiveException {
    }
}
Add the jar resource:
add jar /usr/local/soft/jars/Hive-1.0.jar;
Register the UDTF:
create temporary function my_udtf2 as 'com.shujia.Hive.UDF.MyUDTF2';
SQL:
Note: a query like
select id,my_udtf2(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12)
from udtfData;
fails, because id and my_udtf2() cannot appear in the same select list; a UDTF has to be used through the lateral view keyword:
select id
,hours
,value
from udtfData
lateral view
my_udtf2(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12) t
as hours,value;
// the table alias (t) after my_udtf2() is required, and as must name every returned column (one alias per output column)
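The general pattern, as given in the Hive language manual, is:
select ...
from baseTable
lateral view udtf(expression) tableAlias as columnAlias1,columnAlias2,...;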
Execution result
a 0時 1
a 2時 2
a 4時 3
a 6時 4
a 8時 5
a 10時 6
a 12時 7
a 14時 8
a 16時 9
a 18時 10
a 20時 11
a 22時 12
b 0時 11
b 2時 12
b 4時 13
b 6時 14
b 8時 15
b 10時 16
b 12時 17
b 14時 18
b 16時 19
b 18時 20
b 20時 21
b 22時 22
c 0時 21
c 2時 22
c 4時 23
c 6時 24
c 8時 25
c 10時 26
c 12時 27
c 14時 28
c 16時 29
c 18時 30
c 20時 31
c 22時 32
Alternatively, do it directly in SQL with no custom code (use posexplode to generate the index and concat to build the hour string):
select id
,concat(index*2,"時") as hours
,value
from udtfData
lateral view
posexplode (array(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12)) t
as index,value;
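This query returns the same rows as the UDTF version above. One caveat: index is a keyword in some Hive releases, so if the alias is rejected, rename it (for example pos) or escape it with backticks.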
3. Hive: one row to many rows
- lateral view explode
// Create the table
create table testArray2(
name string,
weight array<string>
)row format delimited
fields terminated by '\t'
COLLECTION ITEMS terminated by ',';
// Data to load
志凱 "150","170","180"
上單 "150","180","190"
//SQL
select name,col1 from testarray2 lateral view explode(weight) t1 as col1;
// Result
志凱 150
志凱 170
志凱 180
上單 150
上單 180
上單 190
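If weight could be an empty array or NULL, plain lateral view explode would drop that name from the output; lateral view outer (available since Hive 0.12) keeps the row and returns NULL for the exploded column:
select name,col1 from testarray2 lateral view outer explode(weight) t1 as col1;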
// explode can also expand a map
select key from (select explode(map('key1',1,'key2',2,'key3',3)) as (key,value)) t;
key1
key2
key3
select name,col1,col2 from testarray2 lateral view explode(map('key1',1,'key2',2,'key3',3)) t1 as col1,col2;
志凱 key1 1
志凱 key2 2
志凱 key3 3
上單 key1 1
上單 key2 2
上單 key3 3
select name,pos,col1 from testarray2 lateral view posexplode(weight) t1 as pos,col1;
志凱 0 150
志凱 1 170
志凱 2 180
上單 0 150
上單 1 180
上單 2 190
4. Hive: many rows to one row
- collect_list
// Create the table
create table testLieToLine(
name string,
col1 int
)row format delimited
fields terminated by '\t';
// Data
name col1
志凱 150
志凱 170
志凱 180
上單 150
上單 180
上單 190
// Group by the first column and collect the second column into a list
select name,collect_list(col1) from testLieToLine group by name;
// Result
上單 [150,180,190]
志凱 [150,170,180]
// Or, equivalently, starting from testarray2
select t1.name
,collect_list(t1.col1)
from (
select name
,col1
from testarray2
lateral view explode(weight) t1 as col1
) t1 group by t1.name;
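A common companion to collect_list is collect_set with concat_ws, which de-duplicates the values and joins them into one string; concat_ws expects strings, hence the cast:
select name
,concat_ws(',',collect_set(cast(col1 as string))) as cols
from testLieToLine
group by name;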