1. WordCount in Hive
- explode: one-to-many; splits one row into multiple rows
- posexplode: one-to-many; splits one row into multiple rows and generates a matching index for each
- aggregate functions: many-to-one; aggregate multiple rows into a single row
explode:
(1) Splits the elements of an array into multiple rows
Example:
hive> select explode(array(1,2,3)) from XXX;
(2) Splits the entries of a map into multiple rows
Example:
hive> select explode(map('k1','v1','k2','v2')) from XXX;
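Result: exploding a map produces two columns, key and value:
k1 v1
k2 v2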
posexplode:
Splits the elements into multiple rows and also produces a column of corresponding index values
Example:
hive> select posexplode(array(6,7,8)) from XXX;
Result:
0 6
1 7
2 8
// create the table
create table words(
words string
)row format delimited fields terminated by '|';
// load the data; contents of the data file:
hello,java,hello,java,scala,python
hbase,hadoop,hadoop,hdfs,hive,hive
hbase,hadoop,hadoop,hdfs,hive,hive
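A minimal load statement for the file above, assuming it is saved at /usr/local/soft/data/words.txt (the path is an assumption; adjust it to your environment):
load data local inpath '/usr/local/soft/data/words.txt' into table words;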
// split the data (happens on the map side)
select split(words,",") from words;
// result
["hello","java","hello","java","scala","python"] // each line is a single array value (one row, one column), not multiple rows
["hbase","hadoop","hadoop","hdfs","hive","hive"]
["hbase","hadoop","hadoop","hdfs","hive","hive"]
// use explode to turn each row into multiple rows
select explode(split(words,",")) from words;
// result
hello
java
hello
java
scala
python
hbase
hadoop
hadoop
hdfs
...
// implement the wordcount requirement
select word,count(*) from (select explode(split(words,',')) as word from words) t1 group by t1.word;
// result
hadoop 4
hbase 2
hdfs 2
hello 2
hive 4
java 2
python 1
scala 1
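A variant sorted by frequency; a sketch (note that order by performs the global sort in a single reducer):
select t1.word
,count(*) as cnt
from (select explode(split(words,',')) as word from words) t1
group by t1.word
order by cnt desc;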
2. Hive User-Defined Functions
(1) UDF: one in, one out (one-to-one)
- Create a Maven project and add the dependency:
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.2.1</version>
</dependency>
If errors keep appearing after adding the dependency, try the following:
Close IDEA, locate this project's .idea directory on disk, delete it, then reopen IDEA
- Write the code: extend org.apache.hadoop.hive.ql.exec.UDF and implement an evaluate method containing your own logic
// requirement: convert the Chinese class numerals in the students table to digits
package com.shujia.Hive.UDF;
import org.apache.hadoop.hive.ql.exec.UDF;
public class MyUDF extends UDF {
// a custom UDF extends the UDF class and implements an evaluate method
public String evaluate(String clazz) {
// result string, initialized to an empty string
String resStr = "";
resStr = clazz.replace("一", "1");
resStr = resStr.replace("二", "2");
resStr = resStr.replace("三", "3");
resStr = resStr.replace("四", "4");
resStr = resStr.replace("五", "5");
resStr = resStr.replace("六", "6");
return resStr;
}
}
- Package the project into a jar and upload it to the Linux VM
- In the hive shell, use
add jar <path>
to add the jar as a resource to the Hive session
add jar /usr/local/soft/jars/Hive-1.0.jar;
- Register a temporary function from the jar resource; my_udf is the custom function name, and the fully qualified class name follows as
create temporary function my_udf as 'com.shujia.Hive.UDF.MyUDF';
- Use the function in SQL to process the data
select clazz,my_udf(clazz) from students limit 10;
文科六班 文科6班
文科六班 文科6班
理科六班 理科6班
理科三班 理科3班
理科五班 理科5班
理科二班 理科2班
文科六班 文科6班
理科六班 理科6班
理科一班 理科1班
理科六班 理科6班
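Once registered, the UDF composes with other clauses like any built-in function. A sketch that counts students per converted class name, assuming the same students table:
select my_udf(clazz) as clazz_num
,count(*) as num
from students
group by my_udf(clazz);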
(2) UDTF: one in, many out (one-to-many)
Case 1
Turn the string "key1:value1,key2:value2,key3:value3" into:
key1 value1
key2 value2
key3 value3
Method 1: split + explode
1. Use split to turn the string into an array
select split("key1:value1,key2:value2,key3:value3",",");
// result
["key1:value1","key2:value2","key3:value3"]
2. Then use explode to turn the array into multiple rows
select explode(split("key1:value1,key2:value2,key3:value3",","));
// result
key1:value1
key2:value2
key3:value3
3. Split the rows from step 2 again on ":" and take the first and second elements by index
select split(t1.kv,":")[0] as key,split(t1.kv,":")[1] as value
from
(select explode(split("key1:value1,key2:value2,key3:value3",","))as kv) t1;
// result
key1 value1
key2 value2
key3 value3
Method 2: write a custom UDTF
- Write the UDTF code
package com.shujia.Hive.UDF;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
public class MyUDTF extends GenericUDTF {
@Override
// initialize is called once when the UDTF is invoked; it declares the output columns
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
// the next two lines define the first output column
fieldNames.add("col1");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
// the second output column
fieldNames.add("col2");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
fieldOIs);
}
@Override
public void process(Object[] args) throws HiveException {
// "key1:value1,key2:value2,key3:value3"
for (Object arg : args) {
String[] kvSplit = arg.toString().split(",");
for (String kv : kvSplit) {
String[] splits = kv.split(":"); // split on ":"
String key = splits[0]; // the first element is the key
String value = splits[1]; // the second element is the value
ArrayList<String> kvList = new ArrayList<>();
kvList.add(key);
kvList.add(value);
forward(kvList);
}
}
}
@Override
public void close() throws HiveException {
}
}
- Package the project into a jar and upload it to the Linux VM
- In the hive shell, use
add jar <path>
to add the jar as a resource to the Hive session
add jar /usr/local/soft/jars/Hive-1.0.jar;
- Register a temporary function from the jar resource; my_udtf is the custom function name, and the fully qualified class name follows as
create temporary function my_udtf as 'com.shujia.Hive.UDF.MyUDTF';
- Use the function in SQL to process the data
select my_udtf("key1:value1,key2:value2,key3:value3");
Result:
key1 value1
key2 value2
key3 value3
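To apply the UDTF to a table column instead of a string literal, pair it with lateral view, which Case 2 below covers. A sketch against a hypothetical table kv_table with a string column kvs:
select t.k
,t.v
from kv_table
lateral view my_udtf(kvs) t as k,v;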
Case 2
Columns: id,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12 (13 columns in total)
Data:
a,1,2,3,4,5,6,7,8,9,10,11,12
b,11,12,13,14,15,16,17,18,19,20,21,22
c,21,22,23,24,25,26,27,28,29,30,31,32
Requirement: convert to 3 columns: id,hours,value
For example, the 12 value columns of the following row become 12 rows (the id column is not converted):
a,1,2,3,4,5,6,7,8,9,10,11,12
a,0时,1
a,2时,2
a,4时,3
a,6时,4
a,8时,5
......
a,22时,12
colx corresponds to (x-1)*2时, i.e. hour (x-1)*2
// create the table
create table udtfData(
id string
,col1 string
,col2 string
,col3 string
,col4 string
,col5 string
,col6 string
,col7 string
,col8 string
,col9 string
,col10 string
,col11 string
,col12 string
)row format delimited fields terminated by ',';
// load the data
load data local inpath '/usr/local/soft/data/udtfData.txt' into table udtfData;
Write the code:
package com.shujia.Hive.UDF;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import java.util.ArrayList;
public class MyUDTF2 extends GenericUDTF{
// implement the initialize method
@Override
public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException {
ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
// the next two lines define the first output column
fieldNames.add("hours");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
// the second output column
fieldNames.add("value");
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,
fieldOIs);
}
@Override
public void process(Object[] args) throws HiveException {
int hours = 0;
// iterate over the arguments: the 12 column values, in order
for (Object arg : args) {
String value = arg.toString();
ArrayList<String> hoursValueList = new ArrayList<>();
hoursValueList.add(hours + "时");
hoursValueList.add(value);
forward(hoursValueList);
hours = hours + 2;
}
}
@Override
public void close() throws HiveException {
}
}
Add the jar resource:
add jar /usr/local/soft/jars/Hive-1.0.jar;
Register the UDTF:
create temporary function my_udtf2 as 'com.shujia.Hive.UDF.MyUDTF2';
SQL:
Note: the following fails, because an ordinary column (id) and a UDTF call cannot appear in the same select list; a UDTF has to be used with the lateral view keyword:
select id,my_udtf2(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12)
from udtfData;
select id
,hours
,value
from udtfData
lateral view
my_udtf2(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12) t
as hours,value ;
// the table alias after my_udtf2() is required, and as must be followed by one alias per returned column
Result:
a 0时 1
a 2时 2
a 4时 3
a 6时 4
a 8时 5
a 10时 6
a 12时 7
a 14时 8
a 16时 9
a 18时 10
a 20时 11
a 22时 12
b 0时 11
b 2时 12
b 4时 13
b 6时 14
b 8时 15
b 10时 16
b 12时 17
b 14时 18
b 16时 19
b 18时 20
b 20时 21
b 22时 22
c 0时 21
c 2时 22
c 4时 23
c 6时 24
c 8时 25
c 10时 26
c 12时 27
c 14时 28
c 16时 29
c 18时 30
c 20时 31
c 22时 32
Alternatively, skip the custom code and use plain SQL (posexplode generates the index values, concat builds the hour string):
select id
,concat(index*2,"时") as hours
,value
from udtfData
lateral view
posexplode (array(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12)) t
as index,value;
3. Hive row-to-column (one row into multiple rows)
- lateral view explode
// create the table
create table testArray2(
name string,
weight array<string>
)row format delimited
fields terminated by '\t'
COLLECTION ITEMS terminated by ',';
// load the data (file contents):
志凯 "150","170","180"
上单 "150","180","190"
//SQL
select name,col1 from testarray2 lateral view explode(weight) t1 as col1;
// result
志凯 150
志凯 170
志凯 180
上单 150
上单 180
上单 190
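Note that a plain lateral view drops rows whose array is empty or NULL; lateral view outer keeps them, filling the generated column with NULL. A sketch:
select name,col1 from testarray2 lateral view outer explode(weight) t1 as col1;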
// explode also works on map()
select key from (select explode(map('key1',1,'key2',2,'key3',3)) as (key,value)) t;
key1
key2
key3
select name,col1,col2 from testarray2 lateral view explode(map('key1',1,'key2',2,'key3',3)) t1 as col1,col2;
志凯 key1 1
志凯 key2 2
志凯 key3 3
上单 key1 1
上单 key2 2
上单 key3 3
select name,pos,col1 from testarray2 lateral view posexplode(weight) t1 as pos,col1;
志凯 0 150
志凯 1 170
志凯 2 180
上单 0 150
上单 1 180
上单 2 190
4. Hive column-to-row (multiple rows into one row)
- collect_list
// create the table
create table testLieToLine(
name string,
col1 int
)row format delimited
fields terminated by '\t';
// data
name col1
志凯 150
志凯 170
志凯 180
上单 150
上单 180
上单 190
// group by the first column and collect the second into an array
select name,collect_list(col1) from testLieToLine group by name;
// result
上单 [150,180,190]
志凯 [150,170,180]
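If a delimited string is preferred over an array, wrap collect_list in concat_ws; a sketch (the cast is needed because col1 is int while concat_ws expects strings):
select name
,concat_ws(',',collect_list(cast(col1 as string))) as cols
from testLieToLine
group by name;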
// or, equivalently, exploding testarray2 first and then collecting:
select t1.name
,collect_list(t1.col1)
from (
select name
,col1
from testarray2
lateral view explode(weight) t1 as col1
) t1 group by t1.name;