Hive is a data warehouse tool built on Hadoop. It maps structured data files onto database tables and provides SQL-like query capabilities. This article walks through some basic Hive operations; corrections are welcome.
Common statements
#Show metadata
show tables;
show databases;
show partitions table_name;
show functions;
desc extended table_name;
desc formatted table_name;
#Create a database
create database test_db;
#Drop a database
drop database db_name;
#Drop a table
drop table table_name;
#Rename a table
ALTER TABLE table_name RENAME TO new_table_name;
#Truncate a table (delete all rows)
truncate table table_name;
CREATE TABLE syntax
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name
[(col_name data_type [COMMENT col_comment], ...)]
[COMMENT table_comment]
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)]
[CLUSTERED BY (col_name, col_name, ...)
[SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
Creating an internal (managed) table
create table if not exists my_tb(id int,name string)
row format delimited fields terminated by ',';
Creating an external table
#An external table must specify the HDFS path where its data is located
create external table if not exists my_ex_tb(id int,name string)
row format delimited fields terminated by ','
location 'hdfs://192.168.38.3:9000/externdb/my_ex_tb/';
When a table is dropped, a managed (internal) table loses both its metadata and its data, whereas for an external table only the metadata is removed and the data stays on HDFS.
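As an illustrative sketch of that difference (not part of the steps below), dropping the external table created above would leave its files in place; the hdfs command is run from a shell, not from the Hive CLI:
#the metastore entry is gone after the drop
drop table my_ex_tb;
#...but the data files are still on HDFS
hdfs dfs -ls hdfs://192.168.38.3:9000/externdb/my_ex_tb/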
Loading data into the table directory
#Append to the table (into) or replace the whole table's contents (overwrite into)
load data local inpath '/root/1.txt' into table my_ex_tb;
load data local inpath '/root/1.txt' overwrite into table my_ex_tb;
Creating a partitioned table
create table if not exists my_par_tb(id int,name string)
partitioned by(country string)
row format delimited fields terminated by ',';
load data local inpath '/root/1.txt' into table my_par_tb partition(country='China');
load data local inpath '/root/1.txt.us' into table my_par_tb partition(country='US');
#Data in 1.txt
1,張三
2,李四
3,王五
#Data in 1.txt.us
1,張三
2,李四
3,王五
#Result of select * from my_par_tb
1 張三 China
2 李四 China
3 王五 China
1 張三 US
2 李四 US
3 王五 US
#Query the data in one partition
select * from my_par_tb where country='China';
1 張三 China
2 李四 China
3 王五 China
Adding and dropping partitions
#Add partitions
alter table my_par_tb add partition(country='Eng') partition(country='Ame');
#Drop partitions (note the comma between partition specs)
alter table my_par_tb drop partition(country='Eng'), partition(country='Ame');
#Show the table's partitions
show partitions my_par_tb;
country=China
country=US
Creating a bucketed table
create table if not exists my_buck_tb(id int,name string)
clustered by(id) sorted by(id)
into 4 buckets
row format delimited fields terminated by ',';
#Enable bucketing enforcement so inserts honor the table's bucket definition
set hive.enforce.bucketing=true;
#Set the number of reducers equal to the number of buckets, so the data selected from the source table is written out as one file per bucket
set mapreduce.job.reduces=4;
#Insert the data queried from my_tb into the bucketed table;
#cluster by(id) partitions and sorts the map output by id (cluster by is equivalent to distribute by + sort by on the same column)
insert into table my_buck_tb
select id,name from my_tb cluster by(id);
Saving query results
By default query results are only displayed on screen; they can also be saved into a table or a directory.
#Save the query result into a newly created table
create table tmp_tb as select * from my_tb;
#Save the query result into an existing table
insert into table tmp_tb select * from my_tb;
#Save the query result to a directory (local or on HDFS)
#Local
insert overwrite local directory '/root/out_tb/'
select * from my_tb;
#HDFS
insert overwrite directory '/out_tb/'
select * from my_tb;
Join operations
Data in table a:
1,張三
2,李四
3,c
4,a
5,e
6,r
Data in table b:
1,綠間
3,青峰
4,黑子
9,紅發
Create the tables:
create table a(id int,name string)
row format delimited fields terminated by ',';
create table b(id int,name string)
row format delimited fields terminated by ',';
Load the data:
load data local inpath '/root/a.txt' into table a;
load data local inpath '/root/b.txt' into table b;
#Inner join (intersection)
select * from a inner join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | 張三 | 1 | 綠間 |
| 3 | c | 3 | 青峰 |
| 4 | a | 4 | 黑子 |
+-------+---------+-------+---------+--+
#Left join
select * from a left join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | 張三 | 1 | 綠間 |
| 2 | 李四 | NULL | NULL |
| 3 | c | 3 | 青峰 |
| 4 | a | 4 | 黑子 |
| 5 | e | NULL | NULL |
| 6 | r | NULL | NULL |
+-------+---------+-------+---------+--+
#Right join
select * from a right join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | 張三 | 1 | 綠間 |
| 3 | c | 3 | 青峰 |
| 4 | a | 4 | 黑子 |
| NULL | NULL | 9 | 紅發 |
+-------+---------+-------+---------+--+
#Full outer join
select * from a full outer join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id | a.name | b.id | b.name |
+-------+---------+-------+---------+--+
| 1 | 張三 | 1 | 綠間 |
| 2 | 李四 | NULL | NULL |
| 3 | c | 3 | 青峰 |
| 4 | a | 4 | 黑子 |
| 5 | e | NULL | NULL |
| 6 | r | NULL | NULL |
| NULL | NULL | 9 | 紅發 |
+-------+---------+-------+---------+--+
#Left semi join (from the inner join result, only the left table's columns are returned)
select * from a left semi join b on a.id = b.id;
+-------+---------+--+
| a.id | a.name |
+-------+---------+--+
| 1 | 張三 |
| 3 | c |
| 4 | a |
+-------+---------+--+
SELECT query syntax
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_name[..join..on(a.id=b.id)]
[WHERE where_condition]
[GROUP BY col_list [HAVING condition]]
[ORDER BY col_list]
[CLUSTER BY col_list
| [DISTRIBUTE BY col_list] [SORT BY col_list]
]
[LIMIT number]
#cluster by(id) makes the map output partitioned and sorted by id (cluster by is equivalent to distribute by + sort by on the same column)
select id,name from my_tb cluster by(id);
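A couple more examples exercising the clauses above, run against the tables defined earlier in this article (results omitted):
#filter and limit
select id,name from a where id>2 limit 3;
#count rows per partition value, keep only groups with more than two rows, and sort by that count
select country,count(*) as cnt from my_par_tb group by country having count(*)>2 order by cnt desc;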
User-defined functions (UDF)
Hive built-in functions:
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF
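A few of those built-ins applied to table a from the join section (results omitted; all of these are standard Hive functions):
select upper(name),length(name),concat(name,'_x'),substr(name,1,1) from a;
select count(*),max(id),round(avg(id),2) from a;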
POM file
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.xiaojie.mm</groupId>
    <artifactId>my_hive</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>
        <hadoop.version>2.6.5</hadoop.version>
        <hive.version>1.2.1</hive.version>
    </properties>

    <dependencies>
        <!-- Hadoop -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- Hive -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-metastore</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-pdk</artifactId>
            <version>0.10.0</version>
        </dependency>
        <dependency>
            <groupId>javax.jdo</groupId>
            <artifactId>jdo2-api</artifactId>
            <version>2.3-eb</version>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.1.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.7</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.7</version>
            <scope>system</scope>
            <systemPath>/home/miao/apps/install/jdk1.7.0_45/lib/tools.jar</systemPath>
        </dependency>
    </dependencies>
</project>
A custom UDF that converts uppercase to lowercase
package com.xiaojie.mm;
import org.apache.hadoop.hive.ql.exec.UDF;
public class ToLower extends UDF{
    // Hive finds evaluate() by reflection; it can be overloaded for other argument types
    public String evaluate(String field) {
        // guard against NULL input values
        return field == null ? null : field.toLowerCase();
    }
}
Export the jar and copy it to the machine where Hive runs
scp tolower.jar mini1:/root/apps/
Registering the custom function in the Hive client
#Step 1: add the jar
add JAR /root/apps/tolower.jar;
#Step 2: the quoted string is the fully qualified class name (a temporary function is only valid in the current session)
create temporary function tolower as 'com.xiaojie.mm.ToLower';
#Step 3: use it
select * from a;
+-------+---------+--+
| a.id | a.name |
+-------+---------+--+
| 7 | AAAAA |
| 1 | 張三 |
| 2 | 李四 |
| 3 | c |
| 4 | a |
| 5 | e |
| 6 | r |
+-------+---------+--+
select id,tolower(name) from a;
+-----+--------+--+
| id | _c1 |
+-----+--------+--+
| 7 | aaaaa |
| 1 | 張三 |
| 2 | 李四 |
| 3 | c |
| 4 | a |
| 5 | e |
| 6 | r |
+-----+--------+--+
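The temporary function above only lives for the current session. Since Hive 0.13 a function can also be registered permanently from a jar; a minimal sketch, assuming the jar has been uploaded to the hypothetical HDFS path shown:
#upload the jar to HDFS (run from a shell); the /udf/ path is only an example
hdfs dfs -put /root/apps/tolower.jar /udf/
#register a permanent function backed by that jar
create function tolower as 'com.xiaojie.mm.ToLower' using jar 'hdfs://192.168.38.3:9000/udf/tolower.jar';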
A custom UDF that returns the home location of a phone number
package com.xiaojie.mm;
import java.util.HashMap;
import org.apache.hadoop.hive.ql.exec.UDF;
public class GetProvince extends UDF{
    public static HashMap<String,String> provinceMap = new HashMap<String,String>();

    // map a phone-number prefix to its home location
    static {
        provinceMap.put("183", "hangzhou");
        provinceMap.put("186", "nanjing");
        provinceMap.put("187", "suzhou");
        provinceMap.put("188", "ningbo");
    }

    public String evaluate(int phonenumber) {
        String phone_num = String.valueOf(phonenumber);
        // take the first three digits of the phone number
        String phone = phone_num.substring(0, 3);
        return provinceMap.get(phone) == null ? "未知" : provinceMap.get(phone);
    }
}
Original data:
+----------------------+---------------------+--+
| flow_province.phone | flow_province.flow |
+----------------------+---------------------+--+
| 1837878 | 12m |
| 1868989 | 13m |
| 1878989 | 14m |
| 1889898 | 15m |
| 1897867 | 16m |
| 1832323 | 78m |
| 1858767 | 88m |
| 1862343 | 99m |
| 1893454 | 77m |
+----------------------+---------------------+--+
After registering the class as a function (named getpro here, using the same add jar / create temporary function steps as above) and calling it:
select phone,getpro(phone),flow from flow_province;
+----------+-----------+-------+--+
| phone | _c1 | flow |
+----------+-----------+-------+--+
| 1837878 | hangzhou | 12m |
| 1868989 | nanjing | 13m |
| 1878989 | suzhou | 14m |
| 1889898 | ningbo | 15m |
| 1897867 | 未知 | 16m |
| 1832323 | hangzhou | 78m |
| 1858767 | 未知 | 88m |
| 1862343 | nanjing | 99m |
| 1893454 | 未知 | 77m |
+----------+-----------+-------+--+
A custom UDF that parses JSON data
#Create the table
create table json_tb(line string);
#Load the data
load data local inpath '/root/test_data/a.json' into table json_tb;
#Show the raw data
select line from json_tb limit 10;
+----------------------------------------------------------------+--+
| json_tb.line |
+----------------------------------------------------------------+--+
| {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"} |
| {"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"} |
| {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"} |
| {"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"} |
| {"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"} |
| {"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"} |
| {"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"} |
| {"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"} |
| {"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"} |
| {"movie":"919","rate":"4","timeStamp":"978301368","uid":"1"} |
+----------------------------------------------------------------+--+
#The custom UDF
package com.xiaojie.mm;
import org.apache.hadoop.hive.ql.exec.UDF;
import parquet.org.codehaus.jackson.map.ObjectMapper;
public class JsonParse extends UDF{
    public String evaluate(String jsonLine) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            // bind the JSON line to a MovieBean and return its tab-separated string form
            MovieBean bean = objectMapper.readValue(jsonLine, MovieBean.class);
            return bean.toString();
        } catch (Exception e) {
            // malformed JSON is ignored and yields an empty string
        }
        return "";
    }
}
package com.xiaojie.mm;
public class MovieBean {
    // movie id
    private String movie;
    // movie rating
    private String rate;
    // rating timestamp
    private String timeStamp;
    // user id
    private String uid;

    public String getMovie() {
        return movie;
    }
    public void setMovie(String movie) {
        this.movie = movie;
    }
    public String getRate() {
        return rate;
    }
    public void setRate(String rate) {
        this.rate = rate;
    }
    public String getTimeStamp() {
        return timeStamp;
    }
    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }
    public String getUid() {
        return uid;
    }
    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return this.movie + "\t" + this.rate + "\t" + this.timeStamp + "\t" + this.uid;
    }
}
#Build the jar, upload it to the machine running Hive, and create the function
add JAR /root/test_data/json_parse.jar;
create temporary function json_parse as 'com.xiaojie.mm.JsonParse';
#Use the custom JSON parsing function
select json_parse(line) from json_tb limit 10;
+---------------------+--+
| _c0 |
+---------------------+--+
| 1193 5 978300760 1 |
| 661 3 978302109 1 |
| 914 3 978301968 1 |
| 3408 4 978300275 1 |
| 2355 5 978824291 1 |
| 1197 3 978302268 1 |
| 1287 5 978302039 1 |
| 2804 5 978300719 1 |
| 594 4 978302268 1 |
| 919 4 978301368 1 |
+---------------------+--+
#Save the parsed JSON fields into a newly created table
create table json_parse_tb as
select split(json_parse(line),'\t')[0] as movieid,
split(json_parse(line),'\t')[1] as rate,
split(json_parse(line),'\t')[2] as time,
split(json_parse(line),'\t')[3] as userid
from json_tb limit 100;
#Built-in JSON function (get_json_object), no custom UDF needed
select get_json_object(line,'$.movie') as movieid,
get_json_object(line,'$.rate') as rate,
get_json_object(line,'$.timeStamp') as time,
get_json_object(line,'$.uid') as userid
from json_tb limit 10;
Transform (calling a custom script)
Hive's TRANSFORM keyword lets you call your own script from within SQL, which is handy when you need something Hive does not provide and do not want to write a UDF.
Custom Python script (vim time_parse.py)
#!/bin/python
import sys
import datetime

# read tab-separated rows from stdin and replace the unix timestamp with the weekday
for line in sys.stdin:
    line = line.strip()
    movieid, rate, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rate, str(weekday), userid])
Add the py file to Hive's working directory
add file time_parse.py;
Use TRANSFORM to call the custom Python code
#The columns listed inside TRANSFORM(...) are read from json_parse_tb and piped to the script, which emits the columns named after AS
create TABLE json_parse_time_tb as
SELECT
TRANSFORM (movieid, rate, time, userid)
USING 'python time_parse.py'
AS (movieid, rate, weekday, userid)
FROM json_parse_tb;
View the new table's data
select * from json_parse_time_tb;
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
| json_parse_time_tb.movieid | json_parse_time_tb.rate | json_parse_time_tb.weekday | json_parse_time_tb.userid |
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
| 1690 | 3 | 1 | 2 |
| 589 | 4 | 1 | 2 |
| 3471 | 5 | 1 | 2 |
| 1834 | 4 | 1 | 2 |
| 2490 | 3 | 1 | 2 |
| 2278 | 3 | 1 | 2 |
| 110 | 5 | 1 | 2 |
| 3257 | 3 | 1 | 2 |
| 3256 | 2 | 1 | 2 |
| 3255 | 4 | 1 | 2 |
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
Case study
Original data (username, month, clicks):
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
A,2015-02,4
A,2015-02,6
B,2015-02,10
B,2015-02,5
Goal: for each user, compute the clicks per month and the running cumulative total.
Step 1: create the table and load the data
#Create the table
create table click_tb(username string,month string,click int)
row format delimited fields terminated by ',';
#Load the data
load data local inpath '/root/test_data/click.txt' into table click_tb;
Step 2: compute each user's clicks per month
select username,month,sum(click) as click_count from click_tb group by username,month;
+-----------+----------+--------------+--+
| username | month | click_count |
+-----------+----------+--------------+--+
| A | 2015-01 | 33 |
| A | 2015-02 | 10 |
| B | 2015-01 | 30 |
| B | 2015-02 | 15 |
+-----------+----------+--------------+--+
Step 3: inner join the monthly result with itself
select * from
(select username,month,sum(click) as click_count from click_tb group by username,month) A
inner join
(select username,month,sum(click) as click_count from click_tb group by username,month) B
on
A.username=B.username;
+-------------+----------+----------------+-------------+----------+----------------+--+
| a.username | a.month | a.click_count | b.username | b.month | b.click_count |
+-------------+----------+----------------+-------------+----------+----------------+--+
| A | 2015-01 | 33 | A | 2015-01 | 33 |
| A | 2015-01 | 33 | A | 2015-02 | 10 |
| A | 2015-02 | 10 | A | 2015-01 | 33 |
| A | 2015-02 | 10 | A | 2015-02 | 10 |
| B | 2015-01 | 30 | B | 2015-01 | 30 |
| B | 2015-01 | 30 | B | 2015-02 | 15 |
| B | 2015-02 | 15 | B | 2015-01 | 30 |
| B | 2015-02 | 15 | B | 2015-02 | 15 |
+-------------+----------+----------------+-------------+----------+----------------+--+
Step 4: compute the final result. For each (user, month) row from alias a, the join attaches every month of the same user from alias b; keeping only the rows with b.month <= a.month and summing b.click_count gives the cumulative total, while min(a.click_count) simply recovers that month's own count.
select a.username,a.month,min(a.click_count) as click_count,sum(b.click_count) as sum_count from
(select username,month,sum(click) as click_count from click_tb group by username,month) a
inner join
(select username,month,sum(click) as click_count from click_tb group by username,month) b
on
a.username=b.username
where b.month<=a.month
group by a.username,a.month
order by a.username,a.month;
+-------------+----------+--------------+------------+--+
| a.username | a.month | click_count | sum_count |
+-------------+----------+--------------+------------+--+
| A | 2015-01 | 33 | 33 |
| A | 2015-02 | 10 | 43 |
| B | 2015-01 | 30 | 30 |
| B | 2015-02 | 15 | 45 |
+-------------+----------+--------------+------------+--+
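On Hive 0.11 and later the same cumulative total can be computed without the self-join by using a window function; a minimal sketch against the same click_tb table:
select username,month,click_count,
sum(click_count) over(partition by username order by month) as sum_count
from (select username,month,sum(click) as click_count from click_tb group by username,month) t
order by username,month;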