Getting Started with Hive


Hive is a data warehouse tool built on top of Hadoop. It maps structured data files to database tables and provides SQL-like querying. This article walks through some basic Hive operations; corrections are welcome.

Common syntax

#show information
show tables;
show databases;
show partitions table_name;
show functions;
desc extended table_name;
desc formatted table_name;
#create a database
create database test_db;
#drop a database
drop database db_name;
#drop a table
drop table table_name;
#rename a table
ALTER TABLE table_name RENAME TO new_table_name;
#remove all rows from a table
truncate table table_name;

CREATE TABLE syntax

CREATE [EXTERNAL] TABLE [IF NOT EXISTS] table_name 
   [(col_name data_type [COMMENT col_comment], ...)] 
   [COMMENT table_comment] 
   [PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)] 
   [CLUSTERED BY (col_name, col_name, ...) 
   [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS] 
   [ROW FORMAT row_format] 
   [STORED AS file_format] 
   [LOCATION hdfs_path]

Creating a managed (internal) table

create table if not exists my_tb(id int,name string)
row format delimited fields terminated by ',';

Creating an external table

#an external table needs the EXTERNAL keyword and a location pointing at its data
create external table if not exists my_ex_tb(id int,name string)
row format delimited fields terminated by ','
location 'hdfs://192.168.38.3:9000/externdb/my_ex_tb/';

When a table is dropped, a managed table's metadata and data are deleted together, whereas dropping an external table removes only the metadata and leaves the data files in place.
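
You can confirm which kind a table is with desc formatted (listed in the common syntax above): it reports the Table Type (MANAGED_TABLE vs. EXTERNAL_TABLE) and the data Location.

desc formatted my_tb;
desc formatted my_ex_tb;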

Loading data into a table's directory

#append to the table (into) or replace its entire contents (overwrite into)
load data local inpath '/root/1.txt' into table my_ex_tb;
load data local inpath '/root/1.txt' overwrite into table my_ex_tb;

Creating a partitioned table

create table if not exists my_par_tb(id int,name string)
partitioned by(country string)
row format delimited fields terminated by ',';

load data local inpath '/root/1.txt' into table my_par_tb partition(country='China');
load data local inpath '/root/1.txt.us' into table my_par_tb partition(country='US');

#contents of 1.txt

    1,張三
    2,李四
    3,王五
    
#contents of 1.txt.us

    1,張三
    2,李四
    3,王五

#output of select * from my_par_tb (the partition column country appears as an extra column)

    1	張三	China
    2	李四	China
    3	王五	China
    1	張三	US
    2	李四	US
    3	王五	US
    
#query the data in a single partition

select * from my_par_tb where country='China';

    1	張三	China
    2	李四	China
    3	王五	China

Adding and dropping partitions

#add partitions
alter table my_par_tb add partition(country='Eng') partition(country='Ame');
#drop partitions (note the comma between partition specs)
alter table my_par_tb drop partition(country='Eng'), partition(country='Ame');

#show the table's partitions
show partitions my_par_tb;

country=China
country=US

Creating a bucketed table

create table if not exists my_buck_tb(id int,name string)
clustered by(id) sorted by(id)
into 4 buckets
row format delimited fields terminated by ',';

#enforce bucketing on insert
set hive.enforce.bucketing=true;
#set one reducer per bucket, so the rows selected from the source table are written out as one file per bucket
set mapreduce.job.reduces=4;

#insert into the bucketed table from a query over my_tb
insert into table my_buck_tb
#cluster by id partitions and sorts the map output by id (cluster by is equivalent to distribute by + sort by)
select id,name from my_tb cluster by(id);
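
To verify the bucketing, list the table's directory from the Hive CLI; there should be one file per bucket. The path below is a sketch that assumes the default database and warehouse location.

#four buckets -> four bucket files
dfs -ls /user/hive/warehouse/my_buck_tb;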

Saving query results

By default query results are printed to the screen; they can also be saved into a table or a directory.

#save the result into a newly created table
create table tmp_tb as select * from my_tb;

#save the result into an existing table
insert into table tmp_tb select * from my_tb;

#save the result to a directory (local or on HDFS)
#local
insert overwrite local directory '/root/out_tb/'
select * from my_tb;
#hdfs
insert overwrite directory '/out_tb/'
select * from my_tb;
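
From Hive 0.11 on, the output delimiter of a directory export can also be specified; a minimal sketch:

insert overwrite local directory '/root/out_tb/'
row format delimited fields terminated by ','
select * from my_tb;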

Join operations

Data in table a:

1,張三
2,李四
3,c
4,a
5,e
6,r

Data in table b:

1,綠間
3,青峰
4,黑子
9,紅發

Create the tables:
create table a(id int,name string)
row format delimited fields terminated by ',';

create table b(id int,name string)
row format delimited fields terminated by ',';

Load the data:
load data local inpath '/root/a.txt' into table a;
load data local inpath '/root/b.txt' into table b;

#inner join (rows whose keys appear in both tables)
select * from a inner join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 張三      | 1     | 綠間      |
| 3     | c       | 3     | 青峰      |
| 4     | a       | 4     | 黑子      |
+-------+---------+-------+---------+--+

#left join
select * from a left join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 張三      | 1     | 綠間      |
| 2     | 李四      | NULL  | NULL    |
| 3     | c       | 3     | 青峰      |
| 4     | a       | 4     | 黑子      |
| 5     | e       | NULL  | NULL    |
| 6     | r       | NULL  | NULL    |
+-------+---------+-------+---------+--+

#right join
select * from a right join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 張三      | 1     | 綠間      |
| 3     | c       | 3     | 青峰      |
| 4     | a       | 4     | 黑子      |
| NULL  | NULL    | 9     | 紅發      |
+-------+---------+-------+---------+--+

#full outer join
select * from a full outer join b on a.id=b.id;
+-------+---------+-------+---------+--+
| a.id  | a.name  | b.id  | b.name  |
+-------+---------+-------+---------+--+
| 1     | 張三      | 1     | 綠間      |
| 2     | 李四      | NULL  | NULL    |
| 3     | c       | 3     | 青峰      |
| 4     | a       | 4     | 黑子      |
| 5     | e       | NULL  | NULL    |
| 6     | r       | NULL  | NULL    |
| NULL  | NULL    | 9     | 紅發      |
+-------+---------+-------+---------+--+


#left semi join (the inner-join result restricted to the left table's columns)
select * from a left semi join b on a.id = b.id;
+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 1     | 張三      |
| 3     | c       |
| 4     | a       |
+-------+---------+--+
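
A left semi join behaves like an IN subquery on the join key; this sketch returns the same three rows:

select * from a where a.id in (select id from b);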

SELECT statement syntax

SELECT [ALL | DISTINCT] select_expr, select_expr, ... 
FROM table_name [JOIN table_name ON (join_condition)] 
[WHERE where_condition] 
[GROUP BY col_list [HAVING condition]] 
[CLUSTER BY col_list 
  | [DISTRIBUTE BY col_list] [SORT BY col_list] 
  | [ORDER BY col_list] 
] 
[LIMIT number]

#cluster by id partitions and sorts the map output by id (cluster by is equivalent to distribute by + sort by)
select id,name from my_tb cluster by(id);
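
Spelled out with the two separate clauses, the equivalent query is:

select id,name from my_tb distribute by id sort by id;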

User-defined functions (UDFs)

Hive built-in functions

https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF

The pom file

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.xiaojie.mm</groupId>
    <artifactId>my_hive</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <properties>
        <hadoop.version>2.6.5</hadoop.version>
        <hive.version>1.2.1</hive.version>
    </properties>


    <dependencies>
        <!-- Hadoop -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- Hive -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-metastore</artifactId>
            <version>${hive.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-pdk</artifactId>
            <version>0.10.0</version>
        </dependency>
        <dependency>
            <groupId>javax.jdo</groupId>
            <artifactId>jdo2-api</artifactId>
            <version>2.3-eb</version>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.1.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.7</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.7</version>
            <scope>system</scope>
            <systemPath>/home/miao/apps/install/jdk1.7.0_45/lib/tools.jar</systemPath>
        </dependency>
    </dependencies>
</project>

A UDF that converts uppercase to lowercase

package com.xiaojie.mm;

import org.apache.hadoop.hive.ql.exec.UDF;

public class ToLower extends UDF{
    // Hive resolves evaluate() by reflection; it can be overloaded for other argument types
    public String evaluate(String field) {
        // guard against NULL input values
        return field == null ? null : field.toLowerCase();
    }
}

Export the jar and copy it to the machine where Hive runs

scp tolower.jar mini1:/root/apps/

Registering the UDF in the Hive client

#step 1
add JAR /root/apps/tolower.jar;
#step 2: the string in quotes is the fully qualified class name (a temporary function is visible only in the current session)
create temporary function tolower as 'com.xiaojie.mm.ToLower';
#step 3: use it
select * from a;                        
+-------+---------+--+
| a.id  | a.name  |
+-------+---------+--+
| 7     | AAAAA   |
| 1     | 張三      |
| 2     | 李四      |
| 3     | c       |
| 4     | a       |
| 5     | e       |
| 6     | r       |
+-------+---------+--+

select id,tolower(name) from a;
+-----+--------+--+
| id  |  _c1   |
+-----+--------+--+
| 7   | aaaaa  |
| 1   | 張三     |
| 2   | 李四     |
| 3   | c      |
| 4   | a      |
| 5   | e      |
| 6   | r      |
+-----+--------+--+
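
A temporary function disappears when the session ends. On Hive 0.13 and later a permanent function can be registered instead; the HDFS jar path here is only an assumed example:

create function tolower as 'com.xiaojie.mm.ToLower' using jar 'hdfs:///apps/tolower.jar';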

A UDF that maps a phone number to its home region

package com.xiaojie.mm;

import java.util.HashMap;

import org.apache.hadoop.hive.ql.exec.UDF;

public class GetProvince extends UDF{
    public static HashMap<String,String> provinceMap = new HashMap<String,String>();
    static {
        provinceMap.put("183", "hangzhou");
        provinceMap.put("186", "nanjing");
        provinceMap.put("187", "suzhou");
        provinceMap.put("188", "ningbo");
    }
    public String evaluate(int phonenumber) {
        String phone_num = String.valueOf(phonenumber);
        // take the first three digits of the phone number
        String phone = phone_num.substring(0, 3);
        // "未知" means "unknown"
        return provinceMap.get(phone)==null?"未知":provinceMap.get(phone);
    }
}

Original data:
+----------------------+---------------------+--+
| flow_province.phone  | flow_province.flow  |
+----------------------+---------------------+--+
| 1837878              | 12m                 |
| 1868989              | 13m                 |
| 1878989              | 14m                 |
| 1889898              | 15m                 |
| 1897867              | 16m                 |
| 1832323              | 78m                 |
| 1858767              | 88m                 |
| 1862343              | 99m                 |
| 1893454              | 77m                 |
+----------------------+---------------------+--+

After applying the custom function:

select phone,getpro(phone),flow from flow_province;
+----------+-----------+-------+--+
|  phone   |    _c1    | flow  |
+----------+-----------+-------+--+
| 1837878  | hangzhou  | 12m   |
| 1868989  | nanjing   | 13m   |
| 1878989  | suzhou    | 14m   |
| 1889898  | ningbo    | 15m   |
| 1897867  | 未知        | 16m   |
| 1832323  | hangzhou  | 78m   |
| 1858767  | 未知        | 88m   |
| 1862343  | nanjing   | 99m   |
| 1893454  | 未知        | 77m   |
+----------+-----------+-------+--+

Parsing JSON data with a custom UDF

#create the table
create table json_tb(line string);
#load the data
load data local inpath '/root/test_data/a.json' into table json_tb;
#show the raw data
select line from json_tb limit 10;
+----------------------------------------------------------------+--+
|                          json_tb.line                          |
+----------------------------------------------------------------+--+
| {"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}  |
| {"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}   |
| {"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}   |
| {"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}  |
| {"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}  |
| {"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}  |
| {"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}  |
| {"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}  |
| {"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}   |
| {"movie":"919","rate":"4","timeStamp":"978301368","uid":"1"}   |
+----------------------------------------------------------------+--+

#the custom UDF
package com.xiaojie.mm;
import org.apache.hadoop.hive.ql.exec.UDF;
import parquet.org.codehaus.jackson.map.ObjectMapper;

public class JsonParse extends UDF{
    public String evaluate(String jsonLine) {
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            MovieBean bean = objectMapper.readValue(jsonLine, MovieBean.class);
            return bean.toString();
        }catch(Exception e){
            // ignore malformed lines and return an empty string instead
        }
        return "";
    }
}

package com.xiaojie.mm;
public class MovieBean {
    // movie id
    private String movie;
    // rating
    private String rate;
    // rating timestamp
    private String timeStamp;
    // user id
    private String uid;

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public String getRate() {
        return rate;
    }

    public void setRate(String rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        return this.movie + "\t" +this.rate + "\t" + this.timeStamp + "\t" + this.uid;
    }

}

#build the jar, copy it to the machine where Hive runs, and create the function
add JAR /root/test_data/json_parse.jar;
create temporary function json_parse as 'com.xiaojie.mm.JsonParse';

#use the custom JSON parsing function
select json_parse(line) from json_tb limit 10;
+---------------------+--+
|         _c0         |
+---------------------+--+
| 1193	5	978300760	1  |
| 661	3	978302109	1   |
| 914	3	978301968	1   |
| 3408	4	978300275	1  |
| 2355	5	978824291	1  |
| 1197	3	978302268	1  |
| 1287	5	978302039	1  |
| 2804	5	978300719	1  |
| 594	4	978302268	1   |
| 919	4	978301368	1   |
+---------------------+--+

#save the parsed fields into a newly created table
create table json_parse_tb as
select split(json_parse(line),'\t')[0] as movieid,
split(json_parse(line),'\t')[1] as rate,
split(json_parse(line),'\t')[2] as time,
split(json_parse(line),'\t')[3] as userid
from json_tb limit 100;

#the built-in get_json_object function does the same without a UDF
select get_json_object(line,'$.movie') as movieid,
get_json_object(line,'$.rate') as rate,
get_json_object(line,'$.timeStamp') as time,
get_json_object(line,'$.uid') as userid
from json_tb limit 10;
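
Alternatively, the built-in json_tuple UDTF parses each line once instead of calling get_json_object four times; a sketch:

select t.movieid,t.rate,t.time,t.userid
from json_tb
lateral view json_tuple(line,'movie','rate','timeStamp','uid') t as movieid,rate,time,userid
limit 10;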

Transform (calling custom scripts)

Hive's TRANSFORM keyword lets SQL call user-written scripts, which is handy when you need something Hive doesn't provide and don't want to write a UDF.

Custom Python script (vim time_parse.py)

#!/usr/bin/env python
import sys
import datetime

# read tab-separated rows from stdin and convert the unix timestamp to a weekday (1-7)
for line in sys.stdin:
  line = line.strip()
  movieid, rate, unixtime, userid = line.split('\t')
  weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
  print '\t'.join([movieid, rate, str(weekday), userid])

Add the py file to Hive's working directory

add file time_parse.py;
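
Before creating the new table, the script can be sanity-checked on a few rows (a sketch, assuming the json_parse_tb table from the previous section):

SELECT TRANSFORM (movieid, rate, time, userid)
USING 'python time_parse.py'
AS (movieid, rate, weekday, userid)
FROM json_parse_tb LIMIT 3;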

Use transform to call the custom Python code

create TABLE json_parse_time_tb as
SELECT
#TRANSFORM's column list pulls the corresponding columns out of json_parse_tb
  TRANSFORM (movieid, rate, time, userid)
  USING 'python time_parse.py'
  AS (movieid, rate, weekday, userid)
FROM json_parse_tb;

View the new table's data

select * from json_parse_time_tb;
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
| json_parse_time_tb.movieid  | json_parse_time_tb.rate  | json_parse_time_tb.weekday  | json_parse_time_tb.userid  |
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+
| 1690                        | 3                        | 1                           | 2                          |
| 589                         | 4                        | 1                           | 2                          |
| 3471                        | 5                        | 1                           | 2                          |
| 1834                        | 4                        | 1                           | 2                          |
| 2490                        | 3                        | 1                           | 2                          |
| 2278                        | 3                        | 1                           | 2                          |
| 110                         | 5                        | 1                           | 2                          |
| 3257                        | 3                        | 1                           | 2                          |
| 3256                        | 2                        | 1                           | 2                          |
| 3255                        | 4                        | 1                           | 2                          |
+-----------------------------+--------------------------+-----------------------------+----------------------------+--+

Case study

Original data (username, month, clicks)

A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
A,2015-02,4
A,2015-02,6
B,2015-02,10
B,2015-02,5

Compute each user's monthly click count, together with a running (cumulative) total.

Step 1: create the table and load the data

#create the table
create table click_tb(username string,month string,click int)
row format delimited fields terminated by ',';
#load the data
load data local inpath '/root/test_data/click.txt' into table click_tb;

Step 2: compute each user's monthly click count

select username,month,sum(click) as click_count from click_tb group by username,month;

+-----------+----------+--------------+--+
| username  |  month   | click_count  |
+-----------+----------+--------------+--+
| A         | 2015-01  | 33           |
| A         | 2015-02  | 10           |
| B         | 2015-01  | 30           |
| B         | 2015-02  | 15           |
+-----------+----------+--------------+--+

Step 3: inner join the aggregated result with itself (each of a user's months pairs with every month of the same user)

select * from
(select username,month,sum(click) as click_count from click_tb group by username,month) A
inner join
(select username,month,sum(click) as click_count from click_tb group by username,month) B
on
A.username=B.username;

+-------------+----------+----------------+-------------+----------+----------------+--+
| a.username  | a.month  | a.click_count  | b.username  | b.month  | b.click_count  |
+-------------+----------+----------------+-------------+----------+----------------+--+
| A           | 2015-01  | 33             | A           | 2015-01  | 33             |
| A           | 2015-01  | 33             | A           | 2015-02  | 10             |
| A           | 2015-02  | 10             | A           | 2015-01  | 33             |
| A           | 2015-02  | 10             | A           | 2015-02  | 10             |
| B           | 2015-01  | 30             | B           | 2015-01  | 30             |
| B           | 2015-01  | 30             | B           | 2015-02  | 15             |
| B           | 2015-02  | 15             | B           | 2015-01  | 30             |
| B           | 2015-02  | 15             | B           | 2015-02  | 15             |
+-------------+----------+----------------+-------------+----------+----------------+--+

Step 4: produce the final result. Keeping only the b-side rows with b.month<=a.month and summing their click_count gives the running total; min(a.click_count) merely collapses the duplicated a-side value within each group.

select a.username,a.month,min(a.click_count) as click_count,sum(b.click_count) as sum_count from
(select username,month,sum(click) as click_count from click_tb group by username,month) a
inner join
(select username,month,sum(click) as click_count from click_tb group by username,month) b
on
a.username=b.username
where b.month<=a.month
group by a.username,a.month
order by a.username,a.month;

+-------------+----------+--------------+------------+--+
| a.username  | a.month  | click_count  | sum_count  |
+-------------+----------+--------------+------------+--+
| A           | 2015-01  | 33           | 33         |
| A           | 2015-02  | 10           | 43         |
| B           | 2015-01  | 30           | 30         |
| B           | 2015-02  | 15           | 45         |
+-------------+----------+--------------+------------+--+
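
As a side note, on Hive 0.11 and later the same running total can be computed without the self-join by using a window function; a sketch:

select username,month,click_count,
sum(click_count) over(partition by username order by month) as sum_count
from (select username,month,sum(click) as click_count from click_tb group by username,month) t;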

