1. Creating Tables
Hive's default field delimiter is '\001'. If no delimiter is specified when a table is created, files you LOAD into it must use '\001' as the field delimiter; if a file uses a different delimiter, the load does not fail, but every column in the query results comes back as NULL.
1) Specify the delimiter when creating the table:
create table pokes(foo int,bar string) row format delimited fields terminated by '\t' lines terminated by '\n' stored as textfile;
load data local inpath '/root/pokes.txt' into table pokes;
2) Replace the delimiter
Use this when the delimiter of a file to be imported does not match the table's delimiter, or when a file exported from Hive needs a different delimiter:
Although a delimiter can be specified at table-creation time, files exported with INSERT OVERWRITE LOCAL DIRECTORY still use '\001' as the field delimiter by default, so it usually has to be converted to another character, for example with:
sed -e 's/\x01/\t/g' file
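On Hive 0.11 and later, INSERT OVERWRITE [LOCAL] DIRECTORY also accepts a ROW FORMAT clause, so the export delimiter can be set at query time and the sed step skipped. A minimal sketch, assuming the pokes table above and an illustrative output path:
-- Hive 0.11+ only: set the export delimiter directly
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/pokes_out'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT * FROM pokes;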

2. DDL Operations
Create a table
hive> CREATE TABLE pokes (foo INT, bar STRING);
Create a table with a partition column ds
hive> CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);
List all tables
hive> SHOW TABLES;
List tables that match a regular expression
hive> SHOW TABLES '.*s';
Add a column to a table
hive> ALTER TABLE pokes ADD COLUMNS (new_col INT);
Add a column with a column comment
hive> ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment');
Rename a table
hive> ALTER TABLE events RENAME TO 3koobecaf;
Drop a table
hive> DROP TABLE pokes;
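Hive has no statement that drops a single column; the usual way to remove one is to restate the column list with REPLACE COLUMNS. A hedged sketch that keeps only foo and bar on pokes, effectively dropping the new_col added above:
hive> ALTER TABLE pokes REPLACE COLUMNS (foo INT, bar STRING);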
DML operations
Load data from a file into a table
hive> LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;
Load local data, specifying partition information
hive> LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
Load data from HDFS, specifying partition information
hive> LOAD DATA INPATH '/user/myname/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
The above command will load data from an HDFS file/directory to the table. Note that loading data from HDFS will result in moving the file/directory. As a result, the operation is almost instantaneous.
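If the data should stay where it is on HDFS rather than being moved into the warehouse directory, an external table pointing at that location is the usual alternative; a sketch, with an assumed HDFS path:
hive> CREATE EXTERNAL TABLE invites_ext (foo INT, bar STRING) PARTITIONED BY (ds STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/user/myname/invites_ext';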
SQL operations
Query by condition
hive> SELECT a.foo FROM invites a WHERE a.ds='<DATE>';
Write query results to an HDFS directory
hive> INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='<DATE>';
Write query results to a local directory
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;
More examples of inserting query results into tables, HDFS directories, and local directories
hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a;
hive> INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a WHERE a.key < 100;
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/reg_3' SELECT a.* FROM events a;
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_4' select a.invites, a.pokes FROM profiles a;
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT COUNT(1) FROM invites a WHERE a.ds='<DATE>';
hive> INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT a.foo, a.bar FROM invites a;
hive> INSERT OVERWRITE LOCAL DIRECTORY '/tmp/sum' SELECT SUM(a.pc) FROM pc1 a;
Insert aggregated results from one table into another
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT a.bar, count(1) WHERE a.foo > 0 GROUP BY a.bar;
hive> INSERT OVERWRITE TABLE events SELECT a.bar, count(1) FROM invites a WHERE a.foo > 0 GROUP BY a.bar;
JOIN
hive> FROM pokes t1 JOIN invites t2 ON (t1.bar = t2.bar) INSERT OVERWRITE TABLE events SELECT t1.bar, t1.foo, t2.foo;
Multi-table insert: write from one source table into several destinations in a single pass
FROM src
INSERT OVERWRITE TABLE dest1 SELECT src.* WHERE src.key < 100
INSERT OVERWRITE TABLE dest2 SELECT src.key, src.value WHERE src.key >= 100 and src.key < 200
INSERT OVERWRITE TABLE dest3 PARTITION(ds='2008-04-08', hr='12') SELECT src.key WHERE src.key >= 200 and src.key < 300
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/dest4.out' SELECT src.value WHERE src.key >= 300;
Stream data through an external script
hive> FROM invites a INSERT OVERWRITE TABLE events SELECT TRANSFORM(a.foo, a.bar) AS (oof, rab) USING '/bin/cat' WHERE a.ds > '2008-08-09';
This streams the data in the map phase through the script /bin/cat (like Hadoop Streaming). Similarly, streaming can be used on the reduce side; see the Hive tutorial, or the sketch below.
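A sketch of that reduce-side variant, following the MAP / CLUSTER BY / REDUCE pattern from the Hive tutorial and still using /bin/cat as a stand-in for a real script:
FROM (
  FROM invites a
  MAP a.foo, a.bar
  USING '/bin/cat'
  AS (foo, bar)
  CLUSTER BY foo
) map_output
INSERT OVERWRITE TABLE events
REDUCE map_output.foo, map_output.bar
USING '/bin/cat'
AS (oof, rab);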
3. Tables with complex types; here columns are separated by '\t' and array elements by ','
# The data file content is as follows:
1 huangfengxiao beijing,shanghai,tianjin,hangzhou
2 linan changchu,chengdu,wuhan
hive> create table complex(name string, work_locations array<string>)
> ROW FORMAT DELIMITED
> FIELDS TERMINATED BY '\t'
> COLLECTION ITEMS TERMINATED BY ',';
hive> describe complex;
OK
name string
work_locations array<string>
hive> LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/complex.txt' OVERWRITE INTO TABLE complex;
hive> select * from complex;
OK
huangfengxiao ["beijing","shanghai","tianjin","hangzhou"]
linan ["changchu","chengdu","wuhan"]
Time taken: 0.125 seconds
hive> select name, work_locations[0] from complex;
MapReduce Total cumulative CPU time: 790 msec
Ended Job = job_201301211420_0012
MapReduce Jobs Launched:
Job 0: Map: 1 Cumulative CPU: 0.79 sec HDFS Read: 296 HDFS Write: 37 SUCCESS
Total MapReduce CPU Time Spent: 790 msec
OK
huangfengxiao beijing
linan changchu
Time taken: 20.703 seconds
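Maps and structs are declared the same way as arrays; a hedged sketch of a table combining the three collection types (the extra columns and the ':' map-key separator are assumptions, not part of the example above):
create table complex_all(
  name string,
  work_locations array<string>,
  scores map<string,int>,
  address struct<city:string,district:string>)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';
-- elements are accessed with [index], ['key'] and dot notation:
select name, work_locations[0], scores['math'], address.city from complex_all;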
4. Partitioning
Table class(teacher string, student string, age int)
Mis li huangfengxiao 20
Mis li lijie 21
Mis li dongdong 21
Mis li liqiang 21
Mis li hemeng 21
Mr xu dingding 19
Mr xu wangqiang 19
Mr xu lidong 19
Mr xu hexing 19
If we partition the class roster by teacher:
create table classmem(student string, age int) partitioned by(teacher string);
Partition files:
classmem_Misli.txt
huangfengxiao 20
lijie 21
dongdong 21
liqiang 21
hemeng 21
classmem_MrXu.txt
dingding 19
wangqiang 19
lidong 19
hexing 19
LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/classmem_Misli.txt' INTO TABLE classmem partition (teacher = 'Mis.li');
LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/classmem_MrXu.txt' INTO TABLE classmem partition (teacher = 'Mr.Xu');
# The partition column appears as the last column by default
hive> select * from classmem where teacher = 'Mr.Xu';
OK
dingding 19 NULL Mr.Xu
wangqiang 19 NULL Mr.Xu
lidong 19 NULL Mr.Xu
hexing 19 NULL Mr.Xu
Time taken: 0.196 seconds
# Filtering on the partition column lets Hive read the matching partition directly, which is fast; if the WHERE condition is not on a partition column, the query is compiled into a full MapReduce job and latency is much higher.
# In other words, partitions should be built on the columns most commonly used for filtering.
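The partitions created by LOAD DATA can be listed and managed directly; a short sketch (the 'Mr.Wang' partition is purely illustrative):
hive> SHOW PARTITIONS classmem;
hive> ALTER TABLE classmem ADD PARTITION (teacher = 'Mr.Wang');
hive> ALTER TABLE classmem DROP PARTITION (teacher = 'Mr.Wang');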
5. Buckets: more efficient and samplable; mainly used for sampling large data sets
Bucketing slices a table (or a partition): you choose the bucketing column and the number of buckets, and each row is assigned to a bucket by hashing the column value modulo the bucket count.
For example, the data file bucket.txt contains:
id name age
1 huang 11
2 li 11
3 xu 12
4 zhong 14
5 hu 15
6 liqiang 17
7 zhonghua 19
If we split this table into 3 buckets on the id column, hashing id gives the following bucket contents:
Bucket 0 (hash(id) % 3 == 0):
3 xu 12
6 liqiang 17
Bucket 1 (hash(id) % 3 == 1):
1 huang 11
4 zhong 14
7 zhonghua 19
Bucket 2 (hash(id) % 3 == 2):
2 li 11
5 hu 15
The create-table statement for this is:
create table bucketmem (id int,name string,age int) CLUSTERED BY (id) sorted by (id asc) into 3 buckets
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/bucketmem.txt' INTO TABLE bucketmem;
select * from bucketmem tablesample(bucket 1 out of 4)
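Note that LOAD DATA only copies the file into the table's directory and does not actually hash rows into buckets, so sampling may not behave as expected; the usual approach is to load a plain staging table first and then INSERT ... SELECT into the bucketed table with bucketing enforced. Sampling is also most effective when the "out of" count matches or evenly divides the declared bucket count. A sketch, with bucketmem_stage as an assumed staging table:
set hive.enforce.bucketing = true;
create table bucketmem_stage (id int, name string, age int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
LOAD DATA LOCAL INPATH '/home/hadoop/hfxdoc/bucketmem.txt' INTO TABLE bucketmem_stage;
INSERT OVERWRITE TABLE bucketmem SELECT id, name, age FROM bucketmem_stage;
-- sample exactly one of the 3 buckets
select * from bucketmem tablesample(bucket 1 out of 3 on id);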
6. A Complete Example
Create a table
CREATE TABLE u_data (
userid INT,
movieid INT,
rating INT,
unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
Download the sample data files and unpack them
wget http://www.grouplens.org/system/files/ml-data.tar__0.gz
tar xvzf ml-data.tar__0.gz
Load the data into the table
LOAD DATA LOCAL INPATH 'ml-data/u.data'
OVERWRITE INTO TABLE u_data;
Count the total number of rows
SELECT COUNT(1) FROM u_data;
Now do some more complex data analysis
Create a file weekday_mapper.py that converts each record's unix timestamp into a weekday
import sys
import datetime

for line in sys.stdin:
    line = line.strip()
    userid, movieid, rating, unixtime = line.split('\t')
    # derive the weekday from the unix timestamp
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([userid, movieid, rating, str(weekday)])
Use the mapper script
-- create the target table; fields in each row are separated by the delimiter
CREATE TABLE u_data_new (
userid INT,
movieid INT,
rating INT,
weekday INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';
-- add the python script to Hive (distributed cache)
add FILE weekday_mapper.py;
Transform the data, replacing the unix timestamp with a weekday
INSERT OVERWRITE TABLE u_data_new
SELECT
TRANSFORM (userid, movieid, rating, unixtime)
USING 'python weekday_mapper.py'
AS (userid, movieid, rating, weekday)
FROM u_data;
SELECT weekday, COUNT(1)
FROM u_data_new
GROUP BY weekday;
Processing Apache weblog data
Parse each web log line with a regular-expression SerDe and map the captured groups into the table's columns
add jar ../build/contrib/hive_contrib.jar;
CREATE TABLE apachelog (
host STRING,
identity STRING,
user STRING,
time STRING,
request STRING,
status STRING,
size STRING,
referer STRING,
agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
"output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE;
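With the SerDe in place, the log table is loaded and queried like any other; a hedged sketch (the log path and the query itself are illustrative only):
LOAD DATA LOCAL INPATH '/var/log/httpd/access_log' INTO TABLE apachelog;
SELECT host, status, count(1) AS hits
FROM apachelog
GROUP BY host, status
ORDER BY hits DESC
LIMIT 10;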