Creating a bucketed table
create table t_buck(id int, name string) clustered by (id) sorted by (id) into 4 buckets;
Data is not loaded into a bucketed table directly with load; it is queried out of another table and inserted,
and the insert statement carries clauses (cluster by, or distribute by plus sort by) that enforce the bucketing.
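For context, a minimal sketch of the staging table that the inserts below select from (the schema matches t_buck; the comma delimiter and file path are assumptions):
create table t_sz02(id int, name string)
row format delimited fields terminated by ',';
load data local inpath '/home/hadoop/t_sz02.dat' into table t_sz02; -- hypothetical sample file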
Hive DML operations
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML
Insert
Standard syntax:
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1 FROM from_statement;
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1 FROM from_statement;
Hive extension (multiple inserts):
FROM from_statement
INSERT OVERWRITE TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...) [IF NOT EXISTS]] select_statement1
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2]
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2] ...;
FROM from_statement
INSERT INTO TABLE tablename1 [PARTITION (partcol1=val1, partcol2=val2 ...)] select_statement1
[INSERT INTO TABLE tablename2 [PARTITION ...] select_statement2]
[INSERT OVERWRITE TABLE tablename2 [PARTITION ... [IF NOT EXISTS]] select_statement2] ...;
Hive extension (dynamic partition inserts):
INSERT OVERWRITE TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
INSERT INTO TABLE tablename PARTITION (partcol1[=val1], partcol2[=val2] ...) select_statement FROM from_statement;
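For example, a multi-insert scans the source once and feeds several tables, and a dynamic-partition insert derives the partition value from the last select column (t_src, t_a, t_b are hypothetical names; t_part is assumed to be a table partitioned by dt string, sketched further below):
from t_src
insert into table t_a select id, name where id < 100
insert overwrite table t_b select id, name where id >= 100;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict; -- allow all partition columns to be dynamic
insert overwrite table t_part partition (dt)
select id, name, dt from t_src; -- dt, the last select column, becomes the partition value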
Settings
set hive.enforce.bucketing=true;
set mapreduce.job.reduces=4; -- match the number of buckets (sets the number of reduce tasks)
set mapred.reduce.tasks=4; -- older property name for the same reduce-task setting
insert into table t_buck select id, name from t_sz02 cluster by (id); -- observe the number of reduce tasks at this point
insert into table t_buck select id, name from t_sz02 distribute by (id) sort by (id desc);
With sort by you can specify the sort order; with cluster by you cannot, and the order defaults to ascending.
After this, the table holds the bucketed data.
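To verify the result, you can sample a single bucket with tablesample (standard Hive syntax; bucket 1 out of 4 matches the 4 buckets defined above), or list the table's HDFS directory, which should now typically contain four bucket files (000000_0 through 000003_0):
select * from t_buck tablesample(bucket 1 out of 4 on id);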
Bucketing plays the role of the partitioner in MapReduce; a partitioned table in Hive, by contrast, places its data into pre-divided directories, one per partition value.
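To make the contrast concrete: a partition column becomes a directory per value, while buckets split rows into a fixed number of files by hash. A minimal sketch of a partitioned table (t_part and its dt column are hypothetical):
create table t_part(id int, name string) partitioned by (dt string);
insert into table t_part partition (dt='2017-01-01') select id, name from t_sz02; -- rows land under .../t_part/dt=2017-01-01/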
Ways to save query results
Save the result into a Hive table:
create table t_tmp as select id, name from t_sz02; -- creates a new table from the query; the as keyword is required
insert into table t_buck select id, name from t_sz02 distribute by (id) sort by (id desc); -- saves into a table that already exists; as is not used here, and overwrite can be added to replace the existing data
insert overwrite local directory '/home/hadoop/study' select * from t_tmp; -- saves to the local file system (it can also be written to HDFS); overwrite is mandatory because TextOutputFormat does not support appending, only overwriting
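The HDFS variant of the export simply drops the local keyword (the target path is an assumption):
insert overwrite directory '/user/hadoop/study' select * from t_tmp;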
A table that is not bucketed can also use sort by to query or export its data. By the same token, the data of a bucketed table is exactly what comes out of querying a non-bucketed table with such a sort by query.