1. 創建數據庫,切換數據庫
create database testdb; use testdb;
2. 創建管理表
create table emp( empno int, empname string, job string, mgr int, hiredate string, salary double, comm double, deptno int) row format delimited fields terminated by '\t'; 加載數據 load data local inpath '/opt/test/emp.txt' overwrite into table emp;
emp.txt文件內容如下:
101 'duan' 'it' 1 'hiredate' 100.0 10.0 1
102 'duan2' 'product' 1 '2018' 200.0 20.0 1
在hadoop中查看數據,如下:
3. 創建外部表
創建外部表時直接指定表位置
上傳數據文件到指定路徑
duanxz@three:~/hive/hivelocal$ hdfs dfs -mkdir /hive/warehouse/testdb.db/emp_ext duanxz@three:~/hive/hivelocal$ hdfs dfs -put emp.txt /hive/warehouse/testdb.db/emp_ext/ duanxz@three:~/hive/hivelocal$
在hive中創建數據表指定location
create external table emp_ext( empno int, empname string, job string, mgr int, hiredate string, salary double, comm double, deptno int) row format delimited fields terminated by '\t' location '/hive/warehouse/testdb.db/emp_ext/';
4. 創建分區表
create table emp_part( empno int, empname string, job string, mgr int, hiredate string, salary double, comm double, deptno int) partitioned by (year string, month string) row format delimited fields terminated by '\t';
注:分區字段不能與表中其他字段重復,否則報錯
FAILED: SemanticException [Error 10035]: Column repeated in partitioning columns
加載數據
1、將txt的文本文件導入hive
從本地拷貝emp.txt到分區表目錄中
load data local inpath '/home/duanxz/hive/hivelocal/emp.txt' into table emp_part partition (year='2018', month='5'); load data local inpath '/home/duanxz/hive/hivelocal/emp2.txt' into table emp_part partition (year='2018', month='6');
用hdfs中指定位置的數據,增加分區表中數據,此操作不會移動數據文件到分區表目錄中
alter table emp_part add partition (year='2016', month='5') location '/data';
把hdfs中指定位置的數據移動到分區表目錄中,增加數據
load data inpath '/emp.txt' into table emp_part partition (year='2016', month='6');
2、將csv導入hive
create table feizhou_china_part2( merchant string, pay_time string, currency string, amount double, fee double, transaction_reference string, feizhou_reference string, link_reference string, narration string, account_number string, account_name string, bank string, bank_code string, status string, source string) partitioned by (year string, month string, day string) row format delimited fields terminated by '?';
導入:
load data local inpath '/home/duanxz/hive/hivelocal/china-pay-disburse-transactions.csv' into table feizhou_china_part2 partition (year='2018',month='06',day='19');
說明:上面的為什么將分隔符調整為"?"呢,是因為csv中默認的分隔符是',',內容中如果有',',這樣導入后,內容就亂了。
如何修改CSV文件的分隔符
5.其他創建表的方式
(1) create-as
create table emp3 as select * from emp;
(2) create-like
create table emp4 like emp; load data local inpath '/opt/test/emp.txt' overwrite into table emp4;
(3)插入數據
insert overwrite table emp4 select * from emp;
6.指定表存儲格式與壓縮格式
(1) 指定orc格式
create table emp_orc( empno int, empname string, job string, mgr int, hiredate string, salary double, comm double, deptno int) stored as orc;
指定為非文本格式時無需再指定row format delimited fields terminated by '\t'
插入數據 insert into table emp_orc select * from emp;
可以利用已有的ORC存儲格式的表創建新的ORC表
create table emp_orc2 like emp_orc; 插入數據 insert overwrite table emp_orc2 select * from emp;
(2) 指定orc+snappy格式
a)先創建表,再插入數據
create table emp_orc_snappy( empno int, empname string, job string, mgr int, hiredate string, salary double, comm double, deptno int) stored as orc tblproperties("orc.compress"="SNAPPY"); 插入數據 insert overwrite table emp_orc_snappy select * from emp;
b)利用已有的orc表格式創建orc+snappy格式表
create table emp_orc_snappy2 like emp_orc tblproperties ("orc.compress"="SNAPPY"); insert overwrite table emp_orc_snappy2 select * from emp;
c)利用非壓縮表直接創建orc+snappy表並導入數據
create table emp_orc_snappy3 stored as orc tblproperties("orc.compress"="SNAPPY") as select * from emp;
7.hive執行參數-e,-f,--hiveconf
(1)命令行直接執行hql語句
hive -e "select * from db_hive01.emp"
(2)執行hql文件中的語句
hive -f emp.hql
(3)打開調試模式
hive --hiveconf hive.root.logger=DEBUG,console
8.數據導出
(1)導出數據到本地
a)insert
insert overwrite local directory '/opt/test/local' row format delimited fields terminated by '\t' select * from emp;
如果不指定row format delimited fields terminated by '\t',字段間默認沒有分割符
b)
hive -e 'select * from testdb2.emp' >> ./emp_export.txt
(2)導出到hdfs
a)
insert overwrite directory '/export_data' select * from emp;
hive 0.13.1版本還不支持導出數據到hdfs時指定分隔符row format delimited fields terminated by '\t'
b)
export table emp to '/export_data';
導出后會生成/export_data/data目錄, emp.txt存放在此目錄中,即/export_data/data/emp.txt
9. 排序
(1)order by 全局排序
insert overwrite local directory '/opt/test/local' row format delimited fields terminated by '\t' select * from emp order by empno;
(2)sort by 與 distribute by
類似MR中partition,進行分區,結合sort by使用
每個reduce內部進行排序,全局不排序, distribute by 一定是放在sort by 前面,
且必須要指定mapreduce.job.reduces數量,否則導出結果還是在一個文件中
set mapreduce.job.reduces=3; insert overwrite local directory '/opt/test/local' row format delimited fields terminated by '\t' select * from emp distribute by deptno sort by empno;
(3)cluster by
當distribute by和sort by 字段一樣的時候,直接使用cluster by
10.常用函數
select upper(empname) from emp; select unix_timestamp(trackTime) from bflog limit 3 ; select year(hiredate) from emp ; select month(hiredate) from emp ; select hour(hiredate) from emp ; select substr(hiredate,1,4) from emp ; select split(hiredate,'-')[1] from emp ; select reverse(hiredate) from emp ; select concat(empno,'-',empname) from emp ; case when 條件1 then ... when 條件2 then ... else end
可以使用desc function substr 查看函數說明, substr第二個參數為index 從1開始計數,第三個參數為length
11. 自定義UDF
add jar /opt/test/mylower.jar ; CREATE TEMPORARY FUNCTION mylower AS 'org.gh.hadoop.hive.MyLower';
12. 使用正則表達式加載數據字段
create table beifenglog( remote_addr string, remote_user string, time_local string, request string, status string, body_bytes_sent string, request_body string, http_referer string, http_user_agent string, http_x_forwarded_for string, host string) row format serde 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' with serdeproperties( "input.regex" = "(\\\"[\\d\\.]+\\\") (\\\"[^ ]+\\\") (\\\".*?\\\") (\\\".*?\\\") (\\\"\\d+\\\") (\\\"\\d+\\\") ([^ ]+) (\\\"[^ ]+\\\") (\\\".*?\\\") (\\\"[^ ]+\\\") (\\\"[^ ]+\\\")" ) stored as textfile; 加載原表數據 load data local inpath '/opt/test/beifenglog.data' overwrite into table beifenglog;
可以使用工具調試正則:http://tool.chinaz.com/regex
13.注意點
(1)在創建表(無論管理表還是外部表)時,如果沒有指定location,可以使用load data加載數據
a) 指定本地目錄中的數據,會上傳數據文件到hdfs中
b) 指定hdfs中數據文件,如果指定的路徑與表所在的目錄不一致,則移動數據文件到表目錄中
create external table emp_ext2 like emp; load data inpath '/emp.txt' into table emp_ext2; 會把/emp.txt移動到/user/hive/warehouse/testdb2.db/emp_ext2/目錄中
create table emp2 like emp; load data inpath '/emp.txt' into table emp2; 會把/emp.txt移動到/user/hive/warehouse/testdb2.db/emp2/目錄中
(2)create-like時不能指定stored as為其他格式,否則報錯
以下操作會報錯 FAILED: ParseException line 1:31 missing EOF at 'stored' near 'emp'
create table emp_orc2 like emp stored as orc;