DROP TABLE IF EXISTS tmp_dm_test_a.t_aa_orc;
USE tmp_dm_test_a;
CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa_orc(
    user_id string COMMENT 'user id'
    ,all_addr string COMMENT 'frequently used address'
)
PARTITIONED BY (
    inc_day string COMMENT 'inc_day used by partition'
)
STORED AS orc
TBLPROPERTIES('orc.compress'='SNAPPY');

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.fetch.task.conversion=more;
set hive.exec.parallel=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

WITH tmp AS (
    SELECT 'sf1111' as user_id, '湖南省' as all_addr, '20180101' union all
    SELECT 'sf2222' as user_id, '江西省' as all_addr, '20180101' union all
    SELECT 'sf3333' as user_id, '上東省' as all_addr, '20180101' union all
    SELECT 'sf1111' as user_id, '湖南省' as all_addr, '20180102' union all
    SELECT 'sf2222' as user_id, '江西省' as all_addr, '20180102' union all
    SELECT 'sf3333' as user_id, '上東省' as all_addr, '20180102'
)
INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa_orc PARTITION (inc_day)
SELECT * from tmp;

SELECT * from tmp_dm_test_a.t_aa_orc;
SELECT user_id,all_addr,original_union_id from tmp_dm_test_a.t_aa_orc;
SELECT user_id,all_addr,original_union_id from tmp_dm_test_a.t_aa_orc where inc_day='20180101';
SELECT user_id,all_addr,original_union_id from tmp_dm_test_a.t_aa_orc where inc_day='20180103';
---- SELECT phone_number,all_addr,original_union_id from tmp_dm_test_a.t_aa_orc;

-- Append CASCADE to the DDL statement, otherwise the new column is not visible in old partitions
alter table tmp_dm_test_a.t_aa_orc add columns(original_union_id string) cascade;

-- Add several columns at once
alter table `ods_wst`.`awd_pckt_in_sm`
add columns(
`rule_code` string COMMENT 'rule code'
, `bus_type` string COMMENT 'extended-field business type, used to associate with extended-field business values 1-4'
, `bus_attr1` string COMMENT 'extended-field business value 1'
, `bus_attr2` string COMMENT 'extended-field business value 2'
, `bus_attr3` string COMMENT 'extended-field business value 3'
, `bus_attr4` string COMMENT 'extended-field business value 4'
) cascade;
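To confirm that a CASCADE add is actually visible in old partitions, here is a minimal check against the tmp_dm_test_a.t_aa_orc table built above; old rows are expected to come back with NULL for the appended column:

DESCRIBE tmp_dm_test_a.t_aa_orc PARTITION (inc_day='20180101');
-- original_union_id shows up in the old partition's schema after the CASCADE add
SELECT user_id, all_addr, original_union_id
FROM tmp_dm_test_a.t_aa_orc
WHERE inc_day='20180101';
-- rows written before the ALTER return original_union_id = NULL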
alter table tmp_dm_test_a.t_aa_orc partition(inc_day='20180101') add columns(original_union_id string);

DROP TABLE IF EXISTS tmp_dm_test_a.t_aa_orc;
USE tmp_dm_test_a;
CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa_orc(
    user_id string COMMENT 'user id'
    ,all_addr string COMMENT 'frequently used address'
    ,original_union_id string
)
PARTITIONED BY (
    inc_day string COMMENT 'inc_day used by partition'
)
STORED AS orc
TBLPROPERTIES('orc.compress'='SNAPPY');

MSCK REPAIR TABLE tmp_dm_test_a.t_aa_orc;

WITH tmp AS (
    SELECT 'sf1111' as user_id, '湖南省' as all_addr, 'sf0x1111' as original_union_id, '20180103' union all
    SELECT 'sf2222' as user_id, '江西省' as all_addr, 'sf0x2211' as original_union_id, '20180103' union all
    SELECT 'sf3333' as user_id, '上東省' as all_addr, 'sf0x3311' as original_union_id, '20180103' union all
    SELECT 'sf1111' as user_id, '湖南省' as all_addr, 'sf0x4411' as original_union_id, '20180104' union all
    SELECT 'sf2222' as user_id, '江西省' as all_addr, 'sf0x5511' as original_union_id, '20180104' union all
    SELECT 'sf3333' as user_id, '上東省' as all_addr, 'sf0x6611' as original_union_id, '20180104'
)
INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa_orc PARTITION (inc_day)
SELECT * from tmp;

********************
-- After the rename, data in the old partitions no longer resolves phone_number; the historical data has to be re-run (see the backfill sketch after the ALTER statements below)
alter table tmp_dm_test_a.t_aa_orc change column user_id phone_number string;
alter table tmp_dm_test_a.t_aa_orc change column user_id phone_number string cascade;
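What "re-run the historical data" amounts to: rebuild the affected old partitions from their upstream source so that new files are written under the renamed column. A minimal sketch, assuming a hypothetical upstream table src_db.src_table that still carries the data under the old column name:

-- Hypothetical backfill of one old partition after the rename;
-- src_db.src_table is a placeholder for the real upstream source.
INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa_orc PARTITION (inc_day='20180101')
SELECT user_id AS phone_number, all_addr, original_union_id
FROM src_db.src_table
WHERE inc_day='20180101';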
-------------
show create table tmp_dm_test_a.t_aa_orc;
ALTER TABLE table_name [PARTITION partition_spec]                      -- (Note: Hive 0.14.0 and later)
  ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
  [CASCADE|RESTRICT]                                                   -- (Note: Hive 1.1.0 and later)
REPLACE COLUMNS removes all of the existing columns and adds the new set of columns. It can only be used on tables with a native SerDe (DynamicSerDe, MetadataTypedColumnsetSerDe, LazySimpleSerDe and ColumnarSerDe). REPLACE COLUMNS can therefore also be used to drop columns.
Example: dropping columns.
The existing Hive table test_change has the columns a, b, c, d, e.
Drop column "d" from test_change:
ALTER TABLE test_change REPLACE COLUMNS (a int, b int, c string, e string) cascade;
Drop columns d and e together:
ALTER TABLE test_change REPLACE COLUMNS (a int, b int, c string) cascade;
-- For Parquet data, the values of columns d and e are kept in the data files, but after the REPLACE they can no longer be queried
-- REPLACE can also reorder columns; the underlying data stays unchanged and can still be queried correctly (see the sketch below)
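A sketch of such a reorder on the same hypothetical test_change table; since Parquet resolves columns by name rather than by position, the existing files are still expected to read back correctly:

-- Reorder the remaining columns without rewriting any data files
ALTER TABLE test_change REPLACE COLUMNS (c string, a int, b int, e string) cascade;
SELECT a, b, c, e FROM test_change;  -- values still line up with their column names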
Parquet storage format
DROP TABLE IF EXISTS tmp_dm_test_a.t_aa;
USE tmp_dm_test_a;
CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa(
    user_id string COMMENT 'user id'
    ,all_addr string COMMENT 'frequently used address'
)
PARTITIONED BY (
    inc_day string COMMENT 'inc_day used by partition'
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.fetch.task.conversion=more;
set hive.exec.parallel=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;

WITH tmp AS (
    SELECT 'sf1111' as user_id, '湖南省' as all_addr, '20180101' union all
    SELECT 'sf2222' as user_id, '江西省' as all_addr, '20180101' union all
    SELECT 'sf3333' as user_id, '上東省' as all_addr, '20180101' union all
    SELECT 'sf1111' as user_id, '湖南省' as all_addr, '20180102' union all
    SELECT 'sf2222' as user_id, '江西省' as all_addr, '20180102' union all
    SELECT 'sf3333' as user_id, '上東省' as all_addr, '20180102'
)
INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa PARTITION (inc_day)
SELECT * from tmp;

SELECT * from tmp_dm_test_a.t_aa;
SELECT user_id,all_addr,original_union_id from tmp_dm_test_a.t_aa;
SELECT user_id,all_addr,original_union_id from tmp_dm_test_a.t_aa where inc_day='20180101';
---- SELECT phone_number,all_addr,original_union_id from tmp_dm_test_a.t_aa;

alter table tmp_dm_test_a.t_aa add columns(original_union_id string);
alter table tmp_dm_test_a.t_aa partition(inc_day='20180101') add columns(original_union_id string);

DROP TABLE IF EXISTS tmp_dm_test_a.t_aa;
USE tmp_dm_test_a;
CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa(
    user_id string COMMENT 'user id'
    ,all_addr string COMMENT 'frequently used address'
    ,original_union_id string
)
PARTITIONED BY (
    inc_day string COMMENT 'inc_day used by partition'
)
STORED AS parquet
TBLPROPERTIES('parquet.compression'='SNAPPY');

MSCK REPAIR TABLE tmp_dm_test_a.t_aa;

WITH tmp AS (
    SELECT 'sf1111' as user_id, '湖南省' as all_addr, 'sf0x1111' as original_union_id, '20180103' union all
    SELECT 'sf2222' as user_id, '江西省' as all_addr, 'sf0x2211' as original_union_id, '20180103' union all
    SELECT 'sf3333' as user_id, '上東省' as all_addr, 'sf0x3311' as original_union_id, '20180103' union all
    SELECT 'sf1111' as user_id, '湖南省' as all_addr, 'sf0x4411' as original_union_id, '20180104' union all
    SELECT 'sf2222' as user_id, '江西省' as all_addr, 'sf0x5511' as original_union_id, '20180104' union all
    SELECT 'sf3333' as user_id, '上東省' as all_addr, 'sf0x6611' as original_union_id, '20180104'
)
INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa PARTITION (inc_day)
SELECT * from tmp;

********************
alter table tmp_dm_test_a.t_aa change column user_id phone_number string;
alter table tmp_dm_test_a.t_aa change column user_id phone_number string cascade;
*********************************************
Conclusions:
1. For both Parquet and ORC, the contents of the data files in old partitions are immutable.
2. Parquet and ORC: after a column is added, the old data files contain nothing for the new column; only data files written into new partitions carry its values.
3. Parquet and ORC: after appending a new column at the end with ADD COLUMNS, both old and new partitions remain queryable; the old data simply returns NULL for the new column.
4. Parquet: after a column is renamed, the original values can no longer be resolved from the old data; effectively there is no mapping between the old and the new column name. The old files contain only the old column name, and the new name does not map back to it, so SELECT cannot resolve the new column name against the old data (see the verification sketch after this list).
5. ORC: after a column is renamed, the original values can still be resolved from the old data; effectively the old and new column names are mapped. Although the old files contain only the old column name, the new name maps onto the old one, acting like an alias, so SELECT can resolve the new column name.
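A minimal pair of queries to verify conclusions 4 and 5 against the two test tables built above (both were renamed from user_id to phone_number); the expected results follow the conclusions:

-- Parquet: old files only know the name user_id, so the renamed column reads back as NULL
SELECT phone_number, all_addr FROM tmp_dm_test_a.t_aa     WHERE inc_day='20180101';
-- ORC: the new name is mapped onto the old column, so the old values are still returned
SELECT phone_number, all_addr FROM tmp_dm_test_a.t_aa_orc WHERE inc_day='20180101';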