適用場景:數據量較大、不適合每日全量存儲,且數據緩慢變化的維度數據。
拉鏈表的使用方式:通過「有效開始日期 <= 某個日期 <= 有效結束日期」的條件,即可獲取該日期的維度全量切片數據。
建表語句
-- User dimension zipper (history) table.
-- Each row is one version of a user record; start_date / end_date bound the
-- period for which that version is valid. The load queries in this file mark
-- the currently open version with end_date = '9999-99-99'.
DROP TABLE IF EXISTS dwd_dim_user_info_his;

CREATE EXTERNAL TABLE dwd_dim_user_info_his (
    `id`           string COMMENT '用戶 id',
    `name`         string COMMENT '姓名',
    `birthday`     string COMMENT '生日',
    `gender`       string COMMENT '性別',
    `email`        string COMMENT '郵箱',
    `user_level`   string COMMENT '用戶等級',
    `create_time`  string COMMENT '創建時間',
    `operate_time` string COMMENT '操作時間',
    -- Validity interval of this version of the record.
    `start_date`   string COMMENT '有效開始日期',
    `end_date`     string COMMENT '有效結束日期'
)
COMMENT '用戶拉鏈表'
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_user_info_his/'
TBLPROPERTIES ("parquet.compression"="lzo");
已有的一些數據表
-- ODS-layer user table: tab-delimited text read through the LZO input format,
-- partitioned by load date `dt`. The zipper-table load queries in this file
-- read one `dt` partition at a time.
DROP TABLE IF EXISTS ods_user_info;

CREATE EXTERNAL TABLE ods_user_info (
    `id`           string COMMENT '用戶 id',
    `name`         string COMMENT '姓名',
    `birthday`     string COMMENT '生日',
    `gender`       string COMMENT '性別',
    `email`        string COMMENT '郵箱',
    `user_level`   string COMMENT '用戶等級',
    `create_time`  string COMMENT '創建時間',
    `operate_time` string COMMENT '操作時間'
)
COMMENT '用戶表'
PARTITIONED BY (`dt` string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS
    INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gmall/ods/ods_user_info/';
數據示例:
20200101ods數據
id create_time operate_time
1 20200101 20200101
20200102 ods數據,產生修改的數據
1 20200101 20200102
20200103 ods數據,產生修改和新增的數據
1 20200101 20200103
2 20200103 20200103
20200101拉鏈表數據
id start_date end_date
1 20200101 9999-99-99
20200102拉鏈表數據
1 20200101 20200101
1 20200102 9999-99-99
20200103拉鏈表數據
1 20200101 20200101
1 20200102 20200102
1 20200103 9999-99-99
2 20200103 9999-99-99
步驟一:初始化拉鏈表(以某一天為基準日期,這一天的數據都當成是新增數據)
-- Step 1: initialize the zipper table from the baseline ODS partition.
-- Every row of the 20200101 partition is treated as a newly opened version:
-- it becomes valid on the baseline date and stays open ('9999-99-99').
-- The original statement was a bare SELECT that never wrote its result;
-- INSERT OVERWRITE makes the initialization actually populate the table and
-- keeps the step idempotent when re-run.
insert overwrite table dwd_dim_user_info_his
select
    id,
    name,
    birthday,
    gender,
    email,
    user_level,
    create_time,
    operate_time,
    '20200101'   as start_date,  -- baseline date
    '9999-99-99' as end_date     -- sentinel for "still valid"
from
    ods_user_info
where
    dt = '20200101';
步驟二:將當日變動數據(新增、修改)合併到拉鏈表
-- Step 2: merge the 20200102 changes (new + modified users) into the zipper
-- table. The result is written to a temporary table first and is meant to be
-- copied from there into the real zipper table afterwards.
-- NOTE(review): dwd_dim_user_info_his_tmp is not created in the visible part
-- of this file; it presumably has the same columns as dwd_dim_user_info_his.
--
-- Branch 1: existing history rows. A row that is still open
--           (end_date = '9999-99-99') and whose id appears in today's ODS
--           partition is closed with the previous day ('20200101');
--           all other rows are carried over unchanged.
-- Branch 2: every row of today's ODS partition opens a new version that is
--           valid from '20200102'.
insert overwrite table dwd_dim_user_info_his_tmp
select
    user_his.id,
    user_his.name,
    user_his.birthday,
    user_his.gender,
    user_his.email,
    user_his.user_level,
    user_his.create_time,
    user_his.operate_time,
    user_his.start_date,
    if(
        -- Test the join key: the update_user subquery exposes no end_date
        -- column, so the original `update_user.end_date` could not resolve.
        update_user.id is not null
        and user_his.end_date = '9999-99-99',
        '20200101',  -- load date 20200102 minus one day
        user_his.end_date
    ) as end_date
from
    (
        -- The zipper table is not partitioned (it has no `dt` column), so the
        -- whole history is read; the original `dt` filter could not resolve.
        select
            id,
            name,
            birthday,
            gender,
            email,
            user_level,
            create_time,
            operate_time,
            start_date,
            end_date
        from
            dwd_dim_user_info_his
    ) user_his
left join (
    -- Only the key is needed to detect whether a user changed today.
    select
        id
    from
        ods_user_info
    where
        dt = '20200102'
) update_user on user_his.id = update_user.id
union all
select
    id,
    name,
    birthday,
    gender,
    email,
    user_level,
    create_time,
    operate_time,
    '20200102'   as start_date,
    '9999-99-99' as end_date
from
    ods_user_info
where
    dt = '20200102';