項目實戰從0到1之hive(26)企業級數據倉庫構建(八):搭建DWD 層-業務數據


1.1 DWD 層(業務數據)



1.1.1 商品維度表(全量表)



1)建表語句

-- Product (SKU) dimension, stored as one full snapshot per dt partition.
-- NOTE(review): price is DOUBLE here while the fact tables in this file use
-- DECIMAL for monetary values - confirm this is acceptable for downstream sums.
DROP TABLE IF EXISTS dwd_dim_sku_info;
CREATE EXTERNAL TABLE dwd_dim_sku_info (
    `id`             STRING COMMENT '商品 id',
    `spu_id`         STRING COMMENT 'spuid',
    `price`          DOUBLE COMMENT '商品價格',
    `sku_name`       STRING COMMENT '商品名稱',
    `sku_desc`       STRING COMMENT '商品描述',
    `weight`         DOUBLE COMMENT '重量',
    `tm_id`          STRING COMMENT '品牌 id',
    `tm_name`        STRING COMMENT '品牌名稱',
    `category3_id`   STRING COMMENT '三級分類 id',
    `category2_id`   STRING COMMENT '二級分類 id',
    `category1_id`   STRING COMMENT '一級分類 id',
    `category3_name` STRING COMMENT '三級分類名稱',
    `category2_name` STRING COMMENT '二級分類名稱',
    `category1_name` STRING COMMENT '一級分類名稱',
    `spu_name`       STRING COMMENT 'spu 名稱',
    `create_time`    STRING COMMENT '創建時間'
)
COMMENT '商品維度表'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_sku_info/'
TBLPROPERTIES ("parquet.compression"="lzo");

2)數據裝載

-- Build the 2020-03-10 snapshot of the SKU dimension by attaching brand, SPU
-- and the three category levels to every SKU row.
-- All joins are inner joins, so a SKU without a matching brand, SPU or
-- category row in the same dt partition is not written.
INSERT OVERWRITE TABLE dwd_dim_sku_info PARTITION (dt = '2020-03-10')
SELECT
    sku.id,
    sku.spu_id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.tm_id,
    tm.tm_name,
    sku.category3_id,
    c2.id   AS category2_id,
    c1.id   AS category1_id,
    c3.name AS category3_name,
    c2.name AS category2_name,
    c1.name AS category1_name,
    spu.spu_name,
    sku.create_time
FROM (
    SELECT id, spu_id, price, sku_name, sku_desc, weight, tm_id, category3_id, create_time
    FROM ods_sku_info
    WHERE dt = '2020-03-10'
) AS sku
INNER JOIN (
    SELECT tm_id, tm_name
    FROM ods_base_trademark
    WHERE dt = '2020-03-10'
) AS tm
    ON sku.tm_id = tm.tm_id
INNER JOIN (
    SELECT id, spu_name
    FROM ods_spu_info
    WHERE dt = '2020-03-10'
) AS spu
    ON spu.id = sku.spu_id
INNER JOIN (
    SELECT id, name, category2_id
    FROM ods_base_category3
    WHERE dt = '2020-03-10'
) AS c3
    ON sku.category3_id = c3.id
INNER JOIN (
    SELECT id, name, category1_id
    FROM ods_base_category2
    WHERE dt = '2020-03-10'
) AS c2
    ON c3.category2_id = c2.id
INNER JOIN (
    SELECT id, name
    FROM ods_base_category1
    WHERE dt = '2020-03-10'
) AS c1
    ON c2.category1_id = c1.id;

1.1.2 優惠券信息表(全量)

把 ODS 層 ods_coupon_info 表數據導入到 DWD 層優惠券信息表,在導入過程中可以做適當的清洗。

1)建表語句

-- Coupon dimension, one full snapshot per dt partition.
-- The table is Parquet, so no ROW FORMAT DELIMITED clause is declared:
-- a field delimiter has no effect on a Parquet table and only misleads readers.
DROP TABLE IF EXISTS dwd_dim_coupon_info;
CREATE EXTERNAL TABLE dwd_dim_coupon_info (
    `id`               STRING COMMENT '購物券編號',
    `coupon_name`      STRING COMMENT '購物券名稱',
    `coupon_type`      STRING COMMENT '購物券類型 1 現金券 2 折扣券 3 滿減券 4 滿件打折券',
    `condition_amount` STRING COMMENT '滿額數',
    `condition_num`    STRING COMMENT '滿件數',
    `activity_id`      STRING COMMENT '活動編號',
    `benefit_amount`   STRING COMMENT '減金額',
    `benefit_discount` STRING COMMENT '折扣',
    `create_time`      STRING COMMENT '創建時間',
    `range_type`       STRING COMMENT '范圍類型 1、商品 2、品類 3、品牌',
    `spu_id`           STRING COMMENT '商品 id',
    `tm_id`            STRING COMMENT '品牌 id',
    `category3_id`     STRING COMMENT '品類 id',
    `limit_num`        STRING COMMENT '最多領用次數',
    `operate_time`     STRING COMMENT '修改時間',
    `expire_time`      STRING COMMENT '過期時間'
)
COMMENT '優惠券信息表'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_coupon_info/'
TBLPROPERTIES ("parquet.compression"="lzo");

2)數據裝載

-- Copy the 2020-03-10 coupon snapshot from ODS into DWD column for column.
-- No cleansing or transformation is applied here.
INSERT OVERWRITE TABLE dwd_dim_coupon_info PARTITION (dt = '2020-03-10')
SELECT
    src.id,
    src.coupon_name,
    src.coupon_type,
    src.condition_amount,
    src.condition_num,
    src.activity_id,
    src.benefit_amount,
    src.benefit_discount,
    src.create_time,
    src.range_type,
    src.spu_id,
    src.tm_id,
    src.category3_id,
    src.limit_num,
    src.operate_time,
    src.expire_time
FROM ods_coupon_info AS src
WHERE src.dt = '2020-03-10';

3)查詢加載結果

-- Ad-hoc check of the partition that was just loaded.
SELECT *
FROM dwd_dim_coupon_info
WHERE dt = '2020-03-10';

1.1.3 活動維度表(全量)



1)建表語句

-- Activity (promotion) dimension, one full snapshot per dt partition.
-- The table is Parquet, so no ROW FORMAT DELIMITED clause is declared:
-- a field delimiter has no effect on a Parquet table and only misleads readers.
DROP TABLE IF EXISTS dwd_dim_activity_info;
CREATE EXTERNAL TABLE dwd_dim_activity_info (
    `id`               STRING COMMENT '編號',
    `activity_name`    STRING COMMENT '活動名稱',
    `activity_type`    STRING COMMENT '活動類型',
    `condition_amount` STRING COMMENT '滿減金額',
    `condition_num`    STRING COMMENT '滿減件數',
    `benefit_amount`   STRING COMMENT '優惠金額',
    `benefit_discount` STRING COMMENT '優惠折扣',
    `benefit_level`    STRING COMMENT '優惠級別',
    `start_time`       STRING COMMENT '開始時間',
    `end_time`         STRING COMMENT '結束時間',
    `create_time`      STRING COMMENT '創建時間'
)
COMMENT '活動信息表'
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_activity_info/'
TBLPROPERTIES ("parquet.compression"="lzo");

2)數據裝載

-- Build the 2020-03-10 activity snapshot: activity header columns plus the
-- columns of its rule. The LEFT JOIN keeps activities that have no rule row
-- (their rule columns are NULL).
-- NOTE(review): an activity with several rule rows yields one output row per
-- rule - confirm that is the intended grain.
INSERT OVERWRITE TABLE dwd_dim_activity_info PARTITION (dt = '2020-03-10')
SELECT
    ai.id,
    ai.activity_name,
    ai.activity_type,
    ar.condition_amount,
    ar.condition_num,
    ar.benefit_amount,
    ar.benefit_discount,
    ar.benefit_level,
    ai.start_time,
    ai.end_time,
    ai.create_time
FROM (
    SELECT id, activity_name, activity_type, start_time, end_time, create_time
    FROM ods_activity_info
    WHERE dt = '2020-03-10'
) AS ai
LEFT JOIN (
    SELECT activity_id, condition_amount, condition_num, benefit_amount, benefit_discount, benefit_level
    FROM ods_activity_rule
    WHERE dt = '2020-03-10'
) AS ar
    ON ai.id = ar.activity_id;

3)查詢加載結果

-- Ad-hoc check of the partition that was just loaded.
SELECT *
FROM dwd_dim_activity_info
WHERE dt = '2020-03-10';

 1.1.4 地區維度表(特殊)



1)建表語句

-- Province dimension. Unpartitioned: the table holds a single current copy.
DROP TABLE IF EXISTS dwd_dim_base_province;
CREATE EXTERNAL TABLE dwd_dim_base_province (
    `id`            STRING COMMENT 'id',
    `province_name` STRING COMMENT '省市名稱',
    `area_code`     STRING COMMENT '地區編碼',
    `iso_code`      STRING COMMENT 'ISO 編碼',
    `region_id`     STRING COMMENT '地區 id',
    `region_name`   STRING COMMENT '地區名稱'
)
COMMENT '地區省市表'
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_base_province/'
TBLPROPERTIES ("parquet.compression"="lzo");

2)數據裝載

-- Rebuild the whole province dimension: each province with the name of its
-- region. Provinces whose region_id has no match in ods_base_region are
-- dropped by the inner join.
INSERT OVERWRITE TABLE dwd_dim_base_province
SELECT
    prov.id,
    prov.name,
    prov.area_code,
    prov.iso_code,
    prov.region_id,
    reg.region_name
FROM ods_base_province AS prov
INNER JOIN ods_base_region AS reg
    ON prov.region_id = reg.id;

1.1.5 時間維度表(特殊)(預留)

1)建表語句

-- Date dimension. Unpartitioned and populated once from a local text file
-- with LOAD DATA.
-- The table is a tab-delimited TEXTFILE on purpose: LOAD DATA only places the
-- file in the table location and performs no format conversion, so declaring
-- the table as Parquet would leave a text file that cannot be read back.
-- NOTE(review): the is_workday column carries a comment meaning
-- is-weekend - confirm which meaning the data file actually uses.
DROP TABLE IF EXISTS dwd_dim_date_info;
CREATE EXTERNAL TABLE dwd_dim_date_info (
    `date_id`    STRING COMMENT '',
    `week_id`    INT    COMMENT '',
    `week_day`   INT    COMMENT '周的第幾天',
    `day`        INT    COMMENT '每月的第幾天',
    `month`      INT    COMMENT '第幾月',
    `quarter`    INT    COMMENT '第幾季度',
    `year`       INT    COMMENT '',
    `is_workday` INT    COMMENT '是否是周末',
    `holiday_id` INT    COMMENT '是否是節假日'
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/warehouse/gmall/dwd/dwd_dim_date_info/';

2)把 date_info.txt 文件上傳到 node01 的 /opt/modules/db_log/路徑

3)數據裝載

-- Append the local date file to the date dimension. The file is placed in the
-- table location as-is, so its layout must match the storage format declared
-- on the table.
LOAD DATA LOCAL INPATH '/opt/modules/db_log/date_info.txt'
INTO TABLE dwd_dim_date_info;

4)查詢加載結果

-- Ad-hoc check of the loaded date dimension.
SELECT *
FROM dwd_dim_date_info;

1.1.6 訂單明細事實表(事務型快照事實表)





1)建表語句

-- Order line fact table, partitioned by load date dt.
DROP TABLE IF EXISTS dwd_fact_order_detail;
CREATE EXTERNAL TABLE dwd_fact_order_detail (
    `id`           STRING        COMMENT '訂單編號',
    `order_id`     STRING        COMMENT '訂單號',
    `user_id`      STRING        COMMENT '用戶 id',
    `sku_id`       STRING        COMMENT 'sku 商品 id',
    `sku_name`     STRING        COMMENT '商品名稱',
    `order_price`  DECIMAL(10,2) COMMENT '商品價格',
    `sku_num`      BIGINT        COMMENT '商品數量',
    `create_time`  STRING        COMMENT '創建時間',
    `province_id`  STRING        COMMENT '省份 ID',
    `total_amount` DECIMAL(20,2) COMMENT '訂單總金額'
)
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_fact_order_detail/'
TBLPROPERTIES ("parquet.compression"="lzo");

2)數據裝載

-- Load the 2020-03-10 order lines. province_id is taken from the order header
-- and total_amount is computed per line as order_price * sku_num.
-- Lines whose order header is not in the same dt partition are dropped by the
-- inner join.
INSERT OVERWRITE TABLE dwd_fact_order_detail PARTITION (dt = '2020-03-10')
SELECT
    od.id,
    od.order_id,
    od.user_id,
    od.sku_id,
    od.sku_name,
    od.order_price,
    od.sku_num,
    od.create_time,
    oi.province_id,
    od.order_price * od.sku_num
FROM (
    SELECT id, order_id, user_id, sku_id, sku_name, order_price, sku_num, create_time
    FROM ods_order_detail
    WHERE dt = '2020-03-10'
) AS od
INNER JOIN (
    SELECT id, province_id
    FROM ods_order_info
    WHERE dt = '2020-03-10'
) AS oi
    ON od.order_id = oi.id;

3)查詢加載結果

-- Ad-hoc check of the partition that was just loaded.
SELECT *
FROM dwd_fact_order_detail
WHERE dt = '2020-03-10';

1.1.7 支付事實表(事務型快照事實表)




1)建表語句

-- Payment fact table, partitioned by load date dt.
DROP TABLE IF EXISTS dwd_fact_payment_info;
CREATE EXTERNAL TABLE dwd_fact_payment_info (
    `id`              STRING        COMMENT '',
    `out_trade_no`    STRING        COMMENT '對外業務編號',
    `order_id`        STRING        COMMENT '訂單編號',
    `user_id`         STRING        COMMENT '用戶編號',
    `alipay_trade_no` STRING        COMMENT '支付寶交易流水編號',
    `payment_amount`  DECIMAL(16,2) COMMENT '支付金額',
    `subject`         STRING        COMMENT '交易內容',
    `payment_type`    STRING        COMMENT '支付類型',
    `payment_time`    STRING        COMMENT '支付時間',
    `province_id`     STRING        COMMENT '省份 ID'
)
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_fact_payment_info/'
TBLPROPERTIES ("parquet.compression"="lzo");

2)數據裝載

-- Load the 2020-03-10 payments and attach the province of the paid order.
-- The source column total_amount feeds the target column payment_amount.
-- Payments whose order is not in the same dt partition of ods_order_info are
-- dropped by the inner join.
INSERT OVERWRITE TABLE dwd_fact_payment_info PARTITION (dt = '2020-03-10')
SELECT
    pay.id,
    pay.out_trade_no,
    pay.order_id,
    pay.user_id,
    pay.alipay_trade_no,
    pay.total_amount,
    pay.subject,
    pay.payment_type,
    pay.payment_time,
    oi.province_id
FROM (
    SELECT id, out_trade_no, order_id, user_id, alipay_trade_no, total_amount, subject, payment_type, payment_time
    FROM ods_payment_info
    WHERE dt = '2020-03-10'
) AS pay
INNER JOIN (
    SELECT id, province_id
    FROM ods_order_info
    WHERE dt = '2020-03-10'
) AS oi
    ON pay.order_id = oi.id;

3)查詢加載結果

-- Ad-hoc check of the partition that was just loaded.
SELECT *
FROM dwd_fact_payment_info
WHERE dt = '2020-03-10';

1.1.8 退款事實表(事務型快照事實表)

把 ODS 層 ods_order_refund_info 表數據導入到 DWD 層退款事實表,在導入過程中可以做適當的清洗



1)建表語句

-- Refund fact table, partitioned by load date dt.
-- No STORED AS clause is given, so the table uses the default file format
-- with tab-delimited fields (unlike the Parquet tables elsewhere in this file).
DROP TABLE IF EXISTS dwd_fact_order_refund_info;
CREATE EXTERNAL TABLE dwd_fact_order_refund_info (
    `id`                 STRING        COMMENT '編號',
    `user_id`            STRING        COMMENT '用戶 ID',
    `order_id`           STRING        COMMENT '訂單 ID',
    `sku_id`             STRING        COMMENT '商品 ID',
    `refund_type`        STRING        COMMENT '退款類型',
    `refund_num`         BIGINT        COMMENT '退款件數',
    `refund_amount`      DECIMAL(16,2) COMMENT '退款金額',
    `refund_reason_type` STRING        COMMENT '退款原因類型',
    `create_time`        STRING        COMMENT '退款時間'
)
COMMENT '退款事實表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_fact_order_refund_info/';

2)數據裝載

-- Copy the 2020-03-10 refunds from ODS into DWD column for column.
INSERT OVERWRITE TABLE dwd_fact_order_refund_info PARTITION (dt = '2020-03-10')
SELECT
    src.id,
    src.user_id,
    src.order_id,
    src.sku_id,
    src.refund_type,
    src.refund_num,
    src.refund_amount,
    src.refund_reason_type,
    src.create_time
FROM ods_order_refund_info AS src
WHERE src.dt = '2020-03-10';

3)查詢加載結果

-- Ad-hoc check of the partition that was just loaded.
SELECT *
FROM dwd_fact_order_refund_info
WHERE dt = '2020-03-10';

1.1.9 評價事實表(事務型快照事實表)

把 ODS 層 ods_comment_info 表數據導入到 DWD 層評價事實表,在導入過程中可以做適當的清洗



1)建表語句

-- Product review fact table, partitioned by load date dt.
-- No STORED AS clause is given, so the table uses the default file format
-- with tab-delimited fields.
DROP TABLE IF EXISTS dwd_fact_comment_info;
CREATE EXTERNAL TABLE dwd_fact_comment_info (
    `id`          STRING COMMENT '編號',
    `user_id`     STRING COMMENT '用戶 ID',
    `sku_id`      STRING COMMENT '商品 sku',
    `spu_id`      STRING COMMENT '商品 spu',
    `order_id`    STRING COMMENT '訂單 ID',
    `appraise`    STRING COMMENT '評價',
    `create_time` STRING COMMENT '評價時間'
)
COMMENT '評價事實表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_fact_comment_info/';

2)數據裝載

-- Copy the 2020-03-10 reviews from ODS into DWD column for column.
INSERT OVERWRITE TABLE dwd_fact_comment_info PARTITION (dt = '2020-03-10')
SELECT
    src.id,
    src.user_id,
    src.sku_id,
    src.spu_id,
    src.order_id,
    src.appraise,
    src.create_time
FROM ods_comment_info AS src
WHERE src.dt = '2020-03-10';

1.1.10 加購事實表(周期型快照事實表,每日快照)

由於購物車的數量是會發生變化,所以導增量不合適
每天做一次快照,導入的數據是全量,區別於事務型事實表是每天導入新增
周期型快照事實表劣勢:存儲的數據量會比較大
解決方案:周期型快照事實表存儲的數據比較講究時效性,時間太久了的意義不大,可以刪除以前的數據



1)建表語句

-- Shopping cart fact table: each dt partition holds a full daily snapshot.
-- No STORED AS clause is given, so the table uses the default file format
-- with tab-delimited fields.
DROP TABLE IF EXISTS dwd_fact_cart_info;
CREATE EXTERNAL TABLE dwd_fact_cart_info (
    `id`           STRING COMMENT '編號',
    `user_id`      STRING COMMENT '用戶 id',
    `sku_id`       STRING COMMENT 'skuid',
    `cart_price`   STRING COMMENT '放入購物車時價格',
    `sku_num`      STRING COMMENT '數量',
    `sku_name`     STRING COMMENT 'sku 名稱 (冗余)',
    `create_time`  STRING COMMENT '創建時間',
    `operate_time` STRING COMMENT '修改時間',
    `is_ordered`   STRING COMMENT '是否已經下單。1 為已下單;0 為未下單',
    `order_time`   STRING COMMENT '下單時間'
)
COMMENT '加購事實表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_fact_cart_info/';

2)數據裝載

-- Copy the 2020-03-10 cart snapshot from ODS into DWD column for column.
INSERT OVERWRITE TABLE dwd_fact_cart_info PARTITION (dt = '2020-03-10')
SELECT
    src.id,
    src.user_id,
    src.sku_id,
    src.cart_price,
    src.sku_num,
    src.sku_name,
    src.create_time,
    src.operate_time,
    src.is_ordered,
    src.order_time
FROM ods_cart_info AS src
WHERE src.dt = '2020-03-10';

3)查詢加載結果

-- Ad-hoc check of the partition that was just loaded.
SELECT *
FROM dwd_fact_cart_info
WHERE dt = '2020-03-10';

1.1.11 收藏事實表(周期型快照事實表,每日快照)

收藏的標記,是否取消,會發生變化,做增量不合適
每天做一次快照,導入的數據是全量,區別於事務型事實表是每天導入新增


1)建表語句

-- Favorites fact table: each dt partition holds a full daily snapshot.
-- No STORED AS clause is given, so the table uses the default file format
-- with tab-delimited fields.
DROP TABLE IF EXISTS dwd_fact_favor_info;
CREATE EXTERNAL TABLE dwd_fact_favor_info (
    `id`          STRING COMMENT '編號',
    `user_id`     STRING COMMENT '用戶 id',
    `sku_id`      STRING COMMENT 'skuid',
    `spu_id`      STRING COMMENT 'spuid',
    `is_cancel`   STRING COMMENT '是否取消',
    `create_time` STRING COMMENT '收藏時間',
    `cancel_time` STRING COMMENT '取消時間'
)
COMMENT '收藏事實表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_fact_favor_info/';

2)數據裝載

-- Copy the 2020-03-10 favorites snapshot from ODS into DWD column for column.
INSERT OVERWRITE TABLE dwd_fact_favor_info PARTITION (dt = '2020-03-10')
SELECT
    src.id,
    src.user_id,
    src.sku_id,
    src.spu_id,
    src.is_cancel,
    src.create_time,
    src.cancel_time
FROM ods_favor_info AS src
WHERE src.dt = '2020-03-10';

3)查詢加載結果

-- Ad-hoc check of the partition that was just loaded.
SELECT *
FROM dwd_fact_favor_info
WHERE dt = '2020-03-10';

1.1.12 優惠券領用事實表(累積型快照事實表)



優惠券的生命周期:領取優惠券 -> 用優惠券下單 -> 優惠券參與支付

累積型快照事實表使用:統計優惠券領取次數、優惠券下單次數、優惠券參與支付次數
1)建表語句

-- Coupon usage fact table (accumulating snapshot): one row per issued coupon
-- carrying its get / order / pay timestamps.
-- No STORED AS clause is given, so the table uses the default file format
-- with tab-delimited fields.
DROP TABLE IF EXISTS dwd_fact_coupon_use;
CREATE EXTERNAL TABLE dwd_fact_coupon_use (
    `id`            STRING COMMENT '編號',
    `coupon_id`     STRING COMMENT '優惠券 ID',
    `user_id`       STRING COMMENT 'userid',
    `order_id`      STRING COMMENT '訂單 id',
    `coupon_status` STRING COMMENT '優惠券狀態',
    `get_time`      STRING COMMENT '領取時間',
    `using_time`    STRING COMMENT '使用時間(下單)',
    `used_time`     STRING COMMENT '使用時間(支付)'
)
COMMENT '優惠券領用事實表'
PARTITIONED BY (`dt` STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION '/warehouse/gmall/dwd/dwd_fact_coupon_use/';

注意:dt 是按照優惠券領用時間 get_time 作為分區

2)數據裝載

SET hive.exec.dynamic.partition.mode=nonstrict;

-- Merge the coupon changes of 2020-03-10 into the accumulating fact table.
-- The target partition is computed per row from get_time (last select column),
-- which is why no static dt value is given and nonstrict mode is set above.
-- hist = rows already stored in the partitions touched by the incoming data
-- incr = rows arriving from ODS for 2020-03-10
-- For every column the incoming value wins when it is not NULL, otherwise the
-- stored value is kept.
INSERT OVERWRITE TABLE dwd_fact_coupon_use PARTITION (dt)
SELECT
    COALESCE(incr.id, hist.id),
    COALESCE(incr.coupon_id, hist.coupon_id),
    COALESCE(incr.user_id, hist.user_id),
    COALESCE(incr.order_id, hist.order_id),
    COALESCE(incr.coupon_status, hist.coupon_status),
    COALESCE(incr.get_time, hist.get_time),
    COALESCE(incr.using_time, hist.using_time),
    COALESCE(incr.used_time, hist.used_time),
    DATE_FORMAT(COALESCE(incr.get_time, hist.get_time), 'yyyy-MM-dd')
FROM (
    SELECT id, coupon_id, user_id, order_id, coupon_status, get_time, using_time, used_time
    FROM dwd_fact_coupon_use
    WHERE dt IN (
        SELECT DATE_FORMAT(get_time, 'yyyy-MM-dd')
        FROM ods_coupon_use
        WHERE dt = '2020-03-10'
    )
) AS hist
FULL OUTER JOIN (
    SELECT id, coupon_id, user_id, order_id, coupon_status, get_time, using_time, used_time
    FROM ods_coupon_use
    WHERE dt = '2020-03-10'
) AS incr
    ON hist.id = incr.id;

1.1.13 訂單事實表(累積型快照事實表)

1)concat 函數

concat 函數在連接字符串的時候,只要其中一個是 NULL,那么將返回 NULL

hive> select concat('a','b');
ab
hive> select concat('a','b',null);
NULL

2)concat_ws 函數

concat_ws 函數在連接字符串的時候,只要有一個字符串不是 NULL,就不會返回 NULL。concat_ws 函數需要指定分隔符

hive> select concat_ws('-','a','b');
a-b
hive> select concat_ws('-','a','b',null);
a-b
hive> select concat_ws('','a','b',null);
ab

3)STR_TO_MAP 函數

  • (1)語法描述


STR_TO_MAP(VARCHAR text, VARCHAR listDelimiter, VARCHAR keyValueDelimiter)

  • (2)功能描述


使用 listDelimiter 將 text 分隔成 K-V 對,然后使用 keyValueDelimiter 分隔每個 K-V 對,
組裝成 MAP 返回。默認 listDelimiter 為( ,),keyValueDelimiter 為(=)。

  • (3)案例


str_to_map('1001=2020-03-10,1002=2020-03-10', ',', '=')
輸出 {"1001":"2020-03-10","1002":"2020-03-10"}

4)建表語句



訂單生命周期:創建時間=》支付時間=》取消時間=》完成時間=》退款時間=》退款完成時間

由於 ODS 層訂單表只有創建時間和操作時間兩個狀態,不能表達所有時間含義,所以需要關聯訂單狀態表。訂單事實表里面增加了活動 id,所以需要關聯活動訂單表

-- Order fact table (accumulating snapshot): one row per order with one
-- timestamp column per lifecycle status.
-- NOTE(review): the amount columns are STRING except final_total_amount,
-- which is DECIMAL(10,2) - confirm this mix is intended.
DROP TABLE IF EXISTS dwd_fact_order_info;
CREATE EXTERNAL TABLE dwd_fact_order_info (
    `id`                    STRING        COMMENT '訂單編號',
    `order_status`          STRING        COMMENT '訂單狀態',
    `user_id`               STRING        COMMENT '用戶 id',
    `out_trade_no`          STRING        COMMENT '支付流水號',
    `create_time`           STRING        COMMENT '創建時間(未支付狀態)',
    `payment_time`          STRING        COMMENT '支付時間(已支付狀態)',
    `cancel_time`           STRING        COMMENT '取消時間(已取消狀態)',
    `finish_time`           STRING        COMMENT '完成時間(已完成狀態)',
    `refund_time`           STRING        COMMENT '退款時間(退款中狀態)',
    `refund_finish_time`    STRING        COMMENT '退款完成時間(退款完成狀態)',
    `province_id`           STRING        COMMENT '省份 ID',
    `activity_id`           STRING        COMMENT '活動 ID',
    `original_total_amount` STRING        COMMENT '原價金額',
    `benefit_reduce_amount` STRING        COMMENT '優惠金額',
    `feight_fee`            STRING        COMMENT '運費',
    `final_total_amount`    DECIMAL(10,2) COMMENT '訂單金額'
)
PARTITIONED BY (`dt` STRING)
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_fact_order_info/'
TBLPROPERTIES ("parquet.compression"="lzo");

5)數據裝載



5)常用函數

更多函數請點擊博客【Hive】Hive入門解析(五)

6)數據裝載

SET hive.exec.dynamic.partition.mode=nonstrict;

-- Merge the order changes of 2020-03-10 into the accumulating order fact table.
-- The target partition is computed per row from the creation time (last select
-- column), which is why nonstrict dynamic partitioning is set above.
-- hist = rows already stored in the partitions touched by the incoming orders
-- incr = orders from ODS for 2020-03-10 with a map tms of status code to
--        operate_time built from the status log, plus the optional activity id
-- For every column the incoming value wins when it is not NULL, otherwise the
-- stored value is kept.
INSERT OVERWRITE TABLE dwd_fact_order_info PARTITION (dt)
SELECT
    COALESCE(incr.id, hist.id),
    COALESCE(incr.order_status, hist.order_status),
    COALESCE(incr.user_id, hist.user_id),
    COALESCE(incr.out_trade_no, hist.out_trade_no),
    COALESCE(incr.tms['1001'], hist.create_time),         -- 1001: unpaid
    COALESCE(incr.tms['1002'], hist.payment_time),        -- 1002: feeds payment_time
    COALESCE(incr.tms['1003'], hist.cancel_time),         -- 1003: feeds cancel_time
    COALESCE(incr.tms['1004'], hist.finish_time),         -- 1004: feeds finish_time
    COALESCE(incr.tms['1005'], hist.refund_time),         -- 1005: feeds refund_time
    COALESCE(incr.tms['1006'], hist.refund_finish_time),  -- 1006: feeds refund_finish_time
    COALESCE(incr.province_id, hist.province_id),
    COALESCE(incr.activity_id, hist.activity_id),
    COALESCE(incr.original_total_amount, hist.original_total_amount),
    COALESCE(incr.benefit_reduce_amount, hist.benefit_reduce_amount),
    COALESCE(incr.feight_fee, hist.feight_fee),
    COALESCE(incr.final_total_amount, hist.final_total_amount),
    DATE_FORMAT(COALESCE(incr.tms['1001'], hist.create_time), 'yyyy-MM-dd')
FROM (
    SELECT
        id,
        order_status,
        user_id,
        out_trade_no,
        create_time,
        payment_time,
        cancel_time,
        finish_time,
        refund_time,
        refund_finish_time,
        province_id,
        activity_id,
        original_total_amount,
        benefit_reduce_amount,
        feight_fee,
        final_total_amount
    FROM dwd_fact_order_info
    WHERE dt IN (
        SELECT DATE_FORMAT(create_time, 'yyyy-MM-dd')
        FROM ods_order_info
        WHERE dt = '2020-03-10'
    )
) AS hist
FULL OUTER JOIN (
    SELECT
        oi.id,
        oi.order_status,
        oi.user_id,
        oi.out_trade_no,
        oi.province_id,
        ao.activity_id,
        sl.tms,
        oi.original_total_amount,
        oi.benefit_reduce_amount,
        oi.feight_fee,
        oi.final_total_amount
    FROM (
        -- One row per order: all status=time pairs of the day folded into a map
        SELECT
            order_id,
            STR_TO_MAP(
                CONCAT_WS(',', COLLECT_SET(CONCAT(order_status, '=', operate_time))),
                ',',
                '='
            ) AS tms
        FROM ods_order_status_log
        WHERE dt = '2020-03-10'
        GROUP BY order_id
    ) AS sl
    INNER JOIN (
        SELECT
            id,
            order_status,
            user_id,
            out_trade_no,
            province_id,
            original_total_amount,
            benefit_reduce_amount,
            feight_fee,
            final_total_amount
        FROM ods_order_info
        WHERE dt = '2020-03-10'
    ) AS oi
        ON sl.order_id = oi.id
    LEFT JOIN (
        SELECT order_id, activity_id
        FROM ods_activity_order
        WHERE dt = '2020-03-10'
    ) AS ao
        ON sl.order_id = ao.order_id
) AS incr
    ON hist.id = incr.id;

1.1.14 用戶維度表(拉鏈表)

用戶表中的數據每日既有可能新增,也有可能修改,但修改頻率並不高,屬於緩慢變化維度,此處采用拉鏈表存儲用戶維度數據。

1)什么是拉鏈表



2)為什么要做拉鏈表


3)拉鏈表形成過程



4)拉鏈表制作過程圖



5)拉鏈表制作過程

步驟 0:初始化拉鏈表(首次獨立執行)

(1)建立拉鏈表

-- User dimension as a zipper (slowly changing) table: every version of a user
-- row carries the date range [start_date, end_date] in which it was valid.
-- The open-ended current version uses the sentinel end_date 9999-99-99.
-- The table comment names the user zipper table (it previously said order).
DROP TABLE IF EXISTS dwd_dim_user_info_his;
CREATE EXTERNAL TABLE dwd_dim_user_info_his (
    `id`           STRING COMMENT '用戶 id',
    `name`         STRING COMMENT '姓名',
    `birthday`     STRING COMMENT '生日',
    `gender`       STRING COMMENT '性別',
    `email`        STRING COMMENT '郵箱',
    `user_level`   STRING COMMENT '用戶等級',
    `create_time`  STRING COMMENT '創建時間',
    `operate_time` STRING COMMENT '操作時間',
    `start_date`   STRING COMMENT '有效開始日期',
    `end_date`     STRING COMMENT '有效結束日期'
)
COMMENT '用戶拉鏈表'
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_user_info_his/'
TBLPROPERTIES ("parquet.compression"="lzo");

(2)初始化拉鏈表

-- One-off initialisation of the zipper table: every user row of the
-- 2020-03-10 ODS partition becomes an open version that starts on that date
-- and ends at the sentinel 9999-99-99.
INSERT OVERWRITE TABLE dwd_dim_user_info_his
SELECT
    src.id,
    src.name,
    src.birthday,
    src.gender,
    src.email,
    src.user_level,
    src.create_time,
    src.operate_time,
    '2020-03-10',
    '9999-99-99'
FROM ods_user_info AS src
WHERE src.dt = '2020-03-10';

步驟 1:制作當日變動數據(包括新增,修改)每日執行

(1)如何獲得每日變動表

  • a.最好表內有創建時間和變動時間(Lucky!)
  • b.如果沒有,可以利用第三方工具監控比如 canal,監控 MySQL 的實時變化進行記錄(麻煩)
  • c.逐行對比前后兩天的數據,檢查 md5(concat(全部有可能變化的字段))是否相同(low)
  • d.要求業務數據庫提供變動流水(人品,顏值)


(2)因為 ods_user_info 本身導入過來就是新增變動明細的表,所以不用處理

  • a)數據庫中新增 2020-03-11 一天的數據
  • b)通過 Sqoop 把 2020-03-11 日所有數據導入mysqlTohdfs.sh all 2020-03-11
  • c)ods 層數據導入hdfs_to_ods_db.sh all 2020-03-11


步驟 2:先合並變動信息,再追加新增信息,插入到臨時表中

1)建立臨時表

-- Staging table for the user zipper merge, with the same columns as
-- dwd_dim_user_info_his.
-- The table comment names the user zipper staging table (it previously said
-- order).
DROP TABLE IF EXISTS dwd_dim_user_info_his_tmp;
CREATE EXTERNAL TABLE dwd_dim_user_info_his_tmp (
    `id`           STRING COMMENT '用戶 id',
    `name`         STRING COMMENT '姓名',
    `birthday`     STRING COMMENT '生日',
    `gender`       STRING COMMENT '性別',
    `email`        STRING COMMENT '郵箱',
    `user_level`   STRING COMMENT '用戶等級',
    `create_time`  STRING COMMENT '創建時間',
    `operate_time` STRING COMMENT '操作時間',
    `start_date`   STRING COMMENT '有效開始日期',
    `end_date`     STRING COMMENT '有效結束日期'
)
COMMENT '用戶拉鏈臨時表'
STORED AS PARQUET
LOCATION '/warehouse/gmall/dwd/dwd_dim_user_info_his_tmp/'
TBLPROPERTIES ("parquet.compression"="lzo");

2)導入腳本

-- Zipper merge for 2020-03-11, written to the staging table.
-- Branch 1: every user row of the 2020-03-11 ODS partition becomes a new open
--           version starting on that date.
-- Branch 2: every existing version is carried over. An open version whose
--           user appears in the 2020-03-11 partition is closed on the day
--           before (dt minus one day). All other versions keep their end_date.
-- NOTE(review): running this twice for the same date against an already
-- merged history would add the open versions again - confirm the job is only
-- run once per day.
INSERT OVERWRITE TABLE dwd_dim_user_info_his_tmp
SELECT
    merged.id,
    merged.name,
    merged.birthday,
    merged.gender,
    merged.email,
    merged.user_level,
    merged.create_time,
    merged.operate_time,
    merged.start_date,
    merged.end_date
FROM (
    SELECT
        id,
        name,
        birthday,
        gender,
        email,
        user_level,
        create_time,
        operate_time,
        '2020-03-11' AS start_date,
        '9999-99-99' AS end_date
    FROM ods_user_info
    WHERE dt = '2020-03-11'

    UNION ALL

    SELECT
        uh.id,
        uh.name,
        uh.birthday,
        uh.gender,
        uh.email,
        uh.user_level,
        uh.create_time,
        uh.operate_time,
        uh.start_date,
        IF(ui.id IS NOT NULL AND uh.end_date = '9999-99-99',
           DATE_ADD(ui.dt, -1),
           uh.end_date) AS end_date
    FROM dwd_dim_user_info_his AS uh
    LEFT JOIN (
        SELECT id, dt
        FROM ods_user_info
        WHERE dt = '2020-03-11'
    ) AS ui
        ON uh.id = ui.id
) AS merged
ORDER BY merged.id, merged.start_date;

1.1.15 DWD 層數據導入腳本

1)vim ods_to_dwd_db.sh

#!/bin/bash
# Loads business data from the ODS layer into the DWD layer with Hive.
# Arguments: $1 = run mode (first or all), $2 = optional business date.

# Prefix used to qualify every table name in the SQL below.
APP=gmall
# Path of the Hive CLI used to run the SQL.
hive=/opt/modules/hive/bin/hive

# Use the date passed as the second argument. When it is missing or empty,
# fall back to yesterday in yyyy-MM-dd form.
do_date=${2:-$(date -d "-1 day" +%F)}
sql1="
set hive.exec.dynamic.partition.mode=nonstrict;

-- SKU dimension snapshot for the run date: brand, SPU and the three category
-- levels are attached to every SKU. All joins are inner joins, so a SKU
-- without a match in the same dt partition is not written.
INSERT OVERWRITE TABLE ${APP}.dwd_dim_sku_info PARTITION (dt = '$do_date')
SELECT
    sku.id,
    sku.spu_id,
    sku.price,
    sku.sku_name,
    sku.sku_desc,
    sku.weight,
    sku.tm_id,
    tm.tm_name,
    sku.category3_id,
    c2.id   AS category2_id,
    c1.id   AS category1_id,
    c3.name AS category3_name,
    c2.name AS category2_name,
    c1.name AS category1_name,
    spu.spu_name,
    sku.create_time
FROM (
    SELECT id, spu_id, price, sku_name, sku_desc, weight, tm_id, category3_id, create_time
    FROM ${APP}.ods_sku_info
    WHERE dt = '$do_date'
) AS sku
INNER JOIN (
    SELECT tm_id, tm_name
    FROM ${APP}.ods_base_trademark
    WHERE dt = '$do_date'
) AS tm
    ON sku.tm_id = tm.tm_id
INNER JOIN (
    SELECT id, spu_name
    FROM ${APP}.ods_spu_info
    WHERE dt = '$do_date'
) AS spu
    ON spu.id = sku.spu_id
INNER JOIN (
    SELECT id, name, category2_id
    FROM ${APP}.ods_base_category3
    WHERE dt = '$do_date'
) AS c3
    ON sku.category3_id = c3.id
INNER JOIN (
    SELECT id, name, category1_id
    FROM ${APP}.ods_base_category2
    WHERE dt = '$do_date'
) AS c2
    ON c3.category2_id = c2.id
INNER JOIN (
    SELECT id, name
    FROM ${APP}.ods_base_category1
    WHERE dt = '$do_date'
) AS c1
    ON c2.category1_id = c1.id;


-- Coupon dimension snapshot for the run date, copied column for column.
INSERT OVERWRITE TABLE ${APP}.dwd_dim_coupon_info PARTITION (dt = '$do_date')
SELECT
    src.id,
    src.coupon_name,
    src.coupon_type,
    src.condition_amount,
    src.condition_num,
    src.activity_id,
    src.benefit_amount,
    src.benefit_discount,
    src.create_time,
    src.range_type,
    src.spu_id,
    src.tm_id,
    src.category3_id,
    src.limit_num,
    src.operate_time,
    src.expire_time
FROM ${APP}.ods_coupon_info AS src
WHERE src.dt = '$do_date';


-- Activity dimension snapshot for the run date. The LEFT JOIN keeps
-- activities that have no rule row (their rule columns are NULL).
INSERT OVERWRITE TABLE ${APP}.dwd_dim_activity_info PARTITION (dt = '$do_date')
SELECT
    ai.id,
    ai.activity_name,
    ai.activity_type,
    ar.condition_amount,
    ar.condition_num,
    ar.benefit_amount,
    ar.benefit_discount,
    ar.benefit_level,
    ai.start_time,
    ai.end_time,
    ai.create_time
FROM (
    SELECT id, activity_name, activity_type, start_time, end_time, create_time
    FROM ${APP}.ods_activity_info
    WHERE dt = '$do_date'
) AS ai
LEFT JOIN (
    SELECT activity_id, condition_amount, condition_num, benefit_amount, benefit_discount, benefit_level
    FROM ${APP}.ods_activity_rule
    WHERE dt = '$do_date'
) AS ar
    ON ai.id = ar.activity_id;


-- Order lines of the run date. province_id comes from the order header and
-- the last column is the line amount order_price * sku_num.
INSERT OVERWRITE TABLE ${APP}.dwd_fact_order_detail PARTITION (dt = '$do_date')
SELECT
    od.id,
    od.order_id,
    od.user_id,
    od.sku_id,
    od.sku_name,
    od.order_price,
    od.sku_num,
    od.create_time,
    oi.province_id,
    od.order_price * od.sku_num
FROM (
    SELECT id, order_id, user_id, sku_id, sku_name, order_price, sku_num, create_time
    FROM ${APP}.ods_order_detail
    WHERE dt = '$do_date'
) AS od
INNER JOIN (
    SELECT id, province_id
    FROM ${APP}.ods_order_info
    WHERE dt = '$do_date'
) AS oi
    ON od.order_id = oi.id;


-- Payments of the run date with the province of the paid order attached.
-- The source column total_amount feeds the target column payment_amount.
INSERT OVERWRITE TABLE ${APP}.dwd_fact_payment_info PARTITION (dt = '$do_date')
SELECT
    pay.id,
    pay.out_trade_no,
    pay.order_id,
    pay.user_id,
    pay.alipay_trade_no,
    pay.total_amount,
    pay.subject,
    pay.payment_type,
    pay.payment_time,
    oi.province_id
FROM (
    SELECT id, out_trade_no, order_id, user_id, alipay_trade_no, total_amount, subject, payment_type, payment_time
    FROM ${APP}.ods_payment_info
    WHERE dt = '$do_date'
) AS pay
INNER JOIN (
    SELECT id, province_id
    FROM ${APP}.ods_order_info
    WHERE dt = '$do_date'
) AS oi
    ON pay.order_id = oi.id;


-- Refunds of the run date, copied column for column.
INSERT OVERWRITE TABLE ${APP}.dwd_fact_order_refund_info PARTITION (dt = '$do_date')
SELECT
    src.id,
    src.user_id,
    src.order_id,
    src.sku_id,
    src.refund_type,
    src.refund_num,
    src.refund_amount,
    src.refund_reason_type,
    src.create_time
FROM ${APP}.ods_order_refund_info AS src
WHERE src.dt = '$do_date';


-- Reviews of the run date, copied column for column.
INSERT OVERWRITE TABLE ${APP}.dwd_fact_comment_info PARTITION (dt = '$do_date')
SELECT
    src.id,
    src.user_id,
    src.sku_id,
    src.spu_id,
    src.order_id,
    src.appraise,
    src.create_time
FROM ${APP}.ods_comment_info AS src
WHERE src.dt = '$do_date';


-- Cart snapshot of the run date, copied column for column.
INSERT OVERWRITE TABLE ${APP}.dwd_fact_cart_info PARTITION (dt = '$do_date')
SELECT
    src.id,
    src.user_id,
    src.sku_id,
    src.cart_price,
    src.sku_num,
    src.sku_name,
    src.create_time,
    src.operate_time,
    src.is_ordered,
    src.order_time
FROM ${APP}.ods_cart_info AS src
WHERE src.dt = '$do_date';


-- Favorites snapshot of the run date, copied column for column.
INSERT OVERWRITE TABLE ${APP}.dwd_fact_favor_info PARTITION (dt = '$do_date')
SELECT
    src.id,
    src.user_id,
    src.sku_id,
    src.spu_id,
    src.is_cancel,
    src.create_time,
    src.cancel_time
FROM ${APP}.ods_favor_info AS src
WHERE src.dt = '$do_date';


-- Merge the coupon changes of the run date into the accumulating fact table.
-- The target partition is computed per row from get_time (last select column).
-- hist = rows already stored in the partitions touched by the incoming data
-- incr = rows arriving from ODS for the run date
-- The incoming value wins when it is not NULL, otherwise the stored one is kept.
INSERT OVERWRITE TABLE ${APP}.dwd_fact_coupon_use PARTITION (dt)
SELECT
    COALESCE(incr.id, hist.id),
    COALESCE(incr.coupon_id, hist.coupon_id),
    COALESCE(incr.user_id, hist.user_id),
    COALESCE(incr.order_id, hist.order_id),
    COALESCE(incr.coupon_status, hist.coupon_status),
    COALESCE(incr.get_time, hist.get_time),
    COALESCE(incr.using_time, hist.using_time),
    COALESCE(incr.used_time, hist.used_time),
    DATE_FORMAT(COALESCE(incr.get_time, hist.get_time), 'yyyy-MM-dd')
FROM (
    SELECT id, coupon_id, user_id, order_id, coupon_status, get_time, using_time, used_time
    FROM ${APP}.dwd_fact_coupon_use
    WHERE dt IN (
        SELECT DATE_FORMAT(get_time, 'yyyy-MM-dd')
        FROM ${APP}.ods_coupon_use
        WHERE dt = '$do_date'
    )
) AS hist
FULL OUTER JOIN (
    SELECT id, coupon_id, user_id, order_id, coupon_status, get_time, using_time, used_time
    FROM ${APP}.ods_coupon_use
    WHERE dt = '$do_date'
) AS incr
    ON hist.id = incr.id;


-- Merge the order changes of the run date into the accumulating order fact
-- table. The target partition is computed per row from the creation time
-- (last select column).
-- hist = rows already stored in the partitions touched by the incoming orders
-- incr = orders from ODS for the run date with a map tms of status code to
--        operate_time built from the status log, plus the optional activity id
-- The incoming value wins when it is not NULL, otherwise the stored one is kept.
-- The key/value delimiter passed to STR_TO_MAP must be exactly the equals sign
-- on a single line. A line break inside that literal makes every map lookup
-- return NULL, which nulls the timestamps and the dynamic partition value.
INSERT OVERWRITE TABLE ${APP}.dwd_fact_order_info PARTITION (dt)
SELECT
    COALESCE(incr.id, hist.id),
    COALESCE(incr.order_status, hist.order_status),
    COALESCE(incr.user_id, hist.user_id),
    COALESCE(incr.out_trade_no, hist.out_trade_no),
    COALESCE(incr.tms['1001'], hist.create_time),         -- 1001: unpaid
    COALESCE(incr.tms['1002'], hist.payment_time),        -- 1002: feeds payment_time
    COALESCE(incr.tms['1003'], hist.cancel_time),         -- 1003: feeds cancel_time
    COALESCE(incr.tms['1004'], hist.finish_time),         -- 1004: feeds finish_time
    COALESCE(incr.tms['1005'], hist.refund_time),         -- 1005: feeds refund_time
    COALESCE(incr.tms['1006'], hist.refund_finish_time),  -- 1006: feeds refund_finish_time
    COALESCE(incr.province_id, hist.province_id),
    COALESCE(incr.activity_id, hist.activity_id),
    COALESCE(incr.original_total_amount, hist.original_total_amount),
    COALESCE(incr.benefit_reduce_amount, hist.benefit_reduce_amount),
    COALESCE(incr.feight_fee, hist.feight_fee),
    COALESCE(incr.final_total_amount, hist.final_total_amount),
    DATE_FORMAT(COALESCE(incr.tms['1001'], hist.create_time), 'yyyy-MM-dd')
FROM (
    SELECT
        id,
        order_status,
        user_id,
        out_trade_no,
        create_time,
        payment_time,
        cancel_time,
        finish_time,
        refund_time,
        refund_finish_time,
        province_id,
        activity_id,
        original_total_amount,
        benefit_reduce_amount,
        feight_fee,
        final_total_amount
    FROM ${APP}.dwd_fact_order_info
    WHERE dt IN (
        SELECT DATE_FORMAT(create_time, 'yyyy-MM-dd')
        FROM ${APP}.ods_order_info
        WHERE dt = '$do_date'
    )
) AS hist
FULL OUTER JOIN (
    SELECT
        oi.id,
        oi.order_status,
        oi.user_id,
        oi.out_trade_no,
        oi.province_id,
        ao.activity_id,
        sl.tms,
        oi.original_total_amount,
        oi.benefit_reduce_amount,
        oi.feight_fee,
        oi.final_total_amount
    FROM (
        -- One row per order: all status=time pairs of the day folded into a map
        SELECT
            order_id,
            STR_TO_MAP(CONCAT_WS(',', COLLECT_SET(CONCAT(order_status, '=', operate_time))), ',', '=') AS tms
        FROM ${APP}.ods_order_status_log
        WHERE dt = '$do_date'
        GROUP BY order_id
    ) AS sl
    INNER JOIN (
        SELECT
            id,
            order_status,
            user_id,
            out_trade_no,
            province_id,
            original_total_amount,
            benefit_reduce_amount,
            feight_fee,
            final_total_amount
        FROM ${APP}.ods_order_info
        WHERE dt = '$do_date'
    ) AS oi
        ON sl.order_id = oi.id
    LEFT JOIN (
        SELECT order_id, activity_id
        FROM ${APP}.ods_activity_order
        WHERE dt = '$do_date'
    ) AS ao
        ON sl.order_id = ao.order_id
) AS incr
    ON hist.id = incr.id;


-- Zipper merge for the run date, written to the staging table.
-- Branch 1: every user row of the run date becomes a new open version.
-- Branch 2: every existing version is carried over. An open version whose
--           user appears in the run date partition is closed on the day
--           before. All other versions keep their end_date.
INSERT OVERWRITE TABLE ${APP}.dwd_dim_user_info_his_tmp
SELECT
    merged.id,
    merged.name,
    merged.birthday,
    merged.gender,
    merged.email,
    merged.user_level,
    merged.create_time,
    merged.operate_time,
    merged.start_date,
    merged.end_date
FROM (
    SELECT
        id,
        name,
        birthday,
        gender,
        email,
        user_level,
        create_time,
        operate_time,
        '$do_date'   AS start_date,
        '9999-99-99' AS end_date
    FROM ${APP}.ods_user_info
    WHERE dt = '$do_date'

    UNION ALL

    SELECT
        uh.id,
        uh.name,
        uh.birthday,
        uh.gender,
        uh.email,
        uh.user_level,
        uh.create_time,
        uh.operate_time,
        uh.start_date,
        IF(ui.id IS NOT NULL AND uh.end_date = '9999-99-99',
           DATE_ADD(ui.dt, -1),
           uh.end_date) AS end_date
    FROM ${APP}.dwd_dim_user_info_his AS uh
    LEFT JOIN (
        SELECT id, dt
        FROM ${APP}.ods_user_info
        WHERE dt = '$do_date'
    ) AS ui
        ON uh.id = ui.id
) AS merged
ORDER BY merged.id, merged.start_date;


-- Replace the zipper table with the merged content of the staging table.
INSERT OVERWRITE TABLE ${APP}.dwd_dim_user_info_his
SELECT
    tmp.id,
    tmp.name,
    tmp.birthday,
    tmp.gender,
    tmp.email,
    tmp.user_level,
    tmp.create_time,
    tmp.operate_time,
    tmp.start_date,
    tmp.end_date
FROM ${APP}.dwd_dim_user_info_his_tmp AS tmp;
"

sql2="
-- Rebuild the whole province dimension: each province with the name of its
-- region. Provinces without a matching region are dropped by the inner join.
INSERT OVERWRITE TABLE ${APP}.dwd_dim_base_province
SELECT
    prov.id,
    prov.name,
    prov.area_code,
    prov.iso_code,
    prov.region_id,
    reg.region_name
FROM ${APP}.ods_base_province AS prov
INNER JOIN ${APP}.ods_base_region AS reg
    ON prov.region_id = reg.id;
"

# Dispatch on the run mode given as the first argument.
#   first - run the daily statements (sql1) and then the province load (sql2)
#   all   - run the daily statements (sql1) only
# Any other value, including a missing argument, is an error: print the usage
# to stderr and exit non-zero instead of silently doing nothing.
case "$1" in
"first")
    $hive -e "$sql1"
    $hive -e "$sql2"
    ;;
"all")
    $hive -e "$sql1"
    ;;
*)
    echo "Usage: $0 {first|all} [yyyy-MM-dd]" >&2
    exit 1
    ;;
esac

2)增加腳本執行權限

# Give owner and group read, write and execute permission on the script.
chmod 770 ods_to_dwd_db.sh

3)執行腳本導入數據

# Run the script in "all" mode for business date 2020-03-11.
ods_to_dwd_db.sh all 2020-03-11

4)查看導入數據

-- Ad-hoc checks of the 2020-03-11 partitions written by the script.
SELECT *
FROM dwd_fact_order_info
WHERE dt = '2020-03-11';

SELECT *
FROM dwd_fact_order_detail
WHERE dt = '2020-03-11';

SELECT *
FROM dwd_fact_comment_info
WHERE dt = '2020-03-11';

SELECT *
FROM dwd_fact_order_refund_info
WHERE dt = '2020-03-11';

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM