拉鏈表設計:
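在企業中,有些流水表每日新增幾千萬條記錄,如果數據倉庫每天都保存一份全量數據並保留5年,存儲很容易不堪重負。拉鏈表只在記錄發生變化時才新增一個版本,並用 start_date / end_date 標記每個版本的有效區間(end_date = '99991231' 代表當前有效的數據),因此可以使用拉鏈表的算法來節省存儲空間。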
例子:
-- Daily user snapshot table: the full data set collected on a given day is
-- stored here, tagged with that day's business date.
CREATE TABLE dwd.user_info (
    id       string,
    name     string,
    sex      string,
    biz_date string  -- business date, written as yyyyMMdd in the sample rows
);

-- Consolidated user table (the zipper table): one row per version of a user,
-- with the validity interval held in start_date / end_date.
CREATE TABLE dws.user_merge_info (
    id         string,
    name       string,
    sex        string,
    start_date string,
    end_date   string
);

-- Insert test rows into the snapshot table.
INSERT INTO dwd.user_info
SELECT '1', 'YaoMing', 'boy', '20190701'
UNION ALL
SELECT '2', 'YaoLinlin', 'girl', '20190701'
UNION ALL
SELECT '3', 'CaiLili', 'girl', '20190701'
UNION ALL
SELECT '4', 'ZhangSan', 'girl', '20190702'
UNION ALL
SELECT '5', 'LiSi', 'girl', '20190702';

-- Inspect the data.
SELECT
    id,
    name,
    sex,
    biz_date
FROM dwd.user_info;

-- Initialise the zipper table: keep one row per user id and open it with
-- end_date = '99991231'.
-- NOTE(review): start_date is hard-coded to '20190701' for every row, even
-- though some sample rows carry biz_date '20190702' — confirm this is intended.
INSERT OVERWRITE TABLE dws.user_merge_info
SELECT
    id,
    name,
    sex,
    '20190701' AS start_date,
    '99991231' AS end_date
FROM (
    SELECT
        id,
        name,
        sex,
        -- Group by the primary key id and rank newest first, so that
        -- row_num = 1 is the most recently modified row for that id.
        row_number() OVER (PARTITION BY id ORDER BY biz_date DESC) AS row_num
    FROM dwd.user_info
) t
WHERE t.row_num = 1;

-- Inspect the data.
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM dws.user_merge_info;

-- On biz_date = '20190702' the snapshot receives one brand-new user (id = 6)
-- and one changed user (id = 2, whose sex value differs from the 20190701 row).
INSERT INTO dwd.user_info
SELECT '6', 'WangWu', 'boy', '20190702'
UNION ALL
SELECT '2', 'YaoLinlin', 'boy', '20190702';

-- Inspect the data.
SELECT
    id,
    name,
    sex,
    biz_date
FROM dwd.user_info
ORDER BY id, biz_date;

-- Stage every row that has to be opened for the business date: new versions
-- of changed users plus users seen for the first time.
INSERT OVERWRITE TABLE tmp.user_merge_info_new
-- Changed users: today's snapshot row differs in name or sex from the
-- currently open zipper row of the same id.
SELECT
    cur.id,
    cur.name,
    cur.sex,
    '20190702' AS start_date,  -- ${bizdate}, the business date
    '99991231' AS end_date     -- 99991231 marks the row as currently valid
FROM dws.user_merge_info AS hist
INNER JOIN dwd.user_info AS cur
    ON hist.id = cur.id
WHERE hist.end_date = '99991231'
  AND cur.biz_date = '20190702'  -- ${bizdate}: only today's snapshot rows
  AND (
      hist.name <> cur.name
      OR hist.sex <> cur.sex
  )
UNION ALL
-- Brand-new users: today's snapshot rows whose id has no zipper row at all.
SELECT
    cur.id,
    cur.name,
    cur.sex,
    '20190702' AS start_date,
    '99991231' AS end_date
FROM dwd.user_info AS cur
LEFT JOIN dws.user_merge_info AS hist
    ON hist.id = cur.id
WHERE cur.biz_date = '20190702'
  AND hist.id IS NULL;

-- Close the chain: for every currently open zipper row whose user changed in
-- today's snapshot, emit the old version with end_date set to the business date.
-- The filter on b.biz_date only keeps matched rows, so this is written as an
-- INNER JOIN rather than a LEFT JOIN that the WHERE clause would undo.
INSERT OVERWRITE TABLE tmp.user_merge_info_upt
SELECT
    a.id,
    a.name,
    a.sex,
    a.start_date,
    '20190702' AS end_date  -- closing date: ${biz_date}, the business date
FROM dws.user_merge_info AS a
INNER JOIN dwd.user_info AS b
    ON a.id = b.id
WHERE a.end_date = '99991231'
  AND b.biz_date = '20190702'
  AND (
      a.name != b.name
      OR a.sex != b.sex
  );

-- History rows: every existing zipper row that is NOT being closed today.
-- Written to tmp.user_merge_info_his, the table the final merge step reads;
-- writing to tmp.user_merge_info_new would wipe the staged new/changed rows.
-- An anti-join replaces the cross join with "a.id != b.id", which returned
-- nothing when no row was closed and duplicated rows when several ids were
-- closed. Matching on (id, start_date) removes only the version being closed
-- and keeps older, already closed versions of the same id.
INSERT OVERWRITE TABLE tmp.user_merge_info_his
SELECT
    a.id,
    a.name,
    a.sex,
    a.start_date,
    a.end_date
FROM dws.user_merge_info AS a
LEFT JOIN tmp.user_merge_info_upt AS upt
    ON a.id = upt.id
   AND a.start_date = upt.start_date
WHERE upt.id IS NULL;

-- Merge step: rebuild the zipper table from the three staging tables
-- (newly opened rows, rows closed today, untouched history rows).
INSERT OVERWRITE TABLE dws.user_merge_info
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM tmp.user_merge_info_new
UNION ALL
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM tmp.user_merge_info_upt
UNION ALL
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM tmp.user_merge_info_his;

-- Inspect the result.
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM dws.user_merge_info
ORDER BY id, start_date;

以上拉鏈表就實現好了
以下是退鏈(回退)操作的 Shell 腳本模板(腳本中調用的 execHQL 與 Log 輔助函數未在此處給出,需另行定義):
#!/bin/bash
# Rollback template for the dws.user_merge_info zipper table.
#
# Usage: <script> biz_date
#   biz_date  business date to roll back to, 8 digits (e.g. 20190702)
#
# If the table already holds rows opened or closed on or after biz_date, the
# table is rewritten so those changes are undone. Exits 1 on bad arguments or
# when a query fails.
# Relies on the execHQL and Log helpers, which are not defined in this snippet.

# Usage hint
if [ $# -ne 1 ]; then
    echo "Usage : $(basename "$0") biz_date"
    exit 1
fi

# Business date
biz_date=$1

# biz_date is spliced into HQL text below, so accept nothing but 8 digits.
if ! [[ $biz_date =~ ^[0-9]{8}$ ]]; then
    echo "Usage : $(basename "$0") biz_date"
    exit 1
fi

# Decide between a normal merge run and a rollback: count the rows the
# rollback would touch, i.e. rows opened on/after biz_date or rows closed
# on/after biz_date. The table has no biz_date column, so the "still open"
# exclusion is expressed on end_date.
isGoBack=$(execHQL "select count(1) from dws.user_merge_info where start_date>='$biz_date' or (end_date>='$biz_date' and end_date<>'99991231');")
# Drop any whitespace around the count before testing it.
isGoBack=${isGoBack//[[:space:]]/}

# A non-numeric result means the check query itself failed.
if ! [[ $isGoBack =~ ^[0-9]+$ ]]; then
    Log "\n## 【user_merge_info表回退】 執行失敗 ##"
    exit 1
fi

if [ "$isGoBack" -ne 0 ]; then
    # Rollback mode
    Log "\n## 【user_merge_info表回退】 執行開始 ##"

    # First SELECT: rows untouched by the rollback (opened before biz_date and
    # still open, or closed before biz_date).
    # Second SELECT: rows opened before biz_date but closed on/after it; they
    # are re-opened by resetting end_date to 99991231.
    # Rows opened on/after biz_date match neither SELECT and are dropped.
    if ! execHQL "
        INSERT overwrite TABLE dws.user_merge_info
        -- 完全不變的數據
        SELECT
            id
            ,name
            ,sex
            ,start_date
            ,end_date
        FROM dws.user_merge_info
        WHERE (start_date<'$biz_date' AND end_date='99991231')
           OR end_date<'$biz_date'
        UNION ALL
        -- 重跑 重新開鏈的數據
        SELECT
            id
            ,name
            ,sex
            ,start_date
            ,'99991231' AS end_date
        FROM dws.user_merge_info
        WHERE start_date<'$biz_date'
          AND end_date>='$biz_date'
          AND end_date<>'99991231';
    "; then
        Log "\n## 【user_merge_info表回退】 執行失敗 ##"
        exit 1
    fi

    Log "\n## 【user_merge_info表回退】 執行成功 ##"
fi
