hive拉鏈表以及退鏈例子筆記


 拉鏈表設計:

  在企業中,由於有些流水表每日有幾千萬條記錄,數據倉庫保存5年數據的話很容易不堪重負,因此可以使用拉鏈表的算法來節省存儲空間。拉鏈表只在記錄發生變化時才新增一行,並用 start_date / end_date 標記每個版本的有效區間(end_date = '99991231' 代表當前有效的版本)。

 

 例子:

-- Daily user snapshot table: each day's full extract of user records is
-- stored here, tagged with the business date it belongs to.
CREATE TABLE dwd.user_info (
    id       STRING,  -- primary key of the user
    name     STRING,
    sex      STRING,
    biz_date STRING   -- business date; the sample data uses yyyyMMdd strings
);


-- Consolidated user table (the zipper table): one row per version of a user,
-- bounded by start_date and end_date.
CREATE TABLE dws.user_merge_info (
    id         STRING,
    name       STRING,
    sex        STRING,
    start_date STRING,  -- business date on which this version was opened
    end_date   STRING   -- business date on which this version was closed;
                        -- '99991231' marks the version that is still current
);

-- Seed the daily snapshot table with sample users.
-- Values are positional: id, name, sex, biz_date.
INSERT INTO dwd.user_info
SELECT '1', 'YaoMing',   'boy',  '20190701'
UNION ALL
SELECT '2', 'YaoLinlin', 'girl', '20190701'
UNION ALL
SELECT '3', 'CaiLili',   'girl', '20190701'
UNION ALL
SELECT '4', 'ZhangSan',  'girl', '20190702'
UNION ALL
SELECT '5', 'LiSi',      'girl', '20190702';


-- Inspect the seeded rows.
SELECT
    id,
    name,
    sex,
    biz_date
FROM dwd.user_info;

 

-- Initial load of the zipper table: keep one row per id (its most recent
-- snapshot) and open it as the current version.
INSERT OVERWRITE TABLE dws.user_merge_info
SELECT
    t.id,
    t.name,
    t.sex,
    '20190701' AS start_date,  -- date of the initial load; every row is opened with it
    '99991231' AS end_date     -- sentinel: this version is still current
FROM (
    SELECT
        id,
        name,
        sex,
        -- DESC so that row_num = 1 is the latest snapshot of each id
        -- (ascending order would pick the oldest one instead).
        row_number() OVER (PARTITION BY id ORDER BY biz_date DESC) AS row_num
    FROM dwd.user_info
) t
WHERE t.row_num = 1;

-- Inspect the initialised zipper table.
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM dws.user_merge_info;

 

-- Simulate the snapshot for biz_date = '20190702': one brand-new user (id = 6)
-- and one modified user (id = 2, sex changed).
-- Values are positional: id, name, sex, biz_date.
INSERT INTO dwd.user_info
SELECT '6', 'WangWu',    'boy', '20190702'
UNION ALL
SELECT '2', 'YaoLinlin', 'boy', '20190702';

-- Inspect the snapshot table; ordering by id and date puts the versions of
-- the same user next to each other.
SELECT
    id,
    name,
    sex,
    biz_date
FROM dwd.user_info
ORDER BY id, biz_date;

 

 

-- Stage every version that has to be opened for business date 20190702
-- (${bizdate} in a parameterised job): users whose attributes changed, plus
-- users that appear for the first time.
INSERT OVERWRITE TABLE tmp.user_merge_info_new
-- Changed users: today's snapshot differs from the currently open version.
SELECT
    b.id,
    b.name,
    b.sex,
    '20190702' AS start_date,  -- ${bizdate}: the new version starts on the business date
    '99991231' AS end_date     -- sentinel: this version is still current
FROM dws.user_merge_info a
INNER JOIN dwd.user_info b
    ON a.id = b.id
WHERE a.end_date = '99991231'      -- compare against the open version only
    AND b.biz_date = '20190702'    -- ${bizdate}: today's snapshot only
    AND (
        a.name != b.name
        OR a.sex != b.sex
    )

UNION ALL

-- Brand-new users: present in today's snapshot, absent from the zipper table.
SELECT
    b.id,
    b.name,
    b.sex,
    '20190702' AS start_date,
    '99991231' AS end_date
FROM dwd.user_info b
LEFT JOIN dws.user_merge_info a
    ON a.id = b.id
WHERE b.biz_date = '20190702'
    AND a.id IS NULL;              -- anti-join: no version of this id exists yet

 

-- Close the chain: for every user whose attributes changed on 20190702, take
-- the currently open version and end it on that date.
INSERT OVERWRITE TABLE tmp.user_merge_info_upt
SELECT
    a.id,
    a.name,
    a.sex,
    a.start_date,
    '20190702' AS end_date         -- ${biz_date}: business date the version is closed on
FROM dws.user_merge_info a
-- The filter on b.biz_date below discards unmatched rows anyway, so this is
-- written as the inner join it effectively is.
INNER JOIN dwd.user_info b
    ON a.id = b.id
WHERE a.end_date = '99991231'
    AND b.biz_date = '20190702'
    AND (
        a.name != b.name
        OR a.sex != b.sex
    );

-- History rows: every row of the zipper table that is carried over untouched,
-- i.e. everything except the open versions that were just closed into
-- tmp.user_merge_info_upt.
--
-- Written to tmp.user_merge_info_his, which the final merge reads; writing to
-- tmp.user_merge_info_new would wipe out the staged new/changed rows.
INSERT OVERWRITE TABLE tmp.user_merge_info_his
SELECT
    a.id,
    a.name,
    a.sex,
    a.start_date,
    a.end_date
FROM dws.user_merge_info a
-- Anti-join instead of a cross join on a.id != b.id: the cross join returns
-- nothing when upt is empty and duplicates rows when upt has several rows.
-- Only the open version of a closed id is excluded; versions of that id that
-- were closed earlier stay in the history.
LEFT JOIN tmp.user_merge_info_upt b
    ON a.id = b.id
    AND a.end_date = '99991231'
WHERE b.id IS NULL;

-- Rebuild the zipper table from the three staging tables:
--   tmp.user_merge_info_new  versions opened on the business date
--   tmp.user_merge_info_upt  versions closed on the business date
--   tmp.user_merge_info_his  rows carried over unchanged
INSERT OVERWRITE TABLE dws.user_merge_info
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM tmp.user_merge_info_new

UNION ALL

SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM tmp.user_merge_info_upt

UNION ALL

SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM tmp.user_merge_info_his;


-- Inspect the result; ordering by id and start_date lists each user's
-- versions in chronological order.
SELECT
    id,
    name,
    sex,
    start_date,
    end_date
FROM dws.user_merge_info
ORDER BY id, start_date;

以上步驟執行完畢後,拉鏈表就實現好了。


 

以下是退鏈(回退)操作的腳本模板:當某個業務日期需要重跑時,先把拉鏈表恢復到該日期處理之前的狀態。

#!/bin/bash
#
# Roll-back template for the zipper table dws.user_merge_info.
#
# Usage: <script> biz_date
#   biz_date  business date to roll back to, as yyyyMMdd (e.g. 20190702)
#
# If the table already contains versions opened or closed on or after
# biz_date, it is rewritten so that those changes are undone: versions opened
# on/after biz_date are dropped and versions closed on/after biz_date are
# reopened. Otherwise the script does nothing.
#
# Relies on two helpers that are not defined in this file:
#   execHQL  presumably runs the given HiveQL and prints its result - confirm
#   Log      presumably writes a message to the job log - confirm

# Show usage unless exactly one argument was supplied.
if [ $# -ne 1 ]; then
    echo "Usage : $(basename "$0") biz_date"
    exit 1
fi

# Business date to roll back to.
biz_date=$1

# biz_date is spliced into the HiveQL below, so accept only an 8-digit date.
if [[ ! $biz_date =~ ^[0-9]{8}$ ]]; then
    echo "biz_date must be yyyyMMdd, got: $biz_date" >&2
    exit 1
fi

# Decide between normal consolidation and roll-back: count the rows a
# roll-back would alter, i.e. versions opened on/after biz_date or closed
# on/after biz_date. The table has no biz_date column, so the open-version
# sentinel is tested on end_date.
isGoBack=$(execHQL "select count(1) from dws.user_merge_info where start_date>='$biz_date' or (end_date>='$biz_date' and end_date<>'99991231');")
check_status=$?
isGoBack=${isGoBack//[[:space:]]/}   # tolerate padding around the count

# Fail loudly instead of silently skipping the roll-back when the check
# query failed or did not return a number.
if [ $check_status -ne 0 ] || [[ ! $isGoBack =~ ^[0-9]+$ ]]; then
    echo "roll-back check on dws.user_merge_info failed (result: '$isGoBack')" >&2
    exit 1
fi

if [ "$isGoBack" -ne 0 ];then
    # Roll-back mode.
    # First SELECT keeps the rows untouched by biz_date or later; second
    # SELECT reopens versions that were closed on/after biz_date. Versions
    # opened on/after biz_date match neither branch and are dropped.
    Log "\n## 【user_merge_info表回退】 執行開始 ##" 
    execHQL "
        INSERT overwrite TABLE dws.user_merge_info 
        --  完全不變的數據
        SELECT 
             id
            ,name
            ,sex     
            ,start_date
            ,end_date 
        FROM dws.user_merge_info 
        WHERE (start_date<'$biz_date' AND end_date='99991231') OR end_date<'$biz_date'
            
        UNION ALL 
        
        -- 重跑 重新開鏈的數據
        SELECT 
             id
            ,name
            ,sex     
            ,start_date
            ,'99991231' AS end_date
        FROM dws.user_merge_info 
        WHERE start_date<'$biz_date' AND end_date>='$biz_date' AND end_date<>'99991231';
    "
    if [ $? -ne 0 ];then
        Log "\n## 【user_merge_info表回退】 執行失敗 ##"
        exit 1
    fi
    Log "\n## 【user_merge_info表回退】 執行成功 ##"

fi

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM