1. 將同一列的多行數據合併成單一字段（cell）的方法，以及 top N 問題的 Hive 解決方案，如下：
-- Hive: collapse multiple rows of one column into a single cell (array/list/set
-- or a JSON-like string), and extract the top N rows per group with a window
-- function.
--
-- Steps:
--   1. sample_rows : inline demo data; n is the value being ranked, ll is the
--                    grouping key.
--   2. ranked_rows : builds quoted string variants of n (nn, dd) and numbers the
--                    rows inside each ll group, largest n first (num1). In the
--                    original use case this ranked a user's articles by click rate.
--   3. final select: keeps the top 3 rows per ll and folds each column into a
--                    list with collect_list (collect_set can be used instead to
--                    drop duplicates).
--
-- NOTE(review): collect_list is not documented to preserve the window ordering;
-- wrap the result in sort_array(...) if element order matters -- confirm on the
-- target Hive version.
-- Comments are kept on their own lines: an inline "--" on a single-line
-- statement would comment out every clause that follows it.
with sample_rows as (
    select 1 as n, '4' as ll
    union all
    select 2 as n, '4' as ll
    union all
    select 3 as n, '5' as ll
    union all
    select 4 as n, '5' as ll
    union all
    select 5 as n, '4' as ll
    union all
    select 6 as n, '5' as ll
    union all
    select 7 as n, '5' as ll
    union all
    select 8 as n, '4' as ll
    union all
    select 9 as n, '5' as ll
    union all
    select 10 as n, '5' as ll
),
ranked_rows as (
    select
        concat('\'', n, '\'') as nn,
        n,
        ll,
        concat_ws(":", concat('\\\'', n, '\\\''), ll) as dd,
        row_number() over (partition by ll order by n desc) as num1
    from sample_rows
)
select
    ll,
    collect_list(n) as n_list,
    collect_list(nn) as nn_list,
    collect_list(ll) as ll_list,
    collect_list(dd) as dd_list
from ranked_rows
where num1 <= 3
group by ll;
2. 建表存儲 list（array）類型數據的方法及注意事項
-- Table holding a complex-typed (array<string>) column, stored as delimited text
-- and partitioned by dt.
-- Pay attention to serialization when storing: the elements of the array column
-- are separated by the COLLECTION ITEMS delimiter (',' here), while fields are
-- separated by a tab.
CREATE TABLE IF NOT EXISTS celebrity_basic_info (
    author_id BIGINT COMMENT 'id',
    area ARRAY<STRING> COMMENT '復雜類型的數據'
)
COMMENT '-----'
PARTITIONED BY (
    dt STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    COLLECTION ITEMS TERMINATED BY ','
STORED AS TEXTFILE;
3. 將 array<string> 類型數據轉成 string（JSON list 字符串）顯示的方法
-- Render the array<string> column area as a JSON-list string, e.g. ["a","b"].
-- A NULL or empty array is rendered as [] (previously an empty array produced
-- [""], and the NULL case relied on size(NULL) returning -1).
-- NOTE(review): elements are not JSON-escaped; values containing a double quote
-- or a backslash would yield invalid JSON -- confirm the data cannot contain them.
-- The comment is kept above the statement: an inline "--" on a single-line
-- statement would comment out the FROM clause.
select
    author_id,
    case
        when area is null or size(area) <= 0 then '[]'
        else concat('["', concat_ws('","', area), '"]')
    end as area_json
from celebrity_basic_info;
