**1、組織數據(需要處理每條數據開頭和結尾的中括號)**
**(1)創建Hive表weibo_json(json string),表只有一個字段,導入所有數據,並驗證查詢前5條數據**
-- Staging table: a single string column that holds each raw record as-is.
create table weibo_json (
    json string
);

-- Load the export file from the local filesystem into the staging table.
load data local inpath '/root/weibo.txt'
into table weibo_json;

-- Sanity check: look at the first five loaded records.
select *
from weibo_json
limit 5;
(2)解析完weibo_json當中的json格式數據到擁有19個字段的weibo表中,寫出必要的SQL語句
-- Build the 19-column `weibo` table by exploding each raw JSON record with json_tuple.
-- json_tuple returns every extracted field as a string, so all 19 columns are strings.
-- NOTE(review): json_tuple expects a JSON object; if the records in weibo_json are
-- still wrapped in [ ], the brackets must be stripped first -- confirm against the input.
create table weibo as
select
    t.beCommentWeiboId,
    t.beForwardWeiboId,
    t.catchTime,
    t.commentCount,
    t.content,
    t.createTime,
    t.info1,
    t.info2,
    t.info3,
    t.mlevel,
    t.musicurl,
    t.pic_list,
    t.praiseCount,
    t.reportCount,
    t.source,
    t.userId,
    t.videourl,
    t.weiboId,
    t.weiboUrl
from weibo_json
lateral view json_tuple(
    weibo_json.json,
    'beCommentWeiboId',
    'beForwardWeiboId',
    'catchTime',
    'commentCount',
    'content',
    'createTime',
    'info1',
    'info2',
    'info3',
    'mlevel',
    'musicurl',
    'pic_list',
    'praiseCount',
    'reportCount',
    'source',
    'userId',
    'videourl',
    'weiboId',
    'weiboUrl'
) t as
    beCommentWeiboId,
    beForwardWeiboId,
    catchTime,
    commentCount,
    content,
    createTime,
    info1,
    info2,
    info3,
    mlevel,
    musicurl,
    pic_list,
    praiseCount,
    reportCount,
    source,
    userId,
    videourl,
    weiboId,
    weiboUrl;
2、統計微博總量 和 獨立用戶數
-- Total number of posts: one row of `weibo` per post.
select
    count(*)
from weibo;
+----------+--+
| _c0 |
+----------+--+
| 1451868 |
+----------+--+
-- Number of distinct users: collapse `weibo` to one row per userId, then count those rows.
select
    count(*) as Independent
from (
    select userId
    from weibo
    group by userId
) per_user;
+--------------+--+
| independent |
+--------------+--+
| 78540 |
+--------------+--+
3、統計用戶所有微博被轉發的次數之和,輸出top5用戶,並給出次數
#select source from weibo limit 10;
轉發次數對應的字段為 reportCount
-- Top 5 users by the total number of forwards across all of their posts.
-- reportCount comes out of json_tuple as a string; summing it directly makes Hive
-- coerce the values to double, so totals print in scientific notation (e.g. 7.6454805E7).
-- Casting to bigint keeps the totals as exact integers.
select
    userId,
    sum(cast(reportCount as bigint)) as total
from weibo
group by userId
order by total desc
limit 5;
+-------------+--------------+--+
| userid | total |
+-------------+--------------+--+
| 1793285524 | 7.6454805E7 |
| 1629810574 | 7.3656898E7 |
| 2803301701 | 6.8176008E7 |
| 1266286555 | 5.5111054E7 |
| 1191258123 | 5.4808042E7 |
+-------------+--------------+--+
4、統計帶圖片的微博數
-- Count the posts that carry at least one picture.
-- pic_list = '[]' is an empty list, i.e. a post WITHOUT pictures, so those rows must be
-- excluded; the earlier equality test counted the picture-less posts instead (the figure
-- listed after this query was produced by that earlier filter).
-- NOTE(review): assumes picture-less posts always store exactly '[]' -- confirm on the data.
select
    count(*) as total
from weibo
where pic_list <> '[]';
+---------+--+
| total |
+---------+--+
| 701356 |
+---------+--+
5、統計使用iphone發微博的獨立用戶數
-- Distinct users with at least one post whose source is the iPhone client:
-- reduce the matching posts to one row per userId, then count those rows.
select
    count(*) as total
from (
    select userId
    from weibo
    where source = 'iPhone客戶端'
    group by userId
) iphone_users;
+------+--+
| total |
+------+--+
| 921 |
+------+--+
6、將用戶所有微博的點贊人數和轉發人數相加求和,並將相加之和降序排列,取前10條記錄,輸出userid和總次數
-- Top 10 users by (total praises + total forwards) over all of their posts.
-- Both counters are strings produced by json_tuple; casting them to bigint avoids the
-- implicit string-to-double coercion that renders totals as 1.14941096E8 and the like.
select
    userId,
    sum(cast(praiseCount as bigint)) + sum(cast(reportCount as bigint)) as total
from weibo
group by userId
order by total desc
limit 10;
+-------------+---------------+--+
| userid | total |
+-------------+---------------+--+
| 1793285524 | 1.14941096E8 |
| 1629810574 | 9.761207E7 |
| 1266286555 | 8.3789422E7 |
| 2803301701 | 7.4208822E7 |
| 1195242865 | 6.9292231E7 |
| 1191258123 | 6.1985742E7 |
| 1197161814 | 5.9093308E7 |
| 2656274875 | 5.2380775E7 |
| 2202387347 | 5.1623117E7 |
| 1195230310 | 4.8321083E7 |
+-------------+---------------+--+
7、統計微博中評論次數小於1000的用戶ID與數據來源信息,將其放入視圖,然後統計視圖中數據來源是“iPad客戶端”的用戶數目
-- View of (userId, source) pairs taken from posts with fewer than 1000 comments.
-- The group by de-duplicates, so each pair appears once.
-- The task asks for a view, so this is created as a view rather than a table.
create view small1000 as
select
    userId,
    source
from weibo
where commentCount < 1000
group by userId, source;
-- Count the rows of small1000 whose source is the iPad client.
select
    count(*) as total
from small1000
where source = 'iPad客戶端';
+--------+--+
| total |
+--------+--+
| 537 |
+--------+--+
**8、統計微博內容中出現“iphone”次數最多的用戶,最終結果輸出用戶id和次數(注意:該次數是“iphone”的出現次數,不是出現“iphone”的微博數目)**
-- User whose posts contain the most occurrences of the substring 'iphone'.
-- Occurrences per post = (characters removed when every 'iphone' is deleted) / length('iphone');
-- these are summed per user and the single highest total is kept.
-- The earlier form counted matching posts rather than occurrences and never picked the top user.
-- Matching is case-sensitive, the same as the original LIKE filter.
select
    userId,
    sum(
        cast(
            (length(content) - length(regexp_replace(content, 'iphone', '')))
            / length('iphone')
            as bigint
        )
    ) as iphone_count
from weibo
where content like '%iphone%'
group by userId
order by iphone_count desc
limit 1;
9、求每天發微博次數最多的那個家伙的ID和發微博的條數
#時間函數,原表存儲的數據的時間是字符串的時間戳,所以需要將其轉成長整型bigint
from_unixtime(cast(createTime as bigint))
-- Step 1: pair each post's userId with its creation day formatted as yyyy-MM-dd.
-- createTime is cast to bigint before being handed to from_unixtime.
select
    userId,
    from_unixtime(cast(createTime as bigint), "yyyy-MM-dd") as Release_time
from weibo;
-- Step 2 (result "a"): number of posts per user per day.
with posts_by_day as (
    -- one row per post: the author and the yyyy-MM-dd day derived from createTime
    select
        userId,
        from_unixtime(cast(createTime as bigint), "yyyy-MM-dd") as Release_time
    from weibo
)
select
    Release_time,
    userId,
    count(userId) as total
from posts_by_day
group by userId, Release_time;
-- Step 3 (result "b"): for each day, the largest per-user post count seen on that day.
with posts_by_day as (
    -- one row per post: the author and the yyyy-MM-dd day derived from createTime
    select
        userId,
        from_unixtime(cast(createTime as bigint), "yyyy-MM-dd") as Release_time
    from weibo
),
daily_counts as (
    -- posts per user per day
    select
        Release_time,
        userId,
        count(userId) as total
    from posts_by_day
    group by userId, Release_time
)
select
    Release_time,
    max(total) as big
from daily_counts
group by Release_time;
-- Final result: for every day, the user(s) with the most posts and that post count.
-- The per-user daily counts are computed once and ranked within each day, instead of
-- aggregating the same subquery twice and self-joining on (day, max count).
-- rank() keeps every user tied for first place, as the join on the daily maximum did.
-- Rows whose day is NULL are filtered out because the inner join on the day dropped them.
with posts_by_day as (
    -- one row per post: the author and the yyyy-MM-dd day derived from createTime
    select
        userId,
        from_unixtime(cast(createTime as bigint), "yyyy-MM-dd") as Release_time
    from weibo
),
daily_counts as (
    -- posts per user per day
    select
        Release_time,
        userId,
        count(userId) as total
    from posts_by_day
    group by userId, Release_time
),
ranked as (
    -- position of each user within their day, highest post count first
    select
        Release_time,
        userId,
        total,
        rank() over (partition by Release_time order by total desc) as day_rank
    from daily_counts
    where Release_time is not null
)
select
    Release_time,
    userId,
    total
from ranked
where day_rank = 1;

