1.hive開窗函數,分析函數


http://yugouai.iteye.com/blog/1908121

分析函數用於計算基於組的某種聚合值,它和聚合函數的不同之處是:對於每個組返回多行,而
聚合函數對於每個組只返回一行。
開窗函數指定了分析函數工作的數據窗口大小,這個數據窗口大小可能會隨着行的變化而變化


-- Demo table for the windowing examples; rows come from a '|'-delimited file.
drop table if exists student;

create table student (
    name            string,
    class           tinyint,
    cooperator_name string,
    score           tinyint
)
row format delimited fields terminated by '|';


vi /root/test2
adf|3|測試公司1|45
asdf|3|測試公司2|55
cfe|2|測試公司2|74
3dd|3|測試公司5|78
fda|1|測試公司7|80
gds|2|測試公司9|92
ffd|1|測試公司10|95
dss|1|測試公司4|95
ddd|3|測試公司3|99
gf|3|測試公司9|99

load data local inpath '/root/test2' into table student;

hive> select * from student;
OK
adf     3       測試公司1       45
asdf    3       測試公司2       55
cfe     2       測試公司2       74
3dd     3       測試公司5       78
fda     1       測試公司7       80
gds     2       測試公司9       92
ffd     1       測試公司10      95
dss     1       測試公司4       95
ddd     3       測試公司3       99
gf      3       測試公司9       99
Time taken: 0.642 seconds, Fetched: 10 row(s)

1:over后的寫法:    
   over(order by salary) 按照salary排序進行累計,order by是個默認的開窗函數
   over(partition by deptno)按照部門分區
   over(partition by deptno order by salary)按照部門分區,在一個分區內按照salary排序

2:開窗的窗口范圍:
2.1
over(order by salary range between 2 preceding and 2 following):窗口范圍為當前行數據幅度減2加2后的范圍內的。

--sum(s)over(order by s range between 2 preceding and 2 following) 表示加2或2的范圍內的求和

select name,class,score,sum(score)over(order by score range between 2 preceding and 2 following) mm from student;
adf        3        45        45  --45加2減2即43到47,但是s在這個范圍內只有45
asdf       3        55        55
cfe        2        74        74
3dd        3        78        158 --78在76到80范圍內有78,80,求和得158
fda        1        80        158
gds        2        92        92
ffd        1        95        190
dss        1        95        190
ddd        3        99        198
gf         3        99        198

2.2
over(order by salary rows between 5 preceding and 5 following):窗口范圍為當前行前后各移動5行。
舉例:

--sum(s)over(order by s rows between 2 preceding and 2 following)表示在上下兩行之間的范圍內
select name,class,s, sum(s)over(order by s rows between 2 preceding and 2 following) mm from t2
adf        3        45        174  (45+55+74=174)
asdf       3        55        252   (45+55+74+78=252)
cfe        2        74        332    (74+55+45+78+80=332)
3dd        3        78        379    (78+74+55+80+92=379)
fda        1        80        419
gds        2        92        440
ffd        1        95        461
dss        1        95        480
ddd        3        99        388
gf         3        99        293
 
2.3
over(order by salary range between unbounded preceding and unbounded following)或者
over(order by salary rows between unbounded preceding and unbounded following):窗口不做限制

3.與over函數結合的幾個函數介紹
row_number()over()、rank()over()和dense_rank()over()函數的使用

--create table
drop table if exists student;
create table student
(
   name           string,
   class          tinyint,
   score          tinyint
)
row format delimited fields terminated by '|';

vi /root/test3
adf|3|45
asdf|3|55
cfe|2|74
3dd|3|78
fda|1|80
gds|2|92
ffd|1|95
dss|1|95
ddd|3|99
gf|3|99

3.1rank()over()類型的
-- Top score per class; rank() keeps ties, so classes with a shared first
-- place return every tied row.
-- Fixed: Hive requires an alias on a derived table, and the demo table is
-- student, not t2.
select *
from (
    select name,
           class,
           score,
           rank() over (partition by class order by score desc) as mm
    from student
) ranked
where ranked.mm = 1;
得到的結果是:
dss        1        95        1
ffd        1        95        1
gds        2        92        1
gf         3        99        1
ddd        3        99        1

注意:
1.在求第一名成績的時候,不能用row_number(),因為如果同班有兩個並列第一,row_number()只返回一個結果;

3.2 row_number()over 類型
-- row_number() assigns a unique sequence per partition, so when two rows tie
-- for first place only ONE of them survives mm = 1 (contrast with rank()).
-- Fixed: derived-table alias added (required by Hive) and table t2 -> student.
select *
from (
    select name,
           class,
           score,
           row_number() over (partition by class order by score desc) as mm
    from student
) ranked
where ranked.mm = 1;
1        95        1  --95有兩名但是只顯示一個
2        92        1
3        99        1 --99有兩名但也只顯示一個

3.3
rank()和dense_rank()可以將所有的都查找出來:
如上可以看到采用rank可以將並列第一名的都查找出來;
rank()和dense_rank()區別:
--rank()是跳躍排序,有兩個第二名時接下來就是第四名;
select name,class,score,rank()over(partition by class order by score desc) mm from t2
dss        1        95        1
ffd        1        95        1
fda        1        80        3 --直接就跳到了第三
gds        2        92        1
cfe        2        74        2
gf         3        99        1
ddd        3        99        1
3dd        3        78        3
asdf       3        55        4
adf        3        45        5
--dense_rank()是連續排序,有兩個第二名時仍然跟着第三名
select name,class,score,dense_rank()over(partition by class order by score desc) mm from t2
dss        1        95        1
ffd        1        95        1
fda        1        80        2 --連續排序(仍為2)
gds        2        92        1
cfe        2        74        2
gf         3        99        1
ddd        3        99        1
3dd        3        78        2
asdf       3        55        3
adf        3        45        4

3.4
--sum()over()的使用
select name,class,score, sum(s)over(partition by class order by score desc) mm from t2 --根據班級進行分數求和
dss        1        95        190 --由於兩個95都是第一名,所以累加時是兩個第一名的相加
ffd        1        95        190
fda        1        80        270 --第一名加上第二名的
gds        2        92        92  --92
cfe        2        74        166 --92+74=166
gf         3        99        198 --99+99
ddd        3        99        198 --99+99
3dd        3        78        276 --198+78
asdf       3        55        331 --276+55
adf        3        45        376 --331+45


4.first_value()over()和last_value()over()的使用
--create table
-- Demo table for first_value()/last_value(): one row per circuit route record.
-- Fixed: res_id was declared tinyint, but the sample data holds 24-digit
-- identifiers ('000100190000000001289311'), which would load as NULL; it must
-- be a string like opr_id.
drop table if exists rm_circuit_route;

create table rm_circuit_route (
    opr_id     string,
    serial_no  tinyint,
    res_type   string,
    res_id     string,
    route_name string
)
row format delimited fields terminated by '|';


vi /root/test3
000100190000000000021311|1|2|000100190000000001289311|光大會展1
000100190000000000021311|1|6|000100190000000001289311|光大會展2
000100190000000000021311|7|2|000100190000000001289318|光大會展3
000100190000000000021339|1|4|000100190000000001289311|光大會展4
000100190000000000021311|3|7|000100190000000001289313|光大會展5
000100190000000000021355|1|2|000100190000000001289314|光大會展6
000100190000000000021355|2|2|000100190000000001289314|光大會展7
000100190000000000021311|1|9|000100190000000001289315|光大會展8
000100190000000000021339|8|2|000100190000000001289316|光大會展9
000100190000000000021311|1|2|000100190000000001289311|光大會展10

--找出這三條電路每條電路的第一條記錄類型和最后一條記錄類型
---使用rows BETWEEN unbounded preceding AND unbounded following
-- First and last res_type per circuit (opr_id), ordered by res_type.
-- last_value needs the explicit ROWS ... UNBOUNDED FOLLOWING frame: with the
-- default frame (unbounded preceding .. current row) "high" would just echo
-- the current row's value.
SELECT opr_id,res_type,
       first_value(res_type) over(PARTITION BY opr_id ORDER BY res_type) low,
       last_value(res_type) over(PARTITION BY opr_id ORDER BY res_type rows BETWEEN unbounded preceding AND unbounded following) high
  FROM rm_circuit_route
WHERE opr_id IN ('000100190000000000021311','000100190000000000021355','000100190000000000021339')
 ORDER BY opr_id;


---取last_value時不使用rows BETWEEN unbounded preceding AND unbounded following的結果
SELECT opr_id,res_type,
       first_value(res_type) over(PARTITION BY opr_id ORDER BY res_type) low,
       last_value(res_type) over(PARTITION BY opr_id ORDER BY res_type) high
  FROM rm_circuit_route
 WHERE opr_id IN ('000100190000000000021311','000100190000000000021355','000100190000000000021339')
 ORDER BY opr_id;
如下圖可以看到,如果不使用
rows BETWEEN unbounded preceding AND unbounded following,取出的last_value由於按res_type排序且採用默認窗口(截止到當前行),因此取出的就不是整條電路範圍內的最後一行記錄類型,而只是當前行之前(含當前行)範圍內的最後一個值


4.1
在first_value和last_value中ignore nulls的使用
取出該電路的第一條記錄,加上ignore nulls后,如果第一條是判斷的那個字段是空的,則默認取下一條,結果如下所示:
-- First non-null route_name for the given circuit.
-- NOTE(review): "ignore nulls" inside first_value(...) is Oracle syntax; in
-- Hive the equivalent is first_value(route_name, true) — confirm against the
-- engine this is meant to run on.
SELECT opr_id,res_type,
       first_value(route_name ignore nulls) over(ORDER BY opr_id)
  FROM rm_circuit_route
 WHERE opr_id=('000100190000000000021311');

 或者
 SELECT opr_id,res_type,
       first_value(route_name ignore nulls) over(ORDER BY opr_id rows BETWEEN unbounded preceding AND unbounded following)
  FROM rm_circuit_route
 WHERE opr_id=('000100190000000000021311');


 5.
--lag() over()函數用法(取出前n行數據)
lag(expression,<offset>,<default>)
-- lag() demo (Oracle flavor: builds a 5-row inline table from dual).
-- For each row ordered by name, return the previous row's id; the first row
-- has no predecessor, so the '' default is used.
with a as
(select 1 id,'a' name from dual
 union
 select 2 id,'b' name from dual
 union
 select 3 id,'c' name from dual
 union
 select 4 id,'d' name from dual
 union
 select 5 id,'e' name from dual
)
select id,name,lag(id,1,'')over(order by name) from a;

--lead() over()函數用法(取出后N行數據)
lead(expression,<offset>,<default>)
with a as
(select 1 id,'a' name from dual
 union
 select 2 id,'b' name from dual
 union
 select 3 id,'c' name from dual
 union
 select 4 id,'d' name from dual
 union
 select 5 id,'e' name from dual
)
select id,name,lead(id,1,'')over(order by name) from a;

--ratio_to_report(a)函數用法 Ratio_to_report() 括號中就是分子,over() 括號中就是分母
with a as (select 1 a from dual
           union all
select 1 a from dual
           union  all
select 1 a from dual
           union all
select 2 a from dual
           union all
select 3 a from dual
           union all
select 4 a from dual
           union all
select 4 a from dual
           union all
select 5 a from dual
           )
select a, ratio_to_report(a)over(partition by a) b from a
order by a;

with a as (select 1 a from dual
           union all
select 1 a from dual
           union  all
select 1 a from dual
           union all
select 2 a from dual
           union all
select 3 a from dual
           union all
select 4 a from dual
           union all
select 4 a from dual
           union all
select 5 a from dual
           )
select a, ratio_to_report(a)over() b from a --分母缺省就是整個占比
order by a;

with a as (select 1 a from dual
           union all
select 1 a from dual
           union  all
select 1 a from dual
           union all
select 2 a from dual
           union all
select 3 a from dual
           union all
select 4 a from dual
           union all
select 4 a from dual
           union all
select 5 a from dual
           )
select a, ratio_to_report(a)over() b from a
group by a order by a;--分組后的占比

---percent_rank用法
計算方法:所在組排名序號-1除以該組所有的行數-1,如下所示自己計算的pr1與通過percent_rank函數得到的值是一樣的:
-- percent_rank(): (rank_in_group - 1) / (rows_in_group - 1).
-- pr1 recomputes it by hand from a rank() subquery (a) joined to per-dept
-- row counts (b); pr2 is the built-in, and the two columns should match.
-- Fixed: the implicit comma join was rewritten as an explicit ANSI INNER
-- JOIN, the ambiguous bare "n" is now qualified as b.n, and the pointless
-- ORDER BY inside the derived table was dropped (outer result order is
-- unspecified either way).
SELECT a.deptno,
       a.ename,
       a.sal,
       a.r,
       b.n,
       (a.r - 1) / (b.n - 1) pr1,
       percent_rank() over(PARTITION BY a.deptno ORDER BY a.sal) pr2
  FROM (SELECT deptno,
               ename,
               sal,
               rank() over(PARTITION BY deptno ORDER BY sal) r -- rank within the department
          FROM emp) a
 INNER JOIN (SELECT deptno, COUNT(1) n FROM emp GROUP BY deptno) b -- rows per department
    ON a.deptno = b.deptno;

--cume_dist函數
計算方法:所在組排名序號除以該組所有的行數,但是如果存在並列情況,則需加上並列的個數-1,
          如下所示自己計算的pr1與通過percent_rank函數得到的值是一樣的:
-- cume_dist(): fraction of group rows with value <= current row's value.
-- pr1 recomputes it by hand: (rank + ties_at_same_sal - 1) / dept_size,
-- using a = per-dept rank, b = per-dept row count, c = tie counts per salary.
-- NOTE(review): uses Oracle comma-join syntax and the legacy (+) outer-join
-- marker; portable code would use ANSI joins. Also note a.sal = c.sal has no
-- (+), which effectively cancels the outer join — confirm intent.
SELECT a.deptno,
       a.ename,
       a.sal,
       a.r,
       b.n,
       c.rn,
       (a.r + c.rn - 1) / n pr1,
       cume_dist() over(PARTITION BY a.deptno ORDER BY a.sal) pr2
  FROM (SELECT deptno,
               ename,
               sal,
               rank() over(PARTITION BY deptno ORDER BY sal) r
          FROM emp
         ORDER BY deptno, sal) a,
       (SELECT deptno, COUNT(1) n FROM emp GROUP BY deptno) b,
       (SELECT deptno, r, COUNT(1) rn,sal
          FROM (SELECT deptno,sal,
                       rank() over(PARTITION BY deptno ORDER BY sal) r
                  FROM emp)
         GROUP BY deptno, r,sal
         ORDER BY deptno) c -- c yields, per dept and salary value, how many employees share that salary
 WHERE a.deptno = b.deptno
   AND a.deptno = c.deptno(+)
   AND a.sal = c.sal;


--percentile_cont函數

含義:輸入一個百分比(該百分比就是按照percent_rank函數計算的值),返回該百分比位置的平均值
如下,輸入百分比為0.7,因為0.7介於0.6和0.8之間,因此返回的結果就是0.6對應的sal的1500加上0.8對應的sal的1600平均
SELECT ename,
       sal,
       deptno,
       percentile_cont(0.7) within GROUP(ORDER BY sal) over(PARTITION BY deptno) "Percentile_Cont",
       percent_rank() over(PARTITION BY deptno ORDER BY sal) "Percent_Rank"
  FROM emp
 WHERE deptno IN (30, 60);


PERCENTILE_DISC函數

功能描述:返回一個與輸入的分布百分比值相對應的數據值,分布百分比的計算方法見函數CUME_DIST,如果沒有正好對應的數據值,就取大於該分布值的下一個值。
注意:本函數與PERCENTILE_CONT的區別在找不到對應的分布值時返回的替代值的計算方法不同

SAMPLE:下例中0.7的分布值在部門30中沒有對應的Cume_Dist值,所以就取下一個分布值0.83333333所對應的SALARY來替代

SELECT ename,
       sal,
       deptno,
       percentile_disc(0.7) within GROUP(ORDER BY sal) over(PARTITION BY deptno) "Percentile_Disc",
       cume_dist() over(PARTITION BY deptno ORDER BY sal) "Cume_Dist"
  FROM emp
 WHERE deptno IN (30, 60);

 
 
 
 開窗函數二:實驗部分
 
Hive分析窗口函數(1) GROUPING SETS,GROUPING__ID,CUBE,ROLLUP
 數據准備:
vi /home/hadoop/test
2015-03,2015-03-10,cookie1
2015-03,2015-03-10,cookie5
2015-03,2015-03-12,cookie7
2015-04,2015-04-12,cookie3
2015-04,2015-04-13,cookie2
2015-04,2015-04-13,cookie4
2015-04,2015-04-16,cookie4
2015-03,2015-03-10,cookie2
2015-03,2015-03-10,cookie3
2015-04,2015-04-12,cookie5
2015-04,2015-04-13,cookie6
2015-04,2015-04-15,cookie3
2015-04,2015-04-15,cookie2
2015-04,2015-04-16,cookie1

--create table
-- External table over a ','-delimited text file: (month, day, cookieid).
CREATE EXTERNAL TABLE austin (
    month    STRING,
    day      STRING,
    cookieid STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
stored as textfile;

-- Load the local sample file.
-- Fixed: the original path '/hadoop/home/austin' did not match any file
-- created in this walkthrough; the sample data file is /home/hadoop/test.
load data local inpath '/home/hadoop/test' into table austin;
 
hive> select * from austin;
OK
2015-03 2015-03-10      cookie1
2015-03 2015-03-10      cookie5
2015-03 2015-03-12      cookie7
2015-04 2015-04-12      cookie3
2015-04 2015-04-13      cookie2
2015-04 2015-04-13      cookie4
2015-04 2015-04-16      cookie4
2015-03 2015-03-10      cookie2
2015-03 2015-03-10      cookie3
2015-04 2015-04-12      cookie5
2015-04 2015-04-13      cookie6
2015-04 2015-04-15      cookie3
2015-04 2015-04-15      cookie2
2015-04 2015-04-16      cookie1
 
--GROUPING SETS
在一個GROUP BY查詢中,根據不同的維度組合進行聚合,等價於將不同維度的GROUP BY結果集進行UNION ALL
-- GROUPING SETS (month, day): aggregate once per listed grouping — i.e. the
-- UNION ALL of "GROUP BY month" and "GROUP BY day" in a single pass.
-- GROUPING__ID identifies which grouping produced each output row.
SELECT month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM austin
GROUP BY month,day
GROUPING SETS (month,day)
ORDER BY GROUPING__ID;
 
Total MapReduce CPU Time Spent: 5 seconds 740 msec
OK
2015-04 NULL            6       1
2015-03 NULL            5       1
NULL    2015-04-16      2       2
NULL    2015-04-15      2       2
NULL    2015-04-13      3       2
NULL    2015-04-12      2       2
NULL    2015-03-12      1       2
NULL    2015-03-10      4       2
Time taken: 67.361 seconds, Fetched: 8 row(s)
 
等價於
SELECT month,NULL,COUNT(DISTINCT cookieid) AS uv,1 AS GROUPING__ID FROM austin GROUP BY month
UNION ALL
SELECT NULL,day,COUNT(DISTINCT cookieid) AS uv,2 AS GROUPING__ID FROM austin GROUP BY day

SELECT month,NULL,COUNT(DISTINCT cookieid) AS uv,1 AS GROUPING__ID FROM austin GROUP BY month;
Total MapReduce CPU Time Spent: 4 seconds 800 msec
OK
2015-03 NULL    5       1
2015-04 NULL    6       1
Time taken: 33.225 seconds, Fetched: 2 row(s)

SELECT NULL,day,COUNT(DISTINCT cookieid) AS uv,2 AS GROUPING__ID FROM austin GROUP BY day;
Total MapReduce CPU Time Spent: 3 seconds 140 msec
OK
NULL    2015-03-10      4       2
NULL    2015-03-12      1       2
NULL    2015-04-12      2       2
NULL    2015-04-13      3       2
NULL    2015-04-15      2       2
NULL    2015-04-16      2       2
Time taken: 30.019 seconds, Fetched: 6 row(s)

最終:
SELECT t.month,
    t.day,
    t.uv,
    t.GROUPING__ID
FROM(
SELECT month,NULL as day,COUNT(DISTINCT cookieid) AS uv,1 AS GROUPING__ID FROM austin GROUP BY month
    UNION ALL
    SELECT NULL as month,day,COUNT(DISTINCT cookieid) AS uv,2 AS GROUPING__ID FROM austin GROUP BY day
) t
ORDER BY GROUPING__ID;
Total MapReduce CPU Time Spent: 8 seconds 330 msec
OK
2015-04 NULL            6       1
2015-03 NULL            5       1
NULL    2015-04-16      2       2
NULL    2015-04-15      2       2
NULL    2015-04-13      3       2
NULL    2015-04-12      2       2
NULL    2015-03-12      1       2
NULL    2015-03-10      4       2
Time taken: 65.355 seconds, Fetched: 8 row(s)

舉一反三:加入分組(month,day)--每一個月的每一天統計uv
SELECT
    month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM austin
GROUP BY month,day
GROUPING SETS (month,day,(month,day))
ORDER BY GROUPING__ID;


等價於
SELECT month,NULL as day,COUNT(DISTINCT cookieid) AS uv,1 AS GROUPING__ID FROM austin GROUP BY month
UNION ALL
SELECT NULL as month,day,COUNT(DISTINCT cookieid) AS uv,2 AS GROUPING__ID FROM austin GROUP BY day
UNION ALL
SELECT month,day,COUNT(DISTINCT cookieid) AS uv,3 AS GROUPING__ID FROM austin GROUP BY month,day

其中的 GROUPING__ID,表示結果屬於哪一個分組集合。

-- Manual equivalent of GROUPING SETS (month, day, (month,day)): three GROUP
-- BY queries glued together with UNION ALL, tagged by a literal GROUPING__ID.
-- Fixed: the derived table must carry an alias — the outer SELECT already
-- referenced t.month/t.day/t.uv, but the original omitted ") t", which is a
-- syntax error in Hive.
SELECT t.month,
    t.day,
    t.uv,
    t.GROUPING__ID
FROM(
SELECT month,NULL as day,COUNT(DISTINCT cookieid) AS uv,1 AS GROUPING__ID FROM austin GROUP BY month
    UNION ALL
    SELECT NULL as month,day,COUNT(DISTINCT cookieid) AS uv,2 AS GROUPING__ID FROM austin GROUP BY day
    UNION ALL
    SELECT month,day,COUNT(DISTINCT cookieid) AS uv,3 AS GROUPING__ID FROM austin GROUP BY month,day
) t
ORDER BY GROUPING__ID;

Stage-Stage-2: Map: 1  Reduce: 1   Cumulative CPU: 3.06 sec   HDFS Read: 5427 HDFS Write: 276 SUCCESS
Total MapReduce CPU Time Spent: 7 seconds 170 msec
OK
2015-04 NULL    6       1
2015-03 NULL    5       1
NULL    2015-03-10      4       2
NULL    2015-04-16      2       2
NULL    2015-04-15      2       2
NULL    2015-04-13      3       2
NULL    2015-04-12      2       2
NULL    2015-03-12      1       2
2015-04 2015-04-16      2       3
2015-04 2015-04-12      2       3
2015-04 2015-04-13      3       3
2015-03 2015-03-12      1       3
2015-03 2015-03-10      4       3
2015-04 2015-04-15      2       3
Time taken: 91.51 seconds, Fetched: 14 row(s)

--CUBE
根據GROUP BY的維度的所有組合進行聚合。
month day 分組的有4種組合
SELECT month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM austin
GROUP BY month,day
WITH CUBE
ORDER BY GROUPING__ID;

Total MapReduce CPU Time Spent: 4 seconds 480 msec
OK
NULL    NULL            7       0
2015-03 NULL            5       1
2015-04 NULL            6       1
NULL    2015-04-16      2       2
NULL    2015-04-15      2       2
NULL    2015-04-13      3       2
NULL    2015-04-12      2       2
NULL    2015-03-12      1       2
NULL    2015-03-10      4       2
2015-04 2015-04-12      2       3
2015-04 2015-04-16      2       3
2015-03 2015-03-12      1       3
2015-03 2015-03-10      4       3
2015-04 2015-04-15      2       3
2015-04 2015-04-13      3       3
Time taken: 47.217 seconds, Fetched: 15 row(s)


---ROLLUP
是CUBE的子集,以最左側的維度為主,從該維度進行層級聚合。
比如,以month維度進行層級聚合:
SELECT month,
    day,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID  
FROM austin
GROUP BY month,day
WITH ROLLUP
ORDER BY GROUPING__ID;

Total MapReduce CPU Time Spent: 6 seconds 520 msec
OK
NULL    NULL    7       0
2015-04 NULL    6       1
2015-03 NULL    5       1
2015-04 2015-04-16      2       3
2015-04 2015-04-15      2       3
2015-04 2015-04-13      3       3
2015-04 2015-04-12      2       3
2015-03 2015-03-12      1       3
2015-03 2015-03-10      4       3
Time taken: 239.641 seconds, Fetched: 9 row(s)
可以實現這樣的上鑽過程:
月天的UV->月的UV->總UV

--把month和day調換順序,則以day維度進行層級聚合:
 
-- ROLLUP with day listed first: hierarchical aggregation
-- (day, month) -> (day) -> grand total.
-- Fixed: the original selected FROM lxw1234 (the source blog's table name);
-- this walkthrough's table is austin.
SELECT
    day,
    month,
    COUNT(DISTINCT cookieid) AS uv,
    GROUPING__ID
FROM austin
GROUP BY day,month
WITH ROLLUP
ORDER BY GROUPING__ID;

day                    month              uv     GROUPING__ID
-------------------------------------------------------
NULL            NULL               7       0
2015-04-13      NULL               3       1
2015-03-12      NULL               1       1
2015-04-15      NULL               2       1
2015-03-10      NULL               4       1
2015-04-16      NULL               2       1
2015-04-12      NULL               2       1
2015-04-12      2015-04            2       3
2015-03-10      2015-03            4       3
2015-03-12      2015-03            1       3
2015-04-13      2015-04            3       3
2015-04-15      2015-04            2       3
2015-04-16      2015-04            2       3
 
可以實現這樣的上鑽過程:
天月的UV->天的UV->總UV
(這里,根據天和月進行聚合,和根據天聚合結果一樣,因為有父子關系,如果是其他維度組合的話,就會不一樣)

Hive分析窗口函數(2) NTILE,ROW_NUMBER,RANK,DENSE_RANK
數據准備:表austin 公司的164 qa_test庫中
vim austin2
cookie1,2015-04-10,1
cookie1,2015-04-11,5
cookie1,2015-04-12,7
cookie1,2015-04-13,3
cookie1,2015-04-14,2
cookie1,2015-04-15,4
cookie1,2015-04-16,4
cookie2,2015-04-10,2
cookie2,2015-04-11,3
cookie2,2015-04-12,5
cookie2,2015-04-13,6
cookie2,2015-04-14,3
cookie2,2015-04-15,9
cookie2,2015-04-16,7

--create table
CREATE TABLE austin2 (
cookieid string,
createtime string,   --day
pv INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
stored as textfile;
--導入本地數據
load data local inpath '/home/hadoop/austin2' into table austin2;

hive> select * from austin2;
OK
cookie1 2015-04-10      1
cookie1 2015-04-11      5
cookie1 2015-04-12      7
cookie1 2015-04-13      3
cookie1 2015-04-14      2
cookie1 2015-04-15      4
cookie1 2015-04-16      4
cookie2 2015-04-10      2
cookie2 2015-04-11      3
cookie2 2015-04-12      5
cookie2 2015-04-13      6
cookie2 2015-04-14      3
cookie2 2015-04-15      9
cookie2 2015-04-16      7
Time taken: 0.055 seconds, Fetched: 14 row(s)


--NTILE
NTILE(n),用於將分組數據按照順序切分成n片,返回當前切片值
NTILE不支持ROWS BETWEEN,比如 NTILE(2) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW)
如果切片不均勻,默認增加第一個切片的分布

-- NTILE(n) splits each ordered partition into n as-even-as-possible buckets
-- and returns the current row's bucket number (uneven remainders go to the
-- earliest buckets).
SELECT
cookieid,
createtime,
pv,
NTILE(2) OVER(PARTITION BY cookieid ORDER BY createtime) AS rn1,    -- split each cookie's rows into 2 buckets
NTILE(3) OVER(PARTITION BY cookieid ORDER BY createtime) AS rn2,  -- split each cookie's rows into 3 buckets
NTILE(4) OVER(ORDER BY createtime) AS rn3        -- split ALL rows into 4 buckets
FROM austin2
ORDER BY cookieid,createtime;

SELECT cookieid,
    createtime,
    pv,
    NTILE(2) OVER(PARTITION BY cookieid ORDER BY createtime) AS rn1,
    NTILE(3) OVER(PARTITION BY cookieid ORDER BY createtime) AS rn2,
    NTILE(4) OVER(ORDER BY createtime) AS rn3
FROM austin2
ORDER BY cookieid,createtime;

Total MapReduce CPU Time Spent: 8 seconds 100 msec
OK
cookie1 2015-04-10      1       1       1       1
cookie1 2015-04-11      5       1       1       1
cookie1 2015-04-12      7       1       1       2
cookie1 2015-04-13      3       1       2       2
cookie1 2015-04-14      2       2       2       3
cookie1 2015-04-15      4       2       3       4
cookie1 2015-04-16      4       2       3       4
cookie2 2015-04-10      2       1       1       1
cookie2 2015-04-11      3       1       1       1
cookie2 2015-04-12      5       1       1       2
cookie2 2015-04-13      6       1       2       2
cookie2 2015-04-14      3       2       2       3
cookie2 2015-04-15      9       2       3       3
cookie2 2015-04-16      7       2       3       4
Time taken: 66.363 seconds, Fetched: 14 row(s)

--比如,統計一個cookie,pv數最多的前1/3的天
SELECT
cookieid,
createtime,
pv,
NTILE(3) OVER(PARTITION BY cookieid ORDER BY pv DESC) AS rn
FROM austin2;

--rn = 1 的記錄,就是我們想要的結果
Total MapReduce CPU Time Spent: 2 seconds 620 msec
OK
cookie1 2015-04-12      7       1
cookie1 2015-04-11      5       1
cookie1 2015-04-16      4       1
cookie1 2015-04-15      4       2
cookie1 2015-04-13      3       2
cookie1 2015-04-14      2       3
cookie1 2015-04-10      1       3
cookie2 2015-04-15      9       1
cookie2 2015-04-16      7       1
cookie2 2015-04-13      6       1
cookie2 2015-04-12      5       2
cookie2 2015-04-11      3       2
cookie2 2015-04-14      3       3
cookie2 2015-04-10      2       3
Time taken: 21.98 seconds, Fetched: 14 row(s)

---ROW_NUMBER

ROW_NUMBER() 從1開始,按照順序,生成分組內記錄的序列
–比如,按照pv降序排列,生成分組內每天的pv名次
ROW_NUMBER() 的應用場景非常多,再比如,獲取分組內排序第一的記錄;獲取一個session中的第一條refer等。
SELECT
cookieid,
createtime,
pv,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn
FROM austin2;
Total MapReduce CPU Time Spent: 2 seconds 830 msec
OK
cookie1 2015-04-12      7       1
cookie1 2015-04-11      5       2
cookie1 2015-04-16      4       3
cookie1 2015-04-15      4       4
cookie1 2015-04-13      3       5
cookie1 2015-04-14      2       6
cookie1 2015-04-10      1       7
cookie2 2015-04-15      9       1
cookie2 2015-04-16      7       2
cookie2 2015-04-13      6       3
cookie2 2015-04-12      5       4
cookie2 2015-04-11      3       5
cookie2 2015-04-14      3       6
cookie2 2015-04-10      2       7
Time taken: 21.9 seconds, Fetched: 14 row(s)


---RANK 和 DENSE_RANK

—RANK() 生成數據項在分組中的排名,排名相等會在名次中留下空位
—DENSE_RANK() 生成數據項在分組中的排名,排名相等會在名次中不會留下空位

-- Side-by-side ranking comparison on cookie1's daily pv:
--   RANK()       leaves gaps after ties    (1,2,3,3,5,...)
--   DENSE_RANK() leaves no gaps            (1,2,3,3,4,...)
--   ROW_NUMBER() is always unique          (1,2,3,4,5,...)
SELECT
cookieid,
createtime,
pv,
RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn1,
DENSE_RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rn2,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY pv DESC) AS rn3
FROM austin2
WHERE cookieid = 'cookie1';

Total MapReduce CPU Time Spent: 3 seconds 360 msec
OK
cookie1 2015-04-12      7       1       1       1
cookie1 2015-04-11      5       2       2       2
cookie1 2015-04-16      4       3       3       3
cookie1 2015-04-15      4       3       3       4
cookie1 2015-04-13      3       5       4       5
cookie1 2015-04-14      2       6       5       6
cookie1 2015-04-10      1       7       6       7
Time taken: 23.128 seconds, Fetched: 7 row(s)


rn1: 15號和16號並列第3, 13號排第5
rn2: 15號和16號並列第3, 13號排第4
rn3: 如果相等,則按記錄值排序,生成唯一的次序,如果所有記錄值都相等,或許會隨機排吧。


Hive分析窗口函數(3) LAG,LEAD,FIRST_VALUE,LAST_VALUE
數據准備:
vi austin3
cookie1,2015-04-10 10:00:02,url2
cookie1,2015-04-10 10:00:00,url1
cookie1,2015-04-10 10:03:04,1url3
cookie1,2015-04-10 10:50:05,url6
cookie1,2015-04-10 11:00:00,url7
cookie1,2015-04-10 10:10:00,url4
cookie1,2015-04-10 10:50:01,url5
cookie2,2015-04-10 10:00:02,url22
cookie2,2015-04-10 10:00:00,url11
cookie2,2015-04-10 10:03:04,1url33
cookie2,2015-04-10 10:50:05,url66
cookie2,2015-04-10 11:00:00,url77
cookie2,2015-04-10 10:10:00,url44
cookie2,2015-04-10 10:50:01,url55
 
 --create table
CREATE TABLE austin3 (
cookieid string,
createtime string,  --頁面訪問時間
url STRING       --被訪問頁面
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
stored as textfile;

--導入本地數據
load data local inpath '/home/hadoop/austin3' into table austin3;

hive> select * from austin3;
OK
cookie1 2015-04-10 10:00:02     url2
cookie1 2015-04-10 10:00:00     url1
cookie1 2015-04-10 10:03:04     1url3
cookie1 2015-04-10 10:50:05     url6
cookie1 2015-04-10 11:00:00     url7
cookie1 2015-04-10 10:10:00     url4
cookie1 2015-04-10 10:50:01     url5
cookie2 2015-04-10 10:00:02     url22
cookie2 2015-04-10 10:00:00     url11
cookie2 2015-04-10 10:03:04     1url33
cookie2 2015-04-10 10:50:05     url66
cookie2 2015-04-10 11:00:00     url77
cookie2 2015-04-10 10:10:00     url44
cookie2 2015-04-10 10:50:01     url55
Time taken: 0.05 seconds, Fetched: 14 row(s)

LAG

LAG(col,n,DEFAULT) 用於統計窗口內往上第n行值
第一個參數為列名,第二個參數為往上第n行(可選,默認為1),第三個參數為默認值(當往上第n行為NULL時候,取默認值,如不指定,則為NULL)

-- LAG: for each cookie's click stream (ordered by createtime) fetch earlier
-- rows' visit times. last_1_time uses the epoch string as default for the
-- first row; last_2_time has no default, so its first two rows are NULL.
SELECT cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
LAG(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS last_1_time,
LAG(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS last_2_time
FROM austin3;

Total MapReduce CPU Time Spent: 3 seconds 220 msec
OK
cookie1 2015-04-10 10:00:00     url1    1       1970-01-01 00:00:00     NULL
cookie1 2015-04-10 10:00:02     url2    2       2015-04-10 10:00:00     NULL
cookie1 2015-04-10 10:03:04     1url3   3       2015-04-10 10:00:02     2015-04-10 10:00:00
cookie1 2015-04-10 10:10:00     url4    4       2015-04-10 10:03:04     2015-04-10 10:00:02
cookie1 2015-04-10 10:50:01     url5    5       2015-04-10 10:10:00     2015-04-10 10:03:04
cookie1 2015-04-10 10:50:05     url6    6       2015-04-10 10:50:01     2015-04-10 10:10:00
cookie1 2015-04-10 11:00:00     url7    7       2015-04-10 10:50:05     2015-04-10 10:50:01
cookie2 2015-04-10 10:00:00     url11   1       1970-01-01 00:00:00     NULL
cookie2 2015-04-10 10:00:02     url22   2       2015-04-10 10:00:00     NULL
cookie2 2015-04-10 10:03:04     1url33  3       2015-04-10 10:00:02     2015-04-10 10:00:00
cookie2 2015-04-10 10:10:00     url44   4       2015-04-10 10:03:04     2015-04-10 10:00:02
cookie2 2015-04-10 10:50:01     url55   5       2015-04-10 10:10:00     2015-04-10 10:03:04
cookie2 2015-04-10 10:50:05     url66   6       2015-04-10 10:50:01     2015-04-10 10:10:00
cookie2 2015-04-10 11:00:00     url77   7       2015-04-10 10:50:05     2015-04-10 10:50:01
Time taken: 22.855 seconds, Fetched: 14 row(s)

last_1_time: 指定了往上第1行的值,default為'1970-01-01 00:00:00'  
             cookie1第一行,往上1行為NULL,因此取默認值 1970-01-01 00:00:00
             cookie1第三行,往上1行值為第二行值,2015-04-10 10:00:02
             cookie1第六行,往上1行值為第五行值,2015-04-10 10:50:01
last_2_time: 指定了往上第2行的值,為指定默認值
                         cookie1第一行,往上2行為NULL
                         cookie1第二行,往上2行為NULL
                         cookie1第四行,往上2行為第二行值,2015-04-10 10:00:02
                         cookie1第七行,往上2行為第五行值,2015-04-10 10:50:01


---LEAD

與LAG相反
LEAD(col,n,DEFAULT) 用於統計窗口內往下第n行值
第一個參數為列名,第二個參數為往下第n行(可選,默認為1),第三個參數為默認值(當往下第n行為NULL時候,取默認值,如不指定,則為NULL)

SELECT cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
LEAD(createtime,1,'1970-01-01 00:00:00') OVER(PARTITION BY cookieid ORDER BY createtime) AS next_1_time,
LEAD(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS next_2_time
FROM austin3;

Stage-Stage-1: Map: 1  Reduce: 1   Cumulative CPU: 2.86 sec   HDFS Read: 9978 HDFS Write: 991 SUCCESS
Total MapReduce CPU Time Spent: 2 seconds 860 msec
OK
cookie1 2015-04-10 10:00:00     url1    1       2015-04-10 10:00:02     2015-04-10 10:03:04
cookie1 2015-04-10 10:00:02     url2    2       2015-04-10 10:03:04     2015-04-10 10:10:00
cookie1 2015-04-10 10:03:04     1url3   3       2015-04-10 10:10:00     2015-04-10 10:50:01
cookie1 2015-04-10 10:10:00     url4    4       2015-04-10 10:50:01     2015-04-10 10:50:05
cookie1 2015-04-10 10:50:01     url5    5       2015-04-10 10:50:05     2015-04-10 11:00:00
cookie1 2015-04-10 10:50:05     url6    6       2015-04-10 11:00:00     NULL
cookie1 2015-04-10 11:00:00     url7    7       1970-01-01 00:00:00     NULL
cookie2 2015-04-10 10:00:00     url11   1       2015-04-10 10:00:02     2015-04-10 10:03:04
cookie2 2015-04-10 10:00:02     url22   2       2015-04-10 10:03:04     2015-04-10 10:10:00
cookie2 2015-04-10 10:03:04     1url33  3       2015-04-10 10:10:00     2015-04-10 10:50:01
cookie2 2015-04-10 10:10:00     url44   4       2015-04-10 10:50:01     2015-04-10 10:50:05
cookie2 2015-04-10 10:50:01     url55   5       2015-04-10 10:50:05     2015-04-10 11:00:00
cookie2 2015-04-10 10:50:05     url66   6       2015-04-10 11:00:00     NULL
cookie2 2015-04-10 11:00:00     url77   7       1970-01-01 00:00:00     NULL
Time taken: 22.859 seconds, Fetched: 14 row(s)

--邏輯與LAG一樣,只不過LAG是往上,LEAD是往下。

---FIRST_VALUE

取分組內排序后,截止到當前行,第一個值
-- FIRST_VALUE: the first url in each cookie's stream (ordered by time).
-- The partition's first row is inside every row's default frame, so first1
-- is the same for the whole partition.
SELECT cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
FIRST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime) AS first1
FROM austin3;
Total MapReduce CPU Time Spent: 2 seconds 920 msec
OK
cookie1 2015-04-10 10:00:00     url1    1       url1
cookie1 2015-04-10 10:00:02     url2    2       url1
cookie1 2015-04-10 10:03:04     1url3   3       url1
cookie1 2015-04-10 10:10:00     url4    4       url1
cookie1 2015-04-10 10:50:01     url5    5       url1
cookie1 2015-04-10 10:50:05     url6    6       url1
cookie1 2015-04-10 11:00:00     url7    7       url1
cookie2 2015-04-10 10:00:00     url11   1       url11
cookie2 2015-04-10 10:00:02     url22   2       url11
cookie2 2015-04-10 10:03:04     1url33  3       url11
cookie2 2015-04-10 10:10:00     url44   4       url11
cookie2 2015-04-10 10:50:01     url55   5       url11
cookie2 2015-04-10 10:50:05     url66   6       url11
cookie2 2015-04-10 11:00:00     url77   7       url11
Time taken: 21.91 seconds, Fetched: 14 row(s)


--LAST_VALUE

取分組內排序后,截止到當前行,最后一個值
-- LAST_VALUE with ORDER BY but no explicit frame uses the default frame
-- (unbounded preceding .. CURRENT ROW), so last1 is just the current row's
-- url — to get the partition's true last value, add
-- "rows between unbounded preceding and unbounded following".
SELECT cookieid,
createtime,
url,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY createtime) AS rn,
LAST_VALUE(url) OVER(PARTITION BY cookieid ORDER BY createtime) AS last1
FROM austin3;

Total MapReduce CPU Time Spent: 2 seconds 850 msec
OK
cookie1 2015-04-10 10:00:00     url1    1       url1
cookie1 2015-04-10 10:00:02     url2    2       url2
cookie1 2015-04-10 10:03:04     1url3   3       1url3
cookie1 2015-04-10 10:10:00     url4    4       url4
cookie1 2015-04-10 10:50:01     url5    5       url5
cookie1 2015-04-10 10:50:05     url6    6       url6
cookie1 2015-04-10 11:00:00     url7    7       url7
cookie2 2015-04-10 10:00:00     url11   1       url11
cookie2 2015-04-10 10:00:02     url22   2       url22
cookie2 2015-04-10 10:03:04     1url33  3       1url33
cookie2 2015-04-10 10:10:00     url44   4       url44
cookie2 2015-04-10 10:50:01     url55   5       url55
cookie2 2015-04-10 10:50:05     url66   6       url66
cookie2 2015-04-10 11:00:00     url77   7       url77
Time taken: 22.864 seconds, Fetched: 14 row(s)
如果不指定ORDER BY,則默認按照記錄在文件中的偏移量進行排序,會出現錯誤的結果

-- FIRST_VALUE without ORDER BY: row order inside the partition is undefined
-- (file offset order in practice), so the result is unreliable.
SELECT
    cookieid,
    createtime,
    url,
    FIRST_VALUE(url) OVER (PARTITION BY cookieid) AS first2
FROM austin3;

Total MapReduce CPU Time Spent: 2 seconds 750 msec
OK
cookie1 2015-04-10 10:00:02     url2    url2
cookie1 2015-04-10 10:50:01     url5    url2
cookie1 2015-04-10 10:10:00     url4    url2
cookie1 2015-04-10 11:00:00     url7    url2
cookie1 2015-04-10 10:50:05     url6    url2
cookie1 2015-04-10 10:03:04     1url3   url2
cookie1 2015-04-10 10:00:00     url1    url2
cookie2 2015-04-10 10:50:01     url55   url55
cookie2 2015-04-10 10:10:00     url44   url55
cookie2 2015-04-10 11:00:00     url77   url55
cookie2 2015-04-10 10:50:05     url66   url55
cookie2 2015-04-10 10:03:04     1url33  url55
cookie2 2015-04-10 10:00:00     url11   url55
cookie2 2015-04-10 10:00:02     url22   url55
Time taken: 21.846 seconds, Fetched: 14 row(s)

-- LAST_VALUE without ORDER BY: same caveat — partition row order is undefined,
-- so the "last" value is arbitrary.
SELECT
    cookieid,
    createtime,
    url,
    LAST_VALUE(url) OVER (PARTITION BY cookieid) AS last2
FROM austin3;
 Total MapReduce CPU Time Spent: 2 seconds 950 msec
OK
cookie1 2015-04-10 10:00:02     url2    url1
cookie1 2015-04-10 10:50:01     url5    url1
cookie1 2015-04-10 10:10:00     url4    url1
cookie1 2015-04-10 11:00:00     url7    url1
cookie1 2015-04-10 10:50:05     url6    url1
cookie1 2015-04-10 10:03:04     1url3   url1
cookie1 2015-04-10 10:00:00     url1    url1
cookie2 2015-04-10 10:50:01     url55   url22
cookie2 2015-04-10 10:10:00     url44   url22
cookie2 2015-04-10 11:00:00     url77   url22
cookie2 2015-04-10 10:50:05     url66   url22
cookie2 2015-04-10 10:03:04     1url33  url22
cookie2 2015-04-10 10:00:00     url11   url22
cookie2 2015-04-10 10:00:02     url22   url22
Time taken: 24.82 seconds, Fetched: 14 row(s)

如果想要取分組內排序后最后一個值,則需要變通一下:
-- True "last value per partition": LAST_VALUE with the default frame only sees
-- up to the current row, so take FIRST_VALUE over a DESCENDING sort instead.
SELECT
    cookieid,
    createtime,
    url,
    ROW_NUMBER()     OVER (PARTITION BY cookieid ORDER BY createtime)      AS rn,
    LAST_VALUE(url)  OVER (PARTITION BY cookieid ORDER BY createtime)      AS last1,
    FIRST_VALUE(url) OVER (PARTITION BY cookieid ORDER BY createtime DESC) AS last2
FROM austin3
ORDER BY cookieid, createtime;

Total MapReduce CPU Time Spent: 8 seconds 0 msec
OK
cookie1 2015-04-10 10:00:00     url1    1       url1    url7
cookie1 2015-04-10 10:00:02     url2    2       url2    url7
cookie1 2015-04-10 10:03:04     1url3   3       1url3   url7
cookie1 2015-04-10 10:10:00     url4    4       url4    url7
cookie1 2015-04-10 10:50:01     url5    5       url5    url7
cookie1 2015-04-10 10:50:05     url6    6       url6    url7
cookie1 2015-04-10 11:00:00     url7    7       url7    url7
cookie2 2015-04-10 10:00:00     url11   1       url11   url77
cookie2 2015-04-10 10:00:02     url22   2       url22   url77
cookie2 2015-04-10 10:03:04     1url33  3       1url33  url77
cookie2 2015-04-10 10:10:00     url44   4       url44   url77
cookie2 2015-04-10 10:50:01     url55   5       url55   url77
cookie2 2015-04-10 10:50:05     url66   6       url66   url77
cookie2 2015-04-10 11:00:00     url77   7       url77   url77
Time taken: 66.336 seconds, Fetched: 14 row(s)
提示:在使用分析函數的過程中,要特別注意ORDER BY子句,用的不恰當,統計出的結果就不是你所期望的

Hive分析窗口函數(4) CUME_DIST,PERCENT_RANK
數據准備;vi austin4
d1,user1,1000
d1,user2,2000
d1,user3,3000
d2,user4,4000
d2,user5,5000
 
-- Table for the CUME_DIST / PERCENT_RANK examples (dept, user, salary).
CREATE TABLE austin4 (
    dept   STRING,
    userid STRING,
    sal    INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

load data local inpath '/home/hadoop/austin4' into table austin4;

hive> select * from austin4;
OK
d1      user1   1000
d1      user2   2000
d1      user3   3000
d2      user4   4000
d2      user5   5000
Time taken: 0.055 seconds, Fetched: 5 row(s)

--CUME_DIST:(小於等於當前值的行數) / (分組內總行數)
--比如,統計小於等於當前薪水的人數,所占總人數的比例
-- CUME_DIST = (rows with value <= current value) / (total rows in partition).
SELECT
    dept,
    userid,
    sal,
    CUME_DIST() OVER (ORDER BY sal)                   AS rn1,  -- no partition: whole result set
    CUME_DIST() OVER (PARTITION BY dept ORDER BY sal) AS rn2   -- within each dept
FROM austin4;

Total MapReduce CPU Time Spent: 7 seconds 30 msec
OK
d1      user1   1000    0.2     0.3333333333333333
d1      user2   2000    0.4     0.6666666666666666
d1      user3   3000    0.6     1.0
d2      user4   4000    0.8     0.5
d2      user5   5000    1.0     1.0
Time taken: 65.446 seconds, Fetched: 5 row(s)

rn1: 沒有partition,所有數據均為1組,總行數為5,
     第一行:小於等於1000的行數為1,因此,1/5=0.2
     第二行:小於等於2000的行數為2,因此,2/5=0.4
     第三行:小於等於3000的行數為3,因此,3/5=0.6
rn2: 按照部門分組,dpet=d1的行數為3,
     第二行:小於等於2000的行數為2,因此,2/3=0.6666666666666666

--PERCENT_RANK
--PERCENT_RANK:(分組內當前行的RANK值 - 1) / (分組內總行數 - 1)
應用場景不了解,可能在一些特殊算法的實現中可以用到吧。

SELECT
dept,
userid,
sal,
PERCENT_RANK() OVER(ORDER BY sal) AS rn1,   -- (rank - 1) / (total rows - 1); no partition, so over the whole result set
RANK() OVER(ORDER BY sal) AS rn11,          -- rank over the whole result set
SUM(1) OVER(PARTITION BY NULL) AS rn12,     -- total number of rows
PERCENT_RANK() OVER(PARTITION BY dept ORDER BY sal) AS rn2  -- same formula, computed within each dept
FROM austin4;

Total MapReduce CPU Time Spent: 8 seconds 370 msec
OK
d1      user1   1000    0.0     1       5       0.0
d1      user2   2000    0.25    2       5       0.5
d1      user3   3000    0.5     3       5       1.0
d2      user4   4000    0.75    4       5       0.0
d2      user5   5000    1.0     5       5       1.0
Time taken: 68.557 seconds, Fetched: 5 row(s)

rn1: rn1 = (rn11-1) / (rn12-1)
       第一行,(1-1)/(5-1)=0/4=0
       第二行,(2-1)/(5-1)=1/4=0.25
       第四行,(4-1)/(5-1)=3/4=0.75
rn2: 按照dept分組,
     dept=d1的總行數為3
     第一行,(1-1)/(3-1)=0
     第三行,(3-1)/(3-1)=1

    
Hive分析窗口函數(5) SUM,AVG,MIN,MAX

准備數據:vi austin5
cookie1,2015-04-10,1
cookie1,2015-04-11,5
cookie1,2015-04-12,7
cookie1,2015-04-13,3
cookie1,2015-04-14,2
cookie1,2015-04-15,4
cookie1,2015-04-16,4

 --create table   
-- Daily page-view table used by the SUM/AVG/MIN/MAX window examples.
CREATE TABLE austin5 (
    cookieid   STRING,
    createtime STRING,  -- day, e.g. '2015-04-10'
    pv         INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
    
--導入本地數據
load data local inpath '/home/hadoop/austin5' into table austin5;
    
hive> select * from austin5;
    OK
cookie1 2015-04-10      1
cookie1 2015-04-11      5
cookie1 2015-04-12      7
cookie1 2015-04-13      3
cookie1 2015-04-14      2
cookie1 2015-04-15      4
cookie1 2015-04-16      4

SUM — 注意,結果和ORDER BY相關,默認為升序

-- Running sums over different window frames (ROWS BETWEEN ...).
SELECT
    cookieid,
    createtime,
    pv,
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime) AS pv1,  -- default frame: partition start to current row
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime
                  ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2,  -- explicit form of pv1
    SUM(pv) OVER (PARTITION BY cookieid) AS pv3,  -- entire partition
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime
                  ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,  -- 3 preceding rows + current row
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime
                  ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,  -- 3 preceding + current + 1 following
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime
                  ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6  -- current row to partition end
FROM austin5;

-- Same query as above, executed to produce the output that follows.
SELECT
    cookieid,
    createtime,
    pv,
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime)                                                       AS pv1,
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)      AS pv2,
    SUM(pv) OVER (PARTITION BY cookieid)                                                                           AS pv3,
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW)              AS pv4,
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING)              AS pv5,
    SUM(pv) OVER (PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING)      AS pv6
FROM austin5;
Total MapReduce CPU Time Spent: 6 seconds 10 msec
OK
cookieid createtime     pv      pv1     pv2     pv3     pv4     pv5      pv6
-----------------------------------------------------------------------------
cookie1  2015-04-10      1       1       1       26      1       6       26
cookie1  2015-04-11      5       6       6       26      6       13      25
cookie1  2015-04-12      7       13      13      26      13      16      20
cookie1  2015-04-13      3       16      16      26      16      18      13
cookie1  2015-04-14      2       18      18      26      17      21      10
cookie1  2015-04-15      4       22      22      26      16      20      8
cookie1  2015-04-16      4       26      26      26      13      13      4
Time taken: 51.209 seconds, Fetched: 7 row(s)

pv1: 分組內從起點到當前行的pv累積,如,11號的pv1=10號的pv+11號的pv, 12號=10號+11號+12號
pv2: 同pv1
pv3: 分組內(cookie1)所有的pv累加
pv4: 分組內當前行+往前3行,如,11號=10號+11號, 12號=10號+11號+12號, 13號=10號+11號+12號+13號, 14號=11號+12號+13號+14號
pv5: 分組內當前行+往前3行+往后1行,如,14號=11號+12號+13號+14號+15號=5+7+3+2+4=21
pv6: 分組內當前行+往后所有行,如,13號=13號+14號+15號+16號=3+2+4+4=13,14號=14號+15號+16號=2+4+4=10

如果不指定ROWS BETWEEN,默認為從起點到當前行;
如果不指定ORDER BY,則將分組內所有值累加;
關鍵是理解ROWS BETWEEN含義,也叫做WINDOW子句:
PRECEDING:往前
FOLLOWING:往后
CURRENT ROW:當前行
UNBOUNDED:起點,UNBOUNDED PRECEDING 表示從前面的起點, UNBOUNDED FOLLOWING:表示到后面的終點

    --AVG
    SELECT cookieid,
    createtime,
    pv,
    AVG(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS pv1, -- 默認為從起點到當前行
    AVG(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2, --從起點到當前行,結果同pv1
    AVG(pv) OVER(PARTITION BY cookieid) AS pv3,    --分組內所有行
    AVG(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,   --當前行+往前3行
    AVG(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,    --當前行+往前3行+往后1行
    AVG(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6   ---當前行+往后所有行  
    FROM lxw1234;
    cookieid createtime     pv      pv1     pv2     pv3     pv4     pv5      pv6
    -----------------------------------------------------------------------------
    cookie1 2015-04-10      1       1.0     1.0     3.7142857142857144      1.0     3.0     3.7142857142857144
    cookie1 2015-04-11      5       3.0     3.0     3.7142857142857144      3.0     4.333333333333333       4.166666666666667
    cookie1 2015-04-12      7       4.333333333333333       4.333333333333333       3.7142857142857144      4.333333333333333       4.0     4.0
    cookie1 2015-04-13      3       4.0     4.0     3.7142857142857144      4.0     3.6     3.25
    cookie1 2015-04-14      2       3.6     3.6     3.7142857142857144      4.25    4.2     3.3333333333333335
    cookie1 2015-04-15      4       3.6666666666666665      3.6666666666666665      3.7142857142857144      4.0     4.0     4.0
    cookie1 2015-04-16      4       3.7142857142857144      3.7142857142857144      3.7142857142857144      3.25    3.25    4.0

    --MIN
    SELECT cookieid,
    createtime,
    pv,
    MIN(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS pv1, -- 默認為從起點到當前行
    MIN(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2, --從起點到當前行,結果同pv1
    MIN(pv) OVER(PARTITION BY cookieid) AS pv3,--分組內所有行
    MIN(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,   --當前行+往前3行
    MIN(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,    --當前行+往前3行+往后1行
    MIN(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6   ---當前行+往后所有行  
    FROM lxw1234;
     
    cookieid createtime     pv      pv1     pv2     pv3     pv4     pv5      pv6
    -----------------------------------------------------------------------------
    cookie1 2015-04-10      1       1       1       1       1       1       1
    cookie1 2015-04-11      5       1       1       1       1       1       2
    cookie1 2015-04-12      7       1       1       1       1       1       2
    cookie1 2015-04-13      3       1       1       1       1       1       2
    cookie1 2015-04-14      2       1       1       1       2       2       2
    cookie1 2015-04-15      4       1       1       1       2       2       4
    cookie1 2015-04-16      4       1       1       1       2       2       4

    ----MAX
    SELECT cookieid,
    createtime,
    pv,
    MAX(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS pv1, -- 默認為從起點到當前行
    MAX(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2, --從起點到當前行,結果同pv1
    MAX(pv) OVER(PARTITION BY cookieid) AS pv3,                                --分組內所有行
    MAX(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4,   --當前行+往前3行
    MAX(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5,    --當前行+往前3行+往后1行
    MAX(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6   ---當前行+往后所有行  
    FROM lxw1234;
     
    cookieid createtime     pv      pv1     pv2     pv3     pv4     pv5      pv6
    -----------------------------------------------------------------------------
    cookie1 2015-04-10      1       1       1       7       1       5       7
    cookie1 2015-04-11      5       5       5       7       5       7       7
    cookie1 2015-04-12      7       7       7       7       7       7       7
    cookie1 2015-04-13      3       7       7       7       7       7       4
    cookie1 2015-04-14      2       7       7       7       7       7       4
    cookie1 2015-04-15      4       7       7       7       7       7       4
    cookie1 2015-04-16      4       7       7       7       4       4       4





免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM