天池新人實戰賽《阿里移動推薦算法》(三)—— 平台賽模型規則


   注:作為新手,基本思路來源於天池官網的視頻教程;本文僅是在天池數加平台上學習後所作的個人總結,特此聲明。

   視頻網址:https://tianchi.shuju.aliyun.com/video.htm?spm=5176.100068.1234.7.7Kftz1 《新手入門賽-第四課時》

       

一、SQL預處理:

  step1 —— 0_init

  預處理數據:根據項目空間給出的原始數據表構造符合自己需求的新表。

 1 --odps sql
 2 --****************************************************************************************
 3 --author:smilemoon
 4 --create time;2017-01-17
 5 --****************************************************************************************
 6 
 7 -- copy user data and convert
 8 -- 整合賽題中tianchi_fresh_comp_train_user_online的數據
 9 DROP TABLE IF EXISTS recommend_train_user;
10 -- 創建依賴於behavior_type和days的hive分區表
11 CREATE TABLE IF NOT EXISTS recommend_train_user (
12     user_id BIGINT,
13     item_id BIGINT,
14     item_category BIGINT,
15     user_geohash STRING,
16     time DATETIME,
17     hours BIGINT
18 )
19 PARTITIONED BY (
20     behavior_type STRING,
21     days STRING
22 );
23 
24 -- INSERT OVER WRITE 表示往HIVE表中插入記錄並覆蓋
25 INSERT OVERWRITE TABLE recommend_train_user PARTITION (behavior_type, days)
26 SELECT t1.user_id,t1.item_id,t1.item_category,t1.user_geohash,to_date(t1.time,'yyyy-mm-dd hh'),substr(t1.time,12,13) as hours,
27 t1.behavior_type,datediff(to_date(t1.time,'yyyy-mm-dd hh'),'2014-11-18 00:00:00','dd') as days
28 FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_user_online t1;
29 
-- Copy the raw item table (the item subset) into a working table.
drop table if exists recommend_train_item;
create table if not exists recommend_train_item (
    item_id bigint,
    item_geohash string,
    item_category bigint
);

insert overwrite table recommend_train_item
select
    t1.item_id,
    t1.item_geohash,
    t1.item_category
from odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online t1;

-- Create the o2o data: behavior records restricted to the item subset.
drop table if exists recommend_train_user_o2o;
-- LIKE copies the table structure of recommend_train_user.
create table recommend_train_user_o2o like recommend_train_user;

-- Keep every recommend_train_user row whose item_id appears in
-- tianchi_fresh_comp_train_item_online.
-- Columns are listed explicitly because the insert is positional; the last two
-- feed the dynamic partitions (behavior_type, days).
insert overwrite table recommend_train_user_o2o partition (behavior_type, days)
select
    t2.user_id,
    t2.item_id,
    t2.item_category,
    t2.user_geohash,
    t2.time,
    t2.hours,
    t2.behavior_type,
    t2.days
from
(
    -- distinct item_id values; the original author notes that GROUP BY may
    -- perform better than DISTINCT here
    select item_id
    from odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    group by item_id
) t1
join recommend_train_user t2
on t1.item_id = t2.item_id;

 

  step2.0 —— 1_train_set

     根據原始表構造訓練用特徵表,用於後續的各種分析。

  1 --odps sql
  2 --****************************************************************************************
  3 --author:smilemoon
  4 --create time;2017-01-14
  5 --****************************************************************************************
  6 -- labelday: 29(2014-12-17),window : 19~28
  7 
  8 
  9 -- training set
 10 -- user_item
 11 
 12 --step1
 13 --從recommend_train_user用戶行為記錄表創建第1到28天的特征集
 14 --創建按照(user_id,item_id)對進行統計的特征列
 15 DROP TABLE IF EXISTS ui_feats_29;
 16 CREATE TABLE ui_feats_29
 17 AS
 18 SELECT user_id, item_id, item_category,
 19      SUM(decode(behavior_type = '1', true, 1, 0)) AS ui_bro_cnt,--瀏覽的次數
 20      SUM(decode(behavior_type = '2', true, 1, 0)) AS ui_fav_cnt,--收藏的次數
 21      SUM(decode(behavior_type = '3', true, 1, 0)) AS ui_cart_cnt,--加入購物車的次數
 22      SUM(decode(behavior_type = '4', true, 1, 0)) AS ui_buy_cnt,--購買的次數
 23      datediff('2014-12-17 00:00:00', MIN(time), 'hh') AS ui_first_hour_gap,--第一次行為距離的小時數
 24      datediff('2014-12-17 00:00:00', MAX(time), 'hh') AS ui_last_hour_gap,--最后一次行為距離的小時數
 25      COUNT(DISTINCT days) AS ui_act_days, -- 存在行為的活動天數
 26      -- ***人工規則特征統計***
 27      decode(sum(decode(behavior_type='3' and days=28 and hours>=18,true,1,0))>0,true,1,0)--end sum 第28天18點之后加入購物車的行為的次數是否大於0)
 28      * decode(sum(decode(behavior_type='4' and days=28 and hours>=18,true,1,0))=0,true,1,0)-- 第28天18點之后沒有購買行為的
 29      as iscartnotbuy6h,--6小時內放到購物車是否購買 1|沒有購買 0|購買
 30      decode(sum(decode(behavior_type='3' and days=28 and hours>=12,true,1,0))>0,true,1,0)--end sum 第28天12點之后加入購物車的行為的次數是否大於0)
 31      * decode(sum(decode(behavior_type='4' and days=28 and hours>=12,true,1,0))=0,true,1,0)-- 第28天12點之后沒有購買行為的
 32      as iscartnotbuy12h,--6小時內放到購物車是否購買 1|沒有購買 0|購買
 33      decode(sum(decode(behavior_type='3' and days=28,true,1,0))>0,true,1,0)--end sum 第28天0點之后加入購物車的行為的次數是否大於0)
 34      * decode(sum(decode(behavior_type='4' and days=28,true,1,0))=0,true,1,0)-- 第28天0點之后沒有購買行為的
 35      as iscartnotbuy24h --6小時內放到購物車是否購買 1|沒有購買 0|購買
 36 FROM recommend_train_user
 37 -- 在第19天到第28天 10天內用戶的行為數據,用於預測第29天的行為
 38 WHERE days >= 19
 39     AND days <= 28
 40 --根據user_id,item_id,item_category分組
 41 GROUP BY user_id, 
 42     item_id, 
 43     item_category;
 44     
 45     
 46     
 47 --step2    
 48 --user_category
 49 --從recommend_train_user用戶行為記錄表創建第1到28天的特征集
 50 --創建按照(user_id,item_category)對進行統計的特征列
 51 DROP TABLE IF EXISTS uc_feats_29;
 52 CREATE TABLE uc_feats_29
 53 AS
 54 SELECT user_id,item_category,
 55      SUM(decode(behavior_type = '1', true, 1, 0)) AS uc_bro_cnt,--瀏覽次數
 56      SUM(decode(behavior_type = '2', true, 1, 0)) AS uc_fav_cnt,--收藏次數
 57      SUM(decode(behavior_type = '3', true, 1, 0)) AS uc_cart_cnt,--加入購物車次數
 58      SUM(decode(behavior_type = '4', true, 1, 0)) AS uc_buy_cnt,--購買次數
 59      decode(SUM(decode(behavior_type = '4', true, 1, 0)) > 0, true, 1, 0) AS isbuycatebefore,--之前是否有過購買記錄
 60      decode(SUM(decode(behavior_type = '4' AND days > 28 - 1, true, 1, 0)) = 0, true, 1, 0) AS isnotbuycate24h,--24小時內是否有過購買記錄
 61      decode(SUM(decode(behavior_type = '4' AND days > 28 - 3, true, 1, 0)) = 0, true, 1, 0) AS isnotbuycate3d --3天內是否有過購買記錄
 62 FROM recommend_train_user
 63 -- 在第19天到第28天 10天內用戶的行為數據,用於預測第29天的行為
 64 WHERE days >= 19
 65     AND days <= 28
 66 --根據user_id,item_category分組
 67 GROUP BY user_id, 
 68     item_category;
 69 
 70 
 71 
 72 --step3
 73 --user
 74 --從recommend_train_user用戶行為記錄表創建第1到28天的特征集
 75 --創建按照user_id進行統計的特征列
 76 DROP TABLE IF EXISTS u_feats_29;
 77 CREATE TABLE u_feats_29
 78 AS
 79 SELECT user_id,
 80      SUM(decode(behavior_type = '1', true, 1, 0)) AS u_bro_cnt,--瀏覽次數
 81      SUM(decode(behavior_type = '2', true, 1, 0)) AS u_fav_cnt,--收藏次數
 82      SUM(decode(behavior_type = '3', true, 1, 0)) AS u_cart_cnt,--加入購物車次數
 83      SUM(decode(behavior_type = '4', true, 1, 0)) AS u_buy_cnt,--購買次數
 84      SUM(decode(behavior_type = '4', true, 1, 0)) / (SUM(decode(behavior_type = '3', true, 1, 0)) + 1) AS u_ratio_buycart, -- 購買次數/購物車次數
 85      SUM(decode(behavior_type = '4', true, 1, 0)) / (SUM(decode(behavior_type = '1', true, 1, 0)) + 1) AS u_ratio_buybro -- 購買次數/瀏覽次數
 86 FROM recommend_train_user
 87 -- 在第19天到第28天 10天內用戶的行為數據,用於預測第29天的行為
 88 WHERE days >= 19
 89     AND days <= 28
 90 GROUP BY user_id;
 91 
 92 
 93 
 94 --step4
 95 --item_id
 96 --從recommend_train_user用戶行為記錄表創建第1到28天的特征集
 97 --創建按照user_id進行統計的特征列
 98 DROP TABLE IF EXISTS i_feats_29;
 99 CREATE TABLE i_feats_29
100 AS
101 SELECT item_id,
102     SUM(decode(behavior_type = '1', true, 1, 0)) AS i_bro_cnt,--瀏覽次數
103     SUM(decode(behavior_type = '2', true, 1, 0)) AS i_fav_cnt,--收藏次數
104     SUM(decode(behavior_type = '3', true, 1, 0)) AS i_cart_cnt,--加入購物車次數
105     SUM(decode(behavior_type = '4', true, 1, 0)) AS i_buy_cnt,--購買次數
106     SUM(decode(behavior_type = '4', true, 1, 0)) / (SUM(decode(behavior_type = '3', true, 1, 0)) + 1) AS i_ratio_buycart,-- 購買次數/購物車次數
107     SUM(decode(behavior_type = '4', true, 1, 0)) / (SUM(decode(behavior_type = '3', true, 1, 0)) + 1) AS i_ratio_buybro -- 購買次數/瀏覽次數
108 FROM recommend_train_user 
109 -- 在第19天到第28天 10天內用戶的行為數據,用於預測第29天的行為
110 WHERE days >= 19
111     AND days <= 28
112 GROUP BY item_id;
113 
114 
115 
-- step5
-- Join the ui / uc / u / i feature tables and attach the label.
DROP TABLE IF EXISTS feats_29;
CREATE TABLE IF NOT EXISTS feats_29
AS
SELECT
    t1.user_id,
    t1.item_id,
    t1.item_category,
    -- label: 1 when the left outer join found a day-29 purchase for this
    -- (user_id, item_id) pair, 0 when the right side is NULL
    decode(t_l.user_id IS NOT NULL, true, 1, 0) AS label,

    -- ui: (user, item) features
    t1.ui_bro_cnt,
    t1.ui_fav_cnt,
    t1.ui_cart_cnt,
    t1.ui_buy_cnt,
    t1.ui_first_hour_gap,
    t1.ui_last_hour_gap,
    t1.ui_act_days,
    t1.iscartnotbuy6h,
    t1.iscartnotbuy12h,
    t1.iscartnotbuy24h,

    -- uc: (user, category) features
    t2.uc_bro_cnt,
    t2.uc_fav_cnt,
    t2.uc_cart_cnt,
    t2.uc_buy_cnt,
    t2.isbuycatebefore,
    t2.isnotbuycate24h,
    t2.isnotbuycate3d,

    -- u: user features
    t3.u_bro_cnt,
    t3.u_fav_cnt,
    t3.u_cart_cnt,
    t3.u_buy_cnt,
    t3.u_ratio_buycart,
    t3.u_ratio_buybro,

    -- i: item features
    t4.i_bro_cnt,
    t4.i_fav_cnt,
    t4.i_cart_cnt,
    t4.i_buy_cnt,
    t4.i_ratio_buycart,
    t4.i_ratio_buybro,

    -- cross features: products of the 0/1 rule flags, plus one scaled count
    t1.iscartnotbuy6h * t2.isnotbuycate24h AS iscartnobuy6h_notbuycate24h,
    t1.iscartnotbuy12h * t2.isnotbuycate24h AS iscartnobuy12h_notbuycate24h,
    t1.iscartnotbuy24h * t2.isnotbuycate24h AS iscartnobuy24h_notbuycate24h,
    t1.iscartnotbuy24h * t2.isnotbuycate3d AS iscartnobuy24h_notbuycate3d,
    t3.u_ratio_buycart * t1.ui_cart_cnt AS u_ui_buycart
FROM
    ui_feats_29 t1
JOIN uc_feats_29 t2
    ON t1.user_id = t2.user_id
    AND t1.item_category = t2.item_category
JOIN u_feats_29 t3
    ON t1.user_id = t3.user_id
JOIN i_feats_29 t4
    ON t1.item_id = t4.item_id
LEFT OUTER JOIN (
    -- distinct (user_id, item_id) pairs purchased on day 29
    SELECT
        user_id, item_id
    FROM
        recommend_train_user
    WHERE days = 29
        AND behavior_type = '4'
    GROUP BY user_id, item_id
) t_l
ON t1.user_id = t_l.user_id
AND t1.item_id = t_l.item_id;

-- step6
-- Fetch the o2o set: feats_29 restricted to the item subset.
DROP TABLE IF EXISTS feats_29_o2o;
CREATE TABLE IF NOT EXISTS feats_29_o2o
AS
SELECT t1.*
FROM feats_29 t1
JOIN (
    -- distinct item_id values of the item subset
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) t2
-- the inner join drops feats_29 rows whose item is not in the item subset
ON t1.item_id = t2.item_id;

 

  step3.0 —— 2_test_set

     根據原始表構造測試用特徵表,用於後續的各種分析。

 

  1 --odps sql
  2 --****************************************************************************************
  3 --author:smilemoon
  4 --create time;2017-01-14
  5 --****************************************************************************************
  6 -- labelday: 30(2014-12-18),window : 20~29
  7 
  8 
  9 -- training set
 10 -- user_item
 11 
 12 --step1
 13 --從recommend_train_user用戶行為記錄表創建第1到29天的特征集
 14 --創建按照(user_id,item_id)對進行統計的特征列
 15 DROP TABLE IF EXISTS ui_feats_30;
 16 CREATE TABLE ui_feats_30
 17 AS
 18 SELECT user_id, item_id, item_category,
 19      SUM(decode(behavior_type = '1', true, 1, 0)) AS ui_bro_cnt,--瀏覽的次數
 20      SUM(decode(behavior_type = '2', true, 1, 0)) AS ui_fav_cnt,--收藏的次數
 21      SUM(decode(behavior_type = '3', true, 1, 0)) AS ui_cart_cnt,--加入購物車的次數
 22      SUM(decode(behavior_type = '4', true, 1, 0)) AS ui_buy_cnt,--購買的次數
 23      datediff('2014-12-18 00:00:00', MIN(time), 'hh') AS ui_first_hour_gap,--第一次行為距離的小時數
 24      datediff('2014-12-18 00:00:00', MAX(time), 'hh') AS ui_last_hour_gap,--最后一次行為距離的小時數
 25      COUNT(DISTINCT days) AS ui_act_days, -- 存在行為的活動天數
 26      -- ***人工規則特征統計***
 27      decode(sum(decode(behavior_type='3' and days=29 and hours>=18,true,1,0))>0,true,1,0)--end sum 第29天18點之后加入購物車的行為的次數是否大於0)
 28      * decode(sum(decode(behavior_type='4' and days=29 and hours>=18,true,1,0))=0,true,1,0)-- 第29天18點之后沒有購買行為的
 29      as iscartnotbuy6h,--6小時內放到購物車是否購買 1|沒有購買 0|購買
 30      decode(sum(decode(behavior_type='3' and days=29 and hours>=12,true,1,0))>0,true,1,0)--end sum 第29天12點之后加入購物車的行為的次數是否大於0)
 31      * decode(sum(decode(behavior_type='4' and days=29 and hours>=12,true,1,0))=0,true,1,0)-- 第29天12點之后沒有購買行為的
 32      as iscartnotbuy12h,--6小時內放到購物車是否購買 1|沒有購買 0|購買
 33      decode(sum(decode(behavior_type='3' and days=29,true,1,0))>0,true,1,0)--end sum 第29天0點之后加入購物車的行為的次數是否大於0)
 34      * decode(sum(decode(behavior_type='4' and days=29,true,1,0))=0,true,1,0)-- 第29天0點之后沒有購買行為的
 35      as iscartnotbuy24h --6小時內放到購物車是否購買 1|沒有購買 0|購買
 36 FROM recommend_train_user
 37 -- 在第20天到第29天 10天內用戶的行為數據,用於預測第30天的行為
 38 WHERE days >= 20
 39     AND days <= 29
 40 --根據user_id,item_id,item_category分組
 41 GROUP BY user_id, 
 42     item_id, 
 43     item_category;
 44     
 45     
 46     
 47 --step2    
 48 --user_category
 49 --從recommend_train_user用戶行為記錄表創建第1到29天的特征集
 50 --創建按照(user_id,item_category)對進行統計的特征列
 51 DROP TABLE IF EXISTS uc_feats_30;
 52 CREATE TABLE uc_feats_30
 53 AS
 54 SELECT user_id,item_category,
 55      SUM(decode(behavior_type = '1', true, 1, 0)) AS uc_bro_cnt,--瀏覽次數
 56      SUM(decode(behavior_type = '2', true, 1, 0)) AS uc_fav_cnt,--收藏次數
 57      SUM(decode(behavior_type = '3', true, 1, 0)) AS uc_cart_cnt,--加入購物車次數
 58      SUM(decode(behavior_type = '4', true, 1, 0)) AS uc_buy_cnt,--購買次數
 59      decode(SUM(decode(behavior_type = '4', true, 1, 0)) > 0, true, 1, 0) AS isbuycatebefore,--之前是否有過購買記錄
 60      decode(SUM(decode(behavior_type = '4' AND days > 29 - 1, true, 1, 0)) = 0, true, 1, 0) AS isnotbuycate24h,--24小時內是否有過購買記錄
 61      decode(SUM(decode(behavior_type = '4' AND days > 29 - 3, true, 1, 0)) = 0, true, 1, 0) AS isnotbuycate3d --3天內是否有過購買記錄
 62 FROM recommend_train_user
 63 -- 在第20天到第29天 10天內用戶的行為數據,用於預測第30天的行為
 64 WHERE days >= 20
 65     AND days <= 29
 66 --根據user_id,item_category分組
 67 GROUP BY user_id, 
 68     item_category;
 69 
 70 
 71 
 72 --step3
 73 --user
 74 --從recommend_train_user用戶行為記錄表創建第1到29天的特征集
 75 --創建按照user_id進行統計的特征列
 76 DROP TABLE IF EXISTS u_feats_30;
 77 CREATE TABLE u_feats_30
 78 AS
 79 SELECT user_id,
 80      SUM(decode(behavior_type = '1', true, 1, 0)) AS u_bro_cnt,--瀏覽次數
 81      SUM(decode(behavior_type = '2', true, 1, 0)) AS u_fav_cnt,--收藏次數
 82      SUM(decode(behavior_type = '3', true, 1, 0)) AS u_cart_cnt,--加入購物車次數
 83      SUM(decode(behavior_type = '4', true, 1, 0)) AS u_buy_cnt,--購買次數
 84      SUM(decode(behavior_type = '4', true, 1, 0)) / (SUM(decode(behavior_type = '3', true, 1, 0)) + 1) AS u_ratio_buycart, -- 購買次數/購物車次數
 85      SUM(decode(behavior_type = '4', true, 1, 0)) / (SUM(decode(behavior_type = '1', true, 1, 0)) + 1) AS u_ratio_buybro -- 購買次數/瀏覽次數
 86 FROM recommend_train_user
 87 -- 在第20天到第29天 10天內用戶的行為數據,用於預測第30天的行為
 88 WHERE days >= 20
 89     AND days <= 29
 90 GROUP BY user_id;
 91 
 92 
 93 
 94 --step4
 95 --item_id
 96 --從recommend_train_user用戶行為記錄表創建第1到29天的特征集
 97 --創建按照user_id進行統計的特征列
 98 DROP TABLE IF EXISTS i_feats_30;
 99 CREATE TABLE i_feats_30
100 AS
101 SELECT item_id,
102     SUM(decode(behavior_type = '1', true, 1, 0)) AS i_bro_cnt,--瀏覽次數
103     SUM(decode(behavior_type = '2', true, 1, 0)) AS i_fav_cnt,--收藏次數
104     SUM(decode(behavior_type = '3', true, 1, 0)) AS i_cart_cnt,--加入購物車次數
105     SUM(decode(behavior_type = '4', true, 1, 0)) AS i_buy_cnt,--購買次數
106     SUM(decode(behavior_type = '4', true, 1, 0)) / (SUM(decode(behavior_type = '3', true, 1, 0)) + 1) AS i_ratio_buycart,-- 購買次數/購物車次數
107     SUM(decode(behavior_type = '4', true, 1, 0)) / (SUM(decode(behavior_type = '3', true, 1, 0)) + 1) AS i_ratio_buybro -- 購買次數/瀏覽次數
108 FROM recommend_train_user 
109 -- 在第20天到第29天 10天內用戶的行為數據,用於預測第30天的行為
110 WHERE days >= 20
111     AND days <= 29
112 GROUP BY item_id;
113 
114 
115 
-- step5
-- Join the ui / uc / u / i feature tables and attach the label.
DROP TABLE IF EXISTS feats_30;
CREATE TABLE IF NOT EXISTS feats_30
AS
SELECT
    t1.user_id,
    t1.item_id,
    t1.item_category,
    -- label: 1 when the left outer join found a day-30 purchase for this
    -- (user_id, item_id) pair, 0 when the right side is NULL
    decode(t_l.user_id IS NOT NULL, true, 1, 0) AS label,

    -- ui: (user, item) features
    t1.ui_bro_cnt,
    t1.ui_fav_cnt,
    t1.ui_cart_cnt,
    t1.ui_buy_cnt,
    t1.ui_first_hour_gap,
    t1.ui_last_hour_gap,
    t1.ui_act_days,
    t1.iscartnotbuy6h,
    t1.iscartnotbuy12h,
    t1.iscartnotbuy24h,

    -- uc: (user, category) features
    t2.uc_bro_cnt,
    t2.uc_fav_cnt,
    t2.uc_cart_cnt,
    t2.uc_buy_cnt,
    t2.isbuycatebefore,
    t2.isnotbuycate24h,
    t2.isnotbuycate3d,

    -- u: user features
    t3.u_bro_cnt,
    t3.u_fav_cnt,
    t3.u_cart_cnt,
    t3.u_buy_cnt,
    t3.u_ratio_buycart,
    t3.u_ratio_buybro,

    -- i: item features
    t4.i_bro_cnt,
    t4.i_fav_cnt,
    t4.i_cart_cnt,
    t4.i_buy_cnt,
    t4.i_ratio_buycart,
    t4.i_ratio_buybro,

    -- cross features: products of the 0/1 rule flags, plus one scaled count
    t1.iscartnotbuy6h * t2.isnotbuycate24h AS iscartnobuy6h_notbuycate24h,
    t1.iscartnotbuy12h * t2.isnotbuycate24h AS iscartnobuy12h_notbuycate24h,
    t1.iscartnotbuy24h * t2.isnotbuycate24h AS iscartnobuy24h_notbuycate24h,
    t1.iscartnotbuy24h * t2.isnotbuycate3d AS iscartnobuy24h_notbuycate3d,
    t3.u_ratio_buycart * t1.ui_cart_cnt AS u_ui_buycart
FROM
    ui_feats_30 t1
JOIN uc_feats_30 t2
    ON t1.user_id = t2.user_id
    AND t1.item_category = t2.item_category
JOIN u_feats_30 t3
    ON t1.user_id = t3.user_id
JOIN i_feats_30 t4
    ON t1.item_id = t4.item_id
LEFT OUTER JOIN (
    -- distinct (user_id, item_id) pairs purchased on day 30
    SELECT
        user_id, item_id
    FROM
        recommend_train_user
    WHERE days = 30
        AND behavior_type = '4'
    GROUP BY user_id, item_id
) t_l
ON t1.user_id = t_l.user_id
AND t1.item_id = t_l.item_id;

-- step6
-- Fetch the o2o set: feats_30 restricted to the item subset.
DROP TABLE IF EXISTS feats_30_o2o;
CREATE TABLE IF NOT EXISTS feats_30_o2o
AS
SELECT t1.*
FROM feats_30 t1
JOIN (
    -- distinct item_id values of the item subset
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) t2
-- the inner join drops feats_30 rows whose item is not in the item subset
ON t1.item_id = t2.item_id;

 

二、算法實驗:

  (1)LR邏輯回歸:

    step1.歸一化數據

    feats_29 ——> feats_29_norm

 1 --odps sql
 2 --****************************************************************************************
 3 --author:smilemoon
 4 --create time;2017-01-17 15;28:25
 5 --****************************************************************************************
 6 
 7 -- union two tables and then normalize
 8 drop table if exists feats_union;
 9 create table feats_union as
10 select * from
11 (
12     select "train" as sign,* from feats_29 
13     union all select "test" as sign, * from feats_30 
14 )t_u;

 

  實驗:norm_split

    歸一化選擇字段:除了user_id,item_id,item_category,label 4個字段的其余34個字段。

            歸一化選擇的是max-min歸一化方法。

    feats_union ——> feats_union_norm;

 

 

     創建拆分腳本(將歸一化後的 feats_union_norm 依 sign 欄位拆回訓練集與測試集,並取出各自的 o2o 子集):

 1 --split the samples and fetch 020 samples
 2 drop table if exists feats_29_norm;
 3 create table feats_29_norm as select * from feats_union_norm where sign = "train";
 4 
 5 drop table if exists feats_30_norm;
 6 create table feats_30_norm as select * from feats_union_norm where sign = "test";
 7 
 8 --fetch o2o set
 9 drop table if exists feats_29_o2o_norm;
10 create table if not exists feats_29_o2o_norm as
11 select t1.* from feats_29_norm t1 join
12 (select item_id from odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online group by item_id)t2
13 on t1.item_id = t2.item_id;
14 
15 --fetch o2o set
16 drop table if exists feats_30_o2o_norm;
17 create table if not exists feats_30_o2o_norm as
18 select t1.* from feats_30_norm t1 join
19 (select item_id from odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online group by item_id)t2
20 on t1.item_id = t2.item_id;

     

    

        step2.對數據量過大的負樣本(標簽為不購買的樣本)進行隨機采樣

      feats_29_norm ——> feats_29_norm_sample

      實驗:lr_sample   

 

        實驗參數設置:

        a)過濾與映射-1

      

      b)過濾與映射-2

      

        c)隨機采樣

      

      d)合並行

      

     

      step3.lr模型訓練實驗:

        feats_29_norm_sample訓練:

       

        實驗參數設置:

        (a)邏輯回歸二分類:

          

 

        (b) 預測:

        

 

  (2)GBDT:

     實驗:gbdt_demo

      

      實驗參數設置:

        過濾與映射,隨機采樣與合並行與LR模型一樣。

      (a)GBDT二分類:

        字段設置:

          特征列:排除user_id,item_id,item_category,label四列的所有列

          標簽列:label

          

        

 

      (b)預測:

        

  (3)聯合GBDT:

 三、F1評估腳本:

 1 --odps sql
 2 --****************************************************************************************
 3 --author:smilemoon
 4 --create time;2017-01-17 15;20:25
 5 --****************************************************************************************
 6 
 7 --LR result of day29
 8 select 175052 as real,predict,hits,(hits/predict) as precision,(hits/175052) as recall, (2*hits/(175052+predict)) as F1
 9 from 
10 (
11     select count(1) as predict,sum(label) as hits
12     from (select * from lr_29_result order by (prediction_result*2-1)*prediction_score+(1-prediction_result) desc limit 175052) pre
13 ) t1;
14 
15 --LR result of day30
16 select 168476 as real,predict,hits,(hits/predict) as precision,(hits/168476) as recall, (2*hits/(168476+predict)) as F1
17 from 
18 (
19     select count(1) as predict,sum(label) as hits
20     from (select * from lr_30_result order by (prediction_result*2-1)*prediction_score+(1-prediction_result) desc limit 168476) pre
21 ) t1;
22 
23 
24 --GDBT result of day29
25 --4895
26 select 175052 as real,predict,hits,(hits/predict) as precision,(hits/175052) as recall, (2*hits/(175052+predict)) as F1
27 from 
28 (
29     select count(1) as predict,sum(label) as hits
30     from (select * from gdbt_29_result order by (prediction_result*2-1)*prediction_score+(1-prediction_result) desc limit 175052) pre
31 ) t1;
32 
33 --GDBT result of day30
34 --4642
35 select 168476 as real,predict,hits,(hits/predict) as precision,(hits/168476) as recall, (2*hits/(168476+predict)) as F1
36 from 
37 (
38     select count(1) as predict,sum(label) as hits
39     from (select * from gdbt_30_result order by (prediction_result*2-1)*prediction_score+(1-prediction_result) desc limit 168476) pre
40 ) t1;

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM