注:作為新手,本文的基本思路來源於天池官網的視頻教程;本文僅為在天池數加平台上學習後所作的個人總結,特此聲明。
視頻網址:https://tianchi.shuju.aliyun.com/video.htm?spm=5176.100068.1234.7.7Kftz1 《新手入門賽-第四課時》
一、SQL預處理:
step1 —— 0_init
預處理數據:根據項目空間給出的原始數據表構造符合自己需求的新表。
--odps sql
--****************************************************************************************
--author:smilemoon
--create time: 2017-01-17
--****************************************************************************************

-- 0_init: build working copies of the competition's raw tables.
--   recommend_train_user      user behaviour log, partitioned by (behavior_type, days)
--   recommend_train_item      item subset table
--   recommend_train_user_o2o  behaviour rows whose item_id appears in the item subset

-- Copy the user data from tianchi_fresh_comp_train_user_online and convert it.
DROP TABLE IF EXISTS recommend_train_user;

-- Behaviour table partitioned by behavior_type and days.
CREATE TABLE IF NOT EXISTS recommend_train_user (
    user_id       BIGINT,
    item_id       BIGINT,
    item_category BIGINT,
    user_geohash  STRING,
    time          DATETIME,
    hours         BIGINT
)
PARTITIONED BY (
    behavior_type STRING,
    days          STRING
);

-- INSERT OVERWRITE replaces whatever the target partitions already hold.
-- The last two select columns feed the dynamic partitions (behavior_type, days).
INSERT OVERWRITE TABLE recommend_train_user PARTITION (behavior_type, days)
SELECT
    t1.user_id,
    t1.item_id,
    t1.item_category,
    t1.user_geohash,
    to_date(t1.time, 'yyyy-mm-dd hh'),
    -- Hour of day = the two characters starting at position 12 of 'yyyy-mm-dd hh'.
    -- The third argument of substr is a LENGTH, so it is 2 (the original passed 13,
    -- which only worked because the string ends right after the hour).
    -- Cast explicitly because the target column is BIGINT.
    CAST(substr(t1.time, 12, 2) AS BIGINT) AS hours,
    t1.behavior_type,
    -- Whole days elapsed since 2014-11-18, so 2014-11-18 itself is day 0.
    datediff(to_date(t1.time, 'yyyy-mm-dd hh'), '2014-11-18 00:00:00', 'dd') AS days
FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_user_online t1;

-- Copy the item data.
DROP TABLE IF EXISTS recommend_train_item;
CREATE TABLE IF NOT EXISTS recommend_train_item (
    item_id       BIGINT,
    item_geohash  STRING,
    item_category BIGINT
);

INSERT OVERWRITE TABLE recommend_train_item
SELECT
    t1.item_id,
    t1.item_geohash,
    t1.item_category
FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online t1;


-- Create the o2o data.
DROP TABLE IF EXISTS recommend_train_user_o2o;
-- LIKE copies the table structure only.
CREATE TABLE recommend_train_user_o2o LIKE recommend_train_user;

-- Keep every behaviour row of recommend_train_user whose item_id exists in the item subset P.
INSERT OVERWRITE TABLE recommend_train_user_o2o PARTITION (behavior_type, days)
SELECT ub.*
FROM (
    -- Distinct item_ids of the item subset.
    -- GROUP BY is used for de-duplication; the original author notes it may
    -- perform better than DISTINCT.
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) subset_items
JOIN recommend_train_user ub
    ON subset_items.item_id = ub.item_id;
step2.0 —— 1_train_set
根據原始表構造訓練用特征表,用於后續的各種分析。
--odps sql
--****************************************************************************************
--author:smilemoon
--create time: 2017-01-14
--****************************************************************************************
-- 1_train_set
-- label day: 29 (2014-12-17), feature window: days 19~28
--
-- Builds four aggregate feature tables over the window, joins them and attaches
-- the label (did the user buy the item on day 29):
--   ui_feats_29  per (user_id, item_id, item_category)
--   uc_feats_29  per (user_id, item_category)
--   u_feats_29   per user_id
--   i_feats_29   per item_id
--   feats_29     joined features + label
--   feats_29_o2o feats_29 restricted to the item subset
-- behavior_type codes as used below: '1' browse, '2' favourite, '3' add-to-cart, '4' buy.


-- step1: user_item
-- Aggregate recommend_train_user over days 19~28 per (user_id, item_id, item_category).
DROP TABLE IF EXISTS ui_feats_29;
CREATE TABLE ui_feats_29
AS
SELECT
    user_id,
    item_id,
    item_category,
    SUM(decode(behavior_type = '1', true, 1, 0)) AS ui_bro_cnt,   -- browse count
    SUM(decode(behavior_type = '2', true, 1, 0)) AS ui_fav_cnt,   -- favourite count
    SUM(decode(behavior_type = '3', true, 1, 0)) AS ui_cart_cnt,  -- add-to-cart count
    SUM(decode(behavior_type = '4', true, 1, 0)) AS ui_buy_cnt,   -- purchase count
    datediff('2014-12-17 00:00:00', MIN(time), 'hh') AS ui_first_hour_gap,  -- hours from first action to label day
    datediff('2014-12-17 00:00:00', MAX(time), 'hh') AS ui_last_hour_gap,   -- hours from last action to label day
    COUNT(DISTINCT days) AS ui_act_days,                                    -- number of days with any action
    -- *** hand-crafted rule features ***
    -- Each flag is 1 when the pair has at least one add-to-cart AND no purchase
    -- inside the stated span of day 28, and 0 otherwise.
    -- last 6 hours: day 28, hour >= 18
    decode(SUM(decode(behavior_type = '3' AND days = 28 AND hours >= 18, true, 1, 0)) > 0, true, 1, 0)
        * decode(SUM(decode(behavior_type = '4' AND days = 28 AND hours >= 18, true, 1, 0)) = 0, true, 1, 0)
        AS iscartnotbuy6h,
    -- last 12 hours: day 28, hour >= 12
    decode(SUM(decode(behavior_type = '3' AND days = 28 AND hours >= 12, true, 1, 0)) > 0, true, 1, 0)
        * decode(SUM(decode(behavior_type = '4' AND days = 28 AND hours >= 12, true, 1, 0)) = 0, true, 1, 0)
        AS iscartnotbuy12h,
    -- last 24 hours: all of day 28
    decode(SUM(decode(behavior_type = '3' AND days = 28, true, 1, 0)) > 0, true, 1, 0)
        * decode(SUM(decode(behavior_type = '4' AND days = 28, true, 1, 0)) = 0, true, 1, 0)
        AS iscartnotbuy24h
FROM recommend_train_user
-- 10-day window (days 19..28) used to predict behaviour on day 29
WHERE days >= 19
  AND days <= 28
GROUP BY
    user_id,
    item_id,
    item_category;


-- step2: user_category
-- Aggregate recommend_train_user over days 19~28 per (user_id, item_category).
DROP TABLE IF EXISTS uc_feats_29;
CREATE TABLE uc_feats_29
AS
SELECT
    user_id,
    item_category,
    SUM(decode(behavior_type = '1', true, 1, 0)) AS uc_bro_cnt,   -- browse count
    SUM(decode(behavior_type = '2', true, 1, 0)) AS uc_fav_cnt,   -- favourite count
    SUM(decode(behavior_type = '3', true, 1, 0)) AS uc_cart_cnt,  -- add-to-cart count
    SUM(decode(behavior_type = '4', true, 1, 0)) AS uc_buy_cnt,   -- purchase count
    -- 1 when the user bought anything in this category inside the window
    decode(SUM(decode(behavior_type = '4', true, 1, 0)) > 0, true, 1, 0) AS isbuycatebefore,
    -- 1 when there is NO purchase in the category on day 28 (days > 27)
    decode(SUM(decode(behavior_type = '4' AND days > 28 - 1, true, 1, 0)) = 0, true, 1, 0) AS isnotbuycate24h,
    -- 1 when there is NO purchase in the category on days 26..28 (days > 25)
    decode(SUM(decode(behavior_type = '4' AND days > 28 - 3, true, 1, 0)) = 0, true, 1, 0) AS isnotbuycate3d
FROM recommend_train_user
-- 10-day window (days 19..28) used to predict behaviour on day 29
WHERE days >= 19
  AND days <= 28
GROUP BY
    user_id,
    item_category;


-- step3: user
-- Aggregate recommend_train_user over days 19~28 per user_id.
DROP TABLE IF EXISTS u_feats_29;
CREATE TABLE u_feats_29
AS
SELECT
    user_id,
    SUM(decode(behavior_type = '1', true, 1, 0)) AS u_bro_cnt,   -- browse count
    SUM(decode(behavior_type = '2', true, 1, 0)) AS u_fav_cnt,   -- favourite count
    SUM(decode(behavior_type = '3', true, 1, 0)) AS u_cart_cnt,  -- add-to-cart count
    SUM(decode(behavior_type = '4', true, 1, 0)) AS u_buy_cnt,   -- purchase count
    -- purchases / (cart adds + 1); the +1 keeps the denominator non-zero
    SUM(decode(behavior_type = '4', true, 1, 0))
        / (SUM(decode(behavior_type = '3', true, 1, 0)) + 1) AS u_ratio_buycart,
    -- purchases / (browses + 1)
    SUM(decode(behavior_type = '4', true, 1, 0))
        / (SUM(decode(behavior_type = '1', true, 1, 0)) + 1) AS u_ratio_buybro
FROM recommend_train_user
-- 10-day window (days 19..28) used to predict behaviour on day 29
WHERE days >= 19
  AND days <= 28
GROUP BY user_id;


-- step4: item
-- Aggregate recommend_train_user over days 19~28 per item_id.
DROP TABLE IF EXISTS i_feats_29;
CREATE TABLE i_feats_29
AS
SELECT
    item_id,
    SUM(decode(behavior_type = '1', true, 1, 0)) AS i_bro_cnt,   -- browse count
    SUM(decode(behavior_type = '2', true, 1, 0)) AS i_fav_cnt,   -- favourite count
    SUM(decode(behavior_type = '3', true, 1, 0)) AS i_cart_cnt,  -- add-to-cart count
    SUM(decode(behavior_type = '4', true, 1, 0)) AS i_buy_cnt,   -- purchase count
    -- purchases / (cart adds + 1)
    SUM(decode(behavior_type = '4', true, 1, 0))
        / (SUM(decode(behavior_type = '3', true, 1, 0)) + 1) AS i_ratio_buycart,
    -- purchases / (browses + 1).
    -- FIX: the denominator previously counted behavior_type = '3' (cart adds),
    -- which made this column an exact duplicate of i_ratio_buycart.
    SUM(decode(behavior_type = '4', true, 1, 0))
        / (SUM(decode(behavior_type = '1', true, 1, 0)) + 1) AS i_ratio_buybro
FROM recommend_train_user
-- 10-day window (days 19..28) used to predict behaviour on day 29
WHERE days >= 19
  AND days <= 28
GROUP BY item_id;


-- step5
-- Join the (ui, uc, u, i) feature tables and add the label.
DROP TABLE IF EXISTS feats_29;
CREATE TABLE IF NOT EXISTS feats_29
AS
SELECT
    t1.user_id,
    t1.item_id,
    t1.item_category,
    -- label: 1 when the left join found a day-29 purchase for the pair, else 0
    decode(t_l.user_id IS NOT NULL, true, 1, 0) AS label,
    -- ui
    t1.ui_bro_cnt,
    t1.ui_fav_cnt,
    t1.ui_cart_cnt,
    t1.ui_buy_cnt,
    t1.ui_first_hour_gap,
    t1.ui_last_hour_gap,
    t1.ui_act_days,
    t1.iscartnotbuy6h,
    t1.iscartnotbuy12h,
    t1.iscartnotbuy24h,

    -- uc
    t2.uc_bro_cnt,
    t2.uc_fav_cnt,
    t2.uc_cart_cnt,
    t2.uc_buy_cnt,
    t2.isbuycatebefore,
    t2.isnotbuycate24h,
    t2.isnotbuycate3d,

    -- u
    t3.u_bro_cnt,
    t3.u_fav_cnt,
    t3.u_cart_cnt,
    t3.u_buy_cnt,
    t3.u_ratio_buycart,
    t3.u_ratio_buybro,

    -- i
    t4.i_bro_cnt,
    t4.i_fav_cnt,
    t4.i_cart_cnt,
    t4.i_buy_cnt,
    t4.i_ratio_buycart,
    t4.i_ratio_buybro,

    -- cross features (products of the flags / ratios above)
    t1.iscartnotbuy6h  * t2.isnotbuycate24h AS iscartnobuy6h_notbuycate24h,
    t1.iscartnotbuy12h * t2.isnotbuycate24h AS iscartnobuy12h_notbuycate24h,
    t1.iscartnotbuy24h * t2.isnotbuycate24h AS iscartnobuy24h_notbuycate24h,
    t1.iscartnotbuy24h * t2.isnotbuycate3d  AS iscartnobuy24h_notbuycate3d,
    t3.u_ratio_buycart * t1.ui_cart_cnt     AS u_ui_buycart
FROM ui_feats_29 t1
JOIN uc_feats_29 t2
    ON  t1.user_id = t2.user_id
    AND t1.item_category = t2.item_category
JOIN u_feats_29 t3
    ON t1.user_id = t3.user_id
JOIN i_feats_29 t4
    ON t1.item_id = t4.item_id
-- left outer join: pairs without a day-29 purchase are kept and get label 0
LEFT OUTER JOIN (
    -- distinct (user_id, item_id) pairs purchased on day 29
    SELECT
        user_id,
        item_id
    FROM recommend_train_user
    WHERE days = 29
      AND behavior_type = '4'
    GROUP BY user_id, item_id
) t_l
    ON  t1.user_id = t_l.user_id
    AND t1.item_id = t_l.item_id;


-- step6
-- Fetch the o2o set: the inner join drops feats_29 rows whose item is not in the item subset P.
DROP TABLE IF EXISTS feats_29_o2o;
CREATE TABLE IF NOT EXISTS feats_29_o2o
AS
SELECT t1.*
FROM feats_29 t1
JOIN (
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) t2
    ON t1.item_id = t2.item_id;
step3.0 —— 2_test_set
根據原始表構造測試用特征表,用於后續的各種分析。
--odps sql
--****************************************************************************************
--author:smilemoon
--create time: 2017-01-14
--****************************************************************************************
-- 2_test_set
-- label day: 30 (2014-12-18), feature window: days 20~29
--
-- Same pipeline as the training-set script, shifted forward by one day:
--   ui_feats_30  per (user_id, item_id, item_category)
--   uc_feats_30  per (user_id, item_category)
--   u_feats_30   per user_id
--   i_feats_30   per item_id
--   feats_30     joined features + label
--   feats_30_o2o feats_30 restricted to the item subset
-- behavior_type codes as used below: '1' browse, '2' favourite, '3' add-to-cart, '4' buy.


-- step1: user_item
-- Aggregate recommend_train_user over days 20~29 per (user_id, item_id, item_category).
DROP TABLE IF EXISTS ui_feats_30;
CREATE TABLE ui_feats_30
AS
SELECT
    user_id,
    item_id,
    item_category,
    SUM(decode(behavior_type = '1', true, 1, 0)) AS ui_bro_cnt,   -- browse count
    SUM(decode(behavior_type = '2', true, 1, 0)) AS ui_fav_cnt,   -- favourite count
    SUM(decode(behavior_type = '3', true, 1, 0)) AS ui_cart_cnt,  -- add-to-cart count
    SUM(decode(behavior_type = '4', true, 1, 0)) AS ui_buy_cnt,   -- purchase count
    datediff('2014-12-18 00:00:00', MIN(time), 'hh') AS ui_first_hour_gap,  -- hours from first action to label day
    datediff('2014-12-18 00:00:00', MAX(time), 'hh') AS ui_last_hour_gap,   -- hours from last action to label day
    COUNT(DISTINCT days) AS ui_act_days,                                    -- number of days with any action
    -- *** hand-crafted rule features ***
    -- Each flag is 1 when the pair has at least one add-to-cart AND no purchase
    -- inside the stated span of day 29, and 0 otherwise.
    -- last 6 hours: day 29, hour >= 18
    decode(SUM(decode(behavior_type = '3' AND days = 29 AND hours >= 18, true, 1, 0)) > 0, true, 1, 0)
        * decode(SUM(decode(behavior_type = '4' AND days = 29 AND hours >= 18, true, 1, 0)) = 0, true, 1, 0)
        AS iscartnotbuy6h,
    -- last 12 hours: day 29, hour >= 12
    decode(SUM(decode(behavior_type = '3' AND days = 29 AND hours >= 12, true, 1, 0)) > 0, true, 1, 0)
        * decode(SUM(decode(behavior_type = '4' AND days = 29 AND hours >= 12, true, 1, 0)) = 0, true, 1, 0)
        AS iscartnotbuy12h,
    -- last 24 hours: all of day 29
    decode(SUM(decode(behavior_type = '3' AND days = 29, true, 1, 0)) > 0, true, 1, 0)
        * decode(SUM(decode(behavior_type = '4' AND days = 29, true, 1, 0)) = 0, true, 1, 0)
        AS iscartnotbuy24h
FROM recommend_train_user
-- 10-day window (days 20..29) used to predict behaviour on day 30
WHERE days >= 20
  AND days <= 29
GROUP BY
    user_id,
    item_id,
    item_category;


-- step2: user_category
-- Aggregate recommend_train_user over days 20~29 per (user_id, item_category).
DROP TABLE IF EXISTS uc_feats_30;
CREATE TABLE uc_feats_30
AS
SELECT
    user_id,
    item_category,
    SUM(decode(behavior_type = '1', true, 1, 0)) AS uc_bro_cnt,   -- browse count
    SUM(decode(behavior_type = '2', true, 1, 0)) AS uc_fav_cnt,   -- favourite count
    SUM(decode(behavior_type = '3', true, 1, 0)) AS uc_cart_cnt,  -- add-to-cart count
    SUM(decode(behavior_type = '4', true, 1, 0)) AS uc_buy_cnt,   -- purchase count
    -- 1 when the user bought anything in this category inside the window
    decode(SUM(decode(behavior_type = '4', true, 1, 0)) > 0, true, 1, 0) AS isbuycatebefore,
    -- 1 when there is NO purchase in the category on day 29 (days > 28)
    decode(SUM(decode(behavior_type = '4' AND days > 29 - 1, true, 1, 0)) = 0, true, 1, 0) AS isnotbuycate24h,
    -- 1 when there is NO purchase in the category on days 27..29 (days > 26)
    decode(SUM(decode(behavior_type = '4' AND days > 29 - 3, true, 1, 0)) = 0, true, 1, 0) AS isnotbuycate3d
FROM recommend_train_user
-- 10-day window (days 20..29) used to predict behaviour on day 30
WHERE days >= 20
  AND days <= 29
GROUP BY
    user_id,
    item_category;


-- step3: user
-- Aggregate recommend_train_user over days 20~29 per user_id.
DROP TABLE IF EXISTS u_feats_30;
CREATE TABLE u_feats_30
AS
SELECT
    user_id,
    SUM(decode(behavior_type = '1', true, 1, 0)) AS u_bro_cnt,   -- browse count
    SUM(decode(behavior_type = '2', true, 1, 0)) AS u_fav_cnt,   -- favourite count
    SUM(decode(behavior_type = '3', true, 1, 0)) AS u_cart_cnt,  -- add-to-cart count
    SUM(decode(behavior_type = '4', true, 1, 0)) AS u_buy_cnt,   -- purchase count
    -- purchases / (cart adds + 1); the +1 keeps the denominator non-zero
    SUM(decode(behavior_type = '4', true, 1, 0))
        / (SUM(decode(behavior_type = '3', true, 1, 0)) + 1) AS u_ratio_buycart,
    -- purchases / (browses + 1)
    SUM(decode(behavior_type = '4', true, 1, 0))
        / (SUM(decode(behavior_type = '1', true, 1, 0)) + 1) AS u_ratio_buybro
FROM recommend_train_user
-- 10-day window (days 20..29) used to predict behaviour on day 30
WHERE days >= 20
  AND days <= 29
GROUP BY user_id;


-- step4: item
-- Aggregate recommend_train_user over days 20~29 per item_id.
DROP TABLE IF EXISTS i_feats_30;
CREATE TABLE i_feats_30
AS
SELECT
    item_id,
    SUM(decode(behavior_type = '1', true, 1, 0)) AS i_bro_cnt,   -- browse count
    SUM(decode(behavior_type = '2', true, 1, 0)) AS i_fav_cnt,   -- favourite count
    SUM(decode(behavior_type = '3', true, 1, 0)) AS i_cart_cnt,  -- add-to-cart count
    SUM(decode(behavior_type = '4', true, 1, 0)) AS i_buy_cnt,   -- purchase count
    -- purchases / (cart adds + 1)
    SUM(decode(behavior_type = '4', true, 1, 0))
        / (SUM(decode(behavior_type = '3', true, 1, 0)) + 1) AS i_ratio_buycart,
    -- purchases / (browses + 1).
    -- FIX: the denominator previously counted behavior_type = '3' (cart adds),
    -- which made this column an exact duplicate of i_ratio_buycart.
    SUM(decode(behavior_type = '4', true, 1, 0))
        / (SUM(decode(behavior_type = '1', true, 1, 0)) + 1) AS i_ratio_buybro
FROM recommend_train_user
-- 10-day window (days 20..29) used to predict behaviour on day 30
WHERE days >= 20
  AND days <= 29
GROUP BY item_id;


-- step5
-- Join the (ui, uc, u, i) feature tables and add the label.
DROP TABLE IF EXISTS feats_30;
CREATE TABLE IF NOT EXISTS feats_30
AS
SELECT
    t1.user_id,
    t1.item_id,
    t1.item_category,
    -- label: 1 when the left join found a day-30 purchase for the pair, else 0
    decode(t_l.user_id IS NOT NULL, true, 1, 0) AS label,
    -- ui
    t1.ui_bro_cnt,
    t1.ui_fav_cnt,
    t1.ui_cart_cnt,
    t1.ui_buy_cnt,
    t1.ui_first_hour_gap,
    t1.ui_last_hour_gap,
    t1.ui_act_days,
    t1.iscartnotbuy6h,
    t1.iscartnotbuy12h,
    t1.iscartnotbuy24h,

    -- uc
    t2.uc_bro_cnt,
    t2.uc_fav_cnt,
    t2.uc_cart_cnt,
    t2.uc_buy_cnt,
    t2.isbuycatebefore,
    t2.isnotbuycate24h,
    t2.isnotbuycate3d,

    -- u
    t3.u_bro_cnt,
    t3.u_fav_cnt,
    t3.u_cart_cnt,
    t3.u_buy_cnt,
    t3.u_ratio_buycart,
    t3.u_ratio_buybro,

    -- i
    t4.i_bro_cnt,
    t4.i_fav_cnt,
    t4.i_cart_cnt,
    t4.i_buy_cnt,
    t4.i_ratio_buycart,
    t4.i_ratio_buybro,

    -- cross features (products of the flags / ratios above)
    t1.iscartnotbuy6h  * t2.isnotbuycate24h AS iscartnobuy6h_notbuycate24h,
    t1.iscartnotbuy12h * t2.isnotbuycate24h AS iscartnobuy12h_notbuycate24h,
    t1.iscartnotbuy24h * t2.isnotbuycate24h AS iscartnobuy24h_notbuycate24h,
    t1.iscartnotbuy24h * t2.isnotbuycate3d  AS iscartnobuy24h_notbuycate3d,
    t3.u_ratio_buycart * t1.ui_cart_cnt     AS u_ui_buycart
FROM ui_feats_30 t1
JOIN uc_feats_30 t2
    ON  t1.user_id = t2.user_id
    AND t1.item_category = t2.item_category
JOIN u_feats_30 t3
    ON t1.user_id = t3.user_id
JOIN i_feats_30 t4
    ON t1.item_id = t4.item_id
-- left outer join: pairs without a day-30 purchase are kept and get label 0
LEFT OUTER JOIN (
    -- distinct (user_id, item_id) pairs purchased on day 30
    SELECT
        user_id,
        item_id
    FROM recommend_train_user
    WHERE days = 30
      AND behavior_type = '4'
    GROUP BY user_id, item_id
) t_l
    ON  t1.user_id = t_l.user_id
    AND t1.item_id = t_l.item_id;


-- step6
-- Fetch the o2o set: the inner join drops feats_30 rows whose item is not in the item subset P.
DROP TABLE IF EXISTS feats_30_o2o;
CREATE TABLE IF NOT EXISTS feats_30_o2o
AS
SELECT t1.*
FROM feats_30 t1
JOIN (
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) t2
    ON t1.item_id = t2.item_id;
二、算法實驗:
(1)LR邏輯回歸:
step1.歸一化數據
feats_29 ——> feats_29_norm
--odps sql
--****************************************************************************************
--author:smilemoon
--create time: 2017-01-17 15:28:25
--****************************************************************************************

-- Stack the two feature tables into one so they can be normalized together.
-- The extra leading column `sign` records which table each row came from
-- ("train" for feats_29, "test" for feats_30).
DROP TABLE IF EXISTS feats_union;

CREATE TABLE feats_union AS
SELECT *
FROM (
    SELECT "train" AS sign, * FROM feats_29
    UNION ALL
    SELECT "test" AS sign, * FROM feats_30
) stacked;
實驗:norm_split
歸一化選擇字段:除了 sign、user_id、item_id、item_category、label 這 5 個字段以外的其餘 34 個特徵字段。
歸一化選擇的是max-min歸一化方法。
feats_union ——> feats_union_norm;
創建歸一化後的拆分腳本(將 feats_union_norm 按 sign 拆分回訓練集與測試集,並取出 o2o 子集):
-- Split the normalized union back into its two halves, then fetch the o2o samples.

-- Rows that came from feats_29.
DROP TABLE IF EXISTS feats_29_norm;
CREATE TABLE feats_29_norm AS
SELECT *
FROM feats_union_norm
WHERE sign = "train";

-- Rows that came from feats_30.
DROP TABLE IF EXISTS feats_30_norm;
CREATE TABLE feats_30_norm AS
SELECT *
FROM feats_union_norm
WHERE sign = "test";

-- o2o set for day 29: keep only rows whose item_id appears in the item table.
DROP TABLE IF EXISTS feats_29_o2o_norm;
CREATE TABLE IF NOT EXISTS feats_29_o2o_norm AS
SELECT feats.*
FROM feats_29_norm feats
JOIN (
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) subset_items
    ON feats.item_id = subset_items.item_id;

-- o2o set for day 30: same filter applied to the test half.
DROP TABLE IF EXISTS feats_30_o2o_norm;
CREATE TABLE IF NOT EXISTS feats_30_o2o_norm AS
SELECT feats.*
FROM feats_30_norm feats
JOIN (
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) subset_items
    ON feats.item_id = subset_items.item_id;
step2.對數據量過大的負樣本(標簽為不購買的樣本)進行隨機采樣
feats_29_norm ——> feats_29_norm_sample
實驗:lr_sample
實驗參數設置:
a)過濾與映射-1
b)過濾與映射-2
c)隨機采樣
d)合並行
step3.lr模型訓練實驗:
feats_29_norm_sample訓練:
實驗參數設置:
(a)邏輯回歸二分類:
(b) 預測:
(2)GBDT:
實驗:gbdt_demo
實驗參數設置:
過濾與映射,隨機采樣與合並行與LR模型一樣。
(a)GBDT二分類:
字段設置:
特征列:排除user_id,item_id,item_category,label四列的所有列
標簽列:label
(b)預測:
(3)聯合GBDT:
三、F1評估腳本:
--odps sql
--****************************************************************************************
--author:smilemoon
--create time: 2017-01-17 15:20:25
--****************************************************************************************

-- F1 evaluation. Every query follows the same recipe:
--   1. rank the rows of a *_result table by
--      (prediction_result*2-1)*prediction_score + (1-prediction_result), descending,
--      and keep the top N rows (N is the hard-coded constant also reported as `real`);
--   2. predict = number of rows kept, hits = sum of label over those rows;
--   3. precision = hits/predict, recall = hits/real, F1 = 2*hits/(real+predict).

-- LR result of day 29
SELECT
    175052                      AS real,
    predict,
    hits,
    (hits/predict)              AS precision,
    (hits/175052)               AS recall,
    (2*hits/(175052+predict))   AS F1
FROM (
    SELECT
        count(1)   AS predict,
        sum(label) AS hits
    FROM (
        SELECT *
        FROM lr_29_result
        ORDER BY (prediction_result*2-1)*prediction_score+(1-prediction_result) DESC
        LIMIT 175052
    ) top_rows
) tally;

-- LR result of day 30
SELECT
    168476                      AS real,
    predict,
    hits,
    (hits/predict)              AS precision,
    (hits/168476)               AS recall,
    (2*hits/(168476+predict))   AS F1
FROM (
    SELECT
        count(1)   AS predict,
        sum(label) AS hits
    FROM (
        SELECT *
        FROM lr_30_result
        ORDER BY (prediction_result*2-1)*prediction_score+(1-prediction_result) DESC
        LIMIT 168476
    ) top_rows
) tally;


-- GBDT result of day 29 (the table is named gdbt_29_result)
-- 4895  (number noted by the original author; presumably the observed hits -- TODO confirm)
SELECT
    175052                      AS real,
    predict,
    hits,
    (hits/predict)              AS precision,
    (hits/175052)               AS recall,
    (2*hits/(175052+predict))   AS F1
FROM (
    SELECT
        count(1)   AS predict,
        sum(label) AS hits
    FROM (
        SELECT *
        FROM gdbt_29_result
        ORDER BY (prediction_result*2-1)*prediction_score+(1-prediction_result) DESC
        LIMIT 175052
    ) top_rows
) tally;

-- GBDT result of day 30 (the table is named gdbt_30_result)
-- 4642  (number noted by the original author; presumably the observed hits -- TODO confirm)
SELECT
    168476                      AS real,
    predict,
    hits,
    (hits/predict)              AS precision,
    (hits/168476)               AS recall,
    (2*hits/(168476+predict))   AS F1
FROM (
    SELECT
        count(1)   AS predict,
        sum(label) AS hits
    FROM (
        SELECT *
        FROM gdbt_30_result
        ORDER BY (prediction_result*2-1)*prediction_score+(1-prediction_result) DESC
        LIMIT 168476
    ) top_rows
) tally;