注:作为新手,基本思路来源于天池官网的视频教程;本文仅是在天池数加平台上学习后所作的个人总结,特此声明。
视频网址:https://tianchi.shuju.aliyun.com/video.htm?spm=5176.100068.1234.7.7Kftz1 《新手入门赛-第四课时》
一、SQL预处理:
step1 —— 0_init
预处理数据:根据项目空间给出的原始数据表构造符合自己需求的新表。
--odps sql
--****************************************************************************************
--author:smilemoon
--create time;2017-01-17
--****************************************************************************************

-- 0_init: copy the contest's raw tables into this project and derive the
-- columns (hours, days) that the later feature scripts filter on.

-- ---------------------------------------------------------------------------
-- User behavior log (source: tianchi_fresh_comp_train_user_online)
-- ---------------------------------------------------------------------------
DROP TABLE IF EXISTS recommend_train_user;

-- Partitioned by behavior_type and days.
CREATE TABLE IF NOT EXISTS recommend_train_user (
    user_id       BIGINT,
    item_id       BIGINT,
    item_category BIGINT,
    user_geohash  STRING,
    time          DATETIME,
    hours         BIGINT
)
PARTITIONED BY (
    behavior_type STRING,
    days          STRING
);

-- INSERT OVERWRITE replaces whatever the table already holds.
-- The last two SELECT expressions feed the dynamic partitions (behavior_type, days).
-- NOTE(review): assumes t1.time is a string shaped like 'yyyy-mm-dd hh', as implied
-- by the to_date format below — confirm against the raw table.
INSERT OVERWRITE TABLE recommend_train_user PARTITION (behavior_type, days)
SELECT
    t1.user_id,
    t1.item_id,
    t1.item_category,
    t1.user_geohash,
    to_date(t1.time, 'yyyy-mm-dd hh'),
    -- Hour of day: the two characters starting at position 12.
    -- SUBSTR's third argument is a length, so 2 (not 13) is the intended value;
    -- the explicit CAST matches the BIGINT column instead of relying on implicit conversion.
    CAST(substr(t1.time, 12, 2) AS BIGINT) AS hours,
    t1.behavior_type,
    -- Whole days elapsed since 2014-11-18 (that date itself is day 0).
    datediff(to_date(t1.time, 'yyyy-mm-dd hh'), '2014-11-18 00:00:00', 'dd') AS days
FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_user_online t1;

-- ---------------------------------------------------------------------------
-- Item table (source: tianchi_fresh_comp_train_item_online)
-- ---------------------------------------------------------------------------
DROP TABLE IF EXISTS recommend_train_item;
CREATE TABLE IF NOT EXISTS recommend_train_item (
    item_id       BIGINT,
    item_geohash  STRING,
    item_category BIGINT
);

INSERT OVERWRITE TABLE recommend_train_item
SELECT
    t1.item_id,
    t1.item_geohash,
    t1.item_category
FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online t1;

-- ---------------------------------------------------------------------------
-- o2o subset: behavior rows whose item_id appears in the item table
-- ---------------------------------------------------------------------------
DROP TABLE IF EXISTS recommend_train_user_o2o;
-- LIKE copies the table structure only.
CREATE TABLE recommend_train_user_o2o LIKE recommend_train_user;

INSERT OVERWRITE TABLE recommend_train_user_o2o PARTITION (behavior_type, days)
SELECT
    t2.user_id,
    t2.item_id,
    t2.item_category,
    t2.user_geohash,
    t2.time,
    t2.hours,
    t2.behavior_type,
    t2.days
FROM (
    -- Distinct item ids; GROUP BY is used for de-duplication
    -- (the original author notes it may outperform DISTINCT).
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) t1
JOIN recommend_train_user t2
    ON t1.item_id = t2.item_id;
step2.0 —— 1_train_set
根据原始表构造训练用特征表,用于后续的各种分析。
--odps sql
--****************************************************************************************
--author:smilemoon
--create time;2017-01-14
--****************************************************************************************
-- 1_train_set: build the training feature table.
-- Label day: 29 (2014-12-17); feature window: days 19..28 (10 days).
-- NOTE: recommend_train_user.days is declared as a STRING partition column, so the
-- comparisons against integer literals below rely on implicit type conversion.

-- training set

-- step1: user_item
-- Features aggregated per (user_id, item_id) over the window.
DROP TABLE IF EXISTS ui_feats_29;
CREATE TABLE ui_feats_29
AS
SELECT
    user_id,
    item_id,
    item_category,
    SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS ui_bro_cnt,   -- browse count
    SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS ui_fav_cnt,   -- favorite count
    SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS ui_cart_cnt,  -- add-to-cart count
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS ui_buy_cnt,   -- purchase count
    datediff('2014-12-17 00:00:00', MIN(time), 'hh') AS ui_first_hour_gap, -- hours from first action to label day
    datediff('2014-12-17 00:00:00', MAX(time), 'hh') AS ui_last_hour_gap,  -- hours from last action to label day
    COUNT(DISTINCT days) AS ui_act_days,                                   -- number of distinct active days
    -- *** hand-crafted rule features ***
    -- 1 = carted on day 28 at hour >= 18 with no purchase on day 28 at hour >= 18; otherwise 0
    CASE
        WHEN SUM(CASE WHEN behavior_type = '3' AND days = 28 AND hours >= 18 THEN 1 ELSE 0 END) > 0
         AND SUM(CASE WHEN behavior_type = '4' AND days = 28 AND hours >= 18 THEN 1 ELSE 0 END) = 0
        THEN 1 ELSE 0
    END AS iscartnotbuy6h,
    -- 1 = carted on day 28 at hour >= 12 with no purchase on day 28 at hour >= 12; otherwise 0
    CASE
        WHEN SUM(CASE WHEN behavior_type = '3' AND days = 28 AND hours >= 12 THEN 1 ELSE 0 END) > 0
         AND SUM(CASE WHEN behavior_type = '4' AND days = 28 AND hours >= 12 THEN 1 ELSE 0 END) = 0
        THEN 1 ELSE 0
    END AS iscartnotbuy12h,
    -- 1 = carted on day 28 with no purchase on day 28; otherwise 0
    CASE
        WHEN SUM(CASE WHEN behavior_type = '3' AND days = 28 THEN 1 ELSE 0 END) > 0
         AND SUM(CASE WHEN behavior_type = '4' AND days = 28 THEN 1 ELSE 0 END) = 0
        THEN 1 ELSE 0
    END AS iscartnotbuy24h
FROM recommend_train_user
-- 10-day window (days 19..28) used to predict day 29
WHERE days >= 19
  AND days <= 28
GROUP BY
    user_id,
    item_id,
    item_category;


-- step2: user_category
-- Features aggregated per (user_id, item_category) over the window.
DROP TABLE IF EXISTS uc_feats_29;
CREATE TABLE uc_feats_29
AS
SELECT
    user_id,
    item_category,
    SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS uc_bro_cnt,   -- browse count
    SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS uc_fav_cnt,   -- favorite count
    SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS uc_cart_cnt,  -- add-to-cart count
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS uc_buy_cnt,   -- purchase count
    -- 1 = the user bought from this category at least once in the window
    CASE WHEN SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) > 0
         THEN 1 ELSE 0 END AS isbuycatebefore,
    -- 1 = no purchase in this category on day 28 (days > 28 - 1)
    CASE WHEN SUM(CASE WHEN behavior_type = '4' AND days > 28 - 1 THEN 1 ELSE 0 END) = 0
         THEN 1 ELSE 0 END AS isnotbuycate24h,
    -- 1 = no purchase in this category on days 26..28 (days > 28 - 3)
    CASE WHEN SUM(CASE WHEN behavior_type = '4' AND days > 28 - 3 THEN 1 ELSE 0 END) = 0
         THEN 1 ELSE 0 END AS isnotbuycate3d
FROM recommend_train_user
-- 10-day window (days 19..28) used to predict day 29
WHERE days >= 19
  AND days <= 28
GROUP BY
    user_id,
    item_category;


-- step3: user
-- Features aggregated per user_id over the window.
DROP TABLE IF EXISTS u_feats_29;
CREATE TABLE u_feats_29
AS
SELECT
    user_id,
    SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS u_bro_cnt,   -- browse count
    SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS u_fav_cnt,   -- favorite count
    SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS u_cart_cnt,  -- add-to-cart count
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS u_buy_cnt,   -- purchase count
    -- purchases / (cart adds + 1); the +1 avoids a zero denominator
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
        / (SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) + 1) AS u_ratio_buycart,
    -- purchases / (browses + 1)
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
        / (SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) + 1) AS u_ratio_buybro
FROM recommend_train_user
-- 10-day window (days 19..28) used to predict day 29
WHERE days >= 19
  AND days <= 28
GROUP BY user_id;


-- step4: item
-- Features aggregated per item_id over the window.
DROP TABLE IF EXISTS i_feats_29;
CREATE TABLE i_feats_29
AS
SELECT
    item_id,
    SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS i_bro_cnt,   -- browse count
    SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS i_fav_cnt,   -- favorite count
    SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS i_cart_cnt,  -- add-to-cart count
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS i_buy_cnt,   -- purchase count
    -- purchases / (cart adds + 1)
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
        / (SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) + 1) AS i_ratio_buycart,
    -- purchases / (browses + 1); denominator is the browse count ('1'), matching u_ratio_buybro
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
        / (SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) + 1) AS i_ratio_buybro
FROM recommend_train_user
-- 10-day window (days 19..28) used to predict day 29
WHERE days >= 19
  AND days <= 28
GROUP BY item_id;


-- step5
-- Join the (ui, uc, u, i) feature tables and attach the label.
DROP TABLE IF EXISTS feats_29;
CREATE TABLE IF NOT EXISTS feats_29
AS
SELECT
    ui.user_id,
    ui.item_id,
    ui.item_category,
    -- label: 1 when the left join found a day-29 purchase for this (user, item), else 0
    CASE WHEN lbl.user_id IS NOT NULL THEN 1 ELSE 0 END AS label,

    -- ui
    ui.ui_bro_cnt,
    ui.ui_fav_cnt,
    ui.ui_cart_cnt,
    ui.ui_buy_cnt,
    ui.ui_first_hour_gap,
    ui.ui_last_hour_gap,
    ui.ui_act_days,
    ui.iscartnotbuy6h,
    ui.iscartnotbuy12h,
    ui.iscartnotbuy24h,

    -- uc
    uc.uc_bro_cnt,
    uc.uc_fav_cnt,
    uc.uc_cart_cnt,
    uc.uc_buy_cnt,
    uc.isbuycatebefore,
    uc.isnotbuycate24h,
    uc.isnotbuycate3d,

    -- u
    u.u_bro_cnt,
    u.u_fav_cnt,
    u.u_cart_cnt,
    u.u_buy_cnt,
    u.u_ratio_buycart,
    u.u_ratio_buybro,

    -- i
    i.i_bro_cnt,
    i.i_fav_cnt,
    i.i_cart_cnt,
    i.i_buy_cnt,
    i.i_ratio_buycart,
    i.i_ratio_buybro,

    -- cross features
    ui.iscartnotbuy6h  * uc.isnotbuycate24h AS iscartnobuy6h_notbuycate24h,
    ui.iscartnotbuy12h * uc.isnotbuycate24h AS iscartnobuy12h_notbuycate24h,
    ui.iscartnotbuy24h * uc.isnotbuycate24h AS iscartnobuy24h_notbuycate24h,
    ui.iscartnotbuy24h * uc.isnotbuycate3d  AS iscartnobuy24h_notbuycate3d,
    u.u_ratio_buycart  * ui.ui_cart_cnt     AS u_ui_buycart
FROM ui_feats_29 ui
JOIN uc_feats_29 uc
    ON ui.user_id = uc.user_id
   AND ui.item_category = uc.item_category
JOIN u_feats_29 u
    ON ui.user_id = u.user_id
JOIN i_feats_29 i
    ON ui.item_id = i.item_id
LEFT OUTER JOIN (
    -- distinct (user, item) pairs purchased on day 29
    SELECT
        user_id,
        item_id
    FROM recommend_train_user
    WHERE days = 29
      AND behavior_type = '4'
    GROUP BY
        user_id,
        item_id
) lbl
    ON ui.user_id = lbl.user_id
   AND ui.item_id = lbl.item_id;


-- step6
-- o2o subset: the inner join drops feats_29 rows whose item_id is not in the item table.
DROP TABLE IF EXISTS feats_29_o2o;
CREATE TABLE IF NOT EXISTS feats_29_o2o
AS
SELECT f.*
FROM feats_29 f
JOIN (
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) p
    ON f.item_id = p.item_id;
step3.0 —— 2_test_set
根据原始表构造测试用特征表,用于后续的各种分析。
--odps sql
--****************************************************************************************
--author:smilemoon
--create time;2017-01-14
--****************************************************************************************
-- 2_test_set: build the test feature table.
-- Label day: 30 (2014-12-18); feature window: days 20..29 (10 days).
-- NOTE: recommend_train_user.days is declared as a STRING partition column, so the
-- comparisons against integer literals below rely on implicit type conversion.

-- test set

-- step1: user_item
-- Features aggregated per (user_id, item_id) over the window.
DROP TABLE IF EXISTS ui_feats_30;
CREATE TABLE ui_feats_30
AS
SELECT
    user_id,
    item_id,
    item_category,
    SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS ui_bro_cnt,   -- browse count
    SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS ui_fav_cnt,   -- favorite count
    SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS ui_cart_cnt,  -- add-to-cart count
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS ui_buy_cnt,   -- purchase count
    datediff('2014-12-18 00:00:00', MIN(time), 'hh') AS ui_first_hour_gap, -- hours from first action to label day
    datediff('2014-12-18 00:00:00', MAX(time), 'hh') AS ui_last_hour_gap,  -- hours from last action to label day
    COUNT(DISTINCT days) AS ui_act_days,                                   -- number of distinct active days
    -- *** hand-crafted rule features ***
    -- 1 = carted on day 29 at hour >= 18 with no purchase on day 29 at hour >= 18; otherwise 0
    CASE
        WHEN SUM(CASE WHEN behavior_type = '3' AND days = 29 AND hours >= 18 THEN 1 ELSE 0 END) > 0
         AND SUM(CASE WHEN behavior_type = '4' AND days = 29 AND hours >= 18 THEN 1 ELSE 0 END) = 0
        THEN 1 ELSE 0
    END AS iscartnotbuy6h,
    -- 1 = carted on day 29 at hour >= 12 with no purchase on day 29 at hour >= 12; otherwise 0
    CASE
        WHEN SUM(CASE WHEN behavior_type = '3' AND days = 29 AND hours >= 12 THEN 1 ELSE 0 END) > 0
         AND SUM(CASE WHEN behavior_type = '4' AND days = 29 AND hours >= 12 THEN 1 ELSE 0 END) = 0
        THEN 1 ELSE 0
    END AS iscartnotbuy12h,
    -- 1 = carted on day 29 with no purchase on day 29; otherwise 0
    CASE
        WHEN SUM(CASE WHEN behavior_type = '3' AND days = 29 THEN 1 ELSE 0 END) > 0
         AND SUM(CASE WHEN behavior_type = '4' AND days = 29 THEN 1 ELSE 0 END) = 0
        THEN 1 ELSE 0
    END AS iscartnotbuy24h
FROM recommend_train_user
-- 10-day window (days 20..29) used to predict day 30
WHERE days >= 20
  AND days <= 29
GROUP BY
    user_id,
    item_id,
    item_category;


-- step2: user_category
-- Features aggregated per (user_id, item_category) over the window.
DROP TABLE IF EXISTS uc_feats_30;
CREATE TABLE uc_feats_30
AS
SELECT
    user_id,
    item_category,
    SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS uc_bro_cnt,   -- browse count
    SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS uc_fav_cnt,   -- favorite count
    SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS uc_cart_cnt,  -- add-to-cart count
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS uc_buy_cnt,   -- purchase count
    -- 1 = the user bought from this category at least once in the window
    CASE WHEN SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) > 0
         THEN 1 ELSE 0 END AS isbuycatebefore,
    -- 1 = no purchase in this category on day 29 (days > 29 - 1)
    CASE WHEN SUM(CASE WHEN behavior_type = '4' AND days > 29 - 1 THEN 1 ELSE 0 END) = 0
         THEN 1 ELSE 0 END AS isnotbuycate24h,
    -- 1 = no purchase in this category on days 27..29 (days > 29 - 3)
    CASE WHEN SUM(CASE WHEN behavior_type = '4' AND days > 29 - 3 THEN 1 ELSE 0 END) = 0
         THEN 1 ELSE 0 END AS isnotbuycate3d
FROM recommend_train_user
-- 10-day window (days 20..29) used to predict day 30
WHERE days >= 20
  AND days <= 29
GROUP BY
    user_id,
    item_category;


-- step3: user
-- Features aggregated per user_id over the window.
DROP TABLE IF EXISTS u_feats_30;
CREATE TABLE u_feats_30
AS
SELECT
    user_id,
    SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS u_bro_cnt,   -- browse count
    SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS u_fav_cnt,   -- favorite count
    SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS u_cart_cnt,  -- add-to-cart count
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS u_buy_cnt,   -- purchase count
    -- purchases / (cart adds + 1); the +1 avoids a zero denominator
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
        / (SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) + 1) AS u_ratio_buycart,
    -- purchases / (browses + 1)
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
        / (SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) + 1) AS u_ratio_buybro
FROM recommend_train_user
-- 10-day window (days 20..29) used to predict day 30
WHERE days >= 20
  AND days <= 29
GROUP BY user_id;


-- step4: item
-- Features aggregated per item_id over the window.
DROP TABLE IF EXISTS i_feats_30;
CREATE TABLE i_feats_30
AS
SELECT
    item_id,
    SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS i_bro_cnt,   -- browse count
    SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS i_fav_cnt,   -- favorite count
    SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS i_cart_cnt,  -- add-to-cart count
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS i_buy_cnt,   -- purchase count
    -- purchases / (cart adds + 1)
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
        / (SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) + 1) AS i_ratio_buycart,
    -- purchases / (browses + 1); denominator is the browse count ('1'), matching u_ratio_buybro
    SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
        / (SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) + 1) AS i_ratio_buybro
FROM recommend_train_user
-- 10-day window (days 20..29) used to predict day 30
WHERE days >= 20
  AND days <= 29
GROUP BY item_id;


-- step5
-- Join the (ui, uc, u, i) feature tables and attach the label.
DROP TABLE IF EXISTS feats_30;
CREATE TABLE IF NOT EXISTS feats_30
AS
SELECT
    ui.user_id,
    ui.item_id,
    ui.item_category,
    -- label: 1 when the left join found a day-30 purchase for this (user, item), else 0
    CASE WHEN lbl.user_id IS NOT NULL THEN 1 ELSE 0 END AS label,

    -- ui
    ui.ui_bro_cnt,
    ui.ui_fav_cnt,
    ui.ui_cart_cnt,
    ui.ui_buy_cnt,
    ui.ui_first_hour_gap,
    ui.ui_last_hour_gap,
    ui.ui_act_days,
    ui.iscartnotbuy6h,
    ui.iscartnotbuy12h,
    ui.iscartnotbuy24h,

    -- uc
    uc.uc_bro_cnt,
    uc.uc_fav_cnt,
    uc.uc_cart_cnt,
    uc.uc_buy_cnt,
    uc.isbuycatebefore,
    uc.isnotbuycate24h,
    uc.isnotbuycate3d,

    -- u
    u.u_bro_cnt,
    u.u_fav_cnt,
    u.u_cart_cnt,
    u.u_buy_cnt,
    u.u_ratio_buycart,
    u.u_ratio_buybro,

    -- i
    i.i_bro_cnt,
    i.i_fav_cnt,
    i.i_cart_cnt,
    i.i_buy_cnt,
    i.i_ratio_buycart,
    i.i_ratio_buybro,

    -- cross features
    ui.iscartnotbuy6h  * uc.isnotbuycate24h AS iscartnobuy6h_notbuycate24h,
    ui.iscartnotbuy12h * uc.isnotbuycate24h AS iscartnobuy12h_notbuycate24h,
    ui.iscartnotbuy24h * uc.isnotbuycate24h AS iscartnobuy24h_notbuycate24h,
    ui.iscartnotbuy24h * uc.isnotbuycate3d  AS iscartnobuy24h_notbuycate3d,
    u.u_ratio_buycart  * ui.ui_cart_cnt     AS u_ui_buycart
FROM ui_feats_30 ui
JOIN uc_feats_30 uc
    ON ui.user_id = uc.user_id
   AND ui.item_category = uc.item_category
JOIN u_feats_30 u
    ON ui.user_id = u.user_id
JOIN i_feats_30 i
    ON ui.item_id = i.item_id
LEFT OUTER JOIN (
    -- distinct (user, item) pairs purchased on day 30
    SELECT
        user_id,
        item_id
    FROM recommend_train_user
    WHERE days = 30
      AND behavior_type = '4'
    GROUP BY
        user_id,
        item_id
) lbl
    ON ui.user_id = lbl.user_id
   AND ui.item_id = lbl.item_id;


-- step6
-- o2o subset: the inner join drops feats_30 rows whose item_id is not in the item table.
DROP TABLE IF EXISTS feats_30_o2o;
CREATE TABLE IF NOT EXISTS feats_30_o2o
AS
SELECT f.*
FROM feats_30 f
JOIN (
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) p
    ON f.item_id = p.item_id;
二、算法实验:
(1)LR逻辑回归:
step1.归一化数据
feats_29 ——> feats_29_norm
--odps sql
--****************************************************************************************
--author:smilemoon
--create time;2017-01-17 15;28:25
--****************************************************************************************

-- Stack the two feature tables into one so they can be normalized together.
-- The leading `sign` column records which table each row came from
-- ("train" = feats_29, "test" = feats_30).
DROP TABLE IF EXISTS feats_union;
CREATE TABLE feats_union AS
SELECT *
FROM (
    SELECT "train" AS sign, * FROM feats_29
    UNION ALL
    SELECT "test" AS sign, * FROM feats_30
) stacked;
实验:norm_split
归一化选择字段:除user_id、item_id、item_category、label这4个字段之外的其余34个特征字段。
归一化选择的是max-min归一化方法。
feats_union ——> feats_union_norm;
创建归一化脚本:
-- Split the normalized union back into its two parts by `sign`,
-- then derive the o2o subset of each part.

-- rows tagged "train"
DROP TABLE IF EXISTS feats_29_norm;
CREATE TABLE feats_29_norm AS
SELECT *
FROM feats_union_norm
WHERE sign = "train";

-- rows tagged "test"
DROP TABLE IF EXISTS feats_30_norm;
CREATE TABLE feats_30_norm AS
SELECT *
FROM feats_union_norm
WHERE sign = "test";

-- o2o subset of the train part: keep rows whose item_id appears in the item table
DROP TABLE IF EXISTS feats_29_o2o_norm;
CREATE TABLE IF NOT EXISTS feats_29_o2o_norm AS
SELECT n.*
FROM feats_29_norm n
JOIN (
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) p
    ON n.item_id = p.item_id;

-- o2o subset of the test part: keep rows whose item_id appears in the item table
DROP TABLE IF EXISTS feats_30_o2o_norm;
CREATE TABLE IF NOT EXISTS feats_30_o2o_norm AS
SELECT n.*
FROM feats_30_norm n
JOIN (
    SELECT item_id
    FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online
    GROUP BY item_id
) p
    ON n.item_id = p.item_id;
step2.对数据量过大的负样本(标签为不购买的样本)进行随机采样
feats_29_norm ——> feats_29_norm_sample
实验:lr_sample
实验参数设置:
a)过滤与映射-1
b)过滤与映射-2
c)随机采样
d)合并行
step3.lr模型训练实验:
feats_29_norm_sample训练:
实验参数设置:
(a)逻辑回归二分类:
(b) 预测:
(2)GBDT:
实验:gbdt_demo
实验参数设置:
过滤与映射,随机采样与合并行与LR模型一样。
(a)GBDT二分类:
字段设置:
特征列:排除user_id,item_id,item_category,label四列的所有列
标签列:label
(b)预测:
(3)联合GBDT:
三、F1评估脚本:
--odps sql
--****************************************************************************************
--author:smilemoon
--create time;2017-01-17 15;20:25
--****************************************************************************************

-- F1 evaluation. Every query below has the same shape:
--   * innermost: take the top-N rows of a result table, ordered by
--     (prediction_result*2-1)*prediction_score + (1-prediction_result) DESC.
--     For prediction_result in {0, 1} this key equals prediction_score when
--     prediction_result = 1 and (1 - prediction_score) when prediction_result = 0.
--   * middle: predict = number of rows kept, hits = sum of their labels.
--   * outer: precision = hits/predict, recall = hits/real,
--     F1 = 2*hits/(real + predict), where `real` is the hard-coded constant
--     that is also used as N.

-- LR result of day29
SELECT
    175052 AS real,
    predict,
    hits,
    (hits/predict) AS precision,
    (hits/175052) AS recall,
    (2*hits/(175052+predict)) AS F1
FROM (
    SELECT
        count(1) AS predict,
        sum(label) AS hits
    FROM (
        SELECT *
        FROM lr_29_result
        ORDER BY (prediction_result*2-1)*prediction_score+(1-prediction_result) DESC
        LIMIT 175052
    ) pre
) t1;

-- LR result of day30
SELECT
    168476 AS real,
    predict,
    hits,
    (hits/predict) AS precision,
    (hits/168476) AS recall,
    (2*hits/(168476+predict)) AS F1
FROM (
    SELECT
        count(1) AS predict,
        sum(label) AS hits
    FROM (
        SELECT *
        FROM lr_30_result
        ORDER BY (prediction_result*2-1)*prediction_score+(1-prediction_result) DESC
        LIMIT 168476
    ) pre
) t1;


-- GBDT result of day29 (the table name is spelled gdbt_29_result)
-- 4895 (figure recorded by the original author; presumably the hit count — confirm)
SELECT
    175052 AS real,
    predict,
    hits,
    (hits/predict) AS precision,
    (hits/175052) AS recall,
    (2*hits/(175052+predict)) AS F1
FROM (
    SELECT
        count(1) AS predict,
        sum(label) AS hits
    FROM (
        SELECT *
        FROM gdbt_29_result
        ORDER BY (prediction_result*2-1)*prediction_score+(1-prediction_result) DESC
        LIMIT 175052
    ) pre
) t1;

-- GBDT result of day30 (the table name is spelled gdbt_30_result)
-- 4642 (figure recorded by the original author; presumably the hit count — confirm)
SELECT
    168476 AS real,
    predict,
    hits,
    (hits/predict) AS precision,
    (hits/168476) AS recall,
    (2*hits/(168476+predict)) AS F1
FROM (
    SELECT
        count(1) AS predict,
        sum(label) AS hits
    FROM (
        SELECT *
        FROM gdbt_30_result
        ORDER BY (prediction_result*2-1)*prediction_score+(1-prediction_result) DESC
        LIMIT 168476
    ) pre
) t1;