天池新人实战赛《阿里移动推荐算法》(三)—— 平台赛模型规则


   注:作为新手,本文的基本思路来源于天池官网的视频教程;本文仅是在天池数加平台上学习之后所作的个人总结,特此声明。

   视频网址:https://tianchi.shuju.aliyun.com/video.htm?spm=5176.100068.1234.7.7Kftz1 《新手入门赛-第四课时》

       

一、SQL预处理:

  step1 —— 0_init

  预处理数据:根据项目空间给出的原始数据表构造符合自己需求的新表。

 1 --odps sql
 2 --****************************************************************************************
 3 --author:smilemoon
 4 --create time: 2017-01-17
 5 --****************************************************************************************
 6 
 -- Working copy of the user-behaviour log taken from the competition table
 -- tianchi_fresh_comp_train_user_online. Dropped first so the script is re-runnable.
 DROP TABLE IF EXISTS recommend_train_user;

 -- MaxCompute (ODPS) partitioned table; behavior_type and days are the partition keys
 -- and are both declared as STRING.
 CREATE TABLE IF NOT EXISTS recommend_train_user
 (
      user_id        BIGINT
     ,item_id        BIGINT
     ,item_category  BIGINT
     ,user_geohash   STRING
     ,time           DATETIME
     ,hours          BIGINT
 )
 PARTITIONED BY
 (
      behavior_type  STRING
     ,days           STRING
 );
23 
-- Load the raw behaviour log into recommend_train_user using dynamic partitions.
-- INSERT OVERWRITE replaces whatever the target partitions held before.
-- The source `time` is a string of the form 'yyyy-mm-dd hh' (it is parsed with that format below).
INSERT OVERWRITE TABLE recommend_train_user PARTITION (behavior_type, days)
SELECT
    t1.user_id,
    t1.item_id,
    t1.item_category,
    t1.user_geohash,
    TO_DATE(t1.time, 'yyyy-mm-dd hh'),
    -- SUBSTR takes (start, length): the hour is the 2 characters starting at position 12.
    -- The previous length of 13 only worked because the string happens to end after the hour.
    CAST(SUBSTR(t1.time, 12, 2) AS BIGINT) AS hours,
    t1.behavior_type,
    -- day index: whole days elapsed since 2014-11-18 (so 2014-12-17 is day 29)
    DATEDIFF(TO_DATE(t1.time, 'yyyy-mm-dd hh'), '2014-11-18 00:00:00', 'dd') AS days
FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_user_online t1;
29 
-- Working copy of the item table (tianchi_fresh_comp_train_item_online).
DROP TABLE IF EXISTS recommend_train_item;

CREATE TABLE IF NOT EXISTS recommend_train_item
(
     item_id        BIGINT
    ,item_geohash   STRING
    ,item_category  BIGINT
);

-- Straight column-for-column copy; no conversion is applied.
INSERT OVERWRITE TABLE recommend_train_item
SELECT
    src.item_id,
    src.item_geohash,
    src.item_category
FROM odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online src;
41 
42 
-- o2o subset: behaviour rows whose item_id appears in the item subset P
-- (tianchi_fresh_comp_train_item_online).
DROP TABLE IF EXISTS recommend_train_user_o2o;

-- LIKE copies the table structure of recommend_train_user.
CREATE TABLE recommend_train_user_o2o LIKE recommend_train_user;

-- A semi join returns each behaviour row at most once however many times its item_id
-- is listed in the item table, so item_id needs no separate de-duplication step.
INSERT OVERWRITE TABLE recommend_train_user_o2o PARTITION (behavior_type, days)
SELECT behaviour.*
FROM recommend_train_user behaviour
LEFT SEMI JOIN odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online subset_item
    ON behaviour.item_id = subset_item.item_id;

 

  step2.0 —— 1_train_set

     根据原始表构造训练用特征表,用于后续的各种分析。

  1 --odps sql
  2 --****************************************************************************************
  3 --author:smilemoon
  4 --create time: 2017-01-14
  5 --****************************************************************************************
  6 -- labelday: 29(2014-12-17),window : 19~28
  7 
  8 
  9 -- training set
 10 -- user_item
 11 
 -- step 1: (user, item) features for label day 29 (2014-12-17).
 -- Source rows: recommend_train_user restricted to days 19..28, one output row per
 -- (user_id, item_id, item_category).
 DROP TABLE IF EXISTS ui_feats_29;
 CREATE TABLE ui_feats_29 AS
 SELECT
     user_id,
     item_id,
     item_category,
     -- action counters by behavior_type: '1' browse, '2' favourite, '3' add-to-cart, '4' purchase
     SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS ui_bro_cnt,
     SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS ui_fav_cnt,
     SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS ui_cart_cnt,
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS ui_buy_cnt,
     -- hours between the first / last action in the window and 00:00 of the label day
     DATEDIFF('2014-12-17 00:00:00', MIN(time), 'hh') AS ui_first_hour_gap,
     DATEDIFF('2014-12-17 00:00:00', MAX(time), 'hh') AS ui_last_hour_gap,
     -- number of distinct days with any action
     COUNT(DISTINCT days) AS ui_act_days,
     -- Hand-written rule flags. Each is 1 when the item was added to the cart at least once
     -- inside the stated period of day 28 AND not purchased inside that same period, else 0.
     -- last 6 hours: day 28, hour >= 18
     CASE
         WHEN SUM(CASE WHEN behavior_type = '3' AND days = 28 AND hours >= 18 THEN 1 ELSE 0 END) > 0
          AND SUM(CASE WHEN behavior_type = '4' AND days = 28 AND hours >= 18 THEN 1 ELSE 0 END) = 0
         THEN 1
         ELSE 0
     END AS iscartnotbuy6h,
     -- last 12 hours: day 28, hour >= 12
     CASE
         WHEN SUM(CASE WHEN behavior_type = '3' AND days = 28 AND hours >= 12 THEN 1 ELSE 0 END) > 0
          AND SUM(CASE WHEN behavior_type = '4' AND days = 28 AND hours >= 12 THEN 1 ELSE 0 END) = 0
         THEN 1
         ELSE 0
     END AS iscartnotbuy12h,
     -- last 24 hours: the whole of day 28
     CASE
         WHEN SUM(CASE WHEN behavior_type = '3' AND days = 28 THEN 1 ELSE 0 END) > 0
          AND SUM(CASE WHEN behavior_type = '4' AND days = 28 THEN 1 ELSE 0 END) = 0
         THEN 1
         ELSE 0
     END AS iscartnotbuy24h
 FROM recommend_train_user
 -- 10-day feature window used to predict day 29
 WHERE days >= 19
   AND days <= 28
 GROUP BY
     user_id,
     item_id,
     item_category;
 44     
 45     
 46     
 -- step 2: (user, category) features for label day 29.
 -- Source rows: recommend_train_user restricted to days 19..28, one output row per
 -- (user_id, item_category).
 DROP TABLE IF EXISTS uc_feats_29;
 CREATE TABLE uc_feats_29 AS
 SELECT
     user_id,
     item_category,
     -- action counters by behavior_type: '1' browse, '2' favourite, '3' add-to-cart, '4' purchase
     SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS uc_bro_cnt,
     SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS uc_fav_cnt,
     SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS uc_cart_cnt,
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS uc_buy_cnt,
     -- 1 when the user purchased in this category at any point of the window
     CASE
         WHEN SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) > 0 THEN 1
         ELSE 0
     END AS isbuycatebefore,
     -- 1 when there is NO purchase in this category on day 28 (days > 27)
     CASE
         WHEN SUM(CASE WHEN behavior_type = '4' AND days > 27 THEN 1 ELSE 0 END) = 0 THEN 1
         ELSE 0
     END AS isnotbuycate24h,
     -- 1 when there is NO purchase in this category on days 26..28 (days > 25)
     CASE
         WHEN SUM(CASE WHEN behavior_type = '4' AND days > 25 THEN 1 ELSE 0 END) = 0 THEN 1
         ELSE 0
     END AS isnotbuycate3d
 FROM recommend_train_user
 -- 10-day feature window used to predict day 29
 WHERE days >= 19
   AND days <= 28
 GROUP BY
     user_id,
     item_category;
 69 
 70 
 71 
 -- step 3: per-user features for label day 29.
 -- Source rows: recommend_train_user restricted to days 19..28, one output row per user_id.
 DROP TABLE IF EXISTS u_feats_29;
 CREATE TABLE u_feats_29 AS
 SELECT
     user_id,
     -- action counters by behavior_type: '1' browse, '2' favourite, '3' add-to-cart, '4' purchase
     SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS u_bro_cnt,
     SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS u_fav_cnt,
     SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS u_cart_cnt,
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS u_buy_cnt,
     -- conversion ratios; the +1 in the denominator avoids division by zero
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
         / (SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) + 1) AS u_ratio_buycart, -- purchases / (cart adds + 1)
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
         / (SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) + 1) AS u_ratio_buybro   -- purchases / (browses + 1)
 FROM recommend_train_user
 -- 10-day feature window used to predict day 29
 WHERE days >= 19
   AND days <= 28
 GROUP BY user_id;
 91 
 92 
 93 
 -- step 4: per-item features for label day 29.
 -- Source rows: recommend_train_user restricted to days 19..28, one output row per item_id.
 DROP TABLE IF EXISTS i_feats_29;
 CREATE TABLE i_feats_29 AS
 SELECT
     item_id,
     -- action counters by behavior_type: '1' browse, '2' favourite, '3' add-to-cart, '4' purchase
     SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS i_bro_cnt,
     SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS i_fav_cnt,
     SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS i_cart_cnt,
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS i_buy_cnt,
     -- conversion ratios; the +1 in the denominator avoids division by zero
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
         / (SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) + 1) AS i_ratio_buycart, -- purchases / (cart adds + 1)
     -- FIX: the denominator used the cart count ('3'), which made this column a duplicate of
     -- i_ratio_buycart; the browse count ('1') is what the feature name and u_ratio_buybro use.
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
         / (SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) + 1) AS i_ratio_buybro   -- purchases / (browses + 1)
 FROM recommend_train_user
 -- 10-day feature window used to predict day 29
 WHERE days >= 19
   AND days <= 28
 GROUP BY item_id;
113 
114 
115 
-- step 5: assemble the day-29 sample table.
-- One row per (user, item) pair seen in the window, carrying the ui / uc / u / i features,
-- five product ("cross") features and the label.
DROP TABLE IF EXISTS feats_29;
CREATE TABLE IF NOT EXISTS feats_29 AS
SELECT
    ui.user_id,
    ui.item_id,
    ui.item_category,
    -- label: 1 when the LEFT OUTER JOIN found a day-29 purchase of this (user, item), else 0
    CASE WHEN bought.user_id IS NOT NULL THEN 1 ELSE 0 END AS label,

    -- (user, item) features
    ui.ui_bro_cnt,
    ui.ui_fav_cnt,
    ui.ui_cart_cnt,
    ui.ui_buy_cnt,
    ui.ui_first_hour_gap,
    ui.ui_last_hour_gap,
    ui.ui_act_days,
    ui.iscartnotbuy6h,
    ui.iscartnotbuy12h,
    ui.iscartnotbuy24h,

    -- (user, category) features
    uc.uc_bro_cnt,
    uc.uc_fav_cnt,
    uc.uc_cart_cnt,
    uc.uc_buy_cnt,
    uc.isbuycatebefore,
    uc.isnotbuycate24h,
    uc.isnotbuycate3d,

    -- user features
    u.u_bro_cnt,
    u.u_fav_cnt,
    u.u_cart_cnt,
    u.u_buy_cnt,
    u.u_ratio_buycart,
    u.u_ratio_buybro,

    -- item features
    i.i_bro_cnt,
    i.i_fav_cnt,
    i.i_cart_cnt,
    i.i_buy_cnt,
    i.i_ratio_buycart,
    i.i_ratio_buybro,

    -- cross features: products of the flags / ratios above
    ui.iscartnotbuy6h  * uc.isnotbuycate24h AS iscartnobuy6h_notbuycate24h,
    ui.iscartnotbuy12h * uc.isnotbuycate24h AS iscartnobuy12h_notbuycate24h,
    ui.iscartnotbuy24h * uc.isnotbuycate24h AS iscartnobuy24h_notbuycate24h,
    ui.iscartnotbuy24h * uc.isnotbuycate3d  AS iscartnobuy24h_notbuycate3d,
    u.u_ratio_buycart  * ui.ui_cart_cnt     AS u_ui_buycart
FROM ui_feats_29 ui
JOIN uc_feats_29 uc
    ON  ui.user_id = uc.user_id
    AND ui.item_category = uc.item_category
JOIN u_feats_29 u
    ON ui.user_id = u.user_id
JOIN i_feats_29 i
    ON ui.item_id = i.item_id
LEFT OUTER JOIN (
    -- distinct (user, item) pairs purchased on day 29
    SELECT
        user_id,
        item_id
    FROM recommend_train_user
    WHERE days = 29
      AND behavior_type = '4'
    GROUP BY
        user_id,
        item_id
) bought
    ON  ui.user_id = bought.user_id
    AND ui.item_id = bought.item_id;
190 
191 
-- step 6: o2o subset of the day-29 samples.
-- Keeps only the feats_29 rows whose item_id is listed in the item subset P
-- (tianchi_fresh_comp_train_item_online); a semi join returns each feats_29 row at most once.
DROP TABLE IF EXISTS feats_29_o2o;
CREATE TABLE IF NOT EXISTS feats_29_o2o AS
SELECT feats.*
FROM feats_29 feats
LEFT SEMI JOIN odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online subset_item
    ON feats.item_id = subset_item.item_id;

 

  step3.0 —— 2_test_set

     根据原始表构造测试用特征表,用于后续的各种分析。

 

  1 --odps sql
  2 --****************************************************************************************
  3 --author:smilemoon
  4 --create time: 2017-01-14
  5 --****************************************************************************************
  6 -- labelday: 30(2014-12-18),window : 20~29
  7 
  8 
  9 -- test set (label day 30)
 10 -- user_item
 11 
 -- step 1: (user, item) features for label day 30 (2014-12-18).
 -- Source rows: recommend_train_user restricted to days 20..29, one output row per
 -- (user_id, item_id, item_category).
 DROP TABLE IF EXISTS ui_feats_30;
 CREATE TABLE ui_feats_30 AS
 SELECT
     user_id,
     item_id,
     item_category,
     -- action counters by behavior_type: '1' browse, '2' favourite, '3' add-to-cart, '4' purchase
     SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS ui_bro_cnt,
     SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS ui_fav_cnt,
     SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS ui_cart_cnt,
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS ui_buy_cnt,
     -- hours between the first / last action in the window and 00:00 of the label day
     DATEDIFF('2014-12-18 00:00:00', MIN(time), 'hh') AS ui_first_hour_gap,
     DATEDIFF('2014-12-18 00:00:00', MAX(time), 'hh') AS ui_last_hour_gap,
     -- number of distinct days with any action
     COUNT(DISTINCT days) AS ui_act_days,
     -- Hand-written rule flags. Each is 1 when the item was added to the cart at least once
     -- inside the stated period of day 29 AND not purchased inside that same period, else 0.
     -- last 6 hours: day 29, hour >= 18
     CASE
         WHEN SUM(CASE WHEN behavior_type = '3' AND days = 29 AND hours >= 18 THEN 1 ELSE 0 END) > 0
          AND SUM(CASE WHEN behavior_type = '4' AND days = 29 AND hours >= 18 THEN 1 ELSE 0 END) = 0
         THEN 1
         ELSE 0
     END AS iscartnotbuy6h,
     -- last 12 hours: day 29, hour >= 12
     CASE
         WHEN SUM(CASE WHEN behavior_type = '3' AND days = 29 AND hours >= 12 THEN 1 ELSE 0 END) > 0
          AND SUM(CASE WHEN behavior_type = '4' AND days = 29 AND hours >= 12 THEN 1 ELSE 0 END) = 0
         THEN 1
         ELSE 0
     END AS iscartnotbuy12h,
     -- last 24 hours: the whole of day 29
     CASE
         WHEN SUM(CASE WHEN behavior_type = '3' AND days = 29 THEN 1 ELSE 0 END) > 0
          AND SUM(CASE WHEN behavior_type = '4' AND days = 29 THEN 1 ELSE 0 END) = 0
         THEN 1
         ELSE 0
     END AS iscartnotbuy24h
 FROM recommend_train_user
 -- 10-day feature window used to predict day 30
 WHERE days >= 20
   AND days <= 29
 GROUP BY
     user_id,
     item_id,
     item_category;
 44     
 45     
 46     
 -- step 2: (user, category) features for label day 30.
 -- Source rows: recommend_train_user restricted to days 20..29, one output row per
 -- (user_id, item_category).
 DROP TABLE IF EXISTS uc_feats_30;
 CREATE TABLE uc_feats_30 AS
 SELECT
     user_id,
     item_category,
     -- action counters by behavior_type: '1' browse, '2' favourite, '3' add-to-cart, '4' purchase
     SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS uc_bro_cnt,
     SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS uc_fav_cnt,
     SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS uc_cart_cnt,
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS uc_buy_cnt,
     -- 1 when the user purchased in this category at any point of the window
     CASE
         WHEN SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) > 0 THEN 1
         ELSE 0
     END AS isbuycatebefore,
     -- 1 when there is NO purchase in this category on day 29 (days > 28)
     CASE
         WHEN SUM(CASE WHEN behavior_type = '4' AND days > 28 THEN 1 ELSE 0 END) = 0 THEN 1
         ELSE 0
     END AS isnotbuycate24h,
     -- 1 when there is NO purchase in this category on days 27..29 (days > 26)
     CASE
         WHEN SUM(CASE WHEN behavior_type = '4' AND days > 26 THEN 1 ELSE 0 END) = 0 THEN 1
         ELSE 0
     END AS isnotbuycate3d
 FROM recommend_train_user
 -- 10-day feature window used to predict day 30
 WHERE days >= 20
   AND days <= 29
 GROUP BY
     user_id,
     item_category;
 69 
 70 
 71 
 -- step 3: per-user features for label day 30.
 -- Source rows: recommend_train_user restricted to days 20..29, one output row per user_id.
 DROP TABLE IF EXISTS u_feats_30;
 CREATE TABLE u_feats_30 AS
 SELECT
     user_id,
     -- action counters by behavior_type: '1' browse, '2' favourite, '3' add-to-cart, '4' purchase
     SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS u_bro_cnt,
     SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS u_fav_cnt,
     SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS u_cart_cnt,
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS u_buy_cnt,
     -- conversion ratios; the +1 in the denominator avoids division by zero
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
         / (SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) + 1) AS u_ratio_buycart, -- purchases / (cart adds + 1)
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
         / (SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) + 1) AS u_ratio_buybro   -- purchases / (browses + 1)
 FROM recommend_train_user
 -- 10-day feature window used to predict day 30
 WHERE days >= 20
   AND days <= 29
 GROUP BY user_id;
 91 
 92 
 93 
 -- step 4: per-item features for label day 30.
 -- Source rows: recommend_train_user restricted to days 20..29, one output row per item_id.
 DROP TABLE IF EXISTS i_feats_30;
 CREATE TABLE i_feats_30 AS
 SELECT
     item_id,
     -- action counters by behavior_type: '1' browse, '2' favourite, '3' add-to-cart, '4' purchase
     SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) AS i_bro_cnt,
     SUM(CASE WHEN behavior_type = '2' THEN 1 ELSE 0 END) AS i_fav_cnt,
     SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) AS i_cart_cnt,
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END) AS i_buy_cnt,
     -- conversion ratios; the +1 in the denominator avoids division by zero
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
         / (SUM(CASE WHEN behavior_type = '3' THEN 1 ELSE 0 END) + 1) AS i_ratio_buycart, -- purchases / (cart adds + 1)
     -- FIX: the denominator used the cart count ('3'), which made this column a duplicate of
     -- i_ratio_buycart; the browse count ('1') is what the feature name and u_ratio_buybro use.
     SUM(CASE WHEN behavior_type = '4' THEN 1 ELSE 0 END)
         / (SUM(CASE WHEN behavior_type = '1' THEN 1 ELSE 0 END) + 1) AS i_ratio_buybro   -- purchases / (browses + 1)
 FROM recommend_train_user
 -- 10-day feature window used to predict day 30
 WHERE days >= 20
   AND days <= 29
 GROUP BY item_id;
113 
114 
115 
-- step 5: assemble the day-30 sample table.
-- One row per (user, item) pair seen in the window, carrying the ui / uc / u / i features,
-- five product ("cross") features and the label.
DROP TABLE IF EXISTS feats_30;
CREATE TABLE IF NOT EXISTS feats_30 AS
SELECT
    ui.user_id,
    ui.item_id,
    ui.item_category,
    -- label: 1 when the LEFT OUTER JOIN found a day-30 purchase of this (user, item), else 0
    CASE WHEN bought.user_id IS NOT NULL THEN 1 ELSE 0 END AS label,

    -- (user, item) features
    ui.ui_bro_cnt,
    ui.ui_fav_cnt,
    ui.ui_cart_cnt,
    ui.ui_buy_cnt,
    ui.ui_first_hour_gap,
    ui.ui_last_hour_gap,
    ui.ui_act_days,
    ui.iscartnotbuy6h,
    ui.iscartnotbuy12h,
    ui.iscartnotbuy24h,

    -- (user, category) features
    uc.uc_bro_cnt,
    uc.uc_fav_cnt,
    uc.uc_cart_cnt,
    uc.uc_buy_cnt,
    uc.isbuycatebefore,
    uc.isnotbuycate24h,
    uc.isnotbuycate3d,

    -- user features
    u.u_bro_cnt,
    u.u_fav_cnt,
    u.u_cart_cnt,
    u.u_buy_cnt,
    u.u_ratio_buycart,
    u.u_ratio_buybro,

    -- item features
    i.i_bro_cnt,
    i.i_fav_cnt,
    i.i_cart_cnt,
    i.i_buy_cnt,
    i.i_ratio_buycart,
    i.i_ratio_buybro,

    -- cross features: products of the flags / ratios above
    ui.iscartnotbuy6h  * uc.isnotbuycate24h AS iscartnobuy6h_notbuycate24h,
    ui.iscartnotbuy12h * uc.isnotbuycate24h AS iscartnobuy12h_notbuycate24h,
    ui.iscartnotbuy24h * uc.isnotbuycate24h AS iscartnobuy24h_notbuycate24h,
    ui.iscartnotbuy24h * uc.isnotbuycate3d  AS iscartnobuy24h_notbuycate3d,
    u.u_ratio_buycart  * ui.ui_cart_cnt     AS u_ui_buycart
FROM ui_feats_30 ui
JOIN uc_feats_30 uc
    ON  ui.user_id = uc.user_id
    AND ui.item_category = uc.item_category
JOIN u_feats_30 u
    ON ui.user_id = u.user_id
JOIN i_feats_30 i
    ON ui.item_id = i.item_id
LEFT OUTER JOIN (
    -- distinct (user, item) pairs purchased on day 30
    SELECT
        user_id,
        item_id
    FROM recommend_train_user
    WHERE days = 30
      AND behavior_type = '4'
    GROUP BY
        user_id,
        item_id
) bought
    ON  ui.user_id = bought.user_id
    AND ui.item_id = bought.item_id;
190 
191 
-- step 6: o2o subset of the day-30 samples.
-- Keeps only the feats_30 rows whose item_id is listed in the item subset P
-- (tianchi_fresh_comp_train_item_online); a semi join returns each feats_30 row at most once.
DROP TABLE IF EXISTS feats_30_o2o;
CREATE TABLE IF NOT EXISTS feats_30_o2o AS
SELECT feats.*
FROM feats_30 feats
LEFT SEMI JOIN odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online subset_item
    ON feats.item_id = subset_item.item_id;

 

二、算法实验:

  (1)LR逻辑回归:

    step1.归一化数据

    feats_29 ——> feats_29_norm

 1 --odps sql
 2 --****************************************************************************************
 3 --author:smilemoon
 4 --create time: 2017-01-17 15:28:25
 5 --****************************************************************************************
 6 
 -- Stack the day-29 and day-30 sample tables into one table so that both can be
 -- normalized together; the added `sign` column records which table each row came from.
 DROP TABLE IF EXISTS feats_union;
 CREATE TABLE feats_union AS
 SELECT *
 FROM (
     SELECT "train" AS sign, *
     FROM feats_29
     UNION ALL
     SELECT "test" AS sign, *
     FROM feats_30
 ) stacked;

 

  实验:norm_split

    归一化选择字段:除了user_id,item_id,item_category,label 4个字段的其余34个字段。

            归一化选择的是max-min归一化方法。

    feats_union ——> feats_union_norm;

 

 

     创建归一化脚本:

 -- Split the normalized union (feats_union_norm) back into its two parts by the `sign` tag.
 -- Rows tagged "train" came from feats_29.
 DROP TABLE IF EXISTS feats_29_norm;
 CREATE TABLE feats_29_norm AS
 SELECT *
 FROM feats_union_norm
 WHERE sign = "train";

 -- Rows tagged "test" came from feats_30.
 DROP TABLE IF EXISTS feats_30_norm;
 CREATE TABLE feats_30_norm AS
 SELECT *
 FROM feats_union_norm
 WHERE sign = "test";
 7 
 -- o2o subset of the normalized day-29 samples: keep rows whose item_id is listed in
 -- tianchi_fresh_comp_train_item_online (a semi join returns each sample row at most once).
 DROP TABLE IF EXISTS feats_29_o2o_norm;
 CREATE TABLE IF NOT EXISTS feats_29_o2o_norm AS
 SELECT feats.*
 FROM feats_29_norm feats
 LEFT SEMI JOIN odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online subset_item
     ON feats.item_id = subset_item.item_id;
14 
-- o2o subset of the normalized day-30 samples: keep rows whose item_id is listed in
-- tianchi_fresh_comp_train_item_online (a semi join returns each sample row at most once).
DROP TABLE IF EXISTS feats_30_o2o_norm;
CREATE TABLE IF NOT EXISTS feats_30_o2o_norm AS
SELECT feats.*
FROM feats_30_norm feats
LEFT SEMI JOIN odps_tc_257100_f673506e024.tianchi_fresh_comp_train_item_online subset_item
    ON feats.item_id = subset_item.item_id;

     

    

        step2.对数据量过大的负样本(标签为不购买的样本)进行随机采样

      feats_29_norm ——> feats_29_norm_sample

      实验:lr_sample   

 

        实验参数设置:

        a)过滤与映射-1

      

      b)过滤与映射-2

      

        c)随机采样

      

      d)合并行

      

     

      step3.lr模型训练实验:

        feats_29_norm_sample训练:

       

        实验参数设置:

        (a)逻辑回归二分类:

          

 

        (b) 预测:

        

 

  (2)GBDT:

     实验:gbdt_demo

      

      实验参数设置:

        过滤与映射,随机采样与合并行与LR模型一样。

      (a)GBDT二分类:

        字段设置:

          特征列:排除user_id,item_id,item_category,label四列的所有列

          标签列:label

          

        

 

      (b)预测:

        

  (3)联合GBDT:

 三、F1评估脚本:

 1 --odps sql
 2 --****************************************************************************************
 3 --author:smilemoon
 4 --create time: 2017-01-17 15:20:25
 5 --****************************************************************************************
 6 
 -- LR, day 29: precision / recall / F1 of the top-ranked predictions.
 -- 175052 is the hard-coded count reported as `real`; the same number caps the ranked list.
 -- F1 is computed as 2 * hits / (real + predict).
 SELECT
     175052                          AS real,
     predict,
     hits,
     (hits / predict)                AS precision,
     (hits / 175052)                 AS recall,
     (2 * hits / (175052 + predict)) AS F1
 FROM (
     SELECT
         COUNT(1)   AS predict,
         SUM(label) AS hits
     FROM (
         -- The sort key equals prediction_score when prediction_result = 1 and
         -- 1 - prediction_score when prediction_result = 0.
         SELECT *
         FROM lr_29_result
         ORDER BY (prediction_result * 2 - 1) * prediction_score + (1 - prediction_result) DESC
         LIMIT 175052
     ) pre
 ) t1;
14 
-- LR, day 30: precision / recall / F1 of the top-ranked predictions.
-- 168476 is the hard-coded count reported as `real`; the same number caps the ranked list.
-- F1 is computed as 2 * hits / (real + predict).
SELECT
    168476                          AS real,
    predict,
    hits,
    (hits / predict)                AS precision,
    (hits / 168476)                 AS recall,
    (2 * hits / (168476 + predict)) AS F1
FROM (
    SELECT
        COUNT(1)   AS predict,
        SUM(label) AS hits
    FROM (
        -- The sort key equals prediction_score when prediction_result = 1 and
        -- 1 - prediction_score when prediction_result = 0.
        SELECT *
        FROM lr_30_result
        ORDER BY (prediction_result * 2 - 1) * prediction_score + (1 - prediction_result) DESC
        LIMIT 168476
    ) pre
) t1;
22 
23 
-- GBDT, day 29 (result table is named gdbt_29_result): precision / recall / F1 of the
-- top-ranked predictions. 175052 is the hard-coded count reported as `real`.
-- 4895: figure noted by the original author (presumably the observed hit count; unverified).
SELECT
    175052                          AS real,
    predict,
    hits,
    (hits / predict)                AS precision,
    (hits / 175052)                 AS recall,
    (2 * hits / (175052 + predict)) AS F1
FROM (
    SELECT
        COUNT(1)   AS predict,
        SUM(label) AS hits
    FROM (
        -- The sort key equals prediction_score when prediction_result = 1 and
        -- 1 - prediction_score when prediction_result = 0.
        SELECT *
        FROM gdbt_29_result
        ORDER BY (prediction_result * 2 - 1) * prediction_score + (1 - prediction_result) DESC
        LIMIT 175052
    ) pre
) t1;
32 
-- GBDT, day 30 (result table is named gdbt_30_result): precision / recall / F1 of the
-- top-ranked predictions. 168476 is the hard-coded count reported as `real`.
-- 4642: figure noted by the original author (presumably the observed hit count; unverified).
SELECT
    168476                          AS real,
    predict,
    hits,
    (hits / predict)                AS precision,
    (hits / 168476)                 AS recall,
    (2 * hits / (168476 + predict)) AS F1
FROM (
    SELECT
        COUNT(1)   AS predict,
        SUM(label) AS hits
    FROM (
        -- The sort key equals prediction_score when prediction_result = 1 and
        -- 1 - prediction_score when prediction_result = 0.
        SELECT *
        FROM gdbt_30_result
        ORDER BY (prediction_result * 2 - 1) * prediction_score + (1 - prediction_result) DESC
        LIMIT 168476
    ) pre
) t1;

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM