一、前述
根據前文中的架構,本文我們討論線下部分中構建訓練集的環節。因為離線部分選擇的模型是邏輯回歸,所以我們的數據必須同時有x(特征)和y(標簽)。
二、具體流程
1.從數據庫中分離出我們需要的數據。
用戶行為表(日志)
用戶歷史下載表
商品詞表(商品的基本特征)
2.構建訓練集中的關聯特征
流程:
3.構建訓練集中的基本特征
總結:注意特征要離散化。因為如果特征不離散化,特征的取值會被當作有大小關系的數值,造成數據之間出現本不存在的關系。
三、具體構建過程
1、hive建表
真實的生產場景涉及到大概五十張表的字段,這里全部簡化流程,直接給出最終的三張表:
應用詞表:
-- Simulated app catalog (item dictionary) table.
-- Columns:
--   hitop_id     application ID
--   name         application name
--   author       author
--   sversion     version number
--   ischarge     paid-app flag
--   designer     designer
--   font         font
--   icon_count   number of images shipped with the app
--   stars        rating in stars
--   price        price
--   file_size    size
--   comment_num  number of comments
--   screen       resolution
--   dlnum        download count
CREATE EXTERNAL TABLE IF NOT EXISTS dim_rcm_hitop_id_list_ds
(
    hitop_id    STRING,
    name        STRING,
    author      STRING,
    sversion    STRING,
    ischarge    SMALLINT,
    designer    STRING,
    font        STRING,
    icon_count  INT,
    stars       DOUBLE,
    price       INT,
    file_size   INT,
    comment_num INT,
    screen      STRING,
    dlnum       INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
用戶歷史下載表:
-- User download history table.
-- There is no separate user concept here: the device ID doubles as the user ID.
-- Columns:
--   device_id      handset device ID
--   devid_applist  list of apps the device has downloaded
--   device_name    device name
--   pay_ability    payment ability
CREATE EXTERNAL TABLE IF NOT EXISTS dw_rcm_hitop_userapps_dm
(
    device_id     STRING,
    devid_applist STRING,
    device_name   STRING,
    pay_ability   STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
正負例樣本(用戶當前行為即日志)表:
-- Positive/negative sample table = browse records + label.
-- Columns described in the original notes:
--   label             Y column, -1 or 1 for negative/positive samples. The value is
--                     produced by a batch job: a browse followed by a download
--                     within some time window counts as positive.
--   device_id         device ID
--   hitop_id          application ID
--   screen            resolution required by the app
--   en_name           English name
--   ch_name           Chinese name
--   author            author
--   sversion          version
--   mnc               Mobile Network Code
--   event_local_time  time of the browse event
-- The remaining columns (interface .. dlnum) carry no description in the original notes.
CREATE EXTERNAL TABLE IF NOT EXISTS dw_rcm_hitop_sample2learn_dm
(
    label            STRING,
    device_id        STRING,
    hitop_id         STRING,
    screen           STRING,
    en_name          STRING,
    ch_name          STRING,
    author           STRING,
    sversion         STRING,
    mnc              STRING,
    event_local_time STRING,
    interface        STRING,
    designer         STRING,
    is_safe          INT,
    icon_count       INT,
    update_time      STRING,
    stars            DOUBLE,
    comment_num      INT,
    font             STRING,
    price            INT,
    file_size        INT,
    ischarge         SMALLINT,
    dlnum            INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
2、load數據
分別往三張表load數據:
商品詞表:
-- Load the app catalog from a file on the local filesystem.
-- No OVERWRITE keyword is given, so the file is added to whatever the table already holds.
LOAD DATA LOCAL INPATH '/opt/sxt/recommender/script/applist.txt'
INTO TABLE dim_rcm_hitop_id_list_ds;
用戶歷史下載表:
-- Load the user download history from a file on the local filesystem.
-- No OVERWRITE keyword is given, so the file is added to whatever the table already holds.
LOAD DATA LOCAL INPATH '/opt/sxt/recommender/script/userdownload.txt'
INTO TABLE dw_rcm_hitop_userapps_dm;
正負例樣本表:
-- Load the labelled browse samples from a file on the local filesystem.
-- No OVERWRITE keyword is given, so the file is added to whatever the table already holds.
LOAD DATA LOCAL INPATH '/opt/sxt/recommender/script/sample.txt'
INTO TABLE dw_rcm_hitop_sample2learn_dm;
3、構建訓練數據
3.1創建臨時表
創建處理數據時所需要的臨時表
-- Staging table for training-set construction: one row per sample, carrying the
-- sample columns plus the per-device columns joined in from the download history.
--   update_date                          receives to_date(update_time) from the sample table
--   idlist, device_name, pay_ability     receive devid_applist, device_name, pay_ability
--                                        from dw_rcm_hitop_userapps_dm
CREATE TABLE IF NOT EXISTS tmp_dw_rcm_hitop_prepare2train_dm
(
    device_id   STRING,
    label       STRING,
    hitop_id    STRING,
    screen      STRING,
    ch_name     STRING,
    author      STRING,
    sversion    STRING,
    mnc         STRING,
    interface   STRING,
    designer    STRING,
    is_safe     INT,
    icon_count  INT,
    update_date STRING,
    stars       DOUBLE,
    comment_num INT,
    font        STRING,
    price       INT,
    file_size   INT,
    ischarge    SMALLINT,
    dlnum       INT,
    idlist      STRING,
    device_name STRING,
    pay_ability STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
最終保存訓練集的表
-- Final training-set table, filled by the TRANSFORM step that pipes the staging
-- rows through dw_rcm_hitop_prepare2train_dm.py.
--   label     the sample label (Y)
--   features  the feature string emitted by the script
CREATE TABLE IF NOT EXISTS dw_rcm_hitop_prepare2train_dm
(
    label    STRING,
    features STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3.2 訓練數據預處理過程
首先將數據從正負例樣本和用戶歷史下載表數據加載到臨時表中
-- Fill the staging table: every sample row (right side of the join) is kept and,
-- where the device has a download-history row, enriched with that row's columns.
-- Samples whose device has no history row get NULL for the three history columns.
--
-- Discretisation applied to the sample columns before the join:
--   mnc          kept when it is one of 00..07, otherwise x
--   icon_count   kept when it is at most 5, otherwise 6 (a NULL also lands on 6)
--   comment_num  NULL becomes 0, values up to 10 are kept, anything larger becomes 11
--   file_size    bucketed by 2 MiB steps into 2, 4, .. 20, anything larger becomes 21
--                (a NULL also lands on 21)
--   dlnum        NULL becomes 0, otherwise rounded up to the bucket ceiling
--                50, 100, 500, 1000, 5000, 10000, 20000, anything larger becomes 20001
INSERT OVERWRITE TABLE tmp_dw_rcm_hitop_prepare2train_dm
SELECT
    s.device_id,
    s.label,
    s.hitop_id,
    s.screen,
    s.ch_name,
    s.author,
    s.sversion,
    s.mnc,
    s.interface,
    s.designer,
    s.is_safe,
    s.icon_count,
    to_date(s.update_time),
    s.stars,
    s.comment_num,
    s.font,
    s.price,
    s.file_size,
    s.ischarge,
    s.dlnum,
    u.devid_applist,
    u.device_name,
    u.pay_ability
FROM dw_rcm_hitop_userapps_dm u
RIGHT OUTER JOIN
(
    SELECT
        device_id,
        label,
        hitop_id,
        screen,
        ch_name,
        author,
        sversion,
        CASE
            WHEN mnc IN ('00','01','02','03','04','05','06','07') THEN mnc
            ELSE 'x'
        END AS mnc,
        interface,
        designer,
        is_safe,
        CASE
            WHEN icon_count <= 5 THEN icon_count
            ELSE 6
        END AS icon_count,
        update_time,
        stars,
        CASE
            WHEN comment_num IS NULL THEN 0
            WHEN comment_num <= 10 THEN comment_num
            ELSE 11
        END AS comment_num,
        font,
        price,
        CASE
            WHEN file_size <= 2*1024*1024 THEN 2
            WHEN file_size <= 4*1024*1024 THEN 4
            WHEN file_size <= 6*1024*1024 THEN 6
            WHEN file_size <= 8*1024*1024 THEN 8
            WHEN file_size <= 10*1024*1024 THEN 10
            WHEN file_size <= 12*1024*1024 THEN 12
            WHEN file_size <= 14*1024*1024 THEN 14
            WHEN file_size <= 16*1024*1024 THEN 16
            WHEN file_size <= 18*1024*1024 THEN 18
            WHEN file_size <= 20*1024*1024 THEN 20
            ELSE 21
        END AS file_size,
        ischarge,
        CASE
            WHEN dlnum IS NULL THEN 0
            WHEN dlnum <= 50 THEN 50
            WHEN dlnum <= 100 THEN 100
            WHEN dlnum <= 500 THEN 500
            WHEN dlnum <= 1000 THEN 1000
            WHEN dlnum <= 5000 THEN 5000
            WHEN dlnum <= 10000 THEN 10000
            WHEN dlnum <= 20000 THEN 20000
            ELSE 20001
        END AS dlnum
    FROM dw_rcm_hitop_sample2learn_dm
) s
ON (u.device_id = s.device_id);
選擇右外關聯的原因是因為以用戶行為為基准。
這張表得到的數據就是關聯特征中的數據,截圖如下:
然后再利用python腳本處理格式
這里要先將python腳本加載到hive中
-- Register the feature-building script as a session resource so that the later
-- TRANSFORM ... USING 'python dw_rcm_hitop_prepare2train_dm.py' step can invoke it by file name.
ADD FILE /opt/sxt/recommender/script/dw_rcm_hitop_prepare2train_dm.py;
可以通過list files;查看是不是python文件加載到了hive
在hive中使用python腳本處理數據的原理:
Hive會以輸出流的形式將數據交給python腳本,python腳本以輸入流的形式來接收數據;接收到數據以后,在python中進行一系列的數據處理,處理完畢后,又以輸出流的形式交還給Hive,Hive再把處理后的數據保存到目標hive表中。
-- Build the final training set: stream 21 staging columns, in exactly this order,
-- through the Python script and store the two columns it emits.
-- device_id and is_safe from the staging table are not passed to the script.
-- The script only accepts rows with 21 fields, so the column list below and the
-- script must be changed together.
INSERT OVERWRITE TABLE dw_rcm_hitop_prepare2train_dm
SELECT TRANSFORM (
    label,
    hitop_id,
    screen,
    ch_name,
    author,
    sversion,
    mnc,
    interface,
    designer,
    icon_count,
    update_date,
    stars,
    comment_num,
    font,
    price,
    file_size,
    ischarge,
    dlnum,
    idlist,
    device_name,
    pay_ability
)
USING 'python dw_rcm_hitop_prepare2train_dm.py'
AS (label, features)
FROM tmp_dw_rcm_hitop_prepare2train_dm;
python處理流程:
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# ----------------------------------------------------------------------------
# File Name: dw_rcm_hitop_prepare2train_dm.py
# Copyright(C)Huawei Technologies Co.,Ltd.1998-2014.All rights reserved.
# Describe: Hive TRANSFORM script. Reads tab-separated staging rows on stdin and
#           writes "label<TAB>features" lines on stdout.
# Input: tmp_dw_rcm_hitop_prepare2train_dm
# Output: dw_rcm_hitop_prepare2train_dm
import sys
import codecs
import random
import math
import time
import datetime

# Number of tab-separated columns the Hive TRANSFORM step sends per row.
EXPECTED_FIELD_COUNT = 21
# Only the first few downloaded apps of a device are crossed with the current item.
# With many users, the resulting crosses still cover the item space reasonably well.
MAX_HISTORY_ITEMS = 3


def build_training_row(line):
    """Convert one staging row into a training-set output line.

    Args:
        line: one raw input line, tab-separated, with or without its line
            terminator. Expected column order: label, hitop_id, screen,
            ch_name, author, sversion, mnc, interface, designer, icon_count,
            update_date, stars, comment_num, font, price, file_size,
            ischarge, dlnum, idlist, device_name, pay_ability.

    Returns:
        "label<TAB>feature:1,feature:1,..." without a trailing newline, or
        None when the line does not contain exactly 21 fields.
    """
    # Strip only the line terminator. A plain strip() would also eat trailing
    # tabs, so a row whose last column(s) are empty strings would lose fields
    # and be silently discarded by the length check below.
    fields = line.rstrip('\r\n').split('\t')
    if len(fields) != EXPECTED_FIELD_COUNT:
        return None

    # interface and update_date are received but not turned into features.
    (label, hitop_id, screen, ch_name, author, sversion, mnc, interface,
     designer, icon_count, update_date, stars, comment_num, font, price,
     file_size, ischarge, dlnum, hitopids, device_name, pay_ability) = fields

    # Every feature is emitted as "<name>,<value>" with weight 1.
    features = [
        ("Item.id,%s" % hitop_id, 1),
        ("Item.screen,%s" % screen, 1),
        ("Item.name,%s" % ch_name, 1),
        ("All,0", 1),
        ("Item.author,%s" % author, 1),
        ("Item.sversion,%s" % sversion, 1),
        ("Item.network,%s" % mnc, 1),
        ("Item.dgner,%s" % designer, 1),
        ("Item.icount,%s" % icon_count, 1),
        ("Item.stars,%s" % stars, 1),
        ("Item.comNum,%s" % comment_num, 1),
        ("Item.font,%s" % font, 1),
        ("Item.price,%s" % price, 1),
        ("Item.fsize,%s" % file_size, 1),
        ("Item.ischarge,%s" % ischarge, 1),
        ("Item.downNum,%s" % dlnum, 1),
    ]

    # User.Item*Item crosses: pair the device's first downloaded apps with the
    # current item. The last two characters of the list are dropped before
    # splitting, as in the original script.
    # NOTE(review): presumably the stored list ends with a two-character
    # terminator - confirm against the userdownload data.
    for app_id in hitopids[:-2].split(',')[:MAX_HISTORY_ITEMS]:
        features.append(("User.Item*Item,%s" % app_id + '*' + hitop_id, 1))

    # Crossed features raise the dimensionality: device model x item, and
    # payment ability x item price.
    features.append(("User.phone*Item,%s" % device_name + '*' + hitop_id, 1))
    features.append(("User.pay*Item.price,%s" % pay_ability + '*' + price, 1))

    # join() concatenates the "name:weight" entries with commas between them.
    return "%s\t%s" % (label, ",".join("%s:%d" % (f, v) for f, v in features))


if __name__ == "__main__":
    random.seed(time.time())
    for raw_line in sys.stdin:
        row = build_training_row(raw_line)
        if row is None:
            continue
        # sys.stdout.write behaves the same under Python 2 and Python 3,
        # unlike the Python 2-only print statement.
        sys.stdout.write(row + "\n")
經過上述處理之后的數據如圖所示:
特征工程部分前期準備結束。