Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import gc
from collections import Counter
import copy
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
Data loading
Read the training set, the test set, the user info and the user log data.
# Load the datasets
#test_data = pd.read_csv('./data_format1/test_format1.csv')
#train_data = pd.read_csv('./data_format1/train_format1.csv')
#user_info = pd.read_csv('./data_format1/user_info_format1.csv')
#user_log = pd.read_csv('./data_format1/user_log_format1.csv')
Data loading function
def read_csv(file_name, num_rows):
    return pd.read_csv(file_name, nrows=num_rows)
Memory reduction method
# reduce memory
def reduce_mem_usage(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
Compress the data in memory
num_rows = None
num_rows = 200 * 10000  # read at most 2,000,000 rows from each file for this demo
# num_rows = 1000  # use 1,000 rows when testing the code
train_file = './data_format1/train_format1.csv'
test_file = './data_format1/test_format1.csv'
user_info_file = './data_format1/user_info_format1.csv'
user_log_file = './data_format1/user_log_format1.csv'
train_data = reduce_mem_usage(read_csv(train_file, num_rows))
test_data = reduce_mem_usage(read_csv(test_file, num_rows))
user_info = reduce_mem_usage(read_csv(user_info_file, num_rows))
user_log = reduce_mem_usage(read_csv(user_log_file, num_rows))
Compression results
Memory usage after optimization is: 1.74 MB
Decreased by 70.8%
Memory usage after optimization is: 3.49 MB
Decreased by 41.7%
Memory usage after optimization is: 3.24 MB
Decreased by 66.7%
Memory usage after optimization is: 32.43 MB
Decreased by 69.6%
Inspect the compressed data
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 260864 non-null int32
1 merchant_id 260864 non-null int16
2 label 260864 non-null int8
dtypes: int16(1), int32(1), int8(1)
memory usage: 1.7 MB
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261477 entries, 0 to 261476
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 261477 non-null int32
1 merchant_id 261477 non-null int16
2 prob 0 non-null float64
dtypes: float64(1), int16(1), int32(1)
memory usage: 3.5 MB
user_info.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 424170 non-null int32
1 age_range 421953 non-null float16
2 gender 417734 non-null float16
dtypes: float16(2), int32(1)
memory usage: 3.2 MB
user_log.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 7 columns):
# Column Dtype
--- ------ -----
0 user_id int32
1 item_id int32
2 cat_id int16
3 seller_id int16
4 brand_id float16
5 time_stamp int16
6 action_type int8
dtypes: float16(1), int16(3), int32(2), int8(1)
memory usage: 32.4 MB
Data processing
Merge user info
del test_data['prob']
# DataFrame.append is deprecated in recent pandas; pd.concat([train_data, test_data]) is the equivalent
all_data = train_data.append(test_data)
all_data = all_data.merge(user_info, on=['user_id'], how='left')
del train_data, test_data, user_info
gc.collect()
 | user_id | merchant_id | label | age_range | gender
---|---|---|---|---|---
0 | 34176 | 3906 | 0.0 | 6.0 |
1 | 34176 | 121 | 0.0 | 6.0 |
2 | 34176 | 4356 | 1.0 | 6.0 |
3 | 34176 | 2217 | 0.0 | 6.0 |
4 | 230784 | 4818 | 0.0 | 0.0 |
Sort the user behaviour log by time
user_log = user_log.sort_values(['user_id','time_stamp'])
 | user_id | item_id | cat_id | seller_id | brand_id | time_stamp | action_type
---|---|---|---|---|---|---|---
61975 | 16 | 980982 | 437 | 650 | 4276.0 | 914 |
61976 | 16 | 980982 | 437 | 650 | 4276.0 | 914 |
61977 | 16 | 980982 | 437 | 650 | 4276.0 | 914 |
61978 | 16 | 962763 | 19 | 650 | 4276.0 | 914 |
61979 | 16 | 391126 | 437 | 650 | 4276.0 | 914 |
For each user, concatenate all of the item_id, cat_id, seller_id, brand_id, time_stamp and action_type values into path strings.
list_join_func = lambda x: " ".join([str(i) for i in x])
agg_dict = {
    'item_id': list_join_func,
    'cat_id': list_join_func,
    'seller_id': list_join_func,
    'brand_id': list_join_func,
    'time_stamp': list_join_func,
    'action_type': list_join_func
}
rename_dict = {
    'item_id': 'item_path',
    'cat_id': 'cat_path',
    'seller_id': 'seller_path',
    'brand_id': 'brand_path',
    'time_stamp': 'time_stamp_path',
    'action_type': 'action_type_path'
}
Delete the raw log and reclaim memory
del user_log
gc.collect()
Define statistics helper functions
Total count of values
def cnt_(x):
    try:
        # split() slices the string on the given separator; with n separators the result has n+1 sub-strings
        return len(x.split(' '))
    except:
        return -1
Count of unique values
def nunique_(x):
    try:
        return len(set(x.split(' ')))
    except:
        return -1
Maximum value
def max_(x):
    try:
        return np.max([int(i) for i in x.split(' ')])
    except:
        return -1
Minimum value
def min_(x):
    try:
        return np.min([int(i) for i in x.split(' ')])
    except:
        return -1
Standard deviation
def std_(x):
    try:
        return np.std([float(i) for i in x.split(' ')])
    except:
        return -1
N-th most frequent value
def most_n(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][0]
    except:
        return -1
Count of the N-th most frequent value
def most_n_cnt(x, n):
    try:
        return Counter(x.split(' ')).most_common(n)[n-1][1]
    except:
        return -1
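The per-row wrappers user_cnt and user_nunique used below are not shown in the original snippets. A minimal sketch, assuming they simply apply the helpers above to one path column and store the result under a new column name:

def user_cnt(df_data, single_col, name):
    # total number of records in the user's path column
    df_data[name] = df_data[single_col].apply(cnt_)
    return df_data

def user_nunique(df_data, single_col, name):
    # number of distinct values in the user's path column
    df_data[name] = df_data[single_col].apply(nunique_)
    return df_data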
Extract basic shop statistics features
all_data_test = all_data_path.head(2000)
#all_data_test = all_data_path
# Count the user's clicks, browses, add-to-carts and purchases
# Total number of actions
all_data_test = user_cnt(all_data_test, 'seller_path', 'user_cnt')
# Number of distinct shops
all_data_test = user_nunique(all_data_test, 'seller_path', 'seller_nunique')
# Number of distinct categories
all_data_test = user_nunique(all_data_test, 'cat_path', 'cat_nunique')
# Number of distinct brands
all_data_test = user_nunique(all_data_test, 'brand_path', 'brand_nunique')
# Number of distinct items
all_data_test = user_nunique(all_data_test, 'item_path', 'item_nunique')
# Number of active days
all_data_test = user_nunique(all_data_test, 'time_stamp_path', 'time_stamp_nunique')
# Number of distinct action types
all_data_test = user_nunique(all_data_test, 'action_type_path', 'action_type_nunique')
# ....
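The "...." above elides further statistics that appear in the column listing later (time_stamp_max/min/std/range and the most-frequent shop/category/brand features). A hedged sketch of how they could be derived from the helpers defined earlier; user_max and user_most_n are hypothetical wrappers, not part of the original code:

def user_max(df_data, single_col, name):
    # largest value in the path column, e.g. the latest day a user was active
    df_data[name] = df_data[single_col].apply(max_)
    return df_data

def user_most_n(df_data, single_col, name, n=1):
    # n-th most frequent value in the path column, e.g. the shop the user visited most often
    df_data[name] = df_data[single_col].apply(lambda x: most_n(x, n))
    return df_data

all_data_test = user_max(all_data_test, 'time_stamp_path', 'time_stamp_max')
all_data_test = user_most_n(all_data_test, 'seller_path', 'seller_most_1', n=1)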
Separate statistics for the user's clicks, add-to-carts, purchases and favourites
Helper functions for action-specific statistics
def col_cnt_(df_data, columns_list, action_type):
    # count the user's log records, optionally restricted to a single action type
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type is not None:
            col_list += ['action_type_path']
        for col in col_list:
            data_dict[col] = df_data[col].split(' ')
        path_len = len(data_dict[col])
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                # only keep records of the requested action type
                if action_type is None or data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            if data_txt != '':
                data_out.append(data_txt)
        return len(data_out)
    except:
        return -1

def col_nunique_(df_data, columns_list, action_type):
    # count the user's distinct column-value combinations, optionally restricted to a single action type
    try:
        data_dict = {}
        col_list = copy.deepcopy(columns_list)
        if action_type is not None:
            col_list += ['action_type_path']
        for col in col_list:
            data_dict[col] = df_data[col].split(' ')
        path_len = len(data_dict[col])
        data_out = []
        for i_ in range(path_len):
            data_txt = ''
            for col_ in columns_list:
                if action_type is None or data_dict['action_type_path'][i_] == action_type:
                    data_txt += '_' + data_dict[col_][i_]
            if data_txt != '':
                data_out.append(data_txt)
        return len(set(data_out))
    except:
        return -1

def user_col_cnt(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_cnt_(x, columns_list, action_type), axis=1)
    return df_data

def user_col_nunique(df_data, columns_list, action_type, name):
    df_data[name] = df_data.apply(lambda x: col_nunique_(x, columns_list, action_type), axis=1)
    return df_data
Count how many times shops were clicked, added to cart, purchased and favourited by the user
# number of clicks
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '0', 'user_cnt_0')
# number of add-to-carts
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '1', 'user_cnt_1')
# number of purchases
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '2', 'user_cnt_2')
# number of favourites
all_data_test = user_col_cnt(all_data_test, ['seller_path'], '3', 'user_cnt_3')
# number of distinct shops clicked
all_data_test = user_col_nunique(all_data_test, ['seller_path'], '0', 'seller_nunique_0')
# ....
Combined features
Extract business features from feature combinations
# number of clicks on shop-item combinations
all_data_test = user_col_cnt(all_data_test, ['seller_path', 'item_path'], '0', 'user_cnt_0')
# number of distinct shop-item combinations clicked
all_data_test = user_col_nunique(all_data_test, ['seller_path', 'item_path'], '0', 'seller_nunique_0')
Inspect the extracted features
list(all_data_test.columns)
['user_id',
 'merchant_id',
 'label',
 'age_range',
 'gender',
 'item_path',
 'cat_path',
 'seller_path',
 'brand_path',
 'time_stamp_path',
 'action_type_path',
 'user_cnt',
 'seller_nunique',
 'cat_nunique',
 'brand_nunique',
 'item_nunique',
 'time_stamp_nunique',
 'action_type_nunique',
 'time_stamp_max',
 'time_stamp_min',
 'time_stamp_std',
 'time_stamp_range',
 'seller_most_1',
 'cat_most_1',
 'brand_most_1',
 'action_type_1',
 'seller_most_1_cnt',
 'cat_most_1_cnt',
 'brand_most_1_cnt',
 'action_type_1_cnt',
 'user_cnt_0',
 'user_cnt_1',
 'user_cnt_2',
 'user_cnt_3',
 'seller_nunique_0']
Extract features with CountVectorizer and TF-IDF
CountVectorizer and TfidfVectorizer play different roles here.
CountVectorizer turns a document into a matrix of term counts, whereas TfidfVectorizer differs from CountVectorizer in that it returns TF-IDF weights rather than raw term frequencies.
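For reference, with scikit-learn's default settings (smooth_idf=True, followed by L2 normalisation of each row), the weight of term $t$ in document $d$ is

$$\mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \times \left( \ln\frac{1 + n}{1 + \mathrm{df}(t)} + 1 \right)$$

where $\mathrm{tf}(t, d)$ is the raw count of $t$ in $d$, $n$ is the number of documents, and $\mathrm{df}(t)$ is the number of documents containing $t$.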
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from scipy import sparse
# cntVec = CountVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1, 1), max_features=100)
tfidfVec = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, ngram_range=(1, 1), max_features=100)
# columns_list = ['seller_path', 'cat_path', 'brand_path', 'action_type_path', 'item_path', 'time_stamp_path']
columns_list = ['seller_path']
for i, col in enumerate(columns_list):
    all_data_test[col] = all_data_test[col].astype(str)
    tfidfVec.fit(all_data_test[col])
    data_ = tfidfVec.transform(all_data_test[col])
    if i == 0:
        data_cat = data_
    else:
        data_cat = sparse.hstack((data_cat, data_))
Rename the TF-IDF features and merge them
df_tfidf = pd.DataFrame(data_cat.toarray())
df_tfidf.columns = ['tfidf_' + str(i) for i in df_tfidf.columns]
all_data_test = pd.concat([all_data_test, df_tfidf],axis=1)
Embedding features
The embedded method lets the algorithm itself decide which features to use: feature selection and model training happen at the same time. We first train a machine-learning model and obtain a weight coefficient for each feature; these weights usually reflect how much each feature contributes to the model. For example, the feature_importances_ attribute of decision trees and tree ensembles lists each feature's contribution to building the trees, and we can select features by these weights from largest to smallest to find the ones most useful to the model. Compared with filter methods, the embedded method is therefore tied more precisely to the model's own effectiveness; and because it considers each feature's contribution to the model, irrelevant features (those a correlation filter would remove) and undiscriminating features (those a variance filter would remove) are dropped for lack of contribution.
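As a side note, a minimal sketch of that kind of embedded selection with scikit-learn (not part of the original pipeline; the estimator, threshold, and the placeholders X, y are illustrative):

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# fit a tree ensemble and keep only features whose importance exceeds the median importance
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=0),
                           threshold='median')
# X, y stand for a numeric feature matrix and its labels
# X_selected = selector.fit_transform(X, y)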
import gensim
# Train Word2Vec model (gensim 3.x API: in gensim 4.x use vector_size instead of size and wv.key_to_index instead of wv.vocab)
model = gensim.models.Word2Vec(all_data_test['seller_path'].apply(lambda x: x.split(' ')), size=100, window=5, min_count=5, workers=4)
# model.save("product2vec.model")
# model = gensim.models.Word2Vec.load("product2vec.model")
def mean_w2v_(x, model, size=100):
    # average the word vectors of all shops in the path; zeros if none are in the vocabulary
    try:
        i = 0
        vec = np.zeros(size)
        for word in x.split(' '):
            if word in model.wv.vocab:
                i += 1
                vec += model.wv[word]
        if i == 0:
            return np.zeros(size)
        return vec / i
    except:
        return np.zeros(size)

def get_mean_w2v(df_data, columns, model, size):
    data_array = []
    for index, row in df_data.iterrows():
        w2v = mean_w2v_(row[columns], model, size)
        data_array.append(w2v)
    return pd.DataFrame(data_array)
df_embedding = get_mean_w2v(all_data_test, 'seller_path', model, 100)
df_embedding.columns = ['embedding_' + str(i) for i in df_embedding.columns]
Merge the embedding features with the original features
all_data_test = pd.concat([all_data_test, df_embedding], axis=1)
Stacking features
Stacking regression features
"""
-- Regression
-- stacking regression features
"""
def stacking_reg(clf, train_x, train_y, test_x, clf_name, kf, label_split=None):
    train = np.zeros((train_x.shape[0], 1))
    test = np.zeros((test_x.shape[0], 1))
    test_pre = np.empty((folds, test_x.shape[0], 1))  # folds is defined globally before these functions are called
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf.split(train_x, label_split)):  # kf.split yields a (train, validation) index pair per fold
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ["rf", "ada", "gb", "et", "lr"]:  # random forest, AdaBoost, gradient boosting, extra trees, linear regression
            clf.fit(tr_x, tr_y)
            pre = clf.predict(te_x).reshape(-1, 1)
            train[test_index] = pre
            test_pre[i, :] = clf.predict(test_x).reshape(-1, 1)
            cv_scores.append(mean_squared_error(te_y, pre))
        elif clf_name in ["xgb"]:  # XGBoost
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x, label=te_y, missing=-1)
            params = {'booster': 'gbtree',
                      'eval_metric': 'rmse',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      'nthread': 12
                      }
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round, evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre = model.predict(test_matrix, ntree_limit=model.best_ntree_limit).reshape(-1, 1)
                train[test_index] = pre
                test_pre[i, :] = model.predict(z, ntree_limit=model.best_ntree_limit).reshape(-1, 1)
                cv_scores.append(mean_squared_error(te_y, pre))
        elif clf_name in ["lgb"]:  # LightGBM
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression_l2',
                'metric': 'mse',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'learning_rate': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                'nthread': 12,
                'silent': True,
            }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params, train_matrix, num_round, valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre = model.predict(te_x, num_iteration=model.best_iteration).reshape(-1, 1)
                train[test_index] = pre
                test_pre[i, :] = model.predict(test_x, num_iteration=model.best_iteration).reshape(-1, 1)
                cv_scores.append(mean_squared_error(te_y, pre))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:" % clf_name, cv_scores)
    test[:] = test_pre.mean(axis=0)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    return train.reshape(-1, 1), test.reshape(-1, 1)
def rf_reg(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestRegressor(n_estimators=600, max_depth=20, n_jobs=-1, random_state=2017, max_features="auto", verbose=1)
    rf_train, rf_test = stacking_reg(randomforest, x_train, y_train, x_valid, "rf", kf, label_split=label_split)
    return rf_train, rf_test, "rf_reg"

def ada_reg(x_train, y_train, x_valid, kf, label_split=None):
    adaboost = AdaBoostRegressor(n_estimators=30, random_state=2017, learning_rate=0.01)
    ada_train, ada_test = stacking_reg(adaboost, x_train, y_train, x_valid, "ada", kf, label_split=label_split)
    return ada_train, ada_test, "ada_reg"

def gb_reg(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingRegressor(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017, max_depth=5, verbose=1)
    gbdt_train, gbdt_test = stacking_reg(gbdt, x_train, y_train, x_valid, "gb", kf, label_split=label_split)
    return gbdt_train, gbdt_test, "gb_reg"

def et_reg(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesRegressor(n_estimators=600, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017, verbose=1)
    et_train, et_test = stacking_reg(extratree, x_train, y_train, x_valid, "et", kf, label_split=label_split)
    return et_train, et_test, "et_reg"

def lr_reg(x_train, y_train, x_valid, kf, label_split=None):
    lr_reg = LinearRegression(n_jobs=-1)
    lr_train, lr_test = stacking_reg(lr_reg, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return lr_train, lr_test, "lr_reg"

def xgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_reg(xgboost, x_train, y_train, x_valid, "xgb", kf, label_split=label_split)
    return xgb_train, xgb_test, "xgb_reg"

def lgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    lgb_train, lgb_test = stacking_reg(lightgbm, x_train, y_train, x_valid, "lgb", kf, label_split=label_split)
    return lgb_train, lgb_test, "lgb_reg"
Stacking classification features
"""
-- Classification
-- stacking classification features
"""
def stacking_clf(clf, train_x, train_y, test_x, clf_name, kf, label_split=None):
    train = np.zeros((train_x.shape[0], 1))
    test = np.zeros((test_x.shape[0], 1))
    test_pre = np.empty((folds, test_x.shape[0], 1))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf.split(train_x, label_split)):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
        if clf_name in ["rf", "ada", "gb", "et", "lr", "knn", "gnb"]:
            clf.fit(tr_x, tr_y)
            pre = clf.predict_proba(te_x)
            train[test_index] = pre[:, 0].reshape(-1, 1)
            test_pre[i, :] = clf.predict_proba(test_x)[:, 0].reshape(-1, 1)
            cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        elif clf_name in ["xgb"]:
            train_matrix = clf.DMatrix(tr_x, label=tr_y, missing=-1)
            test_matrix = clf.DMatrix(te_x, label=te_y, missing=-1)
            z = clf.DMatrix(test_x)
            params = {'booster': 'gbtree',
                      'objective': 'multi:softprob',
                      'eval_metric': 'mlogloss',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.03,
                      'tree_method': 'exact',
                      'seed': 2017,
                      "num_class": 2
                      }
            num_round = 10000
            early_stopping_rounds = 100
            watchlist = [(train_matrix, 'train'),
                         (test_matrix, 'eval')
                         ]
            if test_matrix:
                model = clf.train(params, train_matrix, num_boost_round=num_round, evals=watchlist,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)
                train[test_index] = pre[:, 0].reshape(-1, 1)
                test_pre[i, :] = model.predict(z, ntree_limit=model.best_ntree_limit)[:, 0].reshape(-1, 1)
                cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        elif clf_name in ["lgb"]:
            train_matrix = clf.Dataset(tr_x, label=tr_y)
            test_matrix = clf.Dataset(te_x, label=te_y)
            params = {
                'boosting_type': 'gbdt',
                # 'boosting_type': 'dart',
                'objective': 'multiclass',
                'metric': 'multi_logloss',
                'min_child_weight': 1.5,
                'num_leaves': 2**5,
                'lambda_l2': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'learning_rate': 0.03,
                'tree_method': 'exact',
                'seed': 2017,
                "num_class": 2,
                'silent': True,
            }
            num_round = 10000
            early_stopping_rounds = 100
            if test_matrix:
                model = clf.train(params, train_matrix, num_round, valid_sets=test_matrix,
                                  early_stopping_rounds=early_stopping_rounds
                                  )
                pre = model.predict(te_x, num_iteration=model.best_iteration)
                train[test_index] = pre[:, 0].reshape(-1, 1)
                test_pre[i, :] = model.predict(test_x, num_iteration=model.best_iteration)[:, 0].reshape(-1, 1)
                cv_scores.append(log_loss(te_y, pre[:, 0].reshape(-1, 1)))
        else:
            raise IOError("Please add new clf.")
        print("%s now score is:" % clf_name, cv_scores)
    test[:] = test_pre.mean(axis=0)
    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    return train.reshape(-1, 1), test.reshape(-1, 1)
def rf_clf(x_train, y_train, x_valid, kf, label_split=None):
    randomforest = RandomForestClassifier(n_estimators=1200, max_depth=20, n_jobs=-1, random_state=2017, max_features="auto", verbose=1)
    rf_train, rf_test = stacking_clf(randomforest, x_train, y_train, x_valid, "rf", kf, label_split=label_split)
    return rf_train, rf_test, "rf"

def ada_clf(x_train, y_train, x_valid, kf, label_split=None):
    adaboost = AdaBoostClassifier(n_estimators=50, random_state=2017, learning_rate=0.01)
    ada_train, ada_test = stacking_clf(adaboost, x_train, y_train, x_valid, "ada", kf, label_split=label_split)
    return ada_train, ada_test, "ada"

def gb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gbdt = GradientBoostingClassifier(learning_rate=0.04, n_estimators=100, subsample=0.8, random_state=2017, max_depth=5, verbose=1)
    gbdt_train, gbdt_test = stacking_clf(gbdt, x_train, y_train, x_valid, "gb", kf, label_split=label_split)
    return gbdt_train, gbdt_test, "gb"

def et_clf(x_train, y_train, x_valid, kf, label_split=None):
    extratree = ExtraTreesClassifier(n_estimators=1200, max_depth=35, max_features="auto", n_jobs=-1, random_state=2017, verbose=1)
    et_train, et_test = stacking_clf(extratree, x_train, y_train, x_valid, "et", kf, label_split=label_split)
    return et_train, et_test, "et"

def xgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    xgb_train, xgb_test = stacking_clf(xgboost, x_train, y_train, x_valid, "xgb", kf, label_split=label_split)
    return xgb_train, xgb_test, "xgb"

def lgb_clf(x_train, y_train, x_valid, kf, label_split=None):
    lgb_train, lgb_test = stacking_clf(lightgbm, x_train, y_train, x_valid, "lgb", kf, label_split=label_split)
    return lgb_train, lgb_test, "lgb"

def gnb_clf(x_train, y_train, x_valid, kf, label_split=None):
    gnb = GaussianNB()
    gnb_train, gnb_test = stacking_clf(gnb, x_train, y_train, x_valid, "gnb", kf, label_split=label_split)
    return gnb_train, gnb_test, "gnb"

def lr_clf(x_train, y_train, x_valid, kf, label_split=None):
    logisticregression = LogisticRegression(n_jobs=-1, random_state=2017, C=0.1, max_iter=200)
    lr_train, lr_test = stacking_clf(logisticregression, x_train, y_train, x_valid, "lr", kf, label_split=label_split)
    return lr_train, lr_test, "lr"

def knn_clf(x_train, y_train, x_valid, kf, label_split=None):
    kneighbors = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    knn_train, knn_test = stacking_clf(kneighbors, x_train, y_train, x_valid, "knn", kf, label_split=label_split)  # fixed: pass "knn" rather than "lr" as the clf_name
    return knn_train, knn_test, "knn"
Build the training and validation data (in preparation for the stacking features)
features_columns = [c for c in all_data_test.columns if c not in ['label', 'prob', 'seller_path', 'cat_path', 'brand_path', 'action_type_path', 'item_path', 'time_stamp_path']]
x_train = all_data_test[~all_data_test['label'].isna()][features_columns].values
y_train = all_data_test[~all_data_test['label'].isna()]['label'].values
x_valid = all_data_test[all_data_test['label'].isna()][features_columns].values
Handle inf and NaN values
def get_matrix(data):
    where_are_nan = np.isnan(data)
    where_are_inf = np.isinf(data)
    data[where_are_nan] = 0
    data[where_are_inf] = 0
    return data
x_train = np.float_(get_matrix(np.float_(x_train)))
y_train = np.int_(y_train)
x_valid = x_train  # for this demo the "validation" matrix simply reuses the processed training matrix
Import the data-splitting utilities and use 5 folds for the stacking features
Note: I did not understand the 5-fold idea at first; after looking it up, the explanation is as follows.
k-fold cross-validation averages the results of training on k different splits, which reduces variance and makes model performance less sensitive to how the data is partitioned.
- Step 1: randomly split the original data into k parts without replacement.
- Step 2: pick one part as the test set and use the remaining k-1 parts as the training set.
- Step 3: repeat step 2 k times so that every part serves as the test set exactly once; train a model on each training set, evaluate it on the corresponding test set, and record the evaluation metric.
- Step 4: average the k test results as the estimate of model accuracy and report it as the model's performance under k-fold cross-validation.
from sklearn.model_selection import StratifiedKFold, KFold
folds = 5
seed = 1
kf = KFold(n_splits=5, shuffle=True, random_state=0)  # note: the seed variable defined above is not actually used here
Use the LightGBM and XGBoost classifiers to build the stacking features
clf_list = [lgb_clf, xgb_clf]
clf_list_col = ['lgb_clf', 'xgb_clf']
Train the models and obtain the stacking features
clf_list = clf_list
column_list = []
train_data_list = []
test_data_list = []
for clf in clf_list:
    train_data, test_data, clf_name = clf(x_train, y_train, x_valid, kf, label_split=None)
    train_data_list.append(train_data)
    test_data_list.append(test_data)
train_stacking = np.concatenate(train_data_list, axis=1)
test_stacking = np.concatenate(test_data_list, axis=1)
Merge the original features with the stacking features
train = pd.DataFrame(np.concatenate([x_train, train_stacking], axis=1))
test = np.concatenate([x_valid, test_stacking], axis=1)
Rename the features
df_train_all = pd.DataFrame(train)
df_train_all.columns = features_columns + clf_list_col
df_test_all = pd.DataFrame(test)
df_test_all.columns = features_columns + clf_list_col
Get the data IDs and the label
df_train_all['label'] = all_data_test['label']
Save the training and test data
df_train_all.to_csv('train_all.csv',header=True,index=False)
df_test_all.to_csv('test_all.csv',header=True,index=False)