1 #!/usr/bin/env python2 2 # -*- coding: utf-8 -*- 3 """ 4 Created on Sat Mar 31 21:19:09 2018 5 6 @author: hello4720 7 """ 8 import numpy as np 9 import pandas as pd 10 import lightgbm as lgb 11 from sklearn import metrics 12 from sklearn.model_selection import train_test_split 13 14 ### 讀取數據 15 print("載入數據") 16 dataset1 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data1.csv') 17 dataset2 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data2.csv') 18 dataset3 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data3.csv') 19 dataset4 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data4.csv') 20 dataset5 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data5.csv') 21 22 dataset1.drop_duplicates(inplace=True) 23 dataset2.drop_duplicates(inplace=True) 24 dataset3.drop_duplicates(inplace=True) 25 dataset4.drop_duplicates(inplace=True) 26 dataset5.drop_duplicates(inplace=True) 27 28 ### 數據合並 29 print("數據合並") 30 trains = pd.concat([dataset1,dataset2],axis=0) 31 trains = pd.concat([trains,dataset3],axis=0) 32 trains = pd.concat([trains,dataset4],axis=0) 33 34 online_test = dataset5 35 36 ### 數據拆分 37 print("數據拆分") 38 train_xy,offline_test = train_test_split(trains, test_size = 0.2,random_state=21) 39 train,val = train_test_split(train_xy, test_size = 0.2,random_state=21) 40 41 print("訓練集") 42 y = train.is_trade # 訓練集標簽 43 X = train.drop(['instance_id','is_trade'],axis=1) # 訓練集特征矩陣 44 45 print("驗證集") 46 val_y = val.is_trade # 驗證集標簽 47 val_X = val.drop(['instance_id','is_trade'],axis=1) # 驗證集特征矩陣 48 49 print("測試集") 50 offline_test_X=offline_test.drop(['instance_id','is_trade'],axis=1) # 線下測試特征矩陣 51 online_test_X=online_test.drop(['instance_id'],axis=1) # 線上測試特征矩陣 52 53 ### 數據轉換 54 lgb_train = lgb.Dataset(X, y, free_raw_data=False) 55 lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train,free_raw_data=False) 56 57 ### 開始訓練 58 print('設置參數') 59 params = { 60 'boosting_type': 'gbdt', 61 'boosting': 'dart', 62 'objective': 'binary', 63 'metric': 'binary_logloss', 64 65 'learning_rate': 0.01, 66 'num_leaves':25, 67 'max_depth':3, 68 69 'max_bin':10, 70 'min_data_in_leaf':8, 71 72 'feature_fraction': 0.6, 73 'bagging_fraction': 1, 74 'bagging_freq':0, 75 76 'lambda_l1': 0, 77 'lambda_l2': 0, 78 'min_split_gain': 0 79 } 80 81 print("開始訓練") 82 gbm = lgb.train(params, # 參數字典 83 lgb_train, # 訓練集 84 num_boost_round=2000, # 迭代次數 85 valid_sets=lgb_eval, # 驗證集 86 early_stopping_rounds=30) # 早停系數 87 ### 線下預測 88 print ("線下預測") 89 preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration) # 輸出概率 90 offline=offline_test[['instance_id','is_trade']] 91 offline['preds']=preds_offline 92 offline.is_trade = offline['is_trade'].astype(np.float64) 93 print('log_loss', metrics.log_loss(offline.is_trade, offline.preds)) 94 95 ### 線上預測 96 print("線上預測") 97 preds_online = gbm.predict(online_test_X, num_iteration=gbm.best_iteration) # 輸出概率 98 online=online_test[['instance_id']] 99 online['preds']=preds_online 100 online.rename(columns={'preds':'predicted_score'},inplace=True) 101 online.to_csv("./data/20180405.txt",index=None,sep=' ') 102 103 ### 保存模型 104 from sklearn.externals import joblib 105 joblib.dump(gbm,'gbm.pkl') 106 107 ### 特征選擇 108 df = pd.DataFrame(X.columns.tolist(), columns=['feature']) 109 df['importance']=list(gbm.feature_importance()) 110 df = df.sort_values(by='importance',ascending=False) 111 df.to_csv("./data/feature_score_20180405.csv",index=None,encoding='gbk')
