【集成學習】lightgbm使用案例


github地址

  1 #!/usr/bin/env python2
  2 # -*- coding: utf-8 -*-
  3 """
  4 Created on Sat Mar 31 21:19:09 2018
  5 
  6 @author: hello4720
  7 """
  8 import numpy as np
  9 import pandas as pd
 10 import lightgbm as lgb
 11 from sklearn import metrics
 12 from sklearn.model_selection import train_test_split
 13 
 14 ### 讀取數據
 15 print("載入數據")
 16 dataset1 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data1.csv')
 17 dataset2 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data2.csv')
 18 dataset3 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data3.csv')
 19 dataset4 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data4.csv')
 20 dataset5 = pd.read_csv('G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/7_train_data5.csv')
 21 
 22 dataset1.drop_duplicates(inplace=True)
 23 dataset2.drop_duplicates(inplace=True)
 24 dataset3.drop_duplicates(inplace=True)
 25 dataset4.drop_duplicates(inplace=True)
 26 dataset5.drop_duplicates(inplace=True)
 27 
 28 ### 數據合並
 29 print("數據合並")
 30 trains = pd.concat([dataset1,dataset2],axis=0)
 31 trains = pd.concat([trains,dataset3],axis=0)
 32 trains = pd.concat([trains,dataset4],axis=0)
 33 
 34 online_test = dataset5
 35 
 36 ### 數據拆分
 37 print("數據拆分")
 38 train_xy,offline_test = train_test_split(trains, test_size = 0.2,random_state=21)
 39 train,val = train_test_split(train_xy, test_size = 0.2,random_state=21)
 40 
 41 print("訓練集")
 42 y = train.is_trade                                                  # 訓練集標簽
 43 X = train.drop(['instance_id','is_trade'],axis=1)                   # 訓練集特征矩陣
 44 
 45 print("驗證集")
 46 val_y = val.is_trade                                                # 驗證集標簽
 47 val_X = val.drop(['instance_id','is_trade'],axis=1)                 # 驗證集特征矩陣
 48 
 49 print("測試集")
 50 offline_test_X=offline_test.drop(['instance_id','is_trade'],axis=1) # 線下測試特征矩陣
 51 online_test_X=online_test.drop(['instance_id'],axis=1)              # 線上測試特征矩陣
 52 
 53 ### 數據轉換
 54 lgb_train = lgb.Dataset(X, y, free_raw_data=False)
 55 lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train,free_raw_data=False)
 56 
 57 ### 開始訓練
 58 print('設置參數')
 59 params = {
 60             'boosting_type': 'gbdt',
 61             'boosting': 'dart',
 62             'objective': 'binary',
 63             'metric': 'binary_logloss',
 64 
 65             'learning_rate': 0.01,
 66             'num_leaves':25,
 67             'max_depth':3,
 68 
 69             'max_bin':10,
 70             'min_data_in_leaf':8,
 71 
 72             'feature_fraction': 0.6,
 73             'bagging_fraction': 1,
 74             'bagging_freq':0,
 75 
 76             'lambda_l1': 0,
 77             'lambda_l2': 0,
 78             'min_split_gain': 0
 79 }
 80 
 81 print("開始訓練")
 82 gbm = lgb.train(params,                     # 參數字典
 83                 lgb_train,                  # 訓練集
 84                 num_boost_round=2000,       # 迭代次數
 85                 valid_sets=lgb_eval,        # 驗證集
 86                 early_stopping_rounds=30)   # 早停系數
 87 ### 線下預測
 88 print ("線下預測")
 89 preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration) # 輸出概率
 90 offline=offline_test[['instance_id','is_trade']]
 91 offline['preds']=preds_offline
 92 offline.is_trade = offline['is_trade'].astype(np.float64)
 93 print('log_loss', metrics.log_loss(offline.is_trade, offline.preds))
 94 
 95 ### 線上預測
 96 print("線上預測")
 97 preds_online =  gbm.predict(online_test_X, num_iteration=gbm.best_iteration)  # 輸出概率
 98 online=online_test[['instance_id']]
 99 online['preds']=preds_online
100 online.rename(columns={'preds':'predicted_score'},inplace=True)
101 online.to_csv("./data/20180405.txt",index=None,sep=' ')
102 
### Save the model
# `sklearn.externals.joblib` was removed in scikit-learn 0.23; prefer the
# standalone joblib package and fall back to the bundled copy only on old
# installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(gbm, 'gbm.pkl')
106 
### Feature selection
# Pair every training feature name with the importance value reported by the
# trained booster, most important first.
df = pd.DataFrame(
    {
        'feature': X.columns.tolist(),
        'importance': list(gbm.feature_importance()),
    },
    columns=['feature', 'importance'],
)
df = df.sort_values(by='importance', ascending=False)
df.to_csv("./data/feature_score_20180405.csv", index=None, encoding='gbk')


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM