A simple hands-on example with LightGBM


import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import pandas as pd
import numpy as np

data_df = pd.read_csv('train.csv')
label = data_df['TARGET']
feature = data_df.drop(['TARGET','ID'],axis=1)
data_test = pd.read_csv('test.csv')
data_test_ID = data_test['ID']
data_test_feature = data_test.drop(['ID'], axis=1)

# One-hot encode train and test together so both end up with the same columns
feature_all = pd.concat([feature, data_test_feature])
feature_all = pd.get_dummies(feature_all, dummy_na=True, columns=None)
feature_train = feature_all.iloc[:len(feature), :]
feature_test = feature_all.iloc[len(feature):]

# Train the model
def train_model(data_X, data_y):
    from sklearn.model_selection import train_test_split
    X_train, x_test, Y_train, y_test = train_test_split(data_X, data_y, test_size=0.2, random_state=3)

    # Wrap the data in LightGBM's Dataset format, which makes loading faster
    lgb_train = lgb.Dataset(X_train, label=Y_train)
    lgb_eval = lgb.Dataset(x_test, label=y_test, reference=lgb_train)

    parameters = {
        'task': 'train',
        'max_depth': 15,
        'boosting_type': 'gbdt',
        'num_leaves': 20,          # number of leaves per tree
        'n_estimators': 50,
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.2,
        'feature_fraction': 0.7,   # if < 1.0, LightGBM randomly selects this fraction of features on each iteration
        'bagging_fraction': 1,     # like feature_fraction, but selects part of the data without resampling
        'bagging_freq': 3,         # bagging frequency: 0 disables bagging, k performs bagging every k iterations
        'lambda_l1': 0.5,
        'lambda_l2': 0,
        'cat_smooth': 10,          # for categorical features; reduces the effect of noise, especially for categories with few samples
        'is_unbalance': False,     # for binary classification; setting this to True here lowered the evaluation score by about 3 points
        'verbose': 0
    }
    evals_result = {}  # used to record the training results

    # Note: these keyword arguments follow the older lgb.train API; in lightgbm >= 4.0
    # early stopping, logging and result recording are done via callbacks instead.
    gbm_model = lgb.train(parameters,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_eval],
                          num_boost_round=50,          # number of boosting iterations
                          early_stopping_rounds=5,
                          evals_result=evals_result,
                          verbose_eval=10)

    prediction = gbm_model.predict(x_test, num_iteration=gbm_model.best_iteration)

    from sklearn.metrics import roc_auc_score
    auc_score = roc_auc_score(y_test, prediction)
    print(auc_score)

    return gbm_model, evals_result

model, evals_result = train_model(feature_train, label)
# Training output
[5]   training's auc: 0.946343   valid_1's auc: 0.94609
[10]  training's auc: 0.950425   valid_1's auc: 0.948894
[15]  training's auc: 0.954869   valid_1's auc: 0.950978
[20]  training's auc: 0.957274   valid_1's auc: 0.951505
[25]  training's auc: 0.958921   valid_1's auc: 0.95193
[30]  training's auc: 0.960303   valid_1's auc: 0.951958
Early stopping, best iteration is:
[24]  training's auc: 0.958674   valid_1's auc: 0.952064
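The code above prepares feature_test and data_test_ID but never actually scores the test set. A minimal sketch of doing so, not part of the original post, could look like this; the output file name and the 'ID'/'TARGET' column names are assumptions based on the training data's columns.

# Hypothetical follow-up: score the held-out test set with the trained booster
test_prediction = model.predict(feature_test, num_iteration=model.best_iteration)
submission = pd.DataFrame({'ID': data_test_ID, 'TARGET': test_prediction})  # column names assumed
submission.to_csv('submission.csv', index=False)  # output file name assumed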

 

# Visualize the training curves and the feature importance of the model
def lgb_importance():
    
    model,evals_result = train_model(feature_train,label)
    
    ax = lgb.plot_metric(evals_result, metric='auc')  # the metric here must match the 'metric' set in params
    plt.title('metric')
    plt.show()
    
    feature_names_pd = pd.DataFrame({'column': feature_train.columns,
                                     'importance': model.feature_importance(),
                                     })
    plt.figure(figsize=(10, 15))
    sns.barplot(x="importance", y="column", data=feature_names_pd.sort_values(by="importance", ascending=False))  # sort features by importance in descending order
    plt.title('LightGBM Features')
    plt.tight_layout()
    plt.show()


lgb_importance()
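Note that model.feature_importance() defaults to split-count importance, i.e. how many times each feature is used in a split. LightGBM also supports gain-based importance, which often ranks features differently; a small sketch reusing the trained model from above:

# Gain-based importance: total gain contributed by splits on each feature
gain_importance_pd = pd.DataFrame({'column': feature_train.columns,
                                   'importance': model.feature_importance(importance_type='gain')})
print(gain_importance_pd.sort_values(by='importance', ascending=False).head(20))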

Comparison of the AUC scores on the training set and the validation set.

 

The plot of feature importances covers all features and can serve as a reference for the earlier data preprocessing step, as sketched below.
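One way to act on that reference, sketched here as an assumption rather than something from the original post, is to drop the features the model never split on and retrain:

# Hypothetical follow-up: remove zero-importance features and retrain
importance_pd = pd.DataFrame({'column': feature_train.columns,
                              'importance': model.feature_importance()})
useless_columns = importance_pd.loc[importance_pd['importance'] == 0, 'column']
feature_train_reduced = feature_train.drop(columns=useless_columns)
model_reduced, evals_result_reduced = train_model(feature_train_reduced, label)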

 

 

 

