Binary classification with LightGBM

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
# Read the train/test sets; city_name and county_name are loaded as the pandas 'category' dtype
train_data=pd.read_csv(r'C:\Users\win10\Desktop\詐騙電話分月數據\trainfinal.csv',dtype={'city_name': 'category', 'county_name': 'category'})
test_data=pd.read_csv(r'C:\Users\win10\Desktop\詐騙電話分月數據\testfinal.csv',encoding='GBK',dtype={'city_name': 'category', 'county_name': 'category'})

train_data has one extra column, label, that test_data lacks.
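A quick way to confirm this (a minimal sketch; it only uses the two dataframes loaded above):

# Sketch: list the columns train_data has that test_data lacks; expected: {'label'}
print(set(train_data.columns) - set(test_data.columns))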

 

train_label=train_data['label']  # extract the target y
train_data.drop(['phone_no_m','label'],axis=1,inplace=True) # drop the target and the irrelevant ID column

# Cross-validation evaluation metric (helper)
from sklearn.metrics import f1_score
def culatescore(predict,real):
    f1=f1_score(real, predict, average='macro')
    scores.append(f1)   # accumulates into the global `scores` list across folds
    return scores

params = {
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 31,
        'max_bin': 50,
        'max_depth': 6,
        "learning_rate": 0.02,
        "colsample_bytree": 0.8,  # fraction of features randomly sampled in each iteration
        "bagging_fraction": 0.8,  # fraction of data used in each iteration
        'min_child_samples': 25,
        'n_jobs': -1,
        'silent': True,  # suppress LightGBM's per-iteration log output
        'seed': 1000,
    }  # model parameters

results=[]          # binarised validation predictions for the current fold
bigtestresults=[]   # per-fold test-set predictions, collected across all folds
smalltestresults=[] # test-set predictions for a single fold; all of these lists are re-initialised each time this block runs
scores=[]           # macro-F1 scores accumulated over all folds
cat = ["city_name", "county_name"]  # the categorical features, passed as categorical_feature=cat
kf=StratifiedKFold(n_splits=3,shuffle=True,random_state=123)  # stratified folds because the classes are heavily imbalanced
x,y=pd.DataFrame(train_data),pd.DataFrame(train_label)  # wrap as DataFrames so the .iloc indexing below works

for i,(train_index,valid_index) in enumerate(kf.split(x,y)): # y must be passed here for stratification
    print("Fold",i+1)
    x_train,y_train=x.iloc[train_index],y.iloc[train_index]
    x_valid,y_valid=x.iloc[valid_index],y.iloc[valid_index]  # slice out this fold's data
    lgb_train = lgb.Dataset(x_train, y_train,categorical_feature=cat,silent=True)
    lgb_eval  = lgb.Dataset(x_valid, y_valid, reference=lgb_train, categorical_feature=cat,silent=True)
    gbm = lgb.train(params, lgb_train, num_boost_round=400, valid_sets=[lgb_train, lgb_eval], categorical_feature=cat,verbose_eval=100,
                        early_stopping_rounds=200)
    # verbose_eval: print the scores every N iterations; early_stopping_rounds: stop when the score has not improved for N rounds
    # categorical_feature: LightGBM handles nominal (categorical) features natively; this parameter tells it which features are nominal.
    # No one-hot encoding is needed: for each value of the feature it tries a one-vs-others split and picks the value that splits best.
    # bagging_fraction: used together with bagging_freq it speeds up training
    valid_preds = gbm.predict(x_valid, num_iteration=gbm.best_iteration)
    # score the test set with the same model (iloc[:,1:] drops the phone_no_m column)
    test_pre = gbm.predict(test_data.iloc[:,1:], num_iteration=gbm.best_iteration)
    
    threshold = 0.45     # decision threshold for turning probabilities into labels
    smalltestresults=[]  # this fold's binarised test-set predictions
                         # binarise the test-set probabilities, then collect this fold's result in bigtestresults
    for w in test_pre:
        temp = 1 if w > threshold else 0
        smalltestresults.append(temp)
    bigtestresults.append(smalltestresults)

                         # binarise this fold's validation predictions and evaluate macro-F1
    results=[]
    for pred in valid_preds:
        result = 1 if pred > threshold else 0
        results.append(result)
    c=culatescore(results,y_valid)
    print(c)
print('--- mean cross-validation score ---')
print(np.average(c))
# Turn the collected per-fold test predictions into a DataFrame and take the most frequent label in each column as the final prediction.
finalpres=pd.DataFrame(bigtestresults)
lss=[]  # the final predictions
for i in finalpres.columns:
    temp1=finalpres.iloc[:,i].value_counts().index[0]   # majority vote across folds
    lss.append(temp1)
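To turn the majority vote into a submission file, the labels can be paired with the phone numbers again. A minimal sketch, assuming test_data's first column is phone_no_m as in the training file; 'submission.csv' is an assumed output name, not from the original post:

# Sketch: pair each phone number with its majority-vote label and save to CSV.
submission = pd.DataFrame({'phone_no_m': test_data['phone_no_m'], 'label': lss})
submission.to_csv('submission.csv', index=False)   # assumed filename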
Fold 1
Training until validation scores don't improve for 200 rounds
[100]	training's auc: 0.950557	valid_1's auc: 0.916566
[200]	training's auc: 0.97265	valid_1's auc: 0.937039
[300]	training's auc: 0.983987	valid_1's auc: 0.945772
[400]	training's auc: 0.990576	valid_1's auc: 0.951353
Did not meet early stopping. Best iteration is:
[400]	training's auc: 0.990576	valid_1's auc: 0.951353
[0.849546119365964]
Fold 2
Training until validation scores don't improve for 200 rounds
[100]	training's auc: 0.947291	valid_1's auc: 0.923428
[200]	training's auc: 0.969263	valid_1's auc: 0.942003
[300]	training's auc: 0.980737	valid_1's auc: 0.951233
[400]	training's auc: 0.98822	valid_1's auc: 0.956855
Did not meet early stopping. Best iteration is:
[400]	training's auc: 0.98822	valid_1's auc: 0.956855
[0.849546119365964, 0.8638736081868513]
Fold 3
Training until validation scores don't improve for 200 rounds
[100]	training's auc: 0.948401	valid_1's auc: 0.919948
[200]	training's auc: 0.970649	valid_1's auc: 0.939757
[300]	training's auc: 0.98241	valid_1's auc: 0.948706
[400]	training's auc: 0.9892	valid_1's auc: 0.954053
Did not meet early stopping. Best iteration is:
[400]	training's auc: 0.9892	valid_1's auc: 0.954053
[0.849546119365964, 0.8638736081868513, 0.8498867103198264]
--- mean cross-validation score ---
0.8544354792908807

 ---------------------------------------------------------------------------- The above is a simple run of the binary-classification task, with no parameter tuning yet ----------------------------------------------------------------------

 Tuning code:

-------------------------- Plain manual grid-search tuning: only suitable for datasets that are purely numeric, with no categorical features ----------------------------

# set the initial parameters
params = {    
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'n_jobs':-1,
          'learning_rate':0.1, # start relatively large; shrink it at the very end
          'num_leaves':250, 
          'max_depth': 8,   
          'subsample': 0.8, 
          'colsample_bytree': 0.8
          }

# first tune n_estimators (the number of boosting iterations / residual trees); make nfold as large as you can afford
lgb_train=lgb.Dataset(x, y,categorical_feature=cat,silent=True)
cv_results = lgb.cv(params,lgb_train, num_boost_round=1000, nfold=4, stratified=False, shuffle=True, metrics='auc',early_stopping_rounds=50,seed=0)
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())

 

best n_estimators: 325
best cv score: 0.9776274857159668
Then tune with grid search.

# tune max_depth / num_leaves
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

kf=KFold(n_splits=2,shuffle=True,random_state=123)  # reused below for GridSearchCV cross-validation
params_test1={'max_depth': range(5,8,1),
              'num_leaves':range(10, 60, 10)}

gsearch1 = GridSearchCV(estimator = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.1, n_estimators=325, max_depth=8, bagging_fraction = 0.8,feature_fraction = 0.8), 
                       param_grid = params_test1, scoring='roc_auc',cv=kf,n_jobs=-1)

gsearch1.fit(x,y)
gsearch1.best_params_, gsearch1.best_score_
({'max_depth': 7, 'num_leaves': 50}, 0.9643225388549975)

Continue in the same way, tuning one group of parameters per stage, in the order below (a sketch of stage 2 follows this list):

Stage 1: max_depth, num_leaves

Stage 2: min_data_in_leaf, max_bin

Stage 3: feature_fraction — fraction of features randomly sampled when building each weak learner, default 1; suggested candidates: [0.6, 0.7, 0.8, 0.9, 1]
         bagging_fraction, bagging_freq

Stage 4: min_child_samples — minimum number of samples in a leaf node, default 20, used to prevent overfitting
         min_child_weight — minimum sum of sample weights in a child node; if a leaf's weight sum falls below it, splitting stops; default 1; suggested candidates: [1, 3, 5, 7]

Stage 5: lambda_l1, lambda_l2

Stage 6: min_gain_to_split — minimum loss reduction required to split a leaf node, default 0; the larger the value, the more conservative the model; suggested candidates: [0, 0.05 ~ 0.1, 0.3, 0.5, 0.7, 0.9, 1]

Set up front rather than tuned: learning_rate, boosting_type, objective, metric, n_jobs.
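As a sketch of how the next round looks under the same pattern (stage 2: min_data_in_leaf / max_bin), reusing the stage-1 winners max_depth=7 and num_leaves=50 from above. The candidate ranges are illustrative assumptions, not from the original post; min_child_samples is the sklearn-API alias for min_data_in_leaf, and max_bin is forwarded to the booster as a keyword argument:

# Sketch: stage-2 grid search over min_data_in_leaf / max_bin
params_test2 = {'min_child_samples': range(10, 110, 20),   # alias for min_data_in_leaf
                'max_bin': range(55, 256, 50)}
gsearch2 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                                                     learning_rate=0.1, n_estimators=325,
                                                     max_depth=7, num_leaves=50),
                        param_grid=params_test2, scoring='roc_auc', cv=kf, n_jobs=-1)
gsearch2.fit(x, y)
print(gsearch2.best_params_, gsearch2.best_score_)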

 

---------- Fully automatic tuning, + categorical_feature=cat. Note: if categorical_feature is used, the data must be reloaded on every loop iteration, otherwise the Dataset goes stale. -----------
# this tuning approach assumes you have plenty of time
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import train_test_split  # optional
 
canceData=load_breast_cancer()
X=canceData.data
y=canceData.target
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)
 
### data conversion
print('Data conversion')
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) # optional
 
### set the initial parameters -- the cross-validation parameters are not included
print('Setting parameters')
params = {
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'n_jobs':-1,
          'learning_rate':0.1
          }
 
### cross-validation (tuning)
print('Cross-validation')
max_auc = 0.0
best_params = {}
 
# accuracy
print("Tuning stage 1: improve accuracy")
for num_leaves in range(5,100,5):  # number of leaves, default 31; should stay below 2^max_depth
    for max_depth in range(3,8,1):  # maximum tree depth, default -1 (no limit); a sensible limit helps prevent overfitting
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
 
        cv_results = lgb.cv(
                            params,
                            lgb_train,
                            seed=1,
                            nfold=2,
                            metrics=['auc'],
                            early_stopping_rounds=10,
                            verbose_eval=True
                            )
            
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
            
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' in best_params and 'max_depth' in best_params:  # fixed: `'a' and 'b' in d` only tests 'b'
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']
 
# overfitting
print("Tuning stage 2: reduce overfitting")
for max_bin in range(5,256,10):  # maximum number of histogram bins for numeric features
    for min_data_in_leaf in range(1,102,10):  # minimum number of samples per leaf
            params['max_bin'] = max_bin
            params['min_data_in_leaf'] = min_data_in_leaf
            
            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=1,
                                nfold=2,
                                metrics=['auc'],
                                early_stopping_rounds=10,
                                verbose_eval=True
                                )
                    
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
            if mean_auc >= max_auc:
                max_auc = mean_auc
                best_params['max_bin']= max_bin
                best_params['min_data_in_leaf'] = min_data_in_leaf
if 'max_bin' in best_params and 'min_data_in_leaf' in best_params:
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']
    params['max_bin'] = best_params['max_bin']
 
print("調參3:降低過擬合")
for feature_fraction in [0.6,0.7,0.8,0.9,1.0]:  #默認值為1;指定每次迭代所需要的特征部分;
    for bagging_fraction in [0.6,0.7,0.8,0.9,1.0]:  #默認值為1;指定每次迭代所需要的數據部分,並且它通常是被用來提升訓練速度和避免過擬合的。
        for bagging_freq in range(0,50,5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq
            
            cv_results = lgb.cv(
                                params,
                                lgb_train,
                                seed=1,
                                nfold=2,
                                metrics=['auc'],
                                early_stopping_rounds=10,
                                verbose_eval=True
                                )
                    
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
            if (mean_auc >= max_auc):
                max_auc=mean_auc
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq
 
if 'feature_fraction' in best_params and 'bagging_fraction' in best_params and 'bagging_freq' in best_params:
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']
 
 
print("調參4:降低過擬合")
for lambda_l1 in [1e-5,1e-3,1e-1,0.0,0.1,0.3,0.5,0.7,0.9,1.0]:
    for lambda_l2 in [1e-5,1e-3,1e-1,0.0,0.1,0.4,0.6,0.7,0.9,1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(
                            params,
                            lgb_train,
                            seed=1,
                            nfold=2,
                            metrics=['auc'],
                            early_stopping_rounds=10,
                            verbose_eval=True
                            )
                
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
        if( mean_auc >= max_auc):
            max_auc=mean_auc
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' in best_params and 'lambda_l2' in best_params:
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']
 
print("調參5:降低過擬合2")
for min_split_gain in [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]: #指定葉節點進行分支所需的損失減少的最小值,默認值為0。設置的值越大,模型就越保守。
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(
                        params,
                        lgb_train,
                        seed=1,
                        nfold=2,
                        metrics=['auc'],
                        early_stopping_rounds=10,
                        verbose_eval=True
                        )
            
    mean_auc = pd.Series(cv_results['auc-mean']).max()
    boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
    if mean_auc >= max_auc:
        max_auc=mean_auc
        
        best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
 
print(best_params)
{'num_leaves': 5, 'max_depth': 7, 'max_bin': 255, 'min_data_in_leaf': 41, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 45, 'lambda_l1': 0.5, 'lambda_l2': 0.0, 'min_split_gain': 0.1}
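With the search finished, the tuned values have already been merged back into params stage by stage, so a final model can be trained directly. A minimal sketch on the breast-cancer split from above; the round count of 100 is an assumed value, not from the original post:

# Sketch: fit a final booster with the tuned params and score the held-out split.
final_gbm = lgb.train(params, lgb_train, num_boost_round=100)  # 100 rounds is an assumption
test_pred = final_gbm.predict(X_test)
print('held-out AUC:', metrics.roc_auc_score(y_test, test_pred))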


If you want to refine particular parameters further, you can also pull a few of them out and search them individually:
### set the initial parameters -- the cross-validation parameters are not included
print('Setting parameters')
params = {
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'n_jobs':-1,
          'learning_rate':0.1
          }
print('Cross-validation')
max_auc = 0.0
best_params = {}
for num_leaves in range(220,230,5):
    for max_depth in range(6,7,1):
        ### the data conversion MUST happen here, inside the loop; otherwise it errors because the Dataset has gone stale
        x,y=pd.DataFrame(train_data),pd.DataFrame(train_label)  # wrap as DataFrames for the Dataset call below
        lgb_train=lgb.Dataset(x, y,categorical_feature=cat,silent=True)  
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        cv_results = lgb.cv(params,lgb_train, num_boost_round=1000, nfold=2, stratified=False, shuffle=True, metrics='auc',early_stopping_rounds=50,seed=0) 
        
            
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
            
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' in best_params and 'max_depth' in best_params:
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']
print('best cv score:', pd.Series(cv_results['auc-mean']).max())
print(best_params)
Setting parameters
Cross-validation
best cv score: 0.9625448274375306
{'num_leaves': 225, 'max_depth': 6}
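Finally, echoing the note next to learning_rate above (start large, shrink at the very end), a last pass can lower the learning rate and re-estimate the number of boosting rounds under the tuned parameters. A minimal sketch; the 0.01 value and the 5000-round cap are assumptions:

# Sketch: final pass with a smaller learning rate; rebuild the Dataset first (it goes stale).
params['learning_rate'] = 0.01   # assumed final value
x, y = pd.DataFrame(train_data), pd.DataFrame(train_label)
lgb_train = lgb.Dataset(x, y, categorical_feature=cat, silent=True)
cv_results = lgb.cv(params, lgb_train, num_boost_round=5000, nfold=4,
                    stratified=False, shuffle=True, metrics='auc',
                    early_stopping_rounds=50, seed=0)
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())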

 

