import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
train_data=pd.read_csv(r'C:\Users\win10\Desktop\詐騙電話分月數據\trainfinal.csv',dtype={'city_name': 'category', 'county_name': 'category'})
test_data=pd.read_csv(r'C:\Users\win10\Desktop\詐騙電話分月數據\testfinal.csv',encoding='GBK',dtype={'city_name': 'category', 'county_name': 'category'})
Compared with test_data, the training set has an extra label column:
train_label = train_data['label']                               # extract y
train_data.drop(['phone_no_m', 'label'], axis=1, inplace=True)  # drop y and the irrelevant ID column

# Extra import for the cross-validation metric
from sklearn.metrics import f1_score

def culatescore(predict, real):
    f1 = f1_score(real, predict, average='macro')
    scores.append(f1)
    return scores

params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'max_bin': 50,
    'max_depth': 6,
    'learning_rate': 0.02,
    'colsample_bytree': 0.8,  # fraction of features randomly selected per iteration
    'bagging_fraction': 0.8,  # fraction of data used per iteration
    'min_child_samples': 25,
    'n_jobs': -1,
    'verbose': -1,            # suppress per-iteration log output
    'seed': 1000,
}

# Re-initialise these lists each time this whole block is run.
results = []           # binarised validation predictions of the current fold
bigtestresults = []    # test-set predictions aggregated across all folds
smalltestresults = []  # test-set predictions of a single fold
scores = []            # f1 scores accumulated over the folds
cat = ["city_name", "county_name"]  # categorical features, passed as categorical_feature=cat

kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)  # the classes are heavily imbalanced
x, y = pd.DataFrame(train_data), pd.DataFrame(train_label)  # DataFrames, so .iloc etc. work below

for i, (train_index, valid_index) in enumerate(kf.split(x, y)):  # y must be passed in here
    print("Fold", i + 1)
    x_train, y_train = x.iloc[train_index], y.iloc[train_index]
    x_valid, y_valid = x.iloc[valid_index], y.iloc[valid_index]

    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat, silent=True)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train,
                           categorical_feature=cat, silent=True)
    gbm = lgb.train(params, lgb_train, num_boost_round=400,
                    valid_sets=[lgb_train, lgb_eval], categorical_feature=cat,
                    verbose_eval=100, early_stopping_rounds=200)
    # verbose_eval: print every N iterations; early_stopping_rounds: stop after N rounds without improvement.
    # categorical_feature: LightGBM handles nominal (categorical) features natively; this parameter tells
    # it which features are categorical. No one-hot encoding is needed: for every value of the feature it
    # performs a one-vs-others split and picks the value that gives the best split.
    # bagging_fraction: combined with bagging_freq it also speeds up training.
    vaild_preds = gbm.predict(x_valid, num_iteration=gbm.best_iteration)

    # Predict the test set
    test_pre = gbm.predict(test_data.iloc[:, 1:], num_iteration=gbm.best_iteration)

    threshold = 0.45       # decision threshold
    smalltestresults = []  # this fold's binarised test-set predictions
    # Binarise this fold's test-set predictions, then collect them in bigtestresults
    for w in test_pre:
        temp = 1 if w > threshold else 0
        smalltestresults.append(temp)
    bigtestresults.append(smalltestresults)

    # Binarise this fold's validation predictions and evaluate the f1 score
    results = []
    for pred in vaild_preds:
        result = 1 if pred > threshold else 0
        results.append(result)
    c = culatescore(results, y_valid)
    print(c)

print('--- N-fold cross-validation scores ---')
print(np.average(c))

# Turn the aggregated test-set predictions into a DataFrame and take the most
# frequent class in each column (majority vote) as the final prediction.
finalpres = pd.DataFrame(bigtestresults)
lss = []  # the final predictions
for i in finalpres.columns:
    temp1 = finalpres.iloc[:, i].value_counts().index[0]
    lss.append(temp1)
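A common alternative to the per-fold majority vote above is to average the predicted probabilities across folds and threshold once at the end; a minimal sketch, assuming the same kf, params, cat, x, y, and test_data as above:

import numpy as np

test_prob_sum = np.zeros(len(test_data))  # running sum of per-fold probabilities

for fold, (tr_idx, va_idx) in enumerate(kf.split(x, y)):
    lgb_tr = lgb.Dataset(x.iloc[tr_idx], y.iloc[tr_idx], categorical_feature=cat)
    lgb_va = lgb.Dataset(x.iloc[va_idx], y.iloc[va_idx], reference=lgb_tr,
                         categorical_feature=cat)
    model = lgb.train(params, lgb_tr, num_boost_round=400,
                      valid_sets=[lgb_va], early_stopping_rounds=200)
    test_prob_sum += model.predict(test_data.iloc[:, 1:],
                                   num_iteration=model.best_iteration)

# Average over the folds, then binarise once with the same threshold as above
test_prob_mean = test_prob_sum / kf.get_n_splits()
final_preds = (test_prob_mean > 0.45).astype(int)

Averaging soft probabilities tends to be less sensitive to a single fold's noise than voting on hard labels, at the cost of losing the simple "majority of models agree" interpretation.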
----------------------------------------------------------------------------The above is a simple application to a binary classification task, without any parameter tuning yet----------------------------------------------------------------------
Tuning code:
--------------------------Plain manual grid-search tuning: only suitable for all-numeric datasets with no categorical features----------------------------
# Set the initial parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.1,  # start relatively large for tuning speed; shrink it later
    'num_leaves': 250,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

# First tune n_estimators, the number of boosting iterations (residual trees);
# set nfold as large as you can afford.
lgb_train = lgb.Dataset(x, y, categorical_feature=cat, silent=True)
cv_results = lgb.cv(params, lgb_train, num_boost_round=1000, nfold=4,
                    stratified=False, shuffle=True, metrics='auc',
                    early_stopping_rounds=50, seed=0)
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())
best n_estimators: 325
best cv score: 0.9776274857159668
Then grid-search the remaining parameters:
# Tune max_depth / num_leaves
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

kf = KFold(n_splits=2, shuffle=True, random_state=123)  # used below for the grid search's cross-validation

params_test1 = {'max_depth': range(5, 8, 1), 'num_leaves': range(10, 60, 10)}
gsearch1 = GridSearchCV(
    estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                                 metrics='auc', learning_rate=0.1,
                                 n_estimators=325, max_depth=8,
                                 bagging_fraction=0.8, feature_fraction=0.8),
    param_grid=params_test1, scoring='roc_auc', cv=kf, n_jobs=-1)
gsearch1.fit(x, y)
gsearch1.best_params_, gsearch1.best_score_
({'max_depth': 7, 'num_leaves': 50}, 0.9643225388549975)
Carry on in the same way for the remaining parameters, following the order in the table below (a sketch of the later rounds follows the table).
parameter           step   notes
max_depth            1
num_leaves           1
min_data_in_leaf     2
max_bin              2
feature_fraction     3     fraction of features sampled when building each weak learner; default 1. Suggested candidates: [0.6, 0.7, 0.8, 0.9, 1]
bagging_fraction     3
bagging_freq         3
min_child_samples    4     minimum number of samples in a leaf node; default 20; guards against overfitting
min_child_weight     4     minimum sum of sample weights in a child node; if a leaf's weight sum falls below it, splitting stops; default 1. Suggested candidates: [1, 3, 5, 7]
lambda_l1            5
lambda_l2            5
min_gain_to_split    6     minimum loss reduction required to split a leaf; default 0; the larger the value, the more conservative the model. Suggested candidates: [0, 0.05~0.1, 0.3, 0.5, 0.7, 0.9, 1]

learning_rate, boosting_type, objective, metric, n_jobs: set once up front, not grid-searched.
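A sketch of how the later rounds might look, continuing the GridSearchCV pattern above in the order of the table; the grid ranges here are illustrative assumptions, not tuned values:

# Round 2: min_data_in_leaf / max_bin (hypothetical grid, centred on the defaults)
params_test2 = {'min_data_in_leaf': range(10, 60, 10), 'max_bin': range(50, 260, 50)}
gsearch2 = GridSearchCV(
    estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                                 learning_rate=0.1, n_estimators=325,
                                 max_depth=7, num_leaves=50),  # best values from round 1
    param_grid=params_test2, scoring='roc_auc', cv=kf, n_jobs=-1)
gsearch2.fit(x, y)
print(gsearch2.best_params_, gsearch2.best_score_)

# Round 3: feature_fraction / bagging_fraction / bagging_freq, and so on,
# always feeding the previous round's best values into the next estimator.
params_test3 = {'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
                'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
                'bagging_freq': range(0, 50, 10)}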
----------Fully automatic tuning, plus categorical_feature=cat. Note: when categorical_feature is used, the Dataset must be rebuilt on every loop iteration, otherwise the data "expires" (LightGBM frees the raw data after construction) and the call fails.-----------
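An alternative that may avoid rebuilding the Dataset on every iteration is to keep the raw data alive with free_raw_data=False; a minimal sketch, assuming the same x, y, cat, and params as elsewhere in this post:

# free_raw_data=False keeps the raw data attached, so the same Dataset
# object can be reused across repeated lgb.cv calls.
lgb_train = lgb.Dataset(x, y, categorical_feature=cat, free_raw_data=False)

for num_leaves in (31, 63, 127):  # illustrative candidate values
    params['num_leaves'] = num_leaves
    cv_results = lgb.cv(params, lgb_train, num_boost_round=1000, nfold=2,
                        metrics='auc', early_stopping_rounds=50, seed=0)
    print(num_leaves, pd.Series(cv_results['auc-mean']).max())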
# This tuning procedure assumes you have plenty of time.
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split  # optional

canceData = load_breast_cancer()
X = canceData.data
y = canceData.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

### Data conversion
print('Converting data')
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)  # optional

### Initial parameters, excluding the ones tuned by cross-validation
print('Setting parameters')
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.1,
}

### Cross-validated tuning
print('Cross-validation')
max_auc = 0.0
best_params = {}

# Accuracy
print("Step 1: improve accuracy")
for num_leaves in range(5, 100, 5):   # number of leaves; default 31; should be less than 2**max_depth
    for max_depth in range(3, 8, 1):  # maximum tree depth; default -1 (unlimited); a sensible cap prevents overfitting
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        cv_results = lgb.cv(params, lgb_train, seed=1, nfold=2, metrics=['auc'],
                            early_stopping_rounds=10, verbose_eval=True)
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' in best_params and 'max_depth' in best_params:
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# Overfitting
print("Step 2: reduce overfitting")
for max_bin in range(5, 256, 10):               # maximum number of bins used to bucket feature values
    for min_data_in_leaf in range(1, 102, 10):  # minimum number of samples per leaf
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        cv_results = lgb.cv(params, lgb_train, seed=1, nfold=2, metrics=['auc'],
                            early_stopping_rounds=10, verbose_eval=True)
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['max_bin'] = max_bin
            best_params['min_data_in_leaf'] = min_data_in_leaf
if 'max_bin' in best_params and 'min_data_in_leaf' in best_params:
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']
    params['max_bin'] = best_params['max_bin']

print("Step 3: reduce overfitting")
for feature_fraction in [0.6, 0.7, 0.8, 0.9, 1.0]:      # default 1; fraction of features used per iteration
    for bagging_fraction in [0.6, 0.7, 0.8, 0.9, 1.0]:  # default 1; fraction of data used per iteration; speeds up training and reduces overfitting
        for bagging_freq in range(0, 50, 5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq
            cv_results = lgb.cv(params, lgb_train, seed=1, nfold=2, metrics=['auc'],
                                early_stopping_rounds=10, verbose_eval=True)
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
            if mean_auc >= max_auc:
                max_auc = mean_auc
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq
if ('feature_fraction' in best_params and 'bagging_fraction' in best_params
        and 'bagging_freq' in best_params):
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

print("Step 4: reduce overfitting")
for lambda_l1 in [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]:
    for lambda_l2 in [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.4, 0.6, 0.7, 0.9, 1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(params, lgb_train, seed=1, nfold=2, metrics=['auc'],
                            early_stopping_rounds=10, verbose_eval=True)
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' in best_params and 'lambda_l2' in best_params:
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

print("Step 5: reduce overfitting, part 2")
# min_split_gain: minimum loss reduction required to split a leaf; default 0;
# the larger the value, the more conservative the model.
for min_split_gain in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    params['min_split_gain'] = min_split_gain
    cv_results = lgb.cv(params, lgb_train, seed=1, nfold=2, metrics=['auc'],
                        early_stopping_rounds=10, verbose_eval=True)
    mean_auc = pd.Series(cv_results['auc-mean']).max()
    boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
    if mean_auc >= max_auc:
        max_auc = mean_auc
        best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params:
    params['min_split_gain'] = best_params['min_split_gain']

print(best_params)
{'num_leaves': 5, 'max_depth': 7, 'max_bin': 255, 'min_data_in_leaf': 41, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 45, 'lambda_l1': 0.5, 'lambda_l2': 0.0, 'min_split_gain': 0.1}
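Once best_params is settled, a common last step (not shown in the original) is to merge the tuned values back into params, lower the learning rate, and re-run lgb.cv to pick the final number of boosting rounds; a sketch, assuming the params, best_params, and lgb_train from above (0.01 is an illustrative rate):

params.update(best_params)
params['learning_rate'] = 0.01  # smaller rate for the final model

cv_results = lgb.cv(params, lgb_train, num_boost_round=5000, nfold=4,
                    metrics='auc', early_stopping_rounds=50, seed=0)
best_rounds = len(cv_results['auc-mean'])
print('final rounds:', best_rounds, 'cv auc:', max(cv_results['auc-mean']))

# Retrain once with the tuned parameters and the round count found above
final_model = lgb.train(params, lgb_train, num_boost_round=best_rounds)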
If you still want to refine certain parameters, you can also pull a few of them out and tune them individually and more carefully:
### Initial parameters, excluding the ones tuned by cross-validation
print('Setting parameters')
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.1,
}

print('Cross-validation')
max_auc = 0.0
best_params = {}
for num_leaves in range(220, 230, 5):
    for max_depth in range(6, 7, 1):
        ### The data conversion must happen here, inside the loop; otherwise the Dataset expires and the call fails.
        x, y = pd.DataFrame(train_data), pd.DataFrame(train_label)  # DataFrames, so .iloc etc. work
        lgb_train = lgb.Dataset(x, y, categorical_feature=cat, silent=True)
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        cv_results = lgb.cv(params, lgb_train, num_boost_round=1000, nfold=2,
                            stratified=False, shuffle=True, metrics='auc',
                            early_stopping_rounds=50, seed=0)
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' in best_params and 'max_depth' in best_params:
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']
print('best cv score:', pd.Series(cv_results['auc-mean']).max())
print(best_params)
Setting parameters
Cross-validation
best cv score: 0.9625448274375306
{'num_leaves': 225, 'max_depth': 6}
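To close the loop, the tuned values can be fed back into the fraud-detection pipeline from the first section; a minimal sketch, assuming the params, best_params, cat, x, y, and test_data defined earlier:

params.update(best_params)  # e.g. {'num_leaves': 225, 'max_depth': 6}

# Retrain once on all the training data with the tuned parameters
lgb_full = lgb.Dataset(x, y, categorical_feature=cat)
model = lgb.train(params, lgb_full, num_boost_round=325)  # round count found by lgb.cv earlier

test_prob = model.predict(test_data.iloc[:, 1:])
test_pred = (test_prob > 0.45).astype(int)  # same threshold as in the CV section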