import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
train_data=pd.read_csv(r'C:\Users\win10\Desktop\詐騙電話分月數據\trainfinal.csv',dtype={'city_name': 'category', 'county_name': 'category'})
test_data=pd.read_csv(r'C:\Users\win10\Desktop\詐騙電話分月數據\testfinal.csv',encoding='GBK',dtype={'city_name': 'category', 'county_name': 'category'})
Compared with test_data, the training set has an extra label column:
train_label = train_data['label']                               # extract y
train_data.drop(['phone_no_m', 'label'], axis=1, inplace=True)  # drop y and the irrelevant ID column

# Extra import for the cross-validation metric
from sklearn.metrics import f1_score

def culatescore(predict, real):
    f1 = f1_score(real, predict, average='macro')
    scores.append(f1)
    return scores

params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'max_bin': 50,
    'max_depth': 6,
    'learning_rate': 0.02,
    'colsample_bytree': 0.8,  # fraction of features randomly selected per iteration
    'bagging_fraction': 0.8,  # fraction of data used per iteration
    'min_child_samples': 25,
    'n_jobs': -1,
    'verbose': -1,            # suppress per-iteration log output
    'seed': 1000,
}

# Re-initialise these lists each time this whole block is run.
results = []           # binarised validation predictions of the current fold
bigtestresults = []    # test-set predictions aggregated across all folds
smalltestresults = []  # test-set predictions of a single fold
scores = []            # f1 scores accumulated over the folds
cat = ["city_name", "county_name"]  # categorical features, passed as categorical_feature=cat

kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)  # the classes are heavily imbalanced
x, y = pd.DataFrame(train_data), pd.DataFrame(train_label)  # DataFrames, so .iloc etc. work below

for i, (train_index, valid_index) in enumerate(kf.split(x, y)):  # y must be passed in here
    print("Fold", i + 1)
    x_train, y_train = x.iloc[train_index], y.iloc[train_index]
    x_valid, y_valid = x.iloc[valid_index], y.iloc[valid_index]

    lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat, silent=True)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train,
                           categorical_feature=cat, silent=True)
    gbm = lgb.train(params, lgb_train, num_boost_round=400,
                    valid_sets=[lgb_train, lgb_eval], categorical_feature=cat,
                    verbose_eval=100, early_stopping_rounds=200)
    # verbose_eval: print every N iterations; early_stopping_rounds: stop after N rounds without improvement.
    # categorical_feature: LightGBM handles nominal (categorical) features natively; this parameter tells
    # it which features are categorical. No one-hot encoding is needed: for every value of the feature it
    # performs a one-vs-others split and picks the value that gives the best split.
    # bagging_fraction: combined with bagging_freq it also speeds up training.
    vaild_preds = gbm.predict(x_valid, num_iteration=gbm.best_iteration)

    # Predict the test set
    test_pre = gbm.predict(test_data.iloc[:, 1:], num_iteration=gbm.best_iteration)

    threshold = 0.45       # decision threshold
    smalltestresults = []  # this fold's binarised test-set predictions
    # Binarise this fold's test-set predictions, then collect them in bigtestresults
    for w in test_pre:
        temp = 1 if w > threshold else 0
        smalltestresults.append(temp)
    bigtestresults.append(smalltestresults)

    # Binarise this fold's validation predictions and evaluate the f1 score
    results = []
    for pred in vaild_preds:
        result = 1 if pred > threshold else 0
        results.append(result)
    c = culatescore(results, y_valid)
    print(c)

print('--- N-fold cross-validation scores ---')
print(np.average(c))

# Turn the aggregated test-set predictions into a DataFrame and take the most
# frequent class in each column (majority vote) as the final prediction.
finalpres = pd.DataFrame(bigtestresults)
lss = []  # the final predictions
for i in finalpres.columns:
    temp1 = finalpres.iloc[:, i].value_counts().index[0]
    lss.append(temp1)
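A common alternative to the per-fold majority vote above is to average the predicted probabilities across folds and threshold once at the end; a minimal sketch, assuming the same kf, params, cat, x, y, and test_data as above:

import numpy as np

test_prob_sum = np.zeros(len(test_data))  # running sum of per-fold probabilities

for fold, (tr_idx, va_idx) in enumerate(kf.split(x, y)):
    lgb_tr = lgb.Dataset(x.iloc[tr_idx], y.iloc[tr_idx], categorical_feature=cat)
    lgb_va = lgb.Dataset(x.iloc[va_idx], y.iloc[va_idx], reference=lgb_tr,
                         categorical_feature=cat)
    model = lgb.train(params, lgb_tr, num_boost_round=400,
                      valid_sets=[lgb_va], early_stopping_rounds=200)
    test_prob_sum += model.predict(test_data.iloc[:, 1:],
                                   num_iteration=model.best_iteration)

# Average over the folds, then binarise once with the same threshold as above
test_prob_mean = test_prob_sum / kf.get_n_splits()
final_preds = (test_prob_mean > 0.45).astype(int)

Averaging soft probabilities tends to be less sensitive to a single fold's noise than voting on hard labels, at the cost of losing the simple "majority of models agree" interpretation.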
----------------------------------------------------------------------------The above is a simple application to a binary classification task, without any parameter tuning yet----------------------------------------------------------------------
Tuning code:
--------------------------Plain manual grid-search tuning: only suitable for all-numeric datasets with no categorical features----------------------------
# Set the initial parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.1,  # start relatively large for tuning speed; shrink it later
    'num_leaves': 250,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

# First tune n_estimators, the number of boosting iterations (residual trees);
# set nfold as large as you can afford.
lgb_train = lgb.Dataset(x, y, categorical_feature=cat, silent=True)
cv_results = lgb.cv(params, lgb_train, num_boost_round=1000, nfold=4,
                    stratified=False, shuffle=True, metrics='auc',
                    early_stopping_rounds=50, seed=0)
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())
best n_estimators: 325
best cv score: 0.9776274857159668
Then grid-search the remaining parameters:
# Tune max_depth / num_leaves
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

kf = KFold(n_splits=2, shuffle=True, random_state=123)  # used below for the grid search's cross-validation

params_test1 = {'max_depth': range(5, 8, 1), 'num_leaves': range(10, 60, 10)}
gsearch1 = GridSearchCV(
    estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                                 metrics='auc', learning_rate=0.1,
                                 n_estimators=325, max_depth=8,
                                 bagging_fraction=0.8, feature_fraction=0.8),
    param_grid=params_test1, scoring='roc_auc', cv=kf, n_jobs=-1)
gsearch1.fit(x, y)
gsearch1.best_params_, gsearch1.best_score_
({'max_depth': 7, 'num_leaves': 50}, 0.9643225388549975)
Carry on in the same way for the remaining parameters, following the order in the table below (a sketch of the later rounds follows the table).
parameter           step   notes
max_depth            1
num_leaves           1
min_data_in_leaf     2
max_bin              2
feature_fraction     3     fraction of features sampled when building each weak learner; default 1. Suggested candidates: [0.6, 0.7, 0.8, 0.9, 1]
bagging_fraction     3
bagging_freq         3
min_child_samples    4     minimum number of samples in a leaf node; default 20; guards against overfitting
min_child_weight     4     minimum sum of sample weights in a child node; if a leaf's weight sum falls below it, splitting stops; default 1. Suggested candidates: [1, 3, 5, 7]
lambda_l1            5
lambda_l2            5
min_gain_to_split    6     minimum loss reduction required to split a leaf; default 0; the larger the value, the more conservative the model. Suggested candidates: [0, 0.05~0.1, 0.3, 0.5, 0.7, 0.9, 1]

learning_rate, boosting_type, objective, metric, n_jobs: set once up front, not grid-searched.
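A sketch of how the later rounds might look, continuing the GridSearchCV pattern above in the order of the table; the grid ranges here are illustrative assumptions, not tuned values:

# Round 2: min_data_in_leaf / max_bin (hypothetical grid, centred on the defaults)
params_test2 = {'min_data_in_leaf': range(10, 60, 10), 'max_bin': range(50, 260, 50)}
gsearch2 = GridSearchCV(
    estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                                 learning_rate=0.1, n_estimators=325,
                                 max_depth=7, num_leaves=50),  # best values from round 1
    param_grid=params_test2, scoring='roc_auc', cv=kf, n_jobs=-1)
gsearch2.fit(x, y)
print(gsearch2.best_params_, gsearch2.best_score_)

# Round 3: feature_fraction / bagging_fraction / bagging_freq, and so on,
# always feeding the previous round's best values into the next estimator.
params_test3 = {'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
                'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
                'bagging_freq': range(0, 50, 10)}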
----------Fully automatic tuning, plus categorical_feature=cat. Note: when categorical_feature is used, the Dataset must be rebuilt on every loop iteration, otherwise the data "expires" (LightGBM frees the raw data after construction) and the call fails.-----------
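An alternative that may avoid rebuilding the Dataset on every iteration is to keep the raw data alive with free_raw_data=False; a minimal sketch, assuming the same x, y, cat, and params as elsewhere in this post:

# free_raw_data=False keeps the raw data attached, so the same Dataset
# object can be reused across repeated lgb.cv calls.
lgb_train = lgb.Dataset(x, y, categorical_feature=cat, free_raw_data=False)

for num_leaves in (31, 63, 127):  # illustrative candidate values
    params['num_leaves'] = num_leaves
    cv_results = lgb.cv(params, lgb_train, num_boost_round=1000, nfold=2,
                        metrics='auc', early_stopping_rounds=50, seed=0)
    print(num_leaves, pd.Series(cv_results['auc-mean']).max())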
# This tuning procedure assumes you have plenty of time.
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split  # optional

canceData = load_breast_cancer()
X = canceData.data
y = canceData.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

### Data conversion
print('Converting data')
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)  # optional

### Initial parameters, excluding the ones tuned by cross-validation
print('Setting parameters')
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.1,
}

### Cross-validated tuning
print('Cross-validation')
max_auc = 0.0
best_params = {}

# Accuracy
print("Step 1: improve accuracy")
for num_leaves in range(5, 100, 5):   # number of leaves; default 31; should be less than 2**max_depth
    for max_depth in range(3, 8, 1):  # maximum tree depth; default -1 (unlimited); a sensible cap prevents overfitting
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        cv_results = lgb.cv(params, lgb_train, seed=1, nfold=2, metrics=['auc'],
                            early_stopping_rounds=10, verbose_eval=True)
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' in best_params and 'max_depth' in best_params:
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']

# Overfitting
print("Step 2: reduce overfitting")
for max_bin in range(5, 256, 10):               # maximum number of bins used to bucket feature values
    for min_data_in_leaf in range(1, 102, 10):  # minimum number of samples per leaf
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
        cv_results = lgb.cv(params, lgb_train, seed=1, nfold=2, metrics=['auc'],
                            early_stopping_rounds=10, verbose_eval=True)
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['max_bin'] = max_bin
            best_params['min_data_in_leaf'] = min_data_in_leaf
if 'max_bin' in best_params and 'min_data_in_leaf' in best_params:
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']
    params['max_bin'] = best_params['max_bin']

print("Step 3: reduce overfitting")
for feature_fraction in [0.6, 0.7, 0.8, 0.9, 1.0]:      # default 1; fraction of features used per iteration
    for bagging_fraction in [0.6, 0.7, 0.8, 0.9, 1.0]:  # default 1; fraction of data used per iteration; speeds up training and reduces overfitting
        for bagging_freq in range(0, 50, 5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq
            cv_results = lgb.cv(params, lgb_train, seed=1, nfold=2, metrics=['auc'],
                                early_stopping_rounds=10, verbose_eval=True)
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
            if mean_auc >= max_auc:
                max_auc = mean_auc
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq
if ('feature_fraction' in best_params and 'bagging_fraction' in best_params
        and 'bagging_freq' in best_params):
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

print("Step 4: reduce overfitting")
for lambda_l1 in [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]:
    for lambda_l2 in [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.4, 0.6, 0.7, 0.9, 1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(params, lgb_train, seed=1, nfold=2, metrics=['auc'],
                            early_stopping_rounds=10, verbose_eval=True)
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' in best_params and 'lambda_l2' in best_params:
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

print("Step 5: reduce overfitting, part 2")
# min_split_gain: minimum loss reduction required to split a leaf; default 0;
# the larger the value, the more conservative the model.
for min_split_gain in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    params['min_split_gain'] = min_split_gain
    cv_results = lgb.cv(params, lgb_train, seed=1, nfold=2, metrics=['auc'],
                        early_stopping_rounds=10, verbose_eval=True)
    mean_auc = pd.Series(cv_results['auc-mean']).max()
    boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
    if mean_auc >= max_auc:
        max_auc = mean_auc
        best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params:
    params['min_split_gain'] = best_params['min_split_gain']

print(best_params)
{'num_leaves': 5, 'max_depth': 7, 'max_bin': 255, 'min_data_in_leaf': 41, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 45, 'lambda_l1': 0.5, 'lambda_l2': 0.0, 'min_split_gain': 0.1}
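Once best_params is settled, a common last step (not shown in the original) is to merge the tuned values back into params, lower the learning rate, and re-run lgb.cv to pick the final number of boosting rounds; a sketch, assuming the params, best_params, and lgb_train from above (0.01 is an illustrative rate):

params.update(best_params)
params['learning_rate'] = 0.01  # smaller rate for the final model

cv_results = lgb.cv(params, lgb_train, num_boost_round=5000, nfold=4,
                    metrics='auc', early_stopping_rounds=50, seed=0)
best_rounds = len(cv_results['auc-mean'])
print('final rounds:', best_rounds, 'cv auc:', max(cv_results['auc-mean']))

# Retrain once with the tuned parameters and the round count found above
final_model = lgb.train(params, lgb_train, num_boost_round=best_rounds)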
If you still want to refine certain parameters, you can also pull a few of them out and tune them individually and more carefully:
### Initial parameters, excluding the ones tuned by cross-validation
print('Setting parameters')
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'n_jobs': -1,
    'learning_rate': 0.1,
}

print('Cross-validation')
max_auc = 0.0
best_params = {}
for num_leaves in range(220, 230, 5):
    for max_depth in range(6, 7, 1):
        ### The data conversion must happen here, inside the loop; otherwise the Dataset expires and the call fails.
        x, y = pd.DataFrame(train_data), pd.DataFrame(train_label)  # DataFrames, so .iloc etc. work
        lgb_train = lgb.Dataset(x, y, categorical_feature=cat, silent=True)
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
        cv_results = lgb.cv(params, lgb_train, num_boost_round=1000, nfold=2,
                            stratified=False, shuffle=True, metrics='auc',
                            early_stopping_rounds=50, seed=0)
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' in best_params and 'max_depth' in best_params:
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']
print('best cv score:', pd.Series(cv_results['auc-mean']).max())
print(best_params)
Setting parameters
Cross-validation
best cv score: 0.9625448274375306
{'num_leaves': 225, 'max_depth': 6}
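To close the loop, the tuned values can be fed back into the fraud-detection pipeline from the first section; a minimal sketch, assuming the params, best_params, cat, x, y, and test_data defined earlier:

params.update(best_params)  # e.g. {'num_leaves': 225, 'max_depth': 6}

# Retrain once on all the training data with the tuned parameters
lgb_full = lgb.Dataset(x, y, categorical_feature=cat)
model = lgb.train(params, lgb_full, num_boost_round=325)  # round count found by lgb.cv earlier

test_prob = model.predict(test_data.iloc[:, 1:])
test_pred = (test_prob > 0.45).astype(int)  # same threshold as in the CV section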