Steps for building a basic LightGBM model


 

### Basic tools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,roc_curve
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder,Imputer
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
warnings.filterwarnings("ignore")

os.chdir("C:/Users/my/Desktop/記錄/網約車/流失模型")
driver = pd.read_excel('樣本.xlsx')

y_test = driver["target"]
x_test = driver.drop(['a.driver_id','target'],axis = 1)

# Feature engineering
# Step 1: fill missing values and encode categorical variables
str_encoder = LabelEncoder()  # encoder for the categorical variable
str_encoder.fit(driver["a.contract_company"])
driver["a.contract_company"] = str_encoder.transform(driver["a.contract_company"])

# Step 2: preliminary feature screening -- drop variables whose share of non-zero values is too low (threshold 0.05 in the code below)
# ValueLess = []
# for i in x_train.columns:
#     ValuePct = driver[driver[i]>0][i].count()/driver[i].count()
#     if ValuePct < 0.05:
#         ValueLess.append(i)
#         print(ValueLess,ValuePct)
#
# SameValue = []
# for i in x_train.columns:
#     SameValuePct = driver[i].value_counts().max()/driver[i].count()
#     if SameValuePct > 0.95:   # near-constant feature: one value dominates the column
#         SameValue.append(i)
#         print(SameValue, SameValuePct)

#driver = driver.drop(ValueLess,axis = 1)
#driver = driver.drop(SameValue,axis = 1)

select_col = ['vehicle_level','max_days','min_days','min_score','tendcy']

os.chdir("C:/Users/my/Desktop/模型/第四版/")
driver = pd.read_excel('8.8訓練樣本.xlsx')


y = driver["target"]
x = driver.drop(['a.driver_id','target'],axis = 1)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)  # split into train and test sets

# Missing-value imputation (the original comment said "categorical conversion", but this block imputes NaNs;
# Imputer comes from scikit-learn < 0.22, newer versions use sklearn.impute.SimpleImputer)
from sklearn.preprocessing import LabelEncoder, Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)  # column-wise mean imputation
imp.fit(x_train)
# wrap the transformed arrays back into DataFrames so that column-based plotting below keeps working
x_train = pd.DataFrame(imp.transform(x_train), columns=x.columns)
x_test = pd.DataFrame(imp.transform(x_test), columns=x.columns)
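If you are on scikit-learn 0.22 or later, `Imputer` no longer exists; a minimal equivalent sketch of the same step using `SimpleImputer` would be:

from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')   # column-wise mean, same strategy as above
imp.fit(x_train)                                              # learn column means on the training split only
x_train = pd.DataFrame(imp.transform(x_train), columns=x.columns)
x_test = pd.DataFrame(imp.transform(x_test), columns=x.columns)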

# Step 2: variable analysis
fig = plt.figure()
fig.set(alpha=0.2)
# make matplotlib render Chinese labels and minus signs correctly
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# Look at variable distributions (both panels share one 2x2 figure, so plt.show() is called once at the end)
plt.subplot2grid((2, 2), (0, 0))
x_train['max_days'].plot(kind="kde", grid=True)
plt.title('max_days')
plt.subplot2grid((2, 2), (0, 1))
x_train['rest_rate'].plot(kind="kde", grid=True)
plt.title('rest_rate')
plt.show()

Variable distribution with seaborn (an illustrative snippet; `df_train.Age` stands in for any numeric column, e.g. x_train['max_days']; `%matplotlib inline` is a Jupyter magic):
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
np.random.seed(sum(map(ord, "distributions")))
sns.distplot(df_train.Age, kde=True, bins=20, rug=True)
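Note that `distplot` is deprecated in seaborn 0.11+; a rough equivalent on newer versions (assuming the same example column) is:

sns.histplot(df_train.Age, kde=True, bins=20)  # histplot/displot replace distplot in seaborn >= 0.11
sns.rugplot(df_train.Age)                      # the rug is drawn separately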

# Pearson correlation of each numeric variable with the target
driver.corr()['target'].sort_values(ascending=True)
# Multicollinearity check (VIF)
vif_x = np.matrix(x[select_col])
vif_list = [variance_inflation_factor(vif_x, i) for i in range(vif_x.shape[1])]  # iterate over the selected columns only
print(max(vif_list))

# Convenience evaluation function (KS and AUC)
def model_evaluate(model, x, y):
    y_prob = model.predict_proba(x)[:, 1]   # use predicted probabilities, not hard labels, for ROC-based metrics
    fpr, tpr, _ = roc_curve(y, y_prob)
    ks = abs(fpr - tpr).max()               # KS statistic = max |TPR - FPR|
    auc = roc_auc_score(y, y_prob)
    print('ks:', ks)
    print('auc:', auc)

# Step 3: modelling
# GBDT baseline for comparison
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(learning_rate=0.05, subsample=0.6, min_samples_split=90,
                                n_estimators=50, min_samples_leaf=10,
                                max_depth=15, max_features=15, random_state=10)
gb_model = gb.fit(x_train, y_train)
model_evaluate(gb_model, x_train, y_train)
model_evaluate(gb_model, x_test, y_test)

import xgboost as xgb
# initialise the model
xgb_classifier = xgb.XGBClassifier(n_estimators=20, max_depth=4, learning_rate=0.1,
                                   subsample=0.7, colsample_bytree=0.7)
# fit the model
xgb_classifier.fit(x_train, y_train)
model_evaluate(xgb_classifier, x_train, y_train)
model_evaluate(xgb_classifier, x_test, y_test)

xgb_y_train_pred = xgb_classifier.predict(x_train)               # hard labels, used later for precision/recall/F1
xgb_y_train_prob = xgb_classifier.predict_proba(x_train)[:, 1]   # probabilities for KS/AUC
fpr_xgb_train, tpr_xgb_train, _ = roc_curve(y_train, xgb_y_train_prob)
xgb_train_ks = abs(fpr_xgb_train - tpr_xgb_train).max()
xgb_train_auc = roc_auc_score(y_train, xgb_y_train_prob)
print("train_ks:", xgb_train_ks)
print("train_auc:", xgb_train_auc)

import lightgbm as lgb
# the estimator is named lgb_clf so it does not shadow the lightgbm module;
# num_iterations is an alias of n_estimators in LightGBM, so only n_estimators=800 is passed
lgb_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
                             max_depth=2, n_estimators=800, objective='binary',
                             subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
                             learning_rate=0.05, min_child_weight=50,
                             random_state=None, n_jobs=-1)
lgb_clf.fit(x_train, y_train)
model_evaluate(lgb_clf, x_train, y_train)
model_evaluate(lgb_clf, x_test, y_test)

# Random search over hyperparameters
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
gb = GradientBoostingClassifier(learning_rate=0.02, subsample=0.6, min_samples_split=70,
                                n_estimators=200, min_samples_leaf=40,
                                max_depth=4, max_features='sqrt', random_state=10)
gbParams = {'loss': ['deviance', 'exponential'],
            'n_estimators': randint(10, 500),
            'max_depth': randint(1, 20),
            'subsample': [0.5, 0.6, 0.7, 0.8],
            'min_samples_split': range(10, 101, 10),
            'min_samples_leaf': range(5, 51, 5),
            'learning_rate': [0.2, 0.1, 0.05, 0.02, 0.01],
            'max_features': randint(1, 20)}
randomizedSearchGB = RandomizedSearchCV(estimator=gb, param_distributions=gbParams,
                                        n_iter=10, scoring='roc_auc', cv=None,
                                        verbose=2).fit(x_train, y_train)
print(randomizedSearchGB.best_params_, randomizedSearchGB.best_score_)
bestGb = randomizedSearchGB.best_estimator_.fit(x_train, y_train)
model_evaluate(bestGb, x_train, y_train)

# Quick look at the usual classification metrics (these need hard labels, hence predict() above)
from sklearn.metrics import precision_score, recall_score, f1_score
print('Precision: %.3f' % precision_score(y_true=y_train, y_pred=xgb_y_train_pred))
print('Recall: %.3f' % recall_score(y_true=y_train, y_pred=xgb_y_train_pred))
print('F1: %.3f' % f1_score(y_true=y_train, y_pred=xgb_y_train_pred))

# Feature importance: tree models expose feature_importances_ (x.columns holds the original feature names)
gb_importance = pd.DataFrame({'cols': x.columns, 'gb': gb_model.feature_importances_}).sort_values('gb', ascending=False)
gb_importance

import pickle
# save the model
folderOfData = "C:/Users/my/Desktop/模型/"
saveModel = open(folderOfData + 'bestGb.pkl', 'wb')
pickle.dump(bestGb, saveModel)
saveModel.close()

# load the model
import pickle
folderOfData = "C:/Users/my/Desktop/模型/"
modelFile = open(folderOfData + 'bestGb.pkl', 'rb')
gb = pickle.load(modelFile)
modelFile.close()

# Test data
# Convert probabilities to scores
def Prob2Score(prob, basePoint, PDO):
    # map the predicted probability onto a credit-score-like scale
    y = np.log(prob / (1 - prob))
    return basePoint + PDO / np.log(2) * (-y)

basePoint = 300
PDO = 100
# xgb_y_pred was undefined in the original; here it is taken to be the XGBoost test-set probabilities
xgb_y_pred = xgb_classifier.predict_proba(x_test)[:, 1]
prob = pd.DataFrame({'prob': xgb_y_pred, 'y_test': y_test})
prob['score'] = prob['prob'].map(lambda x: Prob2Score(x, basePoint, PDO))
plt.hist(prob['score'], 100)
plt.style.use('seaborn')
plt.xlabel('score')
plt.ylabel('freq')
plt.title('distribution')
plt.show()

# Decision-tree rule extraction
from sklearn import tree
dtree = tree.DecisionTreeClassifier(max_depth=4, min_samples_leaf=7, min_samples_split=18)
dtree = dtree.fit(x, y)
import pydotplus
from sklearn.externals.six import StringIO  # on newer scikit-learn, use `from io import StringIO` instead
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
dot_data = StringIO()
tree.export_graphviz(dtree, out_file=dot_data, feature_names=x.columns,
                     class_names=['0', '1'], filled=True, rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("loss.pdf")
print('Visible tree plot saved as pdf.')
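If Graphviz is not installed, scikit-learn can print the same tree rules as plain text via `export_text` (available since scikit-learn 0.21); a minimal sketch using the `dtree` fitted above:

from sklearn.tree import export_text
print(export_text(dtree, feature_names=list(x.columns)))  # one line per split, leaves show the predicted class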

 

### Bayesian optimization model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split,cross_val_score
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score,roc_curve

import os
os.chdir('C:/Users/my/Desktop/')
data = pd.read_excel('訓練數據.xlsx',sheet_name = 'Sheet1')
print(data.columns)


y = data["label"]
x = data.drop(['passenger_id','label'],axis = 1)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)  # split into train and test sets
print(x_train.shape)
print(y_train.shape)


import xgboost as xgb

# Fit an XGBoost baseline (named xgb_clf so it does not shadow the xgboost module)
xgb_clf = xgb.XGBClassifier(base_score=0.5, booster='gbtree', learning_rate=0.1,
        max_depth=3, min_child_weight=1, n_estimators=100, n_jobs=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=0.7)
xgb_clf.fit(x_train, y_train)

xgb_y_train_prob = xgb_clf.predict_proba(x_train)[:, 1]  # probabilities, not hard labels, for ROC-based metrics
fpr_xgb_train, tpr_xgb_train, _ = roc_curve(y_train, xgb_y_train_prob)
xgb_train_ks = abs(fpr_xgb_train - tpr_xgb_train).max()
xgb_train_auc = roc_auc_score(y_train, xgb_y_train_prob)
print("train_ks:",xgb_train_ks)
print("train_auc:",xgb_train_auc)


from bayes_opt import BayesianOptimization
import lightgbm as lgb
def GBM_evaluate(min_child_samples, learning_rate, n_estimators, min_child_weight, num_leaves,
                 colsample_bytree, max_depth, subsample, reg_alpha, reg_lambda):
    """Custom model-evaluation function for Bayesian optimisation."""

    # 5-fold cross-validation; BayesianOptimization maximises the returned value,
    # so the mean cross-validated AUC is returned directly.
    val = cross_val_score(
        lgb.LGBMClassifier(objective= 'binary',metric='auc',random_state= 2018,
                learning_rate = float(learning_rate),
                n_estimators=int(n_estimators),
                max_depth = int(max_depth),
                num_leaves = int(num_leaves),
                min_child_samples = int(min_child_samples),
                subsample = float(subsample),
                colsample_bytree = float(colsample_bytree),
                reg_alpha = reg_alpha,
                reg_lambda = reg_lambda,
                min_child_weight = min_child_weight,
                class_weight = 'balanced' ),
        x_train, y_train, scoring='roc_auc', cv=5).mean()
    return val

# Search ranges for the tuned hyperparameters
adj_params = {'min_child_weight': (3, 20),
              'colsample_bytree': (0.4, 1),
              'n_estimators':(100,300),
              'learning_rate':(0.05,0.2),
              'max_depth': (5, 15),
              'num_leaves':(10, 50),
              'subsample': (0.5, 1),
              'reg_lambda': (0.1, 1),
              'reg_alpha': (0.1, 1),
              'min_child_samples': (10, 30)}
# Run Bayesian optimisation
num_iter = 25
init_points = 5

bayes = BayesianOptimization(GBM_evaluate,adj_params)
bayes.maximize(init_points=init_points, n_iter=num_iter)
params = bayes.max
print(params)
# {'target': 0.7452465518984774, 'params': {'colsample_bytree': 0.863774165376339,
#                                           'learning_rate': 0.05000062849693596,
#                                           'max_depth': 6.20154732653672,
#                                           'min_child_samples': 29.985852121149026,
#                                           'min_child_weight': 6.810125687159286,
#                                           'n_estimators': 170.32415049570488,
#                                           'num_leaves': 10.403716972233827,
#                                           'reg_alpha': 0.999999999999874,
#                                           'reg_lambda': 0.10000005514579893,
#                                           'subsample': 0.7261106692459622}}

#{'target': 0.752230340011879, 'params': {'colsample_bytree': 0.6766116352832452,
# 'learning_rate': 0.08410079723412914, 'max_depth': 6.009908969461344, 'min_child_samples': 10.45373385991692,
# 'min_child_weight': 5.299569525386938, 'n_estimators': 100.33382248028828, 'num_leaves': 10.861841362739199,
# 'reg_alpha': 0.7515529745843912, 'reg_lambda': 0.9773103767283371, 'subsample': 0.6742906352043163}}
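BayesianOptimization returns every parameter as a float (as the dumps above show), so integer-valued ones need casting before refitting. A minimal sketch of doing that programmatically, assuming the `bayes` object above (the hand-rounded model below does the same thing explicitly):

best = bayes.max['params']
int_keys = ['n_estimators', 'max_depth', 'num_leaves', 'min_child_samples']   # parameters LightGBM expects as integers
final_params = {k: (int(round(v)) if k in int_keys else v) for k, v in best.items()}
lgbm = lgb.LGBMClassifier(objective='binary', class_weight='balanced', random_state=2018, **final_params)
lgbm.fit(x_train, y_train)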


# num_iterations is an alias of n_estimators in LightGBM, so it is not passed separately here
lgbm = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
                   colsample_bytree=0.67, learning_rate=0.08,
                  max_depth=6, min_child_samples=10, min_child_weight=5.3,
                  n_estimators=100, num_leaves=10, reg_alpha=0.75, subsample_freq=1,
                  reg_lambda=0.9, subsample=0.67, random_state=None, n_jobs=-1,
                  class_weight='balanced')
lgbm.fit(x_train, y_train, eval_set=[(x_test, y_test)])
y_pred = lgbm.predict(x_train)                     # hard labels for precision/recall
y_train_prob = lgbm.predict_proba(x_train)[:, 1]   # probabilities for ROC-based metrics
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
fpr_lgb_train, tpr_lgb_train, _ = roc_curve(y_train, y_train_prob)
lgb_train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()
lgb_train_auc = roc_auc_score(y_train, y_train_prob)
print("train_ks:", lgb_train_ks)
print("train_auc:", lgb_train_auc)
print('precision:', precision_score(y_train, y_pred))
print('recall:', recall_score(y_train, y_pred))


y_pred_test = lgbm.predict(x_test)                 # hard labels for precision/recall
y_test_prob = lgbm.predict_proba(x_test)[:, 1]     # probabilities for ROC-based metrics
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve
fpr_lgb_test, tpr_lgb_test, _ = roc_curve(y_test, y_test_prob)
lgb_test_ks = abs(fpr_lgb_test - tpr_lgb_test).max()
lgb_test_auc = roc_auc_score(y_test, y_test_prob)
print("test_ks:", lgb_test_ks)
print("test_auc:", lgb_test_auc)
print('precision:', precision_score(y_test, y_pred_test))
print('recall:', recall_score(y_test, y_pred_test))



# Convert probabilities to scores
def Prob2Score(prob, basePoint, PDO):
    # map the predicted probability onto a credit-score-like scale (no rounding to integer is applied)
    y = np.log(prob/(1-prob))
    return (basePoint+PDO/np.log(2)*(-y))
y_pred_test=lgbm.predict_proba(x_test)[:,1]
basePoint = 300
PDO = 100
prob = pd.DataFrame({'prob':y_pred_test,'y_test':y_test})
prob['score'] = prob['prob'].map(lambda x:Prob2Score(x,basePoint,PDO))
plt.hist(prob['score'], 100)
plt.style.use('seaborn')
plt.xlabel('score')
plt.ylabel('freq')
plt.title('distribution')
plt.show()
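A quick sanity check on what basePoint and PDO mean on this scale: a probability of 0.5 maps exactly to basePoint, and every doubling of the odds p/(1-p) lowers the score by PDO points. A small illustrative snippet:

print(Prob2Score(0.5, basePoint, PDO))    # odds = 1 -> 300.0 = basePoint
print(Prob2Score(2/3, basePoint, PDO))    # odds = 2 -> about 200, i.e. basePoint - PDO
print(Prob2Score(0.5, basePoint, PDO) - Prob2Score(2/3, basePoint, PDO))  # about 100 = PDO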

 

 

# Tune n_estimators (value) over a fixed range and keep the value with the best generalisation score
min_value = 40
max_value = 60

# best_* must be initialised once, outside the loop, otherwise every iteration resets them
best_omd = -1
best_value = -1
best_ks = []

def lgb_test(train_x, train_y, test_x, test_y, n_estimators):
    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             objective='binary',
                             metric='auc',
                             learning_rate=0.1,
                             n_estimators=n_estimators,
                             max_depth=5,
                             num_leaves=20,
                             max_bin=45,
                             min_data_in_leaf=6,
                             bagging_fraction=0.6,
                             bagging_freq=0,
                             feature_fraction=0.8,
                             silent=True)
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (test_x, test_y)], eval_metric='auc')
    return clf, clf.best_score_['valid_1']['auc']

for value in range(min_value, max_value + 1):
    # the original referenced train_x/train_y/test_x/test_y/val_x/val_y, which were never defined in this script;
    # they are taken here to be the train/test split created earlier
    lgb_model, lgb_auc = lgb_test(x_train, y_train, x_test, y_test, value)

    y_pred = lgb_model.predict_proba(x_train)[:, 1]
    fpr_lgb_train, tpr_lgb_train, _ = roc_curve(y_train, y_pred)
    train_ks = abs(fpr_lgb_train - tpr_lgb_train).max()

    y_pred = lgb_model.predict_proba(x_test)[:, 1]
    fpr_lgb, tpr_lgb, _ = roc_curve(y_test, y_pred)
    val_ks = abs(fpr_lgb - tpr_lgb).max()

    # Omd rewards validation KS but penalises the train/validation gap (an overfitting penalty)
    Omd = val_ks + 0.8 * (val_ks - train_ks)
    if Omd > best_omd:
        best_omd = Omd
        best_value = value
        best_ks = [train_ks, val_ks]

print('best_value:', best_value)
print('best_ks:', best_ks)

 

