Data source: https://www.kaggle.com/usdot/flight-delays
The full dataset contains more than 5 million flight records with 31 features.
This dataset is not ideal for modeling: if we take ARRIVAL_DELAY as the target y, later columns such as air-system delay and security delay look like causes of the delay rather than predictors. Let's first take a look at the data.
1. Since the dataset is too large, we sample a subset of it
#%% Imports
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rc("font", family="SimHei", size="12")  # font setting so Chinese characters display correctly
import pycard as pc

#%% Read the flights dataset
flights = pd.read_csv('D:/迅雷下載/flights.csv')
# Sample 1% of the dataset
flights = flights.sample(frac=0.01, random_state=10)
flights.shape  # (58191, 31)
2. Label flights with ARRIVAL_DELAY over 10 minutes as delayed
flights["y"] = (flights["ARRIVAL_DELAY"]>10)*1 flights.pop("ARRIVAL_DELAY")
3. Separate categorical and numerical variables
cate_col = flights.select_dtypes(include='object').columns.to_list()
num_col = [i for i in flights.columns if i not in cate_col]
4. Compute the IV value of each numerical variable
#%% IV calculation for the numerical variables
num_iv_woedf = pc.WoeDf()
clf = pc.NumBin()  # min_bin_samples=20, min_impurity_decrease=4e-5
for i in num_col:
    clf.fit(flights[i], flights.y)
    # clf.generate_transform_fun()
    num_iv_woedf.append(clf.woe_df_)
num_iv_woedf.to_excel('tmp1')
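For reference, the quantities pycard reports can be reproduced with plain pandas. For each bin i, WOE_i = ln(bad_i% / good_i%), and IV = Σ_i (bad_i% − good_i%) × WOE_i. Below is a minimal sketch; iv_numeric is a hypothetical helper, and it uses simple equal-frequency binning, whereas pc.NumBin (judging by its min_impurity_decrease parameter) appears to bin with a tree-based criterion:

# Minimal IV computation without pycard (equal-frequency bins; an assumption,
# not pycard's actual binning strategy)
def iv_numeric(x, y, bins=10):
    df = pd.DataFrame({'x': x, 'y': y})
    df['bin'] = pd.qcut(df['x'], q=bins, duplicates='drop')  # NaNs fall out of the bins
    g = df.groupby('bin', observed=True)['y'].agg(['sum', 'count'])
    bad_pct = g['sum'] / g['sum'].sum()                      # share of delayed flights per bin
    good_pct = (g['count'] - g['sum']) / (g['count'] - g['sum']).sum()
    woe = np.log(bad_pct / good_pct)
    return ((bad_pct - good_pct) * woe).sum()

# e.g. iv_numeric(flights['DEPARTURE_DELAY'], flights['y'])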
From the exported IV table, the remaining few variables have IV values that are far too large.
Let's take a closer look at the last five of them.
These variables are themselves causes of the delay, so we must not take them as features. (In general, if you hit a WOE equal to infinity and do not understand what the field means, you may still keep those variables as features; but here they are clearly not legitimate features, so we delete them.)
5. Categorical variables
# Categorical variables
cate_iv_woedf = pc.WoeDf()
for i in cate_col:
    cate_iv_woedf.append(pc.cross_woe(flights[i], flights.y))
cate_iv_woedf.to_excel('tmp2')
The categorical variables have too many distinct levels, so we will not use them for now; the quick check below confirms this.
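A one-line way to verify the claim is to count the distinct levels per categorical column (a minimal check, not in the original post):

flights[cate_col].nunique().sort_values(ascending=False)  # levels per categorical column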
6. Some intermediate processing
This step removes the variables with low IV values.
drop_num = ["MONTH", "FLIGHT_NUMBER", "DIVERTED", "DAY", "DAY_OF_WEEK",l "DISTANCE", "SCHEDULED_TIME", "YEAR"] # 去掉這些 num_col = [i for i in num_col if i not in drop_num] num_iv_woedf = pc.WoeDf() clf = pc.NumBin() #min_bin_samples=20, min_impurity_decrease=4e-5 for i in num_col: clf.fit(flights[i] ,flights.y) #flights[i+'_bin'] = clf.transform(flights[i]) #這樣可以省略掉后面轉換成_bin的一步驟 num_iv_woedf.append(clf.woe_df_)
Correlation handling
flights[num_col].corr().to_excel('tmp2.xlsx')  # export the correlation matrix for inspection
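Rather than eyeballing the exported spreadsheet, one can flag highly correlated pairs directly; a minimal sketch, with the 0.8 threshold as an assumption rather than a value from the original post:

# List feature pairs whose absolute correlation exceeds 0.8
corr = flights[num_col].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
high_pairs = [(a, b, round(upper.loc[a, b], 2))
              for a in upper.index for b in upper.columns
              if pd.notna(upper.loc[a, b]) and upper.loc[a, b] > 0.8]
print(high_pairs)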
Then we delete the five delay-cause fields discussed above; a sketch of this step is shown below.
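The drop itself is not shown in the original post. Assuming the five delay-cause columns carry the names used in this Kaggle dataset's schema, it would look like this (step 7 then restates the final list explicitly anyway):

# Hypothetical: the five column names below are assumed from the Kaggle schema
leak_cols = ['AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
             'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY']
num_col = [i for i in num_col if i not in leak_cols]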
7. Final modeling variables
num_col = ["DEPARTURE_DELAY", "TAXI_OUT", "WHEELS_OFF", "ELAPSED_TIME", "TAXI_IN", "SCHEDULED_ARRIVAL", "ARRIVAL_TIME", "CANCELLED"] num_iv_woedf = pc.WoeDf() clf = pc.NumBin() #min_bin_samples=20, min_impurity_decrease=4e-5 for i in num_col: clf.fit(flights[i] ,flights.y) flights[i+'_bin'] = clf.transform(flights[i]) #這樣可以省略掉后面轉換成_bin的一步驟 num_iv_woedf.append(clf.woe_df_)
8. WOE transformation
#%% WOE transformation
bin_col = [i for i in list(flights.columns) if i[-4:] == '_bin']

cate_iv_woedf = pc.WoeDf()
for i in bin_col:
    cate_iv_woedf.append(pc.cross_woe(flights[i], flights.y))
# cate_iv_woedf.to_excel('tmp1')
cate_iv_woedf.bin2woe(flights, bin_col)
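Under the hood, bin2woe replaces each bin label with that bin's WOE value. As a conceptual sketch only (assuming the WOE = ln(bad% / good%) convention from step 4; this is not pycard's actual source):

# Conceptual bin -> WOE mapping (hypothetical helper, not pycard's code)
def woe_map(df, bin_col, target):
    g = df.groupby(bin_col, observed=True)[target].agg(['sum', 'count'])
    bad = g['sum']                 # delayed flights per bin
    good = g['count'] - g['sum']   # on-time flights per bin
    return np.log((bad / bad.sum()) / (good / good.sum())).to_dict()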
9. Modeling
model_col = [i for i in list(flights.columns) if i[-4:] == '_woe']

import pandas as pd
import matplotlib.pyplot as plt  # plotting library
import matplotlib
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

X = flights[model_col]
Y = flights['y']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)

X1 = sm.add_constant(x_train)  # prepend a constant column of 1s so the regression has an intercept
logit = sm.Logit(y_train.astype(float), X1.astype(float))
result = logit.fit()
result.summary()
result.params
10. Performance on the training set
resu_1 = result.predict(X1.astype(float))
fpr, tpr, threshold = roc_curve(y_train, resu_1)
rocauc = auc(fpr, tpr)  # 0.9599064590567914
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.show()

# Now let's look at the confusion matrix
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
resu_1 = resu_1.apply(lambda x: 1 if x >= 0.5 else 0)  # threshold the predicted probabilities at 0.5
matrix = confusion_matrix(y_train, resu_1)
print("Confusion matrix:\n", matrix)
print("Precision:", precision_score(y_train, resu_1))
print("Recall:", recall_score(y_train, resu_1))
print("F1 score:", f1_score(y_train, resu_1))
'''
Confusion matrix:
 [[33506   866]
 [ 2305  6966]]
Precision: 0.8894279877425945
Recall: 0.7513752561751699
F1 score: 0.8145939308893178
'''
11. Performance on the test set
#%% Test set
X3 = sm.add_constant(x_test)
resu = result.predict(X3.astype(float))
fpr, tpr, threshold = roc_curve(y_test, resu)
rocauc = auc(fpr, tpr)  # 0.9618011576768271
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % rocauc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.show()

# Confusion matrix on the test set
resu = resu.apply(lambda x: 1 if x >= 0.5 else 0)
matrix = confusion_matrix(y_test, resu)
print("Confusion matrix:\n", matrix)
print("Precision:", precision_score(y_test, resu))
print("Recall:", recall_score(y_test, resu))
print("F1 score:", f1_score(y_test, resu))
'''
Confusion matrix:
 [[11168   276]
 [  740  2364]]
Precision: 0.8954545454545455
Recall: 0.7615979381443299
F1 score: 0.8231197771587743
'''
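As a small addition (not in the original post), the KS statistic, a common scorecard evaluation metric, falls out of the same ROC arrays computed above:

# KS = maximum separation between the cumulative bad and good rates
ks = (tpr - fpr).max()
print('KS =', ks)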
Summary:
Because of the issues with the data, this dataset is not very typical; it is only suitable for practice, so I will not dig any deeper here.