import mglearn from numpy import int64 from sklearn import metrics from sklearn.calibration import CalibratedClassifierCV from sklearn.model_selection import GridSearchCV, KFold from sklearn.model_selection import cross_val_score import matplotlib.pyplot as plt import pandas as pd import numpy as np from sklearn.naive_bayes import GaussianNB from sklearn.neural_network import MLPClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression, Lasso, LinearRegression, LogisticRegressionCV from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVR, SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report import seaborn as sns from sklearn.preprocessing import PolynomialFeatures from sklearn.decomposition import PCA, TruncatedSVD from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFECV from sklearn.metrics import roc_auc_score import warnings warnings.filterwarnings('always') warnings.filterwarnings('ignore') sns.set(style="darkgrid") plt.rcParams['font.sans-serif'] = ['SimHei'] # 用來正常顯示中文標簽 plt.rcParams['axes.unicode_minus'] = False # 用來正常顯示 # 字段說明 # # NO 字段名稱 數據類型 字段描述 # 1 ID Int 客戶唯一標識 # 2 age Int 客戶年齡 # 3 job String 客戶的職業 # 4 marital String 婚姻狀況 # 5 education String 受教育水平 # 6 default String 是否有違約記錄 # 7 balance Int 每年賬戶的平均余額 # 8 housing String 是否有住房貸款 # 9 loan String 是否有個人貸款 # 10 contact String 與客戶聯系的溝通方式 # 11 day Int 最后一次聯系的時間(幾號) # 12 month String 最后一次聯系的時間(月份) # 13 duration Int 最后一次聯系的交流時長 # 14 campaign Int 在本次活動中,與該客戶交流過的次數 # 15 pdays Int 距離上次活動最后一次聯系該客戶,過去了多久(999表示沒有聯系過) # 16 previous Int 在本次活動之前,與該客戶交流過的次數 # 17 poutcome String 上一次活動的結果 # 18 y Int 預測客戶是否會訂購定期存款業務 from sklearn.tree import DecisionTreeClassifier data_train = pd.read_csv('train_set.csv') data_test = pd.read_csv('test_set.csv') ids_test = data_test['ID'] print(data_train.shape[0]) # data_train['cppv']=data_train['campaign']+data_train['previous'] # data_test['cppv']=data_test['campaign']+data_test['previous'] # data_train.drop(['campaign','previous'], axis=1, inplace=True) # data_test.drop(['campaign','previous'], axis=1, inplace=True) # Rela_grouped=data_train.groupby(['cp']) # Rela_Survival_Rate=(Rela_grouped.sum()/Rela_grouped.count())['y'] # Rela_count=Rela_grouped.count()['y'] # # ax1=Rela_count.plot(kind='bar',color='g') # ax2=ax1.twinx() # ax2.plot(Rela_Survival_Rate.values,color='r') # ax1.set_xlabel('Relatives') # ax1.set_ylabel('Number') # ax2.set_ylabel('Survival Rate') # plt.title('Survival Rate by Relatives') # plt.grid(True,linestyle='-',color='0.7') # plt.show() # g = sns.FacetGrid(data_train, col='y') # g.map(plt.hist, 'day', bins=30) # plt.show() print("數值處理1:標簽指標one-hot編碼處理") data_train.drop(['ID'], axis=1, inplace=True) data_test.drop(['ID'], axis=1, inplace=True) dummy = pd.get_dummies(data_train[['month','job','marital','education','default','housing','loan','contact','poutcome']]) dummyTest = pd.get_dummies(data_test[['month','job','marital','education','default','housing','loan','contact','poutcome']]) data_train = pd.concat([dummy, data_train], axis=1) data_train.drop(['job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1) data_test = pd.concat([dummyTest, data_test], axis=1) data_test.drop(['job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1) data_train['day'].replace([30,13,15,4,14,12,18],4,inplace=True) data_train['day'].replace([5,20,21,11,8,16,2,3],3,inplace=True) data_train['day'].replace([17,9,6,27,7,22,28],2,inplace=True) data_train['day'].replace([23,25,26,10,29,19],1,inplace=True) data_train['day'].replace([1,24,31],0,inplace=True) data_test['day'].replace([30,13,15,4,14,12,18],4,inplace=True) data_test['day'].replace([5,20,21,11,8,16,2,3],3,inplace=True) data_test['day'].replace([17,9,6,27,7,22,28],2,inplace=True) data_test['day'].replace([23,25,26,10,29,19],1,inplace=True) data_test['day'].replace([1,24,31],0,inplace=True) # data_train['month1'] = data_train.month.apply(lambda x: 4 if x in ['may'] else 0) # data_train['month1'] = data_train.month.apply(lambda x: 3 if x in ['aug','jul','apr'] else 0) # data_train['month1'] = data_train.month.apply(lambda x: 2 if x in ['jun','feb','nov','oct'] else 0) # data_train['month1'] = data_train.month.apply(lambda x: 1 if x in ['sep','mar'] else 0) # # data_test['month1'] = data_test.month.apply(lambda x: 4 if x in ['may'] else 0) # data_test['month1'] = data_test.month.apply(lambda x: 3 if x in ['aug','jul','apr'] else 0) # data_test['month1'] = data_test.month.apply(lambda x: 2 if x in ['jun','feb','nov','oct'] else 0) # data_test['month1'] = data_test.month.apply(lambda x: 1 if x in ['sep','mar'] else 0) # # data_train.drop(['month'], inplace=True, axis=1) data_test.drop(['month'], inplace=True, axis=1) # data_train.drop(['day','job_management','marital_single'], axis=1, inplace=True) # data_test.drop(['day','job_management','marital_single'], axis=1, inplace=True) # data_train['month'].replace(['may'],4,inplace=True) # data_train['month'].replace(['aug','jul','apr'],3,inplace=True) # data_train['month'].replace(['jun','feb','nov','oct'],2,inplace=True) # data_train['month'].replace(['sep','mar'],1,inplace=True) # data_train['month'].replace(['jan','dec'],0,inplace=True) # 多刪特征 # data_train.drop(['age','balance','duration','pdays','previous','day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1) # data_test.drop(['age','balance','duration','pdays','previous','day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1) #default、housing、loan都是2分類的指標,刪除其中一個即可 # data_train.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1) # data_test.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1) ################################ ########### 數據整理 ########### ################################ data_train['pdays'].replace(-1,9999,inplace=True) data_test['pdays'].replace(-1,9999,inplace=True) print("數值處理2:pdays將-1替換為999") # data_train.drop(['pdays'], inplace=True, axis=1) # data_test.drop(['pdays'], inplace=True, axis=1) # g = sns.FacetGrid(data_train, col='y') # g.map(plt.hist, 'pdays', bins=20) # plt.show() # data_train.drop(['pdays'], inplace=True, axis=1) # data_test.drop(['pdays'], inplace=True, axis=1) y = data_train['y'] X = data_train[data_train.columns[: -1]] # # X.info() # pdays的平均值先前看到是45,而-1距離45很近,距離max值854很遠,故還是需要將所有的-1替換為999 #數據預處理: #數據中pdays=-1表示從未聯絡過,替換為999 #對方差較大的數據指標進行變換,MinMaxScaler或者StandardScaler print("數值處理3:數值指標Scaler變換") scaler = MinMaxScaler() # numerical = ['age','balance', 'duration', 'pdays', 'previous'] # X[numerical] = scaler.fit_transform(X[numerical]) # data_test[numerical] = scaler.fit_transform(data_test[numerical]) print(data_test.shape) X = scaler.fit_transform(X) data_test = scaler.fit_transform(data_test) # tsvd = TruncatedSVD(n_components=46) # data_test = tsvd.fit_transform(data_test) #數據分割,用於測試 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.06, random_state=1) # X_train = tsvd.fit_transform(X_train) # X_test = tsvd.fit_transform(X_test) # print(X_train.shape) #增加二項式特征 # polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False) # #增加二項式特征,僅僅是交叉特征 # polynomial_interaction = PolynomialFeatures(degree=2,interaction_only=True,include_bias=False) # X_train = polynomial_interaction.fit_transform(X_train) # X_test = polynomial_interaction.fit_transform(X_test) # data_test = polynomial_interaction.fit_transform(data_test) # print('after Polynomial:',X_train.shape) # # # #保留99%的信息,進行朱成本分析 # pca = PCA(n_components=100,whiten=True) # X_train = pca.fit_transform(X_train) # X_test = pca.fit_transform(X_test) # data_test = pca.fit_transform(data_test) # print('after PCA:',X_train.shape) # #卡方分類篩選 # selector = SelectKBest(f_classif,k=300) # X_train = selector.fit_transform(X_train,y_train) # X_test = selector.fit_transform(X_test,y_test) # print('after SelectKBest:',X_train.shape) # print(X_train['pdays']) ################################ ########### 性能計算 ########### ################################ # print('決策樹,分數不理想') # clf = DecisionTreeClassifier(random_state=11) # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # print('隨機森林,0.919203') clf = RandomForestClassifier(n_estimators=90, random_state=0,oob_score=True,n_jobs=-1) clf.fit(X_train, y_train) predictions = clf.predict(X_test) print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) print(clf.score(X_test, y_test)) y_predprob = clf.predict_proba(X_test) y_predprob = y_predprob[:, 1] print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) #窮舉隨機森林的最佳參數,答案:90 # param_test1 ={'n_estimators':range(10,100,5)} # gsearch1= GridSearchCV(estimator =RandomForestClassifier(min_samples_split=100, # min_samples_leaf=20,max_depth=8,max_features='sqrt',random_state=10), # param_grid =param_test1,scoring='roc_auc',cv=5) # gsearch1.fit(X_train, y_train) # print(gsearch1.best_params_) # y_predprob = gsearch1.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # predictions = gsearch1.predict(X_test) # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # # print('邏輯回歸,0.904655,0.915316') # # print(X_train) # #clf = Lasso(alpha=0.5) # clf = LogisticRegression(random_state=0,solver='newton-cg',class_weight='balanced',penalty='l2',n_jobs=-1) # # solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, optional (default=’liblinear’). # clf.fit(X_train, y_train) # # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # # print(classification_report(y_test, predictions)) # # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # # raletion = pd.DataFrame({"columns":list(data_train.columns)[0:-1], "coef":list(clf.coef_.T)}) # print('相關性:',raletion) # #窮舉邏輯回歸的最佳參數,答案: # # best C : LogisticRegression(C=7.742636826811269, class_weight=None, dual=False, # # fit_intercept=True, intercept_scaling=1, l1_ratio=None, # # max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', # # random_state=None, solver='warn', tol=0.0001, verbose=0, # # warm_start=False) # penalty = ['l1','l2'] # C=np.logspace(0,4,10) # hyperparameters = dict(C=C,penalty=penalty) # gridsearch = GridSearchCV(clf,hyperparameters,cv=5,verbose=0) # best_clf= gridsearch.fit(X_train, y_train) # print('best C :',best_clf.best_estimator_) # print(gridsearch.best_params_) # y_predprob = gridsearch.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # predictions = gridsearch.predict(X_test) # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # print('AdaBoost') # clf = AdaBoostClassifier(n_estimators=60, random_state=90) # # clf.fit(X_train, y_train) # predictionsByadaBoost = clf.predict(X_test) # print(classification_report(y_test, predictionsByadaBoost)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # pred = clf.predict_proba(X_test) # dataPred = pd.DataFrame(pred, columns=['pred0', 'pred']) # dataPred.drop('pred0', axis=1, inplace=True) # print(dataPred) # # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # predictions_train = clf.predict(X_train) # y_predprob_train = clf.predict_proba(X_train) # y_predprob_train = y_predprob_train[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictionsByadaBoost)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # print("Accuracy y_train : %.4g" % metrics.accuracy_score(y_train, predictions_train)) # print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_predprob_train)) # # # # # # # # print('神經網絡') # # ‘lbfgs’ is an optimizer in the family of quasi-Newton methods. # # ‘sgd’ refers to stochastic gradient descent. # # ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba # clf = MLPClassifier(solver='adam', hidden_layer_sizes=(80,80), # random_state=1) # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # print(clf.score(X_test, y_test)) # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # print('神經網絡 end') # # #導出結果 ID = list(range(25318,36170)) submission = pd.DataFrame(ID) submission.rename(columns = {0: 'ID'}, inplace = True) # 將pred_y從array轉化成DataFrame y_predprob_test = clf.predict_proba(data_test) y_predprob_test = y_predprob_test[:, 1] y_predprob_DataFrame = pd.DataFrame(y_predprob_test) submission['pred'] =y_predprob_DataFrame submission.to_csv('Result.csv', index = False) #為防止過擬合而減半步長,最大迭代次數加倍 # gbm1 = GradientBoostingClassifier(learning_rate=0.001, n_estimators=10000, max_depth=7, min_samples_leaf=70, # min_samples_split=1300, subsample=0.8, random_state=10) # gbm1.fit(X_train, y_train) # # y_pred = gbm1.predict(X_test) # y_predprob = gbm1.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred)) # print("AUC Score (Train): %f" % metrics.roc_auc_score(y_test, y_predprob)) # print('KNN近鄰,分數不理想') # clf = KNeighborsClassifier(n_neighbors=5) # clf.fit(X_train,y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # print('SVM支持向量機') # clf = SVC(kernel='rbf',C=1,gamma='auto',probability=True).fit(X_train, y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) #朴素貝葉斯 # print('朴素貝葉斯') # clf = GaussianNB() # # clf_sigmoid = CalibratedClassifierCV(clf,cv=5) # clf_sigmoid.fit(X_train,y_train) # predictions = clf_sigmoid.predict(X_test) # y_predprob = clf_sigmoid.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) ################################ # AdaBoost選為第一次使用的算法,提交數據 ################################ # print('AdaBoost') # adaBoost = AdaBoostClassifier(n_estimators=50, random_state=11) # adaBoost.fit(X_train, y_train) # # age_null = pd.isnull(data_test['age']) # data_null = data_test[age_null == True] # # print(data_null) # # id = data_test["ID"] # print(id) # X_test.drop(['ID'], axis=1, inplace=True) # # submission = pd.DataFrame({ # "ID": id # }) # # submission[['ID']].astype(int) # # submission[['ID']] = submission[['ID']].astype(int) # submission.to_csv('submission.csv', index=False) # data_test.dropna(inplace=True) # print(np.isnan(data_test).any()) # submission.replace(np.nan, 0, inplace=True) # predictionsByadaBoost = adaBoost.predict_proba(X_test) # # submission = pd.DataFrame({ # "ID": id, # "pred": predictionsByadaBoost # }) # submission.to_csv('submission.csv', index=False)
第一次提交,沒做什么特征工程,分數還不太理想
0.9157894736842105
Accuracy : 0.9158
AUC Score (Test): 0.932477
過程分析
from numpy import int64 from sklearn import metrics from sklearn.model_selection import GridSearchCV from sklearn.model_selection import cross_val_score import matplotlib.pyplot as plt import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report import seaborn as sns from sklearn.preprocessing import PolynomialFeatures from sklearn.decomposition import PCA from sklearn.feature_selection import SelectKBest,chi2,f_classif from sklearn.metrics import roc_auc_score data_train = pd.read_csv('/home/kesci/input/firstdata1587/train_set.csv') data_test = pd.read_csv('/home/kesci/input/firstdata1587/test_set.csv') data_train.describe()
Out[4]:
總計記錄25317人。 年齡分布:18-95; balance(存款)分布:-8019 - 102127,balance的標准差2999.822811,比較大,看到平均存款1357,上四分位1435,下四分位才只有73元,存款的差距還是蠻大的,萬惡的資本主義; day(最后一次聯系是幾號):1-31,很明顯一個月從1號開始,從31號結束,這個特征很可能和預測無關聯; duration(交流時長):0-3881,這個猜測是持續的天數; campaign(交流次數):1-55 pdays(上次聯系后過了多久):-1 - 854,這里沒有999,應該是-1為沒有聯系,>-1就是期間幾天前曾聯系過; previous(活動前交流次數):0-275,平均0.591737,不到1次;
In [5]:
#工作和購買理財的關系
y_0 = data_train.job[data_train.y == 0].value_counts() y_1 = data_train.job[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"job to buy") plt.ylabel(u"counts") plt.show()
In [14]:
#婚姻和購買理財的關系
#看不出啥結果 y_0 = data_train.marital[data_train.y == 0].value_counts() y_1 = data_train.marital[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"marital to buy") plt.ylabel(u"counts") plt.show()
In [15]:
#教育和購買理財的關系
y_0 = data_train.education[data_train.y == 0].value_counts() y_1 = data_train.education[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"education to buy") plt.ylabel(u"counts") plt.show()
In [24]:
#上次活動結果和購買理財的關系
#發現poutcome指標相當重要,上次活動成功的客戶這次也購買理財的比例非常高 y_0 = data_train.poutcome[data_train.y == 0].value_counts() y_1 = data_train.poutcome[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"poutcome to buy") plt.ylabel(u"counts") plt.show()
判斷day、month是和客戶交流的月份和日份,很容易被當成噪音特征。用統計來說話。
In [3]:
#月份對結果的影響
y_0 = data_train.month[data_train.y == 0].value_counts() y_1 = data_train.month[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"poutcome to buy") plt.ylabel(u"counts") plt.show() print(y_1/data_train.shape[0]) #我們發現may(0.019789)和dec (0.001896)相差10倍,所以這個特征還是蠻重要的
In [4]:
#日對結果的影響
y_0 = data_train.day[data_train.y == 0].value_counts() y_1 = data_train.day[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"poutcome to buy") plt.ylabel(u"counts") plt.show() print(y_1/data_train.shape[0]) #發現30號最容易出單,31號極不容易出單
In [7]:
#'job','marital','education','default','housing','loan','contact','poutcome'這8個字段都
#要做one-hot編碼預處理,暫時先將unknown作為一個特征項。 dummy = pd.get_dummies(data_train[['day','month','job','marital','education','default','housing','loan','contact','poutcome']]) dummyTest = pd.get_dummies(data_test[['day','month','job','marital','education','default','housing','loan','contact','poutcome']]) data_train = pd.concat([dummy, data_train], axis=1) data_train.drop(['day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1) data_test = pd.concat([dummyTest, data_test], axis=1) data_test.drop(['day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1) print("數值處理1:標簽指標one-hot編碼處理") #default、housing、loan都是2分類的指標,刪除其中一個即可 #data_train.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1) #data_test.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1) data_train['pdays'].replace(-1,999,inplace=True) data_test['pdays'].replace(-1,999,inplace=True) print("數值處理2:pdays將-1替換為999")
In [20]:
data_train.head()
Out[20]:
In [6]:
#測試單一特征和目標的關系
#print('無違約:',data_train[data_train['default_yes']==0].count()) #print('有違約:',data_train[data_train['default_yes']==1].count()) print(data_train['default_yes'].value_counts()) print(data_test['default_yes'].value_counts()) #data_train.groupby(["default_yes"], as_index=False)['y'].count()
Out[6]:
In [8]:
#違約記錄&訂購理財的關系
fig = plt.figure() fig.set(alpha=0.2) # 設定圖表顏色alpha參數 y_0 = data_atrain.default_yes[data_train.y == 0].value_counts() y_1 = data_train.default_yes[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"buy or not") plt.xlabel(u"default") plt.ylabel(u"counts") plt.show()
In [9]:
#住房貸款&訂購理財的關系
#可以看出沒有房貸購買理財的比例稍微高一些,但不明顯,可能是還房貸的人資金壓力稍大 fig = plt.figure() fig.set(alpha=0.2) # 設定圖表顏色alpha參數 y_0 = data_train.housing_yes[data_train.y == 0].value_counts() y_1 = data_train.housing_yes[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"buy or not") plt.xlabel(u"housing") plt.ylabel(u"counts") plt.show() #發現沒有違約的人買理財比例略高
In [19]:
#個人貸款&訂購理財的關系
#可以看出兩種情況差別不大 fig = plt.figure() fig.set(alpha=0.2) # 設定圖表顏色alpha參數 y_0 = data_train.loan_yes[data_train.y == 0].value_counts() y_1 = data_train.loan_yes[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"buy or not") plt.xlabel(u"loan") plt.ylabel(u"counts") plt.show() data_train[["loan_yes", "y"]].groupby(['loan_yes'], as_index=False).mean().sort_values(by='y', ascending=False) #可以看出12.6%的無個人貸的人買了理財,有貸款的只有6.89%買了理財 #說明無個貸買理財的機會比較大
Out[19]:
In [7]:
#使用直方圖來看看那個區段年齡的人最多購買或不購買
g = sns.FacetGrid(data_train, col='y') g.map(plt.hist, 'age', bins=20) plt.show() #貌似看不出什么問題,只能說明買理財的年齡不大集中,不買的集中在30-40歲之間
In [8]:
#使用直方圖來看看“距離上次活動最后一次聯系該客戶,過去了多久”的人最多購買或不購買
#看來是時間越短,購買率越高,說明pdays是相當重要的指標 g = sns.FacetGrid(data_train, col='y') g.map(plt.hist, 'pdays', bins=20) plt.show() #pdays指標讓人讀不懂,以后重點解決
In [9]:
y = data_train['y'] X = data_train[data_train.columns[: -1]] X.info()
In [ ]:
#查看相關矩陣,連帶y也作為指標
#data_train.corr() #查看相關矩陣熱圖 #colormap = plt.cm.RdBu #plt.figure(figsize=(39,37)) #plt.title('Correlation of Features', y=1.05, size=37) #sns.heatmap(data_train.astype(float).corr(),linewidths=0.1,vmax=1.0, # square=True, cmap=colormap, linecolor='white', annot=True) #plt.show()
In [11]:
print("數值處理3:數值指標Scaler變換") scaler = StandardScaler() X = scaler.fit_transform(X) data_test = scaler.fit_transform(data_test) #數據分割,用於測試 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=90)
In [12]:
# print('決策樹')
# clf = DecisionTreeClassifier(random_state=11) # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # # print('隨機森林') # clf = RandomForestClassifier(n_estimators=10, random_state=11) # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # print('邏輯回歸') # clf = LogisticRegression() # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) print('AdaBoost') adaBoost = AdaBoostClassifier(n_estimators=50, random_state=11) adaBoost.fit(X_train, y_train) predictionsByadaBoost = adaBoost.predict(X_test) print(classification_report(y_test, predictionsByadaBoost)) print(cross_val_score(adaBoost,X_train, y_train,scoring='f1')) print(cross_val_score(adaBoost,X_test, y_test,scoring='f1')) print(adaBoost.score(X_test, y_test)) pred = adaBoost.predict_proba(X_test) dataPred = pd.DataFrame(pred, columns=['pred0', 'pred']) dataPred.drop('pred0', axis=1, inplace=True) print(dataPred) y_predprob = adaBoost.predict_proba(X_test) y_predprob = y_predprob[:, 1] predictions_train = adaBoost.predict(X_train) y_predprob_train = adaBoost.predict_proba(X_train) y_predprob_train = y_predprob_train[:, 1] print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictionsByadaBoost)) print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) print("Accuracy y_train : %.4g" % metrics.accuracy_score(y_train, predictions_train)) print("AUC Score (Train): %f