「二分類算法」提供銀行精准營銷解決方案 代碼存檔


import mglearn
from numpy import int64
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Lasso, LinearRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFECV
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
sns.set(style="darkgrid")
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用來正常顯示中文標簽
plt.rcParams['axes.unicode_minus'] = False  # 用來正常顯示

# 字段說明
#
# NO    字段名稱    數據類型    字段描述
# 1    ID    Int    客戶唯一標識
# 2    age    Int    客戶年齡
# 3    job    String    客戶的職業
# 4    marital    String    婚姻狀況
# 5    education    String    受教育水平
# 6    default    String    是否有違約記錄
# 7    balance    Int    每年賬戶的平均余額
# 8    housing    String    是否有住房貸款
# 9    loan    String    是否有個人貸款
# 10    contact    String    與客戶聯系的溝通方式
# 11    day    Int    最后一次聯系的時間(幾號)
# 12    month    String    最后一次聯系的時間(月份)
# 13    duration    Int    最后一次聯系的交流時長
# 14    campaign    Int    在本次活動中,與該客戶交流過的次數
# 15    pdays    Int    距離上次活動最后一次聯系該客戶,過去了多久(999表示沒有聯系過)
# 16    previous    Int    在本次活動之前,與該客戶交流過的次數
# 17    poutcome    String    上一次活動的結果
# 18    y    Int    預測客戶是否會訂購定期存款業務
from sklearn.tree import DecisionTreeClassifier

data_train = pd.read_csv('train_set.csv')
data_test = pd.read_csv('test_set.csv')
ids_test = data_test['ID']

print(data_train.shape[0])

# data_train['cppv']=data_train['campaign']+data_train['previous']
# data_test['cppv']=data_test['campaign']+data_test['previous']
# data_train.drop(['campaign','previous'], axis=1, inplace=True)
# data_test.drop(['campaign','previous'], axis=1, inplace=True)

# Rela_grouped=data_train.groupby(['cp'])
# Rela_Survival_Rate=(Rela_grouped.sum()/Rela_grouped.count())['y']
# Rela_count=Rela_grouped.count()['y']
#
# ax1=Rela_count.plot(kind='bar',color='g')
# ax2=ax1.twinx()
# ax2.plot(Rela_Survival_Rate.values,color='r')
# ax1.set_xlabel('Relatives')
# ax1.set_ylabel('Number')
# ax2.set_ylabel('Survival Rate')
# plt.title('Survival Rate by Relatives')
# plt.grid(True,linestyle='-',color='0.7')
# plt.show()

# g = sns.FacetGrid(data_train, col='y')
# g.map(plt.hist, 'day', bins=30)
# plt.show()


print("數值處理1:標簽指標one-hot編碼處理")


data_train.drop(['ID'], axis=1, inplace=True)
data_test.drop(['ID'], axis=1, inplace=True)

dummy = pd.get_dummies(data_train[['month','job','marital','education','default','housing','loan','contact','poutcome']])
dummyTest = pd.get_dummies(data_test[['month','job','marital','education','default','housing','loan','contact','poutcome']])
data_train = pd.concat([dummy, data_train], axis=1)
data_train.drop(['job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
data_test = pd.concat([dummyTest, data_test], axis=1)
data_test.drop(['job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)

data_train['day'].replace([30,13,15,4,14,12,18],4,inplace=True)
data_train['day'].replace([5,20,21,11,8,16,2,3],3,inplace=True)
data_train['day'].replace([17,9,6,27,7,22,28],2,inplace=True)
data_train['day'].replace([23,25,26,10,29,19],1,inplace=True)
data_train['day'].replace([1,24,31],0,inplace=True)

data_test['day'].replace([30,13,15,4,14,12,18],4,inplace=True)
data_test['day'].replace([5,20,21,11,8,16,2,3],3,inplace=True)
data_test['day'].replace([17,9,6,27,7,22,28],2,inplace=True)
data_test['day'].replace([23,25,26,10,29,19],1,inplace=True)
data_test['day'].replace([1,24,31],0,inplace=True)


# data_train['month1'] = data_train.month.apply(lambda x: 4 if x in ['may'] else 0)
# data_train['month1'] = data_train.month.apply(lambda x: 3 if x in ['aug','jul','apr'] else 0)
# data_train['month1'] = data_train.month.apply(lambda x: 2 if x in ['jun','feb','nov','oct'] else 0)
# data_train['month1'] = data_train.month.apply(lambda x: 1 if x in ['sep','mar'] else 0)
#
# data_test['month1'] = data_test.month.apply(lambda x: 4 if x in ['may'] else 0)
# data_test['month1'] = data_test.month.apply(lambda x: 3 if x in ['aug','jul','apr'] else 0)
# data_test['month1'] = data_test.month.apply(lambda x: 2 if x in ['jun','feb','nov','oct'] else 0)
# data_test['month1'] = data_test.month.apply(lambda x: 1 if x in ['sep','mar'] else 0)
# #
data_train.drop(['month'], inplace=True, axis=1)
data_test.drop(['month'], inplace=True, axis=1)
# data_train.drop(['day','job_management','marital_single'], axis=1, inplace=True)
# data_test.drop(['day','job_management','marital_single'], axis=1, inplace=True)


# data_train['month'].replace(['may'],4,inplace=True)
# data_train['month'].replace(['aug','jul','apr'],3,inplace=True)
# data_train['month'].replace(['jun','feb','nov','oct'],2,inplace=True)
# data_train['month'].replace(['sep','mar'],1,inplace=True)
# data_train['month'].replace(['jan','dec'],0,inplace=True)

# 多刪特征
# data_train.drop(['age','balance','duration','pdays','previous','day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)
# data_test.drop(['age','balance','duration','pdays','previous','day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1)

#default、housing、loan都是2分類的指標,刪除其中一個即可
# data_train.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)
# data_test.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1)


################################
########### 數據整理 ###########
################################

data_train['pdays'].replace(-1,9999,inplace=True)
data_test['pdays'].replace(-1,9999,inplace=True)
print("數值處理2:pdays將-1替換為999")
# data_train.drop(['pdays'], inplace=True, axis=1)
# data_test.drop(['pdays'], inplace=True, axis=1)


# g = sns.FacetGrid(data_train, col='y')
# g.map(plt.hist, 'pdays', bins=20)
# plt.show()
# data_train.drop(['pdays'], inplace=True, axis=1)
# data_test.drop(['pdays'], inplace=True, axis=1)

y = data_train['y']
X = data_train[data_train.columns[: -1]]
# # X.info()
# pdays的平均值先前看到是45,而-1距離45很近,距離max值854很遠,故還是需要將所有的-1替換為999
#數據預處理:
#數據中pdays=-1表示從未聯絡過,替換為999



#對方差較大的數據指標進行變換,MinMaxScaler或者StandardScaler
print("數值處理3:數值指標Scaler變換")
scaler = MinMaxScaler()
# numerical = ['age','balance', 'duration', 'pdays', 'previous']
# X[numerical] = scaler.fit_transform(X[numerical])
# data_test[numerical] = scaler.fit_transform(data_test[numerical])
print(data_test.shape)
X = scaler.fit_transform(X)
data_test = scaler.fit_transform(data_test)

# tsvd = TruncatedSVD(n_components=46)
# data_test = tsvd.fit_transform(data_test)
#數據分割,用於測試
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.06, random_state=1)
# X_train = tsvd.fit_transform(X_train)
# X_test = tsvd.fit_transform(X_test)
# print(X_train.shape)

#增加二項式特征
# polynomial_interaction = PolynomialFeatures(degree=2,include_bias=False)
# #增加二項式特征,僅僅是交叉特征
# polynomial_interaction = PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)
# X_train = polynomial_interaction.fit_transform(X_train)
# X_test = polynomial_interaction.fit_transform(X_test)
# data_test = polynomial_interaction.fit_transform(data_test)
# print('after Polynomial:',X_train.shape)
#
# # #保留99%的信息,進行朱成本分析
# pca = PCA(n_components=100,whiten=True)
# X_train = pca.fit_transform(X_train)
# X_test = pca.fit_transform(X_test)
# data_test = pca.fit_transform(data_test)
# print('after PCA:',X_train.shape)

# #卡方分類篩選
# selector = SelectKBest(f_classif,k=300)
# X_train = selector.fit_transform(X_train,y_train)
# X_test = selector.fit_transform(X_test,y_test)
# print('after SelectKBest:',X_train.shape)

# print(X_train['pdays'])

################################
########### 性能計算 ###########
################################


# print('決策樹,分數不理想')
# clf = DecisionTreeClassifier(random_state=11)
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
#
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
#
print('隨機森林,0.919203')
clf = RandomForestClassifier(n_estimators=90, random_state=0,oob_score=True,n_jobs=-1)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
print(clf.score(X_test, y_test))
y_predprob = clf.predict_proba(X_test)
y_predprob = y_predprob[:, 1]
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

#窮舉隨機森林的最佳參數,答案:90
# param_test1 ={'n_estimators':range(10,100,5)}
# gsearch1= GridSearchCV(estimator =RandomForestClassifier(min_samples_split=100,
#                                  min_samples_leaf=20,max_depth=8,max_features='sqrt',random_state=10),
#                        param_grid =param_test1,scoring='roc_auc',cv=5)
# gsearch1.fit(X_train, y_train)
# print(gsearch1.best_params_)
# y_predprob = gsearch1.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
# predictions = gsearch1.predict(X_test)
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
#
# print('邏輯回歸,0.904655,0.915316')
# # print(X_train)
# #clf = Lasso(alpha=0.5)
# clf = LogisticRegression(random_state=0,solver='newton-cg',class_weight='balanced',penalty='l2',n_jobs=-1)
# # solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, optional (default=’liblinear’).
# clf.fit(X_train, y_train)
# # clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# # print(classification_report(y_test, predictions))
# # print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# # print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
#
# raletion = pd.DataFrame({"columns":list(data_train.columns)[0:-1], "coef":list(clf.coef_.T)})
# print('相關性:',raletion)

# #窮舉邏輯回歸的最佳參數,答案:
# # best C : LogisticRegression(C=7.742636826811269, class_weight=None, dual=False,
# #                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
# #                    max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
# #                    random_state=None, solver='warn', tol=0.0001, verbose=0,
# #                    warm_start=False)
# penalty = ['l1','l2']
# C=np.logspace(0,4,10)
# hyperparameters = dict(C=C,penalty=penalty)
# gridsearch = GridSearchCV(clf,hyperparameters,cv=5,verbose=0)
# best_clf= gridsearch.fit(X_train, y_train)
# print('best C :',best_clf.best_estimator_)
# print(gridsearch.best_params_)
# y_predprob = gridsearch.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
# predictions = gridsearch.predict(X_test)
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

# print('AdaBoost')
# clf = AdaBoostClassifier(n_estimators=60, random_state=90)
#
# clf.fit(X_train, y_train)
# predictionsByadaBoost = clf.predict(X_test)
# print(classification_report(y_test, predictionsByadaBoost))
# print(cross_val_score(clf,X_train, y_train,scoring='f1'))
# print(cross_val_score(clf,X_test, y_test,scoring='f1'))
# print(clf.score(X_test, y_test))
# pred = clf.predict_proba(X_test)
# dataPred = pd.DataFrame(pred, columns=['pred0', 'pred'])
# dataPred.drop('pred0', axis=1, inplace=True)
# print(dataPred)
#
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# predictions_train =  clf.predict(X_train)
# y_predprob_train = clf.predict_proba(X_train)
# y_predprob_train = y_predprob_train[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictionsByadaBoost))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
# print("Accuracy y_train : %.4g" % metrics.accuracy_score(y_train, predictions_train))
# print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_predprob_train))
# #
#
#
# # #
# print('神經網絡')
# # ‘lbfgs’ is an optimizer in the family of quasi-Newton methods.
# # ‘sgd’ refers to stochastic gradient descent.
# # ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba
# clf = MLPClassifier(solver='adam', hidden_layer_sizes=(80,80),
#                     random_state=1)
# clf.fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(clf.score(X_test, y_test))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))
# print('神經網絡 end')
# # #導出結果
ID = list(range(25318,36170))
submission = pd.DataFrame(ID)
submission.rename(columns = {0: 'ID'}, inplace = True)
# 將pred_y從array轉化成DataFrame
y_predprob_test = clf.predict_proba(data_test)
y_predprob_test = y_predprob_test[:, 1]
y_predprob_DataFrame = pd.DataFrame(y_predprob_test)
submission['pred'] =y_predprob_DataFrame
submission.to_csv('Result.csv', index = False)

#為防止過擬合而減半步長,最大迭代次數加倍
# gbm1 = GradientBoostingClassifier(learning_rate=0.001, n_estimators=10000, max_depth=7, min_samples_leaf=70,
#                                   min_samples_split=1300, subsample=0.8, random_state=10)
# gbm1.fit(X_train, y_train)
#
# y_pred = gbm1.predict(X_test)
# y_predprob = gbm1.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
# print("AUC Score (Train): %f" % metrics.roc_auc_score(y_test, y_predprob))

# print('KNN近鄰,分數不理想')
# clf = KNeighborsClassifier(n_neighbors=5)
# clf.fit(X_train,y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]

# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

# print('SVM支持向量機')
# clf = SVC(kernel='rbf',C=1,gamma='auto',probability=True).fit(X_train, y_train)
# predictions = clf.predict(X_test)
# print(classification_report(y_test, predictions))
# y_predprob = clf.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

#朴素貝葉斯
# print('朴素貝葉斯')
# clf = GaussianNB()
#
# clf_sigmoid = CalibratedClassifierCV(clf,cv=5)
# clf_sigmoid.fit(X_train,y_train)
# predictions = clf_sigmoid.predict(X_test)
# y_predprob = clf_sigmoid.predict_proba(X_test)
# y_predprob = y_predprob[:, 1]
#
# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
# print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob))

################################
# AdaBoost選為第一次使用的算法,提交數據
################################
# print('AdaBoost')
# adaBoost = AdaBoostClassifier(n_estimators=50, random_state=11)
# adaBoost.fit(X_train, y_train)
#
# age_null = pd.isnull(data_test['age'])
# data_null = data_test[age_null == True]
# # print(data_null)
#
# id = data_test["ID"]
# print(id)
# X_test.drop(['ID'], axis=1, inplace=True)
#
# submission = pd.DataFrame({
#         "ID": id
#     })
#
# submission[['ID']].astype(int)
# # submission[['ID']] = submission[['ID']].astype(int)
# submission.to_csv('submission.csv', index=False)

# data_test.dropna(inplace=True)
# print(np.isnan(data_test).any())
# submission.replace(np.nan, 0, inplace=True)


# predictionsByadaBoost = adaBoost.predict_proba(X_test)
#
# submission = pd.DataFrame({
#         "ID": id,
#         "pred": predictionsByadaBoost
#     })
# submission.to_csv('submission.csv', index=False)

 

第一次提交,沒做什么特征工程,分數還不太理想

0.9157894736842105
Accuracy : 0.9158
AUC Score (Test): 0.932477

 

過程分析

from numpy import int64 from sklearn import metrics from sklearn.model_selection import GridSearchCV from sklearn.model_selection import cross_val_score import matplotlib.pyplot as plt import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split from sklearn.metrics import classification_report import seaborn as sns from sklearn.preprocessing import PolynomialFeatures from sklearn.decomposition import PCA from sklearn.feature_selection import SelectKBest,chi2,f_classif from sklearn.metrics import roc_auc_score data_train = pd.read_csv('/home/kesci/input/firstdata1587/train_set.csv') data_test = pd.read_csv('/home/kesci/input/firstdata1587/test_set.csv') data_train.describe() 
Out[4]:
  ID age balance day duration campaign pdays previous y
count 25317.000000 25317.000000 25317.000000 25317.000000 25317.000000 25317.000000 25317.000000 25317.000000 25317.000000
mean 12659.000000 40.935379 1357.555082 15.835289 257.732393 2.772050 40.248766 0.591737 0.116957
std 7308.532719 10.634289 2999.822811 8.319480 256.975151 3.136097 100.213541 2.568313 0.321375
min 1.000000 18.000000 -8019.000000 1.000000 0.000000 1.000000 -1.000000 0.000000 0.000000
25% 6330.000000 33.000000 73.000000 8.000000 103.000000 1.000000 -1.000000 0.000000 0.000000
50% 12659.000000 39.000000 448.000000 16.000000 181.000000 2.000000 -1.000000 0.000000 0.000000
75% 18988.000000 48.000000 1435.000000 21.000000 317.000000 3.000000 -1.000000 0.000000 0.000000
max 25317.000000 95.000000 102127.000000 31.000000 3881.000000 55.000000 854.000000 275.000000 1.000000
 

總計記錄25317人。 年齡分布:18-95; balance(存款)分布:-8019 - 102127,balance的標准差2999.822811,比較大,看到平均存款1357,上四分位1435,下四分位才只有73元,存款的差距還是蠻大的,萬惡的資本主義; day(最后一次聯系是幾號):1-31,很明顯一個月從1號開始,從31號結束,這個特征很可能和預測無關聯; duration(交流時長):0-3881,這個猜測是持續的天數; campaign(交流次數):1-55 pdays(上次聯系后過了多久):-1 - 854,這里沒有999,應該是-1為沒有聯系,>-1就是期間幾天前曾聯系過; previous(活動前交流次數):0-275,平均0.591737,不到1次;

In [5]:
#工作和購買理財的關系
y_0 = data_train.job[data_train.y == 0].value_counts() y_1 = data_train.job[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"job to buy") plt.ylabel(u"counts") plt.show() 
 
In [14]:
#婚姻和購買理財的關系
#看不出啥結果 y_0 = data_train.marital[data_train.y == 0].value_counts() y_1 = data_train.marital[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"marital to buy") plt.ylabel(u"counts") plt.show() 
 
In [15]:
#教育和購買理財的關系
y_0 = data_train.education[data_train.y == 0].value_counts() y_1 = data_train.education[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"education to buy") plt.ylabel(u"counts") plt.show() 
 
In [24]:
#上次活動結果和購買理財的關系
#發現poutcome指標相當重要,上次活動成功的客戶這次也購買理財的比例非常高 y_0 = data_train.poutcome[data_train.y == 0].value_counts() y_1 = data_train.poutcome[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"poutcome to buy") plt.ylabel(u"counts") plt.show() 
 
 

判斷day、month是和客戶交流的月份和日份,很容易被當成噪音特征。用統計來說話。

In [3]:
#月份對結果的影響
y_0 = data_train.month[data_train.y == 0].value_counts() y_1 = data_train.month[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"poutcome to buy") plt.ylabel(u"counts") plt.show() print(y_1/data_train.shape[0]) #我們發現may(0.019789)和dec (0.001896)相差10倍,所以這個特征還是蠻重要的 
 
 
may    0.019789
aug    0.014773
jul    0.014022
apr    0.012916
jun    0.011613
feb    0.009954
nov    0.009045
oct    0.007465
sep    0.006241
mar    0.005727
jan    0.003515
dec    0.001896
Name: month, dtype: float64
In [4]:
#日對結果的影響
y_0 = data_train.day[data_train.y == 0].value_counts() y_1 = data_train.day[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"poutcome to buy") plt.ylabel(u"counts") plt.show() print(y_1/data_train.shape[0]) #發現30號最容易出單,31號極不容易出單 
 
 
30    0.005964
13    0.005253
15    0.005135
4     0.005016
14    0.004977
12    0.004898
18    0.004898
5     0.004661
20    0.004661
21    0.004621
11    0.004582
8     0.004463
16    0.004345
2     0.004345
3     0.004266
17    0.003950
9     0.003910
6     0.003792
27    0.003792
7     0.003476
22    0.003436
28    0.003160
23    0.002923
25    0.002646
26    0.002528
10    0.002528
29    0.002409
19    0.002370
1     0.001777
24    0.001303
31    0.000869
Name: day, dtype: float64
In [7]:
#'job','marital','education','default','housing','loan','contact','poutcome'這8個字段都
#要做one-hot編碼預處理,暫時先將unknown作為一個特征項。 dummy = pd.get_dummies(data_train[['day','month','job','marital','education','default','housing','loan','contact','poutcome']]) dummyTest = pd.get_dummies(data_test[['day','month','job','marital','education','default','housing','loan','contact','poutcome']]) data_train = pd.concat([dummy, data_train], axis=1) data_train.drop(['day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1) data_test = pd.concat([dummyTest, data_test], axis=1) data_test.drop(['day','month','job','marital','education','default','housing','loan','contact','poutcome'], inplace=True, axis=1) print("數值處理1:標簽指標one-hot編碼處理") #default、housing、loan都是2分類的指標,刪除其中一個即可 #data_train.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1) #data_test.drop(['default_no','housing_no','loan_no'], inplace=True, axis=1) data_train['pdays'].replace(-1,999,inplace=True) data_test['pdays'].replace(-1,999,inplace=True) print("數值處理2:pdays將-1替換為999") 
 
數值處理1:標簽指標one-hot編碼處理
數值處理2:pdays將-1替換為999
In [20]:
data_train.head() 
Out[20]:
  job_admin. job_blue-collar job_entrepreneur job_housemaid job_management job_retired job_self-employed job_services job_student job_technician ... poutcome_other poutcome_success poutcome_unknown age balance duration campaign pdays previous y
0 0 0 0 0 1 0 0 0 0 0 ... 0 0 1 43 291 150 2 -1 0 0
1 0 0 0 0 0 0 0 0 0 1 ... 1 0 0 42 5076 99 1 251 2 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 1 47 104 77 2 -1 0 0
3 0 0 0 0 1 0 0 0 0 0 ... 0 0 1 28 -994 174 2 -1 0 0
4 0 0 0 0 0 0 0 0 0 1 ... 0 0 1 42 2974 187 5 -1 0 0

5 rows × 39 columns

In [6]:
#測試單一特征和目標的關系
#print('無違約:',data_train[data_train['default_yes']==0].count()) #print('有違約:',data_train[data_train['default_yes']==1].count()) print(data_train['default_yes'].value_counts()) print(data_test['default_yes'].value_counts()) #data_train.groupby(["default_yes"], as_index=False)['y'].count() 
 
0    24869
1      448
Name: default_yes, dtype: int64
0    24869
1      448
Name: default_yes, dtype: int64
Out[6]:
  job_admin. job_blue-collar job_entrepreneur job_housemaid job_management job_retired job_self-employed job_services job_student job_technician ... poutcome_other poutcome_success poutcome_unknown age balance duration campaign pdays previous y
job_admin. 1.000000 -0.188846 -0.067402 -0.059086 -0.185311 -0.082905 -0.068534 -0.115037 -0.052838 -0.161626 ... 0.013577 0.004200 -0.018840 -0.063839 -0.029366 -0.017629 -0.018559 0.021803 0.009821 0.000298
job_blue-collar -0.188846 1.000000 -0.098047 -0.085951 -0.269568 -0.120600 -0.099695 -0.167341 -0.076863 -0.235113 ... -0.003148 -0.056453 0.025315 -0.044350 -0.056248 0.010505 0.009946 0.016488 -0.019208 -0.075065
job_entrepreneur -0.067402 -0.098047 1.000000 -0.030677 -0.096212 -0.043044 -0.035583 -0.059726 -0.027433 -0.083915 ... -0.018659 -0.014969 0.013491 0.023331 0.010288 0.003927 -0.001803 -0.014705 -0.007958 -0.022519
job_housemaid -0.059086 -0.085951 -0.030677 1.000000 -0.084342 -0.037733 -0.031193 -0.052357 -0.024049 -0.073562 ... -0.018467 -0.009511 0.029735 0.084754 0.008013 -0.001337 0.002692 -0.032321 -0.013129 -0.015041
job_management -0.185311 -0.269568 -0.096212 -0.084342 1.000000 -0.118343 -0.097829 -0.164209 -0.075424 -0.230713 ... 0.008288 0.025737 -0.019421 -0.027075 0.078719 -0.010090 0.016234 -0.003619 0.025946 0.035234
job_retired -0.082905 -0.120600 -0.043044 -0.037733 -0.118343 1.000000 -0.043767 -0.073464 -0.033743 -0.103217 ... -0.001619 0.054668 -0.024616 0.451285 0.046370 0.026569 -0.031805 -0.003046 0.007511 0.083868
job_self-employed -0.068534 -0.099695 -0.035583 -0.031193 -0.097829 -0.043767 1.000000 -0.060730 -0.027894 -0.085325 ... -0.002526 0.004632 0.000565 -0.009973 0.000782 0.002657 -0.003602 -0.007433 -0.004029 0.001078
job_services -0.115037 -0.167341 -0.059726 -0.052357 -0.164209 -0.073464 -0.060730 1.000000 -0.046821 -0.143221 ... 0.001367 -0.020796 0.005367 -0.060838 -0.036640 0.000364 -0.001615 0.011358 -0.006309 -0.026688
job_student -0.052838 -0.076863 -0.027433 -0.024049 -0.075424 -0.033743 -0.027894 -0.046821 1.000000 -0.065784 ... 0.030733 0.049948 -0.045026 -0.195720 0.000799 -0.005165 -0.021539 0.024643 0.014206 0.069058
job_technician -0.161626 -0.235113 -0.083915 -0.073562 -0.230713 -0.103217 -0.085325 -0.143221 -0.065784 1.000000 ... -0.001704 -0.004072 0.011010 -0.063478 -0.015668 -0.011605 0.023601 -0.015579 -0.004059 -0.004942
job_unemployed -0.060802 -0.088448 -0.031568 -0.027673 -0.086792 -0.038829 -0.032099 -0.053879 -0.024747 -0.075699 ... -0.012716 0.016013 0.009008 0.005462 0.013252 0.023554 -0.021663 -0.013660 -0.008230 0.023980
job_unknown -0.029004 -0.042192 -0.015059 -0.013201 -0.041402 -0.018523 -0.015312 -0.025701 -0.011805 -0.036110 ... -0.016910 0.007256 0.011327 0.045026 0.015479 -0.003483 0.012938 -0.014763 -0.006241 0.001438
marital_divorced 0.027961 -0.062361 0.003040 0.016786 0.002196 0.053472 -0.017381 0.026199 -0.048590 0.007188 ... -0.001968 -0.002870 0.001999 0.165888 -0.028356 0.012815 -0.019830 0.003130 -0.004718 0.002723
marital_married -0.056102 0.125532 0.044894 0.045362 -0.033545 0.073654 0.002060 -0.019572 -0.161869 -0.058949 ... -0.028606 -0.022959 0.028377 0.284516 0.026577 -0.022557 0.039452 -0.027329 -0.006380 -0.054746
marital_single 0.041159 -0.092241 -0.050951 -0.061204 0.034904 -0.117958 0.010081 0.002703 0.210381 0.058978 ... 0.032488 0.026989 -0.032260 -0.426833 -0.008788 0.015434 -0.028825 0.027486 0.010278 0.057574
education_primary -0.110105 0.348314 -0.011630 0.164128 -0.175814 0.119077 -0.040373 -0.058845 -0.042160 -0.161923 ... -0.004174 -0.033214 0.032773 0.194451 -0.026575 -0.000034 0.012495 -0.011621 -0.012038 -0.043154
education_secondary 0.220828 0.037604 -0.051630 -0.062505 -0.405359 -0.037429 -0.053990 0.200833 0.007825 0.155845 ... 0.004079 -0.028471 0.002800 -0.093500 -0.074607 0.000568 -0.022185 0.017952 -0.011050 -0.038460
education_tertiary -0.146154 -0.320429 0.061969 -0.055380 0.601275 -0.062459 0.095847 -0.170206 -0.024021 -0.036790 ... 0.003128 0.050667 -0.030504 -0.083080 0.094686 -0.001067 0.011818 -0.006720 0.024955 0.066901
education_unknown -0.021208 0.010760 0.008699 -0.012186 -0.041017 0.022015 -0.010919 -0.008502 0.110442 -0.014967 ... -0.009791 0.015287 0.003656 0.073640 0.018380 0.001066 0.006071 -0.008665 -0.007600 0.021087
default_yes -0.005145 0.012717 0.029592 -0.007002 -0.008630 -0.008948 0.008743 -0.002526 -0.017596 -0.004049 ... -0.010326 -0.021432 0.038027 -0.019272 -0.068299 -0.011327 0.019978 -0.029440 -0.015293 -0.024608
housing_yes 0.043369 0.176937 0.017130 -0.074215 -0.063260 -0.159975 -0.023608 0.065284 -0.085328 -0.016506 ... 0.032566 -0.096285 -0.060478 -0.187364 -0.068780 0.002778 -0.024708 0.121740 0.032667 -0.143589
loan_yes 0.032612 0.012896 0.040955 -0.012334 -0.032051 -0.016304 -0.006878 0.036603 -0.058082 0.009240 ... -0.011531 -0.053573 0.035315 -0.016286 -0.085854 -0.011356 0.020537 -0.024458 -0.006240 -0.065231
contact_cellular -0.002431 -0.128760 -0.003751 -0.018765 0.101878 -0.010661 0.012462 -0.029756 0.027596 0.055623 ... 0.107764 0.104342 -0.263887 -0.072573 0.015821 0.018666 -0.027461 0.225438 0.122062 0.134791
contact_telephone -0.012570 -0.002537 -0.012075 0.044074 -0.031565 0.105808 0.001363 -0.015583 0.026084 -0.037147 ... 0.025071 0.009642 -0.026306 0.174284 0.042785 -0.015570 0.056106 0.017672 0.021314 0.020747
contact_unknown 0.009411 0.137290 0.010535 -0.004194 -0.090346 -0.046364 -0.013896 0.039893 -0.043332 -0.038483 ... -0.127399 -0.115385 0.292862 -0.018304 -0.039998 -0.011223 -0.001567 -0.247577 -0.140445 -0.153572
poutcome_failure 0.012266 0.002967 0.003890 -0.019621 0.004027 0.000278 -0.001732 0.004389 0.007463 -0.010275 ... -0.073107 -0.064271 -0.734653 -0.006166 0.012700 -0.019398 -0.089085 0.704495 0.313898 0.011927
poutcome_other 0.013577 -0.003148 -0.018659 -0.018467 0.008288 -0.001619 -0.002526 0.001367 0.030733 -0.001704 ... 1.000000 -0.038796 -0.443453 -0.021450 0.008611 -0.002584 -0.021604 0.384397 0.295747 0.038399
poutcome_success 0.004200 -0.056453 -0.014969 -0.009511 0.025737 0.054668 0.004632 -0.020796 0.049948 -0.004072 ... -0.038796 1.000000 -0.389856 0.039246 0.031758 0.045017 -0.058443 0.223025 0.174036 0.305806
poutcome_unknown -0.018840 0.025315 0.013491 0.029735 -0.019421 -0.024616 0.000565 0.005367 -0.045026 0.011010 ... -0.443453 -0.389856 1.000000 -0.002015 -0.029327 -0.003872 0.109688 -0.868084 -0.485981 -0.170697
age -0.063839 -0.044350 0.023331 0.084754 -0.027075 0.451285 -0.009973 -0.060838 -0.195720 -0.063478 ... -0.021450 0.039246 -0.002015 1.000000 0.093740 0.000416 0.006171 -0.026431 0.006575 0.029916
balance -0.029366 -0.056248 0.010288 0.008013 0.078719 0.046370 0.000782 -0.036640 0.000799 -0.015668 ... 0.008611 0.031758 -0.029327 0.093740 1.000000 0.026042 -0.010419 0.001032 0.015792 0.057564
duration -0.017629 0.010505 0.003927 -0.001337 -0.010090 0.026569 0.002657 0.000364 -0.005165 -0.011605 ... -0.002584 0.045017 -0.003872 0.000416 0.026042 1.000000 -0.087780 0.000040 0.001315 0.394746
campaign -0.018559 0.009946 -0.001803 0.002692 0.016234 -0.031805 -0.003602 -0.001615 -0.021539 0.023601 ... -0.021604 -0.058443 0.109688 0.006171 -0.010419 -0.087780 1.000000 -0.089224 -0.031667 -0.075173
pdays 0.021803 0.016488 -0.014705 -0.032321 -0.003619 -0.003046 -0.007433 0.011358 0.024643 -0.015579 ... 0.384397 0.223025 -0.868084 -0.026431 0.001032 0.000040 -0.089224 1.000000 0.411688 0.107565
previous 0.009821 -0.019208 -0.007958 -0.013129 0.025946 0.007511 -0.004029 -0.006309 0.014206 -0.004059 ... 0.295747 0.174036 -0.485981 0.006575 0.015792 0.001315 -0.031667 0.411688 1.000000 0.088337
y 0.000298 -0.075065 -0.022519 -0.015041 0.035234 0.083868 0.001078 -0.026688 0.069058 -0.004942 ... 0.038399 0.305806 -0.170697 0.029916 0.057564 0.394746 -0.075173 0.107565 0.088337 1.000000

36 rows × 36 columns

In [8]:
#違約記錄&訂購理財的關系
fig = plt.figure() fig.set(alpha=0.2) # 設定圖表顏色alpha參數 y_0 = data_atrain.default_yes[data_train.y == 0].value_counts() y_1 = data_train.default_yes[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"buy or not") plt.xlabel(u"default") plt.ylabel(u"counts") plt.show() 
 
---------------------------------------------------------------------------
NameError Traceback (most recent call last) <ipython-input-8-a047910fcfb8> in <module>  2 fig = plt.figure()  3 fig.set(alpha=0.2) # 設定圖表顏色alpha參數 ----> 4 y_0 = data_atrain.default_yes[data_train.y == 0].value_counts()  5 y_1 = data_train.default_yes[data_train.y == 1].value_counts()  6 df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) NameError: name 'data_atrain' is not defined
 
<Figure size 432x288 with 0 Axes>
In [9]:
#住房貸款&訂購理財的關系
#可以看出沒有房貸購買理財的比例稍微高一些,但不明顯,可能是還房貸的人資金壓力稍大 fig = plt.figure() fig.set(alpha=0.2) # 設定圖表顏色alpha參數 y_0 = data_train.housing_yes[data_train.y == 0].value_counts() y_1 = data_train.housing_yes[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"buy or not") plt.xlabel(u"housing") plt.ylabel(u"counts") plt.show() #發現沒有違約的人買理財比例略高 
 
<Figure size 432x288 with 0 Axes>
 
In [19]:
#個人貸款&訂購理財的關系
#可以看出兩種情況差別不大 fig = plt.figure() fig.set(alpha=0.2) # 設定圖表顏色alpha參數 y_0 = data_train.loan_yes[data_train.y == 0].value_counts() y_1 = data_train.loan_yes[data_train.y == 1].value_counts() df=pd.DataFrame({u'buy':y_1, u'not buy':y_0}) df.plot(kind='bar', stacked=True) plt.title(u"buy or not") plt.xlabel(u"loan") plt.ylabel(u"counts") plt.show() data_train[["loan_yes", "y"]].groupby(['loan_yes'], as_index=False).mean().sort_values(by='y', ascending=False) #可以看出12.6%的無個人貸的人買了理財,有貸款的只有6.89%買了理財 #說明無個貸買理財的機會比較大 
 
<Figure size 432x288 with 0 Axes>
 
Out[19]:
  loan_yes y
0 0 0.126117
1 1 0.068983
In [7]:
#使用直方圖來看看那個區段年齡的人最多購買或不購買
g = sns.FacetGrid(data_train, col='y') g.map(plt.hist, 'age', bins=20) plt.show() #貌似看不出什么問題,只能說明買理財的年齡不大集中,不買的集中在30-40歲之間 
 
In [8]:
#使用直方圖來看看“距離上次活動最后一次聯系該客戶,過去了多久”的人最多購買或不購買
#看來是時間越短,購買率越高,說明pdays是相當重要的指標 g = sns.FacetGrid(data_train, col='y') g.map(plt.hist, 'pdays', bins=20) plt.show() #pdays指標讓人讀不懂,以后重點解決 
 
In [9]:
y = data_train['y'] X = data_train[data_train.columns[: -1]] X.info() 
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25317 entries, 0 to 25316
Data columns (total 51 columns):
month_apr              25317 non-null uint8
month_aug              25317 non-null uint8
month_dec              25317 non-null uint8
month_feb              25317 non-null uint8
month_jan              25317 non-null uint8
month_jul              25317 non-null uint8
month_jun              25317 non-null uint8
month_mar              25317 non-null uint8
month_may              25317 non-null uint8
month_nov              25317 non-null uint8
month_oct              25317 non-null uint8
month_sep              25317 non-null uint8
job_admin.             25317 non-null uint8
job_blue-collar        25317 non-null uint8
job_entrepreneur       25317 non-null uint8
job_housemaid          25317 non-null uint8
job_management         25317 non-null uint8
job_retired            25317 non-null uint8
job_self-employed      25317 non-null uint8
job_services           25317 non-null uint8
job_student            25317 non-null uint8
job_technician         25317 non-null uint8
job_unemployed         25317 non-null uint8
job_unknown            25317 non-null uint8
marital_divorced       25317 non-null uint8
marital_married        25317 non-null uint8
marital_single         25317 non-null uint8
education_primary      25317 non-null uint8
education_secondary    25317 non-null uint8
education_tertiary     25317 non-null uint8
education_unknown      25317 non-null uint8
default_no             25317 non-null uint8
default_yes            25317 non-null uint8
housing_no             25317 non-null uint8
housing_yes            25317 non-null uint8
loan_no                25317 non-null uint8
loan_yes               25317 non-null uint8
contact_cellular       25317 non-null uint8
contact_telephone      25317 non-null uint8
contact_unknown        25317 non-null uint8
poutcome_failure       25317 non-null uint8
poutcome_other         25317 non-null uint8
poutcome_success       25317 non-null uint8
poutcome_unknown       25317 non-null uint8
ID                     25317 non-null int64
age                    25317 non-null int64
balance                25317 non-null int64
duration               25317 non-null int64
campaign               25317 non-null int64
pdays                  25317 non-null int64
previous               25317 non-null int64
dtypes: int64(7), uint8(44)
memory usage: 2.4 MB
In [ ]:
#查看相關矩陣,連帶y也作為指標
#data_train.corr() #查看相關矩陣熱圖 #colormap = plt.cm.RdBu #plt.figure(figsize=(39,37)) #plt.title('Correlation of Features', y=1.05, size=37) #sns.heatmap(data_train.astype(float).corr(),linewidths=0.1,vmax=1.0, # square=True, cmap=colormap, linecolor='white', annot=True) #plt.show() 
In [11]:
print("數值處理3:數值指標Scaler變換") scaler = StandardScaler() X = scaler.fit_transform(X) data_test = scaler.fit_transform(data_test) #數據分割,用於測試 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=90) 
 
數值處理3:數值指標Scaler變換
In [12]:
# print('決策樹')
# clf = DecisionTreeClassifier(random_state=11) # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # # print('隨機森林') # clf = RandomForestClassifier(n_estimators=10, random_state=11) # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) # print('邏輯回歸') # clf = LogisticRegression() # clf.fit(X_train, y_train) # predictions = clf.predict(X_test) # print(classification_report(y_test, predictions)) # print(cross_val_score(clf,X_train, y_train,scoring='f1')) # print(cross_val_score(clf,X_test, y_test,scoring='f1')) # print(clf.score(X_test, y_test)) # y_predprob = clf.predict_proba(X_test) # y_predprob = y_predprob[:, 1] # # print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions)) # print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) print('AdaBoost') adaBoost = AdaBoostClassifier(n_estimators=50, random_state=11) adaBoost.fit(X_train, y_train) predictionsByadaBoost = adaBoost.predict(X_test) print(classification_report(y_test, predictionsByadaBoost)) print(cross_val_score(adaBoost,X_train, y_train,scoring='f1')) print(cross_val_score(adaBoost,X_test, y_test,scoring='f1')) print(adaBoost.score(X_test, y_test)) pred = adaBoost.predict_proba(X_test) dataPred = pd.DataFrame(pred, columns=['pred0', 'pred']) dataPred.drop('pred0', axis=1, inplace=True) print(dataPred) y_predprob = adaBoost.predict_proba(X_test) y_predprob = y_predprob[:, 1] predictions_train = adaBoost.predict(X_train) y_predprob_train = adaBoost.predict_proba(X_train) y_predprob_train = y_predprob_train[:, 1] print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictionsByadaBoost)) print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_predprob)) print("Accuracy y_train : %.4g" % metrics.accuracy_score(y_train, predictions_train)) print("AUC Score (Train): %f" % metrics.roc_auc_score(y_train, y_predprob_train)) ID = list(range(25318,36170)) submission = pd.DataFrame(ID) submission.rename(columns = {0: 'ID'}, inplace = True) # 將pred_y從array轉化成DataFrame y_predprob_test = adaBoost.predict_proba(data_test) y_predprob_test = y_predprob_test[:, 1] y_predprob_DataFrame = pd.DataFrame(y_predprob_test) submission['pred'] =y_predprob_DataFrame submission.to_csv('Result.csv', index = False) 
 
AdaBoost
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2249
           1       1.00      1.00      1.00       283

    accuracy                           1.00      2532
   macro avg       1.00      1.00      1.00      2532
weighted avg       1.00      1.00      1.00      2532

[1.         1.         0.99943915]
[1. 1. 1.]
1.0
              pred
0     2.220446e-16
1     1.000000e+00
2     2.220446e-16
3     2.220446e-16
4     2.220446e-16
5     2.220446e-16
6     2.220446e-16
7     2.220446e-16
8     2.220446e-16
9     2.220446e-16
10    2.220446e-16
11    2.220446e-16
12    2.220446e-16
13    2.220446e-16
14    2.220446e-16
15    2.220446e-16
16    2.220446e-16
17    2.220446e-16
18    2.220446e-16
19    2.220446e-16
20    2.220446e-16
21    2.220446e-16
22    2.220446e-16
23    2.220446e-16
24    2.220446e-16
25    2.220446e-16
26    2.220446e-16
27    2.220446e-16
28    2.220446e-16
29    2.220446e-16
...            ...
2502  2.220446e-16
2503  2.220446e-16
2504  2.220446e-16
2505  2.220446e-16
2506  2.220446e-16
2507  2.220446e-16
2508  2.220446e-16
2509  2.220446e-16
2510  2.220446e-16
2511  2.220446e-16
2512  2.220446e-16
2513  2.220446e-16
2514  2.220446e-16
2515  2.220446e-16
2516  2.220446e-16
2517  2.220446e-16
2518  2.220446e-16
2519  2.220446e-16
2520  1.000000e+00
2521  2.220446e-16
2522  2.220446e-16
2523  2.220446e-16
2524  2.220446e-16
2525  2.220446e-16
2526  2.220446e-16
2527  2.220446e-16
2528  2.220446e-16
2529  2.220446e-16
2530  1.000000e+00
2531  1.000000e+00

[2532 rows x 1 columns]
Accuracy : 1
AUC Score (Test): 1.000000
Accuracy y_train : 1
AUC Score (Train): 1.000000
 
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
  warnings.warn(CV_WARNING, FutureWarning)
/opt/conda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.
  warnings.warn(CV_WARNING, FutureWarning)
 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM