取UCI公共測試數據庫中澳大利亞信貸批准數據集作為本例數據集,
其擁有14個特征,1個分類標簽y(1--同意貸款,0--不同意貸款)共計690個申請者記錄
1、數據獲取
# Load the credit-approval records from the Excel workbook into a DataFrame.
import pandas as pd
data = pd.read_excel('credit.xlsx')
# Bare expression: shows the DataFrame when the cell is run interactively.
data
| | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | d |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 22.08 | 11.460 | 2 | 4 | 4 | 1.585 | 0 | 0 | 0 | 1 | 2 | 100 | 1213 | 0 |
1 | 0 | 22.67 | 7.000 | 2 | 8 | 4 | 0.165 | 0 | 0 | 0 | 0 | 2 | 160 | 1 | 0 |
2 | 0 | 29.58 | 1.750 | 1 | 4 | 4 | 1.250 | 0 | 0 | 0 | 1 | 2 | 280 | 1 | 0 |
3 | 0 | 21.67 | 11.500 | 1 | 5 | 3 | 0.000 | 1 | 1 | 11 | 1 | 2 | 0 | 1 | 1 |
4 | 1 | 20.17 | 8.170 | 2 | 6 | 4 | 1.960 | 1 | 1 | 14 | 0 | 2 | 60 | 159 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
685 | 1 | 31.57 | 10.500 | 2 | 14 | 4 | 6.500 | 1 | 0 | 0 | 0 | 2 | 0 | 1 | 1 |
686 | 1 | 20.67 | 0.415 | 2 | 8 | 4 | 0.125 | 0 | 0 | 0 | 0 | 2 | 0 | 45 | 0 |
687 | 0 | 18.83 | 9.540 | 2 | 6 | 4 | 0.085 | 1 | 0 | 0 | 0 | 2 | 100 | 1 | 1 |
688 | 0 | 27.42 | 14.500 | 2 | 14 | 8 | 3.085 | 1 | 1 | 1 | 0 | 2 | 120 | 12 | 1 |
689 | 1 | 41.00 | 0.040 | 2 | 10 | 4 | 0.040 | 0 | 1 | 1 | 0 | 1 | 560 | 1 | 1 |
690 rows × 15 columns
2、訓練樣本與測試樣本划分
# Training features are stored in x and the training target in y;
# the corresponding test samples are stored in x1 and y1.
# The first 600 rows are the training data; the last 90 rows are the test data.
# x: the first 14 columns (the features) of the first 600 rows, as a NumPy array.
x = data.iloc[:600,:14].values
x
array([[1.000e+00, 2.208e+01, 1.146e+01, ..., 2.000e+00, 1.000e+02,
1.213e+03],
[0.000e+00, 2.267e+01, 7.000e+00, ..., 2.000e+00, 1.600e+02,
1.000e+00],
[0.000e+00, 2.958e+01, 1.750e+00, ..., 2.000e+00, 2.800e+02,
1.000e+00],
...,
[1.000e+00, 3.492e+01, 2.500e+00, ..., 2.000e+00, 2.390e+02,
2.010e+02],
[1.000e+00, 2.408e+01, 8.750e-01, ..., 2.000e+00, 2.540e+02,
1.951e+03],
[1.000e+00, 3.733e+01, 6.500e+00, ..., 2.000e+00, 9.300e+01,
1.000e+00]])
# y: column index 14 (the label column) of the first 600 rows, as a NumPy array.
y = data.iloc[:600,14].values
y
array([0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0,
0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0,
1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,
0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
0, 0, 1, 1, 0, 1], dtype=int64)
# x1: the first 14 columns (the features) of the rows from index 600 onward (test set).
x1 = data.iloc[600:,:14].values
x1
array([[0.000e+00, 2.075e+01, 9.540e+00, ..., 2.000e+00, 2.000e+02,
1.001e+03],
[1.000e+00, 3.667e+01, 3.250e+00, ..., 2.000e+00, 1.020e+02,
6.400e+02],
[1.000e+00, 2.258e+01, 1.004e+01, ..., 2.000e+00, 6.000e+01,
3.970e+02],
...,
[0.000e+00, 1.883e+01, 9.540e+00, ..., 2.000e+00, 1.000e+02,
1.000e+00],
[0.000e+00, 2.742e+01, 1.450e+01, ..., 2.000e+00, 1.200e+02,
1.200e+01],
[1.000e+00, 4.100e+01, 4.000e-02, ..., 1.000e+00, 5.600e+02,
1.000e+00]])
# y1: column index 14 (the label column) of the rows from index 600 onward (test set).
y1 = data.iloc[600:,14].values
y1
array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
1, 1], dtype=int64)
3、邏輯回歸分析
# Import the logistic regression class under the alias LR
from sklearn.linear_model import LogisticRegression as LR
# Create the logistic regression object lr, allowing up to 3000 solver iterations
lr = LR(max_iter=3000)
# Call lr.fit() to train the model on the training features x and labels y
lr.fit(x,y)
LogisticRegression(max_iter=3000)
這里遇到一個問題:STOP: TOTAL NO. of ITERATIONS REACHED LIMIT......extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG
解決辦法
意思是達到限制的迭代總數,只需要增加迭代次數(最大值)或縮放數據就可以。
將代碼改為(增加迭代次數):
最大迭代次數(max_iter)默認值為100,把它改為3000即可
# Create the model with the iteration cap (max_iter) raised to 3000.
lr = LR(max_iter=3000)
# Call lr.score() to get the model accuracy
r = lr.score(x,y) # model accuracy (on the training data)
r
0.875
# Call lr.predict() on the test samples x1 to obtain the predicted labels
R = lr.predict(x1)
R
array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0], dtype=int64)
# Prediction accuracy on the test set
# Z is zero wherever the predicted label equals the true label
Z = R-y1
# Fraction of zero entries in Z, i.e. the share of correct predictions
Rs=len(Z[Z==0])/len(Z)
Rs
0.8666666666666667
# Complete script: fit a logistic regression on the first 600 credit records
# and evaluate it on the remaining rows.
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR

data = pd.read_excel('credit.xlsx')

# Split by row position first, then separate the 14 feature columns from the
# label column (column index 14).
train_rows = data.iloc[:600]
test_rows = data.iloc[600:]
x, y = train_rows.iloc[:, :14].values, train_rows.iloc[:, 14].values
x1, y1 = test_rows.iloc[:, :14].values, test_rows.iloc[:, 14].values

# Build the model with a raised iteration cap and train it.
lr = LR(max_iter=3000)
lr.fit(x, y)

# Accuracy on the data the model was trained on.
r = lr.score(x, y)
print('模型准確率(針對訓練數據):', r)

# Predict the held-out rows; a zero difference marks a correct prediction.
R = lr.predict(x1)
Z = R - y1
correct = Z[Z == 0]
Rs = len(correct) / len(Z)
print('預測結果為:', R)
print('預測准確率為:', Rs)
模型准確率(針對訓練數據): 0.875
預測結果為: [0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 1 1 0 1 1 1 0 0 0 0 0 1 0 0 1 0
0 0 0 0 1 1 0 1 0 1 0 1 1 1 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 0 1
0 0 0 0 0 1 0 1 1 0 1 1 0 1 1 0]
預測准確率為: 0.8666666666666667