```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix
```
```python
data = pd.read_csv('Algerian_forest_fires_dataset_UPDATE.csv')
data
```
| | DAY | MONTH | YEAR | TEMPERATURE | RH | WS | RAIN | FFMC | DMC | DC | ISI | BUI | FWI | CLASSES |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 6 | 2012 | 29 | 57 | 18 | 0 | 65.7 | 3.4 | 7.6 | 1.3 | 3.4 | 0.5 | not fire |
| 1 | 2 | 6 | 2012 | 29 | 61 | 13 | 1.3 | 64.4 | 4.1 | 7.6 | 1 | 3.9 | 0.4 | not fire |
| 2 | 3 | 6 | 2012 | 26 | 82 | 22 | 13.1 | 47.1 | 2.5 | 7.1 | 0.3 | 2.7 | 0.1 | not fire |
| 3 | 4 | 6 | 2012 | 25 | 89 | 13 | 2.5 | 28.6 | 1.3 | 6.9 | 0 | 1.7 | 0 | not fire |
| 4 | 5 | 6 | 2012 | 27 | 77 | 16 | 0 | 64.8 | 3 | 14.2 | 1.2 | 3.9 | 0.5 | not fire |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 242 | 26 | 9 | 2012 | 30 | 65 | 14 | 0 | 85.4 | 16 | 44.5 | 4.5 | 16.9 | 6.5 | fire |
| 243 | 27 | 9 | 2012 | 28 | 87 | 15 | 4.4 | 41.1 | 6.5 | 8 | 0.1 | 6.2 | 0 | not fire |
| 244 | 28 | 9 | 2012 | 27 | 87 | 29 | 0.5 | 45.9 | 3.5 | 7.9 | 0.4 | 3.4 | 0.2 | not fire |
| 245 | 29 | 9 | 2012 | 24 | 54 | 18 | 0.1 | 79.7 | 4.3 | 15.2 | 1.7 | 5.1 | 0.7 | not fire |
| 246 | 30 | 9 | 2012 | 24 | 64 | 15 | 0.2 | 67.3 | 3.8 | 16.5 | 1.2 | 4.8 | 0.5 | not fire |

247 rows × 14 columns
2. Data Preprocessing
2.1 Cleaning the Dataset
```python
# The dataset covers two regions, so we merge them into one for analysis,
# dropping the two region-header rows and the blank row between them.
data1 = data.iloc[0:122, :]
data2 = data.iloc[125:247, :]
data = pd.concat([data1, data2])  # data is now the cleaned dataset

feature_names = data.columns.values  # extract the column names
feature_names
```
array(['day', 'month', 'year', 'Temperature', ' RH', ' Ws', 'Rain ',
'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes '],
dtype=object)
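Note the stray whitespace in several column names (' RH', ' Ws', 'Rain ', 'Classes '). The original run leaves them as-is; a minimal cleanup sketch, if you prefer tidy names (not applied in the outputs below):

```python
# Hypothetical cleanup, not part of the original run:
# strip leading/trailing whitespace from every column name.
data.columns = data.columns.str.strip()
feature_names = data.columns.values
```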
2.2 Splitting Off the Feature Columns as X and the Target Column as Y
```python
X = data.iloc[:, 0:13]
Y = data.iloc[:, -1]

# Clean the target column: strip the stray spaces inside the labels
s = Y.values
for i in range(len(s)):
    s[i] = s[i].replace(' ', '')
print(s)
```
['notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'notfire' 'notfire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'fire' 'fire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'notfire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire'
'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'notfire' 'notfire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'notfire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'fire' 'notfire'
'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'notfire' 'notfire'
'notfire' 'fire' 'fire' 'fire' 'notfire' 'notfire' 'fire' 'fire' 'fire'
'fire' 'fire' 'fire' 'fire' 'fire' 'notfire' 'fire' 'fire' 'fire'
'notfire' 'notfire' 'fire' 'notfire' 'notfire' 'notfire' 'notfire']
```python
# Encode the cleaned labels with a LabelEncoder: 'notfire' -> 1, 'fire' -> 0
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(s)
y
```
array([1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
1, 1])
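As a quick sanity check on the encoding (not part of the original run), the fitted encoder exposes its class order directly; LabelEncoder assigns codes in sorted order, which is why 'fire' maps to 0 and 'notfire' to 1:

```python
# LabelEncoder sorts classes alphabetically: 'fire' -> 0, 'notfire' -> 1
print(le.classes_)                   # ['fire' 'notfire']
print(le.inverse_transform([0, 1]))  # ['fire' 'notfire']
```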
```python
# Wrap y in a DataFrame for display; note that scikit-learn estimators expect
# a 1-D array, so this wrap is what triggers the DataConversionWarning later.
y = pd.DataFrame(y)
y
```
| | 0 |
|---|---|
| 0 | 1 |
| 1 | 1 |
| 2 | 1 |
| 3 | 1 |
| 4 | 1 |
| ... | ... |
| 239 | 0 |
| 240 | 1 |
| 241 | 1 |
| 242 | 1 |
| 243 | 1 |

244 rows × 1 columns
```python
# 2. Randomly split into training and test sets (70% / 30%) with train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Check the sizes of the training and test sets
x_train.shape, x_test.shape, y_train.shape, y_test.shape
```
((170, 13), (74, 13), (170, 1), (74, 1))
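With only 244 rows, a purely random split can leave the two classes unevenly distributed between train and test. A stratified split (an alternative the original run does not use) keeps the fire/not-fire ratio the same in both sets:

```python
# Alternative split (not used above): stratify on y to preserve class balance
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)
```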
```python
# 3. Standardize the features so the model fits better
# (note: only x_train is transformed here; x_test stays on the raw scale,
# which matters for the logistic-regression score below)
scaler = StandardScaler().fit(x_train)
x_train = pd.DataFrame(scaler.transform(x_train))
x_train
```
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.693550 | -0.377923 | 0.0 | -0.338241 | 1.164128 | -0.203369 | -0.414083 | 0.508256 | 1.099453 | 1.786169 | -0.022055 | 1.423974 | 0.490624 |
| 1 | -0.549241 | 1.406713 | 0.0 | -0.622900 | 1.027785 | 2.027127 | 0.621124 | -1.406602 | -1.043440 | -0.895966 | -0.921917 | -1.024626 | -0.941239 |
| 2 | 1.221384 | -1.270241 | 0.0 | 1.085051 | 0.005213 | 0.168381 | -0.414083 | 0.679098 | 0.132734 | -0.347347 | 0.552856 | -0.045186 | 0.325409 |
| 3 | -0.667282 | 1.406713 | 0.0 | -0.907558 | 0.823271 | -0.203369 | 0.218544 | -1.335418 | -0.817872 | -0.900170 | -0.996906 | -0.870714 | -0.941239 |
| 4 | 1.457467 | 0.514395 | 0.0 | 0.515734 | 0.141556 | 0.168381 | -0.414083 | 0.792993 | 2.847602 | 3.350048 | 0.627845 | 3.207954 | 1.757272 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 165 | 0.040967 | -1.270241 | 0.0 | -0.907558 | 1.709500 | -0.203369 | -0.184037 | -2.196748 | -0.858152 | -0.904374 | -1.146882 | -0.905694 | -0.968775 |
| 166 | -0.431199 | 0.514395 | 0.0 | 0.800393 | -0.744673 | -0.946867 | -0.241548 | 0.216400 | 0.060230 | 0.506060 | -0.571971 | 0.255642 | -0.404291 |
| 167 | 0.395092 | -1.270241 | 0.0 | -0.053583 | 0.346070 | -0.575118 | 2.173935 | -0.972378 | -0.842040 | -0.900170 | -0.946913 | -0.898698 | -0.927471 |
| 168 | 0.749217 | 1.406713 | 0.0 | 0.231076 | 0.141556 | -0.946867 | -0.414083 | 0.757401 | 0.906109 | 1.161879 | 0.577852 | 1.074174 | 0.903661 |
| 169 | -0.903366 | -0.377923 | 0.0 | 0.231076 | 0.414242 | 1.283628 | -0.414083 | 0.522493 | -0.189505 | -0.025743 | 0.302895 | -0.115146 | 0.118890 |

170 rows × 13 columns
2. Building a Logistic Regression Model with LogisticRegression
```python
from sklearn.linear_model import LogisticRegression
model_logic = LogisticRegression(max_iter=10000).fit(x_train, y_train)
print(model_logic.score(x_test, y_test))
```
0.5
D:\Anoconda\lib\site-packages\sklearn\utils\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
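The near-chance score of 0.5 is most likely a scaling problem: only x_train was standardized above, so the model is being evaluated on a test set with completely different feature ranges. The warning comes from passing y as a one-column DataFrame. A minimal fix sketch, reusing the scaler and split from above:

```python
# Transform the test set with the scaler fitted on the training set,
# and flatten y to the 1-D shape scikit-learn expects.
x_test_scaled = pd.DataFrame(scaler.transform(x_test))
model_logic = LogisticRegression(max_iter=10000).fit(x_train, y_train.values.ravel())
print(model_logic.score(x_test_scaled, y_test.values.ravel()))
```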
3. Building a Decision Tree Model
3.1 Defining a Decision Tree Classifier and Making Predictions
```python
dtc = DecisionTreeClassifier()
# Train on the training set and its labels
dtc.fit(x_train, y_train)
# Predict on the test set
y_pre = dtc.predict(x_test)
# Accuracy on the test set: the fraction of test samples assigned the correct class
score = dtc.score(x_test, y_test)
print("Predictions:", y_pre)
print("True labels:", y_test)
print("Accuracy:", score)
```
Predictions: [0 1 0 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1
 1 1 0 1 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 1]
True labels:      0
67   0
243  1
206  0
122  1
89   0
..  ..
158  0
99   1
173  0
176  1
95   1

[74 rows x 1 columns]
Accuracy: 0.9864864864864865
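The same accuracy can be recomputed from the stored predictions, which is a handy cross-check when both score() and predict() are used; a small sketch:

```python
from sklearn.metrics import accuracy_score
# Equivalent to dtc.score(x_test, y_test); ravel() flattens the one-column DataFrame
print(accuracy_score(y_test.values.ravel(), y_pre))
```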
3.2 Building the Decision Tree Models
```python
# Build two depth-5 decision trees, one split on the 'gini' index and
# one on 'entropy' (information gain).
dt_gini = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=0)
dt_gini = dt_gini.fit(x_train, y_train)  # train on the training set

dt_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0)
dt_entropy = dt_entropy.fit(x_train, y_train)

# Inspect the two models' parameters
dt_gini, dt_entropy
```
(DecisionTreeClassifier(max_depth=5, random_state=0),
DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=0))
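Before visualizing, it can be useful to see which features the fitted trees actually rely on; a sketch using the trees' impurity-based importances (my addition, not part of the original run):

```python
# Importances of the gini tree, highest first.
# feature_names[:-1] drops the trailing 'Classes ' entry, leaving the 13 features.
importances = pd.Series(dt_gini.feature_importances_, index=feature_names[:-1])
print(importances.sort_values(ascending=False))
```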
3.3 Visualizing the Trees
```python
plt.figure(figsize=(25, 30))  # figure size: 25 inches wide, 30 inches tall
# gini model (filled=True colors the nodes by class)
plot_tree(dt_gini, filled=True, feature_names=feature_names)
# entropy model: note this call draws over the gini tree on the same axes,
# so only this second tree is actually visible (see the sketch below)
plot_tree(dt_entropy, filled=True, feature_names=feature_names)
```
[Text(558.0, 1467.72, 'FFMC <= 80.1\nentropy = 0.974\nsamples = 170\nvalue = [101, 69]'),
Text(279.0, 1141.56, 'entropy = 0.0\nsamples = 67\nvalue = [0, 67]'),
Text(837.0, 1141.56, 'ISI <= 3.05\nentropy = 0.138\nsamples = 103\nvalue = [101, 2]'),
Text(558.0, 815.4000000000001, 'Temperature <= 33.5\nentropy = 0.764\nsamples = 9\nvalue = [7, 2]'),
Text(279.0, 489.24, 'entropy = 0.0\nsamples = 5\nvalue = [5, 0]'),
Text(837.0, 489.24, 'DC <= 82.75\nentropy = 1.0\nsamples = 4\nvalue = [2, 2]'),
Text(558.0, 163.08000000000015, 'entropy = 0.0\nsamples = 2\nvalue = [0, 2]'),
Text(1116.0, 163.08000000000015, 'entropy = 0.0\nsamples = 2\nvalue = [2, 0]'),
Text(1116.0, 815.4000000000001, 'entropy = 0.0\nsamples = 94\nvalue = [94, 0]')]
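As written, the second plot_tree call overwrites the first, so only the entropy tree is shown. To see both trees, give each call its own axes; a minimal sketch, assuming the models from above:

```python
# Plot the two trees side by side on separate axes
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(25, 15))
# feature_names[:-1] drops the trailing target-column name
plot_tree(dt_gini, filled=True, feature_names=feature_names[:-1], ax=ax1)
ax1.set_title('gini')
plot_tree(dt_entropy, filled=True, feature_names=feature_names[:-1], ax=ax2)
ax2.set_title('entropy')
plt.show()
```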
3.4 Computing the Training Accuracy
```python
# Training accuracy of the gini model (score() returns accuracy, not error)
gini_train_score = dt_gini.score(x_train, y_train)
print("gini training accuracy:", gini_train_score)

# Training accuracy of the entropy model
entropy_train_score = dt_entropy.score(x_train, y_train)
print("entropy training accuracy:", entropy_train_score)
```
gini training accuracy: 1.0
entropy training accuracy: 1.0
3.5 Computing the Test Accuracy
```python
# Test accuracy of the gini model
gini_test_score = dt_gini.score(x_test, y_test)
print("gini test accuracy:", gini_test_score)

# Test accuracy of the entropy model
entropy_test_score = dt_entropy.score(x_test, y_test)
print("entropy test accuracy:", entropy_test_score)
```
gini test accuracy: 0.972972972972973
entropy test accuracy: 0.972972972972973
3.6 Plotting Learning Curves
```python
# Plot accuracy-vs-depth curves for the gini and entropy models, tree depth 1 to 30
test1 = []  # training accuracy of the gini model at each depth
test2 = []  # training accuracy of the entropy model at each depth
for i in range(30):
    clf_gini = DecisionTreeClassifier(max_depth=i+1, criterion='gini',
                                      random_state=30, splitter='random')
    clf_entropy = DecisionTreeClassifier(max_depth=i+1, criterion='entropy',
                                         random_state=30, splitter='random')
    clf_gini = clf_gini.fit(x_train, y_train)      # train the models
    clf_entropy = clf_entropy.fit(x_train, y_train)
    score1 = clf_gini.score(x_train, y_train)      # note: scored on the training set
    score2 = clf_entropy.score(x_train, y_train)
    test1.append(score1)
    test2.append(score2)

# Plot the two curves: gini in red, entropy in blue
plt.subplot(1, 2, 1)
plt.plot(range(1, 31), test1, color='red', label='gini')
plt.xlabel('max_depth')
plt.ylabel('training accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(range(1, 31), test2, color='blue', label='entropy')
plt.xlabel('max_depth')
plt.ylabel('training accuracy')
plt.legend()
plt.show()
```
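These curves are scored on the training set, so they only show how quickly the trees can memorize the data. A variant that also tracks test accuracy at each depth (my addition, not in the original run) makes overfitting visible:

```python
# Train- and test-accuracy curves over tree depth 1..30
train_acc, test_acc = [], []
for depth in range(1, 31):
    clf = DecisionTreeClassifier(max_depth=depth, criterion='gini', random_state=30)
    clf.fit(x_train, y_train)
    train_acc.append(clf.score(x_train, y_train))
    test_acc.append(clf.score(x_test, y_test))
plt.plot(range(1, 31), train_acc, color='red', label='train')
plt.plot(range(1, 31), test_acc, color='blue', label='test')
plt.xlabel('max_depth')
plt.ylabel('accuracy')
plt.legend()
plt.show()
```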
3.7 Model Evaluation and Optimization
(1) Evaluating the gini decision tree model
```python
# Predictions of the gini model on the test set
gini_pre = dt_gini.predict(x_test)

# Confusion matrix; with this encoding, label 1 ('notfire') is the positive class
cm = confusion_matrix(y_test, gini_pre, labels=[0, 1])
print(cm)
# sklearn's ravel() order is tn, fp, fn, tp
tn, fp, fn, tp = cm.ravel()
print(tn, fp, fn, tp)

accuracy = (tp + tn) / (tp + tn + fp + fn)  # accuracy
tpr = tp / (tp + fn)                        # recall (true positive rate)
fpr = fp / (fp + tn)                        # false positive rate
precision = tp / (tp + fp)                  # precision, kept in its own variable
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Recall: {:.2f}%".format(tpr * 100))
print("False positive rate: {:.2f}%".format(fpr * 100))
print("Precision: {:.2f}%".format(precision * 100))
```
[[36  1]
 [ 1 36]]
36 1 1 36
Accuracy: 97.30%
Recall: 97.30%
False positive rate: 2.70%
Precision: 97.30%
(2) Evaluating the entropy decision tree model
```python
# Predictions of the entropy model on the test set
entropy_pre = dt_entropy.predict(x_test)

# Confusion matrix; label 1 ('notfire') is again the positive class
cm = confusion_matrix(y_test, entropy_pre, labels=[0, 1])
print(cm)
# sklearn's ravel() order is tn, fp, fn, tp
tn, fp, fn, tp = cm.ravel()
print(tn, fp, fn, tp)

accuracy = (tp + tn) / (tp + tn + fp + fn)  # accuracy
tpr = tp / (tp + fn)                        # recall (true positive rate)
fpr = fp / (fp + tn)                        # false positive rate
precision = tp / (tp + fp)                  # precision
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Recall: {:.2f}%".format(tpr * 100))
print("False positive rate: {:.2f}%".format(fpr * 100))
print("Precision: {:.2f}%".format(precision * 100))
```
[[37  0]
 [ 2 35]]
37 0 2 35
Accuracy: 97.30%
Recall: 94.59%
False positive rate: 0.00%
Precision: 100.00%
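scikit-learn can also compute these metrics directly, which makes a useful cross-check on the manual arithmetic; a sketch, assuming the predictions from above:

```python
from sklearn.metrics import classification_report, recall_score, precision_score

y_true = y_test.values.ravel()  # flatten the one-column DataFrame to 1-D
# pos_label=1 treats 'notfire' as the positive class, matching the manual
# calculations above
print(classification_report(y_true, entropy_pre, target_names=['fire', 'notfire']))
print(recall_score(y_true, entropy_pre, pos_label=1))
print(precision_score(y_true, entropy_pre, pos_label=1))
```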
(3) Evaluating a classifier with cross-validation
```python
# Import the cross-validation helper
from sklearn.model_selection import cross_val_score
# Import the support vector classifier
from sklearn.svm import SVC

# SVC with a linear kernel
svc = SVC(kernel='linear')
# Score the SVC with 10-fold cross-validation
scores = cross_val_score(svc, X, y, cv=10)
print("Cross-validation scores: {}".format(scores))
```
Cross-validation scores: [0.96       1.         0.96       1.         1.         0.875
 1.         1.         0.91666667 0.95833333]
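A cross-validation run is usually summarized by its mean and spread rather than the raw fold scores; a small follow-up using the scores from above:

```python
# Summarize the 10 fold scores
print("mean accuracy: {:.3f} (+/- {:.3f})".format(scores.mean(), scores.std()))
```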
```python
plt.figure()
# Note: the scores plotted here are the SVC's cross-validation scores
plt.title('10-fold cross-validation scores')
plt.plot(range(10), scores, 'bs-')
plt.show()
```