1、決策樹算法是一種非參數的決策算法,它根據數據的不同特征進行多層次的分類和判斷,最終決策出所需要預測的結果。它既可以解決分類問題,也可以解決回歸問題,具有很好的解釋能力。另外,決策樹有多種構建方式,不同構建方式的出發點主要在於:在決策樹的每一個決策點上,需要選擇在哪個維度上進行划分,以及在該維度的哪個閾值上做划分等細節問題。
具體在sklearn中調用決策樹算法解決分類問題和回歸問題的程序代碼如下所示:
# 1-1 Load the base training data set
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

d = datasets.load_iris()
x = d.data[:, 2:]  # keep only the feature columns from index 2 onwards
y = d.target

# Draw one scatter series per class label so that each class gets its own color.
plt.figure()
for cls_label, cls_color in ((0, "r"), (1, "g"), (2, "b")):
    cls_mask = y == cls_label
    plt.scatter(x[cls_mask, 0], x[cls_mask, 1], color=cls_color)
plt.show()
# 1-2 Train and use sklearn's decision tree classifier on the data loaded above
from sklearn.tree import DecisionTreeClassifier

# Classifier hyper-parameters: split on information entropy, tree depth capped at 2.
dt1 = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=2,
)
dt1.fit(x, y)
def plot_decision_boundary(model, axis):
    """Shade the regions that ``model`` predicts for each point of a 2-D grid.

    ``axis`` is read as ``[x_min, x_max, y_min, y_max]``. The grid holds 100
    sample points per unit of length along each direction. The filled contour
    is drawn with ``plt.contourf`` on the current figure; nothing is returned.
    """
    x_min, x_max = axis[0], axis[1]
    y_min, y_max = axis[2], axis[3]

    # Grid resolution: 100 points per axis unit.
    horizontal = np.linspace(x_min, x_max, int((x_max - x_min) * 100)).reshape(-1, 1)
    vertical = np.linspace(y_min, y_max, int((y_max - y_min) * 100)).reshape(-1, 1)
    x0, x1 = np.meshgrid(horizontal, vertical)

    # Predict a label for every grid point, then fold the labels back into grid shape.
    grid_points = np.c_[x0.ravel(), x1.ravel()]
    predicted = model.predict(grid_points)
    zz = predicted.reshape(x0.shape)

    from matplotlib.colors import ListedColormap

    palette = ListedColormap(["#EF9A9A", "#FFF59D", "#90CAF9"])
    plt.contourf(x0, x1, zz, cmap=palette)
# Decision regions of the entropy-based tree, with the training samples on top.
plot_decision_boundary(dt1, axis=[0.5, 8, 0, 3])
for cls_label, cls_color in ((0, "r"), (1, "g"), (2, "b")):
    cls_mask = y == cls_label
    plt.scatter(x[cls_mask, 0], x[cls_mask, 1], color=cls_color)
plt.show()
# Information entropy of a two-class problem, i.e. np.sum(-p*np.log(p)) over both classes
def entropy(p):
    """Return the binary entropy ``-p*log(p) - (1-p)*log(1-p)`` (natural log).

    ``p`` may be a scalar or an array of probabilities. At the end points
    ``p == 0`` and ``p == 1`` the result is 0 (the limit of ``t*log(t)`` as
    ``t -> 0``) instead of NaN. Values outside ``[0, 1]`` still yield NaN.
    """
    p = np.asarray(p)
    q = 1.0 - p
    # np.where evaluates both branches, so log(0) is still computed for the
    # end points; silence those warnings and mask the resulting NaN with 0.
    with np.errstate(divide="ignore", invalid="ignore"):
        h_p = np.where(p == 0, 0.0, -p * np.log(p))
        h_q = np.where(q == 0, 0.0, -q * np.log(q))
    return h_p + h_q
# Plot the binary entropy curve over 100 probabilities between 0.01 and 0.99.
x1 = np.linspace(start=0.01, stop=0.99, num=100)
y1 = entropy(x1)
plt.plot(x1, y1, "r")
plt.show()
# Split the data with the information-entropy principle: a from-scratch
# implementation of how an entropy-based decision tree chooses its splits
def split(x, y, d, value):
    """Partition ``x`` and ``y`` by comparing column ``d`` of ``x`` with ``value``.

    Returns ``(x_left, x_right, y_left, y_right)`` where the left part holds
    the rows with ``x[:, d] <= value`` and the right part the rows with
    ``x[:, d] > value``.
    """
    column = x[:, d]
    left_mask = column <= value
    right_mask = column > value
    x_left, x_right = x[left_mask], x[right_mask]
    y_left, y_right = y[left_mask], y[right_mask]
    return x_left, x_right, y_left, y_right
from collections import Counter
def entropy(y):
    """Return the information entropy (natural log) of the labels in ``y``.

    Each distinct label contributes ``-p*log(p)``, with ``p`` its relative
    frequency in ``y``. An empty ``y`` gives ``0.0``.
    """
    total = len(y)
    result = 0.0
    for count in Counter(y).values():
        prob = count / total
        result -= prob * np.log(prob)
    return result
def try_spit(x, y):
    """Search all features and thresholds for the split with the lowest entropy.

    For every column ``d`` of ``x`` the midpoints between consecutive distinct
    sorted values are tried as thresholds. Each candidate is scored by the
    sample-weighted average entropy of the two children,
    ``(n_left * H(left) + n_right * H(right)) / n``. A plain sum of the child
    entropies would favour splitting off tiny pure groups.

    Returns ``(best_entropy, best_d, best_v)``; when no valid split exists the
    result is ``(inf, -1, -1)``.
    """
    n_samples = len(x)
    best_entropy = float("inf")
    best_d, best_v = -1, -1
    for d in range(x.shape[1]):
        sorted_index = np.argsort(x[:, d])
        for i in range(1, n_samples):
            lower = x[sorted_index[i - 1], d]
            upper = x[sorted_index[i], d]
            # Equal neighbours cannot be separated by a threshold.
            if lower != upper:
                v = (lower + upper) / 2
                x_l, x_r, y_l, y_r = split(x, y, d, v)
                # Weight each child's entropy by its share of the samples.
                e = (len(y_l) * entropy(y_l) + len(y_r) * entropy(y_r)) / n_samples
                if e < best_entropy:
                    best_entropy, best_d, best_v = e, d, v
    return best_entropy, best_d, best_v
# Run the exhaustive search once and reuse its result; the search scans every
# feature and threshold, so repeating it per returned field is wasted work.
best_entropy, best_d, best_v = try_spit(x, y)
print((best_entropy, best_d, best_v))
x_l, x_r, y_l, y_r = split(x, y, best_d, best_v)
# Entropy of each side of the best split.
print(entropy(y_l))
print(entropy(y_r))
# Build the decision tree with the Gini-impurity criterion
from sklearn.tree import DecisionTreeClassifier

# Classifier hyper-parameters: Gini criterion, tree depth capped at 2.
dt2 = DecisionTreeClassifier(
    criterion="gini",
    max_depth=2,
)
dt2.fit(x, y)

# Decision regions of the Gini-based tree, with the training samples on top.
plot_decision_boundary(dt2, axis=[0.5, 8, 0, 3])
for cls_label, cls_color in ((0, "r"), (1, "g"), (2, "b")):
    cls_mask = y == cls_label
    plt.scatter(x[cls_mask, 0], x[cls_mask, 1], color=cls_color)
plt.show()
def split(x, y, d, value):
    """Cut ``x``/``y`` in two using the threshold ``value`` on column ``d``.

    Rows with ``x[:, d] <= value`` go to the left part, rows with
    ``x[:, d] > value`` go to the right part. The return order is
    ``(x_left, x_right, y_left, y_right)``.
    """
    feature = x[:, d]
    goes_left = feature <= value
    goes_right = feature > value
    return x[goes_left], x[goes_right], y[goes_left], y[goes_right]
from collections import Counter
def gini(y):
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels in ``y``.

    ``p_k`` is the relative frequency of each distinct label. An empty ``y``
    gives ``1.0`` because no term is subtracted.
    """
    total = len(y)
    impurity = 1.0
    for count in Counter(y).values():
        share = count / total
        impurity -= share ** 2
    return impurity
def try_spit1(x, y):
    """Search all features and thresholds for the split with the lowest Gini impurity.

    For every column ``d`` of ``x`` the midpoints between consecutive distinct
    sorted values are tried as thresholds. Each candidate is scored by the
    sample-weighted average Gini impurity of the two children,
    ``(n_left * G(left) + n_right * G(right)) / n``. A plain sum of the child
    impurities would favour splitting off tiny pure groups.

    Returns the list ``[best_gini, best_d, best_v]``; when no valid split
    exists the result is ``[inf, -1, -1]``.
    """
    n_samples = len(x)
    best_gini = float("inf")
    best_d, best_v = -1, -1
    for d in range(x.shape[1]):
        sorted_index = np.argsort(x[:, d])
        for i in range(1, n_samples):
            lower = x[sorted_index[i - 1], d]
            upper = x[sorted_index[i], d]
            # Equal neighbours cannot be separated by a threshold.
            if lower != upper:
                v = (lower + upper) / 2
                x_l, x_r, y_l, y_r = split(x, y, d, v)
                # Weight each child's impurity by its share of the samples.
                g = (len(y_l) * gini(y_l) + len(y_r) * gini(y_r)) / n_samples
                if g < best_gini:
                    best_gini, best_d, best_v = g, d, v
    return [best_gini, best_d, best_v]
# Best Gini-based split for the current data: impurity, feature index, threshold.
gini_search = try_spit1(x, y)
best_gini, best_d, best_v = gini_search
print(*gini_search)
# Prune the decision tree to reduce over-fitting as much as possible
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# make_moons generates 100 samples by default.
x, y = datasets.make_moons(noise=0.25, random_state=666)
print(x.shape)
print(y.shape)

plt.figure()
for cls_label, cls_color in ((0, "r"), (1, "g")):
    cls_mask = y == cls_label
    plt.scatter(x[cls_mask, 0], x[cls_mask, 1], color=cls_color)
plt.show()

from sklearn.tree import DecisionTreeClassifier

# The main pruning hyper-parameters of a decision tree. The criterion defaults
# to Gini; without such limits the tree keeps splitting until the Gini
# impurity reaches 0.
dt2 = DecisionTreeClassifier(
    max_depth=2,
    min_samples_split=10,
    min_samples_leaf=6,
    max_leaf_nodes=4,
)
dt2.fit(x, y)

plot_decision_boundary(dt2, axis=[-2, 3, -1, 1.5])
for cls_label, cls_color in ((0, "r"), (1, "g")):
    cls_mask = y == cls_label
    plt.scatter(x[cls_mask, 0], x[cls_mask, 1], color=cls_color)
plt.show()
# Use a decision tree to solve a regression problem
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# load_boston was removed in scikit-learn 1.2, where accessing it raises
# ImportError. Fall back to the bundled diabetes regression data set so the
# example keeps running on newer versions.
try:
    d = datasets.load_boston()
except (ImportError, AttributeError):
    print("load_boston is unavailable in this scikit-learn version; using load_diabetes instead")
    d = datasets.load_diabetes()
x = d.data
y = d.target
print(x.shape)
print(y.shape)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=666)

from sklearn.tree import DecisionTreeRegressor
dr = DecisionTreeRegressor()
dr.fit(x_train, y_train)
print(dr.score(x_test, y_test))
# R^2 on the training set is 1 while the test score is clearly lower, i.e. the
# unconstrained tree has over-fitted; a learning curve shows this well.
print(dr.score(x_train, y_train))
#繪制不同參數組合情況下的學習曲線
from sklearn.metrics import mean_squared_error
def plot_learning_curve(algo, x_train, x_test, y_train, y_test):
    """Plot train/test RMSE of ``algo`` as the training set grows.

    ``algo`` is refitted on the first ``i`` training samples for every
    ``i`` from 1 up to and including ``len(x_train)``, so the passed
    estimator is left fitted on the full training set. For each size the
    root mean squared error on the samples used for fitting and on the whole
    test set is recorded. Both curves are drawn on a new figure and shown;
    nothing is returned.
    """
    # Include len(x_train) itself so the last point uses every training sample.
    sizes = list(range(1, len(x_train) + 1))
    train_score = []
    test_score = []
    for i in sizes:
        algo.fit(x_train[:i], y_train[:i])
        y_train_pre = algo.predict(x_train[:i])
        y_test_pre = algo.predict(x_test)
        train_score.append(mean_squared_error(y_train[:i], y_train_pre))
        test_score.append(mean_squared_error(y_test, y_test_pre))
    plt.figure()
    # The scores are MSE values; take the square root to plot RMSE.
    plt.plot(sizes, np.sqrt(train_score), "g", label="train_error")
    plt.plot(sizes, np.sqrt(test_score), "r", label="test_error")
    plt.legend()
    # Optionally clamp the view, e.g. plt.axis([0, len(x_train) + 1, 0, 5]).
    plt.show()
# Learning curves for three tree depths:
#   max_depth=1  -> under-fitting
#   max_depth=5  -> reasonably good fit
#   max_depth=15 -> over-fitting
for tree_depth in (1, 5, 15):
    plot_learning_curve(
        DecisionTreeRegressor(max_depth=tree_depth),
        x_train,
        x_test,
        y_train,
        y_test,
    )