from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
from sklearn.datasets import make_regression
隨機森林
from sklearn.ensemble import RandomForestRegressor
X, y = make_regression(n_features=4, n_informative=2,
random_state=0, shuffle=False)
plt.plot(X)
plt.plot(y)
X_train=X[:70]
y_train=y[:70]
X_test=X[:30]
y_test=y[:30]
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train)
# 預測
# 測試集
y_pred=regr.predict(X_test)
print(y_pred)
[ 41.71152007 -15.51877479 18.77435453 2.4613485 -5.25163664
11.98242971 -28.99147231 67.82781115 -46.47813223 58.94403962
-44.43019803 -25.35127762 -27.46837011 -31.48276853 17.81715876
-25.42572978 -16.172543 -20.43062853 -20.84673413 -30.25425251
17.90104445 67.70073552 28.81417535 33.29761523 40.28058259
-22.61219493 34.50175346 68.835082 38.18859153 -6.48249831]
# 繪制y_test曲線
# 創建t變量
t = np.arange(len(X_test))
plt.plot(t, y_test, 'r', linewidth=2, label='真實值')
# 繪制y_pred曲線
plt.plot(t, y_pred, 'g', linewidth=2, label='預測值')
# 返回擬合優度the coefficient of determination
regr.score(X_test,y_test)
0.8338446596824768
# mse
# https://blog.csdn.net/xiaohutong1991/article/details/108178143?spm=1001.2101.3001.6650.11&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ERate-11.pc_relevant_default&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ERate-11.pc_relevant_default&utm_relevant_index=14
metrics.mean_squared_error(y_test, y_pred)
334.42748631188385
regr.set_params()
RandomForestRegressor(max_depth=2, random_state=0)
regr.feature_importances_
array([0.15597865, 0.84082089, 0. , 0.00320046])
調優——k折交叉驗證,scikit-learn的網格搜索GridSearchCV
# param_grid = {"n_estimators":[5,10,50,100,200,500],"max_depth":[5,10,50,100,200,500]}
param_grid = {"n_estimators":[5,50,100],"max_depth":[8,9,10]}
# 調用scikit-learn的網格搜索,傳入參數選擇范圍,並且制定隨機森林回歸算法,cv = 5表示5折交叉驗證
grid_search = GridSearchCV(RandomForestRegressor(),param_grid,cv = 3)
grid_search.fit(X_train, y_train)
GridSearchCV(cv=3, estimator=RandomForestRegressor(),
param_grid={'max_depth': [8, 9, 10], 'n_estimators': [5, 50, 100]})
y_pred=grid_search.predict(X_test)
print(y_pred)
[ 49.50191561 -0.7122897 15.26286215 17.50407347 15.87708862
-14.54908528 -13.32531612 80.64244515 -75.54860534 63.84753325
-68.76733049 -27.15074728 -34.90857798 -45.24935823 16.53953061
-25.26432862 -10.65729336 -18.79136562 -19.30815651 -38.14527267
6.93420609 88.31726657 16.87408796 34.57068077 53.79849864
-9.89424185 39.75832876 87.18227999 45.21303975 13.54728708]
plt.figure(figsize=(15, 10))
# 創建t變量
t = np.arange(len(X_test))
# 繪制y_test曲線
plt.plot(t, y_test, 'r', linewidth=2, label='真實值')
# 繪制y_hat曲線
plt.plot(t, y_pred, 'g', linewidth=2, label='預測值')
# 設置圖例
plt.legend()
plt.show()
# 擬合優度R2
print("r2:", grid_search.score(X_test, y_test))
r2: 0.9866915026043963
# 用Scikit_learn計算MSE
print("MSE:", metrics.mean_squared_error(y_test, y_pred))
MSE: 26.786543978030636
grid_search.set_params()
GridSearchCV(cv=3, estimator=RandomForestRegressor(),
param_grid={'max_depth': [8, 9, 10], 'n_estimators': [5, 50, 100]})
print(grid_search.best_params_)
{'max_depth': 9, 'n_estimators': 50}
調優——k折交叉驗證+逐個參數
superpa = []
for i in range(10,200,10):
regr = RandomForestRegressor(n_estimators=i
,random_state=42
)
regr_s = cross_val_score(regr
,X_train
,y_train
,cv=10
#,scoring='roc_auc'
).mean()# 評估指標
superpa.append(regr_s)
print(max(superpa),superpa.index(max(superpa)),(superpa.index(max(superpa)))*10+10)#輸出最大值及其索引
plt.figure(figsize=[20,5])
plt.plot(range(10,200,10),superpa)#橫縱坐標
plt.show()
# max_depth的學習曲線
superpa = []
for i in range(10,30,2):
regr = RandomForestRegressor(n_estimators=170
,max_depth=i
,random_state=42
)
regr_s = cross_val_score(regr
,X_train
,y_train
,cv=10
#,scoring='roc_auc'
).mean()# 評估指標
superpa.append(regr_s)
print(max(superpa),superpa.index(max(superpa)),(superpa.index(max(superpa)))*2+10)#輸出最大值及其索引
plt.figure(figsize=[20,5])
plt.plot(range(10,30,2),superpa)#橫縱坐標
plt.show()
# min_samples_split的學習曲線,分割內部節點所需的最小樣本數
superpa = []
for i in range(2,10,2):
regr = RandomForestRegressor(n_estimators=170
,max_depth=12
,min_samples_split=i
,random_state=42
,n_jobs=-1)
regr_s = cross_val_score(regr
,X_train
,y_train
,cv=10
#,scoring='roc_auc'
).mean()# 評估指標
superpa.append(regr_s)
print(max(superpa),superpa.index(max(superpa)),(superpa.index(max(superpa)))*2+2)#輸出最大值及其索引
plt.figure(figsize=[20,5])
plt.plot(range(2,10,2),superpa)#橫縱坐標
plt.show()
# min_samples_leaf的學習曲線
superpa = []
for i in range(1,15,1):
regr = RandomForestRegressor(n_estimators=170
,max_depth=12
,min_samples_split=2
,min_samples_leaf=i
,random_state=42
)
regr_s = cross_val_score(regr
,X_train
,y_train
,cv=10
#,scoring='roc_auc'
).mean()# 評估指標
superpa.append(regr_s)
print(max(superpa),superpa.index(max(superpa)),(superpa.index(max(superpa)))*1+1)#輸出最大值及其索引
plt.figure(figsize=[20,5])
plt.plot(range(1,15,1),superpa)#橫縱坐標
plt.show()
# max_features的學習曲線,每棵樹用到的最大特征數
#from sklearn.model_selection import GridSearchCV
param_grid = {'max_features':np.arange(3, 11, 1)}
regr = RandomForestRegressor(n_estimators=170
,max_depth=12
,min_samples_split=2
,min_samples_leaf=1
,random_state=42
)
GS = GridSearchCV(regr,param_grid,cv=10)
GS.fit(X_train, y_train)
print(GS.best_params_)
#print(GS.best_score_)
{'max_features': 4}
最終模型:
regr = RandomForestRegressor(n_estimators=170
,max_depth=12
,min_samples_split=2
,min_samples_leaf=1
,random_state=42
)
# 返回擬合優度the coefficient of determination
regr.score(X_test,y_test)
0.9879834206877871
# mse
metrics.mean_squared_error(y_test, y_pred)
24.275331856914285
regr.set_params()
RandomForestRegressor(max_depth=12, n_estimators=170, random_state=42)
regr.feature_importances_
array([0.21877631, 0.7564047 , 0.01129733, 0.01352166])
plt.figure(figsize=(15, 10))
# 創建t變量
t = np.arange(len(X_test))
# 繪制y_test曲線
plt.plot(t, y_test, 'r', linewidth=2, label='真實值')
# 繪制y_hat曲線
plt.plot(t, y_pred, 'g', linewidth=2, label='預測值')
# 設置圖例
plt.legend()
plt.show()