回歸預測之隨機森林——運行+調優

本文轉載自查看原文 2022-02-18 20:27 1637 python/ 科研隨筆

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# Matplotlib text setup: select the SimHei font for sans-serif text (the plot
# labels in this script are Chinese) and turn off the Unicode minus glyph for
# axis tick labels.
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

from sklearn.datasets import make_regression

隨機森林

from sklearn.ensemble import RandomForestRegressor

# Synthetic regression problem: 4 feature columns, 2 of them informative.
# n_samples is not passed, so the library default applies.
X, y = make_regression(
    n_features=4,
    n_informative=2,
    shuffle=False,
    random_state=0,
)

# Quick visual check of the raw feature columns and of the target.
for series in (X, y):
    plt.plot(series)

# Hold-out split: the first 70 rows train the model, the remaining rows
# evaluate it.  The test slice has to start where the training slice ends;
# taking X[:30] would reuse training rows and inflate every test metric.
# NOTE: the numeric outputs quoted in this article were produced with the
# overlapping X[:30] split, so fresh runs will report different values.
n_train = 70
X_train = X[:n_train]
y_train = y[:n_train]

X_test = X[n_train:]
y_test = y[n_train:]

# Baseline forest: trees limited to depth 2, fixed seed.  fit() returns the
# estimator itself, so construction and training are chained.
regr = RandomForestRegressor(max_depth=2, random_state=0).fit(X_train, y_train)

# Predict the test set and print the raw predictions.
y_pred = regr.predict(X_test)
print(y_pred)

[ 41.71152007 -15.51877479 18.77435453 2.4613485 -5.25163664
11.98242971 -28.99147231 67.82781115 -46.47813223 58.94403962
-44.43019803 -25.35127762 -27.46837011 -31.48276853 17.81715876
-25.42572978 -16.172543 -20.43062853 -20.84673413 -30.25425251
17.90104445 67.70073552 28.81417535 33.29761523 40.28058259
-22.61219493 34.50175346 68.835082 38.18859153 -6.48249831]

# Compare ground truth and baseline predictions on a figure of their own, so
# the curves are not drawn over whatever was plotted earlier.
plt.figure()
# x axis: one integer position per test sample
t = np.arange(len(X_test))
# True targets in red
plt.plot(t, y_test, 'r', linewidth=2, label='真實值')
# Predicted targets in green
plt.plot(t, y_pred, 'g', linewidth=2, label='預測值')
# The label= texts only appear once a legend is requested.
plt.legend()
plt.show()

# Goodness of fit: the coefficient of determination (R^2) on the test set
regr.score(X_test,y_test)

0.8338446596824768

# Mean squared error (MSE) between the true and the predicted test targets
# Reference: https://blog.csdn.net/xiaohutong1991/article/details/108178143?spm=1001.2101.3001.6650.11&utm_medium=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ERate-11.pc_relevant_default&depth_1-utm_source=distribute.pc_relevant.none-task-blog-2%7Edefault%7EBlogCommendFromBaidu%7ERate-11.pc_relevant_default&utm_relevant_index=14
metrics.mean_squared_error(y_test, y_pred)

334.42748631188385

# No parameters are passed, so nothing is changed; the call is used here only
# to display the estimator (its repr is the output shown below).
regr.set_params()

RandomForestRegressor(max_depth=2, random_state=0)

# Importance score of each input feature in the fitted forest
# (one value per feature column, see the array printed below)
regr.feature_importances_

array([0.15597865, 0.84082089, 0. , 0.00320046])

調優——k折交叉驗證，scikit-learn的網格搜索GridSearchCV

# Candidate hyper-parameters.  A wider grid that can be tried instead:
# {"n_estimators": [5, 10, 50, 100, 200, 500], "max_depth": [5, 10, 50, 100, 200, 500]}
param_grid = {"n_estimators": [5, 50, 100], "max_depth": [8, 9, 10]}

# scikit-learn grid search over param_grid for a random-forest regressor,
# scored with 3-fold cross-validation (cv=3).  The forest is seeded so that
# repeated runs select the same parameters.
grid_search = GridSearchCV(RandomForestRegressor(random_state=0), param_grid, cv=3)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
param_grid={'max_depth': [8, 9, 10], 'n_estimators': [5, 50, 100]})

# Predict the test set through the fitted grid search and print the result
y_pred=grid_search.predict(X_test)
print(y_pred)

[ 49.50191561 -0.7122897 15.26286215 17.50407347 15.87708862
-14.54908528 -13.32531612 80.64244515 -75.54860534 63.84753325
-68.76733049 -27.15074728 -34.90857798 -45.24935823 16.53953061
-25.26432862 -10.65729336 -18.79136562 -19.30815651 -38.14527267
6.93420609 88.31726657 16.87408796 34.57068077 53.79849864
-9.89424185 39.75832876 87.18227999 45.21303975 13.54728708]

# Large figure comparing the true test targets with the current predictions.
plt.figure(figsize=(15, 10))
# x axis: one integer position per test sample
t = np.arange(len(X_test))
# (values, line colour, legend label) for each curve, drawn in this order
for series, colour, caption in ((y_test, 'r', '真實值'), (y_pred, 'g', '預測值')):
    plt.plot(t, series, colour, linewidth=2, label=caption)
# Show the legend built from the label= texts
plt.legend()
plt.show()

# Goodness of fit (R^2) of the grid-searched model on the test set
print("r2:", grid_search.score(X_test, y_test))

r2: 0.9866915026043963

 # Compute the mean squared error (MSE) with scikit-learn's metrics module
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

MSE: 26.786543978030636

# No parameters are passed, so nothing is changed; the call is used here only
# to display the search object (its repr is the output shown below).
grid_search.set_params()

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
param_grid={'max_depth': [8, 9, 10], 'n_estimators': [5, 50, 100]})

# Parameter combination selected by the grid search
print(grid_search.best_params_)

{'max_depth': 9, 'n_estimators': 50}

調優——k折交叉驗證+逐個參數

# Learning curve for n_estimators: mean 10-fold cross-validation score for
# each forest size from 10 to 190 in steps of 10.
candidates = range(10, 200, 10)
superpa = [
    cross_val_score(
        RandomForestRegressor(n_estimators=n_trees, random_state=42),
        X_train,
        y_train,
        cv=10,
    ).mean()
    for n_trees in candidates
]

# Best mean score, its position in the list, and the n_estimators value there
best_score = max(superpa)
best_pos = superpa.index(best_score)
print(best_score, best_pos, candidates[best_pos])

# Score (y) against number of trees (x)
plt.figure(figsize=[20, 5])
plt.plot(candidates, superpa)
plt.show()

# Learning curve for max_depth: mean 10-fold cross-validation score for depths
# 10, 12, ..., 28 with the number of trees held at 170.
candidates = range(10, 30, 2)
superpa = [
    cross_val_score(
        RandomForestRegressor(n_estimators=170, max_depth=depth, random_state=42),
        X_train,
        y_train,
        cv=10,
    ).mean()
    for depth in candidates
]

# Best mean score, its position in the list, and the max_depth value there
best_score = max(superpa)
best_pos = superpa.index(best_score)
print(best_score, best_pos, candidates[best_pos])

# Score (y) against max_depth (x)
plt.figure(figsize=[20, 5])
plt.plot(candidates, superpa)

plt.show()

# Learning curve for min_samples_split (minimum number of samples required to
# split an internal node): mean 10-fold cross-validation score for 2, 4, 6, 8.
candidates = range(2, 10, 2)
superpa = [
    cross_val_score(
        RandomForestRegressor(
            n_estimators=170,
            max_depth=12,
            min_samples_split=min_split,
            random_state=42,
            n_jobs=-1,
        ),
        X_train,
        y_train,
        cv=10,
    ).mean()
    for min_split in candidates
]

# Best mean score, its position in the list, and the min_samples_split there
best_score = max(superpa)
best_pos = superpa.index(best_score)
print(best_score, best_pos, candidates[best_pos])

# Score (y) against min_samples_split (x)
plt.figure(figsize=[20, 5])
plt.plot(candidates, superpa)
plt.show()

# Learning curve for min_samples_leaf: mean 10-fold cross-validation score for
# every value from 1 to 14, other parameters held fixed.
candidates = range(1, 15, 1)
superpa = [
    cross_val_score(
        RandomForestRegressor(
            n_estimators=170,
            max_depth=12,
            min_samples_split=2,
            min_samples_leaf=min_leaf,
            random_state=42,
        ),
        X_train,
        y_train,
        cv=10,
    ).mean()
    for min_leaf in candidates
]

# Best mean score, its position in the list, and the min_samples_leaf there
best_score = max(superpa)
best_pos = superpa.index(best_score)
print(best_score, best_pos, candidates[best_pos])

# Score (y) against min_samples_leaf (x)
plt.figure(figsize=[20, 5])
plt.plot(candidates, superpa)
plt.show()

# Grid search over max_features, the number of features considered when
# looking for each split.  Candidates run from 1 up to the number of columns
# in X_train; a fixed 3..10 range would skip the small values and request more
# features than this 4-column data set provides.
param_grid = {'max_features': np.arange(1, X_train.shape[1] + 1)}

regr = RandomForestRegressor(
    n_estimators=170,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
# 10-fold cross-validated search over the candidates above
GS = GridSearchCV(regr, param_grid, cv=10)
GS.fit(X_train, y_train)

# Selected max_features value
print(GS.best_params_)

{'max_features': 4}

最終模型：

# Final model, built with the hyper-parameter values chosen above.
regr = RandomForestRegressor(
    n_estimators=170,
    max_depth=12,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
# A freshly constructed estimator is untrained: it has to be fitted before it
# can be scored or its feature importances read.
regr.fit(X_train, y_train)
# Refresh y_pred so the MSE and the plot that follow describe this model
# rather than predictions left over from an earlier one.
y_pred = regr.predict(X_test)

# Goodness of fit: the coefficient of determination (R^2) on the test set
regr.score(X_test, y_test)

0.9879834206877871

# Mean squared error (MSE) between y_test and the current contents of y_pred
metrics.mean_squared_error(y_test, y_pred)

24.275331856914285

# No parameters are passed, so nothing is changed; the call is used here only
# to display the estimator (its repr is the output shown below).
regr.set_params()

RandomForestRegressor(max_depth=12, n_estimators=170, random_state=42)

# Importance score of each input feature
# (one value per feature column, see the array printed below)
regr.feature_importances_

array([0.21877631, 0.7564047 , 0.01129733, 0.01352166])

# Large figure comparing the true test targets with the current predictions.
plt.figure(figsize=(15, 10))
# x axis: one integer position per test sample
t = np.arange(len(X_test))
# (values, line colour, legend label) for each curve, drawn in this order
for series, colour, caption in ((y_test, 'r', '真實值'), (y_pred, 'g', '預測值')):
    plt.plot(t, series, colour, linewidth=2, label=caption)
# Show the legend built from the label= texts
plt.legend()
plt.show()

免責聲明！

本站轉載的文章僅供個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找：回歸預測之XGBoost——運行+調優；RandomForest 隨機森林算法與模型參數的調優；隨機森林（分類與回歸）；回歸預測分析(RANSAC、多項式回歸、殘差圖、隨機森林)；機器學習之路：python 集成回歸模型 隨機森林回歸RandomForestRegressor 極端隨機森林回歸ExtraTreesRegressor GradientBoostingRegressor回歸預測波士頓房價；機器學習——用邏輯回歸及隨機森林實現泰坦尼克號的生存預測；pyspark RandomForestRegressor 隨機森林回歸；隨機森林回歸器學習；MATLAB隨機森林回歸模型；拓端數據|R語言隨機森林RandomForest、邏輯回歸Logistic預測心臟病數據和可視化分析