在這個案例中:
1. datetime.datetime.strptime(data, '%Y-%m-%d') # 由字符串格式轉換為日期格式
2. pd.get_dummies(features) # 將數據中的文字標簽轉換為one-hot編碼形式,增加了特征的列數
3. rf.feature_importances_ 給出隨機森林中各個特征的重要性,對其進行排序后做條形圖
4. fig.autofmt_xdate(rotation=60) # 將圖中X軸的日期標簽旋轉60度
代碼:
第一步:數據讀取,通過.describe() 查看數據是否存在缺失值的情況
第二步:對年月日特征進行字符串串接,使用datetime.datetime.strptime(), 獲得日期格式作為X軸的標簽
第三步:對里面的幾個溫度特征做折線圖,fig.autofmt_xdate(rotation=60)設置日期標簽的旋轉角度,plt.style.use('fivethirtyeight') 設置畫風
第四步:使用pd.get_dummies(features) 將特征中文字類的標簽轉換為one-hot編碼形式,增加了特征的維度
第五步:提取數據的特征和樣本標簽,轉換為np.array格式
第六步:使用train_test_split 將特征和標簽分為訓練集和測試集
第七步:構建隨機森林模型進行模型的訓練和預測
第八步:進行隨機森林的可視化
第九步:使用rf.feature_importances_計算出各個特征的重要性,進行排序,然后做條形圖
第十步:根據第九步求得的特征重要性的排序結果,我們選用前兩個特征建立模型和預測
第十一步:對真實值畫折線圖plot,對模型的預測結果畫散點圖scatter,對於plot我們需要先根據時間進行排序
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Step 1: load the data set and take a first look at it.
features = pd.read_csv('data/temps.csv')
print(features.shape)
print(features.columns)
# describe() reports a per-column count, which makes missing values visible.
print(features.describe())

# Step 2: merge the year / month / day columns into datetime objects
# (e.g. 2016-02-01) that serve as the x-axis values of the plots.
years = features['year']
months = features['month']
days = features['day']
dates = [
    str(int(year)) + '-' + str(int(month)) + '-' + str(int(day))
    for year, month, day in zip(years, months, days)
]
# strptime turns each 'Y-m-d' string into a datetime.datetime.
dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]
print(dates[0:5])

# Step 3: draw the four temperature series as line plots on a 2x2 grid.
plt.style.use('fivethirtyeight')
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(ncols=2, nrows=2, figsize=(12, 12))
# Rotate the date tick labels by 60 degrees.
fig.autofmt_xdate(rotation=60)

# Each subplot sets its OWN y-label and title. The original applied every
# label/title call to ax1, leaving ax2-ax4 untitled and ax1 mislabelled.
ax1.plot(dates, features['temp_2'], linewidth=4)
ax1.set_xlabel('')
ax1.set_ylabel('temperature')
ax1.set_title('pred two temperature')

ax2.plot(dates, features['temp_1'], linewidth=4)
ax2.set_xlabel('')
ax2.set_ylabel('temperature')
ax2.set_title('pred temperature')

ax3.plot(dates, features['actual'], linewidth=4)
ax3.set_xlabel('')
ax3.set_ylabel('temperature')
ax3.set_title('today temperature')

ax4.plot(dates, features['friend'], linewidth=4)
ax4.set_xlabel('')
ax4.set_ylabel('temperature')
ax4.set_title('friend temperature')

plt.show()
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Step 4: one-hot encode every non-numeric column (adds one column per category).
features = pd.get_dummies(features)

# Step 5: split the frame into the target ('actual') and the feature matrix.
y = np.array(features['actual'])
X = features.drop('actual', axis=1)
feature_names = list(X.columns)  # kept for labelling the importance plot
X = np.array(X)

# Step 6: hold out 25% of the rows as a test set (seeded for reproducibility).
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)

# Step 7: fit a random forest and predict on the held-out rows.
# random_state is fixed so repeated runs give the same forest, matching the
# seeded split above.
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(train_X, train_y)
pred_y = rf.predict(test_X)

# Mean absolute error: mean of |prediction - truth|.
# (The original stored this in a variable called MSE, but no squaring is done.)
error = abs(pred_y - test_y)
mae = round(error.mean(), 2)
# Accuracy in percent = 100 - MAPE, i.e. mean of (1 - |error| / truth) * 100.
# NOTE: this divides by the true value, so it is undefined if a target is 0.
accuracy = round(np.mean((1 - error / test_y) * 100), 2)
print(mae, accuracy)

# Step 8 (optional, needs graphviz + `pip install pydot`): visualise one tree.
# from sklearn.tree import export_graphviz
# import pydot
#
# # Pull out one tree from the forest and export it to a dot file.
# tree = rf.estimators_[5]
# export_graphviz(tree, out_file='tree.dot', feature_names=feature_names, rounded=True, precision=1)
# (graph, ) = pydot.graph_from_dot_file('tree.dot')
# graph.write_png('tree.png')
# print('The depth of this tree is:', tree.tree_.max_depth)
#
# # Re-fit a small, depth-limited forest so a single tree is readable.
# rf_small = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
# rf_small.fit(train_X, train_y)
# tree_small = rf_small.estimators_[5]
# export_graphviz(tree_small, out_file='small_tree.dot', feature_names=feature_names, rounded=True, precision=1)
# (graph, ) = pydot.graph_from_dot_file('small_tree.dot')
# graph.write_png('small_tree.png')

# Step 9: rank the features by the importance the forest assigned to them.
features_importances = rf.feature_importances_
# Pair each feature name with its importance and sort, most important first.
features_importance_pairs = sorted(
    zip(feature_names, features_importances),
    key=lambda pair: pair[1],
    reverse=True,
)
features_importance_name = [name for name, _ in features_importance_pairs]
features_importance_val = [value for _, value in features_importance_pairs]

# Bar chart of the sorted importances, with feature names as vertical tick labels.
figure = plt.figure()
bar_positions = range(len(features_importance_name))
plt.bar(bar_positions, features_importance_val, orientation='vertical')
plt.xticks(bar_positions, features_importance_name, rotation='vertical')
plt.show()
# Step 10: the importance plot shows the top two features dominate, so
# retrain using only 'temp_1' and 'average'.
X = features.drop('actual', axis=1)
y = np.array(features['actual'])
# Keep X as a DataFrame here so columns can still be selected by name below.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)
train_x_two = train_x[['temp_1', 'average']].values
test_x_two = test_x[['temp_1', 'average']].values

# random_state is fixed so the comparison with the full model is repeatable.
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(train_x_two, train_y)
pred_y = rf.predict(test_x_two)

# Mean absolute error: mean of |prediction - truth|.
# (The original stored this in a variable called MSE, but no squaring is done.)
error = abs(pred_y - test_y)
mae = round(error.mean(), 2)
# Accuracy in percent = 100 - MAPE, i.e. mean of (1 - |error| / truth) * 100.
accuracy = round(np.mean((1 - error / test_y) * 100), 2)
print(mae, accuracy)
# The two-feature model scores about the same as the full one, so dropping
# features is a cheap way to speed up training and prediction.

# Step 11: plot actual values (line) against predictions (scatter) over time.
fig = plt.figure()
years = test_x['year']
months = test_x['month']
days = test_x['day']
dates = [
    str(int(year)) + '-' + str(int(month)) + '-' + str(int(day))
    for year, month, day in zip(years, months, days)
]
dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]
print(dates[0:5])

# The test rows are shuffled, so sort the actual values by date first;
# otherwise the line plot would zig-zag back and forth across the x-axis.
dates_test_pairs = sorted(zip(dates, test_y), key=lambda pair: pair[0], reverse=True)
dates_test_data = [date for date, _ in dates_test_pairs]
dates_test_val = [value for _, value in dates_test_pairs]

plt.plot(dates_test_data, dates_test_val, label='actual')
# A scatter plot does not need ordered x values, so the unsorted dates are fine.
plt.scatter(dates, pred_y, label='pred')
# rotation must be a number (or 'horizontal'/'vertical'); the string '60'
# is rejected by current matplotlib.
plt.xticks(rotation=60)
plt.legend()
plt.show()