"""Case study: event prediction with an SVM classifier.

Loads ``./ml_data/event.txt`` and predicts whether a special event occurs
in a given time slot.

Steps:
    1. Pre-process the data:
       - read the file into an array and drop the column at index 1;
       - encode every column (LabelEncoder for string columns, the custom
         DigitEncoder for numeric-string columns) and keep each fitted
         encoder so new samples can be encoded the same way;
       - split the samples into a training set and a test set.
    2. Cross-validate and train an SVC classifier.
    3. Predict on the test set and print the accuracy.
    4. Encode hand-written samples and predict their event label.

The original outline also listed a plotting step; this script does not plot.
"""
import warnings

import matplotlib.pyplot as mp
import numpy as np
import sklearn.model_selection as sm
import sklearn.preprocessing as sp
import sklearn.svm as svm

warnings.filterwarnings('ignore')


class DigitEncoder:
    """Custom encoder for columns whose values are numeric strings.

    Exposes the same three methods this script calls on LabelEncoder so
    both encoder kinds can be stored and used interchangeably.
    """

    def fit_transform(self, y):
        """Return *y* cast to 32-bit integers."""
        return y.astype('i4')

    def transform(self, y):
        """Return *y* cast to 32-bit integers."""
        return y.astype('i4')

    def inverse_transform(self, y):
        """Return *y* cast back to strings."""
        return y.astype('str')


# Read the comma-separated records.
rows = []
with open('./ml_data/event.txt', 'r') as f:
    for line in f:
        # Strip only the line terminator: slicing with [:-1] would eat a real
        # character when the last line has no trailing newline.
        record = line.rstrip('\r\n')
        if not record:
            # A blank line would otherwise add a one-field row and make the
            # array ragged.
            continue
        rows.append(record.split(','))
data = np.array(rows)

# Drop the second column.
data = np.delete(data, 1, axis=1)
print(data.shape)

# Build the input set and the output set, one encoder per column.
encoders, x, y = [], [], []
data = data.T
target_index = len(data) - 1
for index, column in enumerate(data):
    # The first value decides whether the column is treated as numeric.
    if column[0].isdigit():
        encoder = DigitEncoder()
    else:
        encoder = sp.LabelEncoder()
    encoded = encoder.fit_transform(column)
    if index < target_index:
        x.append(encoded)
    else:
        # The last column is the label to predict.
        y = encoded
    encoders.append(encoder)
x = np.array(x).T

# Split into training and test sets.
train_x, test_x, train_y, test_y = sm.train_test_split(
    x, y, test_size=0.25, random_state=7)

# Cross-validation.
model = svm.SVC(kernel='rbf', class_weight='balanced')
scores = sm.cross_val_score(
    model, train_x, train_y, cv=5, scoring='f1_weighted')
print('交叉驗證平均得分:', scores.mean())
model.fit(train_x, train_y)

# Evaluate on the held-out test set.
pred_test_y = model.predict(test_x)
print('預測精度:', (test_y == pred_test_y).sum() / test_y.size)

# Hand-written samples to predict.
samples = np.array([
    ['Tuesday', '13:30:00', '21', '23'],
    ['Thursday', '13:30:00', '21', '23'],
]).T

# Encode each feature column with the encoder fitted on that column; zip
# stops after the feature columns, so the label encoder is not used here.
test_x = np.array([
    encoder.transform(column)
    for encoder, column in zip(encoders, samples)
]).T

pred_test_y = model.predict(test_x)
# Map the predicted class ids back to the original label strings.
pred_test_y = encoders[-1].inverse_transform(pred_test_y)
print('預測結果為: ', pred_test_y)

# Sample output:
# (5040, 5)
# 交叉驗證平均得分: 0.9458699461165295
# 預測精度: 0.9476190476190476
# 預測結果為: ['noevent' 'noevent']
"""Case study: traffic-flow prediction (regression) with an SVM regressor.

Loads ``./ml_data/traffic.txt`` and predicts the value in its last column
from the preceding columns.

Steps:
    1. Pre-process the data:
       - read the file into an array (all columns are kept; the original
         outline mentioned dropping column 1, but the code does not);
       - encode every column (LabelEncoder for string columns, the custom
         DigitEncoder for numeric-string columns) and keep each fitted
         encoder so new samples can be encoded the same way;
       - split the samples into a training set and a test set.
    2. Train an SVR regressor.
    3. Predict on the test set and print the R2 score.
    4. Encode hand-written samples and predict their target value.

The original outline also listed a plotting step; this script does not plot.
"""
import warnings

import matplotlib.pyplot as mp
import numpy as np
import sklearn.metrics as mm
import sklearn.model_selection as sm
import sklearn.preprocessing as sp
import sklearn.svm as svm

warnings.filterwarnings('ignore')


class DigitEncoder:
    """Custom encoder for columns whose values are numeric strings.

    Exposes the same three methods this script calls on LabelEncoder so
    both encoder kinds can be stored and used interchangeably.
    """

    def fit_transform(self, y):
        """Return *y* cast to 32-bit integers."""
        return y.astype('i4')

    def transform(self, y):
        """Return *y* cast to 32-bit integers."""
        return y.astype('i4')

    def inverse_transform(self, y):
        """Return *y* cast back to strings."""
        return y.astype('str')


# Read the comma-separated records.
rows = []
with open('./ml_data/traffic.txt', 'r') as f:
    for line in f:
        # Strip only the line terminator: slicing with [:-1] would eat a real
        # character when the last line has no trailing newline.
        record = line.rstrip('\r\n')
        if not record:
            # A blank line would otherwise add a one-field row and make the
            # array ragged.
            continue
        rows.append(record.split(','))
data = np.array(rows)

# Build the input set and the output set, one encoder per column.
encoders, x, y = [], [], []
data = data.T
target_index = len(data) - 1
for index, column in enumerate(data):
    # The first value decides whether the column is treated as numeric.
    if column[0].isdigit():
        encoder = DigitEncoder()
    else:
        encoder = sp.LabelEncoder()
    encoded = encoder.fit_transform(column)
    if index < target_index:
        x.append(encoded)
    else:
        # The last column is the regression target.
        y = encoded
    encoders.append(encoder)
x = np.array(x).T

# Split into training and test sets.
train_x, test_x, train_y, test_y = sm.train_test_split(
    x, y, test_size=0.25, random_state=7)

# Support-vector regression; epsilon is the width of the tube around the
# prediction inside which training errors are not penalised.
model = svm.SVR(kernel='rbf', C=10, epsilon=0.2)
model.fit(train_x, train_y)

# Evaluate on the held-out test set.
pred_test_y = model.predict(test_x)
score = mm.r2_score(test_y, pred_test_y)
print('r2得分: ', score)

# Hand-written samples to predict.
samples = np.array([
    ['Tuesday', '13:30', 'San Francisco', 'yes'],
    ['Thursday', '13:30', 'San Francisco', 'no'],
]).T

# Encode each feature column with the encoder fitted on that column; zip
# stops after the feature columns, so the target encoder is not used here.
test_x = np.array([
    encoder.transform(column)
    for encoder, column in zip(encoders, samples)
]).T

pred_test_y = model.predict(test_x)
# Convert the predictions with the encoder that was fitted on the target column.
pred_test_y = encoders[-1].inverse_transform(pred_test_y)
print('預測結果為: ', pred_test_y)

# Sample output:
# r2得分: 0.6484595603352119
# 預測結果為: ['24.111978908657576' '23.61904092888905']