''' 學習曲線:模型性能 = f(訓練集大小) 學習曲線所需API: _, train_scores, test_scores = ms.learning_curve( model, # 模型 輸入集, 輸出集, [0.9, 0.8, 0.7], # 訓練集大小序列 cv=5 # 折疊數 ) 案例:在小汽車評級案例中使用學習曲線選擇訓練集大小最優參數。 ''' import numpy as np import matplotlib.pyplot as mp import sklearn.preprocessing as sp import sklearn.ensemble as se import sklearn.model_selection as ms import sklearn.metrics as sm import warnings warnings.filterwarnings('ignore') data = [] with open('./ml_data/car.txt', 'r') as f: for line in f.readlines(): sample = line[:-1].split(',') data.append(sample) data = np.array(data) # print(data.shape) # 整理好每一列的標簽編碼器encoders # 整理好訓練輸入集與輸出集 data = data.T # print(data.shape) encoders = [] train_x, train_y = [], [] for row in range(len(data)): encoder = sp.LabelEncoder() if row < len(data) - 1: # 不是最后列 train_x.append(encoder.fit_transform(data[row])) else: # 是最后一列,作為輸出集 train_y = encoder.fit_transform(data[row]) encoders.append(encoder) train_x = np.array(train_x).T # 訓練隨機森林分類器 model = se.RandomForestClassifier(max_depth=6, n_estimators=150, random_state=7) # 繪制學習曲線 train_sizes = np.linspace(0.1, 1, 10) _, train_scores, test_scores = ms.learning_curve(model, train_x, train_y, train_sizes=train_sizes, cv=5) print(test_scores) print(np.mean(test_scores,axis=1)) # 訓練之前進行交叉驗證 cv = ms.cross_val_score(model, train_x, train_y, cv=4, scoring='f1_weighted') print(cv.mean()) model.fit(train_x, train_y) # 自定義測試集,預測小汽車的等級 # 保證每個特征使用的標簽編碼器與訓練時使用的標簽編碼器匹配 data = [ ['high', 'med', '5more', '4', 'big', 'low', 'unacc'], ['high', 'high', '4', '4', 'med', 'med', 'acc'], ['low', 'low', '2', '4', 'small', 'high', 'good'], ['low', 'med', '3', '4', 'med', 'high', 'vgood']] data = np.array(data).T test_x, test_y = [], [] for row in range(len(data)): encoder = encoders[row] # 每列對應的標簽編碼器 if row < len(data) - 1: test_x.append(encoder.transform(data[row])) # 這里需要訓練了,直接轉換 else: test_y = encoder.transform(data[row]) test_x = np.array(test_x).T pred_test_y = model.predict(test_x) print(pred_test_y) pred_test_y = encoders[-1].inverse_transform(pred_test_y) test_y = encoders[-1].inverse_transform(test_y) print(pred_test_y) print(test_y) # 畫圖顯示學習曲線 mp.figure('Learning Curve', facecolor='lightgray') mp.title('Learning Curve') mp.xlabel('train size') mp.ylabel('f1 score') mp.grid(linestyle=":") mp.plot(train_sizes, np.mean(test_scores, axis=1), label='Learning Curve') mp.legend() mp.show() 輸出結果: [[0.69942197 0.69942197 0.69942197 0.69942197 0.70348837] [0.67630058 0.79768786 0.69942197 0.71965318 0.70348837] [0.66184971 0.70231214 0.75433526 0.74855491 0.70348837] [0.71098266 0.78323699 0.74277457 0.73988439 0.7005814 ] [0.71387283 0.71965318 0.5982659 0.74277457 0.74127907] [0.71387283 0.76878613 0.70809249 0.74855491 0.73837209] [0.71387283 0.7716763 0.72254335 0.82080925 0.75872093] [0.71387283 0.76878613 0.72254335 0.83526012 0.75872093] [0.71387283 0.7716763 0.73121387 0.83526012 0.76744186] [0.73121387 0.76878613 0.72254335 0.8583815 0.86046512]] [0.70023525 0.71931039 0.71410808 0.735492 0.70316911 0.73553569 0.75752453 0.75983667 0.763893 0.78827799] 0.7477732938195376 [2 0 0 3] ['unacc' 'acc' 'acc' 'vgood'] ['unacc' 'acc' 'good' 'vgood']

