加載sklearn中的人臉數據集
# Load the Labeled Faces in the Wild dataset; scikit-learn downloads it on
# first use and afterwards reads it from its local data directory.
from sklearn.datasets import fetch_lfw_people

faces = fetch_lfw_people()
執行上面的第二行程序,Python會從網上下載Labeled Faces in the Wild(LFW)人臉數據集,這個數據集大概200M,因為牆的原因下載很慢,而且經常失敗。
使用百度雲下載該數據集,是個.tgz的壓縮包
把下載好的壓縮包放入C:\Users\Tim\scikit_learn_data\lfw_home,其中Tim是我的用戶名,再次運行faces = fetch_lfw_people(),成功,jupyter notebook中的輸出如下:
from sklearn.datasets import fetch_lfw_people

# Keep only the people who have at least 60 pictures in the dataset.
faces = fetch_lfw_people(min_faces_per_person=60)

# Names of the retained people, then the (n_samples, height, width) of the images.
print(faces.target_names)
print(faces.images.shape)
['Donald Rumsfeld' 'George W Bush' 'Gerhard Schroeder' 'Junichiro Koizumi' 'Tony Blair']
(964, 62, 47)
# 進行完上一步還可以看一下圖片長什么樣子
import matplotlib.pyplot as plt %matplotlib inline fig, ax = plt.subplots(3, 5) for i, axi in enumerate(ax.flat): axi.imshow(faces.images[i], cmap='bone') axi.set(xticks=[], yticks=[], xlabel=faces.target_names[faces.target[i]])
解決人臉識別(jupyter)
人臉識別是一個分類問題,因為機器學習中SVM(支持向量機)屬於王霸地位(深度學習不算),所以使用SVM對圖像進行訓練。
# SVC: support vector classifier, used here for the classification task
from sklearn.svm import SVC
# The images are very high-dimensional, so reduce the dimensionality first
from sklearn.decomposition import PCA
# Pipeline helper that chains the two steps into one estimator
from sklearn.pipeline import make_pipeline

pca = PCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)
和上一步一樣,加載數據的同時看看我們的數據
from sklearn.datasets import fetch_lfw_people faces = fetch_lfw_people(min_faces_per_person=60) print(faces.target_names) print(faces.images.shape) import matplotlib.pyplot as plt %matplotlib inline fig, ax = plt.subplots(3, 5) for i, axi in enumerate(ax.flat): axi.imshow(faces.images[i], cmap='bone') axi.set(xticks=[], yticks=[], xlabel=faces.target_names[faces.target[i]])
切分訓練集和測試集
from sklearn.model_selection import train_test_split

# Split the flattened images and their labels into training and test sets;
# the fixed random_state makes the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target,
                                                random_state=40)
# 進行訓練
from sklearn.model_selection import GridSearchCV param_grid = {'svc__C': [1, 5, 10], 'svc__gamma': [0.0001, 0.0005, 0.001]} grid = GridSearchCV(model, param_grid) %time grid.fit(Xtrain, ytrain)
GridSearchCV(cv='warn', error_score='raise-deprecating', estimator=Pipeline(memory=None, steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=150, random_state=42, svd_solver='auto', tol=0.0, whiten=True)), ('svc', SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto_deprecated', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False))], verbose=False), iid='warn', n_jobs=None, param_grid={'svc__C': [1, 5, 10], 'svc__gamma': [0.0001, 0.0005, 0.001]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring=None, verbose=0)
# Show the parameter combination that the grid search selected as best.
print(grid.best_params_)
{'svc__C': 10, 'svc__gamma': 0.0001}
查看測試集的測試結果
# Predict the test set with the best pipeline found by the grid search.
model = grid.best_estimator_
yfit = model.predict(Xtest)
yfit.shape

import matplotlib as mpl
# Use a font with Chinese glyphs so the Chinese title below renders properly
mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False

# Plot the first 24 test images in a 4x6 grid
fig, ax = plt.subplots(4, 6)
for i, axi in enumerate(ax.flat):
    # Reshape the flat pixel vector back into a 62x47 image
    axi.imshow(Xtest[i].reshape(62, 47), cmap='bone')
    axi.set(xticks=[], yticks=[])
    # Label with the last word of the predicted name; red marks a wrong prediction
    axi.set_ylabel(faces.target_names[yfit[i]].split()[-1],
                   color='black' if yfit[i] == ytest[i] else 'red')
fig.suptitle('預測錯誤的名字被紅色標注', size=14);
可以看到預測錯誤了四個,准確率欠佳,下面打印分類報告
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test set, labelled with the people's names.
print(classification_report(ytest, yfit, target_names=faces.target_names))
                   precision    recall  f1-score   support

  Donald Rumsfeld       0.75      0.87      0.81        31
    George W Bush       0.97      0.92      0.94       124
Gerhard Schroeder       0.80      0.83      0.81        29
Junichiro Koizumi       1.00      1.00      1.00        16
       Tony Blair       0.85      0.85      0.85        41

         accuracy                           0.90       241
        macro avg       0.87      0.89      0.88       241
     weighted avg       0.90      0.90      0.90       241
最后使用seaborn的heatmap打印混淆矩陣
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against predicted labels
mat = confusion_matrix(ytest, yfit)

# Note: the matrix is transposed before plotting, so the x axis carries the
# true label and the y axis the predicted label, matching the axis titles.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=faces.target_names,
            yticklabels=faces.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label');
對於SVM不懂的可以回頭看一下SVM原理