機器學習算法及代碼實現–支持向量機
1、支持向量機
SVM希望通過N-1維的分隔超平面線性分開N維的數據,距離分隔超平面最近的點被叫做支持向量,我們利用SMO(SVM實現方法之一)最大化支持向量到分隔面的距離,這樣當新樣本點進來時,其被分類正確的概率也就更大。我們計算樣本點到分隔超平面的函數間隔,如果函數間隔為正,則分類正確,函數間隔為負,則分類錯誤,函數間隔的絕對值除以||w||就是幾何間隔,幾何間隔始終為正,可以理解為樣本點到分隔超平面的幾何距離。若數據不是線性可分的,那我們引入核函數的概念,從某個特徵空間到另一個特徵空間的映射是通過核函數來實現的,我們利用核函數將數據從低維空間映射到高維空間,低維空間的非線性問題在高維空間往往會成為線性問題,再利用N-1維分隔超平面對數據分類。
2、分類
線性可分、線性不可分
3、超平面公式(先考慮線性可分)
W*X+b=0
其中W={w1,w2,...,wn},為權重向量
下面用簡單的二維向量講解(思維導圖)
4、尋找超平面
5、例子
6、線性不可分
映射到高維
算法思路(思維導圖)
核函數舉例
代碼
# -*- coding: utf-8 -*-
"""Minimal linear SVM example: fit three hand-written 2-D points and predict."""
from sklearn import svm

# Training samples: one 2-D feature vector per row.
x = [[2, 0], [1, 1], [2, 3]]
# Class label of each training sample.
y = [0, 0, 1]

# SVM classifier for linearly separable data, using a linear kernel.
clf = svm.SVC(kernel='linear')
# Train the classifier.
clf.fit(x, y)
print(clf)

# The support vectors themselves.
print(clf.support_vectors_)
# Indices of the support vectors within the training data.
print(clf.support_)
# Number of support vectors for each class.
print(clf.n_support_)

# Predict. predict() expects a 2-D array of shape (n_samples, n_features),
# so a single sample has to be wrapped in an outer list.
print(clf.predict([[2, 0]]))
# -*- coding: utf-8 -*-
"""Fit a linear SVM on 2-D toy data and plot the separating line and margins."""
from __future__ import print_function

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm

# Fixed seed so the random data is identical on every run.
np.random.seed(0)

# Two groups of 20 two-dimensional random points each (40 rows, 2 columns):
# the first group is shifted by -[2, 2], the second by +[2, 2].
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
# Labels: 20 zeros followed by 20 ones.
Y = [0] * 20 + [1] * 20

# Train.
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# coef_ holds the vectors of the separating hyperplanes for linear models;
# for this binary problem coef_[0] is the vector orthogonal to the hyperplane,
# which together with the intercept fully defines it.
w = clf.coef_[0]

# In 2-D the hyperplane w[0]*x + w[1]*y + intercept = 0 is a line
# y = a*x + c with slope a = -w[0] / w[1] and c = -intercept / w[1].
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0] / w[1])

# Point-slope form: lines parallel to the separating line that pass through
# the first and the last support vector. These are the two dashed lines.
# NOTE(review): assumes the first and last support vectors lie on opposite
# sides of the separating line -- confirm against scikit-learn's ordering.
b = clf.support_vectors_[0]
yy_down = a * xx + (b[1] - a * b[0])
b = clf.support_vectors_[-1]
yy_up = a * xx + (b[1] - a * b[0])

print("w: ", w)
print("a: ", a)
print("support_vectors_: ", clf.support_vectors_)
print("clf.coef_: ", clf.coef_)

# Plot the line, the points, and the nearest vectors to the plane.
plt.plot(xx, yy, 'k-')
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')

# Hollow markers around the support vectors. The edge colour is set
# explicitly so the rings do not depend on matplotlib's default edge colour.
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=80, facecolors='none', edgecolors='k')
plt.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

plt.axis('tight')
plt.show()
# -*- coding: utf-8 -*-
"""Face recognition on the LFW people dataset using eigenfaces (PCA) and an SVM."""
from __future__ import print_function

from time import time
import logging

import matplotlib.pyplot as plt

# train_test_split and GridSearchCV live in sklearn.model_selection; the old
# sklearn.cross_validation and sklearn.grid_search modules have been removed.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# RandomizedPCA has been removed; PCA(svd_solver='randomized') replaces it.
from sklearn.decomposition import PCA
from sklearn.svm import SVC

print(__doc__)

# Display progress logs on stdout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

###############################################################################
# Download the face dataset (if necessary) and load it.
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# Number of images and their height / width.
n_samples, h, w = lfw_people.images.shape

# X is the feature matrix; its column count is the feature dimensionality.
X = lfw_people.data
n_features = X.shape[1]

# y is the vector of class labels.
y = lfw_people.target
# Names of the people behind the class labels.
target_names = lfw_people.target_names
# Number of names, i.e. how many people have to be told apart.
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)

###############################################################################
# Split into a training set and a test set; the test set takes 25%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25)

###############################################################################
# Dimensionality reduction with PCA.
n_components = 150  # number of components to keep

print("Extracting the top %d eigenfaces from %d faces"
      % (n_components, X_train.shape[0]))
t0 = time()
# Build the PCA model on the training set.
pca = PCA(n_components=n_components, svd_solver='randomized',
          whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

# Extract the eigenfaces: each component reshaped back to image size.
eigenfaces = pca.components_.reshape((n_components, h, w))

print("Projecting the input data on the eigenfaces orthonormal basis")
t0 = time()
# Project the feature vectors onto the lower-dimensional PCA basis.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

###############################################################################
# Train an SVM classification model.
print("Fitting the classifier to the training set")
t0 = time()
# C: penalty weight for errors; gamma: RBF kernel coefficient.
# Several values of each are tried.
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# RBF-kernel SVC wrapped in a grid search that picks the best parameters.
# class_weight='balanced' replaces the former 'auto', which is no longer
# accepted by scikit-learn.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
# Train.
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)  # best parameter combination

###############################################################################
# Quantitative evaluation of the model quality on the test set.
print("Predicting people's names on the test set")
t0 = time()
# Predict.
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

# Compare the predictions with the ground truth.
print(classification_report(y_test, y_pred, target_names=target_names))
# Entries on the diagonal are correct predictions (a predicted as a).
print(confusion_matrix(y_test, y_pred, labels=list(range(n_classes))))


###############################################################################
# Qualitative evaluation of the predictions using matplotlib.
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Plot a gallery of portraits on an n_row x n_col grid.

    images -- sequence of images; each is reshaped to (h, w) before display
    titles -- sequence of titles, one per image
    h, w   -- height and width each image is reshaped to
    n_row, n_col -- grid dimensions; at most n_row * n_col images are drawn
    """
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # Stop at the shorter input so a gallery with fewer images than grid
    # cells leaves the remaining cells empty instead of raising IndexError.
    n_plots = min(n_row * n_col, len(images), len(titles))
    for i in range(n_plots):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())


# Plot the result of the prediction on a portion of the test set.
def title(y_pred, y_test, target_names, i):
    """Return the title for test sample i: predicted and true last name."""
    # Keep only the text after the last space of each full name.
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (pred_name, true_name)


prediction_titles = [title(y_pred, y_test, target_names, i)
                     for i in range(y_pred.shape[0])]

# Draw the test images together with their titles.
plot_gallery(X_test, prediction_titles, h, w)

# Plot the gallery of the most significant eigenfaces.
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)  # draw the eigenfaces

plt.show()  # display the figures