For the underlying theory, watch the excellent video: the Machine Learning course lecture on Expectation Maximisation.
Expectation-maximization is a well-founded statistical algorithm to get around this problem by an iterative process.
- First one assumes random components (randomly centered on data points, learned from k-means, or even just normally distributed around the origin) and computes for each point a probability of being generated by each component of the model.
- Then, one tweaks the parameters to maximize the likelihood of the data given those assignments. Repeating this process is guaranteed to always converge to a local optimum.
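To make those two steps concrete, here is a minimal NumPy sketch of the EM loop for a one-dimensional, two-component mixture. All names here are illustrative and not part of scikit-learn's API; the data is a toy sample, not the iris data used below.

import numpy as np

# Toy 1-D data drawn from two Gaussians (illustrative only).
rng = np.random.RandomState(0)
x = np.concatenate([rng.normal(-2, 1, 200), rng.normal(3, 0.5, 200)])

# Random initial parameters: weights, means, variances of K = 2 components.
K = 2
w = np.full(K, 1.0 / K)
mu = rng.choice(x, K)
var = np.full(K, x.var())

for _ in range(50):
    # E-step: responsibility of each component for each point.
    dens = np.exp(-(x[:, None] - mu) ** 2 / (2 * var)) / np.sqrt(2 * np.pi * var)
    resp = w * dens
    resp /= resp.sum(axis=1, keepdims=True)

    # M-step: re-estimate parameters to maximise the expected log-likelihood.
    Nk = resp.sum(axis=0)
    w = Nk / len(x)
    mu = (resp * x[:, None]).sum(axis=0) / Nk
    var = (resp * (x[:, None] - mu) ** 2).sum(axis=0) / Nk

print(mu, np.sqrt(var))  # roughly the true means (-2, 3) and stds (1, 0.5), up to component ordering

In practice one also monitors the log-likelihood and stops when it no longer improves, which is what the n_iter and tol parameters of the old mixture.GMM control.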
Hands-on example:
X_train
Out[79]:
array([[ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       ...,
       [ 6.5,  3. ,  5.2,  2. ],
       [ 6.2,  3.4,  5.4,  2.3],
       [ 5.9,  3. ,  5.1,  1.8]])

X_train.size
Out[80]: 444

classifier.means_
Out[81]:
array([[ 5.04594595,  3.45135126,  1.46486501,  0.25675684],   # 1st 4-d Gaussian
       [ 5.92023012,  2.75827264,  4.42168189,  1.43882194],   # 2nd 4-d Gaussian
       [ 6.8519452 ,  3.09214071,  5.71373857,  2.0934678 ]])  # 3rd 4-d Gaussian

classifier.covars_
Out[82]:
array([[ 0.08532076,  0.08532076,  0.08532076,  0.08532076],
       [ 0.14443088,  0.14443088,  0.14443088,  0.14443088],
       [ 0.1758563 ,  0.1758563 ,  0.1758563 ,  0.1758563 ]])
The data has four features, so how can it be drawn on a 2-D plot? Only the first two dimensions are used in the figure.
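For example, assuming classifier is the fitted model from the console session above, the 4-D component centres can be projected onto the first two iris features (sepal length vs. sepal width) with a couple of lines (a minimal sketch):

import matplotlib.pyplot as plt

# Each row of classifier.means_ is a 4-D centre; keep only the first two columns.
plt.scatter(classifier.means_[:, 0], classifier.means_[:, 1], marker='+', s=100, c='k')
plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.show()

The complete classification example from scikit-learn follows.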
""" ================== GMM classification ================== Demonstration of Gaussian mixture models for classification. See :ref:`gmm` for more information on the estimator. Plots predicted labels on both training and held out test data using a variety of GMM classifiers on the iris dataset. Compares GMMs with spherical, diagonal, full, and tied covariance matrices in increasing order of performance. Although one would expect full covariance to perform best in general, it is prone to overfitting on small datasets and does not generalize well to held out test data. On the plots, train data is shown as dots, while test data is shown as crosses. The iris dataset is four-dimensional. Only the first two dimensions are shown here, and thus some points are separated in other dimensions. """ print(__doc__) # Author: Ron Weiss <ronweiss@gmail.com>, Gael Varoquaux # License: BSD 3 clause
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np

from sklearn import datasets
from sklearn.cross_validation import StratifiedKFold
from sklearn.externals.six.moves import xrange
from sklearn.mixture import GMM


def make_ellipses(gmm, ax):
    for n, color in enumerate('rgb'):
        v, w = np.linalg.eigh(gmm._get_covars()[n][:2, :2])
        u = w[0] / np.linalg.norm(w[0])
        angle = np.arctan2(u[1], u[0])
        angle = 180 * angle / np.pi  # convert to degrees
        v *= 9
        ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1],
                                  180 + angle, color=color)
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)
iris = datasets.load_iris()
# Data preprocessing
# Break up the dataset into non-overlapping training (75%) and testing
# (25%) sets.
# Stratified cross-validation: each fold preserves the class proportions of the original sample.
skf = StratifiedKFold(iris.target, n_folds=4)
# Only take the first fold.
train_index, test_index = next(iter(skf))  # next(iter(skf)) yields skf's folds one at a time; len(skf) = 4
# This picks one of the four folds (here, the first one).
X_train = iris.data[train_index]
y_train = iris.target[train_index]
X_test = iris.data[test_index]
y_test = iris.target[test_index]
# GMM initialisation
n_classes = len(np.unique(y_train))
# y_train takes only three distinct values, so the model has three Gaussians.
# Try GMMs using different types of covariances.
# Build a GMM for each of the four covariance types and keep them in a dict.
classifiers = dict((covar_type,
                    GMM(n_components=n_classes, covariance_type=covar_type,
                        init_params='wc', n_iter=20))
                   for covar_type in ['spherical', 'diag', 'tied', 'full'])
# NB: the effect of covar_type mostly shows up as the rotation of the fitted Gaussian ellipses.

n_classifiers = len(classifiers)

plt.figure(figsize=(3 * n_classifiers / 2, 6))
plt.subplots_adjust(bottom=.01, top=0.95, hspace=.15, wspace=.05,
                    left=.01, right=.99)

for index, (name, classifier) in enumerate(classifiers.items()):
"""
dict_items([('diag', GMM(covariance_type='diag', init_params='wc', min_covar=0.001, n_components=5, n_init=1, n_iter=20, params='wmc', random_state=None, tol=0.001, verbose=0)),
('tied', GMM(covariance_type='tied', init_params='wc', min_covar=0.001, n_components=5, n_init=1, n_iter=20, params='wmc', random_state=None, tol=0.001, verbose=0)),
('full', GMM(covariance_type='full', init_params='wc', min_covar=0.001, n_components=5, n_init=1, n_iter=20, params='wmc', random_state=None, tol=0.001, verbose=0)),
('spherical', GMM(covariance_type='spherical', init_params='wc', min_covar=0.001, n_components=5, n_init=1, n_iter=20, params='wmc', random_state=None, tol=0.001, verbose=0))])
"""
    # Training
    # Since we have class labels for the training data, we can
    # initialize the GMM parameters in a supervised manner.
    classifier.means_ = np.array([X_train[y_train == i].mean(axis=0)
                                  for i in xrange(n_classes)])
    # axis=0 computes the statistic down the rows: the mean of every
    # vector's 1st element, then of the 2nd element, and so on.

    # Train the other parameters using the EM algorithm.
    classifier.fit(X_train)
    # Plotting
    h = plt.subplot(2, n_classifiers / 2, index + 1)
    make_ellipses(classifier, h)

    for n, color in enumerate('rgb'):
        data = iris.data[iris.target == n]
        plt.scatter(data[:, 0], data[:, 1], 0.8, color=color,
                    label=iris.target_names[n])
    # Plot the test data with crosses
    for n, color in enumerate('rgb'):
        data = X_test[y_test == n]
        plt.plot(data[:, 0], data[:, 1], 'x', color=color)

    y_train_pred = classifier.predict(X_train)
    train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
    plt.text(0.05, 0.9, 'Train accuracy: %.1f' % train_accuracy,
             transform=h.transAxes)

    y_test_pred = classifier.predict(X_test)
    test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
    plt.text(0.05, 0.8, 'Test accuracy: %.1f' % test_accuracy,
             transform=h.transAxes)

    plt.xticks(())
    plt.yticks(())
    plt.title(name)

plt.legend(loc='lower right', prop=dict(size=12))

plt.show()
New API: mixture.GMM
""" ============================================= Density Estimation for a mixture of Gaussians ============================================= Plot the density estimation of a mixture of two Gaussians. Data is generated from two Gaussians with different centers and covariance matrices. """
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from sklearn import mixture

n_samples = 300

# generate random sample, two components
np.random.seed(0)

# generate spherical data centered on (20, 20)
shifted_gaussian = np.random.randn(n_samples, 2) + np.array([20, 20])

# generate zero centered stretched Gaussian data
C = np.array([[0., -0.7], [3.5, .7]])
stretched_gaussian = np.dot(np.random.randn(n_samples, 2), C)

# concatenate the two datasets into the final training set
X_train = np.vstack([shifted_gaussian, stretched_gaussian])

# fit a Gaussian Mixture Model with two components
clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit(X_train)

# display predicted scores by the model as a contour plot
x = np.linspace(-20.0, 30.0)
y = np.linspace(-20.0, 40.0)
X, Y = np.meshgrid(x, y)
XX = np.array([X.ravel(), Y.ravel()]).T
Z = -clf.score_samples(XX)[0]
Z = Z.reshape(X.shape)

CS = plt.contour(X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0),
                 levels=np.logspace(0, 3, 10))
CB = plt.colorbar(CS, shrink=0.8, extend='both')
plt.scatter(X_train[:, 0], X_train[:, 1], .8)

plt.title('Negative log-likelihood predicted by a GMM')
plt.axis('tight')
plt.show()
How do we decide how many Gaussians the model should contain? See "Selecting the number of components in a classical GMM":
The BIC criterion can be used to select the number of components in a GMM in an efficient way. In theory, it recovers the true number of components only in the asymptotic regime (i.e. if much data is available).
Note that using a DPGMM avoids the specification of the number of components for a Gaussian mixture model.
(NOTE: DPGMM will be covered in the Dirichlet Process chapter.)
Which model is better? The following criteria are commonly used:
AIC = -2 ln(L) + 2k                (Akaike information criterion)
BIC = -2 ln(L) + k ln(n)           (Bayesian information criterion)
HQ  = -2 ln(L) + 2k ln(ln(n))      (Hannan-Quinn criterion)
Here L is the maximised likelihood under the model, n is the number of data points, and k is the number of free parameters in the model.
Note that these criteria only describe the information lost by a given model relative to the "true model" (we never know what the true model looks like, so every trained model is merely an approximation of it). They therefore say nothing about a model's absolute accuracy: given three models A, B and C, the criteria may tell us that B is the best of the three, but they cannot guarantee that B describes the data well, because all three models may be very poor and B is simply the least rotten apple in the basket.
These criteria are theoretically elegant, but applying them to model selection in practice can be awkward. As mentioned above, 5 variables already give 32 variable combinations; with 10 variables there are 2^10, and evaluating AIC, BIC and HQ on every one of those candidate models one by one is far too much work.
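As a rough illustration of the formulas (my own helper, not code from the scikit-learn example below), the three criteria can be computed by hand for a fitted full-covariance GMM, assuming the old mixture.GMM API in which score() returns per-sample log-likelihoods; the parameter count k is for the full-covariance case:

import numpy as np
from sklearn import mixture

def information_criteria(gmm, X):
    """AIC, BIC and HQ for a fitted full-covariance GMM (sketch, old API)."""
    log_l = gmm.score(X).sum()          # ln(L): total log-likelihood of the data
    n, d = X.shape
    k_comp = gmm.n_components
    # free parameters: means + full covariances + mixing weights (which sum to 1)
    k = k_comp * d + k_comp * d * (d + 1) // 2 + (k_comp - 1)
    aic = -2 * log_l + 2 * k
    bic = -2 * log_l + k * np.log(n)
    hq = -2 * log_l + 2 * k * np.log(np.log(n))
    return aic, bic, hq

# usage on toy data: compare component counts and keep the lowest criterion value
X = np.random.randn(500, 2)
for n_components in range(1, 5):
    g = mixture.GMM(n_components=n_components, covariance_type='full').fit(X)
    print(n_components, information_criteria(g, X))

In practice one simply keeps the model with the lowest value of the chosen criterion, which is what the BIC loop in the following example does via gmm.bic(X).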
""" ================================= Gaussian Mixture Model Selection ================================= This example shows that model selection can be performed with Gaussian Mixture Models using information-theoretic criteria (BIC). Model selection concerns both the covariance type and the number of components in the model. In that case, AIC also provides the right result (not shown to save time), but BIC is better suited if the problem is to identify the right model. Unlike Bayesian procedures, such inferences are prior-free. In that case, the model with 2 components and full covariance (which corresponds to the true generative model) is selected. """
print(__doc__)

import itertools

import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn import mixture

# Number of samples per component
n_samples = 500
# Generate random sample, two components
np.random.seed(0)
C = np.array([[0., -0.1], [1.7, .4]])
X = np.r_[np.dot(np.random.randn(n_samples, 2), C),
          .7 * np.random.randn(n_samples, 2) + np.array([-6, 3])]

lowest_bic = np.infty
bic = []
n_components_range = range(1, 7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = mixture.GMM(n_components=n_components,
                          covariance_type=cv_type)
        gmm.fit(X)
        bic.append(gmm.bic(X))
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm
# No separate test set is needed here: BIC already penalises model complexity.
bic = np.array(bic)
color_iter = itertools.cycle(['k', 'r', 'g', 'b', 'c', 'm', 'y'])
clf = best_gmm
bars = []

# Plot the BIC scores
spl = plt.subplot(2, 1, 1)
for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
    xpos = np.array(n_components_range) + .2 * (i - 2)
    bars.append(plt.bar(xpos, bic[i * len(n_components_range):
                                  (i + 1) * len(n_components_range)],
                        width=.2, color=color))
plt.xticks(n_components_range)
plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
plt.title('BIC score per model')
xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\
    .2 * np.floor(bic.argmin() / len(n_components_range))
plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
spl.set_xlabel('Number of components')
spl.legend([b[0] for b in bars], cv_types)

# Plot the winner
splot = plt.subplot(2, 1, 2)
Y_ = clf.predict(X)
for i, (mean, covar, color) in enumerate(zip(clf.means_, clf.covars_,
                                             color_iter)):
    v, w = linalg.eigh(covar)
    if not np.any(Y_ == i):
        continue
    plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)

    # Plot an ellipse to show the Gaussian component
    angle = np.arctan2(w[0][1], w[0][0])
    angle = 180 * angle / np.pi  # convert to degrees
    v *= 4
    ell = mpl.patches.Ellipse(mean, v[0], v[1], 180 + angle, color=color)
    ell.set_clip_box(splot.bbox)
    ell.set_alpha(.5)
    splot.add_artist(ell)

plt.xlim(-10, 10)
plt.ylim(-3, 6)
plt.xticks(())
plt.yticks(())
plt.title('Selected GMM: full model, 2 components')
plt.subplots_adjust(hspace=.35, bottom=.02)
plt.show()