GMM實戰

本文轉載自查看原文 2018-08-17 14:05 1458 數據科學

一道作業題：

https://www.kaggle.com/c/speechlab-aug03

就是給你訓練集，驗證集，要求用GMM（混合高斯模型）預測測試集的分類，這是個2分類的問題。

$ head train.txt dev.txt test.txt
==> train.txt <==
1.124586 1.491173 2
2.982154 0.275734 1
-0.367243 0.068235 2
1.216709 -0.804729 1
3.077832 0.307613 1
1.165453 1.627965 2
0.737648 1.055470 2
0.838988 1.355942 2
1.452510 -0.056967 2
-0.570093 -0.355338 2

==> dev.txt <==
0.900742 1.933559 2
3.112402 -0.817857 1
0.989450 1.605954 2
0.759107 -1.214189 1
1.433045 -0.986053 1
1.072825 1.654026 2
2.174214 0.638420 2
1.135233 -1.055797 1
-0.328072 -0.091407 2
1.220020 1.116177 2

==> test.txt <==
0.916983 0.353964
1.921382 1.958336
1.822650 2.328900
-0.786640 -0.059369
1.018302 1.406017
0.660574 0.847398
2.747331 0.910621
0.662462 1.935314
2.955916 -0.317031
-0.213735 0.126742

View Code

這里我已經把圖畫出來了，因為特征是2維的，所以可以用平面上的點來表示，不同的顏色代表不同的分類。

這個題的思路就是：用兩個GMM來做，每個GMM有4個組分，最后要看屬於哪一類，就看兩個GMM模型的概率，誰高就屬於哪一類。

要對兩個GMM模型做初始化，在這里用Kmeans來做初始化，比隨機初始化要好。

1.Kmeans初始化：

# from numpy import *
import numpy as np
import matplotlib.pyplot as plt
import os

use_new_data=0
#-------------------new-----------------
if use_new_data:
    colour=['lightblue','sandybrown']
    _path='/home/dahu/myfile/tianchi/kaggle/gmm_em/my_test/new'
    train_file=os.path.join(_path,'train.txt1')
    dev_file=os.path.join(_path,'dev.txt1')
    test_file=os.path.join(_path,'test.txt1')

#-------------------old-----------------
else :
    _path='/home/dahu/myfile/tianchi/kaggle/gmm_em/my_test/old'
    train_file=os.path.join(_path,'train.txt')
    dev_file=os.path.join(_path,'dev.txt')
    test_file=os.path.join(_path,'test.txt')

feature_num=2
n_classes = 4

plt.rc('figure', figsize=(10, 6))
def loadDataSet(fileName):  # 解析文件，按tab分割字段，得到一個浮點數字類型的矩陣
    dataMat = []              # 文件的最后一個字段是類別標簽
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split(' ')
#         fltLine = map(float, curLine)    # 將每個元素轉成float類型
        fltLine=[float(i) for i in curLine]
        dataMat.append(fltLine)
    return dataMat

# 計算歐幾里得距離
def distEclud(vecA, vecB):
    return np.sqrt(np.sum(np.power(vecA - vecB, 2))) # 求兩個向量之間的距離

# 構建聚簇中心，取k個(此例中k=4)隨機質心
def randCent(dataSet, k):
    np.random.seed(1)
    n = np.shape(dataSet)[1]
    centroids = np.mat(np.zeros((k,n)))   # 每個質心有n個坐標值，總共要k個質心
    for j in range(n):
        minJ = min(dataSet[:,j])
        maxJ = max(dataSet[:,j])
        rangeJ = float(maxJ - minJ)
        centroids[:,j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids

# k-means 聚類算法
def kMeans(dataSet, k, distMeans =distEclud, createCent = randCent):
    '''
    :param dataSet:  沒有lable的數據集  (本例中是二維數據)
    :param k:  分為幾個簇
    :param distMeans:    計算距離的函數
    :param createCent:   獲取k個隨機質心的函數
    :return: centroids： 最終確定的 k個 質心
            clusterAssment:  該樣本屬於哪類  及  到該類質心距離
    '''
    m = np.shape(dataSet)[0]   #m=80,樣本數量
    clusterAssment = np.mat(np.zeros((m,2)))
    # clusterAssment第一列存放該數據所屬的中心點，第二列是該數據到中心點的距離，
    centroids = createCent(dataSet, k)
    clusterChanged = True   # 用來判斷聚類是否已經收斂
    while clusterChanged:
        clusterChanged = False;
        for i in range(m):  # 把每一個數據點划分到離它最近的中心點
            minDist = np.inf; minIndex = -1;
            for j in range(k):
                distJI = distMeans(centroids[j,:], dataSet[i,:])
                if distJI < minDist:
                    minDist = distJI; minIndex = j  # 如果第i個數據點到第j個中心點更近，則將i歸屬為j
            if clusterAssment[i,0] != minIndex:
                clusterChanged = True  # 如果分配發生變化，則需要繼續迭代
            clusterAssment[i,:] = minIndex,minDist**2   # 並將第i個數據點的分配情況存入字典
        # print centroids
        for cent in range(k):   # 重新計算中心點
#             ptsInClust = dataSet[clusterAssment.A[:,0]==cent]
            ptsInClust = dataSet[np.nonzero(clusterAssment[:,0].A == cent)[0]]
            centroids[cent,:] = np.mean(ptsInClust, axis = 0)  # 算出這些數據的中心點
    return centroids, clusterAssment
# --------------------測試----------------------------------------------------
# 用測試數據及測試kmeans算法

data=np.mat(loadDataSet(train_file))
# data=np.mat(loadDataSet('/home/dahu/myfile/my_git/pytorch_learning/pytorch_lianxi/gmm_em.lastyear/train.txt'))
# print(data)
kmeans=[]
colors = ['lightblue','sandybrown']

for i in [1,2]:
    datMat = data[np.nonzero(data[:,feature_num].A == i)[0]]  #選取某一標注的所有樣例
#     datMat = data[data.A[:,2]==i]
#     print(datMat)
    myCentroids,clustAssing = kMeans(datMat,n_classes)
    print('Kmeans 中心點坐標展示')
    print(myCentroids)
    x=np.array(datMat[:,0]).ravel()
    y=np.array(datMat[:,1]).ravel()
    plt.scatter(x,y, marker='o',color=colors[i-1],label=i)
    xcent=np.array(myCentroids[:,0]).ravel()
    ycent=np.array(myCentroids[:,1]).ravel()
    plt.scatter(xcent, ycent, marker='x', color='r', s=50)
#     print(myCentroids[:,:2])
    kmeans.append(myCentroids[:,:feature_num])
plt.legend(scatterpoints=1, loc='lower right', prop=dict(size=12))
plt.title('kmeans get center')
plt.show()

View Code

kmeans的方法之前的博客已經說了，方法是一樣的，在這里，已經把分類和各組分的中心點已經標出了。

kmeans找到的中心點，我們准備用來作為GMM的初始化。

2.GMM訓練

f_train=open(train_file,'r')
f_dev=open(dev_file,'r')

x_train=[]
Y_train=[]
x_test=[]
Y_test=[]
for line in f_train:
    a=line.split(' ')
    x_train.append([float(i) for i in a[:feature_num]])
    Y_train.append(int(a[-1]))
X_train=np.array(x_train)
y_train=np.array(Y_train)

for line in f_dev:
    a=line.split(' ')
    x_test.append([float(i) for i in a[:feature_num]])
    Y_test.append(int(a[-1]))
X_test=np.array(x_test)
y_test=np.array(Y_test)

c=[X_train,y_train,X_test,y_test]
# print(X_train[:5],y_train[:5],'\n\n',X_test[:5],y_test[:5],'\n',type(X_train))
# print(X_train[y_train==1])
x1=X_train[y_train==1]
x2=X_train[y_train==2]
xt1=X_test[y_test==1]
xt2=X_test[y_test==2]

print(x1[:5],x1.shape)
# for i in c:
#     print(i.shape)

# --------------------------GMM----------------------------------------------------
# 在這里准備開始搞GMM了，這里用了2個GMM模型

estimator1 = GaussianMixture(n_components=n_classes,
                   covariance_type='full', max_iter=200, random_state=0,tol=1e-5)
estimator2 = GaussianMixture(n_components=n_classes,
                   covariance_type='full', max_iter=200, random_state=0,tol=1e-5)
# estimator.means_init = np.array([X_train[y_train == i+1].mean(axis=0)
#                                     for i in range(n_classes)])

estimator1.means_init = np.array(kmeans[0])    #在這里初始化的，這個值就是我們之前kmeans得到的
estimator2.means_init = np.array(kmeans[1])

# estimator.fit(X_train)
estimator1.fit(x1)
estimator2.fit(x2)

x1_p= np.exp(estimator1.score_samples(X_train))
x2_p= np.exp(estimator2.score_samples(X_train))

# 寫了兩個函數，一個是預測分類的，其實就是根據 哪個GMM模型的得分高，就是哪一類 這樣來分類的， 另一個是根據分類結果，和標注對比，算一個准確率 def predict(x1test_p,x2test_p):      
    res=[]
    for i in range(x1test_p.shape[0]):
        if x1test_p[i]>x2test_p[i]:
            res.append(1)
        else:
            res.append(2)
    res=np.array(res)
    return res

def calculate_accuracy(x1test_p,x2test_p,y_test):
    res=predict(x1test_p,x2test_p)
    test_accuracy = np.mean(res.ravel() == y_test.ravel()) * 100
    return test_accuracy

print('開發集train准確率',calculate_accuracy(x1_p,x2_p,y_train))

print('-'*60)

x1test_p=np.exp(estimator1.score_samples(X_test))
x2test_p=np.exp(estimator2.score_samples(X_test))
# print(x1test_p[:5],x1test_p.shape)
# print(x2test_p[:5],x2test_p.shape)
# print(y_test[:5],y_test.shape)

print('驗證集dev准確率',calculate_accuracy(x1test_p,x2test_p,y_test))

[[ 2.982154  0.275734]
 [ 1.216709 -0.804729]
 [ 3.077832  0.307613]
 [ 0.710813  0.241071]
 [ 0.599696  0.490842]] (2400, 2)
開發集train准確率 98.375
------------------------------------------------------------
驗證集dev准確率 97.875

當然這里算測試集的分類，我沒有貼上去啊，最后提交的時候，准確率是98% ，當然前面還有得分更高的。。。

最后

經數學帝的提醒，對於確定是哪一個分類這一塊的描述不夠准確，重新描述一下：

a.我們求的2個GMM的概率其實是 p(x|y=1) ,p(x|y=2) ，在給定分類情況下，x的概率

b.而我們需要的是p(y=1|x) ,p(y=2|x) ，已經知道是x了，求是哪個分類的概率。

兩個不是同一個東西，需要轉化一下： p(y=1|x) = p(x|y=1)*p(y=1)/p(x) , p(y=2|x) = p(x|y=2)*p(y=2)/p(x)

分母p(x)是一樣的，可以約掉，分子上還有一個p(y=1) 和 p(y=2) 這是先驗概率,在題目中，分類1和分類2的數量是一樣多的，所以我們求 a ，能反應出我們求b ，如果題目中不一樣，還要把它考慮進去。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 高斯混合模型(GMM) GMM算法的matlab程序 GMM的EM算法實現 GMM及EM算法 [綜]前景檢測GMM SparkMLlib-----GMM算法 sklearn GMM模型介紹 kaldi GMM模型解碼指令 gmm-latgen-faster詳解前景檢測算法_3(GMM) MATLAB中“fitgmdist”的用法及其GMM聚類算法