Cluster analysis is a static data-analysis method widely used in machine learning, pattern recognition, data mining, and related fields. Clustering is usually regarded as an unsupervised learning method: without knowing the class of any sample, it estimates which group each sample belongs to by computing the distances between samples (Euclidean distance, Mahalanobis distance, Hamming distance, cosine distance, and so on). Structurally, clustering methods split into top-down (divisive) and bottom-up (agglomerative) approaches. The former starts by treating all samples as one cluster and repeatedly splits off smaller clusters until no further split is possible; the latter does the opposite, starting with every sample as its own cluster and repeatedly merging pairs until only a few large clusters remain.
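For reference, the distance measures mentioned above are easy to compute with OpenCV. The snippet below is a minimal sketch (the vectors and variable names are my own, purely illustrative) showing Euclidean and cosine distance between two feature vectors:

#include "opencv2/core/core.hpp"
#include <iostream>

int main()
{
    // two toy feature vectors, values are illustrative only
    cv::Mat a = (cv::Mat_<float>(1,3) << 1.f, 2.f, 3.f);
    cv::Mat b = (cv::Mat_<float>(1,3) << 2.f, 0.f, 4.f);

    // Euclidean distance between a and b
    double euclidean = cv::norm(a, b, cv::NORM_L2);

    // cosine distance = 1 - cosine similarity
    double cosineDist = 1.0 - a.dot(b) /
        (cv::norm(a, cv::NORM_L2) * cv::norm(b, cv::NORM_L2));

    std::cout << "euclidean = " << euclidean
              << ", cosine = " << cosineDist << std::endl;
    return 0;
}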
The commonly used clustering methods fall into the following four families: // lifted straight from the Wikipedia taxonomy, being lazy...
Connectivity-based clustering (e.g. hierarchical clustering)
Centroid-based clustering (e.g. kmeans)
Distribution-based clustering
Density-based clustering
Kmeans is a centroid-based clustering method. Its advantages are that it is simple and fast; its drawbacks are that the result depends on the choice of initial centers, and that the number of clusters must be specified in advance. The second drawback can be fatal: sometimes we simply do not know how many clusters the data should form, and in that case kmeans is not a good fit; hierarchical clustering or mean shift is recommended instead. The first drawback can be mitigated by running the clustering several times and keeping the best result, as sketched below.
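In OpenCV's C++ API this multi-restart strategy is built in: the attempts parameter of cv::kmeans reruns the algorithm with different initial centers and keeps the labeling with the best compactness. A minimal sketch (the random data here is purely illustrative):

#include "opencv2/core/core.hpp"
#include <iostream>

int main()
{
    // 100 random 2-D samples, purely illustrative
    cv::Mat samples(100, 2, CV_32F);
    cv::randu(samples, cv::Scalar::all(0), cv::Scalar::all(255));

    cv::Mat labels, centers;
    int k = 3;

    // attempts = 5: cv::kmeans reruns the algorithm with 5 different random
    // initializations and keeps the labeling with the lowest compactness
    // (sum of squared distances to the assigned centers), which it returns.
    double compactness = cv::kmeans(samples, k, labels,
        cv::TermCriteria(cv::TermCriteria::EPS + cv::TermCriteria::COUNT, 10, 1.0),
        5, cv::KMEANS_RANDOM_CENTERS, centers);

    std::cout << "best compactness = " << compactness << std::endl;
    return 0;
}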
The kmeans procedure can be summarized roughly as follows (a minimal from-scratch sketch is given after the list):
Randomly pick k cluster centers (the final number of non-empty clusters is <= k).
Compute the distance from every sample to each center.
Assign each sample to its nearest center.
Recompute the center of each new cluster.
Repeat the steps above until convergence (usually when the centers no longer move, or a maximum number of iterations is reached).
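To make the loop concrete, here is a minimal from-scratch sketch of the steps above for 2-D points, using squared Euclidean distance and a fixed iteration count. All names are my own and this is not the OpenCV API; it assumes pts.size() >= k and simply takes the first k points as initial centers.

#include <vector>
#include <cstddef>

struct Pt { float x, y; };

std::vector<int> simpleKmeans(const std::vector<Pt>& pts, int k, int maxIter)
{
    std::vector<Pt> centers(pts.begin(), pts.begin() + k);   // step 1: choose k initial centers
    std::vector<int> label(pts.size(), 0);

    for (int it = 0; it < maxIter; ++it)                     // step 5: repeat until done
    {
        // steps 2-3: assign every sample to the nearest center
        for (size_t i = 0; i < pts.size(); ++i)
        {
            float best = 1e30f;
            for (int c = 0; c < k; ++c)
            {
                float dx = pts[i].x - centers[c].x;
                float dy = pts[i].y - centers[c].y;
                float d  = dx*dx + dy*dy;                    // squared Euclidean distance
                if (d < best) { best = d; label[i] = c; }
            }
        }
        // step 4: recompute each center as the mean of its members
        std::vector<Pt>  sum(k);
        std::vector<int> cnt(k, 0);
        for (size_t i = 0; i < pts.size(); ++i)
        {
            sum[label[i]].x += pts[i].x;
            sum[label[i]].y += pts[i].y;
            ++cnt[label[i]];
        }
        for (int c = 0; c < k; ++c)
            if (cnt[c] > 0)
            {
                centers[c].x = sum[c].x / cnt[c];
                centers[c].y = sum[c].y / cnt[c];
            }
    }
    return label;
}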
An OpenCV 1.0 (C API) example: randomly generated scatter points are clustered, where each point is a 2-D sample.
#include "cxcore.h" #include "highgui.h" #define MAX_CLUSTERS 5 int main( int argc, char** argv ) { CvScalar color_tab[MAX_CLUSTERS]; IplImage* img = cvCreateImage( cvSize( 500, 500 ), 8, 3 ); CvRNG rng = cvRNG(0xffffffff); color_tab[0] = CV_RGB(255,0,0); color_tab[1] = CV_RGB(0,255,0); color_tab[2] = CV_RGB(100,100,255); color_tab[3] = CV_RGB(255,0,255); color_tab[4] = CV_RGB(255,255,0); cvNamedWindow( "clusters", 1 ); for(;;) { int k, cluster_count = cvRandInt(&rng)%MAX_CLUSTERS + 1; int i, sample_count = cvRandInt(&rng)%1000 + 1; CvMat* points = cvCreateMat( sample_count, 1, CV_32FC2 ); CvMat* clusters = cvCreateMat( sample_count, 1, CV_32SC1 ); /* generate random sample from multigaussian distribution */ for( k = 0; k < cluster_count; k++ ) { CvPoint center; CvMat point_chunk; center.x = cvRandInt(&rng)%img->width; center.y = cvRandInt(&rng)%img->height; cvGetRows( points, &point_chunk, k*sample_count/cluster_count, k == cluster_count - 1 ? sample_count : (k+1)*sample_count/cluster_count ); cvRandArr( &rng, &point_chunk, CV_RAND_NORMAL, cvScalar(center.x,center.y,0,0), cvScalar(img->width/6, img->height/6,0,0) ); } /* shuffle samples */ for( i = 0; i < sample_count/2; i++ ) { CvPoint2D32f* pt1 = (CvPoint2D32f*)points->data.fl + cvRandInt(&rng)%sample_count; CvPoint2D32f* pt2 = (CvPoint2D32f*)points->data.fl + cvRandInt(&rng)%sample_count; CvPoint2D32f temp; CV_SWAP( *pt1, *pt2, temp ); } cvKMeans2( points, cluster_count, clusters, cvTermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 10, 1.0 )); cvZero( img ); for( i = 0; i < sample_count; i++ ) { CvPoint2D32f pt = ((CvPoint2D32f*)points->data.fl)[i]; int cluster_idx = clusters->data.i[i]; cvCircle( img, cvPointFrom32f(pt), 2, color_tab[cluster_idx], CV_FILLED ); } cvReleaseMat( &points ); cvReleaseMat( &clusters ); cvShowImage( "clusters", img ); int key = cvWaitKey(0); if( key == 27 ) // 'ESC' break; } }
An OpenCV 2.4 example: the pixels of a 377x280 image are clustered, where each pixel is a 5-D sample (x, y, r, g, b). The clustering results are shown below.
Row 1: original image; k=2, t=72 ms; k=3, t=93 ms
Row 2: k=4, t=128 ms; k=10, t=330 ms; k=20, t=676 ms
As some local regions of the images show, a larger k does not necessarily bring out more detail (e.g. the center of the sunflower in the last two images). This is because the initial centers of kmeans are chosen at random, so the same samples can produce different results from run to run.
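If reproducible results are wanted for this kind of comparison, one option (my own suggestion, not something the code below does) is to seed OpenCV's global RNG before calling cv::kmeans; to the best of my knowledge the random center initialization uses cv::theRNG() internally, so fixing its state makes each run start from the same centers:

#include "opencv2/core/core.hpp"

int main()
{
    // Fix OpenCV's global RNG so the random initialization inside
    // cv::kmeans is the same on every run (the seed value is arbitrary).
    cv::theRNG().state = 12345;

    cv::Mat samples(100, 5, CV_32F), labels, centers;
    cv::randu(samples, cv::Scalar::all(0), cv::Scalar::all(1));
    cv::kmeans(samples, 4, labels,
               cv::TermCriteria(cv::TermCriteria::EPS + cv::TermCriteria::COUNT, 10, 1.0),
               1, cv::KMEANS_PP_CENTERS, centers);
    return 0;
}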
#include "stdafx.h" #include "opencv2/opencv.hpp" #include <iostream> #include <string> using namespace cv; using namespace std; //這是Kmeans算法的一個缺點,在聚類之前需要指定類別個數 const int nClusters = 20; int _tmain(int argc, _TCHAR* argv[]) { Mat src; //相當於IplImage // src = imread("fruit.jpg"); //只是另一張圖 src = imread("zombie.jpg"); //cvLoadImage imshow("original", src); //cvShowImage blur(src, src, Size(11,11)); imshow("blurred", src); //p是特征矩陣,每行表示一個特征,每個特征對應src中每個像素點的(x,y,r,g,b共5維) Mat p = Mat::zeros(src.cols*src.rows, 5, CV_32F); //初始化全0矩陣 Mat bestLabels, centers, clustered; vector<Mat> bgr; cv::split(src, bgr); //分隔出src的三個通道 for(int i=0; i<src.cols*src.rows; i++) { p.at<float>(i,0) = (i/src.cols) / src.rows; // p.at<uchar>(y,x) 相當於 p->Imagedata[y *p->widthstep + x], p是8位uchar p.at<float>(i,1) = (i%src.cols) / src.cols; // p.at<float>(y,x) 相當於 p->Imagedata[y *p->widthstep + x], p是32位float p.at<float>(i,2) = bgr[0].data[i] / 255.0; p.at<float>(i,3) = bgr[1].data[i] / 255.0; p.at<float>(i,4) = bgr[2].data[i] / 255.0; } //計算時間 double t = (double)cvGetTickCount(); //kmeans聚類,每個樣本的標簽保存在bestLabels中 cv::kmeans(p, nClusters, bestLabels, TermCriteria( CV_TERMCRIT_EPS+CV_TERMCRIT_ITER, 10, 1.0), 3, KMEANS_PP_CENTERS, centers); t = (double)cvGetTickCount() - t; float timecost = t/(cvGetTickFrequency()*1000); //給每個類別賦顏色,其值等於每個類第一個元素的值 Vec3b colors[nClusters]; bool colormask[nClusters]; memset(colormask, 0, nClusters*sizeof(bool)); int count = 0; for(int i=0; i<src.cols*src.rows; i++) { int clusterindex = bestLabels.at<int>(i,0); for (int j=0; j<nClusters; j++) { if(j == clusterindex && colormask[j] == 0) { int y = i/src.cols; int x = i%src.cols; colors[j] = src.at<Vec3b>(y,x); colormask[j] = 1; count++; break; } } if(nClusters == count)break; } //顯示聚類結果 clustered = Mat(src.rows, src.cols, CV_8UC3); for(int i=0; i<src.cols*src.rows; i++) { int y = i/src.cols; int x = i%src.cols; int clusterindex = bestLabels.at<int>(i,0); clustered.at<Vec3b>(y, x) = colors[clusterindex]; } imshow("clustered", clustered); cout<< "time cost = %gms\n"<<timecost ; //保存圖像 stringstream s1,s2; s1<<timecost; s2<<nClusters; string name = "n=" + s2.str() + "_timecost=" + s1.str() + ".png"; imwrite(name, clustered); waitKey(); return 0; }