突然想記錄幾個聚類算法,由於實力有限就先介紹一下層次聚類算法(Hierarchical cluster algorithm),這個聚類算法思想簡單,但實現起來感覺復雜度挺大;以前看過《集體智慧編程》里介紹過,里面是用python實現的,由於python里面的列表和字典用起來方便,故實現該算法還行;這里我用c++重新寫了一下,感覺代碼蠻臃腫,可能是自己的c++沒有學習好吧!!!對於容器的使用還不夠熟練,這里貼出來的目的是希望哪位大牛看到了指導一二,這里感激不盡。廢話不多說了,進入正題吧!
************************************************************************************************************
Hierarchical cluster Algorithm的大致介紹
層次聚類算法有兩種實現思想,一種是初始時將每個待聚類的數據樣本視為一個cluster,采用合並的方式,每次合並兩個"距離"最近的cluster,直到合並成一個cluster為止(當然可以在達到自己設定想得到的cluster個數時終止迭代);另一種剛好與第一種相反,初始時將所有的數據樣本視為一個cluster,采用分解的方式(這里沒有實現就不說太多)。
************************************************************************************************************
算法的步驟及相關問題
算法步驟: (1)初始時,將每個數據樣本視為一個cluster(選取一個度量兩個cluster距離的方式),
(2)計算任意兩個cluster之間的距離;每次選取距離最小的兩個cluster,
(3)合並(2)中選擇的兩個cluster,將合並產生的新cluster加入cluster set中,並刪除被合並的兩個cluster,
(4)重復(2)(3),直到cluster set中元素只剩下一個為止。
相關問題: (1)度量兩個cluster之間的距離,應該選擇哪種距離???《集體智慧編程》中選擇的是Pearson,當然也可以直接選用歐氏距離
(2)如何合並兩個cluster,即新的cluster對應的屬性值如何表示???這里是用被合並的兩個cluster的平均值表示新的cluster
******************************************************************************************************************
1 /** 2 ** Hierarchical cluster Algorithm 3 ** step:(1)Firstly,regard each sample as a cluster, and 4 (2)Each time merge two clusters if the distance between them is lowest. 5 (3)then add the new cluster into cluster set, and delete two clusters merged from cluster set. 6 ** method: (1)as to merging, here replace the old two clusters with their average; 7 (2)measure the distance with the Pearson similarity. 8 ** Time:2013/7/10 9 **/ 10 #include <iostream> 11 #include <map> 12 #include <vector> 13 #include <string> 14 #include <fstream> 15 #include <cstring> 16 #include <sstream> 17 #include <cmath> 18 #include <iterator> 19 using namespace std; 20 //cluster 21 typedef struct bicluster{ 22 vector<double> attri;//attribute 23 int cid;//cluster id 24 }Bicluster; 25 //a pair 26 typedef struct lowpair{ 27 int leftid; 28 int rightid; 29 double dist; 30 }Lpair; 31 32 /***************************************************************** 33 ** convert string(char*) to double(or other type) 34 ** here should be included <sstream> before using the stringstream 35 ******************************************************************/ 36 double str2double(char* str){ 37 stringstream ss; 38 ss << str; 39 double tmp; 40 ss >> tmp; 41 return tmp; 42 } 43 /***************************************************************** 44 ** split the string containing some special tokens 45 ******************************************************************/ 46 string split(string &str, vector<double>& dvec, const char* tok){ 47 char *pch = NULL; 48 pch = strtok(const_cast<char*>(str.c_str()), tok); 49 string stmp(pch); 50 while( pch != NULL ){ 51 pch = strtok(NULL, tok); 52 if( !pch ) 53 break; 54 dvec.push_back(str2double(pch)); 55 } 56 return stmp; 57 } 58 /****************************************************************** 59 ** read data from 'blogdata.txt' 60 ** @is ------- a reference to ifstream object(input) 61 ** @data ----- a map used to store the data (output) 62 
******************************************************************/ 63 bool readfile(ifstream &is, map<string, vector<double> >& mydata){ 64 if( is.fail() ){ 65 cerr << "can't open the file !!!" << endl; 66 return false; 67 } 68 //ignore the first line of file 69 string str; 70 getline(is, str); 71 72 //store the data read from file into mydata 73 while( !is.eof() ){ 74 vector<double> dtmp; 75 string tmp; 76 getline(is, str); 77 tmp = split(str, dtmp, "\t"); 78 mydata.insert(pair<string,vector<double> >(tmp, dtmp)); 79 } 80 return true; 81 } 82 /***************************************************************** 83 ** compute the distance between two clusters 84 ** Note that Pearson value devotes to the similarity between 85 two clusters, that is, the greater the Pearson value, the 86 lower the distance between them. 87 *****************************************************************/ 88 double distPearson(vector<double>& left, vector<double>& right){ 89 double sum1 = 0; 90 double sum2 = 0; 91 int len = left.size(); 92 for(int i=0; i<len; ++i){ 93 sum1 += left[i]; 94 sum2 += right[i]; 95 } 96 97 /** 98 ** maybe you will feel it's complex, 99 ** and here we could replace Pearson with Euclidean distance 100 **/ 101 double sum1Sq = 0; 102 double sum2Sq = 0; 103 for(int j=0; j<len; ++j){ 104 sum1Sq += pow(left[j], 2); 105 sum2Sq += pow(right[j], 2); 106 } 107 108 double pSum = 0, num, den; 109 for(int k=0; k<len; ++k) 110 pSum += left[k]*right[k]; 111 num = pSum - sum1*sum2 / len; 112 den = sqrt((sum1Sq - pow(sum1,2)/len) * (sum1Sq - pow(sum2,2)/len)); 113 if( den == 0 ) 114 return 0; 115 return 1.0 - num/den; 116 } 117 /************************************************************* 118 ** Given two clusters, the distance between them 119 should be checked whether it exists before compute it. 
120 **************************************************************/ 121 bool isExist(vector<Lpair> &lp, int leftid, int rightid, double &d){ 122 vector<Lpair>::iterator it = lp.begin(); 123 for(; it!=lp.end(); ++it){ 124 if( (it->leftid==leftid) && (it->rightid==rightid) ){ 125 d = it->dist;//if the distance has been computed, assign its value to d 126 return true; 127 } 128 } 129 d = 0; 130 return false; 131 } 132 /************************************************************* 133 ** Given a cluster's id, delete the cluster from cluster set 134 **************************************************************/ 135 void Del(vector<Bicluster> &cvec, int clusterid){ 136 vector<Bicluster>::iterator it = cvec.begin(); 137 for(; it!=cvec.end(); ++it){ 138 if( it->cid == clusterid ) 139 break; 140 } 141 cvec.erase(it); 142 } 143 /************************************************************* 144 ** Hierarchical Cluster Algorithm 145 **************************************************************/ 146 void HierarchicalCluster(map<string, vector<double> > &mydata){ 147 vector<Lpair> distances;//used to store the distance 148 149 //firstly,regard each sample as a cluster 150 vector<Bicluster> cvec; 151 map<string, vector<double> >::iterator it = mydata.begin(); 152 int myid = 0; 153 for(; it!= mydata.end(); ++it){ 154 Bicluster btmp; 155 btmp.attri = it->second; 156 btmp.cid = myid++; 157 cvec.push_back(btmp); 158 } 159 myid = -1; 160 //search the pair 161 while( cvec.size()>1 ){ 162 Lpair lowp; 163 double closedis = distPearson(cvec[0].attri,cvec[1].attri); 164 lowp.leftid = cvec[0].cid, lowp.rightid = cvec[1].cid; 165 lowp.dist = closedis; 166 167 int leftps = 0, rightps = 1; 168 for(int ix=0; ix<cvec.size(); ++ix){ 169 for(int iy=ix+1; iy<cvec.size(); ++iy){ 170 double d; 171 int lid = cvec[ix].cid, rid = cvec[iy].cid; 172 if( !isExist(distances,lid,rid,d) ){ 173 Lpair lptmp; 174 lptmp.dist = distPearson(cvec[ix].attri, cvec[iy].attri); 175 lptmp.leftid = lid; 176 
lptmp.rightid= rid; 177 distances.push_back(lptmp); 178 d = lptmp.dist; 179 } 180 if( d < lowp.dist ){ 181 lowp.leftid = lid; 182 lowp.rightid = rid; 183 leftps = ix; 184 rightps = iy; 185 lowp.dist = d; 186 } 187 } 188 } 189 //create a new cluster 190 Bicluster ncluster; 191 for(int i=0; i<cvec[0].attri.size(); ++i){ 192 double av; 193 av = (cvec[leftps].attri[i] + cvec[rightps].attri[i]) / 2.0; 194 ncluster.attri.push_back(av); 195 } 196 ncluster.cid = myid--;//assign negative to the new cluster's id 197 cout << "leftid: " << lowp.leftid << ", rightid: " << lowp.rightid << endl; 198 //delete the pair 199 Del(cvec, lowp.leftid); 200 Del(cvec, lowp.rightid); 201 cvec.push_back(ncluster); 202 } 203 } 204 int main() 205 { 206 ifstream is("blogdata.txt"); 207 if( is.fail() ){ 208 cerr << "error!!!" << endl; 209 exit(-1); 210 } 211 map<string, vector<double> > mydata; 212 if(readfile(is, mydata)) 213 HierarchicalCluster(mydata); 214 return 0; 215 }
代碼寫的有點亂且復雜,最后顯示的結果不是樹狀圖(python很易實現),只是簡單的顯示了每次被合並的兩個cluster的id.代碼中用到的數據可以從http://kiwitobes.com/clusters/blog.txt下載得到。
