Hierarchical cluster算法介紹


  突然想記錄幾個聚類算法,由於實力有限,就先介紹一下層次聚類算法(Hierarchical cluster algorithm)。這個聚類算法思想簡單,但實現起來感覺複雜度挺大;以前在《集體智慧編程》裡看過相關介紹,書中是用Python實現的,由於Python裡面的列表和字典用起來方便,故實現該算法還行;這裡我用C++重新寫了一下,感覺代碼蠻臃腫,可能是自己的C++沒有學好吧!對於容器的使用還不夠熟練,這裡貼出來的目的是希望哪位大牛看到了能指導一二,感激不盡。廢話不多說了,進入正題吧!

************************************************************************************************************

Hierarchical cluster Algorithm的大致介紹

  層次聚類算法有兩種實現思想:一種是初始時將每個待聚類的數據樣本視為一個cluster,採用合併的方式,每次合併兩個"距離"最近的cluster,直到合併成一個cluster為止(當然也可以在達到自己設定的cluster個數時終止迭代);另一種剛好與第一種相反,初始時將所有的數據樣本視為一個cluster,採用分解的方式(這裡沒有實現,就不多說)。

************************************************************************************************************

算法的步驟及相關問題

  算法步驟:  (1)初始時,將每個數據樣本視為一個cluster(選取一個度量兩個cluster距離的方式),

       (2)計算任意兩個cluster之間的距離;每次選取距離最小的兩個cluster,

       (3)合併(2)中選擇的兩個cluster,將合併產生的新cluster加入cluster set中,並刪除被合併的兩個cluster,

       (4)重複(2)(3),直到cluster set中元素只剩下一個為止。

  相關問題: (1)度量兩個cluster之間的距離,應該選擇哪種距離?《集體智慧編程》中選擇的是Pearson相關係數(下面的代碼以 1 − r 作為距離,r 越大距離越小),當然也可以直接選用歐氏距離

        (2)如何合併兩個cluster,即新的cluster對應的屬性值如何表示?這裡是用被合併的兩個cluster的平均值表示新的cluster

******************************************************************************************************************

  1 /**
  2 ** Hierarchical cluster Algorithm
  3 ** step:(1)Firstly,regard each sample as a cluster, and
  4          (2)Each time merge two clusters if the distance between them is lowest.
  5          (3)then add the new cluster into cluster set, and delete two clusters merged from cluster set.
  6 ** method: (1)as to merging, here replace the old two clusters with their average;
  7            (2)measure the distance with the Pearson similarity.
  8 ** Time:2013/7/10 
  9 **/
 10 #include <iostream>
 11 #include <map>
 12 #include <vector>
 13 #include <string>
 14 #include <fstream> 
 15 #include <cstring>
 16 #include <sstream> 
 17 #include <cmath>
 18 #include <iterator>
 19 using namespace std;
 20 //cluster
 21 typedef    struct bicluster{
 22     vector<double> attri;//attribute
 23     int  cid;//cluster id 
 24 }Bicluster;
 25 //a pair
 26 typedef struct lowpair{
 27     int leftid;
 28     int rightid;
 29     double dist;
 30 }Lpair;
 31 
 32 /*****************************************************************
 33 ** convert string(char*) to double(or other type)
 34 ** here should be included <sstream> before using the stringstream
 35 ******************************************************************/
 36 double str2double(char* str){
 37     stringstream ss;
 38     ss << str;
 39     double tmp;
 40     ss >> tmp;
 41     return tmp;    
 42 }
 43 /*****************************************************************
 44 ** split the string containing some special tokens
 45 ******************************************************************/
 46 string split(string &str, vector<double>& dvec, const char* tok){
 47     char *pch = NULL;
 48     pch = strtok(const_cast<char*>(str.c_str()), tok);
 49     string stmp(pch);
 50     while( pch != NULL ){
 51         pch = strtok(NULL, tok);
 52         if( !pch )
 53             break;
 54         dvec.push_back(str2double(pch));
 55     }
 56     return stmp;
 57 }
 58 /******************************************************************
 59 ** read data from 'blogdata.txt'
 60 ** @is ------- a reference to ifstream object(input)
 61 ** @data ----- a map used to store the data (output)
 62 ******************************************************************/
 63 bool readfile(ifstream &is, map<string, vector<double> >& mydata){
 64     if( is.fail() ){
 65         cerr << "can't open the file !!!" << endl;
 66         return false;
 67     }
 68     //ignore the first line of file
 69     string str;
 70     getline(is, str);
 71     
 72     //store the data read from file into mydata 
 73     while( !is.eof() ){
 74         vector<double> dtmp;
 75         string tmp;
 76         getline(is, str);
 77         tmp = split(str, dtmp, "\t");
 78         mydata.insert(pair<string,vector<double> >(tmp, dtmp));
 79     }
 80     return true;         
 81 }
 82 /*****************************************************************
 83 ** compute the distance between two clusters
 84 ** Note that Pearson value devotes to the similarity between 
 85     two clusters, that is, the greater the Pearson value, the 
 86     lower the distance between them.
 87 *****************************************************************/ 
 88 double distPearson(vector<double>& left, vector<double>& right){
 89     double sum1 = 0;
 90     double sum2 = 0;
 91     int len = left.size();
 92     for(int i=0; i<len; ++i){
 93         sum1 += left[i];
 94         sum2 += right[i];
 95     }
 96     
 97     /**
 98     ** maybe you will feel it's complex, 
 99     **  and here we could replace Pearson with Euclidean distance
100     **/
101     double sum1Sq = 0;
102     double sum2Sq = 0;
103     for(int j=0; j<len; ++j){
104         sum1Sq += pow(left[j], 2);
105         sum2Sq += pow(right[j], 2);
106     }
107     
108     double pSum = 0, num, den;
109     for(int k=0; k<len; ++k)
110         pSum += left[k]*right[k];
111     num = pSum - sum1*sum2 / len;
112     den = sqrt((sum1Sq - pow(sum1,2)/len) * (sum1Sq - pow(sum2,2)/len));
113     if( den == 0 )
114         return 0;
115     return 1.0 - num/den;
116 }
117 /*************************************************************
118 ** Given two clusters, the distance between them 
119     should be checked whether it exists before compute it.
120 **************************************************************/
121 bool isExist(vector<Lpair> &lp, int leftid, int rightid, double &d){
122     vector<Lpair>::iterator it = lp.begin();
123     for(; it!=lp.end(); ++it){
124         if( (it->leftid==leftid) && (it->rightid==rightid) ){
125             d = it->dist;//if the distance has been computed, assign its value to d
126             return true;
127         }        
128     }
129     d = 0;
130     return false;
131 }
132 /*************************************************************
133 ** Given a cluster's id, delete the cluster from cluster set
134 **************************************************************/
135 void Del(vector<Bicluster> &cvec, int clusterid){
136     vector<Bicluster>::iterator it = cvec.begin();
137     for(; it!=cvec.end(); ++it){
138         if( it->cid == clusterid )
139             break;
140     }
141     cvec.erase(it);
142 } 
143 /*************************************************************
144 ** Hierarchical Cluster Algorithm
145 **************************************************************/
146 void HierarchicalCluster(map<string, vector<double> > &mydata){
147     vector<Lpair> distances;//used to store the distance
148      
149     //firstly,regard each sample as a cluster
150     vector<Bicluster> cvec;
151     map<string, vector<double> >::iterator it = mydata.begin();
152     int myid = 0;
153     for(; it!= mydata.end(); ++it){
154         Bicluster btmp;
155         btmp.attri = it->second;
156         btmp.cid = myid++;
157         cvec.push_back(btmp);
158     } 
159     myid = -1;
160     //search the pair
161     while( cvec.size()>1 ){
162         Lpair lowp;
163         double closedis = distPearson(cvec[0].attri,cvec[1].attri);
164         lowp.leftid = cvec[0].cid, lowp.rightid = cvec[1].cid;
165         lowp.dist = closedis;
166         
167         int leftps = 0, rightps = 1;
168         for(int ix=0; ix<cvec.size(); ++ix){
169             for(int iy=ix+1; iy<cvec.size(); ++iy){
170                 double d;
171                 int lid = cvec[ix].cid, rid = cvec[iy].cid;
172                 if( !isExist(distances,lid,rid,d) ){
173                     Lpair lptmp;
174                     lptmp.dist = distPearson(cvec[ix].attri, cvec[iy].attri);
175                     lptmp.leftid = lid;
176                     lptmp.rightid= rid;
177                     distances.push_back(lptmp);
178                     d = lptmp.dist;
179                   } 
180                  if( d < lowp.dist ){
181                      lowp.leftid = lid;
182                      lowp.rightid = rid;
183                      leftps = ix;
184                      rightps = iy;
185                      lowp.dist = d;
186                  }
187             }
188         }
189         //create a new cluster
190         Bicluster ncluster;
191         for(int i=0; i<cvec[0].attri.size(); ++i){
192             double av;
193             av = (cvec[leftps].attri[i] + cvec[rightps].attri[i]) / 2.0;
194             ncluster.attri.push_back(av);
195         }
196         ncluster.cid = myid--;//assign negative to the new cluster's id
197         cout << "leftid: " << lowp.leftid <<  ", rightid: " << lowp.rightid << endl;
198         //delete the pair
199         Del(cvec, lowp.leftid); 
200         Del(cvec, lowp.rightid);
201         cvec.push_back(ncluster);
202     } 
203 } 
204 int main()
205 {
206     ifstream is("blogdata.txt");
207     if( is.fail() ){
208         cerr << "error!!!" << endl;
209         exit(-1);
210     }
211     map<string, vector<double> > mydata;
212     if(readfile(is, mydata))
213         HierarchicalCluster(mydata);
214     return 0;
215 }

  代碼寫得有點亂且複雜,最後顯示的結果不是樹狀圖(Python很容易實現),只是簡單地顯示了每次被合併的兩個cluster的id。代碼中用到的數據可以從 http://kiwitobes.com/clusters/blog.txt 下載得到。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM