譜聚類


廣義上來說,任何在算法中用到SVD/特征值分解的,都叫Spectral Algorithm。順便說一下,對於任意矩陣只存在奇異值分解,不存在特征值分解。對於正定的對稱矩陣,奇異值就是特征值,奇異向量就是特征向量。

傳統的聚類算法,如K-Means、EM算法都是建立在凸球形樣本空間上,當樣本空間不為凸時,算法會陷入局部最優,最終結果受初始參數的選擇影響比較大。而譜聚類可以在任意形狀的樣本空間上聚類,且收斂於全局最優解。

譜聚類和CHAMELEON聚類很像,都是把樣本點的相似度放到一個帶權無向圖中,采用“圖划分”的方法進行聚類。只是譜聚類算法在進行圖划分的時候發現計算量很大,轉而求特征值去了,而且最后還在幾個小特征向量組成的矩陣上進行了K-Means聚類

Simply speaking,譜聚類算法分為3步:

  1. 構造一個N×N的權值矩陣W,Wij表示樣本i和樣本j的相似度,顯然W是個對稱矩陣。相似度的計算方法很多了,你可以用歐拉距離、街區距離、向量夾角、皮爾森相關系數等。並不是任意兩個點間的相似度都要表示在圖上,我們希望的權值圖是比較稀疏的,有2種方法:權值小於閾值的認為是0;K最鄰近方法,即每個點只和跟它最近的k個點連起來,CHAMELEON算法的第1階段就是這么干的。再構造一個對角矩陣D,Dii為W第i列元素之和。最后構造矩陣L=D-W。可以證明L是個半正定和對稱矩陣。
  2. 求L的前K特征值對應的特征向量(這要用到奇異值分解了)。把K個特征向量放在一起構造一個N×K的矩陣M。
  3. 把M的每一行當成一個新的樣本點,對這N個新的樣本點進行K-Means聚類。

從文件讀入樣本點,最終算得矩陣L

#include<math.h>
#include<string.h>
#include"matrix.h"
#include"svd.h"

#define N 19		//樣本點個數
#define K 4			//K-Means算法中的K
#define T 0.1		//樣本點之間相似度的閾值

double sample[N][2];	//存放所有樣本點的坐標(2維的)

void readSample(char *filename){
	FILE *fp;
	if((fp=fopen(filename,"r"))==NULL){
		perror("fopen");
		exit(0);
	}
	char buf[50]={0};
	int i=0;
	while(fgets(buf,sizeof(buf),fp)!=NULL){
		char *w=strtok(buf," \t");
		double x=atof(w);
		w=strtok(NULL," \t");
		double y=atof(w);
		sample[i][0]=x;
		sample[i][1]=y;
		i++;
		memset(buf,0x00,sizeof(buf));
	}
	assert(i==N);
	fclose(fp);
}

double** getSimMatrix(){
	//為二維矩陣申請空間
	double **matrix=getMatrix(N,N);
	//計算樣本點兩兩之間的相似度,得到矩陣W
	int i,j;
	for(i=0;i<N;i++){
		matrix[i][i]=1;
		for(j=i+1;j<N;j++){
			double dist=sqrt(pow(sample[i][0]-sample[j][0],2)+pow(sample[i][1]-sample[j][1],2));
			double sim=1.0/(1+dist);
			if(sim>T){
				matrix[j][i]=sim;
				matrix[i][j]=sim;
			}
		}
	}
	//計算L=D-W
	for(j=0;j<N;j++){
		double sum=0;
		for(i=0;i<N;i++){
			sum+=matrix[i][j];
			if(i!=j)
				matrix[i][j]=0-matrix[i][j];
		}
		matrix[j][j]=matrix[j][j]-sum;
	}
	return matrix;
}

int main(){
	char *file="/home/orisun/data";
	readSample(file);
	double **L=getSimMatrix();
	printMatrix(L,N,N);
	
	double **M=singleVector(L,N,N,5);
	printMatrix(M,N,5);
	
	freeMatrix(L,N);

	return 0;
}

L已是對稱矩陣,直接奇異值分解的得到的就是特征向量

#ifndef _MATRIX_H
#define _MATRIX_H
  
#include<assert.h>
#include<stdlib.h>
#include<stdio.h>

//初始化一個二維矩陣
double** getMatrix(int rows,int columns){
    double **rect=(double**)calloc(rows,sizeof(double*));
    int i;
    for(i=0;i<rows;++i)
        rect[i]=(double*)calloc(columns,sizeof(double));
    return rect;
}

//返回一個單位矩陣
double** getIndentityMatrix(int rows){
    double** IM=getMatrix(rows,rows);
    int i;
    for(i=0;i<rows;++i)
        IM[i][i]=1.0;
    return IM;
}
  
//返回一個矩陣的副本
double** copyMatrix(double** matrix,int rows,int columns){
    double** rect=getMatrix(rows,columns);
    int i,j;
    for(i=0;i<rows;++i)
        for(j=0;j<columns;++j)
            rect[i][j]=matrix[i][j];
    return rect;
}

//從一個一維矩陣得到一個二維矩陣
void getFromArray(double** matrix,int rows,int columns,double *arr){
    int i,j,k=0;
    for(i=0;i<rows;++i){
        for(j=0;j<columns;++j){
            matrix[i][j]=arr[k++];
        }
    }
}

//打印二維矩陣
void printMatrix(double** matrix,int rows,int columns){
    int i,j;
    for(i=0;i<rows;++i){
        for(j=0;j<columns;++j){
            printf("%-10f\t",matrix[i][j]);
        }
        printf("\n");
    }
}

//釋放二維矩陣
void freeMatrix(double** matrix,int rows){
    int i;
    for(i=0;i<rows;++i)
        free(matrix[i]);
    free(matrix);
}
  
//獲取二維矩陣的某一行
double* getRow(double **matrix,int rows,int columns,int index){
    assert(index<rows);
    double *rect=(double*)calloc(columns,sizeof(double));
    int i;
    for(i=0;i<columns;++i)
        rect[i]=matrix[index][i];
    return rect;
}

//獲取二維矩陣的某一列  
double* getColumn(double **matrix,int rows,int columns,int index){
    assert(index<columns);
    double *rect=(double*)calloc(rows,sizeof(double));
    int i;
    for(i=0;i<rows;++i)
        rect[i]=matrix[i][index];
    return rect;
}

//設置二維矩陣的某一列   
void setColumn(double **matrix,int rows,int columns,int index,double *arr){
    assert(index<columns);
    int i;
    for(i=0;i<rows;++i)
        matrix[i][index]=arr[i];
}

//交換矩陣的某兩列
void exchangeColumn(double **matrix,int rows,int columns,int i,int j){
    assert(i<columns);
    assert(j<columns);
    int row;
    for(row=0;row<rows;++row){
        double tmp=matrix[row][i];
        matrix[row][i]=matrix[row][j];
        matrix[row][j]=tmp;
    }
}


//得到矩陣的轉置
double** getTranspose(double **matrix,int rows,int columns){
    double **rect=getMatrix(columns,rows);
    int i,j;
    for(i=0;i<columns;++i){
        for(j=0;j<rows;++j){
            rect[i][j]=matrix[j][i];
        }
    }
    return rect;
}

//計算兩向量內積
double vectorProduct(double *vector1,double *vector2,int len){
    double rect=0.0;
    int i;
    for(i=0;i<len;++i)
        rect+=vector1[i]*vector2[i];
    return rect;
}

//兩個矩陣相乘
double** matrixProduct(double **matrix1,int rows1,int columns1,double **matrix2,int columns2){
    double **rect=getMatrix(rows1,columns2);
    int i,j;
    for(i=0;i<rows1;++i){
        for(j=0;j<columns2;++j){
            double *vec1=getRow(matrix1,rows1,columns1,i);
            double *vec2=getColumn(matrix2,columns1,columns2,j);
            rect[i][j]=vectorProduct(vec1,vec2,columns1);
            free(vec1);
            free(vec2);
        }
    }
    return rect;
}

//得到某一列元素的平方和
double getColumnNorm(double** matrix,int rows,int columns,int index){
    assert(index<columns);
    double* vector=getColumn(matrix,rows,columns,index);
    double norm=vectorProduct(vector,vector,rows);
    free(vector);
    return norm;
}

//打印向量
void printVector(double* vector,int len){
    int i;
    for(i=0;i<len;++i)
        printf("%-15.8f\t",vector[i]);
    printf("\n");
}
  
#endif
#include"matrix.h"

#define ITERATION 100		//單邊Jacobi最大迭代次數
#define THREASHOLD 0.1

//符號函數
int sign(double number) {
    if(number<0)
        return -1;
    else
        return 1; 
}

//兩個向量進行單邊Jacobi正交變換
void orthogonalVector(double *Ci,double *Cj,int len1,double *Vi,double *Vj,int len2,int *pass){
    double ele=vectorProduct(Ci,Cj,len1);
    if(fabs(ele)<THREASHOLD)
        return;     //如果兩列已經正交,不需要進行變換,則返回true
    *pass=0;
    double ele1=vectorProduct(Ci,Ci,len1);
    double ele2=vectorProduct(Cj,Cj,len1);
      
      
  
    double tao=(ele1-ele2)/(2*ele);
    double tan=sign(tao)/(fabs(tao)+sqrt(1+pow(tao,2)));
    double cos=1/sqrt(1+pow(tan,2));
    double sin=cos*tan;
      
    int row;
    for(row=0;row<len1;++row){
        double var1=Ci[row]*cos+Cj[row]*sin;
        double var2=Cj[row]*cos-Ci[row]*sin;
  		Ci[row]=var1;
        Cj[row]=var2;
    }
    for(row=0;row<len2;++row){
        double var1=Vi[row]*cos+Vj[row]*sin;
        double var2=Vj[row]*cos-Vi[row]*sin;
  		Vi[row]=var1;
        Vj[row]=var2;
    }
}
  
//矩陣的兩列進行單邊Jacobi正交變換。V是方陣,行/列數為columns
void orthogonal(double **matrix,int rows,int columns,int i,int j,int *pass,double **V){
    assert(i<j);
      
    double* Ci=getColumn(matrix,rows,columns,i);
    double* Cj=getColumn(matrix,rows,columns,j);
    double* Vi=getColumn(V,columns,columns,i);
    double* Vj=getColumn(V,columns,columns,j);
    orthogonalVector(Ci,Cj,rows,Vi,Vj,columns,pass);
      
    int row;
    for(row=0;row<rows;++row){
        matrix[row][i]=Ci[row];
        matrix[row][j]=Cj[row];
    }
    for(row=0;row<columns;++row){
        V[row][i]=Vi[row];
        V[row][j]=Vj[row];
    }
    free(Ci);
    free(Cj);
    free(Vi);
    free(Vj);
}

//循環正交,進行奇異值分解
void hestens_jacobi(double **matrix,int rows,int columns,double **V)
{     
    int iteration = ITERATION;
    while (iteration-- > 0) {
        int pass = 1;
        int i,j;
        for (i = 0; i < columns; ++i) {
            for (j = i+1; j < columns; ++j) {
                orthogonal(matrix,rows,columns,i,j,&pass,V);      //經過多次的迭代正交后,V就求出來了
            }
        }
        if (pass==1)   //當任意兩列都正交時退出迭代
            break;
    }
    printf("迭代次數:%d\n",ITERATION - iteration);
}

//獲取矩陣前n小的奇異向量
double **singleVector(double **A,int rows,int columns,int n){
	double **V=getIndentityMatrix(columns);
	hestens_jacobi(A,rows,columns,V);
	
	double *singular=(double*)calloc(columns,sizeof(double));		//特征值
	int i,j;
	for(i=0;i<columns;++i){
		double *vector=getColumn(A,rows,columns,i);
		double norm=sqrt(vectorProduct(vector,vector,rows));
        singular[i]=norm;
    }
    
    int *sort=(int*)calloc(columns,sizeof(int));
    for(i=0;i<columns;++i)
    	sort[i]=i;
    for(i=0;i<columns-1;++i){
    	int minIndex=i;
    	int minValue=singular[i];
    	for(j=i+1;j<columns;++j){
    		if(singular[j]<minValue){
    			minValue=singular[j];
    			minIndex=j;
    		}
    	}
    	//交換sigular的第i個和第minIndex個元素
    	singular[minIndex]=singular[i];
    	singular[i]=minValue;
    	//交換sort的第i個和第minIndex個元素
    	int tmp=sort[minIndex];
    	sort[minIndex]=sort[i];
    	sort[i]=tmp;
    }
    
    double **rect=getMatrix(rows,n);
    for(i=0;i<rows;++i){
    	for(j=0;j<n;++j){
    		rect[i][j]=V[i][sort[j]];
    	}
    }
	
	freeMatrix(V,columns);
	free(sort);
	free(singular);
	
	return rect;
}

最后是運行KMeans的Java代碼

package ai;

public class Global {
	//計算兩個向量的歐氏距離
    public static double calEuraDist(double[] arr1,double[] arr2,int len){
        double result=0.0;
        for(int i=0;i<len;i++){
            result+=Math.pow(arr1[i]-arr2[i],2.0);
        }
        return Math.sqrt(result);
    }
}
package ai;

public class DataObject {

	String docname;
	double[] vector;
	int cid;	
	boolean visited;
	
	public DataObject(int len){
		vector=new double[len];
	}

	public String getName() {
		return docname;
	}

	public void setName(String docname) {
		this.docname = docname;
	}

	public double[] getVector() {
		return vector;
	}

	public void setVector(double[] vector) {
		this.vector = vector;
	}

	public int getCid() {
		return cid;
	}

	public void setCid(int cid) {
		this.cid = cid;
	}

	public boolean isVisited() {
		return visited;
	}

	public void setVisited(boolean visited) {
		this.visited = visited;
	}

}
package ai;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
public class DataSource {

	ArrayList<DataObject> objects;
	int row;
	int col;

	public void readMatrix(File dataFile) {
		try {
			FileReader fr = new FileReader(dataFile);
			BufferedReader br = new BufferedReader(fr);
			String line = br.readLine();
			String[] words = line.split("\\s+");
			row = Integer.parseInt(words[0]);
			// row=1000;
			col = Integer.parseInt(words[1]);
			objects = new ArrayList<DataObject>(row);
			for (int i = 0; i < row; i++) {
				DataObject object = new DataObject(col);
				line = br.readLine();
				words = line.split("\\s+");
				for (int j = 0; j < col; j++) {
					object.getVector()[j] = Double.parseDouble(words[j]);
				}
				objects.add(object);
			}
			br.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public void readRLabel(File file) {
		try {
			FileReader fr = new FileReader(file);
			BufferedReader br = new BufferedReader(fr);
			String line = null;
			for (int i = 0; i < row; i++) {
				line = br.readLine();
				objects.get(i).setName(line.trim());
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	public void printResult(ArrayList<DataObject> objects, int n) {
		//DBScan是從第1類開始,K-Means是從第0類開始
//		for (int i =0; i <n; i++) {
		for(int i=1;i<=n;i++){
			System.out.println("=============屬於第"+i+"類的有:===========================");
			Iterator<DataObject> iter = objects.iterator();
			while (iter.hasNext()) {
				DataObject object = iter.next();
				int cid=object.getCid();
				if(cid==i){
					System.out.println(object.getName());
//					switch(Integer.parseInt(object.getName())/1000){
//					case 0:
//						System.out.println(0);
//						break;
//					case 1:
//						System.out.println(1);
//						break;
//					case 2:
//						System.out.println(2);
//						break;
//					case 3:
//						System.out.println(3);
//						break;
//					case 4:
//						System.out.println(4);
//						break;
//					case 5:
//						System.out.println(5);
//						break;
//					default:
//						System.out.println("Go Out");
//						break;
//					}				
				}
			}
		}
	}
}
package ai;

import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;
 
public class KMeans {
 
    int k; // 指定划分的簇數
    double mu; // 迭代終止條件,當各個新質心相對於老質心偏移量小於mu時終止迭代
    double[][] center; // 上一次各簇質心的位置
    int repeat; // 重復運行次數
    double[] crita; // 存放每次運行的滿意度
 
    public KMeans(int k, double mu, int repeat, int len) {
        this.k = k;
        this.mu = mu;
        this.repeat = repeat;
        center = new double[k][];
        for (int i = 0; i < k; i++)
            center[i] = new double[len];
        crita = new double[repeat];
    }
 
    // 初始化k個質心,每個質心是len維的向量,每維均在left--right之間
    public void initCenter(int len, ArrayList<DataObject> objects) {
        Random random = new Random(System.currentTimeMillis());
        int[] count = new int[k]; // 記錄每個簇有多少個元素
        Iterator<DataObject> iter = objects.iterator();
        while (iter.hasNext()) {
            DataObject object = iter.next();
            int id = random.nextInt(10000)%k;
            count[id]++;
            for (int i = 0; i < len; i++)
                center[id][i] += object.getVector()[i];
        }
        for (int i = 0; i < k; i++) {
            for (int j = 0; j < len; j++) {
                center[i][j] /= count[i];
            }
        }
    }
 
    // 把數據集中的每個點歸到離它最近的那個質心
    public void classify(ArrayList<DataObject> objects) {
        Iterator<DataObject> iter = objects.iterator();
        while (iter.hasNext()) {
            DataObject object = iter.next();
            double[] vector = object.getVector();
            int len = vector.length;
            int index = 0;
            double neardist = Double.MAX_VALUE;
            for (int i = 0; i < k; i++) {
                double dist = Global.calEuraDist(vector, center[i], len); // 使用歐氏距離
                if (dist < neardist) {
                    neardist = dist;
                    index = i;
                }
            }
            object.setCid(index);
        }
    }
 
    // 重新計算每個簇的質心,並判斷終止條件是否滿足,如果不滿足更新各簇的質心,如果滿足就返回true.len是數據的維數
    public boolean calNewCenter(ArrayList<DataObject> objects, int len) {
        boolean end = true;
        int[] count = new int[k]; // 記錄每個簇有多少個元素
        double[][] sum = new double[k][];
        for (int i = 0; i < k; i++)
            sum[i] = new double[len];
        Iterator<DataObject> iter = objects.iterator();
        while (iter.hasNext()) {
            DataObject object = iter.next();
            int id = object.getCid();
            count[id]++;
            for (int i = 0; i < len; i++)
                sum[id][i] += object.getVector()[i];
        }
        for (int i = 0; i < k; i++) {
            if (count[i] != 0) {
                for (int j = 0; j < len; j++) {
                    sum[i][j] /= count[i];
                }
            }
            // 簇中不包含任何點,及時調整質心
            else {
                int a=(i+1)%k;
                int b=(i+3)%k;
                int c=(i+5)%k;
                for (int j = 0; j < len; j++) {
                    center[i][j] = (center[a][j]+center[b][j]+center[c][j])/3;
                }
            }
        }
        for (int i = 0; i < k; i++) {
            // 只要有一個質心需要移動的距離超過了mu,就返回false
            if (Global.calEuraDist(sum[i], center[i], len) >= mu) {
                end = false;
                break;
            }
        }
        if (!end) {
            for (int i = 0; i < k; i++) {
                for (int j = 0; j < len; j++)
                    center[i][j] = sum[i][j];
            }
        }
        return end;
    }
 
    // 計算各簇內數據和方差的加權平均,得出本次聚類的滿意度.len是數據的維數
    public double getSati(ArrayList<DataObject> objects, int len) {
        double satisfy = 0.0;
        int[] count = new int[k];
        double[] ss = new double[k];
        Iterator<DataObject> iter = objects.iterator();
        while (iter.hasNext()) {
            DataObject object = iter.next();
            int id = object.getCid();
            count[id]++;
            for (int i = 0; i < len; i++)
                ss[id] += Math.pow(object.getVector()[i] - center[id][i], 2.0);
        }
        for (int i = 0; i < k; i++) {
            satisfy += count[i] * ss[i];
        }
        return satisfy;
    }
 
    public double run(int round, DataSource datasource, int len) {
        System.out.println("第" + round + "次運行");
        initCenter(len,datasource.objects);
        classify(datasource.objects);
        while (!calNewCenter(datasource.objects, len)) {
            classify(datasource.objects);
        }
        datasource.printResult(datasource.objects, k);
        double ss = getSati(datasource.objects, len);
        System.out.println("加權方差:" + ss);
        return ss;
    }
 
    public static void main(String[] args) {
        DataSource datasource = new DataSource();
        datasource.readMatrix(new File("/home/orisun/test/dot.mat"));
        datasource.readRLabel(new File("/home/orisun/test/dot.rlabel"));
        int len = datasource.col;
        // 划分為4個簇,質心移動小於1E-8時終止迭代,重復運行7次
        KMeans km = new KMeans(4, 1E-10, 7, len);
        int index = 0;
        double minsa = Double.MAX_VALUE;
        for (int i = 0; i < km.repeat; i++) {
            double ss = km.run(i, datasource, len);
            if (ss < minsa) {
                minsa = ss;
                index = i;
            }
        }
        System.out.println("最好的結果是第" + index + "次。");
    }
}


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM