K-means


      K-均值算法的基本思想是:首先從含有N個數據對象的數據集中隨機選擇K個數據對象作為初始中心,然後計算每個數據對象到各中心的距離,根據最近鄰原則,將每個數據對象劃分到離它最近的那個中心所代表的簇中;接著分別計算新生成的各個簇中數據對象的均值,作為各簇新的中心,並比較新的中心和上一次得到的中心。如果新的中心沒有發生變化,則算法收斂,輸出結果;如果新的中心和上一次的中心相比發生變化,則要根據新的中心對所有數據對象重新進行劃分,直到滿足算法的收斂條件為止。

      K-means算法的過程可以描述為:

          算法:K-means,以簇中對象的平均值作為簇中心進行劃分。

          輸入:簇的數目K和包含N個對象的數據庫。 

         輸出:平方誤差總和最小條件下的K個簇。 

     方法:

          1) 任意選擇K個對象作為初始的簇中心; 

          2) 分別計算數據集中每個元素與各簇中心的距離(一般採用歐氏距離),根據最近鄰原則,將元素劃分到相應的簇中; 

          3) 計算每個簇中對象的平均值,更新簇的中心; 

          4) 重複步驟 2) 和 3),直至更新後的簇中心與原簇中心的差值在預定範圍內,或者達到預設的迭代次數; 

          5) 輸出K個簇中心。 

 

     K-means 方法的時間複雜度為O(NKT),N代表總元素個數,K代表簇中心個數,T代表迭代次數(若計入數據維度D,每次距離計算為O(D),總體為O(NKTD))。K-means算法是一種硬性劃分的聚類,即每個數據點唯一地分配給一個簇;由於事先並不知道實際的聚類情況,這種硬性劃分可能成為嚴重的局限。該算法對初始中心的選取非常敏感:初始中心隨機選取,導致結果波動較大,穩定性較差。同時該算法對噪聲數據和孤立點數據較為敏感。該算法通常採用歐氏距離作為數據樣本之間的度量方式,因此對球狀的簇有比較好的聚類效果,但很難發現其他形狀的簇。

#include <fstream>
#include <iomanip>
#include <iostream>
#include <time.h>
#include <stdlib.h>
using namespace std;


/**
 * Reads whitespace-separated values from a text file into a newly allocated
 * table with `dimension` values per row.
 *
 * The file is scanned twice: once to count the values that parse as DataType
 * and once to load them. Trailing values that do not fill a complete row are
 * dropped (integer division of the value count by `dimension`).
 *
 * @param file_path  Path of the text file to read.
 * @param data       Output: set to a `new[]`-allocated array of row pointers,
 *                   each row `new[]`-allocated with `dimension` elements. The
 *                   caller must `delete[]` every row and then the array.
 *                   Left untouched when -1 is returned.
 * @param dimension  Number of values per row; must be positive.
 * @return Number of complete rows read, or -1 when `dimension` is not
 *         positive, `file_path` is NULL, or the file cannot be opened.
 */
template <typename DataType>
int readData( const char* file_path , DataType** &data,int dimension)
{
    if(dimension <= 0 || file_path == NULL)
        return -1;

    std::ifstream infile(file_path);
    if(!infile.is_open())
        return -1;

    // Count successful extractions only. Testing eof() before each read
    // counts one phantom value after the last token, and never terminates
    // when the stream fails for a reason other than end-of-file.
    int value_count = 0;
    DataType datum;
    while(infile >> datum)
    {
        value_count++;
    }

    const int data_number = value_count / dimension;

    // Rewind for the second pass. The failed final extraction left failbit
    // set, and seekg is a no-op while failbit is set, so clear it first.
    infile.clear();
    infile.seekg(0, std::ios::beg);

    data = new DataType*[data_number];
    for( int i = 0; i < data_number; i++)
    {
        data[i] = new DataType[dimension];
        for( int j = 0; j < dimension; j++)
        {
            infile >> data[i][j];
        }
    }

    return data_number;
}

// Helpers defined further down in this file. kmeans calls them with built-in
// pointer types, for which argument-dependent lookup finds nothing, so they
// have to be declared before the template that uses them.
template <typename DataType>
void rand_init_centers(DataType** &data, int data_number,int dimension, DataType** &centers,int K);

template <typename DataType>
int select_center(DataType* &data_i, DataType** &centers, int K, int dimension);

/**
 * Clusters `data_number` points of `dimension` values each into K clusters.
 *
 * Each iteration assigns every point to its nearest center (select_center),
 * then replaces each non-empty cluster's center with the mean of its points.
 * A cluster that receives no points keeps its previous center. Iteration
 * stops as soon as either `iterations` passes have run or the summed squared
 * movement of all centers in a pass is no greater than `threshold`.
 *
 * One line "<iteration index>\t<summed squared movement>" is written to
 * standard output per pass.
 *
 * @param data         Input points, `data_number` rows of `dimension` values.
 * @param data_number  Number of points; must be positive.
 * @param dimension    Number of values per point; must be positive.
 * @param centers      Output: `new[]`-allocated array of K rows, each
 *                     `new[]`-allocated with `dimension` values. The caller
 *                     must `delete[]` every row and then the array.
 * @param K            Number of clusters; must be in [1, data_number].
 * @param labels       Output: `new[]`-allocated array of `data_number`
 *                     cluster indices; the caller must `delete[]` it.
 * @param iterations   Maximum number of passes.
 * @param threshold    Convergence bound on the summed squared center movement.
 *
 * When `data` is NULL or any size argument is out of range, the function
 * returns without touching `centers` or `labels`.
 */
template <typename DataType>
void kmeans(DataType** &data,int data_number,int dimension, DataType** &centers,int K, int* &labels,  int iterations, DataType threshold)
{
    if(data == NULL || data_number <= 0 || dimension <= 0 || K <= 0 || K > data_number)
        return;

    centers = new DataType*[K];
    labels = new int[data_number];
    DataType** sum = new DataType*[K];
    int* counts = new int[K];

    for(int i = 0; i < K; i++)
    {
        centers[i] = new DataType[dimension];
        sum[i] = new DataType[dimension];
    }

    rand_init_centers(data,data_number,dimension,centers,K);

    int iteration_time = 0;
    bool converged = false;
    // Stop on whichever comes first: the iteration budget or convergence.
    while( iteration_time < iterations && !converged)
    {
        // Reset the per-cluster accumulators for this pass.
        for(int i = 0; i < K; i++)
        {
            counts[i] = 0;
            for( int j = 0; j < dimension; j++)
            {
                sum[i][j] = 0;
            }
        }

        // Assignment step: label each point and accumulate it into its cluster.
        for(int i = 0; i < data_number; i++)
        {
            labels[i] = select_center(data[i],centers,K,dimension);
            counts[labels[i]]++;
            for( int j = 0; j < dimension; j++)
            {
                sum[labels[i]][j] += data[i][j];
            }
        }

        // Update step: move every non-empty cluster's center to its mean and
        // total up how far the centers moved (squared).
        DataType difference = 0;
        for( int i = 0; i < K; i++)
        {
            if(counts[i] <= 0)
                continue;

            for( int j = 0; j < dimension; j++)
            {
                sum[i][j] /= counts[i];
                DataType delta = sum[i][j] - centers[i][j];
                difference += delta*delta;
                centers[i][j] = sum[i][j];
            }
        }

        std::cout << iteration_time << '\t' << difference << std::endl;
        iteration_time++;
        converged = !(difference > threshold);
    }

    for( int i = 0; i < K; i++)
    {
        delete[] sum[i];
    }
    delete [] sum;
    delete [] counts;
}

/**
 * Writes the K cluster centers to a text file, one center per line.
 *
 * Every value is left-aligned in a field of width 10.
 *
 * @param file_path  Path of the file to create or overwrite.
 * @param centers    K rows of `dimension` values each.
 * @param K          Number of centers.
 * @param dimension  Number of values per center.
 *
 * Does nothing when `file_path` or `centers` is NULL. When the file cannot
 * be opened, a message is written to standard error and nothing is saved.
 */
template <typename DataType>
void save_centers(const char* file_path, DataType** centers,int K,int dimension)
{
    if(file_path == NULL || centers == NULL)
        return;

    std::ofstream outfile(file_path);
    if(!outfile.is_open())
    {
        std::cerr << "save_centers: cannot open " << file_path << std::endl;
        return;
    }

    outfile.setf(std::ios::left, std::ios::adjustfield);
    for(int i = 0; i < K; i++)
    {
        for(int j = 0; j < dimension; j++)
        {
            // width() applies to the next insertion only, so set it each time.
            outfile.width(10);
            outfile << centers[i][j];
        }
        outfile << '\n';
    }
}

/**
 * Writes every data point together with its cluster label to a text file,
 * one point per line: the label first, then the point's values.
 *
 * Every field is left-aligned in a width of 10.
 *
 * @param file_path    Path of the file to create or overwrite.
 * @param data         `data_number` rows of `dimension` values each.
 * @param labels       `data_number` cluster indices, one per row of `data`.
 * @param data_number  Number of points.
 * @param dimension    Number of values per point.
 *
 * Does nothing when `file_path`, `data` or `labels` is NULL. When the file
 * cannot be opened, a message is written to standard error and nothing is
 * saved.
 */
template <typename DataType>
void save_labels(const char* file_path, DataType** data, int* labels,int data_number,int dimension)
{
    if(file_path == NULL || data == NULL || labels == NULL)
        return;

    std::ofstream outfile(file_path);
    if(!outfile.is_open())
    {
        std::cerr << "save_labels: cannot open " << file_path << std::endl;
        return;
    }

    outfile.setf(std::ios::left, std::ios::adjustfield);
    for(int i = 0; i < data_number; i++)
    {
        // width() applies to the next insertion only, so set it each time.
        outfile.width(10);
        outfile << labels[i];
        for(int j = 0; j < dimension; j++)
        {
            outfile.width(10);
            outfile << data[i][j];
        }
        outfile << '\n';
    }
}


/**
 * Fills `centers` with K points picked from `data`.
 *
 * The data is split into K consecutive strata of `data_number / K` points
 * and one point is drawn at random from each stratum. When there are fewer
 * points than clusters (stratum size 0) the points are reused cyclically
 * instead of dividing by zero.
 *
 * The C random generator is re-seeded from the current time once per call.
 *
 * @param data         `data_number` rows of `dimension` values each.
 * @param data_number  Number of points.
 * @param dimension    Number of values per point.
 * @param centers      K pre-allocated rows of `dimension` values; overwritten.
 * @param K            Number of centers to pick.
 *
 * Does nothing when `data` or `centers` is NULL, or when `data_number` or
 * `K` is not positive.
 */
template <typename DataType>
void rand_init_centers(DataType** &data, int data_number,int dimension, DataType** &centers,int K)
{
    if(data == NULL || centers == NULL || data_number <= 0 || K <= 0)
        return;

    // Seed once, outside the loop: re-seeding with the same second on every
    // iteration makes rand() return the same offset for every cluster.
    srand(static_cast<unsigned int>(time(NULL)));

    const int step = data_number/K;
    for(int i = 0; i < K; i++)
    {
        int index;
        if(step > 0)
            index = i*step + rand()%step;
        else
            index = i % data_number;   // fewer points than clusters

        for(int j = 0; j < dimension; j++)
        {
            centers[i][j] = data[index][j];
        }
    }
}


// Defined below. Declared here because the call inside select_center uses
// built-in pointer types, for which argument-dependent lookup finds nothing.
template <typename DataType>
DataType calculate_dist(DataType* &data_i, DataType* &centers_i,int dimension);

/**
 * Returns the index of the center closest to a point.
 *
 * Closeness is whatever calculate_dist reports for (point, center); ties go
 * to the lowest index because a later center only wins when it is strictly
 * closer.
 *
 * @param data_i     The point, `dimension` values.
 * @param centers    K rows of `dimension` values each.
 * @param K          Number of centers.
 * @param dimension  Number of values per point.
 * @return Index in [0, K) of the nearest center, or -1 when `K` is not
 *         positive or `centers` is NULL.
 */
template <typename DataType>
int select_center(DataType* &data_i, DataType** &centers, int K, int dimension)
{
    // K == 0 must be rejected as well: centers[0] is read right below.
    if( K <= 0 || centers == NULL)
        return -1;

    int label = 0;
    DataType min_dist = calculate_dist(data_i, centers[0],dimension);
    for(int i = 1; i < K; i++)
    {
        const DataType dist = calculate_dist(data_i, centers[i],dimension);
        if(min_dist > dist)
        {
            min_dist = dist;
            label = i;
        }
    }

    return label;
}

/**
 * Squared Euclidean distance between a point and a center.
 *
 * @param data_i     The point, `dimension` values.
 * @param centers_i  The center, `dimension` values.
 * @param dimension  Number of values to compare.
 * @return Sum over all components of the squared difference, or -1 converted
 *         to DataType when either pointer is null.
 */
template <typename DataType>
DataType calculate_dist(DataType* &data_i, DataType* &centers_i,int dimension)
{
    if(!data_i || !centers_i)
        return static_cast<DataType>(-1);

    DataType squared_sum = 0;
    const DataType* point = data_i;
    const DataType* center = centers_i;
    // Walk both arrays in step, front to back.
    for(int remaining = dimension; remaining > 0; --remaining, ++point, ++center)
    {
        const DataType diff = *point - *center;
        squared_sum += diff * diff;
    }
    return squared_sum;
}
Kmeans.h
#include <iostream>
using namespace std;

#include "Kmeans.h"

/**
 * Demo driver: loads points from a text file, clusters them with kmeans and
 * writes the centers and the labelled points to two output files.
 *
 * The input/output paths, K, the iteration limit, the threshold and the
 * dimension are fixed below. The command-line arguments are not used.
 *
 * @return 0 on success, 1 when no data could be read or clustering produced
 *         no result.
 */
int main(int argc, char* argv[])
{
    float** data_source = NULL;
    float** clusters = NULL;
    int* labels = NULL;

    int K = 5;
    int iterations = 50;
    float threshold = 0.001f;
    int dimension = 1764;
    int exit_code = 0;

    int data_number = readData("D:/Users/Surge/Desktop/test.txt",data_source,dimension);
    if(data_number <= 0 || data_source == NULL)
    {
        std::cerr << "No data could be read from the input file." << std::endl;
        exit_code = 1;
    }
    else
    {
        kmeans(data_source,data_number,dimension,clusters,K,labels,iterations,threshold);

        // kmeans leaves both outputs NULL when it rejects its arguments, so
        // only save what was actually produced.
        if(clusters != NULL && labels != NULL)
        {
            save_centers("D:/Users/Surge/Desktop/test_centers.txt",clusters,K,dimension);
            save_labels("D:/Users/Surge/Desktop/test_labels.txt",data_source,labels,data_number,dimension);
        }
        else
        {
            std::cerr << "Clustering produced no result." << std::endl;
            exit_code = 1;
        }
    }

    // Check each outer array before indexing into it.
    if(data_source != NULL)
    {
        for(int i = 0; i < data_number; i++)
        {
            delete[] data_source[i];
        }
        delete[] data_source;
    }

    if(clusters != NULL)
    {
        for(int i = 0; i < K; i++)
        {
            delete[] clusters[i];
        }
        delete[] clusters;
    }

    delete[] labels;

    system("pause");
    return exit_code;
}
main.cpp

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM