聚類算法效果評估：entropy、purity、NMI

本文轉載自查看原文 2012-02-21 13:03 5828 purity/ 聚類評價 entropy/ nmi

1. 數據管理腳本：原始文件每行的格式為 id\tclusterId\tgoldstandardId（三個整數字段，以制表符分隔）

DataManagement.py

#!/usr/bin/python
"""Split a clustering result file into two pickled group maps.

Every useful input line holds three tab-separated integer fields::

    id<TAB>clusterId<TAB>goldstandardId

Two dictionaries are built and pickled: ``{clusterId: [id, ...]}`` and
``{goldstandardId: [id, ...]}``.

Usage: DataManagement.py SOURCE CLUSTERS_OUT GOLDSTANDARDS_OUT
"""
import re
import sys

try:  # Python 2
    import cPickle as p
except ImportError:  # Python 3
    import pickle as p

# Leading/trailing whitespace of a line, including the line terminator.
_EDGE_WHITESPACE = re.compile(r'^\s+|\s+$')


def _read_groups(lines):
    """Group document ids by cluster id and by gold-standard id.

    Lines that do not split into exactly three tab-separated fields are
    skipped.  A three-field line with a non-integer field raises
    ``ValueError`` (from ``int``).

    Returns a ``(clusters, goldstandards)`` pair of ``{group id: [doc ids]}``
    dictionaries; ids keep their input order inside each list.
    """
    clusters = {}
    goldstandards = {}
    for line in lines:
        fields = _EDGE_WHITESPACE.sub('', line).split('\t')
        if len(fields) != 3:
            continue
        doc_id, cluster_id, gold_id = [int(field) for field in fields]
        clusters.setdefault(cluster_id, []).append(doc_id)
        goldstandards.setdefault(gold_id, []).append(doc_id)
    return clusters, goldstandards


def build_group_pickles(source_path, clusters_path, goldstandards_path):
    """Read ``source_path`` and pickle its two group maps.

    The cluster map is written to ``clusters_path`` and the gold-standard map
    to ``goldstandards_path``.  Both maps are also returned as a
    ``(clusters, goldstandards)`` tuple.
    """
    with open(source_path, 'r') as source:
        clusters, goldstandards = _read_groups(source)
    # Binary mode is required by pickle on Python 3 and is harmless on
    # Python 2; the context managers close the files even if dumping fails.
    with open(clusters_path, 'wb') as out:
        p.dump(clusters, out)
    with open(goldstandards_path, 'wb') as out:
        p.dump(goldstandards, out)
    return clusters, goldstandards


if __name__ == "__main__":
    build_group_pickles(sys.argv[1], sys.argv[2], sys.argv[3])
    print(' %s has finished! ' % sys.argv[0])

EvaluationClusterAlgorithm.py

# !/usr/bin/python
# -*- coding:cp936 -*-
import re;
import cPickle as mypickle;
import sys;
import math;
class Evaluation(object):
    """Score a clustering against gold-standard labels.

    Both inputs are pickled dictionaries mapping a group id to the list of
    document ids in that group.  The class offers purity, entropy, mutual
    information and normalized mutual information (NMI).

    Attributes set by ``__init__``:
        clusters / goldstandards: the two loaded ``{group id: [doc ids]}`` maps.
        k / q: number of clusters / number of gold-standard groups.
        minclusterId / maxclusterId: smallest / largest cluster id.
        mingoldstandardId / maxgoldstandardId: smallest / largest gold id.
        coocurrence: ``{(clusterId, goldId): shared document count}``; filled
            by ``GenerateCoocurrence`` (the metrics fill it on demand).
        N: total number of documents.
    """

    def __init__(self, clusterfid, goldstandardfid):
        """Load both pickles and derive the summary attributes.

        Args:
            clusterfid: path of the pickled clustering result.
            goldstandardfid: path of the pickled gold-standard grouping.

        Raises:
            ValueError: if either map is empty, if there are no documents, or
                if the two maps hold a different number of documents.
        """
        self.clusters = self._LoadGroups(clusterfid)  # cluster algorithm results
        self.goldstandards = self._LoadGroups(goldstandardfid)  # gold-standard answers
        cluster_ids = sorted(self.clusters)
        gold_ids = sorted(self.goldstandards)
        if not cluster_ids or not gold_ids:
            raise ValueError('clusters and gold standards must both be non-empty')
        self.k = len(cluster_ids)
        self.q = len(gold_ids)
        self.minclusterId = cluster_ids[0]  # smallest cluster id
        self.maxclusterId = cluster_ids[-1]  # largest cluster id
        self.mingoldstandardId = gold_ids[0]
        self.maxgoldstandardId = gold_ids[-1]
        # (clusterId, goldstandardId) -> number of documents shared by both
        self.coocurrence = {}
        N1 = sum(len(members) for members in self.clusters.values())
        N2 = sum(len(members) for members in self.goldstandards.values())
        if N1 != N2:
            # Previously this was only printed and self.N was left unset,
            # which made every later metric fail with AttributeError.
            raise ValueError(
                ' there is a error N1=%d,N2=%d,please reexamine the data source '
                % (N1, N2))
        if N1 == 0:
            raise ValueError('the data source contains no documents')
        self.N = N1  # num of documents

    @staticmethod
    def _LoadGroups(path):
        """Unpickle one ``{group id: [doc ids]}`` map and close the file."""
        # NOTE(review): unpickling can execute code embedded in the file;
        # only load pickles from a trusted source.
        with open(path, 'rb') as fid:
            return mypickle.load(fid)

    def _EnsureCoocurrence(self):
        """Build the co-occurrence table if nothing has filled it yet."""
        if not self.coocurrence:
            self.GenerateCoocurrence()

    def GenerateCoocurrence(self):
        """Count shared documents for every (cluster, gold group) pair.

        Entries already present in ``self.coocurrence`` are left untouched.
        """
        # Build each gold-standard set once instead of once per cluster.
        gold_sets = dict(
            (key_gold, set(members))
            for key_gold, members in self.goldstandards.items())
        for key_cluster, members in self.clusters.items():
            cluster_set = set(members)
            for key_gold, gold_set in gold_sets.items():
                self.coocurrence.setdefault(
                    (key_cluster, key_gold), len(cluster_set & gold_set))

    def CalPurityForPerCluster(self, clusterId):
        """Return the share of ``clusterId`` taken by its dominant gold group.

        An empty cluster yields 0.0.
        """
        self._EnsureCoocurrence()
        size = len(self.clusters[clusterId])
        if size == 0:
            return 0.0
        # Iterate the real gold ids: they need not form a contiguous range.
        dominant = max(
            self.coocurrence[(clusterId, goldId)]
            for goldId in self.goldstandards)
        return float(dominant) / float(size)

    def CalPurity(self):
        """Return the overall purity: per-cluster purity weighted by size."""
        result = 0.0
        for clusterId in sorted(self.clusters):
            purityPer = self.CalPurityForPerCluster(clusterId)
            weight = float(len(self.clusters[clusterId]))
            result = result + weight * purityPer / float(self.N)
        return result

    def CalEntropyFormula(self, seq):
        """Return the base-2 entropy of the probabilities in ``seq``.

        Non-positive entries are skipped (they contribute nothing).
        """
        result = 0.0
        for elemP in seq:
            if elemP > 0:
                result = result + elemP * math.log(elemP, 2)
        return -result

    def CalEntropyForPerCluster(self, clusterId):
        """Return the entropy of the gold-group distribution in ``clusterId``.

        An empty cluster yields 0.
        """
        self._EnsureCoocurrence()
        size = len(self.clusters[clusterId])
        if size == 0:
            return 0
        seq = [
            float(self.coocurrence[(clusterId, goldId)]) / float(size)
            for goldId in sorted(self.goldstandards)]
        return self.CalEntropyFormula(seq)

    def CalEntropy(self):
        """Return the overall entropy: per-cluster entropy weighted by size."""
        result = 0
        for clusterId in sorted(self.clusters):
            entropyPer = self.CalEntropyForPerCluster(clusterId)
            weight = float(len(self.clusters[clusterId]))
            result = result + weight * entropyPer / float(self.N)
        return result

    def CalMutualInformation(self):
        """Return the base-2 mutual information of the two groupings."""
        self._EnsureCoocurrence()
        result = 0.0
        for clusterId in sorted(self.clusters):
            N_c = len(self.clusters[clusterId])
            for goldId in sorted(self.goldstandards):
                N_cg = self.coocurrence[(clusterId, goldId)]
                if N_cg <= 0:
                    # A pair sharing no document contributes nothing; skipping
                    # it first also avoids dividing by an empty group's size.
                    continue
                N_g = len(self.goldstandards[goldId])
                part = float(self.N) * float(N_cg) / (N_c * N_g)
                result = result + (float(N_cg) / float(self.N)) * math.log(part, 2)
        return result

    def CalNMI(self):
        """Return the NMI, ``2 * I / (H(clusters) + H(goldstandards))``.

        When both groupings consist of a single group both entropies are 0;
        the two partitions then agree perfectly and 1.0 is returned instead
        of dividing by zero.
        """
        # entropy of the automated clusters
        seq1 = [
            float(len(self.clusters[clusterId])) / float(self.N)
            for clusterId in sorted(self.clusters)]
        # entropy of the gold-standard groups
        seq2 = [
            float(len(self.goldstandards[goldId])) / float(self.N)
            for goldId in sorted(self.goldstandards)]
        H1 = self.CalEntropyFormula(seq1)
        H2 = self.CalEntropyFormula(seq2)
        if H1 + H2 <= 0:
            return 1.0
        IG = self.CalMutualInformation()
        return 2 * IG / (H1 + H2)



# Usage: EvaluationClusterAlgorithm.py CLUSTERS_PICKLE GOLDSTANDARDS_PICKLE
# Prints summary statistics of both groupings, then purity, entropy and NMI.
if __name__ == "__main__":
    clusterAddress = str(sys.argv[1])
    goldAddress = str(sys.argv[2])
    e = Evaluation(clusterAddress, goldAddress)
    # number of clusters produced by the clustering algorithm
    print(' 聚類算法產生簇個數%d ' % e.k)
    # number of groups in the hand-labelled gold standard
    print(' 人工標注的標准答案中簇個數%d ' % e.q)
    # total number of documents
    print(' 文檔總數%d ' % e.N)
    # smallest / largest cluster id
    print(' 最小聚類ID標號%d ' % e.minclusterId)
    print(' 最大聚類ID標號%d ' % e.maxclusterId)
    # smallest / largest gold-standard id
    print(' 標准答案中最小聚類ID標號%d ' % e.mingoldstandardId)
    print(' 標准答案中最大聚類ID標號%d ' % e.maxgoldstandardId)
    # The metrics below read the co-occurrence table, so build it first.
    e.GenerateCoocurrence()
    purity = e.CalPurity()
    print(' 純度為%f ' % purity)
    entropy = e.CalEntropy()
    print(' 熵為%f ' % entropy)
    nmi = e.CalNMI()
    print(' 歸一化互信息為%f ' % nmi)

代碼調用示意圖

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 聚類ACC，NMI兩大評估指標的計算方式：匈牙利算法聚類算法性能評估聚類算法及其評估指標聚類算法的評估數學建模及機器學習算法（一）：聚類-kmeans（Python及MATLAB實現，包括k值選取與聚類效果評估）聚類評估指標系列(一)：標准化互信息NMI計算步驟及其Python實現機器學習——聚類算法的評估指標聚類算法4-模型評估（SSE、“肘”部法、SC系數和CH系數）用Sklearn實現聚類算法並用散點圖展現效果聚類算法