最近由於要實現‘基於網格的DBSCAN算法’,網上又沒有找到現成的代碼[如果您有代碼,麻煩聯系我],只好參考已有的DBSCAN算法的實現。先從網上隨便找了幾篇放這兒,之后對比研究。
DBSCAN簡介:
1.簡介
DBSCAN 算法是一種基於密度的空間聚類算法。該算法利用基於密度的聚類的概念,即要求聚類空間中的一定區域內所包含對象(點或其它空間對象)的數目不小於某一給定閾值。DBSCAN 算法的顯著優點是聚類速度快且能夠有效處理噪聲點和發現任意形狀的空間聚類。但是由於它直接對整個數據庫進行操作且進行聚類時使用了一個全局性的表征密度的參數,因此也具有兩個比較明顯的弱點:
1. 當數據量增大時,要求較大的內存支持,I/O 消耗也很大;
2. 當空間聚類的密度不均勻、聚類間距離相差很大時,聚類質量較差。
2.DBSCAN算法的聚類過程
DBSCAN算法基於一個事實:一個聚類可以由其中的任何核心對象唯一確定。等價可以表述為: 任一滿足核心對象條件的數據對象p,數據庫D中所有從p密度可達的數據對象o 所組成的集合構成了一個完整的聚類C,且p屬於C。
3.DBSCAN中的幾個定義
密度可達是直接密度可達的傳遞閉包,非對稱性關系;密度相連是對稱性關系。DBSCAN目的是找到密度相連對象的最大集合。
E鄰域:給定對象p半徑為E內的區域稱為該對象的E鄰域;
核心對象:p的E鄰域內樣本數不小於MinPts(算法輸入值),則該對象p為核心對象;
直接密度可達:對於樣本集合D,如果樣本點q在p的E鄰域內,且p為核心對象,則p直接密度可達q;
密度可達:對於樣本集合D,存在一串樣本點p1,p2,p3,...pn,其中連續兩個點直接密度可達,且 p=p1,q=pn,則p密度可達q;
密度相連:對於樣本集合D,如果存在一點o,使得o密度可達p,並且o密度可達q,那么p與q密度相連;
算法偽代碼

// Top-level driver: walks every point once and tries to grow a cluster from
// each point that is still UNCLASSIFIED. A new cluster id is consumed only
// when ExpandCluster actually produced a cluster.
DBSCAN(SetOfPoints, Eps, MinPts){
    ClusterId = nextId(NOISE)
    for(i = 0; i < SetOfPoints.size(); i++){
        candidate = SetOfPoints.get(i)
        if(candidate.ClId != UNCLASSIFIED){
            continue
        }
        if(ExpandCluster(SetOfPoints, candidate, ClusterId, Eps, MinPts)){
            ClusterId = nextId(ClusterId)
        }
    }
}

// Tries to grow cluster ClId starting at Point.
// Returns False (and marks Point as NOISE) when Point is not a core point,
// True once every point density-reachable from Point carries ClId.
ExpandCluster(SetOfPoints, Point, ClId, Eps, MinPts){
    seeds = SetOfPoints.regionQuery(Point, Eps)

    // Not a core point: provisionally noise (may later become a border point).
    if(seeds.size() < MinPts){
        SetOfPoints.changeClId(Point, NOISE)
        return False
    }

    // Core point: its whole Eps-neighbourhood joins the cluster.
    SetOfPoints.changeClIds(seeds, ClId)
    seeds.delete(Point)

    while(seeds.size() > 0){
        currentP = seeds.first()
        neighbours = SetOfPoints.regionQuery(currentP, Eps)
        if(neighbours.size() >= MinPts){
            for(i = 0; i < neighbours.size(); i++){
                neighbour = neighbours.get(i)
                if(neighbour.ClId == UNCLASSIFIED){
                    // Never examined before: claim it and expand from it later.
                    seeds.append(neighbour)
                    SetOfPoints.changeClId(neighbour, ClId)
                }else if(neighbour.ClId == NOISE){
                    // Known non-core point: becomes a border point, not expanded.
                    SetOfPoints.changeClId(neighbour, ClId)
                }
            }
        }
        seeds.delete(currentP)
    }
    return True
}
JAVA實現:

package orisun;

import java.io.File;
import java.util.ArrayList;
import java.util.Vector;
import java.util.Iterator;

/**
 * DBSCAN clustering over a list of {@code DataObject}s.
 *
 * <p>Cluster ids stored on each object follow this convention: {@code 0} means
 * "not classified yet", a positive value is the id of the cluster the object
 * belongs to, and {@code -1} marks the object as noise.
 */
public class DBScan {

    /** Neighbourhood radius. */
    double Eps = 3;
    /** Minimum number of neighbours (the point itself included) for a core point. */
    int MinPts = 4;

    /**
     * Collects every object whose distance to {@code p} is at most {@link #Eps}.
     *
     * <p>The distance from an object to itself is 0, so {@code p} is always
     * part of its own neighbourhood when it is contained in {@code objects}.
     * The edit distance from {@code Global} is used; the original listing also
     * mentions {@code calEuraDist}, {@code calCityBlockDist} and
     * {@code calSinDist} as drop-in alternatives.
     *
     * @param p       the object whose neighbourhood is queried
     * @param objects all objects of the data set
     * @return the neighbours of {@code p}, in the iteration order of {@code objects}
     */
    public Vector<DataObject> getNeighbors(DataObject p, ArrayList<DataObject> objects) {
        Vector<DataObject> neighbors = new Vector<DataObject>();
        // The query vector does not change while scanning, so read it once.
        double[] origin = p.getVector();
        int len = origin.length;
        for (DataObject q : objects) {
            if (Global.calEditDist(origin, q.getVector(), len) <= Eps) {
                neighbors.add(q);
            }
        }
        return neighbors;
    }

    /**
     * Runs DBSCAN on {@code objects}, writing the resulting cluster id into
     * each object.
     *
     * <p>A single pass is sufficient: every object is either skipped because
     * it is already visited or marked visited right here. The earlier
     * flag-driven outer loop never terminated for an empty list or for a list
     * whose objects had all been visited before (for example on a second call).
     *
     * @param objects the data set; objects are modified in place
     * @return the number of clusters created by this call
     */
    public int dbscan(ArrayList<DataObject> objects) {
        int clusterID = 0;
        for (DataObject p : objects) {
            if (p.isVisited()) {
                continue;
            }
            // Once visited, the role of p (core or not) is decided below.
            p.setVisited(true);
            Vector<DataObject> neighbors = getNeighbors(p, objects);
            if (neighbors.size() < MinPts) {
                // Not a core point. Keep an existing cluster membership (border
                // point claimed earlier); otherwise flag as noise.
                if (p.getCid() <= 0) {
                    p.setCid(-1);
                }
            } else if (p.getCid() <= 0) {
                // Unclaimed core point: it seeds a brand-new cluster.
                clusterID++;
                expandCluster(p, neighbors, clusterID, objects);
            } else {
                // Core point already claimed by a cluster: keep growing that one.
                expandCluster(p, neighbors, p.getCid(), objects);
            }
        }
        return clusterID;
    }

    /**
     * Assigns {@code p} and its neighbourhood to {@code clusterID}.
     *
     * <p>Each not-yet-visited neighbour is visited here; if it turns out to be
     * a core point, the unclaimed members of its own neighbourhood are claimed
     * too. Those members are not marked visited, so {@link #dbscan} reaches
     * them later and continues the expansion from there.
     *
     * @param p         the core point being expanded
     * @param neighbors the neighbourhood of {@code p}
     * @param clusterID the id of the cluster being grown
     * @param objects   all objects of the data set
     */
    private void expandCluster(DataObject p, Vector<DataObject> neighbors,
            int clusterID, ArrayList<DataObject> objects) {
        p.setCid(clusterID);
        for (DataObject q : neighbors) {
            if (!q.isVisited()) {
                q.setVisited(true);
                Vector<DataObject> qneighbors = getNeighbors(q, objects);
                if (qneighbors.size() >= MinPts) {
                    for (DataObject member : qneighbors) {
                        if (member.getCid() <= 0) {
                            member.setCid(clusterID);
                        }
                    }
                }
            }
            // q is not a member of any cluster yet (unclassified or noise).
            if (q.getCid() <= 0) {
                q.setCid(clusterID);
            }
        }
    }

    /**
     * Clusters the sample data set and prints the result.
     *
     * <p>The defaults Eps=3, MinPts=4 are meant for {@code dot.mat}. The
     * original listing notes Eps=2.5, MinPts=4 for the alternative data set
     * {@code /home/orisun/text.normalized.mat} with labels
     * {@code /home/orisun/text.rlabel}.
     */
    public static void main(String[] args) {
        DataSource datasource = new DataSource();
        datasource.readMatrix(new File("/home/orisun/test/dot.mat"));
        datasource.readRLabel(new File("/home/orisun/test/dot.rlabel"));
        DBScan ds = new DBScan();
        int clunum = ds.dbscan(datasource.objects);
        datasource.printResult(datasource.objects, clunum);
    }
}
C++實現:
數據結構

// Include guard: without it, including this header twice in one translation
// unit redefines DIME_NUM and DataPoint.
#ifndef DATAPOINT_H
#define DATAPOINT_H

#include <vector>

// NOTE(review): a using-directive in a header leaks into every includer; it is
// kept because the member definitions rely on the unqualified name `vector`.
using namespace std;

const int DIME_NUM=2;        // number of dimensions per data point (global constant)

// A single data point together with its clustering state.
class DataPoint
{
private:
    unsigned long dpID;                     // data point id
    double dimension[DIME_NUM];             // coordinate in each dimension
    long clusterId;                         // id of the cluster this point belongs to
    bool isKey;                             // whether this point is a core object
    bool visited;                           // whether this point has been visited
    vector<unsigned long> arrivalPoints;    // ids of the points in this point's neighbourhood
public:
    DataPoint();                                                    // default constructor
    DataPoint(unsigned long dpID,double* dimension , bool isKey);   // constructor; copies DIME_NUM values from dimension

    unsigned long GetDpId();                // returns the data point id
    void SetDpId(unsigned long dpID);       // sets the data point id
    double* GetDimension();                 // returns a pointer to the internal coordinate array
    void SetDimension(double* dimension);   // copies DIME_NUM values from dimension
    bool IsKey();                           // returns the core-object flag
    void SetKey(bool isKey);                // sets the core-object flag
    bool isVisited();                       // returns the visited flag
    void SetVisited(bool visited);          // sets the visited flag
    long GetClusterId();                    // returns the cluster id
    void SetClusterId(long classId);        // sets the cluster id
    vector<unsigned long>& GetArrivalPoints();    // returns the mutable neighbourhood id list
};

#endif // DATAPOINT_H
實現

1 #include "DataPoint.h" 2 3 //默認構造函數 4 DataPoint::DataPoint() 5 { 6 } 7 8 //構造函數 9 DataPoint::DataPoint(unsigned long dpID,double* dimension , bool isKey):isKey(isKey),dpID(dpID) 10 { 11 //傳遞每維的維度數據 12 for(int i=0; i<DIME_NUM;i++) 13 { 14 this->dimension[i]=dimension[i]; 15 } 16 } 17 18 //設置維度數據 19 void DataPoint::SetDimension(double* dimension) 20 { 21 for(int i=0; i<DIME_NUM;i++) 22 { 23 this->dimension[i]=dimension[i]; 24 } 25 } 26 27 //獲取維度數據 28 double* DataPoint::GetDimension() 29 { 30 return this->dimension; 31 } 32 33 //獲取是否為核心對象 34 bool DataPoint::IsKey() 35 { 36 return this->isKey; 37 } 38 39 //設置核心對象標志 40 void DataPoint::SetKey(bool isKey) 41 { 42 this->isKey = isKey; 43 } 44 45 //獲取DpId方法 46 unsigned long DataPoint::GetDpId() 47 { 48 return this->dpID; 49 } 50 51 //設置DpId方法 52 void DataPoint::SetDpId(unsigned long dpID) 53 { 54 this->dpID = dpID; 55 } 56 57 //GetIsVisited方法 58 bool DataPoint::isVisited() 59 { 60 return this->visited; 61 } 62 63 64 //SetIsVisited方法 65 void DataPoint::SetVisited( bool visited ) 66 { 67 this->visited = visited; 68 } 69 70 //GetClusterId方法 71 long DataPoint::GetClusterId() 72 { 73 return this->clusterId; 74 } 75 76 //GetClusterId方法 77 void DataPoint::SetClusterId( long clusterId ) 78 { 79 this->clusterId = clusterId; 80 } 81 82 //GetArrivalPoints方法 83 vector<unsigned long>& DataPoint::GetArrivalPoints() 84 { 85 return arrivalPoints; 86 }
PYTHON實現:

1 from matplotlib.pyplot import * 2 from collections import defaultdict 3 import random 4 5 #function to calculate distance 6 def dist(p1, p2): 7 return ((p1[0]-p2[0])**2+ (p1[1]-p2[1])**2)**(0.5) 8 9 #randomly generate around 100 cartesian coordinates 10 all_points=[] 11 12 for i in range(100): 13 randCoord = [random.randint(1,50), random.randint(1,50)] 14 if not randCoord in all_points: 15 all_points.append(randCoord) 16 17 18 #take radius = 8 and min. points = 8 19 E = 8 20 minPts = 8 21 22 #find out the core points 23 other_points =[] 24 core_points=[] 25 plotted_points=[] 26 for point in all_points: 27 point.append(0) # assign initial level 0 28 total = 0 29 for otherPoint in all_points: 30 distance = dist(otherPoint,point) 31 if distance<=E: 32 total+=1 33 34 if total > minPts: 35 core_points.append(point) 36 plotted_points.append(point) 37 else: 38 other_points.append(point) 39 40 #find border points 41 border_points=[] 42 for core in core_points: 43 for other in other_points: 44 if dist(core,other)<=E: 45 border_points.append(other) 46 plotted_points.append(other) 47 48 49 #implement the algorithm 50 cluster_label=0 51 52 for point in core_points: 53 if point[2]==0: 54 cluster_label+=1 55 point[2]=cluster_label 56 57 for point2 in plotted_points: 58 distance = dist(point2,point) 59 if point2[2] ==0 and distance<=E: 60 print point, point2 61 point2[2] =point[2] 62 63 64 #after the points are asssigned correnponding labels, we group them 65 cluster_list = defaultdict(lambda: [[],[]]) 66 for point in plotted_points: 67 cluster_list[point[2]][0].append(point[0]) 68 cluster_list[point[2]][1].append(point[1]) 69 70 markers = ['+','*','.','d','^','v','>','<','p'] 71 72 #plotting the clusters 73 i=0 74 print cluster_list 75 for value in cluster_list: 76 cluster= cluster_list[value] 77 plot(cluster[0], cluster[1],markers[i]) 78 i = i%10+1 79 80 #plot the noise points as well 81 noise_points=[] 82 for point in all_points: 83 if not point in core_points and not point 
in border_points: 84 noise_points.append(point) 85 noisex=[] 86 noisey=[] 87 for point in noise_points: 88 noisex.append(point[0]) 89 noisey.append(point[1]) 90 plot(noisex, noisey, "x") 91 92 title(str(len(cluster_list))+" clusters created with E ="+str(E)+" Min Points="+str(minPts)+" total points="+str(len(all_points))+" noise Points = "+ str(len(noise_points))) 93 axis((0,60,0,60)) 94 show()
參考:http://www.cnblogs.com/zhangchaoyang/articles/2182748.html
http://www.cnblogs.com/lovell-liu/archive/2011/11/08/2241542.html
http://blog.sudipk.com.np/2013/02/implementation-of-dbscan-algorithm-for.html
http://caoyaqiang.diandian.com/post/2012-09-26/40039517485