from:https://www.cnblogs.com/wsine/p/5180778.html
運行環境
- Python3
- numpy(科學計算包)
- matplotlib(畫圖所需,不畫圖可不必)
計算過程
st=>start: 開始
e=>end: 結束
op1=>operation: 讀入數據
cond=>condition: 是否還有未分類數據
op2=>operation: 找一未分類點擴散
op3=>operation: 輸出結果

st->op1->op2->cond
cond(yes)->op2
cond(no)->op3->e
輸入樣例
/* 788points.txt */
15.55,28.65
14.9,27.55
14.45,28.35
14.15,28.8
13.75,28.05
13.35,28.45
13,29.15
13.45,27.5
13.6,26.5
12.8,27.35
12.4,27.85
12.3,28.4
12.2,28.65
13.4,25.1
12.95,25.95
788points.txt完整文件:下載
代碼實現
# -*- coding: utf-8 -*-
"""DBSCAN density-based clustering with an optional scatter plot.

Data sets are laid out with one point per column: ``data.shape[1]`` is the
number of points and ``data[:, i]`` is the i-th point.  Cluster labels are
``NOISE`` (0) for points that belong to no cluster and ``1..k`` for the
clusters that were found.
"""

__author__ = 'Wsine'

import math
import time
from collections import deque

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:
    # matplotlib is only needed by plotFeature(); clustering works without it.
    plt = None

# Label of a point that has not been visited yet.  It must not compare equal
# to NOISE: the former value ``False`` satisfied ``False == 0``, so noise
# points were indistinguishable from unvisited ones.
UNCLASSIFIED = -1
# Label of a point that belongs to no cluster.
NOISE = 0


def loadDataSet(fileName, splitChar='\t'):
    """Read a delimited text file of numeric rows.

    Args:
        fileName: Path of the file to read.
        splitChar: Separator between the fields of a line.

    Returns:
        A list holding one list of floats per non-blank line.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    dataSet = []
    with open(fileName) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no data and
                # would otherwise fail in float('').
                continue
            dataSet.append([float(field) for field in line.split(splitChar)])
    return dataSet


def dist(a, b):
    """Return the Euclidean distance between vectors ``a`` and ``b``."""
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    return math.sqrt(np.power(diff, 2).sum())


def eps_neighbor(a, b, eps):
    """Return whether ``a`` and ``b`` are strictly closer than ``eps``."""
    return dist(a, b) < eps


def region_query(data, pointId, eps):
    """Return the ids of all points strictly closer than ``eps`` to a point.

    Args:
        data: Data set with one point per column.
        pointId: Column index of the query point.
        eps: Neighbourhood radius.

    Returns:
        A list of column indices in ascending order.  The query point itself
        is included whenever ``eps`` is positive.
    """
    points = np.asarray(data, dtype=float)
    # Compare every column against the query column in one vectorised pass
    # instead of calling dist() once per point.
    diff = points - points[:, [pointId]]
    distances = np.sqrt(np.square(diff).sum(axis=0))
    return np.flatnonzero(distances < eps).tolist()


def expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts):
    """Try to grow a new cluster starting from one point.

    Args:
        data: Data set with one point per column.
        clusterResult: Mutable list of per-point labels, updated in place.
            Unvisited entries are expected to hold ``UNCLASSIFIED``.
        pointId: Column index of the starting point.
        clusterId: Label to give to the points of the new cluster.
        eps: Neighbourhood radius.
        minPts: Minimum neighbourhood size for a point to spread the cluster.

    Returns:
        ``True`` if a cluster was created, ``False`` if the starting point
        has fewer than ``minPts`` neighbours and was labelled ``NOISE``.
    """
    seeds = region_query(data, pointId, eps)
    if len(seeds) < minPts:
        # Too sparse to start a cluster; it may still be absorbed later as
        # a neighbour of a dense point.
        clusterResult[pointId] = NOISE
        return False

    clusterResult[pointId] = clusterId
    for seedId in seeds:
        clusterResult[seedId] = clusterId

    # A deque gives O(1) removal at the front; re-slicing a list on every
    # step copies the whole remaining queue.
    queue = deque(seeds)
    while queue:
        currentPoint = queue.popleft()
        queryResults = region_query(data, currentPoint, eps)
        if len(queryResults) < minPts:
            # Not dense enough to spread the cluster any further.
            continue
        for resultPoint in queryResults:
            label = clusterResult[resultPoint]
            if label == UNCLASSIFIED:
                # Newly reached point: label it and expand from it later.
                queue.append(resultPoint)
                clusterResult[resultPoint] = clusterId
            elif label == NOISE:
                # Already known to be sparse: it joins the cluster but is
                # not queued for expansion.
                clusterResult[resultPoint] = clusterId
    return True


def dbscan(data, eps, minPts):
    """Cluster a data set with DBSCAN.

    Args:
        data: Data set with one point per column (anything accepted by
            ``numpy.asarray``, including ``numpy.matrix``).
        eps: Neighbourhood radius.
        minPts: Minimum neighbourhood size for a point to spread a cluster.

    Returns:
        A ``(labels, clusterCount)`` tuple: ``labels`` is a list with one
        entry per point (``NOISE`` or a cluster id starting at 1) and
        ``clusterCount`` is the number of clusters found.
    """
    # Convert once so the repeated neighbourhood queries work on an ndarray.
    points = np.asarray(data, dtype=float)
    nPoints = points.shape[1]
    clusterResult = [UNCLASSIFIED] * nPoints
    clusterId = 1
    for pointId in range(nPoints):
        if clusterResult[pointId] != UNCLASSIFIED:
            continue
        if expand_cluster(points, clusterResult, pointId, clusterId, eps,
                          minPts):
            clusterId += 1
    return clusterResult, clusterId - 1


def plotFeature(data, clusters, clusterNum):
    """Scatter-plot the first two coordinates of every point by cluster.

    Args:
        data: Data set with one point per column; rows 0 and 1 are used as
            the x and y coordinates.
        clusters: Per-point labels as returned by ``dbscan``.
        clusterNum: Number of clusters; labels ``0..clusterNum`` are drawn.

    Raises:
        ImportError: If matplotlib is not installed.
    """
    if plt is None:
        raise ImportError('matplotlib is required for plotting')
    points = np.asarray(data, dtype=float)
    labels = np.asarray(clusters).ravel()
    scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple',
                     'orange', 'brown']
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for label in range(clusterNum + 1):
        color = scatterColors[label % len(scatterColors)]
        # A boolean mask selects exactly the columns carrying this label.
        members = labels == label
        ax.scatter(points[0, members], points[1, members], c=color, s=50)


def main(fileName='788points.txt', eps=2, minPts=15):
    """Load a comma-separated point file, cluster it and plot the result.

    Args:
        fileName: Path of the input file, one point per line.
        eps: Neighbourhood radius passed to ``dbscan``.
        minPts: Minimum neighbourhood size passed to ``dbscan``.
    """
    # The file stores one point per row; the clustering code expects one
    # point per column.
    dataSet = np.array(loadDataSet(fileName, splitChar=','), dtype=float).T
    clusters, clusterNum = dbscan(dataSet, eps, minPts)
    print("cluster Numbers = ", clusterNum)
    plotFeature(dataSet, clusters, clusterNum)


if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()
    main()
    end = time.perf_counter()
    print('finish all in %s' % str(end - start))
    plt.show()
輸出樣例
cluster Numbers = 7
finish all in 32.712135628590794