Python實現DBScan

運行環境

Python3
numpy(科學計算包)
matplotlib(畫圖所需，不畫圖可不必)

計算過程

st=>start: 開始
e=>end: 結束
op1=>operation: 讀入數據
cond=>condition: 是否還有未分類數據
op2=>operation: 找一未分類點擴散
op3=>operation: 輸出結果

st->op1->op2->cond
cond(yes)->op2
cond(no)->op3->e

輸入樣例

/* 788points.txt */
15.55,28.65
14.9,27.55
14.45,28.35
14.15,28.8
13.75,28.05
13.35,28.45
13,29.15
13.45,27.5
13.6,26.5
12.8,27.35
12.4,27.85
12.3,28.4
12.2,28.65
13.4,25.1
12.95,25.95

788points.txt完整文件：下載

代碼實現

# -*- coding: utf-8 -*-
__author__ = 'Wsine'

import numpy as np
import matplotlib.pyplot as plt
import math
import time

# Labels stored in the per-point result list.
# UNCLASSIFIED must never compare equal to NOISE. A value of False would,
# because `False == 0` is True in Python, which makes every
# `== UNCLASSIFIED` check match NOISE points as well.
UNCLASSIFIED = None  # point has not been assigned any label yet
NOISE = 0  # point was visited and found not to be a core point

def loadDataSet(fileName, splitChar='\t'):
	"""
	Read a delimited text file of numbers into a list of rows.

	Input: fileName - path of the file to read
	       splitChar - field separator used on every line (default: tab)
	Output: list of rows, each row being a list of floats
	Blank lines (for example a trailing empty line at the end of the file)
	are skipped instead of raising ValueError from float('').
	"""
	dataSet = []
	with open(fileName) as fr:
		for line in fr:  # iterate lazily instead of materialising readlines()
			stripped = line.strip()
			if not stripped:
				continue
			dataSet.append([float(field) for field in stripped.split(splitChar)])
	return dataSet

def dist(a, b):
	"""
	Euclidean distance between two vectors.

	Input: a, b - vectors of equal shape (e.g. NumPy arrays or matrix columns)
	Output: square root of the summed squared differences, as a float
	"""
	squaredDiff = np.power(a - b, 2)
	total = squaredDiff.sum()
	return math.sqrt(total)

def eps_neighbor(a, b, eps):
	"""
	Tell whether two vectors lie within eps of each other.

	Input: a, b - vectors accepted by dist(); eps - neighbourhood radius
	Output: True when dist(a, b) is strictly smaller than eps
	"""
	separation = dist(a, b)
	return separation < eps

def region_query(data, pointId, eps):
	"""
	Find every point lying in the eps-neighbourhood of one point.

	Input: data - 2-D array or matrix holding one point per column
	       pointId - column index of the query point
	       eps - neighbourhood radius
	Output: list of column indices (plain ints, ascending) whose Euclidean
	        distance to the query point is strictly less than eps; the
	        query point itself is included whenever eps > 0
	"""
	# One vectorised pass over all columns replaces a Python-level loop that
	# sliced the matrix and called a distance helper once per point.
	points = np.asarray(data, dtype=float)
	offsets = points - points[:, [pointId]]  # (dims, 1) broadcast over columns
	distances = np.sqrt(np.square(offsets).sum(axis=0))
	return np.flatnonzero(distances < eps).tolist()

def expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts):
	"""
	Try to grow a new cluster starting from one point.

	Input: data - data set, one point per column
	       clusterResult - per-point label list, updated in place
	       pointId - index of the point to start from
	       clusterId - label given to the new cluster
	       eps - neighbourhood radius
	       minPts - minimum neighbourhood size for a core point
	Output: True if a cluster was created, False if pointId was marked NOISE
	"""
	seeds = region_query(data, pointId, eps)
	if len(seeds) < minPts:  # too few neighbours: mark as noise
		clusterResult[pointId] = NOISE
		return False

	# pointId is a core point: it and its whole neighbourhood join the cluster.
	clusterResult[pointId] = clusterId
	for seedId in seeds:
		clusterResult[seedId] = clusterId

	# Keep expanding. `seeds` grows while it is being walked, so advance an
	# index cursor instead of re-slicing the list (seeds[1:]), which would
	# copy the whole queue on every step.
	cursor = 0
	while cursor < len(seeds):
		currentPoint = seeds[cursor]
		cursor += 1
		queryResults = region_query(data, currentPoint, eps)
		if len(queryResults) < minPts:
			continue  # not a core point: stays in the cluster, does not spread it
		for resultPoint in queryResults:
			label = clusterResult[resultPoint]
			if label == UNCLASSIFIED:
				# Newly reached point: label it and queue it for expansion.
				seeds.append(resultPoint)
				clusterResult[resultPoint] = clusterId
			elif label == NOISE:
				# Former noise point: it joins the cluster but is not queued.
				clusterResult[resultPoint] = clusterId
	return True

def dbscan(data, eps, minPts):
	"""
	Label every point of the data set with a cluster id.

	Input: data - data set, one point per column
	       eps - neighbourhood radius
	       minPts - minimum neighbourhood size for a core point
	Output: (clusterResult, clusterCount) where clusterResult[i] is the label
	        of point i (NOISE or a cluster id starting at 1) and
	        clusterCount is the number of clusters created
	"""
	nPoints = data.shape[1]
	clusterResult = [UNCLASSIFIED] * nPoints
	clusterId = 1  # next cluster label to hand out
	for pointId in range(nPoints):
		if clusterResult[pointId] != UNCLASSIFIED:
			continue  # already labelled while expanding an earlier cluster
		if expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts):
			clusterId += 1
	return clusterResult, clusterId - 1

def plotFeature(data, clusters, clusterNum):
	"""
	Scatter-plot the first two coordinates of every point, coloured by label.

	Input: data - data set, one point per column (rows 0 and 1 are plotted)
	       clusters - per-point labels; labels 0..clusterNum are drawn
	       clusterNum - number of clusters
	Output: none; draws onto a new matplotlib figure (showing it is left to
	        the caller)
	"""
	points = np.asarray(data)
	labels = np.asarray(clusters)
	scatterColors = ['black', 'blue', 'green', 'yellow', 'red', 'purple', 'orange', 'brown']
	fig = plt.figure()
	ax = fig.add_subplot(111)
	for label in range(clusterNum + 1):
		# Colours repeat when there are more labels than palette entries.
		color = scatterColors[label % len(scatterColors)]
		# A 1-D boolean mask selects exactly this label's columns. Indexing
		# with the (rows, cols) tuple that np.nonzero returns for a column
		# matrix would also select column 0 once per member, drawing the
		# first data point in every colour.
		members = labels == label
		ax.scatter(points[0, members], points[1, members], c=color, s=50)
def main():
	"""
	Cluster the points stored in '788points.txt' and plot the result.

	Loads the comma-separated file, runs dbscan with eps=2 and minPts=15,
	prints the number of clusters found and draws the scatter plot.
	"""
	rows = loadDataSet('788points.txt', splitChar=',')
	points = np.mat(rows).transpose()  # one point per column
	labels, clusterCount = dbscan(points, 2, 15)
	print("cluster Numbers = ", clusterCount)
	plotFeature(points, labels, clusterCount)

if __name__ == '__main__':
	# time.clock() was removed in Python 3.8 and raises AttributeError there.
	# perf_counter() is the high-resolution clock intended for measuring
	# elapsed time (note: wall-clock time, not CPU time).
	start = time.perf_counter()
	main()
	end = time.perf_counter()
	print('finish all in %s' % str(end - start))
	plt.show()

輸出樣例

cluster Numbers =  7
finish all in 32.712135628590794

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 DBscan算法及其Python實現 Python實現DBScan DBSCAN算法的Java，C++,Python實現挑子學習筆記：DBSCAN算法的python實現 Python實現DBSCAN聚類算法（簡單樣例測試）聚類算法實現（二）DBSCAN DBSCAN算法及sklearn實現 DBSCAN DBSCAN java 實現DBScan聚類算法