Python實現kMeans(k均值聚類)


Python實現kMeans(k均值聚類)

運行環境

  • Python3
  • numpy(科學計算包)
  • matplotlib(畫圖所需,不畫圖可不必)

計算過程

st=>start: 開始
e=>end: 結束
op1=>operation: 讀入數據
op2=>operation: 隨機初始化聚類中心
cond=>condition: 聚類是否變化
op3=>operation: 尋找最近的點加入聚類
op4=>operation: 更新聚類中心
op5=>operation: 輸出結果

st->op1->op2->op3->op4->cond
cond(yes)->op3
cond(no)->op5->e

輸入樣例

/* 788points.txt */
15.55,28.65
14.9,27.55
14.45,28.35
14.15,28.8
13.75,28.05
13.35,28.45
13,29.15
13.45,27.5
13.6,26.5
12.8,27.35
12.4,27.85
12.3,28.4
12.2,28.65
13.4,25.1
12.95,25.95

788points.txt完整文件:下載

代碼實現

# -*- coding: utf-8 -*-
__author__ = 'Wsine'

from numpy import *
import matplotlib.pyplot as plt
import operator
import time

# Large finite sentinel standing in for "infinity" when searching for a
# minimum distance.
INF = 9999999.0

def loadDataSet(fileName, splitChar='\t'):
	"""
	Input:  fileName  - path of the text file to read
	        splitChar - field separator used on every line (default: tab)
	Output: data set as a list of rows, each row being a list of floats
	Description: read a delimited numeric text file into a data set.
	             Blank (or whitespace-only) lines are skipped.
	Raises: OSError if the file cannot be opened,
	        ValueError if a field of a non-blank line is not a valid float
	"""
	dataSet = []
	with open(fileName) as fr:
		# Iterate the file object directly so lines are read lazily instead
		# of loading the whole file into memory first.
		for line in fr:
			stripped = line.strip()
			if not stripped:
				# float('') would raise, so a stray empty line (typically at
				# the end of the file) must not abort the whole load.
				continue
			dataSet.append([float(field) for field in stripped.split(splitChar)])
	return dataSet

def createDataSet():
	"""
	Output: data set
	Description: build a small hard-coded data set of five 2-D points,
	             returned as a fresh list of [x, y] lists on every call
	"""
	samplePoints = (
		(0.0, 2.0),
		(0.0, 0.0),
		(1.5, 0.0),
		(5.0, 0.0),
		(5.0, 2.0),
	)
	return [list(point) for point in samplePoints]

def distEclud(vecA, vecB):
	"""
	Input:  vector A, vector B (numpy arrays or matrices of equal shape)
	Output: Euclidean distance between the two vectors
	"""
	delta = vecA - vecB
	# Element-wise square (not a matrix product), then total over all entries.
	squaredLength = sum(square(delta))
	return sqrt(squaredLength)

def randCent(dataSet, k):
	"""
	Input:  dataSet - 2-D numpy matrix/array, one sample per row
	        k       - number of clusters
	Output: k x n matrix of random centroids (n = number of columns)
	Description: for every column, draw k values with numpy's random.rand
	             and scale them into that column's [min, max] range, so
	             every centroid lies inside the bounding box of the data
	"""
	n = shape(dataSet)[1]
	# asmatrix instead of the `mat` alias, which NumPy 2.0 removed.
	centroids = asmatrix(zeros((k, n)))
	for j in range(n):
		column = dataSet[:, j]
		# Use the array methods: they do not depend on whether the
		# star-import shadows the builtin min/max, and they yield plain
		# scalars rather than 1x1 matrices.
		minJ = float(column.min())
		rangeJ = float(column.max()) - minJ
		centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
	return centroids

def kMeans(dataSet, k, distMeans=distEclud, createCent=randCent):
	"""
	Input:  dataSet    - numpy matrix, one sample per row
	        k          - number of clusters
	        distMeans  - distance function called as
	                     distMeans(centroid row, sample row)
	        createCent - function called as createCent(dataSet, k) that
	                     returns the k initial centroids
	Output: (centroids, clusterAssment)
	        centroids      - matrix of the final centroids, one per row
	        clusterAssment - m x 2 matrix; column 0 holds the index of the
	                         centroid each sample is assigned to, column 1
	                         the squared distance to that centroid
	Description: repeat "assign every sample to its nearest centroid, then
	             move every centroid to the mean of its samples" until no
	             assignment changes
	"""
	m = shape(dataSet)[0]
	# asmatrix instead of the `mat` alias, which NumPy 2.0 removed.
	clusterAssment = asmatrix(zeros((m, 2)))
	# Start from an impossible cluster index. With the zero default, a first
	# pass that puts every sample into cluster 0 would look like "nothing
	# changed" and stop the iteration before it has converged.
	clusterAssment[:, 0] = -1
	centroids = createCent(dataSet, k)
	clusterChanged = True
	while clusterChanged:
		clusterChanged = False
		for i in range(m): # find the nearest centroid
			# A true infinity, so arbitrarily large distances still compare
			# as smaller than the initial minimum.
			minDist = float('inf')
			minIndex = -1
			for j in range(k):
				distJI = distMeans(centroids[j, :], dataSet[i, :])
				if distJI < minDist:
					minDist = distJI
					minIndex = j
			if clusterAssment[i, 0] != minIndex:
				clusterChanged = True
			clusterAssment[i, :] = minIndex, minDist**2
		for cent in range(k): # move each centroid to the mean of its samples
			ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
			if shape(ptsInClust)[0] == 0:
				# The mean of an empty cluster is NaN and would make this
				# centroid unreachable for good; keep its previous position.
				continue
			centroids[cent, :] = mean(ptsInClust, axis=0)
	return centroids, clusterAssment

def plotFeature(dataSet, centroids, clusterAssment):
	"""
	Input:  dataSet        - numpy matrix of samples, one per row
	        centroids      - numpy matrix of centroids, one per row
	        clusterAssment - numpy matrix whose column 0 holds the cluster
	                         index of every sample
	Output: none
	Description: create a new matplotlib figure and scatter-plot the first
	             two columns of every cluster with its own marker and color
	             (both cycle when there are more clusters than styles), then
	             mark the centroids with large red '+' signs. The figure is
	             only built here; it is not shown by this function.
	"""
	markerCycle = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']
	colorCycle = ['blue', 'green', 'yellow', 'purple', 'orange', 'black', 'brown']
	clusterCount = shape(centroids)[0]
	figure = plt.figure()
	axes = figure.add_subplot(111)
	for clusterId in range(clusterCount):
		# Row indices of the samples assigned to this cluster.
		memberRows = nonzero(clusterAssment[:, 0].A == clusterId)[0]
		members = dataSet[memberRows, :]
		xValues = members[:, 0].A1
		yValues = members[:, 1].A1
		axes.scatter(
			xValues,
			yValues,
			marker=markerCycle[clusterId % len(markerCycle)],
			c=colorCycle[clusterId % len(colorCycle)],
			s=90,
		)
	centroidX = centroids[:, 0].A1
	centroidY = centroids[:, 1].A1
	axes.scatter(centroidX, centroidY, marker='+', c='red', s=300)

def main():
	"""
	Description: load the comma-separated sample points from
	             '788points.txt', cluster them into 6 groups with kMeans,
	             print the resulting centroids and draw the clustering
	"""
	# Alternative inputs: loadDataSet('testSet2.txt') for a tab-separated
	# file, or createDataSet() for the small built-in data set.
	dataSet = loadDataSet('788points.txt', splitChar=',')
	# asmatrix instead of the `mat` alias, which NumPy 2.0 removed.
	dataSet = asmatrix(dataSet)
	resultCentroids, clustAssign = kMeans(dataSet, 6)
	print('*******************')
	print(resultCentroids)
	print('*******************')
	plotFeature(dataSet, resultCentroids, clustAssign)

if __name__ == '__main__':
	# time.clock() was removed in Python 3.8; perf_counter() is the
	# high-resolution clock intended for measuring elapsed time.
	start = time.perf_counter()
	main()
	end = time.perf_counter()
	print('finish all in %s' % str(end - start))
	# Show the figure only after timing so the interactive window does not
	# count towards the reported duration.
	plt.show()

輸出樣例

*******************
[[ 33.14278846   8.79375   ]
 [ 32.69453125  22.13789062]
 [  9.25928144  22.98113772]
 [ 18.8620283    7.11037736]
 [  9.50503876   7.55620155]
 [ 21.16041667  22.89895833]]
*******************
finish all in 5.454627327134057


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM