k-means聚类算法研究

        聚类是机器学习中一种重要的无监督算法,它可以将数据点归结为一系列特定的组合。理论上归为一类的数据点具有相同的特性,而不同类别的数据点则具有各不相同的属性。在数据科学中聚类会从数据中发掘出很多分析和理解的视角,让我们更深入的把握数据资源的价值、并据此指导生产生活。

K均值聚类

这一最著名的聚类算法主要基于数据点的均值以及与聚类中心距离的迭代计算而成。它主要的优点是十分高效:由于只需要计算数据点与聚类中心的距离,其计算复杂度只有 O(n)。其工作原理主要分为以下四步:

1.首先我们需要预先给定聚类的数目,同时随机初始化聚类中心。我们可以粗略地观察数据并给出较为准确的聚类数目;

2.每一个数据点通过计算与各聚类中心的距离,被归到最邻近的一类中;

3.根据分类结果,利用分类后的数据点重新计算聚类中心;

4.重复步骤二三直到聚类中心不再变化。(可以随机初始化不同的聚类中心以选取最好的结果)

python实现

1 point.py

class Point:
    """A 2-D geographic point (latitude / longitude pair)."""

    def __init__(self, latit_, longit_):
        # self.id = id_  # an id which uniquely identifies a point
        self.latit = latit_    # latitude coordinate
        self.longit = longit_  # longitude coordinate

2 clustering.py

# stdlib
import math
import random as rand

# third-party
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# local
from point import Point

# import pkg_resources
# pkg_resources.require("matplotlib")

class clustering:
    """Plain k-means clustering over 2-D Point objects (latit/longit).

    The public interface of the original script is preserved: the
    constructor signature, all method names, and the -1 error return
    of k_means().
    """

    def __init__(self, geo_locs_, k_):
        self.geo_locations = geo_locs_  # input points (list of Point)
        self.k = k_                     # requested number of clusters
        self.clusters = []              # result: dict of cluster index -> [Point]
        self.means = []                 # current means of clusters (list of Point)
        self.debug = True               # debug flag: verbose tracing when True

    def next_random(self, index, points, clusters):
        """Return the next seed point: the candidate in *points* with the
        maximum summed distance to the first point of every cluster
        chosen so far (greedy far-apart seeding).

        index    -- ordinal of the seed being chosen (tracing only)
        points   -- candidate points not yet used as seeds
        clusters -- dict of cluster index -> [Point] chosen so far
        """
        print('index: %d ' % index)
        dist = {}
        for candidate in points:
            # accumulate this candidate's distance to one representative
            # (the first point) of every existing cluster
            for cluster in clusters.values():
                seed = cluster[0]
                d = math.sqrt(math.pow(candidate.latit - seed.latit, 2.0) +
                              math.pow(candidate.longit - seed.longit, 2.0))
                dist[candidate] = dist.get(candidate, 0.0) + d
        # the farthest candidate (first one on ties, as in the original loop)
        return max(dist, key=dist.get)

    def initial_means(self, points):
        """Choose self.k initial means: the first seed at random, then the
        farthest remaining point each time.

        NOTE: *points* is consumed — every chosen seed is removed from it.
        """
        point_ = rand.choice(points)
        if self.debug:
            print('point#0: %f %f' % (point_.latit, point_.longit))
        clusters = {}
        clusters.setdefault(0, []).append(point_)
        points.remove(point_)
        # now pick the remaining k-1 seeds greedily
        for i in range(1, self.k):
            point_ = self.next_random(i, points, clusters)
            if self.debug:
                print('point#%d: %f %f' % (i, point_.latit, point_.longit))
            clusters.setdefault(i, []).append(point_)
            points.remove(point_)
        # each initial cluster holds exactly one seed, so the means are the seeds
        self.means = self.compute_mean(clusters)
        if self.debug:
            print("initial means:")
            self.print_means(self.means)

    def compute_mean(self, clusters):
        """Return a list holding the centroid (mean Point) of every cluster."""
        means = []
        for cluster in clusters.values():
            mean_point = Point(0.0, 0.0)
            cnt = 0.0
            for point in cluster:
                mean_point.latit += point.latit
                mean_point.longit += point.longit
                cnt += 1.0
            mean_point.latit /= cnt
            mean_point.longit /= cnt
            means.append(mean_point)
        return means

    def assign_points(self, points):
        """Assign every point to the cluster whose current mean is closest.

        Returns a dict of cluster index -> [Point]. A cluster that attracts
        no points simply has no entry in the dict.
        """
        if self.debug:
            print("assign points")
        clusters = {}
        for point in points:
            if self.debug:
                print("point(%f,%f)" % (point.latit, point.longit))
            # Euclidean distance from this point to every current mean
            dist = [math.sqrt(math.pow(point.latit - mean.latit, 2.0) +
                              math.pow(point.longit - mean.longit, 2.0))
                    for mean in self.means]
            if self.debug:
                print(dist)
            # index of the first smallest distance (same tie-break as the
            # original strict '<' scan)
            index = min(range(len(dist)), key=dist.__getitem__)
            if self.debug:
                print("index: %d" % index)
            clusters.setdefault(index, []).append(point)
        return clusters

    def update_means(self, means, threshold):
        """Convergence test: True when every new mean moved no more than
        *threshold* from the corresponding previous mean.

        Despite its name this method does NOT modify self.means; the
        caller (k_means) does the actual update.
        """
        for mean_1, mean_2 in zip(self.means, means):
            if self.debug:
                print("mean_1(%f,%f)" % (mean_1.latit, mean_1.longit))
                print("mean_2(%f,%f)" % (mean_2.latit, mean_2.longit))
            if math.sqrt(math.pow(mean_1.latit - mean_2.latit, 2.0) +
                         math.pow(mean_1.longit - mean_2.longit, 2.0)) > threshold:
                return False
        return True

    # debug function: print cluster points
    def print_clusters(self, clusters):
        """Dump every cluster and the points it contains (debug helper)."""
        cluster_cnt = 1
        for cluster in clusters.values():
            print("nodes in cluster #%d" % cluster_cnt)
            cluster_cnt += 1
            for point in cluster:
                print("point(%f,%f)" % (point.latit, point.longit))

    # print means
    def print_means(self, means):
        """Dump the coordinates of every mean (debug helper)."""
        for point in means:
            print("%f %f" % (point.latit, point.longit))

    # k_means algorithm
    def k_means(self, plot_flag):
        """Run k-means until the means stop moving (threshold 0.01).

        plot_flag -- when truthy, show the final clusters with matplotlib.
        Returns 0 on success, -1 when there are fewer points than the
        requested number of clusters.
        """
        # NOTE(review): the '<' comparison was lost in the original paste;
        # k seeds cannot be drawn from fewer than k points, so this is the
        # only condition that makes initial_means safe.
        if len(self.geo_locations) < self.k:
            return -1  # error
        points_ = [point for point in self.geo_locations]
        # compute the initial means
        self.initial_means(points_)
        stop = False
        while not stop:
            # assignment step: assign each node to the cluster with the closest mean
            points_ = [point for point in self.geo_locations]
            clusters = self.assign_points(points_)
            if self.debug:
                self.print_clusters(clusters)
            # update step: recompute the means from the fresh assignment
            means = self.compute_mean(clusters)
            if self.debug:
                print("means:")
                # original did `print self.print_means(means)` which also
                # printed a spurious None; print_means prints by itself
                self.print_means(means)
                print("update mean:")
            stop = self.update_means(means, 0.01)
            if not stop:
                self.means = means
            self.clusters = clusters
        # plot clusters for evaluation
        if plot_flag:
            fig = plt.figure()
            ax = fig.add_subplot(111)
            # supports up to len(colors) distinct clusters before IndexError
            markers = ['o', 'd', 'x', 'h', 'H', 7, 4, 5, 6, '8', 'p', ',', '+', '.', 's', '*', 3, 0, 1, 2]
            colors = ['r', 'k', 'b', [0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0], [1, 0, 1], [1, 1, 0], [1, 1, 1]]
            cnt = 0
            for cluster in clusters.values():
                latits = [point.latit for point in cluster]
                longits = [point.longit for point in cluster]
                ax.scatter(longits, latits, s=60, c=colors[cnt], marker=markers[cnt])
                cnt += 1
            plt.show()
        return 0

3 main.py

import random as rand  # kept from the original script (unused here)
from clustering import clustering
from point import Point
import csv

geo_locs = []
# loc_ = Point(0.0, 0.0)  # tuples for location
# geo_locs.append(loc_)

# read the fountains location from the csv input file and store each
# fountain location as a Point(latit, longit) object; `with` guarantees
# the file handle is closed (the original leaked it)
with open('./drinking_fountains.csv', 'r') as f:
    reader = csv.reader(f, delimiter=",")
    for line in reader:
        geo_locs.append(Point(float(line[0]), float(line[1])))

# print(len(geo_locs))
# for p in geo_locs:
#     print("%f %f" % (p.latit, p.longit))

# let's run k_means clustering. the second parameter is the no of clusters
cluster = clustering(geo_locs, 8)
flag = cluster.k_means(True)
if flag == -1:
    print("Error in arguments!")
else:
    # the clustering result maps a cluster index to the list of its points
    print("clustering results:")
    cluster.print_clusters(cluster.clusters)


4 drinking_fountains.csv 示例数据

1,1

2,2

3,3

4,4

5,5

6,6

7,7

8,8

9,9

10,10

11,11

12,12

13,13

14,14

15,15


运行main.py ,图片如下:


©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容