k-means聚类算法研究

        聚类是机器学习中一种重要的无监督算法,它可以将数据点归结为一系列特定的组合。理论上归为一类的数据点具有相同的特性,而不同类别的数据点则具有各不相同的属性。在数据科学中聚类会从数据中发掘出很多分析和理解的视角,让我们更深入的把握数据资源的价值、并据此指导生产生活。

K均值聚类

这一最著名的聚类算法主要基于数据点的均值以及与聚类中心距离的迭代计算而成。它主要的优点是十分高效:由于只需要计算数据点与聚类中心的距离,其计算复杂度只有 O(n)。其工作原理主要分为以下四步:

1.首先我们需要预先给定聚类的数目,同时随机初始化聚类中心。我们可以粗略地观察数据并给出较为准确的聚类数目;

2.每一个数据点通过计算与各聚类中心的距离,被归到最邻近的一类中;

3.根据分类结果,利用分类后的数据点重新计算聚类中心;

4.重复步骤二三直到聚类中心不再变化。(可以随机初始化不同的聚类中心以选取最好的结果)

python实现

1 point.py

class Point:
    """A 2-D geographic point (latitude / longitude pair)."""

    def __init__(self, latit_, longit_):
        # self.id = id_  # an id which uniquely identifies a point
        self.latit = latit_    # latitude coordinate
        self.longit = longit_  # longitude coordinate

2 clustering.py

# stdlib
import math
import random as rand

# third-party
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

# local
from point import Point

# import pkg_resources
# pkg_resources.require("matplotlib")

class clustering:
    """Plain k-means clustering over 2-D Point objects (latit/longit).

    The public interface of the original script is preserved: the
    constructor signature, all method names, and the -1 error return
    of k_means().
    """

    def __init__(self, geo_locs_, k_):
        self.geo_locations = geo_locs_  # input points (list of Point)
        self.k = k_                     # requested number of clusters
        self.clusters = []              # result: dict of cluster index -> [Point]
        self.means = []                 # current means of clusters (list of Point)
        self.debug = True               # debug flag: verbose tracing when True

    def next_random(self, index, points, clusters):
        """Return the next seed point: the candidate in *points* with the
        maximum summed distance to the first point of every cluster
        chosen so far (greedy far-apart seeding).

        index    -- ordinal of the seed being chosen (tracing only)
        points   -- candidate points not yet used as seeds
        clusters -- dict of cluster index -> [Point] chosen so far
        """
        print('index: %d ' % index)
        dist = {}
        for candidate in points:
            # accumulate this candidate's distance to one representative
            # (the first point) of every existing cluster
            for cluster in clusters.values():
                seed = cluster[0]
                d = math.sqrt(math.pow(candidate.latit - seed.latit, 2.0) +
                              math.pow(candidate.longit - seed.longit, 2.0))
                dist[candidate] = dist.get(candidate, 0.0) + d
        # the farthest candidate (first one on ties, as in the original loop)
        return max(dist, key=dist.get)

    def initial_means(self, points):
        """Choose self.k initial means: the first seed at random, then the
        farthest remaining point each time.

        NOTE: *points* is consumed — every chosen seed is removed from it.
        """
        point_ = rand.choice(points)
        if self.debug:
            print('point#0: %f %f' % (point_.latit, point_.longit))
        clusters = {}
        clusters.setdefault(0, []).append(point_)
        points.remove(point_)
        # now pick the remaining k-1 seeds greedily
        for i in range(1, self.k):
            point_ = self.next_random(i, points, clusters)
            if self.debug:
                print('point#%d: %f %f' % (i, point_.latit, point_.longit))
            clusters.setdefault(i, []).append(point_)
            points.remove(point_)
        # each initial cluster holds exactly one seed, so the means are the seeds
        self.means = self.compute_mean(clusters)
        if self.debug:
            print("initial means:")
            self.print_means(self.means)

    def compute_mean(self, clusters):
        """Return a list holding the centroid (mean Point) of every cluster."""
        means = []
        for cluster in clusters.values():
            mean_point = Point(0.0, 0.0)
            cnt = 0.0
            for point in cluster:
                mean_point.latit += point.latit
                mean_point.longit += point.longit
                cnt += 1.0
            mean_point.latit /= cnt
            mean_point.longit /= cnt
            means.append(mean_point)
        return means

    def assign_points(self, points):
        """Assign every point to the cluster whose current mean is closest.

        Returns a dict of cluster index -> [Point]. A cluster that attracts
        no points simply has no entry in the dict.
        """
        if self.debug:
            print("assign points")
        clusters = {}
        for point in points:
            if self.debug:
                print("point(%f,%f)" % (point.latit, point.longit))
            # Euclidean distance from this point to every current mean
            dist = [math.sqrt(math.pow(point.latit - mean.latit, 2.0) +
                              math.pow(point.longit - mean.longit, 2.0))
                    for mean in self.means]
            if self.debug:
                print(dist)
            # index of the first smallest distance (same tie-break as the
            # original strict '<' scan)
            index = min(range(len(dist)), key=dist.__getitem__)
            if self.debug:
                print("index: %d" % index)
            clusters.setdefault(index, []).append(point)
        return clusters

    def update_means(self, means, threshold):
        """Convergence test: True when every new mean moved no more than
        *threshold* from the corresponding previous mean.

        Despite its name this method does NOT modify self.means; the
        caller (k_means) does the actual update.
        """
        for mean_1, mean_2 in zip(self.means, means):
            if self.debug:
                print("mean_1(%f,%f)" % (mean_1.latit, mean_1.longit))
                print("mean_2(%f,%f)" % (mean_2.latit, mean_2.longit))
            if math.sqrt(math.pow(mean_1.latit - mean_2.latit, 2.0) +
                         math.pow(mean_1.longit - mean_2.longit, 2.0)) > threshold:
                return False
        return True

    # debug function: print cluster points
    def print_clusters(self, clusters):
        """Dump every cluster and the points it contains (debug helper)."""
        cluster_cnt = 1
        for cluster in clusters.values():
            print("nodes in cluster #%d" % cluster_cnt)
            cluster_cnt += 1
            for point in cluster:
                print("point(%f,%f)" % (point.latit, point.longit))

    # print means
    def print_means(self, means):
        """Dump the coordinates of every mean (debug helper)."""
        for point in means:
            print("%f %f" % (point.latit, point.longit))

    # k_means algorithm
    def k_means(self, plot_flag):
        """Run k-means until the means stop moving (threshold 0.01).

        plot_flag -- when truthy, show the final clusters with matplotlib.
        Returns 0 on success, -1 when there are fewer points than the
        requested number of clusters.
        """
        # NOTE(review): the '<' comparison was lost in the original paste;
        # k seeds cannot be drawn from fewer than k points, so this is the
        # only condition that makes initial_means safe.
        if len(self.geo_locations) < self.k:
            return -1  # error
        points_ = [point for point in self.geo_locations]
        # compute the initial means
        self.initial_means(points_)
        stop = False
        while not stop:
            # assignment step: assign each node to the cluster with the closest mean
            points_ = [point for point in self.geo_locations]
            clusters = self.assign_points(points_)
            if self.debug:
                self.print_clusters(clusters)
            # update step: recompute the means from the fresh assignment
            means = self.compute_mean(clusters)
            if self.debug:
                print("means:")
                # original did `print self.print_means(means)` which also
                # printed a spurious None; print_means prints by itself
                self.print_means(means)
                print("update mean:")
            stop = self.update_means(means, 0.01)
            if not stop:
                self.means = means
            self.clusters = clusters
        # plot clusters for evaluation
        if plot_flag:
            fig = plt.figure()
            ax = fig.add_subplot(111)
            # supports up to len(colors) distinct clusters before IndexError
            markers = ['o', 'd', 'x', 'h', 'H', 7, 4, 5, 6, '8', 'p', ',', '+', '.', 's', '*', 3, 0, 1, 2]
            colors = ['r', 'k', 'b', [0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0], [1, 0, 1], [1, 1, 0], [1, 1, 1]]
            cnt = 0
            for cluster in clusters.values():
                latits = [point.latit for point in cluster]
                longits = [point.longit for point in cluster]
                ax.scatter(longits, latits, s=60, c=colors[cnt], marker=markers[cnt])
                cnt += 1
            plt.show()
        return 0

3 main.py

import random as rand  # kept from the original script (unused here)
from clustering import clustering
from point import Point
import csv

geo_locs = []
# loc_ = Point(0.0, 0.0)  # tuples for location
# geo_locs.append(loc_)

# read the fountains location from the csv input file and store each
# fountain location as a Point(latit, longit) object; `with` guarantees
# the file handle is closed (the original leaked it)
with open('./drinking_fountains.csv', 'r') as f:
    reader = csv.reader(f, delimiter=",")
    for line in reader:
        geo_locs.append(Point(float(line[0]), float(line[1])))

# print(len(geo_locs))
# for p in geo_locs:
#     print("%f %f" % (p.latit, p.longit))

# let's run k_means clustering. the second parameter is the no of clusters
cluster = clustering(geo_locs, 8)
flag = cluster.k_means(True)
if flag == -1:
    print("Error in arguments!")
else:
    # the clustering result maps a cluster index to the list of its points
    print("clustering results:")
    cluster.print_clusters(cluster.clusters)


4 drinking_fountains.csv 示例数据

1,1

2,2

3,3

4,4

5,5

6,6

7,7

8,8

9,9

10,10

11,11

12,12

13,13

14,14

15,15


运行main.py ,图片如下:


©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容