The idea itself is pretty simple; too bad it couldn't survive my own silly mistakes (...)
The while loop that checks whether the groupings are equal had a problem and made the algorithm run far more passes than needed; one way to restructure that check is sketched just before the script.
Dataset used:
http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
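On the equality check mentioned above: the loop only needs to keep the previous assignment and stop as soon as a fresh assignment matches it. A minimal sketch, separate from the script that follows; run_until_stable is a hypothetical name, and assign_and_update stands for any function that, like the cost() function in the script below, returns the per-point labels and the updated centroids:

def run_until_stable(points, centroids, assign_and_update):
    # assign_and_update(points, centroids) -> (per-point labels, updated centroids)
    labels, centroids = assign_and_update(points, centroids)
    while True:
        new_labels, new_centroids = assign_and_update(points, centroids)
        if new_labels == labels:  # grouping unchanged -> converged, stop here
            return labels, centroids
        labels, centroids = new_labels, new_centroids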
# -*- coding: utf-8 -*-
'''
●K-means v1.0
Author: Kadoya
Created: 18.4.12
Last modified: 18.4.13
Purpose: clustering
Algorithm outline:
1. Randomly pick k data points as centroids, splitting the data into k groups
2. Compute the Euclidean distance from every point to every centroid and put each point into the group of its nearest centroid
3. For each group compute a new virtual centroid; it does not have to land on an actual data point (a tiny worked example of steps 2 and 3 follows right after this docstring)
4. Recompute every point's Euclidean distance to the new centroids and repeat the steps above until the grouping no longer changes
Notes:
1. For iris only attributes 3 and 4, i.e. petal length and petal width, are useful for telling the classes apart
2. 雨血, if you dare to release it I dare to buy it, so please just put it out already (
3. I want to play 牧羊人之心! Begging for the open beta!
Change history:
4.13 v1.0
① Basic functionality done; countless small bugs, countless dumb moves
Attribute Information:
1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
5. class:
-- Iris Setosa
-- Iris Versicolour
-- Iris Virginica
'''
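# A tiny worked example of steps 2 and 3 above, on made-up 2D points (the numbers are
# illustrative only and have nothing to do with the iris data handled below):
#   squared distance from (1,1) to a centroid at (0,0) is 1+1=2, to one at (4,4) it is 18,
#   so (1,1) joins the (0,0) group; (1,2) gives 5 vs 13 and lands in the same group.
_toy_group = [[1, 1], [1, 2]]
# step 3: the new "virtual" centroid of that group is the column-wise mean -> [1.0, 1.5]
_toy_centroid = [sum(p[d] for p in _toy_group) / len(_toy_group) for d in range(2)]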
import random
import matplotlib.pyplot as plt
def stardust(data, k):  # basic preprocessing; args: raw lines, number of centroids
    lost_star = []
    for i in data:  # only columns 3 and 4 matter, hence the [2:-1] slice
        if not i.strip():  # skip blank lines (e.g. a trailing newline in the file)
            continue
        lost_star.append(i.strip('\n').split(',')[2:-1])
    lost_star = [list(map(float, i)) for i in lost_star]  # str -> float
    # centroids = [random.choice(lost_star) for i in range(k)]
    # ^ that can pick the same point twice, so it is written out the long way below
    sss = lost_star[:]
    centroids = []
    for i in range(k):  # randomly pick k distinct data points as the initial centroids
        a = random.choice(sss)
        sss.remove(a)
        centroids += [a]
    return lost_star, centroids  # return the data set and the centroids
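# Aside: the pick-without-replacement loop in stardust can also be written with the
# standard library's random.sample, which returns k distinct elements in one call.
# A minimal sketch (this helper is illustrative only and is not used by the script):
def stardust_sample(data, k):
    points = [list(map(float, i.strip('\n').split(',')[2:-1])) for i in data if i.strip()]
    return points, random.sample(points, k)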
def cost(lost_star, centroids):  # assign every point to its nearest centroid and build new centroids
    # args: data set (list of points), list of centroids
    scrap, new, puppet = {}, [], []
    for s in range(len(lost_star)):  # walk over every point in the data set
        star = {}
        for idx, c in enumerate(centroids):  # squared Euclidean distance from this point to each centroid
            dist = sum(pow(lost_star[s][j] - c[j], 2) for j in range(len(lost_star[s])))
            star[idx] = dist  # key:value = centroid id : distance of the point to that centroid
        cen = [i for i, j in star.items() if j == min(star.values())][0]  # id of the nearest centroid
        if cen not in scrap:  # grouping, key:value = centroid id : indices of the points in that group
            scrap[cen] = []
        scrap[cen].append(s)
        puppet.append(cen)  # this is the list that actually gets returned! (
    for idx in range(len(centroids)):  # compute the new centroids, keeping the id order stable
        if idx not in scrap:  # a centroid that attracted no points just keeps its old position
            new.append(centroids[idx])
            continue
        group = scrap[idx]
        new.append([sum(lost_star[j][d] for j in group) / len(group) for d in range(len(lost_star[0]))])
    return puppet, new
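# Aside: the argmin over the star dict in cost can also be written as
# cen = min(star, key=star.get); the tiny check below is illustrative only.
_toy_star = {0: 2.5, 1: 0.4, 2: 1.1}
assert min(_toy_star, key=_toy_star.get) == 1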
def kmeans(lost_star, centroids):  # args: data set, initial centroids
    puppet, new = cost(lost_star, centroids)
    while True:  # re-assign with the updated centroids until the grouping stops changing
        puppet2, new2 = cost(lost_star, new)
        if puppet2 == puppet:  # same grouping before and after -> converged
            return puppet, new
        puppet, new = puppet2, new2
if __name__ == '__main__':
    import datetime
    starttime = datetime.datetime.now()
    path = r'data.txt'
    with open(path) as f:
        data = f.readlines()
    k = 3
    lost_star, centroids = stardust(data, k)  # get the data set and the initial centroids
    puppet, new = kmeans(lost_star, centroids)
    print(puppet, new)
    endtime = datetime.datetime.now()
    print(endtime - starttime)
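A quick sanity check on the first script's output: puppet holds one cluster id per data point, so tallying it shows how many points landed in each cluster. A minimal sketch using the standard library's collections.Counter; the labels below are made-up stand-ins for the puppet list, not actual results:

from collections import Counter
labels = [0, 0, 1, 2, 2, 2]    # stand-in for the puppet list returned by kmeans
print(Counter(labels))         # -> Counter({2: 3, 0: 2, 1: 1})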
Plotting code
from matplotlib.colors import ListedColormap
from kadoya import *  # kadoya.py is the file holding the k-means code above
def ori(lost_star, new, star, s, d):  # plot the true classes vs. the k-means grouping
    plt.figure(1)  # figure 1: points coloured by their true iris class
    for i in range(len(s)):
        if s[i] == d[0]:
            plt.scatter(lost_star[i][0], lost_star[i][1], c='r', marker='o')
        if s[i] == d[1]:
            plt.scatter(lost_star[i][0], lost_star[i][1], c='b', marker='o')
        if s[i] == d[2]:
            plt.scatter(lost_star[i][0], lost_star[i][1], c='g', marker='o')
    plt.title('true classes')
    plt.figure(2)  # figure 2: points coloured by the cluster k-means assigned them to
    # note: cluster ids are arbitrary, so colours need not match between the two figures
    for i in range(len(lost_star)):
        if int(star[i]) == 0:
            plt.scatter(lost_star[i][0], lost_star[i][1], c='r', marker='o')
        if int(star[i]) == 1:
            plt.scatter(lost_star[i][0], lost_star[i][1], c='b', marker='o')
        if int(star[i]) == 2:
            plt.scatter(lost_star[i][0], lost_star[i][1], c='g', marker='o')
    for i in range(len(new)):  # mark the final centroids with crosses
        if i == 0:
            plt.scatter(new[i][0], new[i][1], c='r', marker='x')
        if i == 1:
            plt.scatter(new[i][0], new[i][1], c='b', marker='x')
        if i == 2:
            plt.scatter(new[i][0], new[i][1], c='g', marker='x')
    plt.title('k-means result')
    plt.show()
if __name__ == '__main__':
    path = r'data.txt'
    with open(path) as f:
        data = f.readlines()
    k = 3
    lost_star, centroids = stardust(data, k)  # get the data set and the initial centroids
    star, new = kmeans(lost_star, centroids)
    s, d = [], []
    for i in data:  # collect the true class label of every (non-blank) line
        if i.strip():
            s.append(i.strip('\n').split(',')[-1])
    for i in s:  # distinct class names, in order of first appearance
        if i not in d:
            d.append(i)
    ori(lost_star, new, star, s, d)