机器学习实战

机器学习实战第二章kNN: http://blog.csdn.net/fenfenmiao/article/details/52165472

参考 http://blog.sciencenet.cn/blog-791354-705248.html

from numpy import *#导入numpy库

import operator

def createDataSet():
    """Return a tiny hand-made sample set for trying out the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 numpy array of
        points and ``labels`` is a list with one class label ('A' or 'B')
        per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest rows of ``dataSet``.

    Distance is Euclidean: the square root of the summed squared per-column
    differences between ``inX`` and each row.

    Args:
        inX: The vector to classify; it is tiled to one copy per row of
            ``dataSet`` before subtraction.
        dataSet: numpy array whose rows are the known samples
            (``dataSet.shape[0]`` is used as the sample count).
        labels: Sequence of class labels, indexed by row number of ``dataSet``.
        k: Number of nearest rows that take part in the vote.

    Returns:
        The label with the highest vote count among the ``k`` nearest rows.
    """
    dataSetSize = dataSet.shape[0]

    # Repeat inX once per sample row so the subtraction is element-wise.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5

    # Row indices ordered from the smallest distance to the largest.
    sortedDistIndices = distances.argsort()

    # Tally the labels of the k closest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and 3; iteritems() is Python 2 only
    # and raises AttributeError on Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each line is stripped and split on tab characters. The first three
    fields become one row of the matrix and the last field, converted with
    ``int``, becomes that row's class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        numpy array of shape (number_of_lines, 3) and ``classLabelVector``
        is a list of ints with one entry per line.
    """
    # Read the file once inside a with-block so the handle is always closed;
    # the same list of lines gives both the row count and the data.
    with open(filename) as fr:
        lines = fr.readlines()

    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        # Convert explicitly instead of relying on numpy's implicit
        # string-to-float cast on assignment.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

（由于简书显示代码模式很麻烦，决定搬家到CSDN）

机器学习实战

推荐阅读更多精彩内容