# KNN 算法实际应用学习:手写数字识别。先将手写数字图片二值化为 32×32 的 0/1 文本格式,并统一输入图片的尺寸,方便后续使用 KNN 算法进行识别。
#kNN 手写字体模式识别
import time
from numpy import *
import os
def kNNclassify(inX, dataSet, lables, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training samples.

    Distances are Euclidean. When two labels collect the same number of
    votes, the label whose first vote came from the closer neighbour wins
    (the vote sort is stable and the dict keeps insertion order).

    :param inX: feature vector to classify; anything that broadcasts
        against a single row of ``dataSet`` (list, 1-D array or 1xN array)
    :param dataSet: 2-D array-like of training samples, one sample per row
    :param lables: sequence of labels, ``lables[i]`` belonging to row ``i``
    :param k: number of nearest neighbours allowed to vote; values larger
        than the number of samples are clamped to that number
    :return: the label with the most votes
    :raises ValueError: if ``dataSet`` has no rows or ``k`` is smaller than 1
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Asking for more neighbours than exist would index past the end of the
    # sorted distances; voting with every sample is the closest valid meaning.
    if k > dataSetSize:
        k = dataSetSize

    # Broadcasting subtracts inX from every row without materialising a
    # tiled copy of it.
    diffMat = dataSet - asarray(inX)
    sqDistances = (diffMat ** 2).sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()  # nearest sample first

    # Tally the labels of the k nearest samples.
    classCount = {}
    for sampleIndex in sortedDistIndicies[:k]:
        voteIlable = lables[sampleIndex]
        classCount[voteIlable] = classCount.get(voteIlable, 0) + 1

    # Highest vote count first; equal counts keep their insertion order.
    sortedClassCount = sorted(classCount.items(), key=lambda item: item[1], reverse=True)
    return sortedClassCount[0][0]
def img2vector(filename):
    """Load a 32x32 text image into a 1x1024 row vector.

    The file is read as text: the first 32 characters of each of the first
    32 lines are converted with ``int`` and stored row by row, so the
    character at line ``i``, column ``j`` ends up at index ``32 * i + j``.

    :param filename: path of the text image to read
    :return: array of shape (1, 1024) holding the converted characters
    :raises IndexError: if a line has fewer than 32 characters or the file
        has fewer than 32 lines
    :raises ValueError: if a character cannot be converted by ``int``
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even when parsing fails part way.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def handwritingClassTest(trainPath, testPath):
    """Evaluate the kNN classifier (k=3) on directories of digit text images.

    Every directory entry whose name contains 'txt' is treated as a sample.
    Its true label is the integer that precedes the first '_' or '.' in the
    file name (for example '3_12.txt' -> 3). Each misclassified test file is
    printed, followed by the number of tests, the number of errors and the
    error rate.

    :param trainPath: directory containing the training samples
    :param testPath: directory containing the test samples
    :return: None; all results are printed
    """
    # Keep only sample files so that row i of the matrix always belongs to
    # hwLables[i]; unrelated directory entries must not occupy rows.
    trainingFileList = [name for name in os.listdir(trainPath) if 'txt' in name]
    trainingMat = zeros((len(trainingFileList), 1024))
    hwLables = []
    for row, fileNameStr in enumerate(trainingFileList):
        trainingMat[row, :] = img2vector(os.path.join(trainPath, fileNameStr))
        hwLables.append(int(fileNameStr.replace('_', '.').split('.')[0]))

    # Classify every test sample and count the mismatches.
    errorCount = 0
    totalTestTime = 0
    for testPic in os.listdir(testPath):
        if 'txt' not in testPic:
            continue
        totalTestTime += 1
        testNum = int(testPic.replace('_', '.').split('.')[0])
        # Test images live in testPath, not in the training directory.
        testVector = img2vector(os.path.join(testPath, testPic))
        knnTestResult = kNNclassify(testVector, trainingMat, hwLables, 3)
        if knnTestResult != testNum:
            errorCount += 1
            print("the file is %s , the classifier came back with: %d, the real answer is: %d" % (testPic, knnTestResult, testNum))

    print("\nthe total test time is %d, number of errors is: %d" % (totalTestTime, errorCount))
    # float() keeps this a true division regardless of interpreter version.
    er = 0 if totalTestTime == 0 else format(float(errorCount) / totalTestTime, '.3%')
    print("\nthe total error rate is: %s" % er)
def knn_debug():
    """Run kNNclassify on a four-point toy data set and print the predicted label."""
    # Two samples near (1, 1) labelled 'a' and two near the origin labelled 'b'.
    samples = array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    sampleLabels = ['a', 'a', 'b', 'b']
    # Query point next to the origin, voted on by its 3 nearest samples.
    print(kNNclassify([0, 0.2], samples, sampleLabels, 3))
if __name__ == '__main__':
    # Script entry point: evaluate the classifier on the digit data set.
    # NOTE: both directories are absolute, machine-specific paths.
    handwritingClassTest(
        trainPath="/Users/moxi.hyy/Downloads/machinelearninginaction/Ch02/digits/trainingDigits",
        testPath="/Users/moxi.hyy/Downloads/machinelearninginaction/Ch02/digits/testDigits",
    )