# KNN is the simplest classification algorithm in machine learning. Outline:
# - compute the distance from the query point to every training sample
# - pick the K samples closest to the query point
# - tally the class labels of those K nearest neighbours
# - the label that occurs most often among them is the predicted class
from numpy import *
import operator
from os import listdir
def knn_class(inx, dataset, labels, k):
    """Classify ``inx`` by majority vote among its k nearest neighbours.

    Args:
        inx: query point, a sequence of coordinates (same width as a
            ``dataset`` row).
        dataset: numpy array of shape (n_samples, n_features) holding the
            training points.
        labels: sequence of class labels, one per ``dataset`` row.
        k: number of nearest neighbours to vote.

    Returns:
        The label that occurs most often among the k nearest neighbours
        (ties broken by first insertion order of the label).
    """
    dataset_size = dataset.shape[0]  # number of training samples
    # Euclidean distance from inx to every sample: replicate inx into a
    # (dataset_size, 1) tiling, subtract, square, sum per row, sqrt.
    diff_mat = tile(inx, (dataset_size, 1)) - dataset
    sq_distances = (diff_mat ** 2).sum(axis=1)  # axis=1 -> sum across columns, per row
    distances = sq_distances ** 0.5
    # argsort() returns sample indices ordered nearest-first.
    sorted_dist_indicies = distances.argsort()
    # Tally the labels of the k nearest neighbours.
    class_count = {}
    for i in range(k):  # i in [0, k-1]
        vote_label = labels[sorted_dist_indicies[i]]
        class_count[vote_label] = class_count.get(vote_label, 0) + 1
    # Most-voted label wins. itemgetter(1) sorts by the vote count.
    # BUGFIX: dict.iteritems() is Python 2 only; use items() on Python 3.
    sorted_class_count = sorted(class_count.items(),
                                key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]
# Demo: four training points in two classes, then classify two queries.
group = array([
    [1.0, 1.1],
    [1.0, 1.0],
    [0, 0],
    [0, 0.1],
])
labels = list("AABB")  # first two rows are class A, last two class B
result = knn_class([0, 0], group, labels, 3)   # near the B cluster
result1 = knn_class([1, 3], group, labels, 3)  # near the A cluster
print('result: ', result)
print('result: ', result1)