简介
kNN算法也叫K-近邻算法,是一种用于分类和回归的非参数统计方法。在k-NN分类中,输出是一个类别归属。一个对象的分类是由其邻居的“多数表决”确定的:k个最近邻居(k为正整数,通常较小)中最常见的类别决定了赋予该对象的类别。若k = 1,则该对象的类别直接由最近的一个节点赋予。训练样本是多维特征空间中的向量,其中每个训练样本带有一个类别标签。算法的训练阶段只是存储训练样本的特征向量和标签。在分类阶段,k是一个用户定义的常数,一个没有类别标签的向量(查询点或测试点)将被归类为最接近该点的k个样本点中出现最频繁的一类。一般情况下,将欧氏距离作为距离度量,但这只适用于连续变量。在文本分类这种离散变量的情况下,可以使用另一种度量——重叠度量(或汉明距离)。又例如对于基因表达微阵列数据,k-NN也可与Pearson和Spearman相关系数结合起来使用。通常情况下,如果运用一些专门的算法来学习距离度量,k近邻的分类精度可显著提高,例如大间隔最近邻(LMNN)或邻域成分分析(NCA)。
逐步测试
#!/usr/bin/env python3
import matplotlib.pyplot as plt
import numpy as np
import math
from collections import Counter
# --- Visualise the training set ---
# Each row of raw_data_X holds the two features of one sample; raw_data_y holds
# the matching class labels (0 = benign, 1 = malignant).
raw_data_X = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343853454, 3.368312451],
    [3.582294121, 4.679917921],
    [2.280362211, 2.866990212],
    [7.423436752, 4.685324231],
    [5.745231231, 3.532131321],
    [9.172112222, 2.511113104],
    [7.927841231, 3.421455345],
    [7.939831414, 0.791631213],
]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)

# Scatter plot with one series per class. The series labels name the classes
# (the axis names belong on xlabel/ylabel), and the legend makes them visible.
plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='g', label='Benign')
plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='r', label='Malignant')
plt.xlabel('Tumor Size')
plt.ylabel('Time')
plt.axis([0, 10, 0, 5])
plt.legend()
# Displaying the figure is left disabled, as in the original walkthrough:
# plt.show()
# --- "Training": measure how far the query point is from every sample ---
# Euclidean distance from the query point x to each row of X_train.
x = [8.90933607318, 3.365731514]
distances = [math.sqrt(np.sum((row - x) ** 2)) for row in X_train]
print(distances) #> [5.611968000921151, 6.011747706769277, 7.565483059418645, 5.486753308891268, 6.647709180746875, 1.9872648870854204, 3.168477291709152, 0.8941051007010301, 0.9830754144862234, 2.7506238644678445]

# Indices of the training samples, ordered from closest to farthest.
nearest = np.argsort(distances)
print(nearest) #> [7 8 5 9 6 3 0 1 4 2]

# Labels of the k closest samples.
k = 6
topK_y = list(y_train[nearest[:k]])
print(topK_y) #> [1, 1, 1, 1, 1, 0]

# --- Prediction: majority vote among those k labels ---
# Tally how often each label occurs.
votes = Counter(topK_y)
print(votes) #> Counter({1: 5, 0: 1})

# The most frequent label is the predicted class.
predict_y = votes.most_common(1)[0][0]
print(predict_y) #> 1
封装为类
import numpy as np
from math import sqrt
from collections import Counter
class kNNClassifier:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    The classifier memorises the training samples in ``fit``; ``predict``
    labels each query row with the most common label among its ``k`` closest
    training samples.
    """

    def __init__(self, k):
        """Create an unfitted classifier that votes among the ``k`` nearest samples.

        Raises:
            AssertionError: if ``k`` is smaller than 1.
        """
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training samples and their labels.

        Args:
            X_train: 2-D array with one training sample per row.
            y_train: 1-D array with one label per row of ``X_train``.

        Returns:
            This classifier, so calls can be chained.

        Raises:
            AssertionError: if the two arrays disagree on the number of
                samples, or if there are fewer samples than ``k``.
        """
        assert X_train.shape[0] == y_train.shape[0], "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], "the size of X_train must be at least k"
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        Args:
            X_predict: 2-D array with one query sample per row and the same
                number of columns as the training data.

        Returns:
            A NumPy array holding one predicted label per query row.

        Raises:
            AssertionError: if ``fit`` has not been called, or the feature
                counts of ``X_predict`` and the training data differ.
        """
        assert self._X_train is not None and self._y_train is not None, "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], "the feature number of X_predict must be equal to X_train"
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Return the majority label among the ``k`` training samples closest to ``x``."""
        # Euclidean distance from x to every training row in one vectorised step.
        distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))
        # Only the k closest samples vote; without this slice every training
        # sample would vote and k would have no effect on the result.
        nearest = np.argsort(distances)[:self.k]
        votes = Counter(self._y_train[i] for i in nearest)
        return votes.most_common(1)[0][0]

    def __repr__(self):
        """Return a short description that includes the configured ``k``."""
        return "kNN(k=%d)" % self.k
# Rows of raw_data_X are feature pairs; raw_data_y gives each row's label
# (0 = benign, 1 = malignant).
raw_data_X = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343853454, 3.368312451],
    [3.582294121, 4.679917921],
    [2.280362211, 2.866990212],
    [7.423436752, 4.685324231],
    [5.745231231, 3.532131321],
    [9.172112222, 2.511113104],
    [7.927841231, 3.421455345],
    [7.939831414, 0.791631213],
]
raw_data_y = [0] * 5 + [1] * 5
X_train, y_train = np.array(raw_data_X), np.array(raw_data_y)

# The single query point to classify.
x = np.array([8.90933607318, 3.365731514])

knn_clf = kNNClassifier(k=6)
knn_clf.fit(X_train, y_train)

# predict works on a 2-D batch, so wrap the query as a one-row matrix.
X_predict = x[np.newaxis, :]
y_predict = knn_clf.predict(X_predict)
print(y_predict)
使用sk-learn
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Same toy data set as before: two features per sample, binary labels.
raw_data_X = [
    [3.393533211, 2.331273381],
    [3.110073483, 1.781539638],
    [1.343853454, 3.368312451],
    [3.582294121, 4.679917921],
    [2.280362211, 2.866990212],
    [7.423436752, 4.685324231],
    [5.745231231, 3.532131321],
    [9.172112222, 2.511113104],
    [7.927841231, 3.421455345],
    [7.939831414, 0.791631213],
]
raw_data_y = [0] * 5 + [1] * 5
X_train, y_train = np.array(raw_data_X), np.array(raw_data_y)

# Build the scikit-learn classifier that votes among the 6 nearest neighbours.
kNN_classifier = KNeighborsClassifier(n_neighbors=6)
# Fit on the training data; the fitted state is kept on kNN_classifier itself.
kNN_classifier.fit(X_train, y_train)

# predict expects a 2-D array of samples. reshape(1, -1) turns the single
# point into one row and lets NumPy infer the number of columns.
x = np.array([8.90933607318, 3.365731514])
y_predict = kNN_classifier.predict(x.reshape(1, -1))
print(y_predict)