1 KNN算法的原理介绍
- 优点
- 原理案例介绍
假设现在设计一个程序判断一个新的肿瘤病人是良性肿瘤还是恶性肿瘤。
先基于原有的肿瘤病人的发现时间和肿瘤大小(特征)对应的良性/恶性(值)建立了一张散点图,横坐标是肿瘤大小,纵坐标是发现时间,红色代表良性,蓝色代表恶性,现在要预测的病人的颜色为绿色。
首先需要取一个k值(这个k值的取法后面会介绍),然后找到与要预测的病人的点(绿点)距离最近的k个点。
然后用第一步中取到的k个点(本例中k=3)进行投票,比如本例中投票结果就是
蓝:红 = 3:0
,3>0,所以判断这个新病人患的是恶性肿瘤。
- 本质
如果一个样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中的大多数属于某一个类别,则该样本也属于这个类别。
2. KNN算法的一个简单实现
import numpy as np
import matplotlib.pyplot as plt
原始集合
# 特征
raw_data_x= [[3.393533211,2.331273381],
[2.110073483,1.781539638],
[1.343808831,3.368360954],
[3.582294042,4.679179110],
[2.280362439,2.866990263],
[7.423436942,4.696522875],
[5.745051997,3.533989803],
[9.172168622,2.511101045],
[7.792783481,3.424088941],
[7.939820817,0.791637231]
]
# 所属类别
raw_data_y = [0,0,0,0,0,1,1,1,1,1]
训练集合
X_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
# 要预测的点
x = np.array([8.093607318,3.365731514])
绘制数据集及要预测的点
plt.scatter(X_train[y_train==0,0],X_train[y_train==0,1],color='g')
plt.scatter(X_train[y_train==1,0],X_train[y_train==1,1],color='r')
plt.scatter(x[0],x[1],color='b')
<matplotlib.collections.PathCollection at 0x11addb908>
KNN 实现过程简单编码
from math import sqrt
distances = []
for x_train in X_train:
    # Euclidean distance between this training sample and the query point x:
    # square the per-feature differences, add them up, take the square root.
    d = sqrt(np.square(x_train - x).sum())
    distances.append(d)
distances
[4.812566907609877,
6.189696362066091,
6.749798999160064,
4.6986266144110695,
5.83460014556857,
1.4900114024329525,
2.354574897431513,
1.3761132675144652,
0.3064319992975,
2.5786840957478887]
# 列表推导式(与上面的for循环等价)
distances = [sqrt(np.sum((x_train - x)**2)) for x_train in X_train]
distances
[4.812566907609877,
6.189696362066091,
6.749798999160064,
4.6986266144110695,
5.83460014556857,
1.4900114024329525,
2.354574897431513,
1.3761132675144652,
0.3064319992975,
2.5786840957478887]
# 返回排序后的结果的索引,也就是距离测试点距离最近的点的排序坐标数组
nearset = np.argsort(distances)
k = 6
投票
# 求出距离测试点最近的6个点的类别
topK_y = [y_train[i] for i in nearset[:k]]
topK_y
[1, 1, 1, 1, 1, 0]
# collections的Counter方法可以求出一个数组的相同元素的个数,返回一个dict【key=元素名,value=元素个数】
from collections import Counter
Counter(topK_y)
Counter({0: 1, 1: 5})
# most_common方法求出最多的元素对应的那个键值对
votes = Counter(topK_y)
votes.most_common(1)
[(1, 5)]
votes.most_common(1)[0][0]
1
predict_y = votes.most_common(1)[0][0]
predict_y
1
将KNN算法封装成函数
import numpy as np
from math import sqrt
from collections import Counter
def kNN_classify(k, X_train, y_train, x):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. No model is fitted; the training data is
    searched directly on every call.

    Parameters
    ----------
    k : int
        Number of neighbours that vote. Must satisfy
        ``1 <= k <= number of training samples``.
    X_train : array-like, 2-D
        Training features, one row per sample.
    y_train : array-like
        Training labels, one per row of ``X_train``.
    x : array-like, 1-D
        Feature vector of the sample to classify.

    Returns
    -------
    The label that occurs most often among the k nearest training samples.
    When several labels share the top count, the one met first while walking
    the neighbours from nearest to farthest wins.

    Raises
    ------
    AssertionError
        If any argument fails the checks above. The checks are raised
        explicitly rather than written as ``assert`` statements so they still
        run under ``python -O``, while callers that already catch
        ``AssertionError`` keep working.
    """
    # Accept plain lists as well as ndarrays.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    x = np.asarray(x)

    # Guard against silent broadcasting: a 2-D x or 1-D X_train would still
    # produce numbers below, just not the intended distances.
    if X_train.ndim != 2:
        raise AssertionError("X_train must be a 2-D array")
    if x.ndim != 1:
        raise AssertionError("x must be a 1-D array")
    if not 1 <= k <= X_train.shape[0]:
        raise AssertionError("k must be valid")
    if X_train.shape[0] != y_train.shape[0]:
        raise AssertionError(
            "the size of X_train must equal to the size of y_train")
    if X_train.shape[1] != x.shape[0]:
        raise AssertionError(
            "the feature number of x must be equal to X_train")

    # One vectorised pass instead of a Python-level loop over the rows.
    distances = np.sqrt(np.sum((X_train - x) ** 2, axis=1))

    # A stable sort keeps equidistant neighbours in training-set order, so the
    # result does not depend on the sorting algorithm numpy happens to pick.
    nearest = np.argsort(distances, kind="stable")

    votes = Counter(y_train[nearest[:k]])
    return votes.most_common(1)[0][0]
# 特征
raw_data_x= [[3.393533211,2.331273381],
[2.110073483,1.781539638],
[1.343808831,3.368360954],
[3.582294042,4.679179110],
[2.280362439,2.866990263],
[7.423436942,4.696522875],
[5.745051997,3.533989803],
[9.172168622,2.511101045],
[7.792783481,3.424088941],
[7.939820817,0.791637231]
]
# 所属类别
raw_data_y = [0,0,0,0,0,1,1,1,1,1]
X_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
# 要预测的点
x = np.array([8.093607318,3.365731514])
predict = kNN_classify(6,X_train,y_train,x)
print(predict)
3. 机器学习套路
可以说kNN是一个不需要训练过程的算法。k近邻算法是非常特殊的,可以被认为是没有模型的算法。为了和其他算法统一,可以认为训练数据集就是模型。
使用scikit-learn中的kNN
from sklearn.neighbors import KNeighborsClassifier
kNN_classifier = KNeighborsClassifier(n_neighbors=6)
kNN_classifier.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=6, p=2,
weights='uniform')
kNN_classifier.predict(x)
/Users/yuanzhang/anaconda/lib/python3.6/site-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
DeprecationWarning)
array([1])
X_predict = x.reshape(1, -1)
X_predict
array([[ 8.09360732, 3.36573151]])
kNN_classifier.predict(X_predict)
array([1])
y_predict = kNN_classifier.predict(X_predict)
y_predict[0]
1
完整代码
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
kNN_classifier = KNeighborsClassifier(n_neighbors=6)
# 特征
raw_data_x= [[3.393533211,2.331273381],
[2.110073483,1.781539638],
[1.343808831,3.368360954],
[3.582294042,4.679179110],
[2.280362439,2.866990263],
[7.423436942,4.696522875],
[5.745051997,3.533989803],
[9.172168622,2.511101045],
[7.792783481,3.424088941],
[7.939820817,0.791637231]
]
# 所属类别
raw_data_y = [0,0,0,0,0,1,1,1,1,1]
X_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
# 要预测的点
x = np.array([8.093607318,3.365731514])
# "Fit" the classifier; for kNN this just hands it the training data.
kNN_classifier.fit(X_train, y_train)
# predict() wants a 2-D array with one row per sample, so the single query
# point is reshaped from (n_features,) to (1, n_features).
X_predict = x.reshape(1, -1)
# Predict once and keep the result (the earlier duplicate call whose return
# value was thrown away has been removed).
y_predict = kNN_classifier.predict(X_predict)
print(y_predict[0])
4. 判断机器学习算法的性能
train test split
使用sklearn的分割函数分割训练集并测试
from sklearn import datasets
from sklearn.model_selection import train_test_split

# X and y were used here without being defined anywhere in the visible text,
# which raises NameError.
# NOTE(review): the shapes printed below (112 + 38 samples, 4 features) and
# the labels 0/1/2 match scikit-learn's iris dataset, so it is loaded here.
# Confirm this is the dataset the walkthrough intended.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# No test_size / random_state is passed, so the split uses the library
# defaults and is reshuffled on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
(112, 4)
(112,)
(38, 4)
(38,)
from sklearn.neighbors import KNeighborsClassifier
sklearn_knn_clf = KNeighborsClassifier(n_neighbors=6)
sklearn_knn_clf.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=6, p=2,
weights='uniform')
y_predict = sklearn_knn_clf.predict(X_test)
y_predict
array([2, 2, 2, 1, 0, 0, 2, 2, 2, 1, 1, 0, 1, 1, 2, 2, 2, 2, 0, 0, 1, 2,
0, 2, 0, 2, 1, 1, 2, 1, 1, 1, 2, 0, 1, 2, 2, 2])
y_test
array([2, 2, 2, 1, 0, 0, 2, 2, 2, 2, 1, 0, 1, 1, 2, 2, 2, 2, 0, 0, 1, 2,
0, 2, 0, 2, 1, 1, 2, 1, 1, 1, 2, 0, 1, 2, 1, 2])
# Accuracy: the fraction of test samples whose predicted label matches the
# true label (the mean of the element-wise boolean comparison).
np.mean(y_predict == y_test)
0.9473684210526315