- Contents
- Implementing MyKNNClassifier
- Implementing my_train_test_split
- Testing our own implementation
- KNN cancer detection case study
- Inspecting results with a crosstab and classification report
- Optimization: grid-searching parameters
- Optimization: standardizing the data
We implement the k-nearest-neighbors algorithm on top of numpy arrays as a MyKNNClassifier class,
with fit() for training,
predict() for prediction,
and score() for model accuracy.
We also implement my_train_test_split, which shuffles a dataset and returns training and test sets.
Without further ado, the code:
import numpy as np
from collections import Counter
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
class MyKNNClassifier:
    def __init__(self, k=3):
        assert k >= 1, 'k must be >= 1'
        self.k = k
        self.x_train = None
        self.y_train = None

    def fit(self, x_train, y_train):
        '''
        Store the training data (KNN is a lazy learner, so there is no real training step).
        '''
        assert x_train.shape[0] == y_train.shape[0], 'x_train and y_train must have the same number of samples'
        assert self.k <= y_train.shape[0], 'k must not exceed the number of samples'
        self.x_train = x_train
        self.y_train = y_train
        return self

    def predict(self, x):
        '''
        Predict a class label for every sample in x and return them as an array.
        '''
        y_predict = [self._predict(p) for p in x]
        return np.array(y_predict)

    # private helper
    def _predict(self, x):
        # Euclidean distances from the query point x to every training point
        olist = [np.sqrt(np.sum((p - x) ** 2)) for p in self.x_train]
        # indices of the k nearest training points
        sindex = np.argsort(olist)[:self.k]
        data = self.y_train[sindex]
        # majority vote: return the most common label among the k neighbors
        return Counter(data).most_common(1)[0][0]

    # predict on x_test and compare against y_test to produce an accuracy score
    def score(self, x_test, y_test):
        y_predict = self.predict(x_test)
        return np.sum(y_test == y_predict) / y_test.shape[0]
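The majority vote in _predict leans on collections.Counter (imported above); a quick illustration of that step with made-up neighbor labels:
votes = np.array([1, 0, 1])           # labels of the 3 nearest neighbors
print(Counter(votes).most_common(1))  # [(1, 2)]: class 1 wins with 2 votes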
# our own train_test_split
def my_train_test_split(x, y, test_ratio=0.2):
    # permutation(n) returns a random permutation of 0..n-1 with no repeats
    shuffle_indexes = np.random.permutation(x.shape[0])
    # number of test samples
    test_size = int(x.shape[0] * test_ratio)
    # the first test_size shuffled indices form the test set, the rest form the training set
    test_indexes = shuffle_indexes[:test_size]
    train_indexes = shuffle_indexes[test_size:]
    # index into the data
    x_train = x[train_indexes]
    y_train = y[train_indexes]
    x_test = x[test_indexes]
    y_test = y[test_indexes]
    return x_train, x_test, y_train, y_test
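To see why shuffling indices and slicing works, here is a tiny sketch with 5 samples and test_ratio=0.2 (the permutation shown is just one possible outcome):
idx = np.random.permutation(5)          # e.g. array([3, 0, 4, 1, 2])
test_idx, train_idx = idx[:1], idx[1:]  # 1 test sample, 4 training samples
# x[test_idx] and x[train_idx] then select disjoint subsets that together cover every row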
# Compare our implementation against sklearn's
# our own implementation
# ----------------------------------------------------------------------------
iris = datasets.load_iris()
data = iris['data']
# labels
target = iris['target']
# feature subset a: sepal measurements
adata = data[:, :2]
# feature subset b: petal measurements
bdata = data[:, 2:4]
# split with our own function (either the sepal or the petal features could be used; here we use the petals)
x_train, x_test, y_train, y_test = my_train_test_split(bdata, target, test_ratio=0.3)
myknn = MyKNNClassifier(k=3)
# train
myknn.fit(x_train, y_train)
# predict
y = myknn.predict(x_test)
print('my predictions:', y)
# accuracy
score = myknn.score(x_test, y_test)
print('my score:', score)
# sklearn's implementation
print('*' * 50)
# ----------------------------------------------------------------------------
# split with sklearn's train_test_split this time
x_train, x_test, y_train, y_test = train_test_split(bdata, target, test_size=0.3)
knn = KNeighborsClassifier(n_neighbors=3)
# train
knn.fit(x_train, y_train)
# predict
y = knn.predict(x_test)
print('sklearn predictions:', y)
# accuracy
score = knn.score(x_test, y_test)
print('sklearn score:', score)
Running this prints the prediction arrays and accuracy scores for both implementations.
Testing handwritten digit recognition with our own implementation
Dataset:
datasets.load_digits()
The code:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from KNN.MyKNNClassifier import MyKNNClassifier, my_train_test_split
# 1. load the data
df = datasets.load_digits()
data = df['data']
print(data.shape)
# labels
target = df['target']
# split with our own function
x_train, x_test, y_train, y_test = my_train_test_split(data, target, test_ratio=0.3)
myknn = MyKNNClassifier(k=3)
# train
myknn.fit(x_train, y_train)
# predict
y = myknn.predict(x_test)
print('my predictions:', y)
# accuracy
score = myknn.score(x_test, y_test)
print('my score:', score)
# sklearn's implementation
print('*' * 50)
# ----------------------------------------------------------------------------
# split with sklearn's train_test_split this time
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.3)
knn = KNeighborsClassifier(n_neighbors=3)
# train
knn.fit(x_train, y_train)
# predict
y = knn.predict(x_test)
print('sklearn predictions:', y)
# accuracy
score = knn.score(x_test, y_test)
print('sklearn score:', score)
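matplotlib is imported at the top of this script but not otherwise used; one way to put it to work is to eyeball a digit image alongside its label (index 0 picked arbitrarily):
# load_digits also ships the raw 8x8 images alongside the flattened data
plt.imshow(df['images'][0], cmap=plt.cm.gray_r)
plt.title('label: {}'.format(df['target'][0]))
plt.show()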
As before, the predictions and scores print for both implementations.
KNN cancer detection case study
- Import the relevant packages (for convenience, I've folded some of my notes in as comments)
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
# GridSearchCV builds a grid-search object from a classifier and a parameter grid
# from sklearn.model_selection import GridSearchCV
# param_grid = [
#     {
#         'weights': ['uniform'],
#         'n_neighbors': [i for i in range(1, 11)]
#     },
#     {
#         # neighbor weighting
#         'weights': ['distance'],
#         'n_neighbors': [i for i in range(1, 11)],
#         'p': [i for i in range(1, 6)]
#     }
# ]
# grid_search = GridSearchCV(knn, param_grid, n_jobs=-1)
# # calling fit runs the grid search
# grid_search.fit(X_train, y_train)
# attributes ending in _ are not user-supplied parameters but results computed from them
# the best estimator found, returned as a KNeighborsClassifier object
# grid_search.best_estimator_
# from sklearn.preprocessing import StandardScaler
# standardization (z-score scaling)
# StandardScaler(copy=True, with_mean=True, with_std=True)
# X_train = standardScaler.transform(X_train)
# mean absolute error (MAE)
from sklearn.metrics import mean_absolute_error
# mean squared error (MSE)
from sklearn.metrics import mean_squared_error
# mean squared log error
from sklearn.metrics import mean_squared_log_error
# R² score ***
from sklearn.metrics import r2_score
# classification accuracy: the fraction of samples classified correctly.
# accuracy is easy to understand, but it reveals nothing about the underlying class distribution
# or about which kinds of mistakes the classifier makes.
from sklearn.metrics import accuracy_score
# (classification) confusion matrix; pandas has its own crosstab:
# pd.crosstab(index=y_test, columns=y_, rownames=['actual'], colnames=['predicted'], margins=True)
from sklearn.metrics import confusion_matrix
# (classification) classification report
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
# import matplotlib.pyplot as plt
# # without this magic command, figures open in a separate window
# %matplotlib inline
# diabetes = datasets.load_diabetes()
# X = diabetes['data']
# y = diabetes['target']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# load the data (tab-separated file)
cancer = pd.read_csv('./data/cancer.csv', sep='\t')
Let's see what the data looks like.
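A quick way to do that (the B/M labels live in the Diagnosis column):
print(cancer.shape)  # (rows, columns)
cancer.head()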
# features: every column from index 2 on; labels: the Diagnosis column
X = cancer.iloc[:, 2:]
y = cancer['Diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_ = knn.predict(X_test)
display(y_, y_test.values)
# accuracy on the test set
knn.score(X_test, y_test)
0.9035087719298246
accuracy_score(y_test,y_)
0.9035087719298246
# crosstab
pd.crosstab(index=y_test, columns=y_, rownames=['actual'], colnames=['predicted'], margins=True)
# classification report
# precision: of the samples predicted as a class, the fraction that truly belong to it (TP / (TP + FP))
# recall: of the samples truly in a class, the fraction that were predicted correctly (TP / (TP + FN))
report = classification_report(y_test, y_)
print(report)
              precision    recall  f1-score   support

           B       0.89      0.96      0.92        67
           M       0.93      0.83      0.88        47

   micro avg       0.90      0.90      0.90       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.90      0.90       114
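The confusion_matrix imported earlier can verify these numbers by hand; a sketch, treating M (malignant) as the positive class:
cm = confusion_matrix(y_test, y_, labels=['B', 'M'])
tn, fp, fn, tp = cm.ravel()             # rows = actual, columns = predicted
print('precision(M):', tp / (tp + fp))  # ~0.93, matching the report
print('recall(M):', tp / (tp + fn))     # ~0.83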
- The score is mediocre.
At this point we can tune the hyperparameters and standardize the data.
Tuning alone buys only a limited amount of accuracy,
so we also standardize the features.
- First, use grid search to find the best parameters and score
from sklearn.model_selection import GridSearchCV
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(4, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(4, 11)],
        'p': [i for i in range(1, 6)]
    }
]
grid_search = GridSearchCV(knn, param_grid, n_jobs=-1)
# calling fit runs the grid search
grid_search.fit(X_train, y_train)
# attributes ending in _ are results computed from the user's parameters, not the parameters themselves
# the best estimator found, returned as a KNeighborsClassifier object
grid_search.best_estimator_
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=1,
                     weights='distance')
grid_search.best_score_
0.9406593406593406
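The tuned estimator can then be evaluated on the held-out test set; a quick sketch (exact numbers vary with the split):
best_knn = grid_search.best_estimator_
print(grid_search.best_params_)        # e.g. {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
print(best_knn.score(X_test, y_test))  # accuracy on the 20% held-out test set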
- Then standardize the data and check the score
# standardization removes the effect of differing units and scales
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
X2 = s.fit_transform(X)
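What fit_transform does here is just column-wise standardization; a sanity-check sketch of the equivalent manual computation:
# z = (x - mean) / std, computed per column; StandardScaler uses the population std (ddof=0)
X_manual = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0)
print(np.allclose(X2, X_manual))  # expected: True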
# accuracy over 100 random splits, with standardization
score = 0
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.2)
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    y_ = knn.predict(X_test)
    score += accuracy_score(y_test, y_)
# this prints the sum of 100 accuracies; divide by 100 for the mean (~0.965 here)
print(score)
96.50877192982449
# accuracy over 100 random splits, without standardization
score = 0
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    knn = KNeighborsClassifier()
    knn.fit(X_train, y_train)
    y_ = knn.predict(X_test)
    score += accuracy_score(y_test, y_)
# the sum of 100 accuracies again (~0.929 mean): standardization clearly helps
print(score)
92.93859649122794
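The two optimizations also compose: putting the scaler and the classifier in a sklearn Pipeline lets the grid search tune KNN on standardized features in one go. A minimal sketch combining the pieces already imported, plus sklearn.pipeline.Pipeline:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scaler', StandardScaler()),
                 ('knn', KNeighborsClassifier())])
# grid parameters are addressed as <step name>__<parameter>
pipe_param_grid = {'knn__n_neighbors': list(range(4, 11)),
                   'knn__weights': ['uniform', 'distance']}
pipe_grid = GridSearchCV(pipe, pipe_param_grid, n_jobs=-1)
pipe_grid.fit(X_train, y_train)
print(pipe_grid.best_score_)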