import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
Common datasets
The forge dataset
X,y = mglearn.datasets.make_forge()
mglearn.discrete_scatter(X[:,0],X[:,1],y)
plt.legend(["Class 0","Class 1"],loc=4)
plt.xlabel("First feature")
plt.ylabel("Second feature")
print("X.shape:{}".format(X.shape))
X.shape:(26, 2)

Source of make_forge(); note that make_blobs comes from sklearn
from sklearn.datasets import make_blobs

def make_forge():
    # a carefully hand-designed dataset lol
    X, y = make_blobs(centers=2, random_state=4, n_samples=30)
    y[np.array([7, 27])] = 0
    mask = np.ones(len(X), dtype=bool)  # np.bool is deprecated; plain bool works
    mask[np.array([0, 1, 5, 26])] = 0
    X, y = X[mask], y[mask]
    return X, y
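Since make_blobs is a scikit-learn function, the raw blobs can be generated directly; a minimal sketch of the call make_forge() builds on (X_raw and y_raw are names chosen for the demo):
from sklearn.datasets import make_blobs
X_raw, y_raw = make_blobs(centers=2, random_state=4, n_samples=30)
print(X_raw.shape)  # (30, 2): before make_forge() relabels 2 points and drops 4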
The wave dataset
X,y = mglearn.datasets.make_wave(n_samples=40)
plt.plot(X,y,'o')
plt.ylim(-3,3)
plt.xlabel("Feature")
plt.ylabel("Target")

Source of make_wave()
def make_wave(n_samples=100):
    rnd = np.random.RandomState(42)
    x = rnd.uniform(-3, 3, size=n_samples)
    y_no_noise = (np.sin(4 * x) + x)
    y = (y_no_noise + rnd.normal(size=len(x))) / 2
    return x.reshape(-1, 1), y
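The target is a noisy version of sin(4x) + x, halved; since the noise has mean zero, the underlying signal is (sin(4x) + x) / 2. A quick sketch to overlay it on the scatter plot above (the variable name line is just for the demo):
line = np.linspace(-3, 3, 100)
plt.plot(line, (np.sin(4 * line) + line) / 2, '-')  # noise-free version of the make_wave() target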
The Wisconsin breast cancer dataset
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
print("cancer.key():\n{}".format(cancer.keys()))
print("Shape of cancer data:{}".format(cancer.data.shape))
print("Sample counts per class:\n{}".format({n:v for n,v in zip(cancer.target_names,np.bincount(cancer.target))}))
print("Feature names:\n{}".format(cancer.feature_names))
cancer.keys():
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
Shape of cancer data:(569, 30)
Sample counts per class:
{'malignant': 212, 'benign': 357}
Feature names:
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
'mean smoothness' 'mean compactness' 'mean concavity'
'mean concave points' 'mean symmetry' 'mean fractal dimension'
'radius error' 'texture error' 'perimeter error' 'area error'
'smoothness error' 'compactness error' 'concavity error'
'concave points error' 'symmetry error' 'fractal dimension error'
'worst radius' 'worst texture' 'worst perimeter' 'worst area'
'worst smoothness' 'worst compactness' 'worst concavity'
'worst concave points' 'worst symmetry' 'worst fractal dimension']
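Since pandas is already imported at the top, the same data can also be inspected as a table; a small sketch (the DataFrame name cancer_df is just for illustration):
cancer_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
print(cancer_df.describe().iloc[:, :3])  # summary statistics for the first three features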
The Boston housing dataset
from sklearn.datasets import load_boston
boston = load_boston()
print("Data shape:{}".format(boston.data.shape))
Data shape:(506, 13)
X,y = mglearn.datasets.load_extended_boston()
print("X.shape:{}".format(X.shape))
X.shape:(506, 104)
Source of load_extended_boston()
from sklearn.datasets import load_boston  # note: removed in scikit-learn >= 1.2
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

def load_extended_boston():
    boston = load_boston()
    # scale features to [0, 1], then add all degree-2 interaction and square terms
    X = MinMaxScaler().fit_transform(boston.data)
    X = PolynomialFeatures(degree=2, include_bias=False).fit_transform(X)
    return X, boston.target
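The jump from 13 to 104 features follows from PolynomialFeatures: with degree=2 and include_bias=False, the output keeps the 13 originals plus all products of two features including squares, i.e. 13 + C(14, 2) = 13 + 91 = 104. A quick check:
from math import comb
n_original = 13
print(n_original + comb(n_original + 1, 2))  # 13 + 91 = 104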
k-nearest neighbors (k-NN)
# one nearest neighbor
mglearn.plots.plot_knn_classification(n_neighbors=1)

Source of plot_knn_classification()
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import KNeighborsClassifier
# make_forge and discrete_scatter are mglearn's own helpers (see above)

def plot_knn_classification(n_neighbors=1):
    X, y = make_forge()
    X_test = np.array([[8.2, 3.66214339], [9.9, 3.2], [11.2, .5]])
    dist = euclidean_distances(X, X_test)
    closest = np.argsort(dist, axis=0)  # row i of column j: i-th closest training point to test point j
    for x, neighbors in zip(X_test, closest.T):
        for neighbor in neighbors[:n_neighbors]:
            # draw a line from each test point to each of its neighbors
            plt.arrow(x[0], x[1], X[neighbor, 0] - x[0],
                      X[neighbor, 1] - x[1], head_width=0, fc='k', ec='k')
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    test_points = discrete_scatter(X_test[:, 0], X_test[:, 1],
                                   clf.predict(X_test), markers="*")
    training_points = discrete_scatter(X[:, 0], X[:, 1], y)
    plt.legend(training_points + test_points,
               ["training class 0", "training class 1",
                "test pred 0", "test pred 1"])
- Classification on the forge dataset; the distance step is re-created in the sketch below
- KNeighborsClassifier is a scikit-learn estimator
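The distance computation inside plot_knn_classification() is easy to redo by hand; a minimal sketch using the X, y already created above (X_demo mirrors the hard-coded test points in the source):
from sklearn.metrics.pairwise import euclidean_distances
X_demo = np.array([[8.2, 3.66214339], [9.9, 3.2], [11.2, .5]])
dist = euclidean_distances(X, X_demo)   # shape (26, 3): one column per test point
closest = np.argsort(dist, axis=0)      # training indices sorted by distance to each test point
print(y[closest[0]])                    # the 1-nearest-neighbor label for each test point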
# three nearest neighbors
mglearn.plots.plot_knn_classification(n_neighbors=3)

# apply the k-nearest neighbors algorithm with scikit-learn
from sklearn.model_selection import train_test_split
X,y = mglearn.datasets.make_forge()
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0)
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train,y_train)
print("Test set predictions:{}".format(clf.predict(X_test)))
print("Test set accuracy:{:.2f}".format(clf.score(X_test,y_test)))
Test set predictions:[1 0 1 0 1 0 0]
Test set accuracy:0.86
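For a classifier, score() is simply mean accuracy; the same number can be checked by hand with the variables defined above:
print(np.mean(clf.predict(X_test) == y_test))  # 6 of 7 correct: 0.8571..., shown above as 0.86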
Source of KNeighborsClassifier
import numpy as np
from scipy import stats
from ..utils.extmath import weighted_mode
from .base import (
    _check_weights, _get_weights,
    NeighborsBase, KNeighborsMixin,
    RadiusNeighborsMixin, SupervisedIntegerMixin)
from ..base import ClassifierMixin
from ..utils import check_array


class KNeighborsClassifier(NeighborsBase, KNeighborsMixin,
                           SupervisedIntegerMixin, ClassifierMixin):

    def __init__(self, n_neighbors=5,
                 weights='uniform', algorithm='auto', leaf_size=30,
                 p=2, metric='minkowski', metric_params=None, n_jobs=1,
                 **kwargs):
        self._init_params(n_neighbors=n_neighbors,
                          algorithm=algorithm,
                          leaf_size=leaf_size, metric=metric, p=p,
                          metric_params=metric_params, n_jobs=n_jobs, **kwargs)
        self.weights = _check_weights(weights)

    def predict(self, X):
        X = check_array(X, accept_sparse='csr')

        neigh_dist, neigh_ind = self.kneighbors(X)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_outputs = len(classes_)
        n_samples = X.shape[0]
        weights = _get_weights(neigh_dist, self.weights)

        y_pred = np.empty((n_samples, n_outputs), dtype=classes_[0].dtype)
        for k, classes_k in enumerate(classes_):
            if weights is None:
                mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
            else:
                mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)

            mode = np.asarray(mode.ravel(), dtype=np.intp)
            y_pred[:, k] = classes_k.take(mode)

        if not self.outputs_2d_:
            y_pred = y_pred.ravel()

        return y_pred

    def predict_proba(self, X):
        X = check_array(X, accept_sparse='csr')

        neigh_dist, neigh_ind = self.kneighbors(X)

        classes_ = self.classes_
        _y = self._y
        if not self.outputs_2d_:
            _y = self._y.reshape((-1, 1))
            classes_ = [self.classes_]

        n_samples = X.shape[0]

        weights = _get_weights(neigh_dist, self.weights)
        if weights is None:
            weights = np.ones_like(neigh_ind)

        all_rows = np.arange(X.shape[0])
        probabilities = []
        for k, classes_k in enumerate(classes_):
            pred_labels = _y[:, k][neigh_ind]
            proba_k = np.zeros((n_samples, classes_k.size))

            # a simple ':' index doesn't work right
            for i, idx in enumerate(pred_labels.T):  # loop is O(n_neighbors)
                proba_k[all_rows, idx] += weights[:, i]

            # normalize 'votes' into real [0,1] probabilities
            normalizer = proba_k.sum(axis=1)[:, np.newaxis]
            normalizer[normalizer == 0.0] = 1.0
            proba_k /= normalizer

            probabilities.append(proba_k)

        if not self.outputs_2d_:
            probabilities = probabilities[0]

        return probabilities
- https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/neighbors/classification.py#L23
- https://sklearn.org/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
- https://sklearn.org/
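The heart of predict_proba above is the vote-counting loop; a toy illustration with two samples, three neighbors each, and uniform weights (the votes array is made up for the demo):
votes = np.array([[0, 0, 1],   # sample 1: neighbors vote class 0, 0, 1
                  [1, 1, 1]])  # sample 2: neighbors vote class 1, 1, 1
proba = np.zeros((2, 2))
for i, idx in enumerate(votes.T):              # same loop shape as the source above
    proba[np.arange(2), idx] += 1.0            # uniform weights: each vote counts 1
proba /= proba.sum(axis=1)[:, np.newaxis]      # normalize votes into [0, 1]
print(proba)                                   # [[0.6667 0.3333] [0. 1.]]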
# color the plane according to the predicted class
fig, axes = plt.subplots(1, 3, figsize=(10, 3))
for n_neighbors, ax in zip([1, 3, 9], axes):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=True, eps=0.5, ax=ax, alpha=.4)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{} neighbor(s)".format(n_neighbors))
    ax.set_xlabel("feature 0")
    ax.set_ylabel("feature 1")
axes[0].legend(loc=3)
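mglearn's plot_2d_separator shades the plane by evaluating the classifier on a dense grid; a minimal version of that idea, not mglearn's exact code (grid resolution and padding chosen here for illustration, clf is the last model from the loop above):
xx, yy = np.meshgrid(np.linspace(X[:, 0].min() - .5, X[:, 0].max() + .5, 200),
                     np.linspace(X[:, 1].min() - .5, X[:, 1].max() + .5, 200))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=.4)              # color each grid cell by predicted class
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)  # overlay the training points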
