在 sklearn 库中处理 K-Means 聚类问题,用到的是 sklearn.cluster.KMeans 这个类。
案例一
- 首先我们随机创建一些二维数据作为训练集,观察在不同的k值下Calinski-Harabasz分数。
import numpy as np
import matplotlib.pyplot as plt
# In a Jupyter notebook, run `%matplotlib inline` first to render plots inline.
from sklearn import metrics
# `sklearn.datasets.samples_generator` was removed from scikit-learn;
# `make_blobs` is exported directly from `sklearn.datasets`.
from sklearn.datasets import make_blobs

# X holds the sample features and y the true cluster label of each sample.
# 1000 samples with 2 features each, drawn from 4 clusters centred at
# [-1, -1], [0, 0], [1, 1] and [2, 2], with per-cluster standard deviations
# of 0.4, 0.2, 0.2 and 0.2.
X, y = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=[[-1, -1], [0, 0], [1, 1], [2, 2]],
    cluster_std=[0.4, 0.2, 0.2, 0.2],
    random_state=9,
)

# Plot the raw, unlabelled points.
plt.scatter(X[:, 0], X[:, 1], marker='o')
plt.show()
数据如下图:
from sklearn.cluster import KMeans

# Cluster the same data with k = 2, 3 and 4. For each k, plot the points
# coloured by their predicted cluster and print the Calinski-Harabasz index
# of that clustering.
for n_clusters in (2, 3, 4):
    y_pred = KMeans(n_clusters=n_clusters, random_state=9).fit_predict(X)
    plt.scatter(X[:, 0], X[:, 1], c=y_pred)
    plt.show()
    # The Calinski-Harabasz index is computed by
    # metrics.calinski_harabasz_score; the older misspelled name
    # `calinski_harabaz_score` no longer exists in scikit-learn.
    print(metrics.calinski_harabasz_score(X, y_pred))
案例二
运用Kmeans算法实现图像压缩
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin
from sklearn.datasets import load_sample_image
from sklearn.utils import shuffle
from time import time
# Number of colors (KMeans clusters) in the quantized palette.
n_colors = 64

# Load the sample photo bundled with scikit-learn and divide by 255 so the
# pixel values are floats.
china = load_sample_image("china.jpg")
china = np.asarray(china, dtype=np.float64) / 255

# Flatten the image into a 2-D array with one row per pixel and one column
# per color channel.
# NOTE(review): `w` is the size of the array's first axis, which for a
# (rows, cols, channels) image is the row count - confirm before relying on
# the name.
original_shape = tuple(china.shape)
w, h, d = original_shape
assert d == 3
image_array = china.reshape(w * h, d)

# Fit the palette on a random sample of 1000 pixels rather than every pixel.
print("一个小样本数据拟合模型")
t0 = time()
image_array_sample = shuffle(image_array, random_state=0)[:1000]
kmeans = KMeans(n_clusters=n_colors, random_state=0)
kmeans.fit(image_array_sample)
print("完成时间 %0.3fs." % (time() - t0))

# Assign every pixel of the full image to a fitted cluster.
print("预测全图像上的颜色指数(k-均值)")
t0 = time()
labels = kmeans.predict(image_array)
print("完成时间 %0.3fs." % (time() - t0))
def recreate_image(codebook, labels, w, h):
    """Recreate the (compressed) image from a codebook and labels.

    Args:
        codebook: 2-D array of shape (n_codes, d); row ``k`` is the value
            written for every pixel labelled ``k``.
        labels: 1-D sequence of integer codebook indices in row-major pixel
            order. Only the first ``w * h`` entries are used.
        w: Size of the first axis of the output image.
        h: Size of the second axis of the output image.

    Returns:
        A float64 array of shape ``(w, h, d)`` where
        ``image[i, j] == codebook[labels[i * h + j]]``.

    Raises:
        IndexError: If ``labels`` has fewer than ``w * h`` entries, or holds
            an index that is outside the codebook.
    """
    codebook = np.asarray(codebook)
    d = codebook.shape[1]
    n_pixels = w * h
    if n_pixels == 0:
        # Nothing to look up; return an empty image of the requested shape.
        return np.zeros((w, h, d))

    flat_labels = np.asarray(labels)[:n_pixels]
    if flat_labels.shape[0] < n_pixels:
        raise IndexError(
            "labels has %d entries but %d are required"
            % (flat_labels.shape[0], n_pixels)
        )

    # A single fancy-indexing lookup replaces the per-pixel Python loop.
    # The result is cast to float64 to match an image built with np.zeros.
    image = codebook[flat_labels].reshape(w, h, d)
    return image.astype(np.float64, copy=False)
# Display the original image and its quantized version, each in its own
# figure.
quantized = recreate_image(kmeans.cluster_centers_, labels, w, h)
for fig_num, fig_title, fig_image in (
    (1, 'Original image (96,615 colors)', china),
    (2, 'Quantized (64 colors, K-Means)', quantized),
):
    plt.figure(fig_num)
    plt.clf()
    ax = plt.axes([0, 0, 1, 1])  # axes rectangle spanning the whole figure
    plt.axis('off')
    plt.title(fig_title)
    plt.imshow(fig_image)
plt.show()
实现结果如下:
小结