Outline
1. Algorithm Intuition
2. Concept Explanations
3. Sklearn Code
Part 1 Algorithm Intuition:
- Goal: separate the classes in the training data with a "decision boundary"; a new point is classified according to which side of the boundary it falls on. Subject to keeping the error below a chosen ceiling, the margin (the distance from the support vectors to the boundary) should be as large as possible.
- Advantage: the support vectors are so important that the rest of the training data can be ignored. This is a strength of SVM: the fitted model depends only on a small number of support vectors, which keeps it compact and fast to evaluate.
Part 2 Concept Explanations:
linear kernel: handles classification problems whose decision boundary is linear.
C: the C parameter controls how much error is allowed. A large C allows little error and creates a hard margin; a small C allows more error and creates a soft margin. In other words, the smaller the penalty parameter, the more tolerant the classifier is of misclassified points.
polynomial kernel: implicitly maps points into a higher-dimensional space; for example, a degree-2 kernel maps 2-D points into 3-D.
degree: the degree of the polynomial kernel, e.g. degree=2.
rbf: the RBF (Gaussian) kernel implicitly maps points into an infinite-dimensional space.
gamma: controls how far the influence of a single training point reaches. If gamma is large, only nearby training points matter, and overfitting can occur. (A sketch illustrating C and gamma follows below.)
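A minimal sketch of how C and gamma change a fit, assuming a toy make_blobs dataset; the parameter values (C=100, C=0.01, gamma=10, gamma=0.1) are illustrative choices, not values from these notes:
from sklearn.datasets import make_blobs
from sklearn.svm import SVC

# Toy 2-class data (illustrative; any 2-D dataset works here)
X, y = make_blobs(n_samples=60, centers=2, random_state=0, cluster_std=0.8)

# Large C: hard margin, few misclassified points tolerated
hard = SVC(kernel='linear', C=100).fit(X, y)
# Small C: soft margin, more tolerant, usually more support vectors
soft = SVC(kernel='linear', C=0.01).fit(X, y)
print(len(hard.support_vectors_), len(soft.support_vectors_))

# Large gamma: very local influence per point, risk of overfitting
local = SVC(kernel='rbf', gamma=10).fit(X, y)
# Small gamma: smoother, more global decision surface
smooth = SVC(kernel='rbf', gamma=0.1).fit(X, y)
print(local.score(X, y), smooth.score(X, y))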
Part 3 Sklearn Code:
The following code plots the decision boundary, the margin, and the support vectors:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.svm import SVC

# Generate two linearly separable clusters
X, y = make_blobs(n_samples=60, centers=2, random_state=0, cluster_std=0.4)

# Fit a linear SVM
clf = SVC(kernel='linear')
clf.fit(X, y)

# Optimal boundary: SVC solves for the maximum-margin hyperplane w.x + b = 0;
# clf.coef_ holds w and clf.intercept_ holds b, so in 2-D the boundary line is
# y = -(w0/w1) * x - b/w1
x_fit = np.linspace(X[:, 0].min(), X[:, 0].max())
w = clf.coef_[0]
a = -w[0] / w[1]
y_3 = a * x_fit - clf.intercept_[0] / w[1]

# Lower edge of the margin: a parallel line through one support vector
b_down = clf.support_vectors_[0]
y_down = a * x_fit + b_down[1] - a * b_down[0]

# Upper edge of the margin: a parallel line through another support vector
b_up = clf.support_vectors_[-1]
y_up = a * x_fit + b_up[1] - a * b_up[0]

# Scatter plot of the data
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap=plt.cm.Paired)

# Decision boundary
plt.plot(x_fit, y_3, '-c')

# Shade the margin
plt.fill_between(x_fit, y_down, y_up, edgecolor='none', color='#AAAAAA', alpha=0.4)

# Circle the support vectors
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=80, facecolors='none', edgecolor='b')
plt.show()
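A follow-up check: for the maximum-margin hyperplane w.x + b = 0, the distance between the two margin lines is 2/||w||. A one-line verification using the clf fitted above:
# Width of the shaded margin band is 2 / ||w||
print(2 / np.linalg.norm(clf.coef_[0]))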
3.1 Linear kernel code:
# linear
from sklearn.svm import SVC
from graph import points, labels  # local helper module providing the dataset

# Fit a linear-kernel SVM and classify two new points
classifier = SVC(kernel='linear')
classifier.fit(points, labels)
print(classifier.predict([[3, 4], [6, 7]]))
3.2 Polynomial kernel code:
# polynomial kernel: lift the 2-D data into 3-D
from sklearn.datasets import make_circles
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
#Makes concentric circles
points, labels = make_circles(n_samples=300, factor=.2, noise=.05, random_state = 1)
#Makes training set and validation set.
training_data, validation_data, training_labels, validation_labels = train_test_split(points, labels, train_size = 0.8, test_size = 0.2, random_state = 100)
# A linear kernel cannot separate concentric circles, so this score will be poor
classifier = SVC(kernel="linear", random_state=1)
classifier.fit(training_data, training_labels)
print(classifier.score(validation_data, validation_labels))
print(training_data[0])

# Explicit degree-2 feature map: (x, y) -> (sqrt(2)*x*y, x**2, y**2)
new_training = [[2 ** 0.5 * pt[0] * pt[1], pt[0] ** 2, pt[1] ** 2] for pt in training_data]
new_validation = [[2 ** 0.5 * pt[0] * pt[1], pt[0] ** 2, pt[1] ** 2] for pt in validation_data]

# Equivalent shortcut: let the polynomial kernel do the mapping implicitly
# classifier = SVC(kernel="poly", degree=2)
# classifier.fit(training_data, training_labels)

# In the lifted 3-D space the circles are linearly separable, so the same
# linear classifier now scores well
classifier.fit(new_training, training_labels)
print(classifier.score(new_validation, validation_labels))
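Why the explicit map works: the mapped features (sqrt(2)*x1*x2, x1**2, x2**2) have dot product equal to (x.z)**2, which is the homogeneous degree-2 polynomial kernel (kernel="poly", degree=2 with gamma=1, coef0=0 in sklearn's parametrization). A quick numeric check of this equivalence; the two points below are arbitrary:
import numpy as np

# Explicit degree-2 feature map
def phi(p):
    return np.array([2 ** 0.5 * p[0] * p[1], p[0] ** 2, p[1] ** 2])

x, z = np.array([1.0, 2.0]), np.array([3.0, 0.5])
print(np.dot(phi(x), phi(z)))  # 16.0
print(np.dot(x, z) ** 2)       # 16.0 -- same value, no 3-D features materialized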
*Supplement on hyperplanes:
# A dataset like this cannot be separated by a linear classifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.svm import SVC

# Scatter plot of two concentric circles
X, y = make_circles(100, factor=.1, noise=.1, random_state=2019)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap=plt.cm.Paired)

# The best a linear kernel can do is a straight line, which fails here
clf = SVC(kernel='linear').fit(X, y)
x_fit = np.linspace(-1.5, 1.5)
w = clf.coef_[0]
a = -w[0] / w[1]
y_3 = a * x_fit - clf.intercept_[0] / w[1]
plt.plot(x_fit, y_3, '-c')
plt.show()
# We can map the 2-D (low-dimensional) data into a 3-D (high-dimensional) space,
# where a hyperplane can separate the classes. The point of the mapping is to
# exploit SVM's ability to find a separating hyperplane in the higher dimension.
from mpl_toolkits.mplot3d import Axes3D

# Map each point to a third coordinate r based on its distance from the origin
r = np.exp(-(X[:, 0] ** 2 + X[:, 1] ** 2))
ax = plt.subplot(projection='3d')
ax.scatter3D(X[:, 0], X[:, 1], r, c=y, s=50, cmap=plt.cm.Paired)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')

# A nearly horizontal plane now separates the two classes
x_1, y_1 = np.meshgrid(np.linspace(-1, 1), np.linspace(-1, 1))
z = 0.01 * x_1 + 0.01 * y_1 + 0.5
ax.plot_surface(x_1, y_1, z, alpha=0.3)
plt.show()
# In SVC, the Gaussian kernel achieves this implicitly: kernel='rbf'
# Plot the data again and fit an RBF-kernel SVM
X, y = make_circles(100, factor=.1, noise=.1, random_state=2019)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap=plt.cm.Paired)
clf = SVC(kernel='rbf')
clf.fit(X, y)

# Evaluate the decision function on a grid
ax = plt.gca()
x = np.linspace(-1, 1)
y_grid = np.linspace(-1, 1)
x_1, y_1 = np.meshgrid(x, y_grid)
P = np.zeros_like(x_1)
for i, xi in enumerate(x):
    for j, yj in enumerate(y_grid):
        # meshgrid puts x along columns, so index rows by j and columns by i
        P[j, i] = clf.decision_function([[xi, yj]])[0]

# Contours at -1, 0, 1: the two margins and the decision boundary
ax.contour(x_1, y_1, P, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=80, facecolors='none', edgecolor='b')
plt.show()
3.3 RBF kernel code:
# rbf
from data import points, labels  # local helper module providing the dataset
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold out 20% of the data for validation
training_data, validation_data, training_labels, validation_labels = train_test_split(
    points, labels, train_size=0.8, test_size=0.2, random_state=100)

# RBF kernel with a small gamma gives a relatively smooth decision surface
classifier = SVC(kernel="rbf", gamma=0.1)
classifier.fit(training_data, training_labels)
print(classifier.score(validation_data, validation_labels))
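In practice, C and gamma are tuned together rather than fixed by hand. A minimal sketch using sklearn's GridSearchCV; make_circles stands in for the local data module, and the parameter grid is an illustrative choice:
from sklearn.datasets import make_circles
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Stand-in dataset (illustrative; replace with your own points/labels)
points, labels = make_circles(n_samples=300, factor=.2, noise=.05, random_state=1)

# Cross-validated search over C (margin hardness) and gamma (kernel width)
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
search.fit(points, labels)
print(search.best_params_, search.best_score_)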