Normalization (rescale x and y into the same range, like turning an ellipse into a circle)
# Demonstrate the effect of feature scaling
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.svm import SVC
import matplotlib.pyplot as plt
X, Y = make_classification(n_samples=300, n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1, scale=100)  # scale=100 makes the raw feature values large
# plt.scatter(X[:, 0], X[:, 1], c=Y)  # uncomment to see the raw, unscaled data
# plt.show()
X = preprocessing.scale(X)  # standardize each feature to zero mean and unit variance
# plt.scatter(X[:, 0], X[:, 1], c=Y)  # uncomment to see the scaled data
# plt.show()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)  # returns train data, test data, train labels, test labels
clf = SVC()
clf.fit(X_train, Y_train)
print(clf.score(X_test, Y_test))
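To see why scaling matters, comment out the preprocessing.scale line above: on the raw features (scale=100) the SVC score drops sharply. A minimal sketch of an alternative scaler, assuming you want every feature in a fixed range rather than zero mean / unit variance (the toy X_demo array is made up for illustration):
# minmax_scale maps each column into feature_range, here [0, 1]
from sklearn import preprocessing
import numpy as np
X_demo = np.array([[10., 2000.], [20., 3000.], [30., 4000.]])  # hypothetical two-feature data
print(preprocessing.minmax_scale(X_demo, feature_range=(0, 1)))  # both columns now span [0, 1]
print(preprocessing.scale(X_demo))  # zero mean, unit variance per column, for comparison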
Evaluating the model (Evaluation)
Cross-validation 1 (compare with the earlier sklearn iris example)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
iris = load_iris()
X = iris.data
y = iris.target
# single train/test split #
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)
knn = KNeighborsClassifier(n_neighbors=5)  # look at the 5 nearest neighbors and take a majority vote
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(knn.score(X_test, y_test))
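A single train/test split can be lucky or unlucky. A quick sketch (reusing the iris X, y and imports above; the seed values are arbitrary) shows how much the score moves with the split seed, which is what motivates cross-validation below:
# the same model, scored on five different random splits
for seed in [1, 2, 3, 4, 5]:
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=seed)
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_tr, y_tr)
    print(seed, knn.score(X_te, y_te))  # accuracy varies from split to split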
# this is cross_val_score #
from sklearn.model_selection import cross_val_score
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')  # 5-fold cross-validation, scored by accuracy
print(scores)
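cv=5 above is shorthand; for classifiers, cross_val_score defaults to stratified folds. A sketch with an explicit splitter (shuffle and random_state are assumptions, added for reproducibility) makes the folding visible and summarizes the spread:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
scores = cross_val_score(knn, X, y, cv=skf, scoring='accuracy')
print(scores.mean(), scores.std())  # mean accuracy and its fold-to-fold spread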
# this is how to use cross_val_score to choose model and configs #
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
k_range = range(1, 31)
k_scores = []
for k in k_range:  # try different neighbor counts; both too small and too large k hurt accuracy
    knn = KNeighborsClassifier(n_neighbors=k)
    # loss = -cross_val_score(knn, X, y, cv=10, scoring='neg_mean_squared_error')  # for regression problems
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')  # for classification
    k_scores.append(scores.mean())
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()
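The same search over k can be automated. A minimal GridSearchCV sketch, assuming the iris X, y and k_range above, picks the best n_neighbors by cross-validated accuracy:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': list(k_range)}  # same candidate k values as the loop above
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)  # runs the whole search above internally
print(grid.best_params_, grid.best_score_)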
Cross-validation 2 (monitoring overfitting with learning_curve)
from sklearn.model_selection import learning_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
digits = load_digits()
X = digits.data
Y = digits.target
train_sizes, train_loss, test_loss = learning_curve(
    SVC(gamma=0.001), X, Y, cv=10, scoring='neg_mean_squared_error',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1]  # record the curve at five training-set sizes
)
train_loss_mean = -np.mean(train_loss, axis=1)  # average over the CV folds (axis 1); negate neg MSE to get a positive loss
test_loss_mean = -np.mean(test_loss, axis=1)
plt.plot(train_sizes, train_loss_mean, 'o-', color='r', label='Training')
plt.plot(train_sizes, test_loss_mean, 'o-', color='g', label='Cross-validation')
plt.xlabel('Training examples')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.show()
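learning_curve returns per-fold scores, not just means, so the fold-to-fold spread can be shaded. A sketch reusing the train_sizes, train_loss and test_loss arrays above (the alpha value is an arbitrary choice):
# one standard deviation band around each mean loss curve
train_loss_std = np.std(-train_loss, axis=1)
test_loss_std = np.std(-test_loss, axis=1)
plt.fill_between(train_sizes, train_loss_mean - train_loss_std, train_loss_mean + train_loss_std, color='r', alpha=0.2)
plt.fill_between(train_sizes, test_loss_mean - test_loss_std, test_loss_mean + test_loss_std, color='g', alpha=0.2)
plt.plot(train_sizes, train_loss_mean, 'o-', color='r', label='Training')
plt.plot(train_sizes, test_loss_mean, 'o-', color='g', label='Cross-validation')
plt.xlabel('Training examples')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.show()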
Cross-validation 3: validation_curve (inspecting overfitting)
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
digits = load_digits()
X = digits.data
Y = digits.target
param_range = np.logspace(-6, -2.3, 5)  # five gamma values, log-spaced over this interval
train_loss, test_loss = validation_curve(
    SVC(), X, Y, param_name='gamma', param_range=param_range, cv=10, scoring='neg_mean_squared_error')
train_loss_mean = -np.mean(train_loss, axis=1)  # average over the CV folds (axis 1); negate neg MSE to get a positive loss
test_loss_mean = -np.mean(test_loss, axis=1)
plt.plot(param_range, train_loss_mean, 'o-', color='r', label='Training')
plt.plot(param_range, test_loss_mean, 'o-', color='g', label='Cross-validation')
plt.xlabel('gamma')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.show()
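Reading the plot off by eye isn't necessary: the best gamma is just the argmin of the cross-validation loss. A short sketch reusing the arrays above:
best_gamma = param_range[np.argmin(test_loss_mean)]  # gamma with the lowest CV loss
print(best_gamma)
clf = SVC(gamma=best_gamma)
clf.fit(X, Y)  # refit on all the data with the chosen gamma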