1. Basic Principles
A neural network (neural networks) is a machine learning model that mimics biological neural networks. A three-layer network is used below to illustrate the implementation. The model consists of an input layer, a hidden layer and an output layer: $x_i$ denotes the input of the $i$-th neuron, $a_i^{(j)}$ denotes the $i$-th neuron of layer $j$, and $\Theta^{(j)}$ denotes the weight matrix mapping layer $j$ to layer $j+1$.
From this network, the hidden-layer activations can be computed as
$$a^{(2)} = g\left(\Theta^{(1)} x\right),$$
and the output-layer activations as
$$h_\Theta(x) = a^{(3)} = g\left(\Theta^{(2)} a^{(2)}\right),$$
where bias units $x_0 = a_0^{(2)} = 1$ are prepended and the activation function is the sigmoid
$$g(z) = \frac{1}{1+e^{-z}}.$$
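As a minimal sketch of the forward pass in NumPy (the layer sizes and weights here are illustrative only, not those used in the implementation later):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# toy network: 3 inputs, 4 hidden units, 2 outputs; weights drawn at random
rng = np.random.default_rng(0)
Theta1 = rng.standard_normal((4, 3 + 1))   # maps layer 1 -> layer 2, bias column included
Theta2 = rng.standard_normal((2, 4 + 1))   # maps layer 2 -> layer 3, bias column included

x = np.array([0.5, -1.2, 3.0])
a1 = np.concatenate(([1.0], x))            # prepend bias unit x0 = 1
a2 = sigmoid(Theta1 @ a1)                  # hidden-layer activations a^(2)
a2 = np.concatenate(([1.0], a2))           # prepend bias unit a0^(2) = 1
h = sigmoid(Theta2 @ a2)                   # output h_Theta(x)
print(h)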
The regularized cost function is
$$J(\Theta) = -\frac{1}{m}\sum_{i=1}^{m}\sum_{k=1}^{K}\left[y_k^{(i)}\log\left(h_\Theta(x^{(i)})\right)_k + \left(1-y_k^{(i)}\right)\log\left(1-\left(h_\Theta(x^{(i)})\right)_k\right)\right] + \frac{\lambda}{2m}\sum_{l=1}^{L-1}\sum_{i=1}^{s_l}\sum_{j=1}^{s_{l+1}}\left(\Theta_{ji}^{(l)}\right)^2,$$
where $L$ is the number of layers, $s_l$ is the number of neurons in layer $l$, and $K$ is the number of output-layer neurons; the sum over the $K$ outputs is the main difference from the logistic regression cost function.
Through the process above, information flows forward through the network and the cost function can be computed; this is called forward propagation. To obtain the gradient $\partial J(\Theta)/\partial \Theta_{ij}^{(l)}$, error back-propagation (error BackPropagation, BP) is also required.
Taking the three-layer network as an example, let $\delta_j^{(l)}$ denote the error of the $j$-th neuron in layer $l$. Then
$$\delta^{(3)} = a^{(3)} - y, \qquad \delta^{(2)} = \left(\Theta^{(2)}\right)^{T}\delta^{(3)} \circ g'\left(z^{(2)}\right),$$
and there is no $\delta^{(1)}$, because the input carries no error. Since the sigmoid activation satisfies
$$g'(z) = g(z)\left(1 - g(z)\right),$$
the factor $g'(z^{(2)}) = a^{(2)} \circ \left(1 - a^{(2)}\right)$ can already be computed during forward propagation. The back-propagation procedure for the gradient is then: for each training sample, compute $a^{(1)}, a^{(2)}, a^{(3)}$ in the forward pass, compute the errors $\delta^{(3)}, \delta^{(2)}$ in the backward pass, accumulate
$$\Delta^{(l)} := \Delta^{(l)} + \delta^{(l+1)}\left(a^{(l)}\right)^{T},$$
and finally obtain the gradient of the cost function
$$\frac{\partial J(\Theta)}{\partial \Theta^{(l)}} = \frac{1}{m}\Delta^{(l)} + \frac{\lambda}{m}\Theta^{(l)} \quad \text{(bias column not regularized).}$$
The detailed derivation is shown in the figure below; its notation differs slightly, with $E$ denoting the mean square error (MSE).
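The per-sample recurrence above can also be written in vectorized form over all m samples at once. A minimal sketch, assuming a one-hot label matrix and weight matrices whose first column holds the bias (backprop_sketch is a name used only for this illustration; Section 2 gives the per-sample version actually used):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def backprop_sketch(Theta1, Theta2, X, Y_onehot, Lambda):
    # X: (m, n) inputs, Y_onehot: (m, K) one-hot labels
    m = X.shape[0]
    # forward pass
    a1 = np.hstack((np.ones((m, 1)), X))                 # (m, n+1), bias added
    z2 = a1 @ Theta1.T                                   # (m, hidden)
    a2 = np.hstack((np.ones((m, 1)), sigmoid(z2)))       # (m, hidden+1), bias added
    a3 = sigmoid(a2 @ Theta2.T)                          # (m, K)
    # backward pass
    delta3 = a3 - Y_onehot                               # output-layer error
    delta2 = (delta3 @ Theta2[:, 1:]) * sigmoid(z2) * (1 - sigmoid(z2))   # hidden-layer error
    # gradients with regularization (bias columns not regularized)
    Theta1_reg = np.hstack((np.zeros((Theta1.shape[0], 1)), Theta1[:, 1:]))
    Theta2_reg = np.hstack((np.zeros((Theta2.shape[0], 1)), Theta2[:, 1:]))
    grad1 = (delta2.T @ a1) / m + Lambda / m * Theta1_reg
    grad2 = (delta3.T @ a2) / m + Lambda / m * Theta2_reg
    return grad1, grad2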
2. Algorithm Implementation
2.1 Manual Implementation
1. Import packages and the data file
import numpy as np
import scipy.io as spio
import matplotlib.pyplot as plt
from scipy import optimize
import time
datafile = 'data_digits.mat'
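It can help to inspect the data file before training. A quick check, reusing the imports above (the keys 'X' and 'y' are the ones the main function below relies on; the 400 columns correspond to 20x20-pixel digit images, consistent with input_layer_size=400 used later):

data_img = spio.loadmat(datafile)
print(data_img['X'].shape)   # (m, 400): m flattened 20x20 grayscale digits
print(data_img['y'].shape)   # (m, 1): integer labels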
2. Main function, implementing the neural-network procedure
def NeuralNetworks(input_layer_size, hidden_layer_size, output_layer_size):
    # load the data and display a random subset of the digit images
    data_img = spio.loadmat(datafile)
    X = data_img['X']
    Y = data_img['y']
    m, n = X.shape
    rand_indices = np.random.randint(0, m, 1000)
    plot_data(X[rand_indices, :])
    # randomly initialize the weights and unroll them into one parameter vector
    Lambda = 1
    init_theta_1 = randInitWeights(input_layer_size, hidden_layer_size)
    init_theta_2 = randInitWeights(hidden_layer_size, output_layer_size)
    init_nn_params = np.vstack((init_theta_1.reshape(-1, 1), init_theta_2.reshape(-1, 1)))
    # minimize the cost with conjugate gradient, supplying the BP gradient
    start = time.time()
    result = optimize.fmin_cg(nnCostFunction, init_nn_params, fprime=nnGradient,
                              args=(input_layer_size, hidden_layer_size, output_layer_size, X, Y, Lambda),
                              maxiter=100)
    print('run time:', time.time() - start)
    print(result)
    # roll the optimized parameter vector back into the two weight matrices
    length = result.shape[0]
    theta_1 = result[0:hidden_layer_size * (input_layer_size + 1)].reshape(hidden_layer_size, input_layer_size + 1)
    theta_2 = result[hidden_layer_size * (input_layer_size + 1):length].reshape(output_layer_size, hidden_layer_size + 1)
    # visualize the learned weights (bias column dropped)
    plot_data(theta_1[:, 1:])
    plot_data(theta_2[:, 1:])
    # predict on the training set, report accuracy and save the predictions
    p = predict(theta_1, theta_2, X)
    print('accuracy:{:.2f}%'.format(np.mean(np.float64(p == Y.reshape(-1, 1)) * 100)))
    res = p.reshape(-1, 1)
    np.savetxt('predict.txt', res, delimiter=',')
3. Cost function, implementing the formula above
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda):
    # roll the parameter vector back into the two weight matrices
    length = nn_params.shape[0]
    theta_1 = nn_params[0:hidden_layer_size * (input_layer_size + 1)].reshape(hidden_layer_size, input_layer_size + 1)
    theta_2 = nn_params[hidden_layer_size * (input_layer_size + 1):length].reshape(num_labels, hidden_layer_size + 1)
    m = X.shape[0]
    # one-hot encode the labels: class_y[i, k] = 1 if sample i belongs to class k
    class_y = np.zeros((m, num_labels))
    for i in range(num_labels):
        class_y[:, i] = np.int32(y == i).ravel()
    # regularization term: sum of squared weights, bias columns excluded
    theta1_x = theta_1[:, 1:]
    theta2_x = theta_2[:, 1:]
    temp = np.sum(theta1_x ** 2) + np.sum(theta2_x ** 2)
    # forward propagation
    a1 = np.hstack((np.ones((m, 1)), X))
    z2 = np.dot(a1, np.transpose(theta_1))
    a2 = sigmoid(z2)
    a2 = np.hstack((np.ones((m, 1)), a2))
    z3 = np.dot(a2, np.transpose(theta_2))
    h = sigmoid(z3)
    # cross-entropy cost plus the regularization term
    J = -(np.dot(np.transpose(class_y.reshape(-1, 1)), np.log(h.reshape(-1, 1))) +
          np.dot(np.transpose(1 - class_y.reshape(-1, 1)), np.log(1 - h.reshape(-1, 1)))) / m + temp * Lambda / 2 / m
    return np.ravel(J)
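For clarity, the one-hot encoding built by the class_y loop looks like this on a toy label vector (assuming labels run from 0 to num_labels-1, as in the data used here):

import numpy as np

y_demo = np.array([[0], [2], [1]])             # three samples with labels 0, 2, 1
num_labels_demo = 3
class_y_demo = np.zeros((3, num_labels_demo))
for i in range(num_labels_demo):
    class_y_demo[:, i] = (y_demo == i).ravel()
print(class_y_demo)
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]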
4. Gradient computation via error back-propagation
def nnGradient(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda):
    # roll the parameter vector back into the two weight matrices
    length = nn_params.shape[0]
    theta_1 = nn_params[0:hidden_layer_size * (input_layer_size + 1)].reshape(hidden_layer_size, input_layer_size + 1).copy()
    theta_2 = nn_params[hidden_layer_size * (input_layer_size + 1):length].reshape(num_labels, hidden_layer_size + 1).copy()
    m = X.shape[0]
    # one-hot encode the labels
    class_y = np.zeros((m, num_labels))
    for i in range(num_labels):
        class_y[:, i] = np.int32(y == i).ravel()
    theta2_x = theta_2[:, 1:]          # output-layer weights without the bias column
    theta1_grad = np.zeros(theta_1.shape)
    theta2_grad = np.zeros(theta_2.shape)
    # forward propagation
    a1 = np.hstack((np.ones((m, 1)), X))
    z2 = np.dot(a1, np.transpose(theta_1))
    a2 = sigmoid(z2)
    a2 = np.hstack((np.ones((m, 1)), a2))
    z3 = np.dot(a2, np.transpose(theta_2))
    h = sigmoid(z3)
    # back-propagate the errors sample by sample and accumulate the gradients
    delta_3 = np.zeros((m, num_labels))
    delta_2 = np.zeros((m, hidden_layer_size))
    for i in range(m):
        delta_3[i, :] = h[i, :] - class_y[i, :]    # output-layer error
        theta2_grad += np.dot(np.transpose(delta_3[i, :].reshape(1, -1)), a2[i, :].reshape(1, -1))
        delta_2[i, :] = np.dot(delta_3[i, :].reshape(1, -1), theta2_x) * sigmoidGradient(z2[i, :])   # hidden-layer error
        theta1_grad += np.dot(np.transpose(delta_2[i, :].reshape(1, -1)), a1[i, :].reshape(1, -1))
    # regularize; the bias columns are not regularized, so zero them first
    theta_1[:, 0] = 0
    theta_2[:, 0] = 0
    grad = (np.vstack((theta1_grad.reshape(-1, 1), theta2_grad.reshape(-1, 1))) +
            Lambda * np.vstack((theta_1.reshape(-1, 1), theta_2.reshape(-1, 1)))) / m
    return np.ravel(grad)
5. Helper functions
def sigmoid(z):
    # sigmoid activation g(z) = 1 / (1 + e^(-z))
    return 1.0 / (1.0 + np.exp(-z))

def sigmoidGradient(z):
    # derivative of the sigmoid: g'(z) = g(z) * (1 - g(z))
    return sigmoid(z) * (1 - sigmoid(z))

def randInitWeights(L_in, L_out):
    # random initialization in (-epsilon, epsilon) with epsilon = sqrt(6 / (L_in + L_out))
    init_epsilon = (6.0 / (L_out + L_in)) ** 0.5
    w = np.random.rand(L_out, 1 + L_in) * 2 * init_epsilon - init_epsilon
    return w
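For the network trained below (400 inputs, 25 hidden units), this heuristic gives weights roughly in (-0.12, 0.12); a quick check:

w1 = randInitWeights(400, 25)
print(w1.shape)              # (25, 401): one row per hidden unit, plus a bias column
print(w1.min(), w1.max())    # roughly -0.119 and 0.119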
def checkGradient(Lambda=0):
    # compare the back-propagated gradient with a numerical (finite-difference) gradient
    # on a small, deterministically initialized network
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    init_theta_1 = debugInitWeights(input_layer_size, hidden_layer_size)
    init_theta_2 = debugInitWeights(hidden_layer_size, num_labels)
    X = debugInitWeights(input_layer_size - 1, m)
    y = np.transpose(np.mod(np.arange(1, m + 1), num_labels))
    y = y.reshape(-1, 1)
    nn_params = np.vstack((init_theta_1.reshape(-1, 1), init_theta_2.reshape(-1, 1)))
    bp_grad = nnGradient(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, Lambda)
    # central-difference approximation of the gradient, one parameter at a time
    num_grad = np.zeros((nn_params.shape[0]))
    step = np.zeros((nn_params.shape[0]))
    error = 1e-4
    for i in range(nn_params.shape[0]):
        step[i] = error
        loss_1 = nnCostFunction(nn_params - step.reshape(-1, 1), input_layer_size, hidden_layer_size, num_labels, X, y, Lambda)
        loss_2 = nnCostFunction(nn_params + step.reshape(-1, 1), input_layer_size, hidden_layer_size, num_labels, X, y, Lambda)
        num_grad[i] = (loss_2 - loss_1) / (2 * error)
        step[i] = 0
    # the two printed columns should agree closely if back-propagation is correct
    res = np.hstack((num_grad.reshape(-1, 1), bp_grad.reshape(-1, 1)))
    print(res)

def debugInitWeights(fan_in, fan_out):
    # deterministic "random" initialization using sin, so the gradient check is reproducible
    w = np.zeros((fan_out, fan_in + 1))
    x = np.arange(1, fan_out * (fan_in + 1) + 1)
    w = np.sin(x).reshape(w.shape) / 10
    return w
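Rather than comparing the two printed columns by eye, the check can be reduced to a single relative-error number. A small convenience sketch (gradientError is not part of the original code):

def gradientError(num_grad, bp_grad):
    # relative difference between the numerical and back-propagated gradients;
    # values on the order of 1e-9 indicate a correct back-propagation implementation
    return np.linalg.norm(num_grad - bp_grad) / np.linalg.norm(num_grad + bp_grad)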
6. Plotting and prediction functions
def plot_data(ImgData):
    # arrange the rows of ImgData (one flattened image per row) into a grid and display it
    count = 0
    m, n = ImgData.shape
    width = np.int32(np.round(np.sqrt(n)))
    height = np.int32(n / width)
    rows = np.int32(np.floor(np.sqrt(m)))
    cols = np.int32(np.ceil(m / rows))
    pad = 1
    display_array = -np.ones((pad + rows * (height + pad), pad + cols * (width + pad)))
    for i in range(rows):
        for j in range(cols):
            if count >= m:
                break
            patch = ImgData[count, :].reshape(height, width, order='F')
            display_array[pad + i * (height + pad):pad + i * (height + pad) + height,
                          pad + j * (width + pad):pad + j * (width + pad) + width] = patch
            count += 1
        if count >= m:
            break
    plt.imshow(display_array, cmap='gray')
    plt.axis('off')
    plt.show()

def predict(theta_1, theta_2, X):
    # forward-propagate X through the trained network and return the most likely class per sample
    m = X.shape[0]
    X = np.hstack((np.ones((m, 1)), X))
    h1 = sigmoid(np.dot(X, np.transpose(theta_1)))
    h1 = np.hstack((np.ones((m, 1)), h1))
    h2 = sigmoid(np.dot(h1, np.transpose(theta_2)))
    p = np.argmax(h2, axis=1).reshape(-1, 1)
    return p
7. Calling the main function
if __name__ == '__main__':
    checkGradient()               # verify the back-propagation gradient first
    NeuralNetworks(400, 25, 10)   # 20x20-pixel inputs, 25 hidden units, 10 digit classes
The results are shown below: 100 iterations, a run time of about 90 seconds, and a prediction accuracy of 98.46%.
Warning: Maximum number of iterations has been exceeded.
Current function value: 0.364905
Iterations: 100
Function evaluations: 235
Gradient evaluations: 235
run time: 89.65801429748535
accuracy:98.46%
2.2 Using the sklearn package
Import the MLPClassifier model from sklearn.neural_network and the make_moons dataset from sklearn.datasets to build a neural-network classifier; the mglearn library is used for the auxiliary plots.
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import mglearn
X,y = make_moons(n_samples=200,noise=0.25,random_state=3)
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,random_state=3)
mlp = MLPClassifier(solver='lbfgs',random_state=0,hidden_layer_sizes=[100]).fit(X_train,y_train)
mglearn.plots.plot_2d_separator(mlp,X_train,fill=True,alpha=0.3)
mglearn.discrete_scatter(X_train[:,0],X_train[:,1],y_train)
plt.xlabel('feature 0')
plt.ylabel('feature 1')
print('Accuracy on training set:{:.2f}%'.format(mlp.score(X_train,y_train)*100))
print('Accuracy on test set:{:.2f}%'.format(mlp.score(X_test,y_test)*100))
With a single hidden layer of 100 neurons, the 'lbfgs' solver and the default 'relu' activation, the results and decision boundary are:
Accuracy on training set:100.00%
Accuracy on test set:96.00%
Changing the parameters to two hidden layers of 10 neurons each, the 'lbfgs' solver and the 'logistic' (i.e. sigmoid) activation gives the following results and decision boundary:
mlp = MLPClassifier(solver='lbfgs',activation='logistic',random_state=0,hidden_layer_sizes=[10,10]).fit(X_train,y_train)
Accuracy on training set:99.33%
Accuracy on test set:92.00%
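To see how network size and regularization shape the decision boundary, one can sweep a few configurations in the spirit of the mglearn examples. The particular values of hidden_layer_sizes and alpha below are illustrative, and X_train, y_train are reused from the code above:

fig, axes = plt.subplots(2, 4, figsize=(20, 8))
for axx, n_hidden in zip(axes, [10, 100]):
    for ax, alpha_reg in zip(axx, [0.0001, 0.01, 0.1, 1]):
        mlp = MLPClassifier(solver='lbfgs', random_state=0, alpha=alpha_reg,
                            hidden_layer_sizes=[n_hidden, n_hidden]).fit(X_train, y_train)
        mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=0.3, ax=ax)
        mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=ax)
        ax.set_title('hidden=[{},{}], alpha={}'.format(n_hidden, n_hidden, alpha_reg))
plt.show()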
3. Discussion
3.1 Empirical Risk Minimization and Structural Risk Minimization
Most machine learning problems can be reduced to optimizing either the empirical risk (the loss function) or the structural risk (the loss function plus regularization).
The empirical risk is the expectation of the loss function over the data; it can be defined as the mean of the loss over a set of samples:
$$R_{\mathrm{emp}}(f) = \frac{1}{m}\sum_{i=1}^{m} L\left(y_i, f(x_i; \theta)\right).$$
Minimizing this risk yields the parameters
$$\hat{\theta} = \arg\min_{\theta}\frac{1}{m}\sum_{i=1}^{m} L\left(y_i, f(x_i; \theta)\right),$$
where $\{(x_i, y_i)\}_{i=1}^{m}$ is the sample set and $\theta$ are the parameters of the learned model.
As mentioned earlier, loss functions are usually defined from one of two perspectives:
1. error functions (Error Function, ERF), most commonly the mean squared error (MSE) and the mean absolute error (MAE);
2. the negative log-likelihood (Negative Log Likelihood, NLL), i.e. $\mathrm{NLL}(\theta) = -\sum_{i=1}^{m}\log p\left(y_i \mid x_i; \theta\right)$.
The connection between the two: minimizing the squared error is equivalent to minimizing the NLL when the residuals follow a Gaussian distribution, and minimizing the absolute error is equivalent to minimizing the NLL when the residuals follow a Laplace distribution.
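The Gaussian case can be made explicit in one line (assuming i.i.d. residuals with fixed variance $\sigma^2$):
$$-\log\prod_{i=1}^{m}\frac{1}{\sqrt{2\pi}\,\sigma}\exp\!\left(-\frac{\left(y_i - f(x_i)\right)^2}{2\sigma^2}\right) = \frac{1}{2\sigma^2}\sum_{i=1}^{m}\left(y_i - f(x_i)\right)^2 + m\log\left(\sqrt{2\pi}\,\sigma\right),$$
so for a fixed $\sigma$, minimizing the NLL is exactly minimizing the sum of squared errors; replacing the Gaussian with a Laplace density gives the absolute error in the same way.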
The empirical risk does not take into account how well the model's capacity matches the data. Minimizing the empirical risk while at the same time balancing the model's capacity against the data, so as to avoid overfitting, is the goal of structural risk minimization (Structural Risk Minimization, SRM).
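In the notation above, the structural risk simply adds a complexity penalty $J(f)$ weighted by $\lambda \ge 0$:
$$R_{\mathrm{srm}}(f) = \frac{1}{m}\sum_{i=1}^{m} L\left(y_i, f(x_i;\theta)\right) + \lambda J(f).$$
The regularized neural-network cost function in Section 1 is an instance of this, with $J(f)$ taken as the sum of squared weights (bias terms excluded).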
To establish a heart for Heaven and Earth, to secure life and destiny for the people, to carry on the lost teachings of the past sages, and to open peace for all generations to come. (Zhang Zai, the Four Sentences of Hengqu)