Backpropagation Implementation

The backpropagation algorithm as implemented in the sklearn source code:
sklearn/neural_network/multilayer_perceptron.py

Parameter initialization

n_samples, n_features = X.shape  # number of samples, input feature dimension
n_outputs_ = y.shape[1]  # output dimension
layer_units = ([n_features] + hidden_layer_sizes + [self.n_outputs_])  # number of units in each layer; its length is the total number of layers

# Initialize coefficient and intercept layers
self.coefs_ = []
self.intercepts_ = []

for i in range(self.n_layers_ - 1):
    coef_init, intercept_init = self._init_coef(layer_units[i], layer_units[i + 1])
    self.coefs_.append(coef_init)
    self.intercepts_.append(intercept_init)

# Initialize lists
activations = [X] # the first layer's activations are simply the input features
activations.extend(np.empty((batch_size, n_fan_out))
                   for n_fan_out in layer_units[1:])
deltas = [np.empty_like(a_layer) for a_layer in activations]

coef_grads = [np.empty((n_fan_in_, n_fan_out_)) for n_fan_in_,
              n_fan_out_ in zip(layer_units[:-1],
                                layer_units[1:])]

intercept_grads = [np.empty(n_fan_out_) for n_fan_out_ in
                   layer_units[1:]]
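
For concreteness, here is a minimal sketch (not sklearn source) of what layer_units, coefs_ and intercepts_ end up looking like, assuming a hypothetical network with 4 input features, hidden_layer_sizes=(5,) and 3 outputs:

import numpy as np

# hypothetical configuration: 4 input features, one hidden layer of 5 units, 3 outputs
layer_units = [4] + [5] + [3]          # [4, 5, 3]

# one weight matrix and one bias vector per pair of adjacent layers
coefs = [np.zeros((n_in, n_out))
         for n_in, n_out in zip(layer_units[:-1], layer_units[1:])]
intercepts = [np.zeros(n_out) for n_out in layer_units[1:]]

print([w.shape for w in coefs])        # [(4, 5), (5, 3)]
print([b.shape for b in intercepts])   # [(5,), (3,)]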

Parameter definitions

    def _backprop(self, X, y, activations, deltas, coef_grads,
                  intercept_grads):
        """Compute the MLP loss function and its corresponding derivatives
        with respect to each parameter: weights and bias vectors.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,)
            The target values.

        activations : list, length = n_layers - 1
             The ith element of the list holds the values of the ith layer.

        deltas : list, length = n_layers - 1
            The ith element of the list holds the difference between the
            activations of the i + 1 layer and the backpropagated error.
            More specifically, deltas are gradients of loss with respect to z
            in each layer, where z = wx + b is the value of a particular layer
            before passing through the activation function

        coef_grad : list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            coefficient parameters of the ith layer in an iteration.

        intercept_grads : list, length = n_layers - 1
            The ith element contains the amount of change used to update the
            intercept parameters of the ith layer in an iteration.

        Returns
        -------
        loss : float
        coef_grads : list, length = n_layers - 1
        intercept_grads : list, length = n_layers - 1
        """

Inputs
X: features, shape (n_samples, n_features)
y: targets, shape (n_samples,)
activations[i]: activations of the ith layer of the network
deltas[i]: error terms of the neurons in layer i + 1
coef_grad[i]: gradient of the weights of the ith layer
intercept_grads[i]: gradient of the bias terms of the ith layer
Outputs
loss, coef_grads, intercept_grads

        n_samples = X.shape[0]  # number of samples in this iteration (batch)

        # Forward propagate
        # Forward pass to obtain the activations of every layer
        activations = self._forward_pass(activations)

        # Get loss
        loss_func_name = self.loss
        if loss_func_name == 'log_loss' and self.out_activation_ == 'logistic':
            loss_func_name = 'binary_log_loss'
        # Compute the loss from the predicted and true y
        loss = LOSS_FUNCTIONS[loss_func_name](y, activations[-1])
        # Add L2 regularization term to loss
        # Add the L2 penalty on the weights to the loss
        values = np.sum(
            np.array([np.dot(s.ravel(), s.ravel()) for s in self.coefs_]))
        loss += (0.5 * self.alpha) * values / n_samples

        # Backward propagate
        # Index of the output layer in the deltas list
        last = self.n_layers_ - 2

        # The calculation of delta[last] here works with following
        # combinations of output activation and loss function:
        # sigmoid and binary cross entropy, softmax and categorical cross
        # entropy, and identity with squared loss
        # Compute the output-layer error; e.g. with identity activation and squared loss this is prediction minus target
        deltas[last] = activations[-1] - y

        # Compute gradient for the last layer
        # Compute the output-layer parameter gradients from the activations and deltas
        coef_grads, intercept_grads = self._compute_loss_grad(
            last, n_samples, activations, deltas, coef_grads, intercept_grads)

        # Iterate over the hidden layers
        # Backpropagate the error through the hidden layers
        for i in range(self.n_layers_ - 2, 0, -1):
            deltas[i - 1] = safe_sparse_dot(deltas[i], self.coefs_[i].T)
            inplace_derivative = DERIVATIVES[self.activation]
            # Apply the derivative of the hidden activation to obtain this layer's error
            inplace_derivative(activations[i], deltas[i - 1])
            # Compute this layer's parameter gradients from the activations and deltas
            coef_grads, intercept_grads = self._compute_loss_grad(
                i - 1, n_samples, activations, deltas, coef_grads,
                intercept_grads)

        # Return each layer's parameter gradients to the optimizer for the parameter update
        return loss, coef_grads, intercept_grads
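
To convince yourself that this backward pass really computes the gradient of the loss, the following self-contained NumPy sketch (not sklearn code; all names are made up for illustration) mirrors its structure for a one-hidden-layer relu network with identity output and squared loss, and compares one analytic gradient entry against a central finite difference:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(8, 4)                  # 8 samples, 4 features
y = rng.randn(8, 3)                  # 3 regression targets
W1, b1 = rng.randn(4, 5), rng.randn(5)
W2, b2 = rng.randn(5, 3), rng.randn(3)

def forward(X):
    a1 = np.maximum(X @ W1 + b1, 0)  # relu hidden layer
    out = a1 @ W2 + b2               # identity output activation
    return a1, out

def loss_fn(out):
    # squared loss, summed over outputs and averaged over samples,
    # matching the 1 / n_samples scaling used in _compute_loss_grad
    return 0.5 * ((out - y) ** 2).sum() / X.shape[0]

# analytic gradients, mirroring the structure of _backprop (without the L2 term)
a1, out = forward(X)
n = X.shape[0]
delta_last = out - y                 # deltas[last] = activations[-1] - y
grad_W2 = a1.T @ delta_last / n      # output-layer weight gradient
delta_hidden = delta_last @ W2.T     # backpropagate the error
delta_hidden[a1 == 0] = 0            # relu derivative, as in inplace_relu_derivative
grad_W1 = X.T @ delta_hidden / n     # hidden-layer weight gradient

# central finite-difference check on one entry of W1
eps = 1e-6
W1[0, 0] += eps
loss_plus = loss_fn(forward(X)[1])
W1[0, 0] -= 2 * eps
loss_minus = loss_fn(forward(X)[1])
W1[0, 0] += eps                      # restore the original value
print(grad_W1[0, 0], (loss_plus - loss_minus) / (2 * eps))  # the two values should agree closely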

Computing the parameter gradients

    def _compute_loss_grad(self, layer, n_samples, activations, deltas,
                           coef_grads, intercept_grads):
        """Compute the gradient of loss with respect to coefs and intercept for
        specified layer.

        This function does backpropagation for the specified one layer.
        """
        # Gradient of the weights
        coef_grads[layer] = safe_sparse_dot(activations[layer].T,
                                            deltas[layer])
        # Add the gradient of the L2 penalty
        coef_grads[layer] += (self.alpha * self.coefs_[layer])
        coef_grads[layer] /= n_samples

        intercept_grads[layer] = np.mean(deltas[layer], 0)

        return coef_grads, intercept_grads
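
The shapes line up as follows; a minimal NumPy sketch with made-up dimensions (none of these variables come from sklearn):

import numpy as np

n_samples, n_in, n_out = 8, 5, 3
activation = np.random.randn(n_samples, n_in)   # activations[layer]
delta = np.random.randn(n_samples, n_out)       # deltas[layer]
coef = np.random.randn(n_in, n_out)             # self.coefs_[layer]
alpha = 1e-4

coef_grad = (activation.T @ delta + alpha * coef) / n_samples
intercept_grad = delta.mean(axis=0)

print(coef_grad.shape)       # (5, 3) -- same shape as the weight matrix
print(intercept_grad.shape)  # (3,)   -- same shape as the bias vector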

Forward propagation

    def _forward_pass(self, activations):
        """Perform a forward pass on the network by computing the values
        of the neurons in the hidden layers and the output layer.

        Parameters
        ----------
        activations : list, length = n_layers - 1
            The ith element of the list holds the values of the ith layer.

        with_output_activation : bool, default True
            If True, the output passes through the output activation
            function, which is either the softmax function or the
            logistic function
        """
        # Hidden-layer activation function
        hidden_activation = ACTIVATIONS[self.activation]
        # Iterate over the hidden layers
        for i in range(self.n_layers_ - 1):
            activations[i + 1] = safe_sparse_dot(activations[i],
                                                 self.coefs_[i])
            activations[i + 1] += self.intercepts_[i]

            # For the hidden layers
            if (i + 1) != (self.n_layers_ - 1):
                activations[i + 1] = hidden_activation(activations[i + 1])

        # For the last layer
        output_activation = ACTIVATIONS[self.out_activation_]
        activations[i + 1] = output_activation(activations[i + 1])
        # Return the activations of every layer of the network
        return activations
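
For reference, here is a minimal standalone version of the same loop (plain NumPy, hypothetical weights; not sklearn internals): an affine transform per layer, the hidden activation on every layer except the last, and the output activation on the last.

import numpy as np

def forward_pass(X, coefs, intercepts, hidden_activation=np.tanh,
                 output_activation=lambda z: z):
    """Sketch of the loop above: an affine transform followed by an activation per layer."""
    activations = [X]
    n_layers = len(coefs) + 1
    for i in range(n_layers - 1):
        z = activations[i] @ coefs[i] + intercepts[i]
        if i + 1 != n_layers - 1:
            activations.append(hidden_activation(z))   # hidden layer
        else:
            activations.append(output_activation(z))   # output layer
    return activations

# hypothetical 4 -> 5 -> 3 network
rng = np.random.RandomState(0)
coefs = [rng.randn(4, 5), rng.randn(5, 3)]
intercepts = [rng.randn(5), rng.randn(3)]
acts = forward_pass(rng.randn(2, 4), coefs, intercepts)
print([a.shape for a in acts])   # [(2, 4), (2, 5), (2, 3)]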

sklearn/neural_network/_base.py
Activation functions
ACTIVATIONS = {'identity': identity, 'tanh': tanh, 'logistic': logistic, 'relu': relu, 'softmax': softmax}

def identity(X):
    """Simply return the input array. 

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Same as the input data.
    """
    # Equivalent to having no activation function
    return X

def logistic(X):
    """Compute the logistic function inplace.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data.

    Returns
    -------
    X_new : {array-like, sparse matrix}, shape (n_samples, n_features)
        The transformed data.
    """
    # The sigmoid activation function
    return logistic_sigmoid(X, out=X)
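
Here logistic_sigmoid refers to SciPy's numerically stable expit, imported under that name at the top of _base.py. A quick check on made-up values:

import numpy as np
from scipy.special import expit as logistic_sigmoid

x = np.array([-2.0, 0.0, 2.0])
print(logistic_sigmoid(x))        # approximately [0.1192 0.5    0.8808]
print(1.0 / (1.0 + np.exp(-x)))   # same values, computed naively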

def relu(X):
    """Compute the rectified linear unit function inplace.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        The input data.

    Returns
    -------
    X_new : {array-like, sparse matrix}, shape (n_samples, n_features)
        The transformed data.
    """
    # Clip X to [0, max]: values below 0 become 0, the rest are unchanged
    np.clip(X, 0, np.finfo(X.dtype).max, out=X)
    return X
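
A quick demonstration of the in-place clip on made-up values:

import numpy as np

X = np.array([-1.5, 0.0, 2.0, -0.3])
np.clip(X, 0, np.finfo(X.dtype).max, out=X)   # in-place relu
print(X)   # [0. 0. 2. 0.]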

DERIVATIVES = {'identity': inplace_identity_derivative,
               'tanh': inplace_tanh_derivative,
               'logistic': inplace_logistic_derivative,
               'relu': inplace_relu_derivative}

def inplace_relu_derivative(Z, delta):
    """Apply the derivative of the relu function.

    It exploits the fact that the derivative is a simple function of the output
    value from rectified linear units activation function.

    Parameters
    ----------
    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
        The data which was output from the rectified linear units activation
        function during the forward pass.

    delta : {array-like}, shape (n_samples, n_features)
         The backpropagated error signal to be modified inplace.
    """
    delta[Z == 0] = 0

This is called with Z = activations[i], the ith layer's activations, and deltas[i - 1], the error being computed for layer i - 1.
Where Z = relu(X) = 0 we have X <= 0, the relu derivative is 0, and the error is set to 0;
where Z > 0 the relu derivative is 1 and the error passes through unchanged.
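
A tiny made-up example of the operation:

import numpy as np

Z = np.array([[0.0, 1.2, 0.0, 3.4]])       # relu outputs from the forward pass
delta = np.array([[0.5, -0.7, 0.1, 0.2]])  # backpropagated error
delta[Z == 0] = 0                          # the same operation as inplace_relu_derivative
print(delta)                               # [[ 0.  -0.7  0.   0.2]]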

def inplace_logistic_derivative(Z, delta):
    """Apply the derivative of the logistic sigmoid function.

    It exploits the fact that the derivative is a simple function of the output
    value from logistic function.

    Parameters
    ----------
    Z : {array-like, sparse matrix}, shape (n_samples, n_features)
        The data which was output from the logistic activation function during
        the forward pass.

    delta : {array-like}, shape (n_samples, n_features)
         The backpropagated error signal to be modified inplace.
    """
    delta *= Z
    delta *= (1 - Z)

This is called with Z = sigmoid(X), the activations; since the derivative of sigmoid(X) is sigmoid(X)(1 - sigmoid(X)) = Z(1 - Z), the error is multiplied by Z(1 - Z).
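
A quick numerical check of this identity on made-up values:

import numpy as np

x = np.array([-1.0, 0.5, 2.0])
Z = 1.0 / (1.0 + np.exp(-x))   # sigmoid(x)
analytic = Z * (1 - Z)         # sigmoid'(x) = Z(1 - Z)

eps = 1e-6
numeric = (1 / (1 + np.exp(-(x + eps))) - 1 / (1 + np.exp(-(x - eps)))) / (2 * eps)
print(np.allclose(analytic, numeric))   # True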

LOSS_FUNCTIONS = {'squared_loss': squared_loss, 'log_loss': log_loss, 'binary_log_loss': binary_log_loss}

def binary_log_loss(y_true, y_prob):
    """Compute binary logistic loss for classification.

    This is identical to log_loss in binary classification case,
    but is kept for its use in multilabel case.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels.

    y_prob : array-like of float, shape = (n_samples, n_classes)
        Predicted probabilities, as returned by a classifier's
        predict_proba method.

    Returns
    -------
    loss : float
        The degree to which the samples are correctly predicted.
    """
    y_prob = np.clip(y_prob, 1e-10, 1 - 1e-10)

    return -np.sum(y_true * np.log(y_prob) +
                   (1 - y_true) * np.log(1 - y_prob)) / y_prob.shape[0]
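
A small usage example on made-up labels and predicted probabilities; the clipping keeps the logarithm away from 0:

import numpy as np

y_true = np.array([[1.0], [0.0], [1.0]])
y_prob = np.array([[0.9], [0.2], [0.6]])

y_prob = np.clip(y_prob, 1e-10, 1 - 1e-10)
loss = -np.sum(y_true * np.log(y_prob) +
               (1 - y_true) * np.log(1 - y_prob)) / y_prob.shape[0]
print(loss)   # about 0.28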