"backprop" in pure C/C++

上周提起我司每个C/C++工程师大概都能1、2天理解并不依赖现有数据分析库实现一个反向传播。于是自己试着手写一下,算了下大致时间 - 看懂数学具体细节大概3-4个小时(不算证明部分),实现大概3个小时,纠错大概30分钟。最后结果是对于不同的输入数据训练后,test_data误差都能收敛。

算法高度浓缩一下就是一套公式和一套算法 (ref):

backprop equations

backprop algorithm

涉及到数学的地方很少(证明部分除外) - 第三步里的计算“∇aC”(如果取C为MSE (mean square error)的话,那这一步也就变得非常简单了)和σ′的计算,手边放一本高等代数的书就可以了。矩阵乘法不优化的传统实现也是straightforward。

有点意思的部分是数学,代码实现正常的C/C++ eng就完全可以。

另 - 培根说的“写作让人精确”,受益匪浅,很多感受心理和事实细节非下笔不能仔细把握也。理解一个算法细节其实一样 - code it out!

#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <vector>
#include <random>

typedef std::vector<double> vec;
typedef std::vector<vec> matrix;
typedef std::vector<matrix> matrix_vec;
typedef std::pair<vec, vec> sample;
typedef std::vector<sample> sample_data;

class Network {
public:
  Network(const std::vector<int>& layer_sizes_, double eta_):
    layer_num(layer_sizes_.size()), layer_sizes(layer_sizes_), eta(eta_)
  {
    // Generates initial values for weights & bias, using normal distribution (0, 0.4).
    shape(weights);
    shape(bias);
    for (int l = 1; l < layer_num; ++l) {
      unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
      std::default_random_engine generator(seed);
      std::normal_distribution<double> distribution(0, 0.4);
      for (int i = 0; i < layer_sizes[l]; ++i) {
        bias[l][i] = distribution(generator);
        for (int j = 0; j < layer_sizes[l-1]; ++j)
          weights[l][i][j] = distribution(generator);
      }
    }
  }

  void train(const sample_data& data, const sample_data& test_data) {
    printf("Pre-epoch cost: %f\n", cost(test_data));
    const int mini_data_size = 50;
    int epoch = 1;
    for (auto p = data.begin(), q = data.end(); p < q; p += mini_data_size, ++epoch) {
      train_batch(p, p + mini_data_size < q ? p + mini_data_size : q);
      printf("Epoch: %d: %f\n", epoch, cost(test_data));
    }
  }

  void train_batch(sample_data::const_iterator start, sample_data::const_iterator end) {
    const int n = end - start;
    matrix_vec w_delta;
    matrix b_delta;
    shape(w_delta);
    shape(b_delta);
    for (auto i = start; i < end; ++i)
      backprop(*i, w_delta, b_delta);

    // now update weights and bias
    double t = eta / (double)n;
    for (int l = 1; l < layer_num; ++l) {
      for (int j = 0; j < layer_sizes[l]; ++j) {
        bias[l][j] -= t * b_delta[l][j];
        for (int k = 0; k < layer_sizes[l - 1]; ++k) {
          weights[l][j][k] -= t * w_delta[l][j][k];
        }
      }
    }
  }

  void print_weights_and_bias() {
    for (int l = 1; l < layer_num; ++l) {
      printf("Layer %d\n", l);
      printf("  bias:");
      for (int j = 0; j < layer_sizes[l]; ++j) {
        printf(" %f", bias[l][j]);
      }
      printf("\n");
      printf("  weights:\n");
      for (int j = 0; j < layer_sizes[l]; ++j) {
        for (int k = 0; k < layer_sizes[l - 1]; ++k)
          printf(" %8f", weights[l][j][k]);
        printf("\n");
      }
    }
  }

  // Given a single training (X, Y), calculate weights_delta & bias_delta.
  void backprop(const sample& s, matrix_vec& weights_delta, matrix& bias_delta) {
    // For convience.
    const vec& X = s.first;
    const vec& Y = s.second;
    assert(X.size() == layer_sizes[0]);
    assert(Y.size() == layer_sizes[layer_num - 1]);
    matrix z_values;
    matrix a_values;
    feed_forward(X, z_values, a_values);

    // Now compute errors at the output layer.
    matrix errors;
    shape(errors);
    const vec& aL = a_values[layer_num - 1];
    const vec& zL = z_values[layer_num - 1];
    vec& eL = errors[layer_num - 1];
    for (int n = 0; n < layer_sizes[layer_num - 1]; ++n) {
      eL[n] = (aL[n] - Y[n]) * sigmoid_prime(zL[n]);
    }

    // Back propagate errors.
    for (int l = layer_num - 2; l >= 1; --l) {
      // Transpose weights[l + 1], which is of shape:
      //   (layer_sizes[l+1] x layer_sizes[l]).
      // So the transposed twl is of shape:
      //   (layer_sizes[l] x layer_sizes[l+1].
      matrix twl;
      transpose(weights[l + 1], twl);
      // Now twl is of shape (layer_num[l], layer_num[l+1]).
      // Multiply it by shape(layer_num[l+1]), 1), result in (layer_num[l],1).
      vec result;
      multiply(twl, errors[l + 1], result);
      assert(result.size() == layer_sizes[l]);

      for (int i = 0; i < errors[l].size(); ++i)
        errors[l][i] = result[i] * sigmoid_prime(z_values[l][i]);
    }

    // Now we have error[1..L-1], a[0..L-1] ready, we are good to
    // compute gradient vector.
    for (int l = 1; l < layer_num; ++l) {
      for (int a = 0; a < layer_sizes[l]; ++a) {
        bias_delta[l][a] += errors[l][a];
        for (int b = 0; b < layer_sizes[l - 1]; ++b) {
          weights_delta[l][a][b] += a_values[l - 1][b] * errors[l][a];
        }
      }
    }
  }

protected:
  static void multiply(const matrix& m, const vec& v, vec& result) {
    assert(m[0].size() == v.size());
    result = vec(m.size());
    for (int a = 0; a < m.size(); ++a) {
      double t(0);
      for (int b = 0; b < m[a].size(); ++b) {
        t += m[a][b] * v[b];
      }
      result[a] = t;
    }
  }

  void shape(matrix& m) const {
    m = std::vector<vec>(layer_num);
    for (int l = 0; l < layer_num; ++l)
      shape(m[l], layer_sizes[l]);
  }

  void shape(matrix_vec& mv) const {
    mv = std::vector<matrix>(layer_num);
    for (int l = 1; l < layer_num; ++l) {
      shape(mv[l], layer_sizes[l], layer_sizes[l - 1]);
    }
  }

  // z_values[l][a] is the input value of layer[l], neuron a.
  // a_values[l][a] is the activation value of layer[l], neuron a.
  // In general, a_values = sigmoid(z_values).
  // They are important values used in backprop.
  void feed_forward(const vec X, matrix& z_values, matrix& a_values) const {
    shape(z_values);
    shape(a_values);
    z_values[0] = X;
    for (int i = 0; i < z_values[0].size(); ++i) {
      a_values[0][i] = sigmoid(z_values[0][i]);
    }

    for (int l = 1; l < layer_num; ++l) {
      for (int n = 0; n < layer_sizes[l]; ++n) {
        double z_sum = bias[l][n];
        for (int m = 0; m < layer_sizes[l - 1]; ++m) {
          z_sum += a_values[l - 1][m] * weights[l][n][m];
        }
        z_values[l][n] = z_sum;
        a_values[l][n] = sigmoid(z_sum);
      }
    }
  }

  double cost(const sample_data& test_data) const {
    double c = 0.0;
    std::for_each(test_data.begin(), test_data.end(),
                  [&c, this](const sample& s) {
                    c += this->cost(s);
                  });
    return c / test_data.size();
  }

  double cost(const sample& s) const {
    matrix z_values;
    matrix a_values;
    const vec& X = s.first;
    const vec& Y = s.second;
    feed_forward(X, z_values, a_values);
    double c = 0.0;
    for (int k = 0, l = layer_num - 1; k < layer_sizes[layer_num - 1]; ++k) {
      double t = (a_values[l][k] - Y[k]);
      c += t * t;
    }
    return c;
  }

  static void shape(vec& v, int n) {
    v = vec(n, 0.0f);
  }

  static void shape(matrix& m, int r, int c) {
    m = matrix(r);
    std::for_each(m.begin(), m.end(),
                  [c](vec& v) {v = vec(c, 0.0f);});
  }

  static double sigmoid(double z) {
    return 1.0 / (1 + exp(-z));
  }

  static double sigmoid_prime(double z) {
    return sigmoid(z)*(1 - sigmoid(z));
  }

  static void transpose(const matrix& m1, matrix& m2) {
    shape(m2, m1[0].size(), m1.size());
    for (int i = 0; i < m1.size(); ++i)
      for (int j = 0; j < m1[i].size(); ++j)
        m2[j][i] = m1[i][j];
  }


private:
  // How many layers does the network have?
  const int layer_num;
  // Each element is the number of neurons of each layer.  For example
  // <1,8,7> means a 3-layer network, with 1 neuron in 1st layer, 8 in
  // 2nd layer, and 7 in 3rd layer.
  const std::vector<int> layer_sizes;

  // bias[l][a] means bias of (layer l, neuron a). 1 <= l < layer_num.
  matrix bias;

  // weights[l][a][b] means: 1 <= l < layer_num;
  //   weight that connects: (layer l-1, neuron b) to (layer l, neuron a).
  // Note the sequence of a, b. "a" is the neuron on layer l, "b" is on layer l - 1.
  std::vector<matrix> weights;

  // Hyper parameter for learning.
  double eta;
};

inline static double P(double d) {
  if (d < 0) d = -d;
  return d > 1 ? 1 / d : d;
}

// A target function to be learned.
double func(const vec& X) {
  double v(0);
  std::for_each(X.begin(), X.end(),
                [&v](const double d) {
                  v += std::sin(d) / std::cos(1+d) + std::tan(d);
                });
  static std::default_random_engine generator(
      std::chrono::system_clock::now().time_since_epoch().count());
  static std::normal_distribution<double> distribution(0, 0.05);
  return P(v + distribution(generator));
}

void generate_data(sample_data& training_data, double seed, const int n) {
  training_data = sample_data(n);
  std::for_each(training_data.begin(),
                training_data.end(),
                [&seed](sample& sam) {
                  double di(seed);
                  vec input = {P(di), P(di * seed), P(di / seed), P(di / 3)};
                  double output = func(input);
                  sam = sample(input, {output});
                  seed = output;
                });
}

int main() {
  sample_data training_data, test_data;
  generate_data(training_data, 1, 50000);
  generate_data(test_data, 0.01, 200);
  Network network({(int)training_data[0].first.size(), 10, 9, 8, 7, 1}, 0.09);
  network.train(training_data, test_data);
  network.print_weights_and_bias();
  return 0;
}

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 211,884评论 6 492
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 90,347评论 3 385
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 157,435评论 0 348
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 56,509评论 1 284
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 65,611评论 6 386
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 49,837评论 1 290
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 38,987评论 3 408
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 37,730评论 0 267
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,194评论 1 303
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 36,525评论 2 327
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 38,664评论 1 340
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 34,334评论 4 330
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 39,944评论 3 313
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 30,764评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 31,997评论 1 266
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 46,389评论 2 360
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 43,554评论 2 349

推荐阅读更多精彩内容