ROIPooling代码理解（CPU）

MXNet中ROIPooling的具体实现。
代码来自https://github.com/apache/incubator-mxnet
包括前向传播的c++实现

void ROIPoolForward(out,data,bbox,max_idx,spatial_scale){...}

反向传播的c++实现

void ROIPoolBackwardAcc(in_grad,out_grad,bbox,max_idx,spatial_scale){...}

以上操作封装成Operator，并在MXNet里注册。
算子以ROIPooling为名注册；ROIPoolingProp是它的属性类，负责创建ROIPoolingOp算子实例，而不是反向传播本身。

namespace mxnet {
namespace op {

/*!
 * \brief CPU specialization of the ROIPooling operator factory.
 * \param param operator parameters handed to the ROIPoolingOp constructor
 * \param dtype runtime type flag used by the type switch to pick DType
 * \return pointer to the operator allocated with new; this function never frees it, so
 *         the caller is responsible for the allocation. Remains null if the type switch
 *         does not execute the allocation statement.
 */
template<>
Operator *CreateOp<cpu>(ROIPoolingParam param, int dtype) {
  Operator* op = nullptr;
  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
    op = new ROIPoolingOp<cpu, DType>(param);
  });
  return op;
}

// Creates the operator instance for this property. The element type passed on is the
// first entry of in_type; in_shape is not referenced directly in this body.
// NOTE(review): DO_BIND_DISPATCH is a macro defined elsewhere; presumably it picks the
// CreateOp specialization matching ctx and returns its result -- confirm against the
// macro definition.
Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                                           std::vector<int> *in_type) const {
  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
}

// Registers the ROIPoolingParam parameter struct.
DMLC_REGISTER_PARAMETER(ROIPoolingParam);

// Registers ROIPoolingProp under the operator name ROIPooling. The "..." stands for the
// description text that this excerpt leaves out.
MXNET_REGISTER_OP_PROPERTY(ROIPooling, ROIPoolingProp).describe(...)

代码中还给了具体的例子：

//4-d tensor input, shape=(batch,channel,h,w)
  x = [[[[  0.,   1.,   2.,   3.,   4.,   5.],
         [  6.,   7.,   8.,   9.,  10.,  11.],
         [ 12.,  13.,  14.,  15.,  16.,  17.],
         [ 18.,  19.,  20.,  21.,  22.,  23.],
         [ 24.,  25.,  26.,  27.,  28.,  29.],
         [ 30.,  31.,  32.,  33.,  34.,  35.],
         [ 36.,  37.,  38.,  39.,  40.,  41.],
         [ 42.,  43.,  44.,  45.,  46.,  47.]]]]

//2-d tensor bounding box, shape=(num_roi, coordinate)
 y = [[0,0,0,4,4]]

//pooled_size
 (2,2)

//ROIPooling的结果，缩放尺寸为1的情况下，bbox的坐标为[0,0,4,4]，pooling到2*2的尺寸
  ROIPooling(x, y, (2,2), 1.0) = [[[[ 14.,  16.],
                                    [ 26.,  28.]]]]

//ROIPooling的结果，缩放尺寸为0.7的情况下，坐标4*0.7=2.8四舍五入为3，bbox的坐标为[0,0,3,3]，pooling到2*2的尺寸
  ROIPooling(x, y, (2,2), 0.7) = [[[[  7.,   9.],
                                    [ 19.,  21.]]]]

完整代码和注释如下：

#include "./roi_pooling-inl.h"
#include <mshadow/base.h>
#include <mshadow/tensor.h>
#include <mshadow/packet-inl.h>
#include <mshadow/dot_engine-inl.h>
#include <cassert>

using std::max;
using std::min;
using std::floor;
using std::ceil;
//ROIPooling前向部分
namespace mshadow {
template<typename Dtype>
inline void ROIPoolForward(const Tensor<cpu, 4, Dtype> &out,//[batch,channel,h,w]
                           const Tensor<cpu, 4, Dtype> &data,//[batch,channel,h,w]
                           const Tensor<cpu, 2, Dtype> &bbox,//[index,coordinate]
                           const Tensor<cpu, 4, Dtype> &max_idx,
                           const float spatial_scale_) {//ROI的缩放尺度1,0.75,0.5,1.25等
  const Dtype *bottom_data = data.dptr_;
  const Dtype *bottom_rois = bbox.dptr_;
  Dtype *top_data = out.dptr_;
  Dtype *argmax_data = max_idx.dptr_;
  const int channels_ = data.size(1);
  const int height_ = data.size(2);
  const int width_ = data.size(3);
  const int pooled_height_ = out.size(2);
  const int pooled_width_ = out.size(3);

  const int num_rois = bbox.size(0);
  const int data_size = data.size(1) * data.size(2) * data.size(3);
  const int data_size_c = data.size(2) * data.size(3);
  const int out_size_c = out.size(2) * out.size(3);
  const int out_size = channels_ * out_size_c;
  const int max_idx_size_c = max_idx.size(2) * max_idx.size(3);
  const int max_idx_size = channels_ * max_idx_size_c;
  // For each ROI R = [batch_index x1 y1 x2 y2]: 对每个ROI做max pooling
  for (int n = 0; n < num_rois; ++n) {
    // 定位到第n个roi
    const Dtype *bottom_rois_n = bottom_rois + n * bbox.size(1);
    Dtype *top_data_n = top_data + n * out_size;
    Dtype *argmax_data_n = argmax_data + n * max_idx_size;
    int roi_batch_ind = bottom_rois_n[0];
    int roi_start_w = round(bottom_rois_n[1] * spatial_scale_);
    int roi_start_h = round(bottom_rois_n[2] * spatial_scale_);
    int roi_end_w = round(bottom_rois_n[3] * spatial_scale_);
    int roi_end_h = round(bottom_rois_n[4] * spatial_scale_);
    assert(roi_batch_ind >= 0);
    assert(static_cast<index_t>(roi_batch_ind) < data.size(0) /* batch size */);

    // 避免ROI的大小小于1*1
    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    //计算pooling后的坐标，ROI Pooling是固定宽高的
    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                             / static_cast<Dtype>(pooled_height_);
    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                             / static_cast<Dtype>(pooled_width_);
    //定位到下一个ROI
    const Dtype* batch_data = bottom_data + data_size * roi_batch_ind;

    #pragma omp parallel for
    for (int c = 0; c < channels_; ++c) {
      // 定位到第n个ROI，第c个通道
      const Dtype* batch_data_c = batch_data + c * data_size_c;
      Dtype* top_data_c = top_data_n + c * out_size_c;
      Dtype* argmax_data_c = argmax_data_n + c * max_idx_size_c;
      // 定位当前pooling的位置坐标
      for (int ph = 0; ph < pooled_height_; ++ph) {
        for (int pw = 0; pw < pooled_width_; ++pw) {
          
          // start (included) = floor(ph * roi_height / pooled_height_)
          //左上两个坐标向下取整。
          // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
          //右下两个坐标向上取整。
        
          int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                                              * bin_size_h));
          int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                                              * bin_size_w));
          int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                           * bin_size_h));
          int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                           * bin_size_w));
          
          hstart = min(max(hstart + roi_start_h, 0), height_);
          hend = min(max(hend + roi_start_h, 0), height_);
          wstart = min(max(wstart + roi_start_w, 0), width_);
          wend = min(max(wend + roi_start_w, 0), width_);

          bool is_empty = (hend <= hstart) || (wend <= wstart);

          const int pool_index = ph * pooled_width_ + pw;
          if (is_empty) {
            top_data_c[pool_index] = 0;
            argmax_data_c[pool_index] = -1;
          }
          //做maxpooling
          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              const int index = h * width_ + w;
              //找最大值
              if (batch_data_c[index] > top_data_c[pool_index]) {
                top_data_c[pool_index] = batch_data_c[index];
                //输出到data的第c个通道的相应位置
                argmax_data_c[pool_index] = index;
              }
            }
          }
        }
      }
    }
  }
  return;
}
//ROIPooling后向部分
/*!
 * \brief CPU backward pass of ROI max pooling; results are added onto in_grad.
 *
 * For every input element (b, c, h, w) the function walks all ROIs that refer to image b
 * and contain (h, w), finds the pooled cells whose bin can cover that element, and sums
 * the output gradient of each cell whose stored argmax equals h * width + w. The sum is
 * accumulated into in_grad with +=, so existing contents of in_grad are kept.
 *
 * \param in_grad gradient w.r.t. the input feature map, indexed [batch, channel, h, w]
 * \param out_grad gradient w.r.t. the pooled output, indexed [roi, channel, ph, pw]
 * \param bbox ROI table, rows read as [batch_index, x1, y1, x2, y2]
 * \param max_idx argmax positions, compared against h * width + w
 * \param spatial_scale_ factor applied to the ROI coordinates before rounding
 */
template<typename Dtype>
inline void ROIPoolBackwardAcc(const Tensor<cpu, 4, Dtype> &in_grad,
                               const Tensor<cpu, 4, Dtype> &out_grad,
                               const Tensor<cpu, 2, Dtype> &bbox,
                               const Tensor<cpu, 4, Dtype> &max_idx,
                               const float spatial_scale_) {
  const Dtype *top_diff = out_grad.dptr_;
  const Dtype *bottom_rois = bbox.dptr_;
  Dtype *bottom_diff = in_grad.dptr_;
  // The argmax map is only read in the backward pass.
  const Dtype *argmax_data = max_idx.dptr_;

  const int batch_size_ = in_grad.size(0);
  const int channels_ = in_grad.size(1);
  const int height_ = in_grad.size(2);
  const int width_ = in_grad.size(3);
  const int pooled_height_ = out_grad.size(2);
  const int pooled_width_ = out_grad.size(3);

  const int num_rois = bbox.size(0);
  // Row width of the ROI table, taken from the tensor instead of a hard-coded 5 so the
  // row addressing agrees with ROIPoolForward.
  const int roi_stride = bbox.size(1);

  for (int b = 0; b < batch_size_; ++b) {
    for (int c = 0; c < channels_; ++c) {
      for (int h = 0; h < height_; ++h) {
        for (int w = 0; w < width_; ++w) {
          int offset_bottom_diff = (b * channels_ + c) * height_ * width_;
          offset_bottom_diff += h * width_ + w;

          Dtype gradient = 0;
          // Accumulate gradient over all ROIs that pooled this element
          for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
            const Dtype* offset_bottom_rois = bottom_rois + roi_n * roi_stride;
            int roi_batch_ind = offset_bottom_rois[0];
            assert(roi_batch_ind >= 0);
            assert(roi_batch_ind < batch_size_);
            // Skip ROIs that refer to a different image of the batch.
            if (b != roi_batch_ind) {
              continue;
            }

            int roi_start_w = round(offset_bottom_rois[1] * spatial_scale_);
            int roi_start_h = round(offset_bottom_rois[2] * spatial_scale_);
            int roi_end_w = round(offset_bottom_rois[3] * spatial_scale_);
            int roi_end_h = round(offset_bottom_rois[4] * spatial_scale_);

            // Skip ROIs that do not contain (h, w); both ends are inclusive.
            bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                           h >= roi_start_h && h <= roi_end_h);
            if (!in_roi) {
              continue;
            }

            // force malformed ROIs to be 1 * 1
            int roi_height = max(roi_end_h - roi_start_h + 1, 1);
            int roi_width = max(roi_end_w - roi_start_w + 1, 1);
            const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                                     / static_cast<Dtype>(pooled_height_);
            const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                                     / static_cast<Dtype>(pooled_width_);

            // compute pooled regions correspond to original (h, w) point
            int phstart = static_cast<int>(floor(static_cast<Dtype>(h - roi_start_h)
                                                 / bin_size_h));
            int pwstart = static_cast<int>(floor(static_cast<Dtype>(w - roi_start_w)
                                                 / bin_size_w));
            int phend = static_cast<int>(ceil(static_cast<Dtype>(h - roi_start_h + 1)
                                              / bin_size_h));
            int pwend = static_cast<int>(ceil(static_cast<Dtype>(w - roi_start_w + 1)
                                              / bin_size_w));

            // clip to boundaries of pooled region
            phstart = min(max(phstart, 0), pooled_height_);
            phend = min(max(phend, 0), pooled_height_);
            pwstart = min(max(pwstart, 0), pooled_width_);
            pwend = min(max(pwend, 0), pooled_width_);

            // accumulate over gradients in pooled regions
            int offset = (roi_n * channels_ + c) * pooled_height_ * pooled_width_;
            const Dtype* offset_top_diff = top_diff + offset;
            const Dtype* offset_argmax_data = argmax_data + offset;
            for (int ph = phstart; ph < phend; ++ph) {
              for (int pw = pwstart; pw < pwend; ++pw) {
                const int pooled_index = ph * pooled_width_ + pw;
                // Only the element recorded as the bin's argmax receives its gradient.
                if (static_cast<int>(offset_argmax_data[pooled_index]) == h * width_ + w) {
                  gradient += offset_top_diff[pooled_index];
                }
              }
            }
          }
          bottom_diff[offset_bottom_diff] += gradient;
        }
      }
    }
  }

  return;
}
}  // namespace mshadow

namespace mxnet {
namespace op {

/*!
 * \brief CPU specialization of the ROIPooling operator factory.
 * \param param operator parameters handed to the ROIPoolingOp constructor
 * \param dtype runtime type flag used by the type switch to pick DType
 * \return pointer to the operator allocated with new; this function never frees it, so
 *         the caller is responsible for the allocation. Remains null if the type switch
 *         does not execute the allocation statement.
 */
template<>
Operator *CreateOp<cpu>(ROIPoolingParam param, int dtype) {
  Operator* op = nullptr;
  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
    op = new ROIPoolingOp<cpu, DType>(param);
  });
  return op;
}

// Creates the operator instance for this property. The element type passed on is the
// first entry of in_type; in_shape is not referenced directly in this body.
// NOTE(review): DO_BIND_DISPATCH is a macro defined elsewhere; presumably it picks the
// CreateOp specialization matching ctx and returns its result -- confirm against the
// macro definition.
Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                                           std::vector<int> *in_type) const {
  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
}

// Registers the ROIPoolingParam parameter struct.
DMLC_REGISTER_PARAMETER(ROIPoolingParam);

// Registers ROIPoolingProp under the operator name ROIPooling, together with the
// user-facing description, the two tensor inputs ("data" and "rois") and the fields of
// ROIPoolingParam.
MXNET_REGISTER_OP_PROPERTY(ROIPooling, ROIPoolingProp)
.describe(R"code(Performs region of interest(ROI) pooling on the input array.
ROI pooling is a variant of a max pooling layer, in which the output size is fixed and
region of interest is a parameter. Its purpose is to perform max pooling on the inputs
of non-uniform sizes to obtain fixed-size feature maps. ROI pooling is a neural-net
layer mostly used in training a `Fast R-CNN` network for object detection.
This operator takes a 4D feature map as an input array and region proposals as `rois`,
then it pools over sub-regions of input and produces a fixed-sized output array
regardless of the ROI size.
To crop the feature map accordingly, you can resize the bounding box coordinates
by changing the parameters `rois` and `spatial_scale`.
The cropped feature maps are pooled by standard max pooling operation to a fixed size output
indicated by a `pooled_size` parameter. batch_size will change to the number of region
bounding boxes after `ROIPooling`.
The size of each region of interest doesn't have to be perfectly divisible by
the number of pooling sections(`pooled_size`).
Example::
  x = [[[[  0.,   1.,   2.,   3.,   4.,   5.],
         [  6.,   7.,   8.,   9.,  10.,  11.],
         [ 12.,  13.,  14.,  15.,  16.,  17.],
         [ 18.,  19.,  20.,  21.,  22.,  23.],
         [ 24.,  25.,  26.,  27.,  28.,  29.],
         [ 30.,  31.,  32.,  33.,  34.,  35.],
         [ 36.,  37.,  38.,  39.,  40.,  41.],
         [ 42.,  43.,  44.,  45.,  46.,  47.]]]]
  // region of interest i.e. bounding box coordinates.
  y = [[0,0,0,4,4]]
  // returns array of shape (2,2) according to the given roi with max pooling.
  ROIPooling(x, y, (2,2), 1.0) = [[[[ 14.,  16.],
                                    [ 26.,  28.]]]]
  // region of interest is changed due to the change in `spatial_scale` parameter.
  ROIPooling(x, y, (2,2), 0.7) = [[[[  7.,   9.],
                                    [ 19.,  21.]]]]
)code" ADD_FILELINE)
.add_argument("data", "NDArray-or-Symbol", "The input array to the pooling operator, "
                                            " a 4D Feature maps ")
.add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array of "
"[[batch_index, x1, y1, x2, y2]], where (x1, y1) and (x2, y2) are top left and bottom right "
"corners of designated region of interest. `batch_index` indicates the index of corresponding "
"image in the input array")
.add_arguments(ROIPoolingParam::__FIELDS__());
}  // namespace op
}  // namespace mxnet

最后编辑于：2018.05.23 12:24:07

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 213,335评论 6赞 492
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 90,895评论 3赞 387
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 158,766评论 0赞 348
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 56,918评论 1赞 285
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 66,042评论 6赞 385
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 50,169评论 1赞 291
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 39,219评论 3赞 412
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 37,976评论 0赞 268
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 44,393评论 1赞 304
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 36,711评论 2赞 328
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 38,876评论 1赞 341
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 34,562评论 4赞 336
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 40,193评论 3赞 317
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 30,903评论 0赞 21
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 32,142评论 1赞 267
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 46,699评论 2赞 362
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 43,764评论 2赞 351

ROIPooling代码理解（CPU）

推荐阅读更多精彩内容