MXNet中ROIPooling的具体实现。
代码来自https://github.com/apache/incubator-mxnet
包括前向传播的c++实现
void ROIPoolForward(out,data,bbox,max_idx,spatial_scale){...}
反向传播的c++实现
void ROIPoolBackwardAcc(in_grad,out_grad,bbox,max_idx,spatial_scale){...}
以上操作封装成Operator,并在MXNet里注册。
算子以ROIPooling的名字注册,ROIPoolingProp是与之配套的属性类,负责创建算子实例(CreateOperatorEx)。
namespace mxnet {
namespace op {
template<>
Operator *CreateOp<cpu>(ROIPoolingParam param, int dtype) {
Operator* op = NULL;
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
op = new ROIPoolingOp<cpu, DType>(param);
});
return op;
}
Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
std::vector<int> *in_type) const {
DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
}
DMLC_REGISTER_PARAMETER(ROIPoolingParam);
MXNET_REGISTER_OP_PROPERTY(ROIPooling, ROIPoolingProp).describe(...)
代码中还给了具体的例子:
//4-d tensor input, shape=(batch,channel,h,w)
x = [[[[ 0., 1., 2., 3., 4., 5.],
[ 6., 7., 8., 9., 10., 11.],
[ 12., 13., 14., 15., 16., 17.],
[ 18., 19., 20., 21., 22., 23.],
[ 24., 25., 26., 27., 28., 29.],
[ 30., 31., 32., 33., 34., 35.],
[ 36., 37., 38., 39., 40., 41.],
[ 42., 43., 44., 45., 46., 47.]]]]
//2-d tensor bounding box, shape=(num_roi, 5),每一行为[batch_index, x1, y1, x2, y2]
y = [[0,0,0,4,4]]
//pooled_size
(2,2)
//ROIPooling的结果,缩放尺寸为1的情况下,bbox的坐标为[0,0,4,4],pooling到2*2的尺寸
ROIPooling(x, y, (2,2), 1.0) = [[[[ 14., 16.],
[ 26., 28.]]]]
//ROIPooling的结果,缩放尺寸为0.7的情况下,坐标先乘以0.7再四舍五入(round(4*0.7)=round(2.8)=3),bbox的坐标变为[0,0,3,3],pooling到2*2的尺寸
ROIPooling(x, y, (2,2), 0.7) = [[[[ 7., 9.],
[ 19., 21.]]]]
完整代码和注释如下:
#include "./roi_pooling-inl.h"
#include <mshadow/base.h>
#include <mshadow/tensor.h>
#include <mshadow/packet-inl.h>
#include <mshadow/dot_engine-inl.h>
#include <cassert>
using std::max;
using std::min;
using std::floor;
using std::ceil;
//ROIPooling前向部分
namespace mshadow {
// CPU forward pass of ROI max pooling.
//
// Every row of `bbox` is read as [batch_index, x1, y1, x2, y2]. The box is scaled by
// `spatial_scale_`, rounded to integer feature-map coordinates, and divided into a
// pooled_height x pooled_width grid of bins. For each channel and each bin the maximum
// feature value is written to `out`, and the flat offset (h * width + w) of that value
// inside the channel plane is written to `max_idx`.
//
// Parameters:
//   out            - pooled result; ROI n, channel c starts at offset
//                    (n * channels + c) * pooled_height * pooled_width.
//   data           - input feature maps, indexed as [batch, channel, h, w].
//   bbox           - one ROI per row, [batch_index, x1, y1, x2, y2].
//   max_idx        - receives the argmax offsets; indexed with its own size(2)/size(3).
//   spatial_scale_ - factor applied to the ROI coordinates before rounding.
//
// NOTE: a non-empty bin is only updated through a `>` comparison against the value that
// is already stored in `out`, so the caller has to pre-fill `out` (and `max_idx`) before
// calling - presumably with a very small value and -1; confirm against the calling operator.
// Empty bins are explicitly set to 0 / -1 here.
template<typename Dtype>
inline void ROIPoolForward(const Tensor<cpu, 4, Dtype> &out,
                           const Tensor<cpu, 4, Dtype> &data,
                           const Tensor<cpu, 2, Dtype> &bbox,
                           const Tensor<cpu, 4, Dtype> &max_idx,
                           const float spatial_scale_) {
  // Geometry of the input feature maps and of the pooled output grid.
  const int channels_ = data.size(1);
  const int height_ = data.size(2);
  const int width_ = data.size(3);
  const int pooled_height_ = out.size(2);
  const int pooled_width_ = out.size(3);
  const int num_rois = bbox.size(0);

  // Element counts used as strides into the flat buffers.
  const int feat_per_image = data.size(1) * data.size(2) * data.size(3);
  const int feat_per_channel = data.size(2) * data.size(3);
  const int out_per_channel = out.size(2) * out.size(3);
  const int out_per_roi = channels_ * out_per_channel;
  const int idx_per_channel = max_idx.size(2) * max_idx.size(3);
  const int idx_per_roi = channels_ * idx_per_channel;

  for (int n = 0; n < num_rois; ++n) {
    // Slices belonging to the n-th ROI.
    const Dtype *roi = bbox.dptr_ + n * bbox.size(1);
    Dtype *roi_out = out.dptr_ + n * out_per_roi;
    Dtype *roi_idx = max_idx.dptr_ + n * idx_per_roi;

    // Decode the ROI: image index plus scaled, rounded corner coordinates.
    const int image_id = roi[0];
    const int x_begin = round(roi[1] * spatial_scale_);
    const int y_begin = round(roi[2] * spatial_scale_);
    const int x_last = round(roi[3] * spatial_scale_);
    const int y_last = round(roi[4] * spatial_scale_);
    assert(image_id >= 0);
    assert(static_cast<index_t>(image_id) < data.size(0) /* batch size */);

    // Malformed ROIs are treated as 1 x 1.
    const int roi_rows = max(y_last - y_begin + 1, 1);
    const int roi_cols = max(x_last - x_begin + 1, 1);

    // Fractional bin extent: ROI size divided by the fixed output size.
    const Dtype bin_h = static_cast<Dtype>(roi_rows)
                        / static_cast<Dtype>(pooled_height_);
    const Dtype bin_w = static_cast<Dtype>(roi_cols)
                        / static_cast<Dtype>(pooled_width_);

    // Feature maps of the image this ROI refers to.
    const Dtype *image = data.dptr_ + feat_per_image * image_id;

    // Each channel writes to its own output slice, so channels are processed in parallel.
    #pragma omp parallel for
    for (int c = 0; c < channels_; ++c) {
      const Dtype *plane = image + c * feat_per_channel;
      Dtype *out_plane = roi_out + c * out_per_channel;
      Dtype *idx_plane = roi_idx + c * idx_per_channel;

      for (int ph = 0; ph < pooled_height_; ++ph) {
        // Row range of this bin: start is floored (inclusive), end is ceiled (exclusive),
        // then shifted into image coordinates and clamped to [0, height_].
        const int row_off_lo = static_cast<int>(floor(static_cast<Dtype>(ph)
                                                      * bin_h));
        const int row_off_hi = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                                     * bin_h));
        const int row_lo = min(max(row_off_lo + y_begin, 0), height_);
        const int row_hi = min(max(row_off_hi + y_begin, 0), height_);

        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Column range, computed the same way and clamped to [0, width_].
          const int col_off_lo = static_cast<int>(floor(static_cast<Dtype>(pw)
                                                        * bin_w));
          const int col_off_hi = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                                       * bin_w));
          const int col_lo = min(max(col_off_lo + x_begin, 0), width_);
          const int col_hi = min(max(col_off_hi + x_begin, 0), width_);

          const int cell = ph * pooled_width_ + pw;

          // A bin that lies completely outside the feature map yields 0 and no argmax.
          if (row_hi <= row_lo || col_hi <= col_lo) {
            out_plane[cell] = 0;
            idx_plane[cell] = -1;
            continue;
          }

          // Max pooling over the bin; strictly larger values replace the stored maximum.
          for (int h = row_lo; h < row_hi; ++h) {
            const int row_base = h * width_;
            for (int w = col_lo; w < col_hi; ++w) {
              const int flat = row_base + w;
              if (plane[flat] > out_plane[cell]) {
                out_plane[cell] = plane[flat];
                idx_plane[cell] = flat;
              }
            }
          }
        }
      }
    }
  }
}
//ROIPooling后向部分
// CPU backward pass of ROI max pooling (accumulating variant).
//
// For every input element (b, c, h, w) the function visits all ROIs that refer to image b
// and contain (h, w), finds the pooled cells whose bin could cover that element, and adds
// the output gradient of each cell whose recorded argmax equals h * width + w.
//
// Parameters:
//   in_grad        - gradient w.r.t. the input feature maps, indexed [batch, channel, h, w].
//                    The result is ADDED to the existing contents (`+=`), so the caller
//                    decides whether the buffer starts at zero.
//   out_grad       - gradient w.r.t. the pooled output; ROI n, channel c starts at offset
//                    (n * channels + c) * pooled_height * pooled_width.
//   bbox           - one ROI per row, [batch_index, x1, y1, x2, y2].
//   max_idx        - argmax offsets recorded by the forward pass; addressed with the same
//                    offsets as out_grad, i.e. it is assumed to share out_grad's layout.
//   spatial_scale_ - factor applied to the ROI coordinates before rounding.
template<typename Dtype>
inline void ROIPoolBackwardAcc(const Tensor<cpu, 4, Dtype> &in_grad,
                               const Tensor<cpu, 4, Dtype> &out_grad,
                               const Tensor<cpu, 2, Dtype> &bbox,
                               const Tensor<cpu, 4, Dtype> &max_idx,
                               const float spatial_scale_) {
  const Dtype *top_diff = out_grad.dptr_;
  const Dtype *bottom_rois = bbox.dptr_;
  Dtype *bottom_diff = in_grad.dptr_;
  Dtype *argmax_data = max_idx.dptr_;

  const int batch_size_ = in_grad.size(0);
  const int channels_ = in_grad.size(1);
  const int height_ = in_grad.size(2);
  const int width_ = in_grad.size(3);

  const int pooled_height_ = out_grad.size(2);
  const int pooled_width_ = out_grad.size(3);

  const int num_rois = bbox.size(0);
  // Length of one ROI row. Taken from the tensor (as ROIPoolForward does) instead of
  // a hard-coded 5, so both passes step through `bbox` the same way.
  const int roi_stride = bbox.size(1);

  for (int b = 0; b < batch_size_; ++b) {
    for (int c = 0; c < channels_; ++c) {
      for (int h = 0; h < height_; ++h) {
        for (int w = 0; w < width_; ++w) {
          // Flat position of (b, c, h, w) inside in_grad.
          int offset_bottom_diff = (b * channels_ + c) * height_ * width_;
          offset_bottom_diff += h * width_ + w;

          Dtype gradient = 0;
          // Accumulate gradient over all ROIs that pooled this element
          for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
            const Dtype* offset_bottom_rois = bottom_rois + roi_n * roi_stride;
            int roi_batch_ind = offset_bottom_rois[0];
            assert(roi_batch_ind >= 0);
            assert(roi_batch_ind < batch_size_);
            // Only ROIs that belong to the current image can contribute.
            if (b != roi_batch_ind) {
              continue;
            }

            int roi_start_w = round(offset_bottom_rois[1] * spatial_scale_);
            int roi_start_h = round(offset_bottom_rois[2] * spatial_scale_);
            int roi_end_w = round(offset_bottom_rois[3] * spatial_scale_);
            int roi_end_h = round(offset_bottom_rois[4] * spatial_scale_);

            // Skip ROIs that do not contain (h, w); both corners are inclusive.
            bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
                           h >= roi_start_h && h <= roi_end_h);
            if (!in_roi) {
              continue;
            }

            // force malformed ROIs to be 1 * 1
            int roi_height = max(roi_end_h - roi_start_h + 1, 1);
            int roi_width = max(roi_end_w - roi_start_w + 1, 1);

            const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                                     / static_cast<Dtype>(pooled_height_);
            const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                                     / static_cast<Dtype>(pooled_width_);

            // compute pooled regions correspond to original (h, w) point:
            // invert the forward bin computation to get the candidate cell range.
            int phstart = static_cast<int>(floor(static_cast<Dtype>(h - roi_start_h)
                                                 / bin_size_h));
            int pwstart = static_cast<int>(floor(static_cast<Dtype>(w - roi_start_w)
                                                 / bin_size_w));
            int phend = static_cast<int>(ceil(static_cast<Dtype>(h - roi_start_h + 1)
                                              / bin_size_h));
            int pwend = static_cast<int>(ceil(static_cast<Dtype>(w - roi_start_w + 1)
                                              / bin_size_w));

            // clip to boundaries of pooled region
            phstart = min(max(phstart, 0), pooled_height_);
            phend = min(max(phend, 0), pooled_height_);
            pwstart = min(max(pwstart, 0), pooled_width_);
            pwend = min(max(pwend, 0), pooled_width_);

            // accumulate over gradients in pooled regions
            int offset = (roi_n * channels_ + c) * pooled_height_ * pooled_width_;
            const Dtype* offset_top_diff = top_diff + offset;
            const Dtype* offset_argmax_data = argmax_data + offset;
            for (int ph = phstart; ph < phend; ++ph) {
              for (int pw = pwstart; pw < pwend; ++pw) {
                const int pooled_index = ph * pooled_width_ + pw;
                // Only the element that actually won the max receives the gradient.
                if (static_cast<int>(offset_argmax_data[pooled_index]) == h * width_ + w) {
                  gradient += offset_top_diff[pooled_index];
                }
              }
            }
          }
          bottom_diff[offset_bottom_diff] += gradient;
        }
      }
    }
  }
}
} // namespace mshadow
namespace mxnet {
namespace op {
// CPU specialization of the operator factory.
// Builds a ROIPoolingOp<cpu, DType> configured with `param`; the element type DType is
// chosen by MSHADOW_REAL_TYPE_SWITCH from the runtime type code `dtype`.
// The returned pointer comes from `new` and is handed to the caller as-is.
// NOTE(review): what the switch macro does for an unsupported `dtype` is not visible
// here - the result stays null unless the macro assigns it.
template<>
Operator *CreateOp<cpu>(ROIPoolingParam param, int dtype) {
  Operator *created = nullptr;
  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
    created = new ROIPoolingOp<cpu, DType>(param);
  });
  return created;
}
// Creates the operator instance for this property object.
// Forwards `param_` and the type code of the first input (in_type->at(0)) to CreateOp
// through DO_BIND_DISPATCH. `in_shape` is not referenced directly in this body.
// NOTE(review): DO_BIND_DISPATCH is defined elsewhere; it presumably picks the
// CreateOp<...> specialization matching `ctx` and returns its result - confirm against
// the macro definition.
Operator *ROIPoolingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
std::vector<int> *in_type) const {
// std::vector::at is bounds-checked, so an empty in_type throws instead of reading garbage.
DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
}
// Register the parameter struct of the operator.
DMLC_REGISTER_PARAMETER(ROIPoolingParam);

// Register the operator under the name `ROIPooling`, backed by ROIPoolingProp, together
// with its user-facing description and the documentation of its two tensor inputs
// (`data`, `rois`) and of the fields declared by ROIPoolingParam.
// The text inside the raw string literal is emitted verbatim as documentation, so its
// lines are intentionally kept unindented.
MXNET_REGISTER_OP_PROPERTY(ROIPooling, ROIPoolingProp)
  .describe(R"code(Performs region of interest(ROI) pooling on the input array.
ROI pooling is a variant of a max pooling layer, in which the output size is fixed and
region of interest is a parameter. Its purpose is to perform max pooling on the inputs
of non-uniform sizes to obtain fixed-size feature maps. ROI pooling is a neural-net
layer mostly used in training a `Fast R-CNN` network for object detection.
This operator takes a 4D feature map as an input array and region proposals as `rois`,
then it pools over sub-regions of input and produces a fixed-sized output array
regardless of the ROI size.
To crop the feature map accordingly, you can resize the bounding box coordinates
by changing the parameters `rois` and `spatial_scale`.
The cropped feature maps are pooled by standard max pooling operation to a fixed size output
indicated by a `pooled_size` parameter. batch_size will change to the number of region
bounding boxes after `ROIPooling`.
The size of each region of interest doesn't have to be perfectly divisible by
the number of pooling sections(`pooled_size`).
Example::
x = [[[[ 0., 1., 2., 3., 4., 5.],
[ 6., 7., 8., 9., 10., 11.],
[ 12., 13., 14., 15., 16., 17.],
[ 18., 19., 20., 21., 22., 23.],
[ 24., 25., 26., 27., 28., 29.],
[ 30., 31., 32., 33., 34., 35.],
[ 36., 37., 38., 39., 40., 41.],
[ 42., 43., 44., 45., 46., 47.]]]]
// region of interest i.e. bounding box coordinates.
y = [[0,0,0,4,4]]
// returns array of shape (2,2) according to the given roi with max pooling.
ROIPooling(x, y, (2,2), 1.0) = [[[[ 14., 16.],
[ 26., 28.]]]]
// region of interest is changed due to the change in `spatial_scale` parameter.
ROIPooling(x, y, (2,2), 0.7) = [[[[ 7., 9.],
[ 19., 21.]]]]
)code" ADD_FILELINE)
  .add_argument("data", "NDArray-or-Symbol", "The input array to the pooling operator, "
  " a 4D Feature maps ")
  .add_argument("rois", "NDArray-or-Symbol", "Bounding box coordinates, a 2D array of "
  "[[batch_index, x1, y1, x2, y2]], where (x1, y1) and (x2, y2) are top left and bottom right "
  "corners of designated region of interest. `batch_index` indicates the index of corresponding "
  "image in the input array")
  .add_arguments(ROIPoolingParam::__FIELDS__());
} // namespace op
} // namespace mxnet