object_detection API source-code reading notes (12: faster_rcnn_meta_arch's predict in detail)

This API has been torturing me; my head aches every day. I still have no idea how to create a custom feature-extractor class.

If you know someone who really understands this API, please recommend my WeChat (yanghe_hehe) to them. Thanks!

I'm still very much a beginner at this... sigh.

I hadn't looked at the code for a few days and found I had already forgotten a lot.

Let's string the detection flow together.

I found a relatively simple feature extractor here: the one in faster_rcnn_inception_v2_feature_extractor.py. For the proposal features, Faster R-CNN only takes the ['Mixed_4e'] endpoint of inception_v2_base.
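As a rough sketch of what that means (not the exact extractor code; inception_v2_base comes from the TF-Slim nets package), extracting only the 'Mixed_4e' endpoint looks roughly like this:

import tensorflow as tf
from nets import inception_v2  # TF-Slim models repository

slim = tf.contrib.slim

def extract_mixed_4e(preprocessed_inputs):
  # Run the Inception v2 backbone only up to the 'Mixed_4e' endpoint and
  # return that feature map; this is what the proposal-feature extraction
  # boils down to.
  with slim.arg_scope(inception_v2.inception_v2_arg_scope()):
    _, end_points = inception_v2.inception_v2_base(
        preprocessed_inputs,
        final_endpoint='Mixed_4e',
        min_depth=16,
        depth_multiplier=1.0,
        scope='InceptionV2')
  return end_points['Mixed_4e']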

After re-organizing my thoughts, I realized I had done a lot of useless work and wasted a lot of time.

model_builder.py essentially builds two things:

  1. the feature-extraction model
  2. the detection model

Where does the feature-extraction part come from?

def _build_faster_rcnn_feature_extractor(
    feature_extractor_config, is_training, reuse_weights=None):
  feature_type = feature_extractor_config.type
  first_stage_features_stride = (
      feature_extractor_config.first_stage_features_stride)
  batch_norm_trainable = feature_extractor_config.batch_norm_trainable

  if feature_type not in FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP:
    raise ValueError('Unknown Faster R-CNN feature_extractor: {}'.format(
        feature_type))
  feature_extractor_class = FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP[
      feature_type]
  return feature_extractor_class(
      is_training, first_stage_features_stride,
      batch_norm_trainable, reuse_weights)

Look straight at the return value, feature_extractor_class:

feature_extractor_class(
      is_training, first_stage_features_stride,
      batch_norm_trainable, reuse_weights)
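feature_type comes from the config, and feature_extractor_class is looked up in FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP. From memory, that map in model_builder.py looks roughly like this (only two entries shown; names may differ slightly between versions):

from object_detection.models import faster_rcnn_inception_v2_feature_extractor as frcnn_inc_v2
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn_resnet_v1

FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = {
    'faster_rcnn_inception_v2':
        frcnn_inc_v2.FasterRCNNInceptionV2FeatureExtractor,
    'faster_rcnn_resnet101':
        frcnn_resnet_v1.FasterRCNNResnet101FeatureExtractor,
}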

From here we can jump to the feature-extraction model itself:

class FasterRCNNInceptionV2FeatureExtractor(
    faster_rcnn_meta_arch.FasterRCNNFeatureExtractor):
  def __init__(self,
               is_training,
               first_stage_features_stride,
               batch_norm_trainable=False,
               reuse_weights=None,
               weight_decay=0.0,
               depth_multiplier=1.0,
               min_depth=16)

Back in model_builder.py, where does the detection model come from?

def _build_faster_rcnn_model(frcnn_config, is_training):
  """Builds a Faster R-CNN or R-FCN detection model based on the model config.

  Builds R-FCN model if the second_stage_box_predictor in the config is of type
  `rfcn_box_predictor` else builds a Faster R-CNN model.

  Args:
    frcnn_config: A faster_rcnn.proto object containing the config for the
    desired FasterRCNNMetaArch or RFCNMetaArch.
    is_training: True if this model is being built for training purposes.

  Returns:
    FasterRCNNMetaArch based on the config.
  Raises:
    ValueError: If frcnn_config.type is not recognized (i.e. not registered in
      model_class_map).
  """
  num_classes = frcnn_config.num_classes
  image_resizer_fn = image_resizer_builder.build(frcnn_config.image_resizer)

  feature_extractor = _build_faster_rcnn_feature_extractor(
      frcnn_config.feature_extractor, is_training)

  first_stage_only = frcnn_config.first_stage_only
  first_stage_anchor_generator = anchor_generator_builder.build(
      frcnn_config.first_stage_anchor_generator)

  first_stage_atrous_rate = frcnn_config.first_stage_atrous_rate
  first_stage_box_predictor_arg_scope = hyperparams_builder.build(
      frcnn_config.first_stage_box_predictor_conv_hyperparams, is_training)
  first_stage_box_predictor_kernel_size = (
      frcnn_config.first_stage_box_predictor_kernel_size)
  first_stage_box_predictor_depth = frcnn_config.first_stage_box_predictor_depth
  first_stage_minibatch_size = frcnn_config.first_stage_minibatch_size
  first_stage_positive_balance_fraction = (
      frcnn_config.first_stage_positive_balance_fraction)
  first_stage_nms_score_threshold = frcnn_config.first_stage_nms_score_threshold
  first_stage_nms_iou_threshold = frcnn_config.first_stage_nms_iou_threshold
  first_stage_max_proposals = frcnn_config.first_stage_max_proposals
  first_stage_loc_loss_weight = (
      frcnn_config.first_stage_localization_loss_weight)
  first_stage_obj_loss_weight = frcnn_config.first_stage_objectness_loss_weight

  initial_crop_size = frcnn_config.initial_crop_size
  maxpool_kernel_size = frcnn_config.maxpool_kernel_size
  maxpool_stride = frcnn_config.maxpool_stride

  second_stage_box_predictor = box_predictor_builder.build(
      hyperparams_builder.build,
      frcnn_config.second_stage_box_predictor,
      is_training=is_training,
      num_classes=num_classes)
  second_stage_batch_size = frcnn_config.second_stage_batch_size
  second_stage_balance_fraction = frcnn_config.second_stage_balance_fraction
  (second_stage_non_max_suppression_fn, second_stage_score_conversion_fn
  ) = post_processing_builder.build(frcnn_config.second_stage_post_processing)
  second_stage_localization_loss_weight = (
      frcnn_config.second_stage_localization_loss_weight)
  second_stage_classification_loss = (
      losses_builder.build_faster_rcnn_classification_loss(
          frcnn_config.second_stage_classification_loss))
  second_stage_classification_loss_weight = (
      frcnn_config.second_stage_classification_loss_weight)
  second_stage_mask_prediction_loss_weight = (
      frcnn_config.second_stage_mask_prediction_loss_weight)

  hard_example_miner = None
  if frcnn_config.HasField('hard_example_miner'):
    hard_example_miner = losses_builder.build_hard_example_miner(
        frcnn_config.hard_example_miner,
        second_stage_classification_loss_weight,
        second_stage_localization_loss_weight)

  common_kwargs = {
      'is_training': is_training,
      'num_classes': num_classes,
      'image_resizer_fn': image_resizer_fn,
      'feature_extractor': feature_extractor,
      'first_stage_only': first_stage_only,
      'first_stage_anchor_generator': first_stage_anchor_generator,
      'first_stage_atrous_rate': first_stage_atrous_rate,
      'first_stage_box_predictor_arg_scope':
      first_stage_box_predictor_arg_scope,
      'first_stage_box_predictor_kernel_size':
      first_stage_box_predictor_kernel_size,
      'first_stage_box_predictor_depth': first_stage_box_predictor_depth,
      'first_stage_minibatch_size': first_stage_minibatch_size,
      'first_stage_positive_balance_fraction':
      first_stage_positive_balance_fraction,
      'first_stage_nms_score_threshold': first_stage_nms_score_threshold,
      'first_stage_nms_iou_threshold': first_stage_nms_iou_threshold,
      'first_stage_max_proposals': first_stage_max_proposals,
      'first_stage_localization_loss_weight': first_stage_loc_loss_weight,
      'first_stage_objectness_loss_weight': first_stage_obj_loss_weight,
      'second_stage_batch_size': second_stage_batch_size,
      'second_stage_balance_fraction': second_stage_balance_fraction,
      'second_stage_non_max_suppression_fn':
      second_stage_non_max_suppression_fn,
      'second_stage_score_conversion_fn': second_stage_score_conversion_fn,
      'second_stage_localization_loss_weight':
      second_stage_localization_loss_weight,
      'second_stage_classification_loss':
      second_stage_classification_loss,
      'second_stage_classification_loss_weight':
      second_stage_classification_loss_weight,
      'hard_example_miner': hard_example_miner}

  if isinstance(second_stage_box_predictor, box_predictor.RfcnBoxPredictor):
    return rfcn_meta_arch.RFCNMetaArch(
        second_stage_rfcn_box_predictor=second_stage_box_predictor,
        **common_kwargs)
  else:
    return faster_rcnn_meta_arch.FasterRCNNMetaArch(
        initial_crop_size=initial_crop_size,
        maxpool_kernel_size=maxpool_kernel_size,
        maxpool_stride=maxpool_stride,
        second_stage_mask_rcnn_box_predictor=second_stage_box_predictor,
        second_stage_mask_prediction_loss_weight=(
            second_stage_mask_prediction_loss_weight),
        **common_kwargs)

_build_faster_rcnn_model above is the entire construction process of the detection model. Step by step:

num_classes = frcnn_config.num_classes
image_resizer_fn = image_resizer_builder.build(frcnn_config.image_resizer)

Nothing much to explain here: this just grabs the number of classes and an image-resizing function.

feature_extractor = _build_faster_rcnn_feature_extractor(
      frcnn_config.feature_extractor, is_training)

This obtains the feature-extraction model discussed above.

first_stage_only = frcnn_config.first_stage_only
first_stage_anchor_generator = anchor_generator_builder.build(
      frcnn_config.first_stage_anchor_generator)

first_stage_only says whether we only run the first (RPN) stage; the anchor-generator builder returns the model that produces the candidate boxes (anchors). A rough sketch of a typical grid anchor generator is shown right after this.
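The values below are hypothetical (the real ones come from first_stage_anchor_generator in the pipeline config); this is only meant to show what the builder hands back:

from object_detection.anchor_generators import grid_anchor_generator

# Hypothetical scales/aspect ratios; a 16-pixel stride matches the usual
# first_stage_features_stride of 16.
anchor_generator = grid_anchor_generator.GridAnchorGenerator(
    scales=[0.25, 0.5, 1.0, 2.0],
    aspect_ratios=[0.5, 1.0, 2.0],
    anchor_stride=[16, 16])

# Later, _extract_rpn_feature_maps calls generate() with the feature-map
# height/width to lay anchors over the whole grid (returns a BoxList), e.g.
# anchors_boxlist = anchor_generator.generate([(feature_map_h, feature_map_w)])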

  first_stage_atrous_rate = frcnn_config.first_stage_atrous_rate
  first_stage_box_predictor_arg_scope = hyperparams_builder.build(
      frcnn_config.first_stage_box_predictor_conv_hyperparams, is_training)
  first_stage_box_predictor_kernel_size = (
      frcnn_config.first_stage_box_predictor_kernel_size)
  first_stage_box_predictor_depth = frcnn_config.first_stage_box_predictor_depth
  first_stage_minibatch_size = frcnn_config.first_stage_minibatch_size
  first_stage_positive_balance_fraction = (
      frcnn_config.first_stage_positive_balance_fraction)
  first_stage_nms_score_threshold = frcnn_config.first_stage_nms_score_threshold
  first_stage_nms_iou_threshold = frcnn_config.first_stage_nms_iou_threshold
  first_stage_max_proposals = frcnn_config.first_stage_max_proposals
  first_stage_loc_loss_weight = (
      frcnn_config.first_stage_localization_loss_weight)
  first_stage_obj_loss_weight = frcnn_config.first_stage_objectness_loss_weight

  initial_crop_size = frcnn_config.initial_crop_size
  maxpool_kernel_size = frcnn_config.maxpool_kernel_size
  maxpool_stride = frcnn_config.maxpool_stride

All of the above just reads the first-stage parameters out of the config.

  second_stage_box_predictor = box_predictor_builder.build(
      hyperparams_builder.build,
      frcnn_config.second_stage_box_predictor,
      is_training=is_training,
      num_classes=num_classes)
  second_stage_batch_size = frcnn_config.second_stage_batch_size
  second_stage_balance_fraction = frcnn_config.second_stage_balance_fraction
  (second_stage_non_max_suppression_fn, second_stage_score_conversion_fn
  ) = post_processing_builder.build(frcnn_config.second_stage_post_processing)
  second_stage_localization_loss_weight = (
      frcnn_config.second_stage_localization_loss_weight)
  second_stage_classification_loss = (
      losses_builder.build_faster_rcnn_classification_loss(
          frcnn_config.second_stage_classification_loss))
  second_stage_classification_loss_weight = (
      frcnn_config.second_stage_classification_loss_weight)
  second_stage_mask_prediction_loss_weight = (
      frcnn_config.second_stage_mask_prediction_loss_weight)

This obtains the box predictor and the config parameters needed for the second-stage prediction, plus the post-processing functions (NMS and score conversion).
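For intuition, post_processing_builder.build returns a pair of functions. Roughly (a hedged sketch; the threshold values below are made up, and the score converter can also be sigmoid or identity depending on the config):

import functools
import tensorflow as tf
from object_detection.core import post_processing

# Score conversion is typically just a softmax over the class logits.
second_stage_score_conversion_fn = tf.nn.softmax

# The NMS function is a partial around batch_multiclass_non_max_suppression.
second_stage_non_max_suppression_fn = functools.partial(
    post_processing.batch_multiclass_non_max_suppression,
    score_thresh=0.0,
    iou_thresh=0.6,
    max_size_per_class=100,
    max_total_size=300)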

  if frcnn_config.HasField('hard_example_miner'):
    hard_example_miner = losses_builder.build_hard_example_miner(
        frcnn_config.hard_example_miner,
        second_stage_classification_loss_weight,
        second_stage_localization_loss_weight)

hard_example_miner does online hard example mining to balance the loss; note that it is built with the second-stage loss weights and is applied to the second-stage (box classifier) loss. The RPN's positive/negative balance is handled separately by first_stage_positive_balance_fraction.
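From memory, what build_hard_example_miner constructs is roughly an instance like this, using the second-stage loss weights read above (the other values are illustrative defaults, not taken from this config):

from object_detection.core import losses

hard_example_miner = losses.HardExampleMiner(
    num_hard_examples=64,          # keep the 64 highest-loss examples
    iou_threshold=0.7,             # NMS threshold used while mining
    loss_type='both',              # mine on classification + localization loss
    cls_loss_weight=second_stage_classification_loss_weight,
    loc_loss_weight=second_stage_localization_loss_weight)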

faster_rcnn_meta_arch.FasterRCNNMetaArch(
        initial_crop_size=initial_crop_size,
        maxpool_kernel_size=maxpool_kernel_size,
        maxpool_stride=maxpool_stride,
        second_stage_mask_rcnn_box_predictor=second_stage_box_predictor,
        second_stage_mask_prediction_loss_weight=(
            second_stage_mask_prediction_loss_weight),
        **common_kwargs)

So the return value of _build_faster_rcnn_model is a faster_rcnn_meta_arch.FasterRCNNMetaArch instance: that is the Faster R-CNN detection model.

To lay out the big picture, let's borrow a simple flow from evaluator.py. Inside _extract_prediction_tensors(model, create_input_dict_fn, ignore_groundtruth=False) there are these three lines:

Here model is the return value of _build_faster_rcnn_model described above.

  preprocessed_image = model.preprocess(tf.to_float(original_image))
  prediction_dict = model.predict(preprocessed_image)
  detections = model.postprocess(prediction_dict)
  1. preprocess the image
  2. run the prediction
  3. post-process the prediction results (a small end-to-end sketch follows below)
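One caveat: the evaluator snippet above matches an older version of the API. In the version of faster_rcnn_meta_arch quoted below, preprocess returns a tuple (preprocessed_inputs, true_image_shapes), and predict/postprocess take true_image_shapes as well. A minimal sketch with the newer signatures, assuming model is the FasterRCNNMetaArch built above:

import tensorflow as tf

# A dummy uint8 batch standing in for original_image.
original_image = tf.zeros([1, 600, 800, 3], dtype=tf.uint8)

preprocessed_image, true_image_shapes = model.preprocess(
    tf.to_float(original_image))
prediction_dict = model.predict(preprocessed_image, true_image_shapes)
detections = model.postprocess(prediction_dict, true_image_shapes)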

model.preprocess has already been covered before:

  def preprocess(self, inputs):
    if inputs.dtype is not tf.float32:
      raise ValueError('`preprocess` expects a tf.float32 tensor')
    with tf.name_scope('Preprocessor'):
      outputs = shape_utils.static_or_dynamic_map_fn(
          self._image_resizer_fn,
          elems=inputs,
          dtype=[tf.float32, tf.int32],
          parallel_iterations=self._parallel_iterations)
      resized_inputs = outputs[0]
      true_image_shapes = outputs[1]
      return (self._feature_extractor.preprocess(resized_inputs),
              true_image_shapes)

This resizes the images and then calls the feature extractor's preprocess, which scales the pixel values into [-1, 1].
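For the Inception-style extractors, that [-1, 1] scaling amounts to something like this (a sketch of the feature extractor's preprocess, written as a standalone function):

import tensorflow as tf

def feature_extractor_preprocess(resized_inputs):
  # Map pixel values from [0, 255] to [-1, 1].
  return (2.0 / 255.0) * resized_inputs - 1.0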

Now let's look at predict.

prediction_dict = model.predict(preprocessed_image)

def predict(self, preprocessed_inputs, true_image_shapes):
   
    (rpn_box_predictor_features, rpn_features_to_crop, anchors_boxlist,
     image_shape) = self._extract_rpn_feature_maps(preprocessed_inputs)
    (rpn_box_encodings, rpn_objectness_predictions_with_background
    ) = self._predict_rpn_proposals(rpn_box_predictor_features)

    # The Faster R-CNN paper recommends pruning anchors that venture outside
    # the image window at training time and clipping at inference time.
    clip_window = tf.to_float(tf.stack([0, 0, image_shape[1], image_shape[2]]))
    if self._is_training:
      if self.clip_anchors_to_image:
        anchors_boxlist = box_list_ops.clip_to_window(
            anchors_boxlist, clip_window, filter_nonoverlapping=False)
      else:
        (rpn_box_encodings, rpn_objectness_predictions_with_background,
         anchors_boxlist) = self._remove_invalid_anchors_and_predictions(
             rpn_box_encodings, rpn_objectness_predictions_with_background,
             anchors_boxlist, clip_window)
    else:
      anchors_boxlist = box_list_ops.clip_to_window(
          anchors_boxlist, clip_window)

    self._anchors = anchors_boxlist
    prediction_dict = {
        'rpn_box_predictor_features': rpn_box_predictor_features,
        'rpn_features_to_crop': rpn_features_to_crop,
        'image_shape': image_shape,
        'rpn_box_encodings': rpn_box_encodings,
        'rpn_objectness_predictions_with_background':
        rpn_objectness_predictions_with_background,
        'anchors': self._anchors.get()
    }

    if self._number_of_stages >= 2:
      # If mixed-precision training on TPU is enabled, rpn_box_encodings and
      # rpn_objectness_predictions_with_background are bfloat16 tensors.
      # Considered prediction results, they need to be casted to float32
      # tensors for correct postprocess_rpn computation in predict_second_stage.
      prediction_dict.update(self._predict_second_stage(
          tf.to_float(rpn_box_encodings),
          tf.to_float(rpn_objectness_predictions_with_background),
          rpn_features_to_crop,
          self._anchors.get(), image_shape, true_image_shapes))

    if self._number_of_stages == 3:
      prediction_dict = self._predict_third_stage(
          prediction_dict, true_image_shapes)

    return prediction_dict

Start with _extract_rpn_feature_maps:

(rpn_box_predictor_features, rpn_features_to_crop, anchors_boxlist,
     image_shape) = self._extract_rpn_feature_maps(preprocessed_inputs)
   

_extract_rpn_feature_maps doesn't actually do that much: it obtains the backbone feature map, generates anchors over its grid, and runs one convolution to set the depth of the features that feed the RPN head, so it feels like an ordinary conv layer (a rough sketch is shown below).
This is also the point where Faster R-CNN splits into two branches: rpn_box_predictor_features for the RPN and rpn_features_to_crop kept for the second stage.
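A hedged sketch of that convolution (the real code also generates the anchors and wraps this in the first-stage arg scope; the depth/kernel/rate values here are illustrative):

import tensorflow as tf

slim = tf.contrib.slim

def rpn_feature_map_sketch(rpn_features_to_crop,
                           box_predictor_depth=512,
                           kernel_size=3,
                           atrous_rate=1):
  # One kernel_size x kernel_size conv (with ReLU) on top of the backbone
  # feature map; its output is rpn_box_predictor_features.
  return slim.conv2d(
      rpn_features_to_crop,
      box_predictor_depth,
      kernel_size=[kernel_size, kernel_size],
      rate=atrous_rate,
      activation_fn=tf.nn.relu6)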

 (rpn_box_encodings, rpn_objectness_predictions_with_background
    ) = self._predict_rpn_proposals(rpn_box_predictor_features)

This runs the RPN head on rpn_box_predictor_features computed above and, for every anchor, produces a box encoding (coordinates) and an objectness score (object vs. background); it is not yet a per-class prediction.
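Conceptually it amounts to the following (the real code goes through a ConvolutionalBoxPredictor; the number of anchors per location is just an example value here):

import tensorflow as tf

slim = tf.contrib.slim

def rpn_head_sketch(rpn_box_predictor_features, num_anchors_per_location=12):
  # 1x1 conv producing 4 box-encoding values per anchor per location.
  box_encodings = slim.conv2d(
      rpn_box_predictor_features,
      num_anchors_per_location * 4,
      [1, 1], activation_fn=None)
  # 1x1 conv producing 2 scores per anchor (background vs. object).
  objectness_with_background = slim.conv2d(
      rpn_box_predictor_features,
      num_anchors_per_location * 2,
      [1, 1], activation_fn=None)
  return box_encodings, objectness_with_background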

clip_window = tf.to_float(tf.stack([0, 0, image_shape[1], image_shape[2]]))
    if self._is_training:
      if self.clip_anchors_to_image:
        anchors_boxlist = box_list_ops.clip_to_window(
            anchors_boxlist, clip_window, filter_nonoverlapping=False)
      else:
        (rpn_box_encodings, rpn_objectness_predictions_with_background,
         anchors_boxlist) = self._remove_invalid_anchors_and_predictions(
             rpn_box_encodings, rpn_objectness_predictions_with_background,
             anchors_boxlist, clip_window)

This prunes or clips the anchors (and the corresponding predictions) to the image window, relying on several helpers in box_list_ops.
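The idea behind box_list_ops.clip_to_window is simple; a standalone sketch (not the library code itself):

import tensorflow as tf

def clip_boxes_to_window(boxes, window):
  # boxes: [N, 4] as [ymin, xmin, ymax, xmax]; window: [4] as
  # [0, 0, image_height, image_width]. Clamp every coordinate into the window.
  y_min, x_min, y_max, x_max = tf.split(boxes, num_or_size_splits=4, axis=1)
  win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
  y_min = tf.maximum(tf.minimum(y_min, win_y_max), win_y_min)
  y_max = tf.maximum(tf.minimum(y_max, win_y_max), win_y_min)
  x_min = tf.maximum(tf.minimum(x_min, win_x_max), win_x_min)
  x_max = tf.maximum(tf.minimum(x_max, win_x_max), win_x_min)
  return tf.concat([y_min, x_min, y_max, x_max], axis=1)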

def _predict_second_stage(self, rpn_box_encodings,
                            rpn_objectness_predictions_with_background,
                            rpn_features_to_crop,
                            anchors,
                            image_shape,
                            true_image_shapes):
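    # ... (body elided in these notes: it crops rpn_features_to_crop around the
    # proposals, runs the second-stage box predictor on the crops, and fills in
    # the fields of prediction_dict below) ...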
    prediction_dict = {
        'refined_box_encodings': refined_box_encodings,
        'class_predictions_with_background':
        class_predictions_with_background,
        'num_proposals': num_proposals,
        'proposal_boxes': absolute_proposal_boxes,
        'box_classifier_features': box_classifier_features,
        'proposal_boxes_normalized': proposal_boxes_normalized,
    }

    return prediction_dict

This takes the features cropped from the RPN feature map, runs the second-stage prediction, and outputs the refined box encodings together with the per-class scores.

Those box encodings still have to be decoded from predicted offsets into actual coordinates on the image; a sketch of that decoding follows.
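A sketch of the decoding (the standard Faster R-CNN box coder; the scale factors are the defaults I remember this API using):

import tensorflow as tf

def decode_boxes(rel_codes, anchors, scale_factors=(10.0, 10.0, 5.0, 5.0)):
  # rel_codes: [N, 4] predicted [ty, tx, th, tw]; anchors: [N, 4] as
  # [ymin, xmin, ymax, xmax]. Returns decoded [ymin, xmin, ymax, xmax] boxes.
  ycenter_a = (anchors[:, 0] + anchors[:, 2]) / 2.0
  xcenter_a = (anchors[:, 1] + anchors[:, 3]) / 2.0
  ha = anchors[:, 2] - anchors[:, 0]
  wa = anchors[:, 3] - anchors[:, 1]

  ty = rel_codes[:, 0] / scale_factors[0]
  tx = rel_codes[:, 1] / scale_factors[1]
  th = rel_codes[:, 2] / scale_factors[2]
  tw = rel_codes[:, 3] / scale_factors[3]

  ycenter = ty * ha + ycenter_a
  xcenter = tx * wa + xcenter_a
  h = tf.exp(th) * ha
  w = tf.exp(tw) * wa
  return tf.stack([ycenter - h / 2.0, xcenter - w / 2.0,
                   ycenter + h / 2.0, xcenter + w / 2.0], axis=1)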

detections = model.postprocess(prediction_dict)

That's it for this post!
