Tensorflow版本yolo v3源码阅读笔记(3)

了解了yolov3模型和损失函数的计算后，我们来分析yolov3训练和测试数据集的制作代码。

这里有必要说明一下，这里的数据集就是我们在训练yolov3神经网络时所需要的图片资源和相应的真实框标记。

数据集的制作流程如下：

1. 从标注文件中读取图片数据、标记的真实框坐标(左上角的x、y坐标和右下角的x、y坐标，共4个值)以及真实框的类别(用一个整数表示)。
1. 生成3个尺度的特征图标签：对每个真实框，在每个尺度的特征图上找到其中心点所在的格子，把该尺度的3个先验框放在这个格子的中心，分别与真实框计算iou值；iou值大于指定阈值(代码中为0.3)的先验框视为与真实框匹配，然后把真实框数据填充到这些先验框在特征图中对应的位置处。还有，把类别通过one-hot编码(并做标签平滑)，也填充到先验框在特征图中对应的位置处。如果3个尺度共9个先验框的iou值都没有超过阈值，则选取iou值最大的那个先验框来填充。
1. 生成3个尺度的真实框张量。在上一步中，我们可以找到每个尺度的特征图中和真实框匹配的先验框，这样，我们就可以把真实框保存进这个尺度的真实框张量中。
1. 数据基本已经制作完成了，返回3个尺度的特征图和3个尺度的真实框张量。

数据集的制作流程基本就是这样的，下面我们看具体的代码实现。

class Dataset(object):
    """Iterable that yields batches of images and YOLOv3 training targets.

    The annotation file, input sizes, batch size and augmentation switch are
    read from the ``cfg.TRAIN`` or ``cfg.TEST`` section depending on the split.
    """
    def __init__(self, dataset_type):
        """Read the configuration for one split and load its annotations.

        Args:
            dataset_type: ``'train'`` selects ``cfg.TRAIN``; any other value
                selects ``cfg.TEST``.
        """
        # Pick the config section once instead of branching per attribute.
        split_cfg = cfg.TRAIN if dataset_type == 'train' else cfg.TEST
        self.annot_path = split_cfg.ANNOT_PATH
        self.input_sizes = split_cfg.INPUT_SIZE
        self.batch_size = split_cfg.BATCH_SIZE
        self.data_aug = split_cfg.DATA_AUG

        # Candidate input sizes sampled in __next__; always taken from the
        # TRAIN section, regardless of the split.
        self.train_input_sizes = cfg.TRAIN.INPUT_SIZE
        # One stride per output scale; __next__ divides the input size by
        # these to obtain the output grid sizes.
        self.strides = np.array(cfg.YOLO.STRIDES)
        # Class names as returned by utils.read_class_names.
        self.classes = utils.read_class_names(cfg.YOLO.CLASSES)
        self.num_classes = len(self.classes)
        # Anchors as returned by utils.get_anchors; indexed by scale in
        # preprocess_true_boxes.
        self.anchors = np.array(utils.get_anchors(cfg.YOLO.ANCHORS))
        # Number of anchors per scale.
        self.anchor_per_scale = cfg.YOLO.ANCHOR_PER_SCALE
        # Capacity of the per-scale ground-truth box tensor.
        self.max_bbox_per_scale = 150

        self.annotations = self.load_annotations(dataset_type)
        self.num_samples = len(self.annotations)
        # Number of batches per epoch; the last batch may wrap around.
        self.num_batchs = int(np.ceil(self.num_samples / self.batch_size))
        # Index of the next batch to serve within the current epoch.
        self.batch_count = 0

load_annotations

    def load_annotations(self, dataset_type):
        with open(self.annot_path, 'r') as f:
            txt = f.readlines()
            annotations = [line.strip() for line in txt if len(line.strip().split()[1:]) != 0]
        np.random.shuffle(annotations)
        return annotations

    def __iter__(self):
        return self

    def __next__(self):
        """Assemble and return the next batch, or finish the epoch.

        Returns:
            tuple: ``(batch_image, (small, medium, large))``. ``batch_image``
            has shape ``(batch_size, size, size, 3)``. Each of the three
            targets is a pair ``(label, boxes)`` with ``label`` of shape
            ``(batch_size, grid, grid, anchor_per_scale, 5 + num_classes)``
            and ``boxes`` of shape ``(batch_size, max_bbox_per_scale, 4)``.
            All arrays are float32.

        Raises:
            StopIteration: Once ``num_batchs`` batches have been served. The
                batch counter is reset and the annotations are reshuffled
                before raising, so the object can be iterated again.
        """
        with tf.device('/cpu:0'):
            # One input resolution is drawn per call and shared by the batch.
            self.train_input_size = random.choice(self.train_input_sizes)
            # Output grid size of each scale (e.g. 416 // [8, 16, 32] gives
            # 52, 26, 13).
            self.train_output_sizes = self.train_input_size // self.strides

            if self.batch_count >= self.num_batchs:
                # Epoch finished: rewind and reshuffle for the next pass.
                self.batch_count = 0
                np.random.shuffle(self.annotations)
                raise StopIteration

            side = self.train_input_size
            batch_image = np.zeros((self.batch_size, side, side, 3), dtype=np.float32)

            # Per-scale label grids; last axis is 4 box values + 1 confidence
            # + num_classes class scores.
            channels = 5 + self.num_classes
            batch_labels = [
                np.zeros((self.batch_size, self.train_output_sizes[scale], self.train_output_sizes[scale],
                          self.anchor_per_scale, channels), dtype=np.float32)
                for scale in range(3)
            ]
            # Per-scale ground-truth box tensors.
            batch_boxes = [
                np.zeros((self.batch_size, self.max_bbox_per_scale, 4), dtype=np.float32)
                for _ in range(3)
            ]

            for slot in range(self.batch_size):
                index = self.batch_count * self.batch_size + slot
                # The last batch wraps around to the start of the annotations.
                if index >= self.num_samples:
                    index -= self.num_samples
                image, bboxes = self.parse_annotation(self.annotations[index])
                (label_s, label_m, label_l,
                 boxes_s, boxes_m, boxes_l) = self.preprocess_true_boxes(bboxes)

                batch_image[slot] = image
                for scale, label in enumerate((label_s, label_m, label_l)):
                    batch_labels[scale][slot] = label
                for scale, boxes in enumerate((boxes_s, boxes_m, boxes_l)):
                    batch_boxes[scale][slot] = boxes

            self.batch_count += 1
            # ((label_s, boxes_s), (label_m, boxes_m), (label_l, boxes_l))
            targets = tuple(zip(batch_labels, batch_boxes))
            return batch_image, targets

图片的一系列增强操作

    # 图片增强操作之-水平翻转
    def random_horizontal_flip(self, image, bboxes):
        """Mirror the image left-right with probability 0.5.

        Args:
            image: Array of shape ``(height, width, channels)``.
            bboxes: 2-D array whose columns 0 and 2 are the x-min and x-max of
                each box; updated in place when the flip is applied.

        Returns:
            tuple: ``(image, bboxes)``, flipped or untouched.
        """
        if random.random() >= 0.5:
            return image, bboxes

        _, width, _ = image.shape
        image = image[:, ::-1, :]
        # After mirroring, the old right edge becomes the new left edge and
        # vice versa, so keep a copy of x-min before overwriting it.
        old_x_min = bboxes[:, 0].copy()
        bboxes[:, 0] = width - bboxes[:, 2]
        bboxes[:, 2] = width - old_x_min
        return image, bboxes

     # 图片增强操作之-随机裁剪
    def random_crop(self, image, bboxes):
        """Crop the image with probability 0.5 while keeping every box intact.

        The crop window always contains the smallest rectangle enclosing all
        boxes; each side is pushed outwards from that rectangle by a random
        amount limited by the distance to the image border.

        Args:
            image: Array of shape ``(height, width, channels)``.
            bboxes: 2-D array whose first four columns are
                ``x_min, y_min, x_max, y_max``; shifted in place into the
                cropped coordinate frame when the crop is applied.

        Returns:
            tuple: ``(image, bboxes)``, cropped or untouched.
        """
        if random.random() < 0.5:
            h, w, _ = image.shape
            # Smallest rectangle enclosing all boxes: [x_min, y_min, x_max, y_max].
            max_bbox = np.concatenate([np.min(bboxes[:, 0:2], axis=0), np.max(bboxes[:, 2:4], axis=0)], axis=-1)

            # Free space between that rectangle and each image border.
            max_l_trans = max_bbox[0]
            max_u_trans = max_bbox[1]
            max_r_trans = w - max_bbox[2]
            max_d_trans = h - max_bbox[3]

            crop_xmin = max(0, int(max_bbox[0] - random.uniform(0, max_l_trans)))
            crop_ymin = max(0, int(max_bbox[1] - random.uniform(0, max_u_trans)))
            # Clamp to the image size with min(); max() here would always
            # yield w / h, so the right and bottom sides were never cropped.
            crop_xmax = min(w, int(max_bbox[2] + random.uniform(0, max_r_trans)))
            crop_ymax = min(h, int(max_bbox[3] + random.uniform(0, max_d_trans)))

            image = image[crop_ymin : crop_ymax, crop_xmin : crop_xmax]

            # Express the boxes relative to the new top-left corner.
            bboxes[:, [0, 2]] = bboxes[:, [0, 2]] - crop_xmin
            bboxes[:, [1, 3]] = bboxes[:, [1, 3]] - crop_ymin

        return image, bboxes

     # 图片增强操作之-随机移动
    def random_translate(self, image, bboxes):
        """Shift the image with probability 0.5 by a random offset.

        The offset range is derived from the free space between the rectangle
        enclosing all boxes and the image borders. The image is warped with
        ``cv2.warpAffine`` and keeps its original size.

        Args:
            image: Array of shape ``(height, width, channels)``.
            bboxes: 2-D array whose first four columns are
                ``x_min, y_min, x_max, y_max``; shifted in place when the
                translation is applied.

        Returns:
            tuple: ``(image, bboxes)``, translated or untouched.
        """
        if random.random() >= 0.5:
            return image, bboxes

        height, width, _ = image.shape
        # Extent of the rectangle enclosing every box.
        x_min, y_min = np.min(bboxes[:, 0:2], axis=0)
        x_max, y_max = np.max(bboxes[:, 2:4], axis=0)

        # Free space on each side of that rectangle.
        room_left = x_min
        room_up = y_min
        room_right = width - x_max
        room_down = height - y_max

        shift_x = random.uniform(-(room_left - 1), (room_right - 1))
        shift_y = random.uniform(-(room_up - 1), (room_down - 1))

        # 2x3 affine matrix describing a pure translation.
        affine = np.array([[1, 0, shift_x], [0, 1, shift_y]])
        image = cv2.warpAffine(image, affine, (width, height))

        # Plain assignment (not +=) so the float offset is cast to the dtype
        # of bboxes on write.
        bboxes[:, [0, 2]] = bboxes[:, [0, 2]] + shift_x
        bboxes[:, [1, 3]] = bboxes[:, [1, 3]] + shift_y
        return image, bboxes

parse_annotation

    #解析出图片信息和真实框信息
    def parse_annotation(self, annotation):

        line = annotation.split()
        image_path = line[0]
        if not os.path.exists(image_path):
            raise KeyError("%s does not exist ... " %image_path)
        image = cv2.imread(image_path)#读取图片信息
        bboxes = np.array([list(map(int, box.split(','))) for box in line[1:]])#将真实框数据由字符串型转换为整型

        # 一系列的数据增强操作
        if self.data_aug:
            image, bboxes = self.random_horizontal_flip(np.copy(image), np.copy(bboxes))
            image, bboxes = self.random_crop(np.copy(image), np.copy(bboxes))
            image, bboxes = self.random_translate(np.copy(image), np.copy(bboxes))

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image, bboxes = utils.image_preporcess(np.copy(image), [self.train_input_size, self.train_input_size], np.copy(bboxes))#将图片尺寸处理成神经网络需要的图片尺寸，相应的真实框的数据信息也要进行处理(主要是数据偏移处理)
        return image, bboxes

bbox_iou

    # 计算两个框框的iou值，处理方法与yolov3.py文件中的bbox_iou方法大致相同
    def bbox_iou(self, boxes1, boxes2):
        """Compute the IoU of boxes given as ``(x_center, y_center, w, h)``.

        Args:
            boxes1: Array-like whose last axis holds ``x, y, w, h``.
            boxes2: Array-like in the same format; the two inputs are combined
                with NumPy broadcasting.

        Returns:
            Array of intersection-over-union values with the broadcast shape
            of the inputs minus the last axis.
        """
        first = np.array(boxes1)
        second = np.array(boxes2)

        def to_corners(xywh):
            # (x, y, w, h) -> (x_min, y_min, x_max, y_max)
            center = xywh[..., :2]
            half_size = xywh[..., 2:] * 0.5
            return np.concatenate([center - half_size, center + half_size], axis=-1)

        area_first = first[..., 2] * first[..., 3]
        area_second = second[..., 2] * second[..., 3]

        corners_first = to_corners(first)
        corners_second = to_corners(second)

        # Corners of the overlap rectangle.
        top_left = np.maximum(corners_first[..., :2], corners_second[..., :2])
        bottom_right = np.minimum(corners_first[..., 2:], corners_second[..., 2:])

        # Negative extents mean the boxes do not overlap; clamp them to zero.
        overlap_size = np.maximum(bottom_right - top_left, 0.0)
        overlap_area = overlap_size[..., 0] * overlap_size[..., 1]
        union_area = area_first + area_second - overlap_area

        return overlap_area / union_area

preprocess_true_boxes

    def preprocess_true_boxes(self, bboxes):

        # 3个尺度的特征图信息
        label = [np.zeros((self.train_output_sizes[i], self.train_output_sizes[i], self.anchor_per_scale,
                           5 + self.num_classes)) for i in range(3)]
        # 3个尺度的真实框张量
        bboxes_xywh = [np.zeros((self.max_bbox_per_scale, 4)) for _ in range(3)]
        bbox_count = np.zeros((3,))

        for bbox in bboxes:#遍历真实框数组
            bbox_coor = bbox[:4]#取出每个真实框的坐标信息(2个左上角坐标+2个右下角坐标)
            bbox_class_ind = bbox[4]#每个真实框的类别

            #开始对类别进行one-hot编码
            onehot = np.zeros(self.num_classes, dtype=np.float)
            onehot[bbox_class_ind] = 1.0
            uniform_distribution = np.full(self.num_classes, 1.0 / self.num_classes)
            deta = 0.01
            #最终处理完成的类别one-hot编码
            smooth_onehot = onehot * (1 - deta) + deta * uniform_distribution

            #将真实框的坐标从（2个左上角坐标+2个右下角坐标）转换为（2个中心点坐标+2个高宽）
            bbox_xywh = np.concatenate([(bbox_coor[2:] + bbox_coor[:2]) * 0.5, bbox_coor[2:] - bbox_coor[:2]], axis=-1)
           #将真实框的坐标转换为特征图上的坐标，bbox_xywh的shape=[1,4]，self.strides的shape=[3,1]，于是bbox_xywh_scaled的shape=[1,3,4]，即计算出了每个真实框在3个尺度下的坐标信息
            bbox_xywh_scaled = 1.0 * bbox_xywh[np.newaxis, :] / self.strides[:, np.newaxis]

            iou = []#保存真实框与9个先验框计算的iou值
            exist_positive = False#真实框是否有相匹配的先验框的标志
            for i in range(3):#遍历3个尺度
                #每个尺度下有3个先验框
                anchors_xywh = np.zeros((self.anchor_per_scale, 4))
                #3个先验框的中心坐标，让它在真实框所在的格子的中心坐标上。
                anchors_xywh[:, 0:2] = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32) + 0.5
                #3个先验框的尺寸大小
                anchors_xywh[:, 2:4] = self.anchors[i]

                #计算真实框与这3个先验框的iou值
                iou_scale = self.bbox_iou(bbox_xywh_scaled[i][np.newaxis, :], anchors_xywh)
                #保存iou值
                iou.append(iou_scale)
                #找出符合条件的iou值
                iou_mask = iou_scale > 0.3

                if np.any(iou_mask):
                    #到这一步，说明有符合条件的先验框，于是找出真实框所在的格子坐标，即第几行几列
                    xind, yind = np.floor(bbox_xywh_scaled[i, 0:2]).astype(np.int32)

                    #首先清除下该尺度下对应格子的信息
                    label[i][yind, xind, iou_mask, :] = 0
                    #在该尺度的格子中填充进真实框坐标信息
                    label[i][yind, xind, iou_mask, 0:4] = bbox_xywh
                    #该尺度的格子中的置信度置为1，表明该格子中有检测到物体
                    label[i][yind, xind, iou_mask, 4:5] = 1.0
                    #填充该尺度的格子中的类别信息，就是前面已经做好的类别one-hot编码
                    label[i][yind, xind, iou_mask, 5:] = smooth_onehot

                    #找出该尺度下真实框张量中的索引
                    bbox_ind = int(bbox_count[i] % self.max_bbox_per_scale)
                    #在真实框张量的索引位置处填充进真实框信息
                    bboxes_xywh[i][bbox_ind, :4] = bbox_xywh
                    #该索引下的值+1
                    bbox_count[i] += 1

                    #表明有符合真实框要求的先验框
                    exist_positive = True

            if not exist_positive:
                #如果没有符合真实框要求的先验框，那就在9个iou值中找出最大的那个值的索引
                best_anchor_ind = np.argmax(np.array(iou).reshape(-1), axis=-1)
                #计算这个最大的先验框所在的尺度
                best_detect = int(best_anchor_ind / self.anchor_per_scale)
                #计算在该尺度下的哪个先验框
                best_anchor = int(best_anchor_ind % self.anchor_per_scale)
                #同样，找出在该尺度下的格子坐标，即第几行第几列
                xind, yind = np.floor(bbox_xywh_scaled[best_detect, 0:2]).astype(np.int32)

                #和上面的处理过程一样，首先清除该尺度下的对应格子信息
                label[best_detect][yind, xind, best_anchor, :] = 0
                #在该格子中填充真实框信息
                label[best_detect][yind, xind, best_anchor, 0:4] = bbox_xywh
                #把该格子的置信度置为1
                label[best_detect][yind, xind, best_anchor, 4:5] = 1.0
                #填充该格子的类别one-hot
                label[best_detect][yind, xind, best_anchor, 5:] = smooth_onehot

                #这也和上面一样，找出该尺度下的真实框张量的索引
                bbox_ind = int(bbox_count[best_detect] % self.max_bbox_per_scale)
               #把真实框信息填充到真实框张量的索引位置处
                bboxes_xywh[best_detect][bbox_ind, :4] = bbox_xywh
                #该尺度下的索引值+1
                bbox_count[best_detect] += 1
        #3个尺度的特征信息
        label_sbbox, label_mbbox, label_lbbox = label
        #3个尺度的真实框张量信息
        sbboxes, mbboxes, lbboxes = bboxes_xywh
        return label_sbbox, label_mbbox, label_lbbox, sbboxes, mbboxes, lbboxes

`__len__`

    # Return the number of batches per epoch (not the batch size)
    def __len__(self):
        return self.num_batchs

这就是yolov3数据集的制作流程和代码，下一篇我们来看yolov3是如何训练神经网络的。

Tensorflow版本yolo v3源码阅读笔记(3)

Tensorflow版本yolo v3源码阅读笔记(3)

load_annotations

图片的一系列增强操作

parse_annotation

bbox_iou

preprocess_true_boxes

len

相关阅读更多精彩内容

友情链接更多精彩内容