如何训练YOLOv3模型？pytorch代码实现

之前，我们介绍了什么是目标检测算法、介绍了YOLO的基础知识和网络模型，学会如何选取锚框和将图片resize。上一节我们介绍了如何准备yolo训练的数据集。这里我们将一起讨论如何训练YOLOV3模型。

我的其他笔记链接：

一、网络模型

YOLO模型主要包括了一个主干网络（darknet 53）和三个侦测头。

代码实现：

import torch 
import torch.nn as nn 

# 定义上采样
class UpSampleLayer(nn.Module):
    """Enlarge a feature map to twice its height and width.

    Uses nearest-neighbour interpolation, so the layer has no learnable
    parameters.
    """

    def __init__(self) -> None:
        super().__init__()
        self.layers = nn.Upsample(mode="nearest", scale_factor=2)

    def forward(self, x):
        """Return ``x`` with both spatial dimensions doubled."""
        upsampled = self.layers(x)
        return upsampled

# 定义卷积层
class ConvolutionalLayer(nn.Module):
    """Basic building block: Conv2d -> BatchNorm2d -> LeakyReLU(0.1).

    Args:
        i_c: number of input channels.
        o_c: number of output channels.
        k: convolution kernel size.
        s: convolution stride.
        p: convolution padding.
        bias: whether the convolution has a bias term (off by default, the
            batch-norm layer that follows has its own shift parameter).
    """

    def __init__(self,i_c,o_c,k,s=1,p=0,bias=False) -> None:
        super().__init__()
        conv = nn.Conv2d(i_c, o_c, kernel_size=k, stride=s, padding=p, bias=bias)
        norm = nn.BatchNorm2d(o_c)
        activation = nn.LeakyReLU(0.1)
        # Order inside the Sequential fixes the state-dict keys (layers.0, layers.1).
        self.layers = nn.Sequential(conv, norm, activation)

    def forward(self, x):
        """Apply convolution, batch normalisation and the activation to ``x``."""
        activated = self.layers(x)
        return activated

# 定义残差层
class ResidualLayer(nn.Module):
    """Residual block: 1x1 bottleneck then 3x3 convolution, added to the input.

    Args:
        i: number of input channels; the output has the same number.
    """

    def __init__(self, i) -> None:
        super().__init__()
        bottleneck = i // 2
        squeeze = ConvolutionalLayer(i, bottleneck, (1, 1), 1, 0)
        expand = ConvolutionalLayer(bottleneck, i, (3, 3), 1, 1)
        self.layers = nn.Sequential(squeeze, expand)

    def forward(self, x):
        """Return ``x`` plus the output of the two stacked convolutions."""
        return x + self.layers(x)

# 定义下采样层
class DownSampleLayer(nn.Module):
    """Downsample with a single 3x3 convolution block of stride 2 and padding 1.

    Args:
        i: number of input channels.
        o: number of output channels.
    """

    def __init__(self, i, o) -> None:
        super().__init__()
        strided_conv = ConvolutionalLayer(i, o, 3, 2, 1)
        self.layers = nn.Sequential(strided_conv)

    def forward(self, x):
        """Return the stride-2 convolution of ``x``."""
        reduced = self.layers(x)
        return reduced

# 定义卷积块
class ConcollutionalSetLayer(nn.Module):
    """Five stacked convolution blocks alternating 1x1 and 3x3 kernels.

    The channel count goes i -> o -> i -> o -> i -> o, so the block maps
    ``i`` input channels to ``o`` output channels at unchanged spatial size.

    Args:
        i: number of input channels (also the width of the 3x3 stages).
        o: number of output channels (also the width of the 1x1 stages).
    """

    def __init__(self,i,o) -> None:
        super().__init__()
        # (in_channels, out_channels, kernel, padding) per stage, stride is 1.
        stage_specs = (
            (i, o, 1, 0),
            (o, i, 3, 1),
            (i, o, 1, 0),
            (o, i, 3, 1),
            (i, o, 1, 0),
        )
        self.layers = nn.Sequential(
            *(ConvolutionalLayer(c_in, c_out, kernel, 1, pad)
              for c_in, c_out, kernel, pad in stage_specs)
        )

    def forward(self, x):
        """Run ``x`` through the five convolution blocks."""
        features = self.layers(x)
        return features

# Main network: Darknet-53 backbone plus three detection heads
class MainNet(nn.Module):
    """YOLOv3 network: a Darknet-53 style backbone followed by three detection heads.

    The 13/26/52 in the attribute names are the feature-map sizes produced
    for a 416x416 input (overall strides 32/16/8).  Each head ends in a 1x1
    convolution with 30 output channels.
    """

    def __init__(self) -> None:
        super().__init__()

        def residual_stack(channels, repeats):
            # Independent residual blocks, created in order so that module
            # indices inside the enclosing Sequential stay stable.
            return [ResidualLayer(channels) for _ in range(repeats)]

        # Backbone up to stride 8: residual stages repeated 1, 2 and 8 times.
        self.conv_52 = nn.Sequential(
            ConvolutionalLayer(3, 32, 3, 1, 1),
            DownSampleLayer(32, 64),
            *residual_stack(64, 1),
            DownSampleLayer(64, 128),
            *residual_stack(128, 2),
            DownSampleLayer(128, 256),
            *residual_stack(256, 8),
        )

        # Backbone stride 16: eight residual blocks at 512 channels.
        self.conv_26 = nn.Sequential(
            DownSampleLayer(256, 512),
            *residual_stack(512, 8),
        )

        # Backbone stride 32: four residual blocks at 1024 channels.
        self.conv_13 = nn.Sequential(
            DownSampleLayer(512, 1024),
            *residual_stack(1024, 4),
        )

        # Head on the coarsest feature map.
        self.convset_13 = nn.Sequential(ConcollutionalSetLayer(1024, 512))
        self.detection_13 = nn.Sequential(
            ConvolutionalLayer(512, 1024, 3, 1, 1),
            nn.Conv2d(1024, 30, 1, 1, 0),
        )

        # Head on the middle feature map.
        self.up_26 = nn.Sequential(
            ConvolutionalLayer(512, 256, 1, 1, 0),
            UpSampleLayer(),
        )
        # Input is the channel concat: 256 (upsampled) + 512 (backbone) = 768.
        self.convset_26 = nn.Sequential(ConcollutionalSetLayer(768, 256))
        self.detection_26 = nn.Sequential(
            ConvolutionalLayer(256, 512, 3, 1, 1),
            nn.Conv2d(512, 30, 1, 1, 0),
        )

        # Head on the finest feature map.
        self.up_52 = nn.Sequential(
            ConvolutionalLayer(256, 128, 1, 1, 0),
            UpSampleLayer(),
        )
        # Input is the channel concat: 128 (upsampled) + 256 (backbone) = 384.
        self.convset_52 = nn.Sequential(ConcollutionalSetLayer(384, 128))
        self.detection_52 = nn.Sequential(
            ConvolutionalLayer(128, 256, 3, 1, 1),
            nn.Conv2d(256, 30, 1, 1, 0),
        )

    def forward(self, x):
        """Return the three head outputs, coarsest feature map first.

        Returns:
            Tuple ``(detection_13, detection_26, detection_52)``, each with
            30 channels.
        """
        # Backbone features at three scales.
        feat_52 = self.conv_52(x)
        feat_26 = self.conv_26(feat_52)
        feat_13 = self.conv_13(feat_26)

        # Coarsest head.
        neck_13 = self.convset_13(feat_13)
        detection_13 = self.detection_13(neck_13)

        # Middle head: upsample the coarse neck and merge with the backbone.
        merged_26 = torch.cat((self.up_26(neck_13), feat_26), dim=1)
        neck_26 = self.convset_26(merged_26)
        detection_26 = self.detection_26(neck_26)

        # Finest head: same merge one level up.
        merged_52 = torch.cat((self.up_52(neck_26), feat_52), dim=1)
        neck_52 = self.convset_52(merged_52)
        detection_52 = self.detection_52(neck_52)

        return detection_13, detection_26, detection_52
    
if __name__ == "__main__":
    # Smoke test: push one random 416x416 RGB image through the network and
    # print the shape of each of the three head outputs.
    net = MainNet()
    data = torch.randn((1, 3, 416, 416))
    for head_output in net(data):
        print(head_output.shape)

二、数据集制作

数据集的制作过程已在上一篇文章中介绍，这里不再赘述。总体步骤如下：

提取XML格式的标签文件中的信息，并保存到TXT文件中
将图片进行resize，同时框也要跟着resize
构造数据集，继承 Dataset类，重写__init__, __len__, __getitem__方法
具体代码可以看上篇文章。

这里使用的数据集是百度AI开放的数据集，链接也附上：车辆识别 - 飞桨AI Studio (baidu.com)

一共有5类车辆，包括了：公交车、轿车、SUV、卡车、的士
共计：699张图片

三、训练YOLO v3

这一步就开始描述如何训练yolov3代码了。

（1）损失函数

训练置信度使用 BCEWithLogitsLoss
训练位置偏移量使用 MSELoss
训练分类概率使用 CrossEntropyLoss

（2）正负样本不均衡问题

yolo v3的标签数据中，有信息的部分是十分少的。所以数据中存在着正负样本严重不均衡的现象，负样本数量远超正样本数量。导致了yolo v3中的置信度难以训练。解决方案如下：

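$loss = \lambda \cdot loss_1 + (1-\lambda) \cdot loss_2$

其中 $loss_1$ 是正样本（有目标）的损失，$loss_2$ 是负样本（无目标）的置信度损失，$\lambda$ 是正样本损失的权重（下面的代码中取 0.9）。通过加大正样本损失的权重，来平衡数量悬殊的正负样本。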

另外，YOLO v3 网络较大、参数量多，所以训练时可以把批次大小（batch size）适当设小一些（下面的代码中设为 1）。

import torch 
from torch import nn, optim
from torch.utils.data import DataLoader 
from model.yolov3 import MainNet
from utils.dataset import Car_Dataset
import os 
import time 

class Trainer: # training driver
    """Train ``MainNet`` on ``Car_Dataset`` and checkpoint the weights.

    The checkpoint at ``self.save_path`` is loaded on construction when it
    exists, so an interrupted run resumes from the last saved weights.
    """

    def __init__(self) -> None:
        self.save_path = "./param/yolo.pt" # where the weights are saved
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.net = MainNet() # instantiate the network
        if os.path.exists(self.save_path):
            # Load to CPU first; train() moves the whole network to self.device.
            self.net.load_state_dict(torch.load(self.save_path, map_location='cpu'))
        self.data_loader = DataLoader(Car_Dataset(), shuffle=True, batch_size=1)
        # BCEWithLogitsLoss applies the sigmoid itself: feed it raw logits.
        self.conf_loss = nn.BCEWithLogitsLoss() # confidence loss
        self.offset_loss = nn.MSELoss() # box offset loss
        self.cls_loss = nn.CrossEntropyLoss() # classification loss
        self.optimzer = optim.Adam(self.net.parameters()) # network optimizer
        self.log_file_path = "./param/log.txt"

    def loss_fn(self, output, target, alpha):
        """Return the weighted loss of one detection head.

        Args:
            output: raw head output, laid out as (N, C, H, W).
            target: label tensor indexed as (N, H, W, 3, K); element 0 of the
                last axis is the confidence, 1:5 are the box offsets and 5:
                are the class entries (their argmax is the class index).
            alpha: weight of the positive-sample loss; negatives get
                ``1 - alpha``.

        Returns:
            Scalar tensor ``alpha * loss_obj + (1 - alpha) * loss_noobj``.
        """
        # (N, C, H, W) -> (N, H, W, C) -> (N, H, W, 3, C // 3) to match target.
        output = output.permute(0, 2, 3, 1)
        output = output.reshape(output.shape[0], output.shape[1], output.shape[2], 3, -1)

        mask_obj = target[..., 0] > 0     # positives: confidence label > 0
        mask_noobj = target[..., 0] == 0  # negatives: confidence label == 0

        # Mean-reduced losses return NaN on an empty selection, so a side
        # without samples contributes exactly zero instead.
        loss_obj = output.new_zeros(())
        if mask_obj.any():
            output_obj = output[mask_obj]
            target_obj = target[mask_obj]
            # Raw logits go in: the loss applies the sigmoid internally, and
            # applying it here as well would squash the input to (0.5, 0.73).
            loss_obj_conf = self.conf_loss(output_obj[:, 0], target_obj[:, 0])
            loss_obj_offset = self.offset_loss(output_obj[:, 1:5], target_obj[:, 1:5])
            loss_obj_cls = self.cls_loss(output_obj[:, 5:], torch.argmax(target_obj[:, 5:], dim=1))
            loss_obj = loss_obj_conf + loss_obj_offset + loss_obj_cls

        loss_noobj = output.new_zeros(())
        if mask_noobj.any():
            # Negatives are only trained on confidence.
            loss_noobj = self.conf_loss(output[mask_noobj][:, 0], target[mask_noobj][:, 0])

        return alpha * loss_obj + (1 - alpha) * loss_noobj

    def train(self, max_epochs=1000):
        """Run the training loop.

        Every 300 batches the current loss is printed, appended to the log
        file, and the weights are written to ``self.save_path``.  The weights
        are saved once more when training finishes.

        Args:
            max_epochs: number of passes over the data loader.
        """
        self.net.to(self.device)
        self.net.train()

        # Make sure the checkpoint/log directories exist before writing.
        for path in (self.save_path, self.log_file_path):
            os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

        # The context manager closes the log even on an exception or Ctrl-C.
        with open(self.log_file_path, "a", encoding="utf-8") as f:
            f.write("\n\n")

            for epochs in range(max_epochs):
                for i, (target_13, target_26, target_52, img) in enumerate(self.data_loader):
                    # Move the batch to the training device (no-op on CPU).
                    target_13 = target_13.to(self.device)
                    target_26 = target_26.to(self.device)
                    target_52 = target_52.to(self.device)
                    img = img.to(self.device)

                    out_13, out_26, out_52 = self.net(img)
                    # One loss per detection head, positives weighted 0.9.
                    loss_13 = self.loss_fn(out_13, target_13, 0.9)
                    loss_26 = self.loss_fn(out_26, target_26, 0.9)
                    loss_52 = self.loss_fn(out_52, target_52, 0.9)

                    loss = loss_13 + loss_26 + loss_52
                    self.optimzer.zero_grad()
                    loss.backward()
                    self.optimzer.step()

                    if i % 300 == 0:
                        strs = f"{time.strftime('%Y-%m-%d %H:%M:%S')} epoch {epochs} batch {i} loss:{loss.item()}"
                        print(strs)
                        f.write(strs + "\n")
                        torch.save(self.net.state_dict(), self.save_path)

        # Keep the updates made after the last periodic checkpoint.
        torch.save(self.net.state_dict(), self.save_path)

if __name__ == "__main__":
    # Script entry point: build the trainer and start training immediately.
    Trainer().train()

四、测试

代码实现，支持图片和视频两种测试

from model.yolov3 import * 
from utils import cfg, dataset, utils
import torch 
import torch.nn as nn 
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import os 
from matplotlib import pyplot as plt
import cv2
from torchvision.transforms import ToTensor
from train import Trainer

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Location of the trained weights.
param_path = "./param/yolo.pt"

class Detector(nn.Module):
    """Inference wrapper: run ``MainNet`` and decode its outputs into boxes."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.net = MainNet()
        # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
        self.net.load_state_dict(torch.load(param_path, map_location=device))
        self.net.to(device)
        self.net.eval()

    def forward(self, x, thresh, anchors):
        """Detect boxes in ``x``.

        Args:
            x: input image batch for the network.
            thresh: confidence threshold; only predictions whose sigmoid
                confidence exceeds it are kept.
            anchors: mapping with keys 13, 26 and 52, each giving the three
                (width, height) anchors of that head.

        Returns:
            Tensor of shape (n, 6) with rows ``[conf, x1, y1, x2, y2, cls]``,
            concatenated over the three heads.  The batch index is not part
            of the rows, so boxes of different images in a batch cannot be
            told apart.
        """
        # Pure inference: skip building the autograd graph.
        with torch.no_grad():
            out_13, out_26, out_52 = self.net(x)
            # (head output, stride of the head in input pixels, anchors key)
            heads = ((out_13, 32, 13), (out_26, 16, 26), (out_52, 8, 52))
            boxes = []
            for head_out, stride, key in heads:
                idxs, vecs = self._filter(head_out, thresh)
                boxes.append(self._parse(idxs, vecs, stride, anchors[key]))
            return torch.cat(boxes, dim=0)

    def _filter(self, output, thresh):
        """Keep the predictions whose sigmoid confidence exceeds ``thresh``.

        Returns:
            ``idxs``: (n, 4) indices ``(batch, row, col, anchor)`` of the kept
            predictions; ``vecs``: (n, K) raw vectors
            ``[conf logit, 4 box offsets, class scores...]``.
        """
        # (N, C, H, W) -> (N, H, W, C) -> (N, H, W, 3, C // 3)
        output = output.permute(0, 2, 3, 1)
        output = output.reshape(output.shape[0], output.shape[1], output.shape[2], 3, -1)
        mask = torch.sigmoid(output[..., 0]) > thresh  # (N, H, W, 3)

        idxs = mask.nonzero()
        vecs = output[mask, :]
        return idxs,vecs

    def _parse(self, idxs, vecs, t, anchors):
        """Decode filtered predictions into corner-format boxes.

        Args:
            idxs: (n, 4) indices as returned by ``_filter``.
            vecs: (n, K) raw prediction vectors as returned by ``_filter``.
            t: stride of the head, i.e. the size of one grid cell in pixels.
            anchors: the three (width, height) anchors of the head.

        Returns:
            (n, 6) tensor with rows ``[conf, x1, y1, x2, y2, cls]``.
        """
        # Build the anchors next to the predictions instead of relying on a
        # module-level device, so CPU and GPU tensors are never mixed.
        anchors = torch.as_tensor(anchors, dtype=vecs.dtype, device=vecs.device)
        a = idxs[:, 3]  # anchor index of every kept prediction
        confidence = torch.sigmoid(vecs[:, 0])  # logit -> probability

        _cls = vecs[:, 5:]
        if len(_cls) == 0:
            cls = vecs.new_zeros((0,))
        else:
            cls = torch.argmax(_cls, dim=1).float()

        # Centre = (grid cell index + predicted offset) * cell size;
        # size = anchor size * exp(predicted log-scale).
        cy = (idxs[:, 1].float() + vecs[:, 2]) * t
        cx = (idxs[:, 2].float() + vecs[:, 1]) * t
        w = anchors[a, 0] * torch.exp(vecs[:, 3])
        h = anchors[a, 1] * torch.exp(vecs[:, 4])

        x1 = cx - w/2
        y1 = cy - h/2
        x2 = cx + w/2
        y2 = cy + h/2

        out = torch.stack([confidence, x1, y1, x2, y2, cls], dim=1)
        return out
    
def predict_video(videopath,res_path):
    """Run detection on every frame of a video and write an annotated copy.

    Args:
        videopath: path of the input video.  Frames are fed to the network
            unchanged and the writer is opened for 416x416 frames, so the
            video is expected to be 416x416 already.
        res_path: path of the annotated output video (mp4v, 20 fps).
    """
    name = {
        0:"truck",
        1:"bus",
        2:"SUV",
        3:"taxi",
        4:"car"
    }
    color = {
        0:"red",
        1:"blueviolet",
        2:"blue",
        3:"green",
        4:"yellow"
    }
    font = ImageFont.truetype("simsun.ttc",18,encoding="unic")
    capture = cv2.VideoCapture(videopath)
    if not capture.isOpened():
        # Nothing to process: do not create an output file or load the model.
        capture.release()
        print(f"cannot open video: {videopath}")
        return

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(res_path, fourcc, 20, (416, 416), True)
    try:
        detector = Detector()
        counter = 0
        while True:
            if counter%100 == 0: print(">",end="")  # progress mark every 100 frames
            counter += 1
            ret, img = capture.read()
            if not ret : break # end of the video
            # OpenCV decodes to BGR; PIL and the network input use RGB here.
            src_img = Image.fromarray(cv2.cvtColor(img,cv2.COLOR_BGR2RGB))
            img = ToTensor()(src_img)
            img = img.unsqueeze(dim=0).to(device)

            with torch.no_grad():  # inference only
                out_value = detector(img, 0.3, cfg.ANCHORS_CROUP_KMEANS)
            out_value = out_value.cpu() # rows: [conf, x1, y1, x2, y2, cls]
            boxes = []

            for j in range(5): # one NMS pass per class
                cls_mask = (out_value[..., -1] == j)
                _boxes = out_value[cls_mask]

                boxes.extend(utils.NMS(_boxes, thre=0.5, isMin=True))

            img_draw = ImageDraw.Draw(src_img)  # one drawing context per frame
            for box in boxes:
                conf,x1,y1,x2,y2,cls = box
                conf = round(conf.item(),2)
                x1 = int(x1)
                y1 = int(y1)
                x2 = int(x2)
                y2 = int(y2)
                cls = int(cls)

                img_draw.rectangle((x1,y1,x2,y2),outline=color[cls], width=3)
                img_draw.text((x1+3,y1),f"{name[cls]}",fill=color[cls],font=font)
                img_draw.text((x1+50,y1),f":{conf}",fill=color[cls],font=font)
            # VideoWriter expects BGR frames: convert back from PIL's RGB,
            # otherwise red and blue are swapped in the output video.
            writer.write(cv2.cvtColor(np.array(src_img), cv2.COLOR_RGB2BGR))
    finally:
        # Releasing the writer finalises the output file.
        capture.release()
        writer.release()
    print("\nfinished!")

def predict_img(img_path, save_path):
    """Run detection on one image and save an annotated copy.

    Args:
        img_path: path of the input image.
        save_path: path the annotated 416x416 image is written to.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    name = {
        0:"truck",
        1:"bus",
        2:"SUV",
        3:"taxi",
        4:"car"
    }
    color = {
        0:"red",
        1:"blueviolet",
        2:"blue",
        3:"green",
        4:"yellow",
    }
    font = ImageFont.truetype("simsun.ttc",18,encoding="unic")

    # cv2.imread signals a missing/unreadable file by returning None.
    raw_img = cv2.imread(img_path)
    if raw_img is None:
        raise FileNotFoundError(f"cannot read image: {img_path}")

    detector = Detector()
    # NOTE(review): cv2.imread yields BGR and, unlike predict_video, no
    # BGR->RGB conversion is visible here; confirm whether utils.img_resize
    # converts, otherwise the channels seen by the model and saved are swapped.
    src_img = utils.img_resize(raw_img, (416,416))
    src_img = Image.fromarray(src_img)
    img = ToTensor()(src_img)
    img = img.unsqueeze(dim=0).to(device)

    with torch.no_grad():  # inference only
        out_value = detector(img, 0.25, cfg.ANCHORS_CROUP_KMEANS)
    out_value = out_value.cpu() # rows: [conf, x1, y1, x2, y2, cls]
    boxes = []

    for j in range(5): # one NMS pass per class
        cls_mask = (out_value[..., -1] == j)
        _boxes = out_value[cls_mask]

        boxes.extend(utils.NMS(_boxes, thre=0.4, isMin=True))

    img_draw = ImageDraw.Draw(src_img)  # one drawing context for all boxes
    for box in boxes:
        conf,x1,y1,x2,y2,cls = box
        conf = round(conf.item(),2)
        x1 = int(x1)
        y1 = int(y1)
        x2 = int(x2)
        y2 = int(y2)
        cls = int(cls)

        img_draw.rectangle((x1,y1,x2,y2),outline=color[cls], width=3)
        img_draw.text((x1+3,y1),f"{name[cls]}",fill=color[cls],font=font)
        img_draw.text((x1+50,y1),f":{conf}",fill=color[cls],font=font)
    src_img.save(save_path)

if __name__ == "__main__":
    print("--  TRAIN MODE --".center(50, "*"))
    # Step 1: train the model (uncomment to run).
    # trainer = Trainer() 
    # trainer.train()

    # Step 2: test the model.  The banner previously repeated "TRAIN MODE".
    print("--  TEST MODE --".center(50, "*"))
    img_path = "./img/3.jpg"
    save_path = "./img/result5.jpg"
    # Single-image test (uncomment to run instead of the video test).
    # predict_img(img_path,save_path)
    # quit()

    videopath = "./img/3.mp4"
    out_path = "./img/OUT.mp4"
    res_path = "./img/result3.mp4"

    # Convert the video to the required format once; reuse it on later runs.
    if not os.path.exists(out_path):
        utils.viodeo_resize(videopath, out_path)

    predict_video(out_path, res_path)

我训练了三天，效果并不是很好，很多情况下仍然会出现漏检（识别不到）和误检（识别错误）。可能是训练时间不够，也可能是数据集不够丰富。下面是挑选出的几张网络输出结果。