argparse模块
argparse 是 Python 内置的一个用于命令行选项与参数解析的模块,通过在程序中定义好我们需要的参数,argparse 就会从命令行(sys.argv)中把这些参数解析出来。
1.解析器的创建argparse.ArgumentParser
parser=argparse.ArgumentParser(description='YOLO Detection')
2.添加参数.add_argument
parser.add_argument('-v', '--version', default='yolo',help='yolo.')
这句话的意思是:给 parser 添加一个 -v/--version 选项,命令行不指定时默认值为 'yolo'(解析后通过 args.version 读取)。
参数列表ArgumentParser.add_argument(name or flags...[, action][, nargs][, const][, default][, type][, choices][, required][, help][, metavar][, dest])
name or flags - 选项字符串的名字或者列表,例如 foo 或者 -f, --foo。
action - 命令行遇到参数时的动作,默认值是 store。
store_const,表示赋值为const;
append,将遇到的值存储成列表,也就是如果参数重复则会保存多个值;
append_const,将参数规范中定义的一个值保存到一个列表;
count,存储遇到的次数;此外,也可以继承 argparse.Action 自定义参数解析;
nargs - 应该读取的命令行参数个数,可以是具体的数字,或者是?号,当不指定值时对于 Positional argument 使用 default,对于 Optional argument 使用 const;或者是 * 号,表示 0 或多个参数;或者是 + 号表示 1 或多个参数。
const - action 和 nargs 所需要的常量值。
default - 不指定参数时的默认值。
type - 命令行参数应该被转换成的类型。
choices - 参数可允许的值的一个容器。
required - 可选参数是否可以省略 (仅针对可选参数)。
help - 参数的帮助信息,当指定为 argparse.SUPPRESS 时表示不显示该参数的帮助信息.
metavar - 在 usage 说明中的参数名称,对于必选参数默认就是参数名称,对于可选参数默认是全大写的参数名称.
dest - 解析后的参数名称,默认情况下,对于可选参数选取第一个长选项名(去掉开头的 --;没有长选项时取第一个短选项名),中划线转换为下划线.
3.实例化parser.parse_args()
def parse_args():
    """Parse the command line with the module-level ``parser``.

    Returns:
        The namespace object produced by ``parser.parse_args()``.
    """
    parsed = parser.parse_args()
    return parsed
def train():
args = parse_args()
随机数种子
设置随机数种子,在gpu或cpu上固定每一次的训练结果,随机数种子seed确定时,模型的训练结果将始终保持一致。
下面的语句分别设置 torch(CPU)、所有 GPU、numpy 和 Python random 的随机数种子;torch.backends.cudnn.deterministic 不是种子,而是 cuDNN 的一个开关,将它置为True的话,每次返回的卷积算法将是确定的
def setup_seed(seed):
    """Seed every random number generator used by training.

    Args:
        seed: Integer seed applied to PyTorch (CPU and all CUDA devices),
            NumPy and Python's ``random`` module.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    # Restrict cuDNN to deterministic convolution algorithms.
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the algorithm per input shape and may pick a
    # different one from run to run, which defeats the seeding above.
    torch.backends.cudnn.benchmark = False
setup_seed(20)
调用cuda
if args.cuda:
print('use cuda')
cudnn.benchmark = True
device = torch.device("cuda")
else:
print('use cpu')
device = torch.device("cpu")
显存不够,跑不了gpu,是用cpu跑的
数据读取
dataset = VOCDetection(root=VOC_ROOT, img_size=input_size[0],
transform=SSDAugmentation(input_size),
mosaic=args.mosaic)
VOCDetection 是从 voc0712 中调用(导入)的数据集类
网络模型定义
yolo_net = myYOLO(device, input_size=input_size, num_classes=args.num_classes, trainable=True, hr=hr)
class myYOLO(nn.Module):
self.backbone = resnet18(pretrained=True)
# neck
self.SPP = nn.Sequential(
Conv(512, 256, k=1),
SPP(),
BottleneckCSP(256*4, 512, n=1, shortcut=False)
)
self.SAM = SAM(512)
self.conv_set = BottleneckCSP(512, 512, n=3, shortcut=False)
self.pred = nn.Conv2d(512, 1 + self.num_classes + 4, 1)
骨干网络是resnet18,预训练权重是从 https://download.pytorch.org/models/resnet18-5c106cde.pth 下载的
neck部分是spp(Spatial Pyramid Pooling,空间金字塔池化)和Bottleneck瓶颈层
SPPNet的能够接受任意尺寸图片的输入
Bottleneck使用的是1*1的卷积神经网络,可以大幅减少计算量
然后用二维卷积预测.
用sgd做优化器,epoch = 10,最大学习率0.001
开始训练
动态学习率,在训练伊始学习率很低,然后慢慢提到最大学习率0.001
if args.cos and epoch > 20 and epoch <= max_epoch - 20:
# use cos lr
tmp_lr = 0.00001 + 0.5*(base_lr-0.00001)*(1+math.cos(math.pi*(epoch-20)*1./ (max_epoch-20)))
set_lr(optimizer, tmp_lr)
elif args.cos and epoch > max_epoch - 20:
tmp_lr = 0.00001
set_lr(optimizer, tmp_lr)
else:
if epoch in cfg['lr_epoch']:
tmp_lr = tmp_lr * 0.1
set_lr(optimizer, tmp_lr)
for iter_i, (images, targets) in enumerate(data_loader):
# WarmUp strategy for learning rate
if not args.no_warm_up:
if epoch < args.wp_epoch:
tmp_lr = base_lr * pow((iter_i+epoch*epoch_size)*1. / (args.wp_epoch*epoch_size), 4)
# tmp_lr = 1e-6 + (base_lr-1e-6) * (iter_i+epoch*epoch_size) / (epoch_size * (args.wp_epoch))
set_lr(optimizer, tmp_lr)
elif epoch == args.wp_epoch and iter_i == 0:
tmp_lr = base_lr
set_lr(optimizer, tmp_lr)
Multi-scale Training多尺度训练:每隔10个iteration从320~608(32的倍数)中随机选择一个尺度,把输入图像插值到该尺度后再进行训练。
if iter_i % 10 == 0 and iter_i > 0 and args.multi_scale:
# randomly choose a new size
size = random.randint(10, 19) * 32
input_size = [size, size]
model.set_grid(input_size)
if args.multi_scale:
# interpolate
images = torch.nn.functional.interpolate(images, size=input_size, mode='bilinear', align_corners=False)
在降低batch和epoch后终于可以成功运行了。跑了两天最后得到了模型,在测试的时候ap一直是-1,可能测试集设置得不对,测试的代码还没看懂,周末继续。
resnet18代码阅读
残差块BasicBlock和Bottleneck
两个差不多,一个是两层卷积一个是三层
class BasicBlock(nn.Module):
    """Residual block made of two convolution + BatchNorm stages.

    The block output is ``relu(F(x) + shortcut)``, where ``F`` is the
    conv-BN-ReLU-conv-BN branch and ``shortcut`` is either the input itself
    or ``downsample(x)`` when a ``downsample`` module is supplied.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        """Create the layers of the block.

        Args:
            inplanes: Channel argument handed to the first ``conv3x3``.
            planes: Channel count of both BatchNorm layers and of the second
                ``conv3x3``.
            stride: Stride handed to the first ``conv3x3`` only.
            downsample: Optional module applied to the input before the
                addition; ``None`` means the input is added unchanged.
        """
        super(BasicBlock, self).__init__()
        # Only the first stage receives the stride.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut)``."""
        residual = self.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(residual))
        # When the two branches would not match, the caller-provided
        # ``downsample`` module transforms the input before the addition.
        shortcut = x if self.downsample is None else self.downsample(x)
        residual += shortcut  # H(x) = F(x) + x, or H(x) = F(x) + W x
        return self.relu(residual)
感觉原理很简单,就是在最后一层的relu之前,把卷积分支的输出再加上原始输入(输入可能先经过downsample变换,即可能有权重)
当有1x1卷积核的时候,我们叫bottleneck,当没有1x1卷积核时,我们称其为BasicBlock
第一阶段
class ResNet(nn.Module):
    def __init__(self, block, layers, zero_init_residual=False):
        """Create the stem layers of the backbone.

        Args:
            block: Residual block class handed to ``_make_layer``.
            layers: Indexable with four entries; ``layers[i]`` is the number
                of blocks in stage ``i + 1``.
            zero_init_residual: When true, the last BatchNorm weight of every
                residual block is initialised to 0.
        """
        super(ResNet, self).__init__()
        # Channel count fed into the next stage; _make_layer updates it.
        self.inplanes = 64
        # Stem: 7x7 stride-2 convolution, BatchNorm, ReLU, 3x3 stride-2 max pool.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
余下4个阶段,这里的_make_layer是ResNet内部编写的函数,下面会说
# Four residual stages with 64/128/256/512 base channels; stages 2-4 pass
# stride=2 to _make_layer, stage 1 keeps the default stride of 1.
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
何凯明提出了针对于relu的初始化方法,这个部分就是对每一层进行初始化,详解看注释
for m in self.modules():  # visit every module registered on the network
    if isinstance(m, nn.Conv2d):  # convolutions: Kaiming (He) normal init, fan_out mode, for ReLU
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
    elif isinstance(m, nn.BatchNorm2d):  # BatchNorm layers start with scale 1 and shift 0
        nn.init.constant_(m.weight, 1)  # scale (weight) = 1
        nn.init.constant_(m.bias, 0)  # shift (bias) = 0
如果zero_init_residual为True,就把每个残差分支中最后一个BN的权重初始化为0(Bottleneck是bn3,BasicBlock是bn2),这样残差分支一开始输出为0,Bottleneck和BasicBlock分别处理
if zero_init_residual:  # optional flag: zero the last BatchNorm weight of every residual block
    for m in self.modules():
        if isinstance(m, Bottleneck):
            nn.init.constant_(m.bn3.weight, 0)  # bn3 is presumably Bottleneck's final BatchNorm (class not shown here)
        elif isinstance(m, BasicBlock):
            nn.init.constant_(m.bn2.weight, 0)  # bn2 is the final BatchNorm of BasicBlock
构建一个阶段(layer):block参数指定使用两层残差块(BasicBlock)或三层残差块(Bottleneck),planes参数为残差块的基准通道数(该阶段的输出通道数为planes * block.expansion),blocks说明该阶段有几个残差块
def _make_layer(self, block, planes, blocks, stride=1):
    """Build one stage as an ``nn.Sequential`` of ``blocks`` residual blocks.

    Args:
        block: Residual block class; must expose an ``expansion`` attribute.
        planes: Base channel count; the stage outputs
            ``planes * block.expansion`` channels.
        blocks: Number of residual blocks in the stage.
        stride: Stride given to the first block only.

    Returns:
        ``nn.Sequential`` holding the blocks in order.

    Side effects:
        Sets ``self.inplanes`` to ``planes * block.expansion``.
    """
    out_channels = planes * block.expansion

    # The first block needs a projection on its shortcut whenever it changes
    # the stride or the channel count; otherwise the input is added as is.
    # conv1x1 is defined elsewhere in the file (presumably a 1x1 convolution).
    if stride != 1 or self.inplanes != out_channels:
        projection = nn.Sequential(
            conv1x1(self.inplanes, out_channels, stride),
            nn.BatchNorm2d(out_channels),
        )
    else:
        projection = None

    first_block = block(self.inplanes, planes, stride, projection)
    self.inplanes = out_channels
    remaining = [block(self.inplanes, planes) for _ in range(1, blocks)]
    return nn.Sequential(first_block, *remaining)
从forward里最能直观的观察网络的结构了
def forward(self, x):
    """Run the backbone and return the outputs of the last three stages.

    The input goes through conv1 -> bn1 -> relu -> maxpool and then through
    layer1..layer4 in sequence.

    Returns:
        Tuple ``(layer2_out, layer3_out, layer4_out)``.
    """
    current = self.maxpool(self.relu(self.bn1(self.conv1(x))))
    stage_outputs = []
    for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
        current = stage(current)
        stage_outputs.append(current)
    # The layer1 output only feeds layer2 and is not returned.
    return tuple(stage_outputs[1:])
resnet共有五个阶段,其中第一阶段为一个7x7的卷积处理,stride为2,然后经过池化处理,此时特征图的尺寸已成为输入的1/4,接下来是四个阶段,也就是代码中的layer1234
layer2、layer3、layer4只在各自的第一个BasicBlock进行downsample下采样,layer1没有下采样。每个layer包含2个BasicBlock,1个BasicBlock中有2次卷积。
def resnet18(pretrained=False, **kwargs):
    """Construct a ResNet-18 model.

    Args:
        pretrained (bool): If True, load the weights found at
            ``model_urls['resnet18']`` (pre-trained on ImageNet) into the model.
        **kwargs: Forwarded unchanged to the ``ResNet`` constructor.

    Returns:
        The ``ResNet`` instance built from ``BasicBlock`` with two blocks in
        each of the four stages.
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        state_dict = model_zoo.load_url(model_urls['resnet18'])
        # strict=False: this backbone defines no classifier (fc) layer, while
        # the published ResNet-18 checkpoint carries fc.* weights; a strict
        # load would raise on those unexpected keys.
        model.load_state_dict(state_dict, strict=False)
    return model
ResNet的参数很直接 def __init__(self, block, layers, num_classes=1000, zero_init_residual=False):参数block指明残差块是两层或三层,参数layers指明每个阶段(layer)需要的残差块数量,num_classes指明分类数,zero_init_residual指明是否把每个残差分支最后一个BN的权重初始化为0,其中resnet18中[2, 2, 2, 2]对应的就是layer1234的参数,resnet18的block是BasicBlock,resnet50以上的网络使用了Bottleneck。
yolo检测器
先要提前定义几个模块
1.空间注意力
class SAM(nn.Module):
    """Attention gate (labelled "Parallel CBAM" / spatial attention upstream).

    A 1x1 convolution followed by a sigmoid produces a weight for every
    element of the input, and the input is multiplied by those weights.
    """

    def __init__(self, in_ch):
        """Create the gate; ``in_ch`` is both its input and output channel count."""
        super(SAM, self).__init__()
        gate_layers = [nn.Conv2d(in_ch, in_ch, 1), nn.Sigmoid()]
        self.conv = nn.Sequential(*gate_layers)

    def forward(self, x):
        """Return ``x`` scaled element-wise by its attention weights."""
        weights = self.conv(x)
        gated = x * weights
        return gated
从x * x_attention很直观的就能看出,就是来算feature map权重x_attention
2.空间金字塔
class SPP(nn.Module):
    """Spatial Pyramid Pooling.

    Concatenates the input with stride-1 max-pooled copies of itself along
    dimension 1. Each pooling uses padding ``k // 2``, so for odd ``k`` the
    spatial size is unchanged and the results can be concatenated.

    Args:
        kernel_sizes: Odd positive pooling window sizes. The default
            ``(5, 9, 13)`` gives the original behavior, where the output has
            four times as many channels as the input.

    Raises:
        ValueError: If a kernel size is not a positive odd number.
    """

    # Class-level default so that instances created before this attribute
    # existed (e.g. unpickled models) still pool with the original sizes.
    kernel_sizes = (5, 9, 13)

    def __init__(self, kernel_sizes=(5, 9, 13)):
        super(SPP, self).__init__()
        kernel_sizes = tuple(kernel_sizes)
        for k in kernel_sizes:
            # An even window with padding k // 2 grows the output by one
            # pixel, which would break the concatenation in forward.
            if k < 1 or k % 2 == 0:
                raise ValueError(
                    "SPP kernel sizes must be positive odd numbers, got %r" % (k,)
                )
        self.kernel_sizes = kernel_sizes

    def forward(self, x):
        """Return ``x`` concatenated with its pooled copies along dim 1.

        Assumes ``x`` is laid out as (N, C, H, W) -- TODO confirm against callers.
        """
        pooled = [
            torch.nn.functional.max_pool2d(x, k, stride=1, padding=k // 2)
            for k in self.kernel_sizes
        ]
        return torch.cat([x] + pooled, dim=1)
先算三个尺度(5、9、13)下的最大池化,然后用torch.cat(..., dim=1)按第1维(通道维)把原始输入和三个池化结果拼接起来
3.BottleneckCSP CrossStagePartial跨阶段瓶颈
class BottleneckCSP(nn.Module):
    """CSP bottleneck, after https://github.com/WongKinYiu/CrossStagePartialNetworks.

    Only the layer construction is present in this excerpt.
    """

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        """Create the layers of the module.

        Args:
            c1: Input channels.
            c2: Output channels.
            n: Number of stacked ``Bottleneck`` units.
            shortcut: Forwarded to every ``Bottleneck``.
            g: Groups, forwarded to every ``Bottleneck``.
            e: Expansion ratio; the hidden width is ``int(c2 * e)``.
        """
        super(BottleneckCSP, self).__init__()
        hidden = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, hidden, k=1)
        self.cv2 = nn.Conv2d(c1, hidden, kernel_size=1, bias=False)
        self.cv3 = nn.Conv2d(hidden, hidden, kernel_size=1, bias=False)
        self.cv4 = Conv(2 * hidden, c2, k=1)
        # Sized for two hidden-width tensors; per the upstream comment it is
        # applied to cat(cv2, cv3).
        self.bn = nn.BatchNorm2d(2 * hidden)
        self.act = nn.LeakyReLU(0.1, inplace=True)
        units = (Bottleneck(hidden, hidden, shortcut, g, e=1.0) for _ in range(n))
        self.m = nn.Sequential(*units)
这个模块的作者是yolov4和cspnet的作者,
本文中YOLO结构
class myYOLO(nn.Module):
    """YOLO-style detector: ResNet-18 backbone, SPP neck, SAM, CSP block and a 1x1 prediction conv.

    Args:
        device: Device on which the ``scale_torch`` tensor is created.
        input_size: Two-element sequence describing the input resolution. It
            is indexed in ``__init__``, so it must be provided despite the
            ``None`` default.
        num_classes: Number of object classes.
        trainable: When true ``forward`` returns losses; otherwise detections.
        conf_thresh: Stored on the instance (presumably used by
            ``postprocess``, which is defined outside this excerpt).
        nms_thresh: Stored on the instance (presumably used by
            ``postprocess``).
        hr: Accepted but not used by the code shown here.
    """

    def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.001, nms_thresh=0.5, hr=False):
        super(myYOLO, self).__init__()
        self.device = device
        self.num_classes = num_classes
        self.trainable = trainable
        self.conf_thresh = conf_thresh
        self.nms_thresh = nms_thresh
        self.stride = 32
        # create_grid is defined outside this excerpt; presumably it builds
        # the per-cell grid for the given input size -- TODO confirm.
        self.grid_cell = self.create_grid(input_size)
        self.input_size = input_size
        # [[[s1, s0, s1, s0]]]: forward divides decoded boxes by this before
        # clamping them to [0, 1].
        self.scale = np.array([[[input_size[1], input_size[0], input_size[1], input_size[0]]]])
        self.scale_torch = torch.tensor(self.scale.copy(), device=device).float()

        # we use resnet18 as backbone
        self.backbone = resnet18(pretrained=True)

        # neck: SPP concatenates four 256-channel tensors, hence 256*4 inputs
        self.SPP = nn.Sequential(
            Conv(512, 256, k=1),
            SPP(),
            BottleneckCSP(256*4, 512, n=1, shortcut=False)
        )
        self.SAM = SAM(512)
        self.conv_set = BottleneckCSP(512, 512, n=3, shortcut=False)

        # Per location: 1 objectness + num_classes class scores + 4 box terms.
        self.pred = nn.Conv2d(512, 1 + self.num_classes + 4, 1)

    def forward(self, x, target=None):
        """Run the detector.

        Args:
            x: Input batch fed to the backbone.
            target: Labels handed to ``tools.loss`` in training mode.

        Returns:
            In test mode (``trainable`` false): ``(bboxes, scores, cls_inds)``
            from ``postprocess`` for the first image of the batch.
            In training mode: ``(conf_loss, cls_loss, txtytwth_loss,
            total_loss)`` from ``tools.loss``.
        """
        # backbone: only the last stage's feature map is used
        _, _, C_5 = self.backbone(x)

        # head
        C_5 = self.SPP(C_5)
        C_5 = self.SAM(C_5)
        C_5 = self.conv_set(C_5)

        # pred: [B, C, H, W] -> [B, C, H*W] -> [B, H*W, C]
        prediction = self.pred(C_5)
        prediction = prediction.view(C_5.size(0), 1 + self.num_classes + 4, -1).permute(0, 2, 1)

        # Divide prediction to obj_pred, txtytwth_pred and cls_pred
        # [B, H*W, 1]
        conf_pred = prediction[:, :, :1]
        # [B, H*W, num_cls]
        cls_pred = prediction[:, :, 1 : 1 + self.num_classes]
        # [B, H*W, 4]
        txtytwth_pred = prediction[:, :, 1 + self.num_classes:]

        # test
        if not self.trainable:
            with torch.no_grad():
                # batch size = 1: index 0 selects the only image in the batch
                all_conf = torch.sigmoid(conf_pred)[0]
                # Boxes are divided by the input scale and clamped to [0, 1].
                all_bbox = torch.clamp((self.decode_boxes(txtytwth_pred) / self.scale_torch)[0], 0., 1.)
                # Class score = softmax class probability * objectness.
                all_class = (torch.softmax(cls_pred[0, :, :], 1) * all_conf)

                # postprocess works on NumPy arrays
                all_class = all_class.to('cpu').numpy()
                all_bbox = all_bbox.to('cpu').numpy()

                bboxes, scores, cls_inds = self.postprocess(all_bbox, all_class)
                return bboxes, scores, cls_inds
        else:
            conf_loss, cls_loss, txtytwth_loss, total_loss = tools.loss(pred_conf=conf_pred, pred_cls=cls_pred,
                                                                        pred_txtytwth=txtytwth_pred,
                                                                        label=target)
            return conf_loss, cls_loss, txtytwth_loss, total_loss