Let's start with the YOLOv3 architecture diagram.
The code walked through below comes from: eriklindernoren/PyTorch-YOLOv3
# 1. Data Processing Module
## utils/datasets.py
```python
import os
import random

import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset

from utils.augmentations import horisontal_flip
# pad_to_square() and resize() are defined earlier in this same file


class ListDataset(Dataset):
    def __init__(self, list_path, img_size=416, augment=True, multiscale=True, normalized_labels=True):
        with open(list_path, "r") as file:
            self.img_files = file.readlines()
        self.label_files = [
            path.replace("images", "labels").replace(".png", ".txt").replace(".jpg", ".txt")
            for path in self.img_files
        ]
        self.img_size = img_size
        self.max_objects = 100
        self.augment = augment
        self.multiscale = multiscale
        self.normalized_labels = normalized_labels
        self.min_size = self.img_size - 3 * 32
        self.max_size = self.img_size + 3 * 32
        self.batch_count = 0

    def __getitem__(self, index):
        # ---------
        #  Image
        # ---------
        img_path = self.img_files[index % len(self.img_files)].rstrip()
        # Author's local dataset root
        img_path = '/data/humaocheng/object_detection/yolov3/PyTorch-YOLOv3/data/coco' + img_path
        # Extract image as PyTorch tensor
        img = transforms.ToTensor()(Image.open(img_path).convert('RGB'))
        # Handle images with less than three channels
        if len(img.shape) != 3:
            img = img.unsqueeze(0)
            img = img.expand((3, *img.shape[1:]))  # fixed: unpack H, W (the repo's original line passes a nested tuple and fails)
        _, h, w = img.shape
        h_factor, w_factor = (h, w) if self.normalized_labels else (1, 1)
        # Pad to square resolution
        img, pad = pad_to_square(img, 0)
        _, padded_h, padded_w = img.shape

        # ---------
        #  Label
        # ---------
        label_path = self.label_files[index % len(self.img_files)].rstrip()
        label_path = '/data/humaocheng/object_detection/yolov3/PyTorch-YOLOv3/data/coco' + label_path

        targets = None
        if os.path.exists(label_path):
            boxes = torch.from_numpy(np.loadtxt(label_path).reshape(-1, 5))
            # Extract coordinates for unpadded + unscaled image
            x1 = w_factor * (boxes[:, 1] - boxes[:, 3] / 2)
            y1 = h_factor * (boxes[:, 2] - boxes[:, 4] / 2)
            x2 = w_factor * (boxes[:, 1] + boxes[:, 3] / 2)
            y2 = h_factor * (boxes[:, 2] + boxes[:, 4] / 2)
            # Adjust for added padding
            x1 += pad[0]
            y1 += pad[2]
            x2 += pad[1]
            y2 += pad[3]
            # Returns (x, y, w, h)
            boxes[:, 1] = ((x1 + x2) / 2) / padded_w
            boxes[:, 2] = ((y1 + y2) / 2) / padded_h
            boxes[:, 3] *= w_factor / padded_w
            boxes[:, 4] *= h_factor / padded_h

            targets = torch.zeros((len(boxes), 6))
            targets[:, 1:] = boxes

        # Apply augmentations
        if self.augment:
            if np.random.random() < 0.5:
                img, targets = horisontal_flip(img, targets)

        return img_path, img, targets

    def collate_fn(self, batch):
        paths, imgs, targets = list(zip(*batch))
        # Remove empty placeholder targets
        targets = [boxes for boxes in targets if boxes is not None]
        # Add sample index to targets
        for i, boxes in enumerate(targets):
            boxes[:, 0] = i
        targets = torch.cat(targets, 0)
        # Select a new image size every tenth batch
        if self.multiscale and self.batch_count % 10 == 0:
            self.img_size = random.choice(range(self.min_size, self.max_size + 1, 32))
        # Resize images to input shape
        imgs = torch.stack([resize(img, self.img_size) for img in imgs])
        self.batch_count += 1
        return paths, imgs, targets

    def __len__(self):
        return len(self.img_files)
```
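A hedged usage sketch (the list-file path below is an assumption; in the repo it comes from the data config, and note that the class as written above also prepends the author's hard-coded dataset root):

```python
from torch.utils.data import DataLoader

# Hypothetical list file, e.g. one line per image path
dataset = ListDataset("data/coco/trainvalno5k.txt", img_size=416, augment=True, multiscale=True)
loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=dataset.collate_fn)

paths, imgs, targets = next(iter(loader))
print(imgs.shape)     # e.g. torch.Size([4, 3, 416, 416]); varies with multiscale
print(targets.shape)  # [num_boxes_in_batch, 6]: (batch_idx, class, x, y, w, h)
```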
From the code above, after padding and augmentation we obtain the transformed image `img` (shape `[3, 640, 640]` in this example) together with its label `targets` (shape `[1, 6]`). Of the 6 values per row, 4 are box coordinates, 1 is the class, and 1 is a placeholder for the batch index, which `collate_fn` fills in later.

Note that every 10 batches the input size changes:
```python
# Select a new image size every tenth batch
if self.multiscale and self.batch_count % 10 == 0:
    self.img_size = random.choice(range(self.min_size, self.max_size + 1, 32))
```
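As a sketch, for the default `img_size=416` the candidate sizes are `416 ± 3·32`, stepped by 32:

```python
import random

img_size = 416
sizes = list(range(img_size - 3 * 32, img_size + 3 * 32 + 1, 32))
print(sizes)                 # [320, 352, 384, 416, 448, 480, 512]
print(random.choice(sizes))  # a new input size is drawn like this every 10 batches
```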
# 2. Model Module
```python
from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from utils.parse_config import *
from utils.utils import build_targets, to_cpu, non_max_suppression

import matplotlib.pyplot as plt
import matplotlib.patches as patches


def create_modules(module_defs):
    """
    Constructs module list of layer blocks from module configuration in module_defs
    """
    hyperparams = module_defs.pop(0)
    output_filters = [int(hyperparams["channels"])]
    module_list = nn.ModuleList()
    for module_i, module_def in enumerate(module_defs):
        modules = nn.Sequential()

        if module_def["type"] == "convolutional":
            bn = int(module_def["batch_normalize"])
            filters = int(module_def["filters"])
            kernel_size = int(module_def["size"])
            pad = (kernel_size - 1) // 2
            modules.add_module(
                f"conv_{module_i}",
                nn.Conv2d(
                    in_channels=output_filters[-1],
                    out_channels=filters,
                    kernel_size=kernel_size,
                    stride=int(module_def["stride"]),
                    padding=pad,
                    bias=not bn,
                ),
            )
            if bn:
                modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5))
            if module_def["activation"] == "leaky":
                modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1))

        elif module_def["type"] == "maxpool":
            kernel_size = int(module_def["size"])
            stride = int(module_def["stride"])
            if kernel_size == 2 and stride == 1:
                modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1)))
            maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2))
            modules.add_module(f"maxpool_{module_i}", maxpool)

        elif module_def["type"] == "upsample":
            upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest")
            modules.add_module(f"upsample_{module_i}", upsample)

        elif module_def["type"] == "route":  # e.g. input 1: 26*26*256, input 2: 26*26*128 -> output: 26*26*(256+128)
            layers = [int(x) for x in module_def["layers"].split(",")]
            filters = sum([output_filters[1:][i] for i in layers])
            modules.add_module(f"route_{module_i}", EmptyLayer())

        elif module_def["type"] == "shortcut":
            filters = output_filters[1:][int(module_def["from"])]
            modules.add_module(f"shortcut_{module_i}", EmptyLayer())

        elif module_def["type"] == "yolo":
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
            # Extract anchors
            anchors = [int(x) for x in module_def["anchors"].split(",")]
            anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
            anchors = [anchors[i] for i in anchor_idxs]
            num_classes = int(module_def["classes"])
            img_size = int(hyperparams["height"])
            # Define detection layer
            yolo_layer = YOLOLayer(anchors, num_classes, img_size)
            modules.add_module(f"yolo_{module_i}", yolo_layer)
        # Register module list and number of output filters
        module_list.append(modules)
        output_filters.append(filters)

    return hyperparams, module_list


class Upsample(nn.Module):
    """ nn.Upsample is deprecated """

    def __init__(self, scale_factor, mode="nearest"):
        super(Upsample, self).__init__()
        self.scale_factor = scale_factor
        self.mode = mode

    def forward(self, x):
        x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)
        return x


class EmptyLayer(nn.Module):
    """Placeholder for 'route' and 'shortcut' layers"""

    def __init__(self):
        super(EmptyLayer, self).__init__()


class YOLOLayer(nn.Module):
    """Detection layer"""

    def __init__(self, anchors, num_classes, img_dim=416):
        super(YOLOLayer, self).__init__()
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.ignore_thres = 0.5
        self.mse_loss = nn.MSELoss()
        self.bce_loss = nn.BCELoss()
        self.obj_scale = 1
        self.noobj_scale = 100
        self.metrics = {}
        self.img_dim = img_dim
        self.grid_size = 0  # grid size

    def compute_grid_offsets(self, grid_size, cuda=True):
        self.grid_size = grid_size
        g = self.grid_size
        FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
        self.stride = self.img_dim / self.grid_size
        # Calculate offsets for each grid
        self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor)
        self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor)
        self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
        self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
        self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))

    def forward(self, x, targets=None, img_dim=None):
        # Tensors for cuda support
        print(x.shape)
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
        LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
        ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor

        self.img_dim = img_dim
        num_samples = x.size(0)
        grid_size = x.size(2)

        prediction = (
            x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )
        print(prediction.shape)
        # Get outputs
        x = torch.sigmoid(prediction[..., 0])  # Center x
        y = torch.sigmoid(prediction[..., 1])  # Center y
        w = prediction[..., 2]  # Width
        h = prediction[..., 3]  # Height
        pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
        pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.

        # If grid size does not match current we compute new offsets
        if grid_size != self.grid_size:
            self.compute_grid_offsets(grid_size, cuda=x.is_cuda)  # turn relative offsets into absolute grid positions, e.g. (0.5, 0.5) becomes (11.5, 11.5)

        # Add offset and scale with anchors (actual position on the feature map)
        pred_boxes = FloatTensor(prediction[..., :4].shape)
        pred_boxes[..., 0] = x.data + self.grid_x
        pred_boxes[..., 1] = y.data + self.grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

        output = torch.cat(
            (
                pred_boxes.view(num_samples, -1, 4) * self.stride,  # scale back to the input image
                pred_conf.view(num_samples, -1, 1),
                pred_cls.view(num_samples, -1, self.num_classes),
            ),
            -1,
        )

        if targets is None:
            return output, 0
        else:
            iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
                pred_boxes=pred_boxes,
                pred_cls=pred_cls,
                target=targets,
                anchors=self.scaled_anchors,
                ignore_thres=self.ignore_thres,
            )
            # iou_scores: IoU between each ground-truth box and its best-matching anchor
            # class_mask: 1 where the class was predicted correctly
            # obj_mask: 1 at the best anchor for each ground-truth box
            # noobj_mask: 0 where obj_mask is 1 and wherever the IoU exceeds the ignore threshold; 1 elsewhere
            # tx, ty, tw, th: the xywh regression targets on this feature map
            # tconf: the objectness target
            # Loss : Mask outputs to ignore non-existing objects (except with conf. loss)
            loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])  # computed only where there is an object
            loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
            loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
            loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
            loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
            loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
            loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj  # objectness should approach 1 with an object, 0 without
            loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])  # classification loss
            total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls  # total loss

            # Metrics
            cls_acc = 100 * class_mask[obj_mask].mean()
            conf_obj = pred_conf[obj_mask].mean()
            conf_noobj = pred_conf[noobj_mask].mean()
            conf50 = (pred_conf > 0.5).float()
            iou50 = (iou_scores > 0.5).float()
            iou75 = (iou_scores > 0.75).float()
            detected_mask = conf50 * class_mask * tconf
            precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
            recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
            recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

            self.metrics = {
                "loss": to_cpu(total_loss).item(),
                "x": to_cpu(loss_x).item(),
                "y": to_cpu(loss_y).item(),
                "w": to_cpu(loss_w).item(),
                "h": to_cpu(loss_h).item(),
                "conf": to_cpu(loss_conf).item(),
                "cls": to_cpu(loss_cls).item(),
                "cls_acc": to_cpu(cls_acc).item(),
                "recall50": to_cpu(recall50).item(),
                "recall75": to_cpu(recall75).item(),
                "precision": to_cpu(precision).item(),
                "conf_obj": to_cpu(conf_obj).item(),
                "conf_noobj": to_cpu(conf_noobj).item(),
                "grid_size": grid_size,
            }

            return output, total_loss


class Darknet(nn.Module):
    """YOLOv3 object detection model"""

    def __init__(self, config_path, img_size=416):
        super(Darknet, self).__init__()
        self.module_defs = parse_model_config(config_path)
        self.hyperparams, self.module_list = create_modules(self.module_defs)
        self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")]
        self.img_size = img_size
        self.seen = 0
        self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)

    def forward(self, x, targets=None):
        img_dim = x.shape[2]
        loss = 0
        layer_outputs, yolo_outputs = [], []
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            if module_def["type"] in ["convolutional", "upsample", "maxpool"]:
                x = module(x)
            elif module_def["type"] == "route":
                x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1)
            elif module_def["type"] == "shortcut":
                layer_i = int(module_def["from"])
                x = layer_outputs[-1] + layer_outputs[layer_i]
            elif module_def["type"] == "yolo":
                x, layer_loss = module[0](x, targets, img_dim)
                loss += layer_loss
                yolo_outputs.append(x)
            layer_outputs.append(x)
        yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1))
        return yolo_outputs if targets is None else (loss, yolo_outputs)

    def load_darknet_weights(self, weights_path):
        """Parses and loads the weights stored in 'weights_path'"""
        # Open the weights file
        with open(weights_path, "rb") as f:
            header = np.fromfile(f, dtype=np.int32, count=5)  # First five are header values
            self.header_info = header  # Needed to write header when saving weights
            self.seen = header[3]  # number of images seen during training
            weights = np.fromfile(f, dtype=np.float32)  # The rest are weights

        # Establish cutoff for loading backbone weights
        cutoff = None
        if "darknet53.conv.74" in weights_path:
            cutoff = 75

        ptr = 0
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            if i == cutoff:
                break
            if module_def["type"] == "convolutional":
                conv_layer = module[0]
                if module_def["batch_normalize"]:
                    # Load BN bias, weights, running mean and running variance
                    bn_layer = module[1]
                    num_b = bn_layer.bias.numel()  # Number of biases
                    # Bias
                    bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias)
                    bn_layer.bias.data.copy_(bn_b)
                    ptr += num_b
                    # Weight
                    bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight)
                    bn_layer.weight.data.copy_(bn_w)
                    ptr += num_b
                    # Running Mean
                    bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean)
                    bn_layer.running_mean.data.copy_(bn_rm)
                    ptr += num_b
                    # Running Var
                    bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var)
                    bn_layer.running_var.data.copy_(bn_rv)
                    ptr += num_b
                else:
                    # Load conv. bias
                    num_b = conv_layer.bias.numel()
                    conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias)
                    conv_layer.bias.data.copy_(conv_b)
                    ptr += num_b
                # Load conv. weights
                num_w = conv_layer.weight.numel()
                conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight)
                conv_layer.weight.data.copy_(conv_w)
                ptr += num_w

    def save_darknet_weights(self, path, cutoff=-1):
        """
        @:param path    - path of the new weights file
        @:param cutoff  - save layers between 0 and cutoff (cutoff = -1 -> all are saved)
        """
        fp = open(path, "wb")
        self.header_info[3] = self.seen
        self.header_info.tofile(fp)

        # Iterate through layers
        for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])):
            if module_def["type"] == "convolutional":
                conv_layer = module[0]
                # If batch norm, save bn first
                if module_def["batch_normalize"]:
                    bn_layer = module[1]
                    bn_layer.bias.data.cpu().numpy().tofile(fp)
                    bn_layer.weight.data.cpu().numpy().tofile(fp)
                    bn_layer.running_mean.data.cpu().numpy().tofile(fp)
                    bn_layer.running_var.data.cpu().numpy().tofile(fp)
                # Save conv bias
                else:
                    conv_layer.bias.data.cpu().numpy().tofile(fp)
                # Save conv weights
                conv_layer.weight.data.cpu().numpy().tofile(fp)

        fp.close()
```
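To make the construction concrete, here is a minimal, hypothetical sketch of what `create_modules` does with a single convolutional block (the dict keys mirror what `parse_model_config` parses out of the `.cfg` file; the values below are made up):

```python
module_defs = [
    {"channels": "3", "height": "416"},  # [net] hyperparameters (subset)
    {"type": "convolutional", "batch_normalize": "1", "filters": "32",
     "size": "3", "stride": "1", "activation": "leaky"},
]
hyperparams, module_list = create_modules(module_defs)
print(module_list[0])
# Roughly: Sequential(conv_0: Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False),
#                     batch_norm_0: BatchNorm2d(32), leaky_0: LeakyReLU(0.1))
```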
Here we only need to focus on the `Darknet` module, and within it on `yolo_outputs`. As the architecture diagram shows, there are three outputs: the three YOLO detection heads. Let's look at what a YOLO head (the `YOLOLayer` shown above) actually does; this is the heart of the whole model.
Note that the input size is always a multiple of 32 and, with multiscale training, is chosen at random. `YOLOLayer.forward` is invoked with three different feature-map sizes, one per detection head.
print ("yolo input shape", x.shape)
FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor
self.img_dim = img_dim
num_samples = x.size(0)
grid_size = x.size(2)
prediction = (
x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
.permute(0, 1, 3, 4, 2)
.contiguous()
)
For example, with a (padded) input image of shape `[4, 3, 320, 320]`, the tensor entering this YOLO head has shape `[4, 255, 10, 10]`; after the `view` and `permute` we get `prediction` with shape `[4, 3, 10, 10, 85]`:

- `4` is the batch size;
- `3` is the number of anchors attached to each grid cell;
- `10x10` is the grid size of this feature map (320 / 32 = 10 for the stride-32 head);
- `85` is 4 box coordinates + 1 objectness confidence + 80 class scores.
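A small sketch reproducing just this reshape with a random tensor (assuming the 320-input, stride-32 head from the example):

```python
import torch

x = torch.randn(4, 255, 10, 10)  # raw head output: 255 = 3 anchors * (5 + 80 classes)
prediction = x.view(4, 3, 85, 10, 10).permute(0, 1, 3, 4, 2).contiguous()
print(prediction.shape)  # torch.Size([4, 3, 10, 10, 85])
```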
```python
x = torch.sigmoid(prediction[..., 0])  # Center x
y = torch.sigmoid(prediction[..., 1])  # Center y
w = prediction[..., 2]  # Width
h = prediction[..., 3]  # Height
pred_conf = torch.sigmoid(prediction[..., 4])  # Conf
pred_cls = torch.sigmoid(prediction[..., 5:])  # Cls pred.
# If grid size does not match current we compute new offsets
if grid_size != self.grid_size:
    self.compute_grid_offsets(grid_size, cuda=x.is_cuda)  # turn relative offsets into absolute grid positions, e.g. (0.5, 0.5) becomes (11.5, 11.5)
```
- `x`: shape `[4, 3, 10, 10]`, values in the range 0-1
- `y`: shape `[4, 3, 10, 10]`, values in the range 0-1
- `w`: shape `[4, 3, 10, 10]`, can be positive or negative
- `h`: shape `[4, 3, 10, 10]`, can be positive or negative
- `pred_conf`: shape `[4, 3, 10, 10]`
- `pred_cls`: shape `[4, 3, 10, 10, 80]`

The first dimension is the batch size, the second is the number of anchors, the third and fourth are the 10x10 grid (one entry per grid cell), and the fifth is the 80 classes.
`grid_size` is 10, and it is passed to `compute_grid_offsets(self, grid_size, cuda=True)`:
```python
def compute_grid_offsets(self, grid_size, cuda=True):
    self.grid_size = grid_size
    g = self.grid_size
    FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
    self.stride = self.img_dim / self.grid_size
    # Calculate offsets for each grid
    self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor)
    self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor)
    self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
    self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
    self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))
```
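As a sanity check of the numbers printed below (a sketch, assuming the `yolov3.cfg` anchors `(116, 90), (156, 198), (373, 326)` used by the stride-32 head):

```python
anchors = [(116, 90), (156, 198), (373, 326)]  # mask 6,7,8 in yolov3.cfg
stride = 320 / 10                              # img_dim / grid_size = 32
print([(a_w / stride, a_h / stride) for a_w, a_h in anchors])
# [(3.625, 2.8125), (4.875, 6.1875), (11.65625, 10.1875)]
```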
From this we obtain:

① `self.grid_x` and `self.grid_y`, each of shape `[1, 1, 10, 10]` (`grid_x` shown; `grid_y` is its transpose):

```
tensor([[[[0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
          [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
          [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
          [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
          [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
          [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
          [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
          [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
          [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
          [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]]]], device='cuda:0')
```
② `self.scaled_anchors`, shape `[3, 2]`:

```
tensor([[ 3.6250,  2.8125],
        [ 4.8750,  6.1875],
        [11.6562, 10.1875]], device='cuda:0')
```

③ `self.anchor_w`, shape `[1, 3, 1, 1]`:

```
tensor([[[[ 3.6250]],
         [[ 4.8750]],
         [[11.6562]]]], device='cuda:0')
```

④ `self.anchor_h`, shape `[1, 3, 1, 1]`:

```
tensor([[[[ 2.8125]],
         [[ 6.1875]],
         [[10.1875]]]], device='cuda:0')
```
Moving on in the code:

```python
pred_boxes = FloatTensor(prediction[..., :4].shape)
pred_boxes[..., 0] = x.data + self.grid_x
pred_boxes[..., 1] = y.data + self.grid_y
pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h
```

This converts the predicted offsets into actual positions on the feature map. Next, the feature-map coordinates are scaled back to the original image:

```python
output = torch.cat(
    (
        pred_boxes.view(num_samples, -1, 4) * self.stride,  # scale back to the input image
        pred_conf.view(num_samples, -1, 1),
        pred_cls.view(num_samples, -1, self.num_classes),
    ),
    -1,
)
```
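A tiny numeric sketch of this decode for one assumed cell, matching the (0.5, 0.5) → (11.5, 11.5) comment above:

```python
import torch

sig_x, grid_x, stride = torch.tensor(0.5), 11.0, 32.0
cx_feature = sig_x + grid_x  # 11.5: absolute center position on the feature map
print(cx_feature * stride)   # tensor(368.): the same center in input-image pixels
```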
At inference time (`targets is None`), this `output` is what gets returned. Now let's see how the loss is computed:
```python
iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
    pred_boxes=pred_boxes,
    pred_cls=pred_cls,
    target=targets,
    anchors=self.scaled_anchors,
    ignore_thres=self.ignore_thres,
)
# iou_scores: IoU between each ground-truth box and its best-matching anchor
# class_mask: 1 where the class was predicted correctly
# obj_mask: 1 at the best anchor for each ground-truth box
# noobj_mask: 0 where obj_mask is 1 and wherever the IoU exceeds the ignore threshold; 1 elsewhere
# tx, ty, tw, th: the xywh regression targets on this feature map
# tconf: the objectness target
# Loss : Mask outputs to ignore non-existing objects (except with conf. loss)
loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])  # computed only where there is an object
loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj  # objectness should approach 1 with an object, 0 without
loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])  # classification loss
total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls  # total loss
```
Let's look at the `build_targets` function.
```python
def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres):
    ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda else torch.ByteTensor
    FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor

    nB = pred_boxes.size(0)  # batch size, here 4
    nA = pred_boxes.size(1)  # number of anchors per grid cell
    nC = pred_cls.size(-1)   # number of classes
    nG = pred_boxes.size(2)  # grid size

    # Output tensors
    obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0)     # 1 where an anchor contains an object (foreground), default 0
    noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1)   # 1 where an anchor contains no object (background), default 1
    class_mask = FloatTensor(nB, nA, nG, nG).fill_(0)  # class mask: 1 where the class is predicted correctly, default 0
    iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0)  # IoU between predicted boxes and ground-truth boxes
    tx = FloatTensor(nB, nA, nG, nG).fill_(0)          # ground-truth position relative to its grid cell
    ty = FloatTensor(nB, nA, nG, nG).fill_(0)
    tw = FloatTensor(nB, nA, nG, nG).fill_(0)
    th = FloatTensor(nB, nA, nG, nG).fill_(0)
    tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0)

    # Convert to position relative to box
    target_boxes = target[:, 2:6] * nG  # target xywh are in [0, 1]; this gives xywh on the current grid
    gxy = target_boxes[:, :2]
    gwh = target_boxes[:, 2:]
    # Get anchors with best iou
    ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors])  # IoU of each anchor shape with each ground-truth box
    print(ious.shape)
    best_ious, best_n = ious.max(0)  # best score, and which anchor shape fits each target best
    # Separate target values
    b, target_labels = target[:, :2].long().t()  # batch index and actual class of each ground-truth box
    gx, gy = gxy.t()
    gw, gh = gwh.t()
    gi, gj = gxy.long().t()  # cell indices (floored)
    # Set masks
    obj_mask[b, best_n, gj, gi] = 1    # mark the best anchor at each object's cell
    noobj_mask[b, best_n, gj, gi] = 0  # and the converse
    # Set noobj mask to zero where iou exceeds ignore threshold
    for i, anchor_ious in enumerate(ious.t()):  # an IoU above the threshold is treated as containing an object
        noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0

    # Coordinates
    tx[b, best_n, gj, gi] = gx - gx.floor()  # offset of the ground-truth center within its grid cell
    ty[b, best_n, gj, gi] = gy - gy.floor()
    # Width and height
    tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
    th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)
    # One-hot encoding of label
    tcls[b, best_n, gj, gi, target_labels] = 1  # one-hot encode the ground-truth labels
    # Compute label correctness and iou at best anchor (indices where prediction matches ground truth)
    class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float()
    iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False)  # IoU between each ground truth and its matched prediction

    tconf = obj_mask.float()  # ground-truth confidence, i.e. 1
    return iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf
```
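`bbox_wh_iou` matches boxes by shape only (anchors carry no position, so both boxes are treated as if centered at the same point); in the repo's `utils/utils.py` it is essentially:

```python
def bbox_wh_iou(wh1, wh2):
    wh2 = wh2.t()
    w1, h1 = wh1[0], wh1[1]
    w2, h2 = wh2[0], wh2[1]
    inter_area = torch.min(w1, w2) * torch.min(h1, h2)
    union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area
    return inter_area / union_area
```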
Let's go through the details.
```python
# Convert to position relative to box
target_boxes = target[:, 2:6] * nG  # target xywh are in [0, 1]; this gives xywh on the current grid
gxy = target_boxes[:, :2]
gwh = target_boxes[:, 2:]
# Get anchors with best iou
ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors])  # IoU of each anchor shape with each ground-truth box
print(ious.shape)
best_ious, best_n = ious.max(0)  # best score, and which anchor shape fits each target best
# Separate target values
b, target_labels = target[:, :2].long().t()  # batch index and actual class of each ground-truth box
gx, gy = gxy.t()
gw, gh = gwh.t()
gi, gj = gxy.long().t()  # cell indices (floored)
# Set masks
obj_mask[b, best_n, gj, gi] = 1    # mark the best anchor at each object's cell
noobj_mask[b, best_n, gj, gi] = 0  # and the converse
```
`target_boxes` has shape `[26, 4]`: the xywh of each ground-truth box on the grid (here `nG` is 10). An example:
```
tensor([[4.9437, 2.9110, 2.2886, 3.6280],
        [4.8130, 4.7429, 1.2331, 1.6652],
        [3.8773, 6.3149, 5.2898, 4.0577],
        [7.6569, 4.0607, 3.6819, 4.5686],
        [4.3046, 3.3931, 3.0820, 2.2591],
        [1.6495, 5.7004, 2.0412, 0.9148],
        [4.6284, 5.3995, 0.7797, 0.2894],
        [6.4448, 6.2600, 1.1080, 0.2766],
        [9.2528, 5.4973, 0.6988, 0.1742],
        [5.1470, 4.4541, 1.2375, 1.8380],
        [5.0341, 4.5761, 2.3220, 1.2794],
        [6.3422, 5.0870, 0.3406, 0.3592],
        [8.0896, 5.2478, 0.0686, 0.1250],
        [6.3594, 5.9677, 0.2723, 0.4433],
        [7.6264, 5.3473, 1.0400, 0.2488],
        [7.3994, 5.2806, 1.0895, 0.2334],
        [7.4966, 4.5640, 1.2917, 1.6633],
        [5.4181, 5.4123, 1.2097, 0.2761],
        [3.6413, 7.5164, 0.1598, 0.0600],
        [0.5981, 7.5108, 0.1903, 0.0522],
        [0.2362, 7.4884, 0.2108, 0.0689],
        [2.3908, 7.5048, 0.2209, 0.1097],
        [4.1255, 7.5466, 0.1756, 0.0578],
        [3.8929, 7.4430, 0.1352, 0.2455],
        [1.4373, 7.4771, 0.1269, 0.1761],
        [1.3068, 7.6449, 0.0783, 0.1508]], device='cuda:0')
```
`gxy` has shape `[26, 2]`, for example:
```
tensor([[4.9437, 2.9110],
        [4.8130, 4.7429],
        [3.8773, 6.3149],
        [7.6569, 4.0607],
        [4.3046, 3.3931],
        [1.6495, 5.7004],
        [4.6284, 5.3995],
        [6.4448, 6.2600],
        [9.2528, 5.4973],
        [5.1470, 4.4541],
        [5.0341, 4.5761],
        [6.3422, 5.0870],
        [8.0896, 5.2478],
        [6.3594, 5.9677],
        [7.6264, 5.3473],
        [7.3994, 5.2806],
        [7.4966, 4.5640],
        [5.4181, 5.4123],
        [3.6413, 7.5164],
        [0.5981, 7.5108],
        [0.2362, 7.4884],
        [2.3908, 7.5048],
        [4.1255, 7.5466],
        [3.8929, 7.4430],
        [1.4373, 7.4771],
        [1.3068, 7.6449]], device='cuda:0')
```
`gwh` has shape `[26, 2]`, for example:
```
tensor([[2.2886, 3.6280],
        [1.2331, 1.6652],
        [5.2898, 4.0577],
        [3.6819, 4.5686],
        [3.0820, 2.2591],
        [2.0412, 0.9148],
        [0.7797, 0.2894],
        [1.1080, 0.2766],
        [0.6988, 0.1742],
        [1.2375, 1.8380],
        [2.3220, 1.2794],
        [0.3406, 0.3592],
        [0.0686, 0.1250],
        [0.2723, 0.4433],
        [1.0400, 0.2488],
        [1.0895, 0.2334],
        [1.2917, 1.6633],
        [1.2097, 0.2761],
        [0.1598, 0.0600],
        [0.1903, 0.0522],
        [0.2108, 0.0689],
        [0.2209, 0.1097],
        [0.1756, 0.0578],
        [0.1352, 0.2455],
        [0.1269, 0.1761],
        [0.0783, 0.1508]], device='cuda:0')
```
Next we compute the IoU between each target and the three anchor shapes, to decide which anchor suits each target best. The `anchors` here, mentioned above, have shape `(3, 2)`:

```
tensor([[ 3.6250,  2.8125],
        [ 4.8750,  6.1875],
        [11.6562, 10.1875]], device='cuda:0')
```
```python
ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors])  # IoU of each anchor shape with each ground-truth box
```

This gives `ious` with shape `[3, 26]`:
```
tensor([[5.3365e-01, 2.0140e-01, 4.7499e-01, 6.0611e-01, 6.8291e-01, 1.8317e-01,
         2.2130e-02, 3.0055e-02, 1.1940e-02, 2.2309e-01, 2.9138e-01, 1.2002e-02,
         8.4097e-04, 1.1841e-02, 2.5375e-02, 2.4946e-02, 2.1073e-01, 3.2759e-02,
         9.4071e-04, 9.7411e-04, 1.4246e-03, 2.3771e-03, 9.9586e-04, 3.2542e-03,
         2.1913e-03, 1.1577e-03],
        [2.7526e-01, 6.8073e-02, 6.2112e-01, 5.5765e-01, 2.3082e-01, 6.1909e-02,
         7.4798e-03, 1.0159e-02, 4.0358e-03, 7.5404e-02, 9.8486e-02, 4.0565e-03,
         2.8424e-04, 4.0022e-03, 8.5765e-03, 8.4317e-03, 7.1227e-02, 1.1073e-02,
         3.1796e-04, 3.2924e-04, 4.8150e-04, 8.0344e-04, 3.3660e-04, 1.0999e-03,
         7.4064e-04, 3.9129e-04],
        [6.9921e-02, 1.7292e-02, 1.8076e-01, 1.4165e-01, 5.8633e-02, 1.5726e-02,
         1.9000e-03, 2.5805e-03, 1.0252e-03, 1.9154e-02, 2.5017e-02, 1.0304e-03,
         7.2203e-05, 1.0166e-03, 2.1786e-03, 2.1418e-03, 1.8093e-02, 2.8126e-03,
         8.0766e-05, 8.3634e-05, 1.2231e-04, 2.0409e-04, 8.5501e-05, 2.7940e-04,
         1.8814e-04, 9.9395e-05]], device='cuda:0')
```
Across the three anchors, we pick the one with the largest IoU per target:

```python
best_ious, best_n = ious.max(0)  # best score, and which anchor shape fits each target best
```

`best_n` (shape `[26]`) is the index of the anchor with the highest IoU for each target:

```
tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0], device='cuda:0')
```

and `best_ious` is:

```
tensor([0.5336, 0.2014, 0.6211, 0.6061, 0.6829, 0.1832, 0.0221, 0.0301, 0.0119,
        0.2231, 0.2914, 0.0120, 0.0008, 0.0118, 0.0254, 0.0249, 0.2107, 0.0328,
        0.0009, 0.0010, 0.0014, 0.0024, 0.0010, 0.0033, 0.0022, 0.0012],
       device='cuda:0')
```
Next we extract the batch index and the class label of each ground-truth box:

```python
b, target_labels = target[:, :2].long().t()  # batch index and actual class of each ground-truth box
```

`b` (shape `[26]`):

```
tensor([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
        3, 3], device='cuda:0')
```

`target_labels` (shape `[26]`):

```
tensor([ 0, 36, 48, 48, 45,  8,  8,  8,  8,  8,  8,  8,  0,  0,  8,  8,  8,  4,
         8,  8,  8,  8,  8,  8,  8,  8], device='cuda:0')
```
Next, x, y, w and h are split out:

```python
gx, gy = gxy.t()
gw, gh = gwh.t()
gi, gj = gxy.long().t()  # cell indices (floored)
```

This gives `gx`, `gy`, `gi`, `gj`, `gw`, `gh`, each of shape `[26]`.

`gx`:

```
tensor([4.9437, 4.8130, 3.8773, 7.6569, 4.3046, 1.6495, 4.6284, 6.4448, 9.2528,
        5.1470, 5.0341, 6.3422, 8.0896, 6.3594, 7.6264, 7.3994, 7.4966, 5.4181,
        3.6413, 0.5981, 0.2362, 2.3908, 4.1255, 3.8929, 1.4373, 1.3068],
       device='cuda:0')
```

`gy`:

```
tensor([2.9110, 4.7429, 6.3149, 4.0607, 3.3931, 5.7004, 5.3995, 6.2600, 5.4973,
        4.4541, 4.5761, 5.0870, 5.2478, 5.9677, 5.3473, 5.2806, 4.5640, 5.4123,
        7.5164, 7.5108, 7.4884, 7.5048, 7.5466, 7.4430, 7.4771, 7.6449],
       device='cuda:0')
```

`gi`:

```
tensor([4, 4, 3, 7, 4, 1, 4, 6, 9, 5, 5, 6, 8, 6, 7, 7, 7, 5, 3, 0, 0, 2, 4, 3,
        1, 1], device='cuda:0')
```

`gj`:

```
tensor([2, 4, 6, 4, 3, 5, 5, 6, 5, 4, 4, 5, 5, 5, 5, 5, 4, 5, 7, 7, 7, 7, 7, 7,
        7, 7], device='cuda:0')
```

`gw`:

```
tensor([2.2886, 1.2331, 5.2898, 3.6819, 3.0820, 2.0412, 0.7797, 1.1080, 0.6988,
        1.2375, 2.3220, 0.3406, 0.0686, 0.2723, 1.0400, 1.0895, 1.2917, 1.2097,
        0.1598, 0.1903, 0.2108, 0.2209, 0.1756, 0.1352, 0.1269, 0.0783],
       device='cuda:0')
```

`gh`:

```
tensor([3.6280, 1.6652, 4.0577, 4.5686, 2.2591, 0.9148, 0.2894, 0.2766, 0.1742,
        1.8380, 1.2794, 0.3592, 0.1250, 0.4433, 0.2488, 0.2334, 1.6633, 0.2761,
        0.0600, 0.0522, 0.0689, 0.1097, 0.0578, 0.2455, 0.1761, 0.1508],
       device='cuda:0')
```
Then the masks are set:

```python
# Set masks
obj_mask[b, best_n, gj, gi] = 1    # mark the best anchor at each object's cell
noobj_mask[b, best_n, gj, gi] = 0  # and the converse
# Set noobj mask to zero where iou exceeds ignore threshold
for i, anchor_ious in enumerate(ious.t()):  # an IoU above the threshold is treated as containing an object
    noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0
```
`obj_mask` and `noobj_mask` both have shape `[4, 3, 10, 10]`. Some background positions additionally need their `1` set to `0`: wherever an anchor's IoU with a ground truth exceeds the ignore threshold, it is no longer counted as background:

```python
# Set noobj mask to zero where iou exceeds ignore threshold
for i, anchor_ious in enumerate(ious.t()):
    noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0
```
Next we set the coordinate, width/height and class targets:

```python
# Coordinates
tx[b, best_n, gj, gi] = gx - gx.floor()  # offset of the ground-truth center within its grid cell
ty[b, best_n, gj, gi] = gy - gy.floor()
# Width and height
tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)
# One-hot encoding of label
tcls[b, best_n, gj, gi, target_labels] = 1  # one-hot encode the ground-truth labels
```
`gx - gx.floor()` is:

```
tensor([0.9437, 0.8130, 0.8773, 0.6569, 0.3046, 0.6495, 0.6284, 0.4448, 0.2528,
        0.1470, 0.0341, 0.3422, 0.0896, 0.3594, 0.6264, 0.3994, 0.4966, 0.4181,
        0.6413, 0.5981, 0.2362, 0.3908, 0.1255, 0.8929, 0.4373, 0.3068],
       device='cuda:0')
```

and `gy - gy.floor()` is:

```
tensor([0.9110, 0.7429, 0.3149, 0.0607, 0.3931, 0.7004, 0.3995, 0.2600, 0.4973,
        0.4541, 0.5761, 0.0870, 0.2478, 0.9677, 0.3473, 0.2806, 0.5640, 0.4123,
        0.5164, 0.5108, 0.4884, 0.5048, 0.5466, 0.4430, 0.4771, 0.6449],
       device='cuda:0')
```
For width and height, the targets are obtained by inverting the anchor scaling. `torch.log(gw / anchors[best_n][:, 0] + 1e-16)` is:

```
tensor([-0.4599, -1.0783,  0.0817,  0.0156, -0.1623, -0.5743, -1.5367, -1.1853,
        -1.6463, -1.0748, -0.4454, -2.3648, -3.9675, -2.5886, -1.2486, -1.2021,
        -1.0319, -1.0975, -3.1214, -2.9470, -2.8448, -2.7977, -3.0272, -3.2892,
        -3.3524, -3.8353], device='cuda:0')
```

and `torch.log(gh / anchors[best_n][:, 1] + 1e-16)` is:

```
tensor([ 0.2546, -0.5242, -0.4219,  0.4851, -0.2191, -1.1231, -2.2741, -2.3194,
        -2.7815, -0.4254, -0.7877, -2.0579, -3.1135, -1.8476, -2.4254, -2.4889,
        -0.5253, -2.3211, -3.8474, -3.9870, -3.7091, -3.2442, -3.8847, -2.4387,
        -2.7708, -2.9260], device='cuda:0')
```
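Note that this wh target is exactly the inverse of the decode step in `forward` (`exp(w) * anchor_w`). A quick check with the first target from the dumps above:

```python
import torch

gw, anchor_w = torch.tensor(2.2886), torch.tensor(3.6250)
tw = torch.log(gw / anchor_w + 1e-16)
print(tw)                        # tensor(-0.4599), the first value above
print(torch.exp(tw) * anchor_w)  # recovers tensor(2.2886)
```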
The class labels are one-hot encoded:

```python
tcls[b, best_n, gj, gi, target_labels] = 1  # one-hot encode the ground-truth labels
```

Then `class_mask` compares the predicted class with the ground-truth label, recording where the classification is correct. Finally, `iou_scores` is computed:

```python
iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False)  # IoU between each ground truth and its matched prediction
```

Here `target_boxes` has shape `[22, 4]` (this dump comes from a different batch than the 26-box example above); its values:
```
tensor([[ 6.7821,  4.8227,  6.4358,  7.1284],
        [ 2.4599,  5.6309,  3.9617,  2.4086],
        [ 4.3287,  7.2922,  4.2977,  2.7134],
        [ 3.4776,  3.6433,  3.1011,  1.7866],
        [ 5.0000,  5.0008, 10.0000,  7.4984],
        [ 6.3669,  5.5065,  3.7987,  2.8636],
        [ 2.4023,  2.0899,  3.5156,  1.5820],
        [ 1.1721,  3.3052,  2.3442,  1.8897],
        [ 3.6666,  4.7720,  6.3503,  1.8578],
        [ 5.0602,  5.5734,  2.8637,  2.5938],
        [ 6.0722,  5.0375,  4.8578,  6.4169],
        [ 7.5184,  2.9287,  2.3835,  4.7541],
        [ 1.2719,  1.4799,  0.6294,  0.5797],
        [ 4.9664,  6.5793,  6.4878,  4.8770],
        [ 6.7324,  5.7293,  1.4570,  0.9570],
        [ 4.2971,  6.8030,  1.0294,  0.8662],
        [ 6.9277,  6.7813,  0.5328,  0.5721],
        [ 9.0390,  2.8831,  1.9221,  3.0649],
        [ 1.5111,  4.8221,  1.8173,  1.8930],
        [ 3.7500,  2.3308,  7.2395,  4.4531],
        [ 5.0000,  5.0015,  9.7807,  9.6516],
        [ 0.4613,  4.3703,  0.6556,  1.3611]], device='cuda:0')
```
`pred_boxes[b, best_n, gj, gi]` selects the best-anchor prediction at each ground-truth cell.

To recap the returns: `iou_scores` holds the IoU between each ground truth and its best-matching anchor's prediction; `class_mask` marks correctly classified positions; `obj_mask` is 1 at the best anchor of every ground-truth box; `noobj_mask` is 0 there and also wherever the computed IoU exceeds the threshold, and 1 everywhere else; `tx`, `ty`, `tw`, `th` are the xywh targets to regress on this feature map; `tconf` is the objectness target.
Now let's look at the loss:
```python
loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])  # computed only where there is an object
loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
loss_h = self.mse_loss(h[obj_mask], th[obj_mask])
loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj  # objectness should approach 1 with an object, 0 without
loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])  # classification loss
total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls  # total loss
```
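A toy sketch of how the boolean masks restrict the loss to a few positions (all tensors below are made up):

```python
import torch
import torch.nn as nn

mse = nn.MSELoss()
x = torch.tensor([[0.3, 0.7], [0.2, 0.9]])   # predicted offsets
tx = torch.tensor([[0.5, 0.0], [0.0, 0.9]])  # targets
obj_mask = torch.tensor([[True, False], [False, True]])
print(mse(x[obj_mask], tx[obj_mask]))  # averaged over the two "object" positions only
```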
# 3. Prediction
```python
# Get detections
with torch.no_grad():
    detections = model(input_imgs)
    detections = non_max_suppression(detections, opt.conf_thres, opt.nms_thres)
```
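`non_max_suppression` first needs boxes as corners rather than centers; for reference, the `xywh2xyxy` helper in the repo's `utils/utils.py` is essentially:

```python
def xywh2xyxy(x):
    y = x.new(x.shape)
    y[..., 0] = x[..., 0] - x[..., 2] / 2
    y[..., 1] = x[..., 1] - x[..., 3] / 2
    y[..., 2] = x[..., 0] + x[..., 2] / 2
    y[..., 3] = x[..., 1] + x[..., 3] / 2
    return y
```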
```python
def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4):
    """
    Removes detections with lower object confidence score than 'conf_thres' and performs
    Non-Maximum Suppression to further filter detections.
    Returns detections with shape:
        (x1, y1, x2, y2, object_conf, class_score, class_pred)
    """
    # From (center x, center y, width, height) to (x1, y1, x2, y2)
    prediction[..., :4] = xywh2xyxy(prediction[..., :4])
    output = [None for _ in range(len(prediction))]
    for image_i, image_pred in enumerate(prediction):
        # Filter out confidence scores below threshold
        image_pred = image_pred[image_pred[:, 4] >= conf_thres]
        # If none are remaining => process next image
        if not image_pred.size(0):
            continue
        # Object confidence times class confidence
        score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0]
        # Sort by it
        image_pred = image_pred[(-score).argsort()]
        class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True)
        detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1)
        # Perform non-maximum suppression
        keep_boxes = []
        while detections.size(0):
            large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres
            label_match = detections[0, -1] == detections[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            weights = detections[invalid, 4:5]
            # Merge overlapping bboxes by order of confidence
            detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum()
            keep_boxes += [detections[0]]
            detections = detections[~invalid]
        if keep_boxes:
            output[image_i] = torch.stack(keep_boxes)

    return output
```
```python
image_pred = image_pred[image_pred[:, 4] >= conf_thres]
```

First, detections with low objectness confidence are filtered out.

```python
score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0]
```

Here `score` is the objectness (foreground) confidence multiplied by the best class confidence.

```python
detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1)
```

The box coordinates, objectness confidence, class confidence and class label are concatenated; then NMS is performed:
```python
keep_boxes = []
while detections.size(0):
    large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres
    label_match = detections[0, -1] == detections[:, -1]
    # Indices of boxes with lower confidence scores, large IOUs and matching labels
    invalid = large_overlap & label_match
    weights = detections[invalid, 4:5]
    # Merge overlapping bboxes by order of confidence
    detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum()
    keep_boxes += [detections[0]]
    detections = detections[~invalid]
if keep_boxes:
    output[image_i] = torch.stack(keep_boxes)
```
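Note that instead of simply discarding suppressed boxes, the kept box is a confidence-weighted average of all the boxes it suppresses. A toy sketch with made-up numbers:

```python
import torch

boxes = torch.tensor([[10., 10., 50., 50.],
                      [12., 12., 52., 52.]])  # two overlapping same-class boxes
conf = torch.tensor([[0.9], [0.6]])           # their objectness confidences
merged = (conf * boxes).sum(0) / conf.sum()
print(merged)  # tensor([10.8000, 10.8000, 50.8000, 50.8000])
```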
One more note: each grid cell on a feature map generates 3 anchors, and the code assigns each ground-truth box only to the anchor with the largest IoU, treating the other two anchors as background. There is one special case: anchors whose IoU with a ground truth exceeds the threshold (0.5, or some other value) are treated like positive samples but do not participate in the loss computation at all; in effect, they are simply ignored.
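A toy sketch of this three-way split for one target's grid cell (the IoU values are made up):

```python
import torch

ious = torch.tensor([0.62, 0.55, 0.10])  # one target vs. the 3 anchors at its cell
best = ious.argmax()                                 # anchor 0: the positive sample
ignored = (ious > 0.5) & (torch.arange(3) != best)   # anchor 1: ignored, no loss
negative = ~ignored & (torch.arange(3) != best)      # anchor 2: background
print(best.item(), ignored, negative)
# 0 tensor([False,  True, False]) tensor([False, False,  True])
```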