yolo v3 源码阅读(3):yolo train forward

yolo forward

//network.c
/*
 * Train on every batch contained in d and return the mean loss per sample.
 *
 * d.X.rows must be an exact multiple of net->batch (asserted).  For each
 * batch, the samples are copied into net->input / net->truth and one
 * training step is run; the per-step errors are summed and finally divided
 * by the total number of samples processed.
 */
float train_network(network *net, data d)
{
    assert(d.X.rows % net->batch == 0);

    const int per_step = net->batch;
    // number of batches held by d
    const int steps = d.X.rows / per_step;

    float total = 0;
    int offset = 0;   // index of the first sample of the current batch
    int step;
    for(step = 0; step < steps; ++step, offset += per_step){
        // copy samples [offset, offset + per_step) from d into the network buffers
        get_next_batch(d, per_step, offset, net->input, net->truth);
        // one training step on that batch
        total += train_network_datum(net);
    }
    // average over all samples, not over batches
    return total/(steps*per_step);
}

/*
 * Run one training step on the batch currently loaded in the network.
 *
 * Adds net->batch to the *net->seen counter, switches the network to training
 * mode, runs the forward and backward passes and returns the value of
 * *net->cost read right after the backward pass.  update_network() is called
 * only when the number of batches seen so far is a multiple of
 * net->subdivisions.
 */
float train_network_datum(network *net)
{
    float loss;

    // *net->seen counts the samples processed so far
    *net->seen += net->batch;
    net->train = 1;

    forward_network(net);
    backward_network(net);
    loss = *net->cost;

    // not yet at a subdivisions boundary: skip the update
    if(((*net->seen)/net->batch)%net->subdivisions != 0) return loss;

    update_network(net);
    return loss;
}

network.c


/*
 * Forward pass over all layers of the network.
 *
 * When built with GPU and netp->gpu_index >= 0 the work is delegated to
 * forward_network_gpu().  Otherwise the layers are run in order on a local
 * copy of the network struct: each layer's output becomes the input of the
 * next one, and a layer with its truth flag set also supplies net.truth.
 * calc_network_cost() is called on the original network at the end.
 */
void forward_network(network *netp)
{
#ifdef GPU
    if(netp->gpu_index >= 0){
        forward_network_gpu(netp);
        return;
    }
#endif
    // work on a copy so that re-pointing input/truth below stays local
    network net = *netp;
    int idx = 0;
    while(idx < net.n){
        layer cur = net.layers[idx];
        net.index = idx;

        // clear this layer's delta buffer before running it
        if(cur.delta) fill_cpu(cur.outputs * cur.batch, 0, cur.delta, 1);

        // layer-specific forward function (for a yolo layer: forward_yolo_layer)
        cur.forward(cur, net);

        // chain: this layer's output feeds the next layer
        net.input = cur.output;
        if(cur.truth) net.truth = cur.output;
        ++idx;
    }
    calc_network_cost(netp);
}


//yolo_layer.c
//yolo层训练的时候的loss 计算
/*
 * Forward pass of a yolo layer; in training mode it also fills l.delta and *l.cost.
 *
 * 1. Copies net.input into l.output and (in non-GPU builds) applies the
 *    LOGISTIC activation to entries 0-1 (the x/y offsets read by get_yolo_box)
 *    and to entries 4.. (objectness followed by the class scores).
 * 2. Clears l.delta; returns here when net.train is 0.
 * 3. Loop 1 - for every predicted box: objectness delta defaults to
 *    (0 - output), is cleared when the best IoU with any truth box exceeds
 *    l.ignore_thresh, and the box is treated as a positive when the best IoU
 *    exceeds l.truth_thresh.
 * 4. Loop 2 - for every truth box: picks the anchor whose shape matches best;
 *    if that anchor is in l.mask, writes box / objectness / class deltas for
 *    the cell containing the truth centre.
 * 5. *l.cost = (mag_array(l.delta))^2 and a statistics line is printed.
 */
void forward_yolo_layer(const layer l, network net)
{
    int i,j,b,t,n;
    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));

#ifndef GPU
    for (b = 0; b < l.batch; ++b){
        for(n = 0; n < l.n; ++n){
            // entries 0 and 1 of anchor n: 2*l.w*l.h values
            int index = entry_index(l, b, n*l.w*l.h, 0);
            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
            // entry 4 onwards: objectness plus l.classes class scores
            index = entry_index(l, b, n*l.w*l.h, 4);
            activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC);
        }
    }
#endif

    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
    // inference: only the activated output is needed
    if(!net.train) return;
    // running sums for the statistics printed at the end
    float avg_iou = 0;
    float recall = 0;
    float recall75 = 0;
    float avg_cat = 0;
    float avg_obj = 0;
    float avg_anyobj = 0;
    int count = 0;
    int class_count = 0;
    *(l.cost) = 0;
    /*
     * First pass: every predicted box is compared with the ground-truth boxes.
     * If its best IoU exceeds ignore_thresh, its objectness takes no part in
     * the loss.  If it additionally exceeds truth_thresh it is trained as a
     * positive.  Original author's note: truth_thresh is set to 1 in the cfg
     * file, so that second branch is effectively never taken.
     */
    for (b = 0; b < l.batch; ++b) {
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    // location n*l.w*l.h + j*l.w + i = anchor slot n at cell (i, j); entry 0 = start of its box values
                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                    box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h);
                    float best_iou = 0;
                    int best_t = 0;
                    // find the truth box with the highest IoU against this prediction
                    for(t = 0; t < l.max_boxes; ++t){
                        // truth box t of image b: (4 + 1) floats per box, l.truths floats per image
                        // (original note: l.truths = 90*(4 + 1) - confirm against the layer setup)
                        box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
                        // x == 0 is treated as the end of the truth list for this image
                        if(!truth.x) break;
                        float iou = box_iou(pred, truth);
                        if (iou > best_iou) {
                            best_iou = iou;
                            best_t = t;
                        }
                    }
                    /*
                     * The objectness handling below runs for every predicted box.
                     */
                    // index of this box's objectness value (entry 4)
                    int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
                    // accumulated for the "No Obj" statistic
                    avg_anyobj += l.output[obj_index];
                    // default: treat as background, target objectness 0
                    l.delta[obj_index] = 0 - l.output[obj_index];
                    // overlap is high enough: exclude this box's objectness from the loss
                    // (original note: ignore_thresh is 0.5 for yolov3 and 0.7 for yolov3-tiny)
                    if (best_iou > l.ignore_thresh) {
                        l.delta[obj_index] = 0;
                    }
                    // above truth_thresh: train as a positive (original note: cfg sets this to 1)
                    if (best_iou > l.truth_thresh) {
                        l.delta[obj_index] = 1 - l.output[obj_index];

                        int class = net.truth[best_t*(4 + 1) + b*l.truths + 4];
                        if (l.map) class = l.map[class];
                        int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                        delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0);
                        box truth = float_to_box(net.truth + best_t*(4 + 1) + b*l.truths, 1);
                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
                    }
                }
            }
        }
        /*
         * Second pass: for every ground-truth box find the best-matching
         * anchor.  If this layer is responsible for that anchor (it appears
         * in l.mask), compute the box / objectness / class deltas; otherwise
         * the truth box is skipped by this layer.
         */
        for(t = 0; t < l.max_boxes; ++t){
            box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);

            if(!truth.x) break;
            float best_iou = 0;
            int best_n = 0;
            // grid cell containing the truth centre (truncation toward zero)
            i = (truth.x * l.w);
            j = (truth.y * l.h);
            // compare shapes only: both boxes are placed at the origin
            box truth_shift = truth;
            truth_shift.x = truth_shift.y = 0;
            for(n = 0; n < l.total; ++n){
                // anchor n expressed relative to the network input size
                box pred = {0};
                pred.w = l.biases[2*n]/net.w;
                pred.h = l.biases[2*n+1]/net.h;
                float iou = box_iou(pred, truth_shift);
                if (iou > best_iou){
                    best_iou = iou;
                    best_n = n;
                }
            }
            // best_n is the index (among all l.total anchors) of the best-matching anchor

            int mask_n = int_index(l.mask, best_n, l.n);
            // only handled here when the best anchor belongs to this layer (mask_n >= 0)
            if(mask_n >= 0){
                int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
                // bounding-box deltas; returns the IoU of the current prediction with truth
                float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);

                // objectness delta
                int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
                avg_obj += l.output[obj_index];
                // an object is present: target is 1, so delta = 1 - output
                l.delta[obj_index] = 1 - l.output[obj_index];

                // class deltas
                int class = net.truth[t*(4 + 1) + b*l.truths + 4];
                if (l.map) class = l.map[class];
                int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
                delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat);

                ++count;
                ++class_count;
                if(iou > .5) recall += 1;
                if(iou > .75) recall75 += 1;
                avg_iou += iou;
            }
        }
    }
    // cost = squared magnitude of the delta buffer as returned by mag_array
    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    // NOTE(review): when count (or class_count) is 0 the ratios below divide by zero
    printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f,  count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count);
}

获取box函数:

/*
 * Decode one predicted box from the raw layer output.
 *
 * x       layer output; the four box values sit at x[index + k*stride], k = 0..3
 * biases  anchor sizes stored as (w, h) pairs; n selects the pair
 * i, j    column / row of the grid cell
 * lw, lh  divisors for the centre coordinates (callers pass the layer width / height)
 * w, h    divisors for the box size (callers pass the network input width / height)
 *
 * Returns the box with x, y, w, h set as computed below.
 */
box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride)
{
    const float *raw = x + index;        // first of the four box values
    const float *anchor = biases + 2*n;  // anchor[0] = width, anchor[1] = height
    box decoded;

    // size: anchor scaled by exp(prediction), then divided by w / h
    decoded.w = exp(raw[2*stride]) * anchor[0] / w;
    decoded.h = exp(raw[3*stride]) * anchor[1] / h;
    // centre: cell index plus predicted offset, then divided by lw / lh
    decoded.x = (i + raw[0*stride]) / lw;
    decoded.y = (j + raw[1*stride]) / lh;
    return decoded;
}

所以yolo Box的取值为:

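$x = (i + d_x) / l_w$，其中 $d_x$ 为预测的 x 偏移，$i$ 为 grid cell 的列下标，$l_w$ 为当前 yolo 层的宽

$y = (j + d_y) / l_h$，其中 $d_y$ 为预测的 y 偏移，$j$ 为 grid cell 的行下标，$l_h$ 为当前 yolo 层的高

$w = e^{d_w} \cdot anchor_w / net_w$，其中 $net_w$ 为网络输入图片的宽

$h = e^{d_h} \cdot anchor_h / net_h$，其中 $net_h$ 为网络输入图片的高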

//box.c
/*
 * Build a box from four floats spaced `stride` elements apart:
 * x = f[0], y = f[stride], w = f[2*stride], h = f[3*stride].
 * Any other members of box are zero-initialised.
 */
box float_to_box(float *f, int stride)
{
    box out = {0};
    const float *cursor = f;

    out.x = *cursor;
    cursor += stride;
    out.y = *cursor;
    cursor += stride;
    out.w = *cursor;
    cursor += stride;
    out.h = *cursor;
    return out;
}
//计算iou
/*
 * Intersection over union of two boxes: intersection area DIVIDED by union area.
 *
 * Returns 0 when the union area is exactly 0 (e.g. two zero-area boxes).
 * Without this guard the result would be 0/0 = NaN, which silently poisons
 * any running sum the caller adds it to.
 */
float box_iou(box a, box b)
{
    float inter = box_intersection(a, b);
    float uni = box_union(a, b);
    // degenerate boxes: define the IoU as 0 instead of dividing by zero
    if(uni == 0) return 0;
    return inter/uni;
}
/*
 * Area of the intersection of boxes a and b.
 * The extent along each axis comes from overlap(); if either extent is
 * negative the boxes are treated as not intersecting and 0 is returned.
 */
float box_intersection(box a, box b)
{
    const float span_x = overlap(a.x, a.w, b.x, b.w);
    const float span_y = overlap(a.y, a.h, b.y, b.h);
    return (span_x < 0 || span_y < 0) ? 0 : span_x*span_y;
}

/*
 * Area of the union of boxes a and b:
 * area(a) + area(b) minus the area they share.
 */
float box_union(box a, box b)
{
    const float shared = box_intersection(a, b);
    return a.w*a.h + b.w*b.h - shared;
}
// NOTE(review): this listing repeats, token for token, the box_intersection
// definition that already appears earlier in this listing (just before
// box_union). Two definitions of the same function in one translation unit
// would not compile, so keep only one of them when lifting code from here.
//
// Area of the intersection of boxes a and b. The extent along each axis comes
// from overlap(); a negative extent on either axis means no intersection.
float box_intersection(box a, box b)
{
    float w = overlap(a.x, a.w, b.x, b.w);
    float h = overlap(a.y, a.h, b.y, b.h);
    // no overlap along at least one axis
    if(w < 0 || h < 0) return 0;
    float area = w*h;
    return area;
}
/*
 * Combine the per-axis overlaps of a and b with the result of derivative(a, b):
 * the x / w components are multiplied by the overlap along y, and the
 * y / h components by the overlap along x.
 */
dbox dintersect(box a, box b)
{
    const float span_x = overlap(a.x, a.w, b.x, b.w);
    const float span_y = overlap(a.y, a.h, b.y, b.h);
    const dbox d = derivative(a, b);
    dbox out;

    // horizontal terms are scaled by the vertical overlap ...
    out.dx = d.dx*span_y;
    out.dw = d.dw*span_y;
    // ... and vertical terms by the horizontal overlap
    out.dy = d.dy*span_x;
    out.dh = d.dh*span_x;

    return out;
}

box_loss的计算函数:

//yolo_layer.c
//                float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);

/*
 * Write the four box deltas for one prediction and return its IoU with truth.
 *
 * The truth box is converted into the same parameterisation as the raw
 * predictions (offset inside cell (i, j) for x / y, log of the size ratio to
 * anchor n for w / h).  Each delta is scale * (target - raw prediction),
 * stored at delta[index + k*stride] for k = 0..3.
 */
float delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride)
{
    // decoded prediction, used only for the returned IoU
    box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride);
    float iou = box_iou(pred, truth);

    // targets in raw-prediction space, in the order x, y, w, h
    float target[4];
    target[0] = truth.x*lw - i;
    target[1] = truth.y*lh - j;
    target[2] = log(truth.w*w / biases[2*n]);
    target[3] = log(truth.h*h / biases[2*n + 1]);

    int k;
    for(k = 0; k < 4; ++k){
        const int slot = index + k*stride;
        delta[slot] = scale * (target[k] - x[slot]);
    }
    return iou;
}

我们可以看到真实框的计算:
$scale = 2 - truth.w \times truth.h$

$tx = truth.x \times lw - i$

$ty = truth.y \times lh - j$

$tw = \log(truth.w \times w / anchor.w)$

$th = \log(truth.h \times h / anchor.h)$

其中 $lw$、$lh$ 为当前 yolo 层的宽和高，$w$、$h$ 为网络输入图片的宽和高，$i$、$j$ 为目标中心所在 grid cell 的列、行下标。

class_LOSS的计算:

//delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat);
/*
 * Write the class deltas for one predicted box.
 *
 * Class k of the box lives at position index + stride*k in both output and
 * delta.
 *
 * - If delta[index] is already non-zero, only the entry of the target class
 *   is rewritten (to 1 - output); every other class delta is left untouched.
 *   (Presumably this means an earlier call already filled this box's class
 *   deltas - confirm against the callers.)
 * - Otherwise all `classes` entries are written: (1 - output) for the target
 *   class and (0 - output) for every other class.
 *
 * When avg_cat is non-NULL, the output score of the target class is added to
 * *avg_cat.
 */
void delta_yolo_class(float *output, float *delta, int index, int class, int classes, int stride, float *avg_cat)
{
    int k;

    if (delta[index] != 0) {
        const int hit = index + stride*class;
        delta[hit] = 1 - output[hit];
        if (avg_cat) *avg_cat += output[hit];
        return;
    }

    for (k = 0; k < classes; ++k) {
        const int slot = index + stride*k;
        const int is_target = (k == class);
        delta[slot] = (is_target ? 1 : 0) - output[slot];
        if (is_target && avg_cat) *avg_cat += output[slot];
    }
}

yolo v3 源码阅读(3):yolo train forward

yolo forward

推荐阅读更多精彩内容