1. Reading the train function in YOLO
It sets the training-data path and the intermediate backup (checkpoint) path, prepares random batch data, and builds the network structure, as usual. The main goal here is to trace how training actually works, i.e., where localization and recognition enter the loss.
Looking for the loss function:
float loss = train_network(net, train);
if (avg_loss < 0) avg_loss = loss;
avg_loss = avg_loss*.9 + loss*.1;
Interestingly, avg_loss can be < 0 here. The reason is that the training loop initializes avg_loss to -1 as a "no average yet" sentinel, so the first measured loss seeds the running average.
After that it keeps a weighted (exponential moving) average: avg_loss = 0.9*avg_loss + 0.1*loss.
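For context, a minimal sketch of the surrounding loop, following the structure of the training drivers (train_yolo/train_detector); data loading is elided, the -1 initialization is the point:

float avg_loss = -1;                        // sentinel: no average yet
while(get_current_batch(net) < net.max_batches){
    data train = ...;                       // load the next random batch (elided)
    float loss = train_network(net, train);
    if (avg_loss < 0) avg_loss = loss;      // first iteration: seed the average
    avg_loss = avg_loss*.9 + loss*.1;       // exponential moving average
}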
It turns out to call train_network in network.c, which in turn drives train_network_datum:
float train_network_datum(network net, float *x, float *y)
// train_network() calls this with the batch's x and y flattened into contiguous arrays
{
#ifdef GPU
    if(gpu_index >= 0) return train_network_datum_gpu(net, x, y);
#endif
    network_state state;
    *net.seen += net.batch;
    state.index = 0;
    state.net = net;
    state.input = x;
    state.delta = 0;
    state.truth = y;
    state.train = 1;
    forward_network(net, state);
    backward_network(net, state);
    float error = get_network_cost(net); // the cost is read here
    if(((*net.seen)/net.batch)%net.subdivisions == 0) update_network(net);
    return error;
}
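Note the update condition at the end: *net.seen counts images, so (*net.seen)/net.batch counts completed forward/backward calls, and update_network fires only on every net.subdivisions-th call. Worked example: the cfg parser divides batch by subdivisions, so batch=64, subdivisions=8 gives net.batch = 8; each call processes 8 images, gradients accumulate in the *_updates buffers, and the weights are actually updated once 64 images (one full cfg batch) have been seen.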
Looking at get_network_cost(net):
float get_network_cost(network net)
{
    int i;
    float sum = 0;
    int count = 0;
    for(i = 0; i < net.n; ++i){
        if(net.layers[i].cost){ // important: there may be more than one cost layer
            sum += net.layers[i].cost[0];
            ++count;
        }
    }
    return sum/count;
}
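Only layers whose l.cost pointer is non-NULL contribute. In this code base the COST, DETECTION, and REGION layers each calloc a one-element cost array in their make_* constructors, so for a typical YOLO cfg there is exactly one such layer and the sum/count average is over a single value.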
typedef struct network{
float *workspace;
int n;
int batch;
int *seen;
float epoch;
int subdivisions;
float momentum;
float decay;
layer *layers;
int outputs;
float *output;
learning_rate_policy policy;
float learning_rate;
float gamma;
float scale;
float power;
int time_steps;
int step;
int max_batches;
float *scales;
int *steps;
int num_steps;
int burn_in;
int inputs;
int h, w, c;
int max_crop;
int min_crop;
float angle;
float aspect;
float exposure;
float saturation;
float hue;
int gpu_index;
#ifdef GPU
float **input_gpu;
float **truth_gpu;
#endif
} network;
Next, the layer struct, a remarkable catch-all data structure: every layer type in the framework shares this single struct, discriminated by the LAYER_TYPE field (see the sketch after the listing).
struct layer{
LAYER_TYPE type;
ACTIVATION activation;
COST_TYPE cost_type;
int batch_normalize;
int shortcut;
int batch;
int forced;
int flipped;
int inputs;
int outputs;
int truths;
int h,w,c;
int out_h, out_w, out_c;
int n;
int max_boxes;
int groups;
int size;
int side;
int stride;
int pad;
int sqrt;
int flip;
int index;
int binary;
int xnor;
int steps;
int hidden;
float dot;
float angle;
float jitter;
float saturation;
float exposure;
float shift;
float ratio;
int softmax;
int classes;
int coords;
int background;
int rescore;
int objectness;
int does_cost;
int joint;
int noadjust;
int reorg;
int log;
float alpha;
float beta;
float kappa;
float coord_scale;
float object_scale;
float noobject_scale;
float class_scale;
int random;
int dontload;
int dontloadscales;
float temperature;
float probability;
float scale;
int *indexes;
float *rand;
float *cost;
char *cweights;
float *state;
float *prev_state;
float *forgot_state;
float *forgot_delta;
float *state_delta;
float *concat;
float *concat_delta;
float *binary_weights;
float *biases;
float *bias_updates;
float *scales;
float *scale_updates;
float *weights;
float *weight_updates;
float *col_image;
int * input_layers;
int * input_sizes;
float * delta;
float * output;
float * squared;
float * norms;
float * spatial_mean;
float * mean;
float * variance;
float * mean_delta;
float * variance_delta;
float * rolling_mean;
float * rolling_variance;
float * x;
float * x_norm;
struct layer *input_layer;
struct layer *self_layer;
struct layer *output_layer;
struct layer *input_gate_layer;
struct layer *state_gate_layer;
struct layer *input_save_layer;
struct layer *state_save_layer;
struct layer *input_state_layer;
struct layer *state_state_layer;
struct layer *input_z_layer;
struct layer *state_z_layer;
struct layer *input_r_layer;
struct layer *state_r_layer;
struct layer *input_h_layer;
struct layer *state_h_layer;
float *z_cpu;
float *r_cpu;
float *h_cpu;
float *binary_input;
size_t workspace_size;
#ifdef GPU
float *z_gpu;
float *r_gpu;
float *h_gpu;
int *indexes_gpu;
float * prev_state_gpu;
float * forgot_state_gpu;
float * forgot_delta_gpu;
float * state_gpu;
float * state_delta_gpu;
float * gate_gpu;
float * gate_delta_gpu;
float * save_gpu;
float * save_delta_gpu;
float * concat_gpu;
float * concat_delta_gpu;
float *binary_input_gpu;
float *binary_weights_gpu;
float * mean_gpu;
float * variance_gpu;
float * rolling_mean_gpu;
float * rolling_variance_gpu;
float * variance_delta_gpu;
float * mean_delta_gpu;
float * col_image_gpu;
float * x_gpu;
float * x_norm_gpu;
float * weights_gpu;
float * weight_updates_gpu;
float * biases_gpu;
float * bias_updates_gpu;
float * scales_gpu;
float * scale_updates_gpu;
float * output_gpu;
float * delta_gpu;
float * rand_gpu;
float * squared_gpu;
float * norms_gpu;
#ifdef CUDNN
cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
cudnnFilterDescriptor_t weightDesc;
cudnnFilterDescriptor_t dweightDesc;
cudnnConvolutionDescriptor_t convDesc;
cudnnConvolutionFwdAlgo_t fw_algo;
cudnnConvolutionBwdDataAlgo_t bd_algo;
cudnnConvolutionBwdFilterAlgo_t bf_algo;
#endif
#endif
};
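This "one struct for all layer types" design trades memory for simplicity: each layer fills only the fields its type uses, leaving the rest zeroed, and all dispatch is a chain of comparisons on l.type. A toy sketch of the same pattern (hypothetical names, not darknet code):

typedef enum { TOY_CONV, TOY_COST } toy_type;

typedef struct {
    toy_type type;     // discriminant: says which fields below are meaningful
    int size, stride;  // used by TOY_CONV only
    float *cost;       // used by TOY_COST only, NULL otherwise
} toy_layer;

void toy_forward(toy_layer l)
{
    if (l.type == TOY_CONV)      { /* convolution path */ }
    else if (l.type == TOY_COST) { /* cost path */ }
}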
So the cost is already computed during forward propagation:
void forward_network(network net, network_state state)
{
    state.workspace = net.workspace;
    int i;
    for(i = 0; i < net.n; ++i){
        state.index = i;
        layer l = net.layers[i];
        if(l.delta){
            scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
        }
        if(l.type == CONVOLUTIONAL){
            forward_convolutional_layer(l, state);
        } else if(l.type == DECONVOLUTIONAL){
            forward_deconvolutional_layer(l, state);
        } else if(l.type == ACTIVE){
            forward_activation_layer(l, state);
        } else if(l.type == LOCAL){
            forward_local_layer(l, state);
        } else if(l.type == NORMALIZATION){
            forward_normalization_layer(l, state);
        } else if(l.type == BATCHNORM){
            forward_batchnorm_layer(l, state);
        } else if(l.type == DETECTION){
            forward_detection_layer(l, state);
        } else if(l.type == REGION){
            forward_region_layer(l, state);
        } else if(l.type == CONNECTED){
            forward_connected_layer(l, state);
        } else if(l.type == RNN){
            forward_rnn_layer(l, state);
        } else if(l.type == GRU){
            forward_gru_layer(l, state);
        } else if(l.type == CRNN){
            forward_crnn_layer(l, state);
        } else if(l.type == CROP){
            forward_crop_layer(l, state);
        } else if(l.type == COST){
            forward_cost_layer(l, state);
        } else if(l.type == SOFTMAX){
            forward_softmax_layer(l, state);
        } else if(l.type == MAXPOOL){
            forward_maxpool_layer(l, state);
        } else if(l.type == REORG){
            forward_reorg_layer(l, state);
        } else if(l.type == AVGPOOL){
            forward_avgpool_layer(l, state);
        } else if(l.type == DROPOUT){
            forward_dropout_layer(l, state);
        } else if(l.type == ROUTE){
            forward_route_layer(l, net);
        } else if(l.type == SHORTCUT){
            forward_shortcut_layer(l, state);
        }
        state.input = l.output;
    }
}
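Two details in this loop are worth noting: each layer's delta is zeroed at the start of every pass (scal_cpu with factor 0), and state.input = l.output at the bottom feeds each layer's output in as the next layer's input, which is why a flat array of layers is all the network needs.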
Below we see it computes the cost with either smooth L1 or L2; this fragment is from forward_cost_layer in cost_layer.c:
if(l.cost_type == SMOOTH){
    smooth_l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
} else {
    l2_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
}
l.cost[0] = sum_array(l.output, l.batch*l.inputs);
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
    int i;
    for(i = 0; i < n; ++i){
        float diff = truth[i] - pred[i];
        float abs_val = fabs(diff);
        if(abs_val < 1) {
            error[i] = diff * diff;   // quadratic near zero
            delta[i] = diff;
        }
        else {
            error[i] = 2*abs_val - 1; // linear in the tails
            delta[i] = (diff < 0) ? -1 : 1;
        }
    }
}
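In math form, with d = truth - pred per element:

    error = d^2          if |d| < 1
    error = 2*|d| - 1    otherwise

This is twice the usual smooth-L1/Huber form (0.5*d^2 and |d| - 0.5); both branches meet at |d| = 1 with value 1, and delta is continuous there too (d vs. sign(d)).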
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
    int i;
    for(i = 0; i < n; ++i){
        float diff = truth[i] - pred[i];
        error[i] = diff * diff;
        delta[i] = diff;
    }
}
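Note the sign convention: delta = truth - pred is, for L2, -(1/2) of the derivative of error with respect to pred, i.e. it already points in the descent direction; backward_cost_layer then essentially just scales this delta by l.scale into the previous layer's delta instead of recomputing a gradient.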
The important inputs are state.truth and the element count n = l.batch*l.inputs. For reference, network_state:
typedef struct network_state {
float *truth;
float *input;
float *delta;
float *workspace;
int train;
int index;
network net;
} network_state;
Back in train_network_datum, state.truth is simply the y that was passed in:
float train_network_datum(network net, float *x, float *y)
{
#ifdef GPU
    if(gpu_index >= 0) return train_network_datum_gpu(net, x, y);
#endif
    network_state state;
    *net.seen += net.batch;
    state.index = 0;
    state.net = net;
    state.input = x;
    state.delta = 0;
    state.truth = y; // HERE
    state.train = 1;
    forward_network(net, state);
    backward_network(net, state);
    float error = get_network_cost(net);
    if(((*net.seen)/net.batch)%net.subdivisions == 0) update_network(net);
    return error;
}
And y is filled batch by batch in train_network:
float train_network(network net, data d)
{
    int batch = net.batch;
    int n = d.X.rows / batch;
    float *X = calloc(batch*d.X.cols, sizeof(float));
    float *y = calloc(batch*d.y.cols, sizeof(float)); // HERE
    int i;
    float sum = 0;
    for(i = 0; i < n; ++i){
        get_next_batch(d, batch, i*batch, X, y);
        float err = train_network_datum(net, X, y);
        sum += err;
    }
    free(X);
    free(y);
    return (float)sum/(n*batch);
}
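get_next_batch (in network.c) just copies batch rows of d.X and d.y into those flat buffers; a sketch consistent with darknet's matrix struct (rows, cols, float **vals):

void get_next_batch(data d, int n, int offset, float *X, float *y)
{
    int j;
    for(j = 0; j < n; ++j){
        int index = offset + j;
        // one example per matrix row, copied contiguously into the flat buffer
        memcpy(X + j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float));
        memcpy(y + j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
    }
}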
typedef struct{
int w, h;
matrix X;
matrix y;//HERE
int *indexes;
int shallow;
int *num_boxes;
box **boxes;
} data;
From this we can see the loss only ever touches d.y, not d.boxes directly: the ground-truth boxes were already flattened into the truth matrix y during data loading (fill_truth_detection / fill_truth_region in data.c write each box's coordinates and class into y).
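Concretely, for the v1 DETECTION layer, forward_detection_layer appears to read the truth as side*side grid cells of 1+classes+coords floats each: an objectness flag, a one-hot class vector, then the box. A hypothetical index helper (not in darknet) illustrating that assumed layout:

// Assumed per-cell truth layout: [is_obj][class one-hot][x y w h]
int truth_cell_index(int b, int cell, int locations, int classes, int coords)
{
    return (b*locations + cell) * (1 + classes + coords);
}

So both localization (the box coordinates) and recognition (the class vector) enter the cost through y, which answers the question posed at the top of this note.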