In darknet-yolov3, learning_rate is a hyperparameter; when tuning, you can adjust it to help the model converge to a good state.
In the cfg file it is set in the [net] section; I just picked an arbitrary value for it here.
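For reference, here is a minimal [net] excerpt with the stock yolov3.cfg defaults (these are the upstream values, not mine; what you actually set is up to you):

[net]
# ... batch, subdivisions, width, height, and other [net] options ...
learning_rate=0.001
burn_in=1000
max_batches=500200
policy=steps
steps=400000,450000
scales=.1,.1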
Next, a few words about burn_in and policy.
Both show up in the code, in the get_current_rate function:
float get_current_rate(network *net)
{
    size_t batch_num = get_current_batch(net);
    int i;
    float rate;
    if (batch_num < net->burn_in) // while batch_num is below burn_in, return the warm-up rate
        return net->learning_rate * pow((float)batch_num / net->burn_in, net->power);
    switch (net->policy) { // past burn_in, pick by policy; the stock cfg uses STEPS
        case CONSTANT:
            return net->learning_rate;
        case STEP:
            return net->learning_rate * pow(net->scale, batch_num/net->step);
        case STEPS:
            rate = net->learning_rate;
            for(i = 0; i < net->num_steps; ++i){
                if(net->steps[i] > batch_num) return rate;
                rate *= net->scales[i];
            }
            return rate;
        case EXP:
            return net->learning_rate * pow(net->gamma, batch_num);
        case POLY:
            return net->learning_rate * pow(1 - (float)batch_num / net->max_batches, net->power);
        case RANDOM:
            return net->learning_rate * pow(rand_uniform(0,1), net->power);
        case SIG:
            return net->learning_rate * (1./(1.+exp(net->gamma*(batch_num - net->step))));
        default:
            fprintf(stderr, "Policy is weird!\n");
            return net->learning_rate;
    }
}
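To get a feel for the warm-up branch, here is a minimal standalone sketch (not darknet code) of the formula rate = learning_rate * (batch_num / burn_in)^power, using the hypothetical values learning_rate=0.001 and burn_in=1000 together with darknet's default power of 4:

#include <stdio.h>
#include <math.h>

int main(void)
{
    /* Hypothetical cfg values for illustration; power defaults to 4 in darknet. */
    float learning_rate = 0.001f;
    float burn_in = 1000.0f;
    float power = 4.0f;
    for (int batch_num = 0; batch_num <= 1000; batch_num += 250) {
        /* e.g. batch 500 gives 0.001 * 0.5^4 = 6.25e-05 */
        float rate = learning_rate * powf((float)batch_num / burn_in, power);
        printf("batch %4d: rate = %g\n", batch_num, rate);
    }
    return 0;
}

The rate ramps from 0 toward the cfg learning_rate over the first burn_in batches.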
I made some adjustments here.
The reason: the learning rate I had set and the learning rate at the end of burn_in always differed considerably, which left the loss either stalled or oscillating violently.
The fix: make the starting learning rate of the STEPS schedule equal to the learning rate at the end of burn_in.
The implementation is as follows:
float last_rate;

float get_current_rate(network *net)
{
    size_t batch_num = get_current_batch(net);
    int i;
    float rate;
    if (batch_num < net->burn_in) {
        /******************************************************/
        // remember the latest warm-up rate so STEPS can start from it
        last_rate = net->learning_rate * pow((float)batch_num / net->burn_in, net->power);
        /*****************************************************/
        return net->learning_rate * pow((float)batch_num / net->burn_in, net->power);
    }
    switch (net->policy) {
        case CONSTANT:
            return net->learning_rate;
        case STEP:
            return net->learning_rate * pow(net->scale, batch_num/net->step);
        case STEPS:
            //rate = net->learning_rate;
            rate = last_rate; // start STEPS from where burn_in left off
            for(i = 0; i < net->num_steps; ++i){
                if(net->steps[i] > batch_num) return rate;
                rate *= net->scales[i];
            }
            return rate;
        case EXP:
            return net->learning_rate * pow(net->gamma, batch_num);
        case POLY:
            return net->learning_rate * pow(1 - (float)batch_num / net->max_batches, net->power);
        case RANDOM:
            return net->learning_rate * pow(rand_uniform(0,1), net->power);
        case SIG:
            return net->learning_rate * (1./(1.+exp(net->gamma*(batch_num - net->step))));
        default:
            fprintf(stderr, "Policy is weird!\n");
            return net->learning_rate;
    }
}
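One caveat, which is my own note rather than part of the patch: last_rate is a file-scope float that is only written inside the warm-up branch, so it starts at 0. If you resume training from a checkpoint whose batch count is already past burn_in, that branch is never entered and STEPS would return a rate of 0. A minimal guard, as a sketch:

        case STEPS:
            /* Hypothetical guard (not in the patch above): fall back to the
               cfg learning_rate when last_rate was never written, e.g. when
               resuming from a checkpoint already past burn_in. */
            rate = (last_rate > 0) ? last_rate : net->learning_rate;
            for(i = 0; i < net->num_steps; ++i){
                if(net->steps[i] > batch_num) return rate;
                rate *= net->scales[i];
            }
            return rate;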