Caffe Loss Analysis


Caffe_Loss

The loss function is an essential component of deep learning: every optimization algorithm is driven by the loss, and how well the loss is designed largely determines how well the network ultimately learns. All loss layers derive from \(LossLayer\), and each concrete loss layer adds its own parameters.

1. Basic functions

Each loss layer mainly provides a constructor, the Forward and Backward passes, and Reshape (some also provide SetUp), and every loss layer carries a loss weight.
    explicit XXXLossLayer(const LayerParameter& param):
    LossLayer<Dtype>(param),diff_() {}
    virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
            const vector<Blob<Dtype>*>& top);
    virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
            const vector<Blob<Dtype>*>& top);
    virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
            const vector<Blob<Dtype>*>& top);
    virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
            const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
    virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
            const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

2. Common loss functions

Training uses the mini-batch mode, so every loss below is averaged over the batch.
(1) EuclideanLoss (Euclidean loss, L2 loss)

The formula for \(EuclideanLoss\) is \(loss = \frac{1}{2n}\sum_{i=1}^n{(y_{i}-\hat{y}_{i})^2}\)
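
As a quick sanity check of the formula (a standalone illustration, not Caffe code; the toy values below are made up), the following C++ snippet computes the loss and its gradient for a tiny batch:

    #include <cstdio>
    #include <vector>

    // Toy check of loss = 1/(2n) * sum_i (y_i - yhat_i)^2 and
    // d(loss)/d(y_i) = (y_i - yhat_i)/n for a made-up batch of n = 2, c = 3.
    int main() {
      const int n = 2, c = 3;
      std::vector<float> pred   = {0.5f, 1.0f, -0.5f, 2.0f, 0.0f, 1.5f};  // network output
      std::vector<float> target = {0.0f, 1.0f,  0.5f, 2.0f, 1.0f, 1.0f};  // labels
      float loss = 0.f;
      std::vector<float> grad(n * c);
      for (int i = 0; i < n * c; ++i) {
        float diff = pred[i] - target[i];  // y_predict - y_label (the diff_ blob)
        loss += diff * diff;
        grad[i] = diff / n;                // gradient w.r.t. the prediction
      }
      loss /= 2 * n;                       // divide by 2n
      std::printf("loss = %f, grad[0] = %f\n", loss, grad[0]);
      return 0;
    }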

  //Reshape: resizes the layer; diff_ has the same N*C shape as the inputs
  template <typename Dtype>
  void EuclideanLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top){
      LossLayer<Dtype>::Reshape(bottom,top);//call the base class Reshape first
      CHECK_EQ(bottom[0]->count(1),bottom[1]->count(1));//predictions and labels must match
      diff_.ReshapeLike(*bottom[0]);//usually N*C*1*1
  }

  // Forward_cpu: computes the loss
  template <typename Dtype>
  void EuclideanLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top){
     const int count = bottom[0]->count();
     caffe_sub(count,
               bottom[0]->cpu_data(),//network output, N*C
               bottom[1]->cpu_data(),//corresponding labels, N*C
               diff_.mutable_cpu_data()//element-wise difference
           );//computes y_{predict}-y_{label}, i.e. bottom[0]-bottom[1]
     Dtype dot = caffe_cpu_dot(count,diff_.cpu_data(),diff_.cpu_data());
     //bottom[0]->num() == bottom[0]->shape(0)
     Dtype loss = dot/bottom[0]->num()/Dtype(2);//loss/(2*n)
     top[0]->mutable_cpu_data()[0] = loss;
  }

 //Backward_cpu: f'(x) = 1/n*(y_{predict}-y_{label})
 template <typename Dtype>
 void EuclideanLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>&propagate_down,const vector<Blob<Dtype>*>& bottom){
    for (size_t i = 0; i < 2; i++) {
        if (propagate_down[i]) {//this bottom needs a gradient
            //corresponds to predict-label; if the label were bottom[0] the sign would flip
            const Dtype sign = (i==0) ? 1 : -1;
            //top[0]->cpu_diff() has length 1; alpha below is loss_weight/n
            const Dtype alpha = sign*top[0]->cpu_diff()[0]/bottom[0]->num();
            //y = a*x + b*y
            caffe_cpu_axpby(bottom[0]->count(),//count
                            alpha,// loss_weight/n
                            diff_.cpu_data(),//y_{predict}-y_{label}
                            Dtype(0),
                            bottom[i]->mutable_cpu_diff()
                        );//1/n*loss_weight*(y_{predict}-y_{label})
        }
    }
    //The Euclidean loss is simple in form and is mostly used for regression;
    //for classification the output scales would need to be normalized first.
 }
(2) SoftmaxWithLoss (Softmax loss)

The softmax function normalizes the raw class scores into per-class probabilities (prob).
This is the standard classification loss: a Softmax output combined with a Multinomial Logistic Loss. The formulas are:

\[y_i = softmax(x_i) = \frac{exp(x_i)}{\sum_{j=1}^{n}{exp(x_j)}} \]

\[loss = -log(y_k), \quad k \text{ is the ground-truth label of the sample} \]

Derivation of the gradient: \(\frac{\partial Loss}{\partial x_i}=\sum_{j=1}^{n}{\frac{\partial loss}{\partial y_j}*\frac{\partial y_j}{\partial x_i}}=-\frac{1}{y_k}*\frac{\partial y_k}{\partial x_i}\), where \(k\) is the ground-truth label and \(\frac{\partial loss}{\partial y_j}=0\) for every other \(j\).

\[\qquad \frac{\partial y_k}{\partial x_i} = \frac{\partial softmax(x_k)}{\partial x_i}= \begin{cases} \ y_k*(1-y_k) \qquad k = i \\\ \\ \ -y_k*y_i \qquad \qquad k \neq i \end{cases} \]

Combining the two gives:

\[\frac{\partial loss}{\partial x_i}= \begin{cases} \ y_k-1 \qquad\quad k = i,\ \text{i.e. } i \text{ is the true label} \\\ \\ \ y_i \qquad \qquad\, k \neq i,\ \text{i.e. } i \text{ is not the true label} \end{cases} \]
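
As a quick numeric check of this result (a standalone sketch, not Caffe code; the logits and label are made up), the gradient of \(-log(y_k)\) with respect to the logits is just the softmax output with 1 subtracted at the true label:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Softmax probabilities and the gradient of loss = -log(y_k) w.r.t. the
    // logits, for a made-up 3-class example with true label k = 1.
    int main() {
      std::vector<double> x = {1.0, 2.0, 0.5};  // logits
      const int k = 1;                          // ground-truth label
      double denom = 0.0;
      for (double xi : x) denom += std::exp(xi);
      std::vector<double> y(x.size()), grad(x.size());
      for (int i = 0; i < static_cast<int>(x.size()); ++i) {
        y[i] = std::exp(x[i]) / denom;          // softmax output
        grad[i] = y[i] - (i == k ? 1.0 : 0.0);  // y_i, or y_k - 1 at the true label
      }
      std::printf("loss = %f, grad = [%f, %f, %f]\n",
                  -std::log(y[k]), grad[0], grad[1], grad[2]);
      return 0;
    }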

The concrete implementation is shown below:

1. Inputs of SoftmaxWithLossLayer: bottom

    // bottom[0] is the feature output of the previous layer, usually N*C*1*1
    // bottom[1] is the sample label from the data layer, usually N*1*1*1
    // declaration
    const vector<Blob<Dtype>*>& bottom;
    //part of the backward code
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
    const Dtype* prob_data = prob_.cpu_data();
    caffe_copy(prob_.count(), prob_data, bottom_diff);
    const Dtype* label = bottom[1]->cpu_data();//label

2. Outputs of SoftmaxWithLossLayer: top

    // The output of SoftmaxWithLossLayer is simply the final 1*1*1*1 loss.
    // If there is a second top it also exposes the softmax output; note that the
    // layer runs the Softmax forward pass internally and stores the probabilities in prob_
    const vector<Blob<Dtype>*>& top;
    //part of the forward code
    top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
    if (top.size() == 2) {
        top[1]->ShareData(prob_);//top[1] shares the softmax probabilities
    }

3. Key member variables of SoftmaxWithLossLayer: \(softmax\_top\_vec\_\) and \(prob\_\) hold intermediate results

    shared_ptr<Layer<Dtype> > softmax_layer_;
    /// prob stores the output probability predictions from the SoftmaxLayer.
    Blob<Dtype> prob_;
    /// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
    vector<Blob<Dtype>*> softmax_bottom_vec_;
    /// top vector holder used in call to the underlying SoftmaxLayer::Forward
    vector<Blob<Dtype>*> softmax_top_vec_;
    /// Whether to ignore instances with a certain label.
    bool has_ignore_label_;
    /// The label indicating that an instance should be ignored.
    int ignore_label_;
    /// How to normalize the output loss.
    LossParameter_NormalizationMode normalization_;

    int softmax_axis_, outer_num_, inner_num_;//shape info shared by the softmax output and the loss
    template <typename Dtype>
    void SoftmaxWithLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top){
        LossLayer<Dtype>::Reshape(bottom,top);//call the base class Reshape first
        softmax_layer_->Reshape(softmax_bottom_vec_,softmax_top_vec_);
        int axis = this->layer_param_.softmax_param().axis();//softmax_param axis from the prototxt (default 1)
        softmax_axis_ = bottom[0]->CanonicalAxisIndex(axis);//non-negative kept, negative counted from the end
        outer_num_ = bottom[0]->count(0,softmax_axis_);// N, the mini-batch size
        inner_num_ = bottom[0]->count(softmax_axis_+1);// H*W, usually 1*1
        //must satisfy outer_num_*inner_num_ == bottom[1]->count(); bottom[1] holds the N labels
        if (top.size() >= 2) {//the extra top simply exposes prob_, the values are identical
            top[1]->ReshapeLike(*bottom[0]);
        }
    }

    //forward computes the loss, loss = -log(p_label)
    //SoftmaxWithLoss wraps a Softmax, so the Softmax forward pass runs first to get the class probabilities
    template <typename Dtype>
    void SoftmaxWithLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top){
        //run the Softmax forward pass
        softmax_layer_->Forward(softmax_bottom_vec_,softmax_top_vec_);
        //equivalent to softmax_top_vec_[0]->cpu_data()
        const Dtype* prob_data = prob_.cpu_data();
        const Dtype* label = bottom[1]->cpu_data();//labels, usually from the Data layer
        // prob_ is N*C (N samples, C class probabilities each), so dim = count/N == number of classes
        int dim = prob_.count()/outer_num_;
        int count = 0;//number of samples that actually contribute to the loss
        Dtype loss = 0;
        for (size_t i = 0; i < outer_num_; i++) {//loop over samples
            for (size_t j = 0; j < inner_num_; j++) { //inner_num_ is almost always 1, so j == 0
                const int label_value = static_cast<int>(label[i*inner_num_+j]);
                if(has_ignore_label_ && label_value == ignore_label_){
                    // layer parameter: labels that do not contribute to the loss
                    continue;
                }
                else{//in practice Caffe also checks 0 <= label_value < prob_.shape(softmax_axis_)
                    // -= because loss = -log(p_label); prob_data is N*C
                    loss -= log(std::max(prob_data[i*dim+label_value*inner_num_+j],
                                    Dtype(FLT_MIN)));//guard against log(0)/NaN
                    ++count;
                }
            }
        }
        //after all samples have been visited, normalize the loss:
        top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
    }

    // Backward_cpu: the gradient actually flows back into the softmax input,
    // using the y -> x conversion derived above:
    // bottom_diff = top_diff * softmaxWithLoss' = top_diff * {p - 1 or p}
    template <typename Dtype>
    void SoftmaxWithLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down,const vector<Blob<Dtype>*>& bottom){
        //the fc output and label positions are fixed, so unlike the Euclidean loss
        //there is no need to figure out which bottom is which
        if (propagate_down[1]) {
            //the label input gets no gradient
        }
        if (propagate_down[0]) {//the input that needs a gradient
            Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();//the diff to fill in
            const Dtype* prob_data = prob_.cpu_data();//N*C
            //initialize the diff with the softmax output y, i.e. bottom_diff[t] = y_t
            caffe_copy(prob_.count(),prob_data,bottom_diff);
            const Dtype* label = bottom[1]->cpu_data();
            // dim = prob_.count()/outer_num_ = C*inner_num_, i.e. the number of classes when inner_num_ == 1
            int dim = prob_.count()/ outer_num_;
            int count = 0;
            for (size_t i = 0; i < outer_num_; i++) { //loop over the N samples
                for (size_t j = 0; j < inner_num_; j++) { // in practice j == 0
                    const int label_value = static_cast<int>(label[i*inner_num_+j]);
                    if (has_ignore_label_ && label_value == ignore_label_) {
                        //this sample is excluded from the loss
                        //(the actual Caffe code zeroes the whole probability row here)
                        bottom_diff[i*dim+label_value*inner_num_+j] = 0;
                    }
                    else{
                        //every diff entry was initialized to y_t above; by the softmax
                        //derivative, the entry of the true label becomes y_t - 1
                        bottom_diff[i*dim+label_value*inner_num_+j] -= 1;
                        ++count;
                    }
                }
            }
            //this is only part of the gradient; it still needs top_diff (the loss weight),
            //normalized the same way as in Forward_cpu
            Dtype loss_weight = top[0]->cpu_diff()[0] / get_normalizer(normalization_, count);
            caffe_scal(prob_.count(),loss_weight,bottom_diff);
        }
    }

(3) SmoothL1Loss (the loss introduced with the R-CNN family of detectors)
SmoothL1Loss is a modified version of the squared Euclidean error: it is a piecewise function that is insensitive to outliers. The formula is:

\[SmoothL1Loss(x) = \begin{cases} \ 0.5*(sigma*x)^2 \qquad \quad \left|x\right| < 1./sigma^2 \\ \ \left|x\right|-0.5/sigma^2 \qquad otherwise \end{cases} \]

The overall computation is \(x_{new} = x_{input}*w_{in},\ output = w_{out}*SmoothL1Loss(x_{new})\).
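
To make the piecewise definition concrete, here is a minimal standalone C++ helper (for illustration only; smooth_l1 is not a Caffe function) that mirrors the formula, including the sigma scaling:

    #include <cmath>
    #include <cstdio>

    // SmoothL1(x) = 0.5 * (sigma * x)^2    if |x| < 1 / sigma^2
    //             = |x| - 0.5 / sigma^2    otherwise
    double smooth_l1(double x, double sigma2) {  // sigma2 = sigma * sigma
      double abs_x = std::fabs(x);
      if (abs_x < 1.0 / sigma2) return 0.5 * x * x * sigma2;
      return abs_x - 0.5 / sigma2;
    }

    int main() {
      // With sigma = 1 the transition point is |x| = 1:
      // quadratic inside the interval, linear (outlier-insensitive) outside.
      for (double x : {0.25, 0.5, 1.0, 2.0, 4.0})
        std::printf("x = %.2f  smooth_l1 = %.4f\n", x, smooth_l1(x, 1.0));
      return 0;
    }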
1. Basic data members and their meaning:

    Blob<Dtype> diff_;// w_in*(bottom[0]-bottom[1]), the scaled difference
    Blob<Dtype> errors_;// the per-element SmoothL1 loss
    Blob<Dtype> ones_;// a blob of ones, used to sum errors_ via a dot product
    bool has_weights_; // whether the inner/outer weight bottoms are present
    Dtype sigma2_ ;// sigma defaults to 1; here sigma2_ = sigma*sigma

2. Basic member functions
They essentially consist of the four functions LayerSetUp, Reshape, Forward, and Backward, implemented as follows:

    //layer setup; SmoothL1LossLayer has one parameter, sigma, which defaults to 1
    template <typename Dtype>
    void SmoothL1LossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>&bottom,
    const vector<Blob<Dtype>*>& top){
        SmoothL1LossParameter loss_param = this->layer_param_.smooth_l1_loss_param();
        sigma2_ = loss_param.sigma()*loss_param.sigma();
        has_weights_ = (bottom.size() >= 3);//bottom[2]/bottom[3] carry the weights
        if (has_weights_) {
            //bottom[2] == in_weight;  // w_in
            //bottom[3] == out_weight; // w_out
        }
    }

    // Reshape: resize the internal blobs according to the inputs
    template <typename Dtype>
    void SmoothL1LossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>&
        bottom,const vector<Blob<Dtype>*>& top){
        LossLayer<Dtype>::Reshape(bottom,top);//base class first
        //check the weight dimensions
        if (has_weights_) {
            CHECK_EQ(bottom[0]->count(1), bottom[2]->count(1));//w_in weights
            CHECK_EQ(bottom[0]->count(1), bottom[3]->count(1));//w_out weights
        }
        diff_.ReshapeLike(*bottom[0]);// diff_ = w_in*(bottom[0]-bottom[1])
        errors_.ReshapeLike(*bottom[0]);// errors_ = SmoothL1(diff_)
        ones_.ReshapeLike(*bottom[0]);// all ones, used to sum errors_
        for (size_t i = 0; i < ones_.count(); i++) {
            ones_.mutable_cpu_data()[i] = Dtype(1);
        }
    }

    // Forward pass, step by step
    template <typename Dtype>
    void SmoothL1LossLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top){
        int count = bottom[0]->count();
        //the order of predictions and labels in bottom[0]/bottom[1] is not fixed
        caffe_gpu_sub( // diff_ = bottom[0]-bottom[1]
            count,
            bottom[0]->gpu_data(),
            bottom[1]->gpu_data(),
            diff_.mutable_gpu_data()
        );
        if (has_weights_) { // x_new = w_in * x_input, where x_input == diff_
            caffe_gpu_mul(
                count,
                bottom[2]->gpu_data(),
                diff_.gpu_data(),
                diff_.mutable_gpu_data()
            );
        }
        //the SmoothL1 forward pass itself, implemented as a GPU kernel
        SmoothL1Forward<Dtype><<<CAFFE_GET_BLOCKS(count),
        CAFFE_CUDA_NUM_THREADS>>>(
            count, diff_.gpu_data(), errors_.mutable_gpu_data(), sigma2_);
        CUDA_POST_KERNEL_CHECK;

        if (has_weights_) { //x_out = SmoothL1(w_in*x_input) * w_out
            caffe_gpu_mul(
                count,
                bottom[3]->gpu_data(),
                errors_.gpu_data(),
                errors_.mutable_gpu_data()
            ); // errors_ = w_out * errors_
        }
        Dtype loss;
        caffe_gpu_dot(count,ones_.gpu_data(),errors_.gpu_data(),&loss);//a dot with ones, i.e. a sum
        top[0]->mutable_cpu_data()[0] = loss/bottom[0]->num();// divide by the mini-batch size
    }

    // GPU implementation of SmoothL1; it follows the formula directly
    template <typename Dtype>
    __global__ void SmoothL1Forward(const int n, const Dtype* in, Dtype* out,
    Dtype sigma2) {
    // f(x) = 0.5 * (sigma * x)^2          if |x| < 1 / sigma / sigma
    //        |x| - 0.5 / sigma / sigma    otherwise
        CUDA_KERNEL_LOOP(index, n) { //for loop
            Dtype val = in[index];
            Dtype abs_val = abs(val);
            if (abs_val < 1.0 / sigma2) {
                out[index] = 0.5 * val * val * sigma2;
            }
            else {
                out[index] = abs_val - 0.5 / sigma2;
            }
        }
    }
In the backward pass, differentiating the loss gives the expression below, which Backward follows directly:

\[\frac{\partial Loss}{\partial x} = w_{in}*w_{out}*\frac{\partial SmoothL1(x)}{\partial x} \]

A CPU version can be written by hand: replace the GPU data/diff calls with their CPU counterparts and rewrite the GPU SmoothL1 kernel as a CPU loop.

    //backward pass, following the derivative above
    // f'()
    template <typename Dtype>
    void SmoothL1LossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down,const vector<Blob<Dtype>*>& bottom){
        int count = diff_.count();

        // the backward kernel computes the SmoothL1 derivative in place
        SmoothL1Backward<Dtype><<<CAFFE_GET_BLOCKS(count),
          CAFFE_CUDA_NUM_THREADS >>>(
            count, diff_.gpu_data(), diff_.mutable_gpu_data(), sigma2_);
        CUDA_POST_KERNEL_CHECK;

        //the loop mirrors the Euclidean loss: the order of bottom[0]/bottom[1]
        //(fc output vs. label) is not fixed; forward assumed 0-1, so if bottom[0]
        //were the label the sign would be -1
        for (size_t i = 0; i < 2; i++) {
            if (propagate_down[i]) {
                const Dtype sign = (i == 0) ? 1:-1;//the code implicitly assumes the label is bottom[1]
                //sign * loss_weight / n
                const Dtype alpha = sign*top[0]->cpu_diff()[0]/bottom[i]->num();
                //diff_.gpu_data() now holds the derivative returned by SmoothL1Backward
                caffe_gpu_axpby(
                    count,
                    alpha,
                    diff_.gpu_data(),
                    Dtype(0),
                    bottom[i]->mutable_gpu_diff()
                );
                if (has_weights_) {
                    caffe_gpu_mul(
                        count,
                        bottom[2]->gpu_data(),
                        bottom[i]->gpu_diff(),
                        bottom[i]->mutable_gpu_diff()
                    ); // multiply by the inner weight w_in
                    caffe_gpu_mul(
                        count,
                        bottom[3]->gpu_data(),
                        bottom[i]->gpu_diff(),
                        bottom[i]->mutable_gpu_diff()
                    ); // multiply by the outer weight w_out
                }
            }
        }
    }

    template <typename Dtype>
    __global__ void SmoothL1Backward(const int n, const Dtype* in, Dtype* out,
    Dtype sigma2) {
    // f'(x) = sigma * sigma * x         if |x| < 1 / sigma / sigma
    //       = sign(x)                   otherwise
        CUDA_KERNEL_LOOP(index, n) {
            Dtype val = in[index];
            Dtype abs_val = abs(val);
            if (abs_val < 1.0 / sigma2) {
                out[index] = sigma2 * val;
            }
            else {
                out[index] = (Dtype(0) < val) - (val < Dtype(0));//sign(x): +1 or -1
            }
        }
    }

The CPU versions of the SmoothL1 forward and backward passes are shown below; the CPU version is slow and not recommended.

    //forward: replaces the kernel call in the GPU forward pass
    const Dtype* in = diff_.cpu_data();
    Dtype* out = errors_.mutable_cpu_data();
    for (size_t i = 0; i < diff_.count(); i++) {
       Dtype val = in[i];
       Dtype abs_val = fabs(val);
       if(abs_val < 1.0 / sigma2_){
           out[i] = 0.5 * val * val * sigma2_;
       }
       else{
           out[i] = abs_val - 0.5 / sigma2_;
       }
   }

   //backward: replaces the kernel call in the GPU backward pass
   const Dtype* in = diff_.cpu_data();
   Dtype* out = diff_.mutable_cpu_data();
   for (size_t i = 0; i < diff_.count(); i++) {
      Dtype val = in[i];
      Dtype abs_val = fabs(val);
      if(abs_val < 1.0 / sigma2_){
          out[i] = sigma2_ * val;
      }
      else{
          out[i] = (Dtype(0) < val) - (val < Dtype(0));
      }
   }

SmoothL1 works well in object detection. Because detection combines several losses and transforms the regression targets, bottom[2] and bottom[3] are almost always present. Since the loss is insensitive to far-away points, it can be used in place of the L2 loss.
(4) SigmoidCrossEntropyLoss (cross entropy)

Cross entropy is widely used, typically as a binary-classification loss, e.g. in \(logistic\) regression. Thanks to the output characteristics of the \(sigmoid\) function, its output can be read directly as a class probability. The formulas are:

\[loss = -\frac{1}{n}\sum_{i=1}^{n}(\hat{p_i}*log(p_i)+(1-\hat{p_i})*log(1-p_i)) \]

\[p_i = \frac{1}{1.+exp(-x_i)} \]

\[\frac{\partial loss}{\partial x_i} = -\frac{1}{n}*(\hat{p_i}*\frac{1}{p_i}*p_i*(1-p_i)-(1-\hat{p_i})*\frac{1}{1-p_i}*p_i*(1-p_i)) \]

\[= -\frac{1}{n}*(\hat{p_i}-p_i) = \frac{1}{n}*(p_i-\hat{p_i}) \]
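
The following standalone C++ sketch (illustrative only; the values are made up) checks this derivative numerically for one element: the analytic gradient \(p-\hat{p}\) matches a finite-difference estimate of the cross entropy:

    #include <cmath>
    #include <cstdio>

    double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

    // Cross entropy for a single element: -(t*log(p) + (1-t)*log(1-p)), p = sigmoid(x).
    double xent(double x, double t) {
      double p = sigmoid(x);
      return -(t * std::log(p) + (1.0 - t) * std::log(1.0 - p));
    }

    int main() {
      double x = 0.7, t = 1.0, eps = 1e-6;
      double analytic = sigmoid(x) - t;                            // p - p_hat
      double numeric  = (xent(x + eps, t) - xent(x - eps, t)) / (2 * eps);
      std::printf("analytic = %f, numeric = %f\n", analytic, numeric);
      return 0;
    }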

1. Basic data members

    shared_ptr<SigmoidLayer<Dtype> > sigmoid_layer_;// the internal sigmoid layer
    shared_ptr<Blob<Dtype> > sigmoid_output_; // sigmoid output, N*C with C usually 1
    vector<Blob<Dtype>*> sigmoid_bottom_vec_;// input x of the sigmoid
    vector<Blob<Dtype>*> sigmoid_top_vec_;// output of the sigmoid

2. Basic member functions
The basic member functions are LayerSetUp, Reshape, Forward, and Backward, implemented as follows:

    //layer setup; a Sigmoid layer sits in the middle, so the structure mirrors SoftmaxWithLoss
    template <typename Dtype>
    void SigmoidCrossEntropyLossLayer<Dtype>::LayerSetUp(
        const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
        LossLayer<Dtype>::LayerSetUp(bottom,top);
        sigmoid_bottom_vec_.clear();
        sigmoid_bottom_vec_.push_back(bottom[0]);
        sigmoid_top_vec_.clear();
        sigmoid_top_vec_.push_back(sigmoid_output_.get());//the sigmoid output
        sigmoid_layer_->SetUp(sigmoid_bottom_vec_,sigmoid_top_vec_);
    }

    //Reshape is straightforward
    template <typename Dtype>
    void SigmoidCrossEntropyLossLayer<Dtype>::Reshape(
        const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
        LossLayer<Dtype>::Reshape(bottom,top);//step 1
        sigmoid_layer_->Reshape(sigmoid_bottom_vec_,sigmoid_top_vec_);//step 2
    }

The forward computation in the Caffe code differs from the formula above; the reason is as follows:

\[\hat{p}*log(p)+(1-\hat{p})*log(1-p) = \hat{p}*log(\frac{1}{1+e^{-x}})+(1-\hat{p})*log(\frac{e^{-x}}{1+e^{-x}}) = \hat{p}*log(\frac{1}{1+e^{-x}})-\hat{p}*log(\frac{e^{-x}}{1+e^{-x}})+log(\frac{e^{-x}}{1+e^{-x}}) = \hat{p}*x+log(\frac{e^{-x}}{1+e^{-x}}) \]

When \(e^{-x}\) is large, \(\frac{e^{-x}}{1+e^{-x}}\) cannot be evaluated accurately, so a case split is used: for \(x<0\), multiply numerator and denominator by \(e^{x}\), which gives:

\[\frac{e^{-x}}{1+e^{-x}}= \begin{cases} \ \frac{e^{-x}}{1+e^{-x}} \qquad x\ge0 \\ \ \frac{1}{1+e^{x}} \qquad \,\,\, x<0 \end{cases} \]

Substituting back gives:

\[\hat{p}*x+log(\frac{e^{-x}}{1+e^{-x}})= \begin{cases} \ \hat{p}*x+log(\frac{e^{-x}}{1+e^{-x}}) = (\hat{p}-1) *x-log(1+e^{-x}) \quad x\ge0 \\ \ \hat{p}*x+log(\frac{e^{-x}}{1+e^{-x}})=\hat{p}*x-log(1+e^{x}) \quad\quad \qquad x<0 \end{cases} \]
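
A minimal standalone C++ sketch of the stable form above (illustrative only, not the Caffe source): both branches are folded into the single branchless expression that Forward_cpu uses below:

    #include <cmath>
    #include <cstdio>

    // Stable per-element cross entropy given the raw input x and target t:
    //   x >= 0: -((t - 1) * x - log(1 + exp(-x)))
    //   x <  0: -( t      * x - log(1 + exp( x)))
    // The branchless form below matches the expression used in Forward_cpu.
    double stable_xent(double x, double t) {
      int pos = (x >= 0);
      return -(x * (t - pos) - std::log(1.0 + std::exp(x - 2.0 * x * pos)));
    }

    int main() {
      // The result stays finite and accurate even for large |x|.
      for (double x : {-50.0, -1.0, 0.0, 1.0, 50.0})
        std::printf("x = %6.1f  loss(t=1) = %g\n", x, stable_xent(x, 1.0));
      return 0;
    }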

    // Forward_cpu: runs the sigmoid, then accumulates the loss
    template <typename Dtype>
    void SigmoidCrossEntropyLossLayer<Dtype>::Forward_cpu(
        const vector<Blob<Dtype>*> & bottom,const vector<Blob<Dtype>*>& top){
        sigmoid_bottom_vec_[0] = bottom[0];//arguably redundant: SetUp already points here
        sigmoid_layer_->Forward(sigmoid_bottom_vec_,sigmoid_top_vec_);//Sigmoid forward
        const int count = bottom[0]->count();//N*1*1*1: one score per sample (probability of class 1)
        const int num = bottom[0]->num();
        const Dtype* input_data = bottom[0]->cpu_data();
        const Dtype* target = bottom[1]->cpu_data();//ground-truth labels
        Dtype loss = 0;
        for (size_t i = 0; i < count; i++) {//loop over the mini-batch
            loss -= input_data[i]*(target[i]-(input_data[i]>=0))-
                    log(1.+exp(input_data[i]-2*input_data[i]*(input_data[i]>=0)));
        }
        top[0]->mutable_cpu_data()[0] = loss/num;//average over the mini-batch
    }


    //the backward pass is simple: the gradient is -(target-predict)
    template <typename Dtype>
    void SigmoidCrossEntropyLossLayer<Dtype>::Backward_cpu(
        const vector<Blob<Dtype>*>& top,const vector<bool>& propagate_down,
        const vector<Blob<Dtype>*> & bottom){
        if (propagate_down[1]) {
            //the label input gets no gradient
        }
        if (propagate_down[0]) {
            const int count = bottom[0]->count();//N*1*1*1
            const int num = bottom[0]->num();// N
            const Dtype* sigmoid_output_data = sigmoid_output_->cpu_data();//predicted probabilities
            const Dtype* target = bottom[1]->cpu_data();
            Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
            // bottom_diff = predict - target_label
            caffe_sub(count,sigmoid_output_data,target,bottom_diff);
            const Dtype loss_weight = top[0]->cpu_diff()[0];
            //bottom_diff = bottom_diff*loss_weight/n
            caffe_scal(count,loss_weight/num,bottom_diff);
        }
    }
(5) CenterLoss (ECCV 2016)

A loss proposed at ECCV 2016 that lets softmax training produce features with better intra-class compactness. The idea is simple: on top of SoftmaxLoss, add an extra loss term:

\[\zeta_C = \frac{1}{2}*\sum_{i=1}^{n}||x_i-c_{yi}||_2^2 \]

The idea is easy to follow: add a loss that measures the distance between a sample's feature and its class center. The update rules are:

\[\frac{\partial \zeta_c}{\partial x_i} = x_i - c_{yi} \]

\[\triangle c_j = \frac{\sum_{i=1}^{n}\delta{(y_i=j)}*(c_j-x_i)}{1+\sum_{i=1}^{n}\delta{(y_i=j)}} \]

\[c_j^{t+1} = c_j^t-\alpha*\triangle{c_j^t} \]

In the second step only the centers of the classes present in the current batch are updated, and the 1 in the denominator prevents division by zero. Combined with softmax, the overall loss is:

\[\zeta = \zeta_S+\lambda \zeta_C \]
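
A toy standalone C++ sketch of the center update (illustration only; the sizes and values are made up): accumulate \(c_j-x_i\) over the samples of class \(j\), divide by the count plus one, then step the center:

    #include <cstdio>

    // Toy center update for N = 2 classes, K = 2 features, M = 3 samples:
    // delta_c_j = sum_{y_i == j} (c_j - x_i) / (1 + count_j);  c_j -= alpha * delta_c_j
    int main() {
      const int N = 2, K = 2, M = 3;
      double centers[N][K] = {{0.0, 0.0}, {1.0, 1.0}};
      double feats[M][K]   = {{0.2, -0.1}, {0.9, 1.2}, {0.4, 0.1}};
      int labels[M]        = {0, 1, 0};
      double alpha = 0.5;  // learning rate for the centers

      for (int j = 0; j < N; ++j) {
        double delta[K] = {0.0, 0.0};
        int count = 0;
        for (int i = 0; i < M; ++i) {
          if (labels[i] != j) continue;
          ++count;
          for (int k = 0; k < K; ++k) delta[k] += centers[j][k] - feats[i][k];
        }
        for (int k = 0; k < K; ++k) {
          delta[k] /= (1 + count);          // the +1 guards against count == 0
          centers[j][k] -= alpha * delta[k];
        }
        std::printf("center %d -> (%.3f, %.3f)\n", j, centers[j][0], centers[j][1]);
      }
      return 0;
    }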

1. Basic data members

    //members holding the center_loss layer params
    int N_;// num_output in the layer params, i.e. the number of classes
    int K_;// the feature length of the preceding fc layer
    int M_;// the batch size
    Blob<Dtype> distance_;//distance between each sample and its class center: x - c_y (the key quantity)
    Blob<Dtype> variation_sum_;// accumulated negatives of distance_: c_y - x
    Blob<Dtype> count_; // per-class sample counts within the batch
    string distance_type_; // the distance metric, L2 by default

2. Basic member functions
As with any loss layer there are LayerSetUp, Reshape, Forward, and Backward, implemented as follows:

    // LayerSetUp: the centers are N_ centers, each a feature vector of length K_
    template <typename Dtype>
    void CenterLossLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top){
        CenterLossParameter loss_param = this->layer_param_.center_loss_param();
        N_ = loss_param.num_output();//number of classes / centers, set in the prototxt
        distance_type_ = loss_param.distance_type();
        const int axis = bottom[0]->CanonicalAxisIndex(loss_param.axis());
        K_ = bottom[0]->count(axis);//axis defaults to 1, so K_ = fc*1*1, the feature length
        M_ = bottom[0]->num(); // the batch size
        if (this->blobs_.size() > 0) {
            //parameters already initialized (e.g. loaded from a snapshot); skip
        }
        else{
            this->blobs_.resize(1);//this blob holds the centers, one fc-space center per class
            vector<int> center_shape(2);
            center_shape[0] = N_;
            center_shape[1] = K_;
            // N_ centers, each with a feature of length K_
            this->blobs_[0].reset(new Blob<Dtype>(center_shape));
            // filler used to initialize the centers
            shared_ptr<Filler<Dtype> > center_filler(GetFiller<Dtype>(
                loss_param.center_filler()));
            center_filler->Fill(this->blobs_[0].get());
        }
        this->param_propagate_down_.resize(this->blobs_.size(),true);//the centers are updated too
    }

    // Reshape
    template <typename Dtype>
    void CenterLossLayer<Dtype>::Reshape(const vector<Blob<Dtype>*> &bottom,
        const vector<Blob<Dtype>*>& top){
        LossLayer<Dtype>::Reshape(bottom,top);
        distance_.ReshapeLike(*bottom[0]);//M_*K_, one distance vector per sample
        variation_sum_.ReshapeLike(*this->blobs_[0]);//N_*K_, one accumulator per center
        vector<int>count_reshape(1);
        count_reshape[0]= N_;
        count_.Reshape(count_reshape);//N_ per-class sample counts
    }

    //Forward_cpu: computes the loss
    // N_ classes, K_ feature length, M_ samples per mini-batch
    template <typename Dtype>
    void CenterLossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
        const vector<Blob<Dtype>*>& top){
        const Dtype* bottom_data = bottom[0]->cpu_data();//M_*K_
        const Dtype* label = bottom[1]->cpu_data();//M_*1
        const Dtype* center = this->blobs_[0]->cpu_data();//N_*K_
        Dtype* distance_data = distance_.mutable_cpu_data();
        // distance of the i-th sample to its class center
        for (size_t i = 0; i < M_; i++) {
            const int label_value = static_cast<int>(label[i]);//the sample's ground-truth class
            //subtract the class center from the fc feature and store the result in distance_data
            caffe_sub(K_,bottom_data+i*K_,center+label_value*K_,distance_data+i*K_);
        }
        Dtype dot;
        Dtype loss;
        if (distance_type_ == "L1") { //L1 loss: just sum |distance_|
            // equivalently caffe_cpu_asum(M_*K_,distance_data)
            dot = caffe_cpu_asum(M_*K_,distance_.cpu_data());
            loss = dot/M_;
        }
        //L2 loss: distance_data*distance_data summed over the M_ samples
        else if(distance_type_ == "L2"){
            dot = caffe_cpu_dot(M_*K_,distance_.cpu_data(),distance_.cpu_data());
            loss = dot/M_/Dtype(2);
        }
        else{
            //other distance metrics are not supported
        }
        top[0]->mutable_cpu_data()[0] = loss;
    }

    // Backward_cpu: updates both the bottom diff and the centers
    template <typename Dtype>
    void CenterLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
        const vector<bool>& propagate_down,const vector<Blob<Dtype>*>& bottom){
        if (this->param_propagate_down_[0]) {//update the class centers
            const Dtype* label = bottom[1]->cpu_data();
            Dtype* center_diff = this->blobs_[0]->mutable_cpu_diff();
            Dtype* variation_sum_data = variation_sum_.mutable_cpu_data();
            Dtype* count_data = count_.mutable_cpu_data();
            const Dtype* distance_data = distance_.cpu_data();//x - c_y
            if (distance_type_ == "L1") {
                caffe_cpu_sign(M_*K_,distance_data,distance_.mutable_cpu_data());
            }
            caffe_set(N_*K_,Dtype(0),variation_sum_.mutable_cpu_data());
            caffe_set(N_,Dtype(0),count_.mutable_cpu_data());//per-class sample counts

            for (size_t i = 0; i < M_; i++) {//loop over samples
                const int label_value = static_cast<int>(label[i]);
                //variation_sum_data starts at 0; distance_ holds x_i - c_y,
                //so subtracting it accumulates c_y - x_i
                caffe_sub(K_,variation_sum_data+label_value*K_,
                    distance_data+i*K_,variation_sum_data+label_value*K_);
                count_data[label_value] += 1;
            }
            for (size_t i = 0; i < M_; i++) {
                const int label_value = static_cast<int>(label[i]);
                //center_diff += 1/(count+1) * sum(c_y - x_i)
                //(note: the reference implementation loops over the N_ classes instead,
                // so each center is accumulated only once)
                caffe_cpu_axpby(K_,Dtype(1)/(count_data[label_value]+1),
                variation_sum_data+label_value*K_,Dtype(1),center_diff+label_value*K_);
            }
        }

        //once the centers are handled, compute the gradient w.r.t. x
        if (propagate_down[0]) {//gradient w.r.t. the input x
            //bottom_diff = loss_weight * 1/M_ * (x - c_y)
            caffe_copy(M_*K_,distance_.cpu_data(),bottom[0]->mutable_cpu_diff());
            caffe_scal(M_*K_,top[0]->cpu_diff()[0]/M_,
            bottom[0]->mutable_cpu_diff());
        }
        if (propagate_down[1]) {
            // the label input gets no gradient
        }
    }

\(CenterLoss\) improves over plain \(Softmax\) for multi-class problems. The \(loss\_weight\) setting controls the relative weight of \(center\_loss\) versus \(softmaxloss\), and it is very effective at minimizing intra-class distance and increasing discriminability.

Author: 張峰
Link: https://zhanglaplace.github.io/2017/10/20
Copyright: all posts on this blog are released under the CC BY-NC-SA 3.0 license. Please credit the source when reposting!

