The author adds a new layer to Caffe. In general, introducing a new layer into Caffe requires modifying caffe.proto and adding the layer's header file (*.hpp), its CPU implementation (*.cpp), and its GPU implementation (*.cu). The code is organized as follows:
- caffe.proto
In caffe.proto, the author introduces the parameters required by the largemargin_inner_product_layer, such as num_output and type; note that some parameters have default values.
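For reference, the shape of this parameter message can be reconstructed from the getters the layer code calls (num_output(), type(), axis(), weight_filler(), base(), gamma(), power(), iteration(), lambda_min()). The sketch below is my reconstruction, not a verbatim copy from the repository; in particular the field numbers and default values are assumptions:

```protobuf
message LargeMarginInnerProductParameter {
  optional uint32 num_output = 1;             // N_: number of outputs (classes)
  enum LargeMarginType {
    SINGLE = 0;     // m = 1, reduces to a plain inner product
    DOUBLE = 1;     // m = 2
    TRIPLE = 2;     // m = 3
    QUADRUPLE = 3;  // m = 4
  }
  optional LargeMarginType type = 2 [default = SINGLE];
  optional FillerParameter weight_filler = 3; // weight initializer
  optional int32 axis = 4 [default = 1];      // flatten everything from this axis into K_
  // annealing schedule read by Forward_cpu:
  //   lambda = max(base * (1 + gamma * iter)^(-power), lambda_min)
  optional float base = 5;
  optional float gamma = 6;
  optional float power = 7;
  optional int32 iteration = 8 [default = 0];
  optional float lambda_min = 9 [default = 0];
}
```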
- largemargin_inner_product_layer.hpp
```cpp
#ifndef CAFFE_LARGEMARGIN_INNER_PRODUCT_LAYER_HPP_
#define CAFFE_LARGEMARGIN_INNER_PRODUCT_LAYER_HPP_

#include <vector>

#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
 * @brief Also known as a "LargeMargin fully-connected" layer, computes a
 *        LargeMargin inner product with a set of learned weights, and
 *        (optionally) adds biases.
 *
 * TODO(dox): thorough documentation for Forward, Backward, and proto params.
 */
template <typename Dtype>
class LargeMarginInnerProductLayer : public Layer<Dtype> {
 public:
  explicit LargeMarginInnerProductLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  virtual inline const char* type() const { return "LargeMarginInnerProduct"; }
  virtual inline int ExactNumBottomBlobs() const { return 2; }
  virtual inline int MaxTopBlobs() const { return 2; }

 protected:
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
  virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

  int M_;
  int K_;
  int N_;
  LargeMarginInnerProductParameter_LargeMarginType type_;

  // common variables
  Blob<Dtype> x_norm_;
  Blob<Dtype> w_norm_;
  Blob<Dtype> cos_theta_;
  Blob<Dtype> sign_0_;  // sign_0 = sign(cos_theta)
  // for DOUBLE type
  Blob<Dtype> cos_theta_quadratic_;
  // for TRIPLE type
  Blob<Dtype> sign_1_;  // sign_1 = sign(abs(cos_theta) - 0.5)
  Blob<Dtype> sign_2_;  // sign_2 = sign_0 * (1 + sign_1) - 2
  Blob<Dtype> cos_theta_cubic_;
  // for QUADRA type
  Blob<Dtype> sign_3_;  // sign_3 = sign_0 * sign(2 * cos_theta_quadratic_ - 1)
  Blob<Dtype> sign_4_;  // sign_4 = 2 * sign_0 + sign_3 - 3
  Blob<Dtype> cos_theta_quartic_;

  int iter_;
  Dtype lambda_;
};

}  // namespace caffe

#endif  // CAFFE_LARGEMARGIN_INNER_PRODUCT_LAYER_HPP_
```
In this header the author follows the usual conventions for introducing a layer into Caffe, and additionally defines a number of member variables. The meaning of some of them, such as x_norm_ and w_norm_, can be roughly guessed from their names.
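As a usage illustration, here is how the layer might be declared in a network prototxt. This is my own sketch, not taken from the repository: the type string "LargeMarginInnerProduct" and the two bottoms (features plus label, per ExactNumBottomBlobs() == 2) follow from the header above, the optional second top outputs lambda, and the numeric values are placeholders:

```protobuf
layer {
  name: "fc_margin"
  type: "LargeMarginInnerProduct"
  bottom: "fc5"      # features, shape (M_, K_)
  bottom: "label"    # integer labels in {0, ..., N_-1}
  top: "fc_margin"   # modified logits, shape (M_, N_)
  top: "lambda"      # optional: current annealing lambda
  largemargin_inner_product_param {
    num_output: 10
    type: DOUBLE
    base: 1000
    gamma: 0.12
    power: 1
    lambda_min: 5
    weight_filler { type: "msra" }
  }
}
```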
- largemargin_inner_product_layer.cpp
Since this file contains a lot of code, I analyze it through comments on each function (only the type = DOUBLE case is analyzed in depth).
```cpp
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/layers/largemargin_inner_product_layer.hpp"

namespace caffe {

// This function handles parameter assignment and weight blob initialization.
template <typename Dtype>
void LargeMarginInnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  CHECK_EQ(bottom[0]->num(), bottom[1]->num())
      << "Number of labels must match number of output; "
      << "DO NOT support multi-label this version."
      << "e.g., if prediction shape is (M X N), "
      << "label count (number of labels) must be M, "
      << "with integer values in {0, 1, ..., N-1}.";

  type_ = this->layer_param_.largemargin_inner_product_param().type();
  iter_ = this->layer_param_.largemargin_inner_product_param().iteration();
  lambda_ = (Dtype)0.;

  const int num_output = this->layer_param_.largemargin_inner_product_param().num_output();
  N_ = num_output;
  const int axis = bottom[0]->CanonicalAxisIndex(
      this->layer_param_.largemargin_inner_product_param().axis());
  K_ = bottom[0]->count(axis);
  // Check if we need to set up the weights
  if (this->blobs_.size() > 0) {
    LOG(INFO) << "Skipping parameter initialization";
  } else {
    this->blobs_.resize(1);
    // Intialize the weight
    vector<int> weight_shape(2);
    weight_shape[0] = N_;
    weight_shape[1] = K_;
    this->blobs_[0].reset(new Blob<Dtype>(weight_shape));  // weight has shape (N_, K_)
    // fill the weights
    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
        this->layer_param_.largemargin_inner_product_param().weight_filler()));
    weight_filler->Fill(this->blobs_[0].get());
  }
  // parameter initialization
  this->param_propagate_down_.resize(this->blobs_.size(), true);  // weight size setup and initialization
}

template <typename Dtype>
void LargeMarginInnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  // Figure out the dimensions
  const int axis = bottom[0]->CanonicalAxisIndex(
      this->layer_param_.largemargin_inner_product_param().axis());
  const int new_K = bottom[0]->count(axis);
  CHECK_EQ(K_, new_K)
      << "Input size incompatible with inner product parameters.";
  // N_ is the output dimension per sample, M_ the number of samples,
  // K_ the input dimension per sample -- keep these three in mind.
  M_ = bottom[0]->count(0, axis);
  vector<int> top_shape = bottom[0]->shape();
  top_shape.resize(axis + 1);
  top_shape[axis] = N_;
  top[0]->Reshape(top_shape);  // top has shape (M_, N_)
  // if needed, reshape top[1] to output lambda
  vector<int> lambda_shape(1, 1);
  top[1]->Reshape(lambda_shape);  // lambda is an exponentially decaying term added to speed up convergence

  // common variables
  vector<int> shape_1_X_M(1, M_);
  x_norm_.Reshape(shape_1_X_M);  // norm{x_i}, i in [0, M_-1]
  vector<int> shape_1_X_N(1, N_);
  w_norm_.Reshape(shape_1_X_N);  // norm{w_i}, i in [0, N_-1]
  sign_0_.Reshape(top_shape);
  cos_theta_.Reshape(top_shape);  // cos(theta) has shape (M_, N_)

  // optional temp variables
  switch (type_) {
  case LargeMarginInnerProductParameter_LargeMarginType_SINGLE:
    break;
  case LargeMarginInnerProductParameter_LargeMarginType_DOUBLE:
    cos_theta_quadratic_.Reshape(top_shape);  // cos(theta)^2 has shape (M_, N_)
    break;
  case LargeMarginInnerProductParameter_LargeMarginType_TRIPLE:
    cos_theta_quadratic_.Reshape(top_shape);
    cos_theta_cubic_.Reshape(top_shape);
    sign_1_.Reshape(top_shape);
    sign_2_.Reshape(top_shape);
    break;
  case LargeMarginInnerProductParameter_LargeMarginType_QUADRUPLE:
    cos_theta_quadratic_.Reshape(top_shape);
    cos_theta_cubic_.Reshape(top_shape);
    cos_theta_quartic_.Reshape(top_shape);
    sign_3_.Reshape(top_shape);
    sign_4_.Reshape(top_shape);
    break;
  default:
    LOG(FATAL) << "Unknown L-Softmax type.";
  }
}

template <typename Dtype>
void LargeMarginInnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top) {
  iter_ += (Dtype)1.;
  Dtype base_ = this->layer_param_.largemargin_inner_product_param().base();
  Dtype gamma_ = this->layer_param_.largemargin_inner_product_param().gamma();
  Dtype power_ = this->layer_param_.largemargin_inner_product_param().power();
  Dtype lambda_min_ = this->layer_param_.largemargin_inner_product_param().lambda_min();
  lambda_ = base_ * pow(((Dtype)1. + gamma_ * iter_), -power_);
  lambda_ = std::max(lambda_, lambda_min_);
  top[1]->mutable_cpu_data()[0] = lambda_;  // exponential decay term; as iter_ grows large, lambda_ tends to 0

  /************************* common variables *************************/
  const Dtype* bottom_data = bottom[0]->cpu_data();
  Dtype* mutable_x_norm_data = x_norm_.mutable_cpu_data();
  for (int i = 0; i < M_; i++) {
    // compute norm{x_i} for each of the M_ samples
    mutable_x_norm_data[i] = sqrt(caffe_cpu_dot(K_, bottom_data + i * K_, bottom_data + i * K_));
  }
  const Dtype* weight = this->blobs_[0]->cpu_data();
  Dtype* mutable_w_norm_data = w_norm_.mutable_cpu_data();
  for (int i = 0; i < N_; i++) {
    // compute norm{w_i} for each of the N_ weight rows
    mutable_w_norm_data[i] = sqrt(caffe_cpu_dot(K_, weight + i * K_, weight + i * K_));
  }
  Blob<Dtype> xw_norm_product_;
  xw_norm_product_.Reshape(cos_theta_.shape());
  // norm{x_i} * norm{w_j}, output shape (M_, N_)
  caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
      x_norm_.cpu_data(), w_norm_.cpu_data(), (Dtype)0., xw_norm_product_.mutable_cpu_data());
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
      bottom_data, weight, (Dtype)0., cos_theta_.mutable_cpu_data());
  caffe_add_scalar(M_ * N_, (Dtype)0.000000001, xw_norm_product_.mutable_cpu_data());  // guard against a zero denominator
  // cos(theta), output shape (M_, N_)
  caffe_div(M_ * N_, cos_theta_.cpu_data(), xw_norm_product_.cpu_data(), cos_theta_.mutable_cpu_data());
  caffe_cpu_sign(M_ * N_, cos_theta_.cpu_data(), sign_0_.mutable_cpu_data());
  switch (type_) {
  case LargeMarginInnerProductParameter_LargeMarginType_SINGLE:
    break;
  case LargeMarginInnerProductParameter_LargeMarginType_DOUBLE:
    // cos(theta)^2, output shape (M_, N_)
    caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)2., cos_theta_quadratic_.mutable_cpu_data());
    break;
  case LargeMarginInnerProductParameter_LargeMarginType_TRIPLE:
    caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)2., cos_theta_quadratic_.mutable_cpu_data());
    caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)3., cos_theta_cubic_.mutable_cpu_data());
    caffe_abs(M_ * N_, cos_theta_.cpu_data(), sign_1_.mutable_cpu_data());
    caffe_add_scalar(M_ * N_, -(Dtype)0.5, sign_1_.mutable_cpu_data());
    caffe_cpu_sign(M_ * N_, sign_1_.cpu_data(), sign_1_.mutable_cpu_data());
    caffe_copy(M_ * N_, sign_1_.cpu_data(), sign_2_.mutable_cpu_data());
    caffe_add_scalar(M_ * N_, (Dtype)1., sign_2_.mutable_cpu_data());
    caffe_mul(M_ * N_, sign_0_.cpu_data(), sign_2_.cpu_data(), sign_2_.mutable_cpu_data());
    caffe_add_scalar(M_ * N_, - (Dtype)2., sign_2_.mutable_cpu_data());
    break;
  case LargeMarginInnerProductParameter_LargeMarginType_QUADRUPLE:
    caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)2., cos_theta_quadratic_.mutable_cpu_data());
    caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)3., cos_theta_cubic_.mutable_cpu_data());
    caffe_powx(M_ * N_, cos_theta_.cpu_data(), (Dtype)4., cos_theta_quartic_.mutable_cpu_data());
    caffe_copy(M_ * N_, cos_theta_quadratic_.cpu_data(), sign_3_.mutable_cpu_data());
    caffe_scal(M_ * N_, (Dtype)2., sign_3_.mutable_cpu_data());
    caffe_add_scalar(M_ * N_, (Dtype)-1., sign_3_.mutable_cpu_data());
    caffe_cpu_sign(M_ * N_, sign_3_.cpu_data(), sign_3_.mutable_cpu_data());
    caffe_mul(M_ * N_, sign_0_.cpu_data(), sign_3_.cpu_data(), sign_3_.mutable_cpu_data());
    caffe_copy(M_ * N_, sign_0_.cpu_data(), sign_4_.mutable_cpu_data());
    caffe_scal(M_ * N_, (Dtype)2., sign_4_.mutable_cpu_data());
    caffe_add(M_ * N_, sign_4_.cpu_data(), sign_3_.cpu_data(), sign_4_.mutable_cpu_data());
    caffe_add_scalar(M_ * N_, - (Dtype)3., sign_4_.mutable_cpu_data());
    break;
  default:
    LOG(FATAL) << "Unknown L-Softmax type.";
  }

  /************************* Forward *************************/
  Dtype* top_data = top[0]->mutable_cpu_data();
  // top = X * W^T, where X is the bottom data with shape (M_, K_)
  // and W^T is the weight matrix with shape (K_, N_)
  caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1.,
      bottom_data, weight, (Dtype)0., top_data);
  const Dtype* label = bottom[1]->cpu_data();
  const Dtype* xw_norm_product_data = xw_norm_product_.cpu_data();
  switch (type_) {
  case LargeMarginInnerProductParameter_LargeMarginType_SINGLE: {
    break;
  }
  case LargeMarginInnerProductParameter_LargeMarginType_DOUBLE: {
    const Dtype* sign_0_data = sign_0_.cpu_data();
    const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
    // overwrite the output at each sample's label position (see the paper)
    for (int i = 0; i < M_; i++) {
      const int label_value = static_cast<int>(label[i]);
      top_data[i * N_ + label_value] = xw_norm_product_data[i * N_ + label_value] *
          ((Dtype)2. * sign_0_data[i * N_ + label_value] *
           cos_theta_quadratic_data[i * N_ + label_value] - (Dtype)1.);
    }
    // blend in lambda_ to speed up convergence; the two operations below
    // are easy to follow with the original paper at hand
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, lambda_,
        bottom_data, weight, (Dtype)1., top_data);
    caffe_scal(M_ * N_, (Dtype)1./((Dtype)1. + lambda_), top_data);
    break;
  }
  case LargeMarginInnerProductParameter_LargeMarginType_TRIPLE: {
    const Dtype* sign_1_data = sign_1_.cpu_data();
    const Dtype* sign_2_data = sign_2_.cpu_data();
    const Dtype* cos_theta_data = cos_theta_.cpu_data();
    const Dtype* cos_theta_cubic_data = cos_theta_cubic_.cpu_data();
    for (int i = 0; i < M_; i++) {
      const int label_value = static_cast<int>(label[i]);
      top_data[i * N_ + label_value] = xw_norm_product_data[i * N_ + label_value] *
          (sign_1_data[i * N_ + label_value] *
           ((Dtype)4. * cos_theta_cubic_data[i * N_ + label_value] -
            (Dtype)3. * cos_theta_data[i * N_ + label_value]) +
           sign_2_data[i * N_ + label_value]);
    }
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, lambda_,
        bottom_data, weight, (Dtype)1., top_data);
    caffe_scal(M_ * N_, (Dtype)1./((Dtype)1. + lambda_), top_data);
    break;
  }
  case LargeMarginInnerProductParameter_LargeMarginType_QUADRUPLE: {
    const Dtype* sign_3_data = sign_3_.cpu_data();
    const Dtype* sign_4_data = sign_4_.cpu_data();
    const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
    const Dtype* cos_theta_quartic_data = cos_theta_quartic_.cpu_data();
    for (int i = 0; i < M_; i++) {
      const int label_value = static_cast<int>(label[i]);
      top_data[i * N_ + label_value] = xw_norm_product_data[i * N_ + label_value] *
          (sign_3_data[i * N_ + label_value] *
           ((Dtype)8. * cos_theta_quartic_data[i * N_ + label_value] -
            (Dtype)8. * cos_theta_quadratic_data[i * N_ + label_value] + (Dtype)1.) +
           sign_4_data[i * N_ + label_value]);
    }
    caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, N_, K_, lambda_,
        bottom_data, weight, (Dtype)1., top_data);
    caffe_scal(M_ * N_, (Dtype)1./((Dtype)1. + lambda_), top_data);
    break;
  }
  default: {
    LOG(FATAL) << "Unknown L-Softmax type.";
  }
  }
}

// For backpropagation I only sketch the core error propagation and skip the
// derivation of the derivative formulas; they are easy to follow from the paper.
template <typename Dtype>
void LargeMarginInnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  Blob<Dtype> inv_w_norm_;
  inv_w_norm_.Reshape(w_norm_.shape());
  Blob<Dtype> xw_norm_ratio_;
  xw_norm_ratio_.Reshape(cos_theta_.shape());
  caffe_add_scalar(N_, (Dtype)0.000000001, w_norm_.mutable_cpu_data());
  caffe_set(N_, (Dtype)1., inv_w_norm_.mutable_cpu_data());
  caffe_div(N_, inv_w_norm_.cpu_data(), w_norm_.cpu_data(), inv_w_norm_.mutable_cpu_data());
  caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, M_, N_, 1, (Dtype)1.,
      x_norm_.cpu_data(), inv_w_norm_.cpu_data(), (Dtype)0., xw_norm_ratio_.mutable_cpu_data());

  const Dtype* top_diff = top[0]->cpu_diff();
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* label = bottom[1]->cpu_data();
  const Dtype* weight = this->blobs_[0]->cpu_data();

  if (this->param_propagate_down_[0]) {
    Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();  // note: weight and weight_diff have different meanings
    const Dtype* xw_norm_ratio_data = xw_norm_ratio_.cpu_data();
    switch (type_) {
    case LargeMarginInnerProductParameter_LargeMarginType_SINGLE: {
      caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
          top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
      break;
    }
    case LargeMarginInnerProductParameter_LargeMarginType_DOUBLE: {
      const Dtype* sign_0_data = sign_0_.cpu_data();
      const Dtype* cos_theta_data = cos_theta_.cpu_data();
      const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
      for (int i = 0; i < N_; i++) {
        // dL/dw_ij = sum_n { dL/df_ni * df_ni/dw_ij }, summing over n in [0, M_)
        for (int j = 0; j < M_; j++) {
          const int label_value = static_cast<int>(label[j]);
          if (label_value != i) {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i],
                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
          } else {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] *
                (Dtype)4. * sign_0_data[j * N_ + i] * cos_theta_data[j * N_ + i],
                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] *
                (-xw_norm_ratio_data[j * N_ + i]) *
                ((Dtype)2. * sign_0_data[j * N_ + i] *
                 cos_theta_quadratic_data[j * N_ + i] + (Dtype)1.),
                weight + i * K_, (Dtype)1., weight_diff + i * K_);
          }
        }
      }
      caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_,
          lambda_/((Dtype)1. + lambda_), top_diff, bottom_data, (Dtype)1.,
          this->blobs_[0]->mutable_cpu_diff());
      break;
    }
    case LargeMarginInnerProductParameter_LargeMarginType_TRIPLE: {
      const Dtype* sign_1_data = sign_1_.cpu_data();
      const Dtype* sign_2_data = sign_2_.cpu_data();
      const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
      const Dtype* cos_theta_cubic_data = cos_theta_cubic_.cpu_data();
      for (int i = 0; i < N_; i++) {
        for (int j = 0; j < M_; j++) {
          const int label_value = static_cast<int>(label[j]);
          if (label_value != i) {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i],
                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
          } else {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] *
                sign_1_data[j * N_ + i] *
                ((Dtype)12. * cos_theta_quadratic_data[j * N_ + i] - (Dtype)3.),
                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] *
                (-xw_norm_ratio_data[j * N_ + i]) *
                ((Dtype)8. * sign_1_data[j * N_ + i] * cos_theta_cubic_data[j * N_ + i] -
                 sign_2_data[j * N_ + i]),
                weight + i * K_, (Dtype)1., weight_diff + i * K_);
          }
        }
      }
      caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_,
          lambda_/((Dtype)1. + lambda_), top_diff, bottom_data, (Dtype)1.,
          this->blobs_[0]->mutable_cpu_diff());
      break;
    }
    case LargeMarginInnerProductParameter_LargeMarginType_QUADRUPLE: {
      const Dtype* sign_3_data = sign_3_.cpu_data();
      const Dtype* sign_4_data = sign_4_.cpu_data();
      const Dtype* cos_theta_data = cos_theta_.cpu_data();
      const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
      const Dtype* cos_theta_cubic_data = cos_theta_cubic_.cpu_data();
      const Dtype* cos_theta_quartic_data = cos_theta_quartic_.cpu_data();
      for (int i = 0; i < N_; i++) {
        for (int j = 0; j < M_; j++) {
          const int label_value = static_cast<int>(label[j]);
          if (label_value != i) {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i],
                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
          } else {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] *
                sign_3_data[j * N_ + i] *
                ((Dtype)32. * cos_theta_cubic_data[j * N_ + i] -
                 (Dtype)16. * cos_theta_data[j * N_ + i]),
                bottom_data + j * K_, (Dtype)1., weight_diff + i * K_);
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[j * N_ + i] *
                (-xw_norm_ratio_data[j * N_ + i]) *
                (sign_3_data[j * N_ + i] *
                 ((Dtype)24. * cos_theta_quartic_data[j * N_ + i] -
                  (Dtype)8. * cos_theta_quadratic_data[j * N_ + i] - (Dtype)1.) -
                 sign_4_data[j * N_ + i]),
                weight + i * K_, (Dtype)1., weight_diff + i * K_);
          }
        }
      }
      caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_,
          lambda_/((Dtype)1. + lambda_), top_diff, bottom_data, (Dtype)1.,
          this->blobs_[0]->mutable_cpu_diff());
      break;
    }
    default: {
      LOG(FATAL) << "Unknown L-Softmax type.";
    }
    }
  }

  // Gradient with respect to bottom data
  if (propagate_down[0]) {
    Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
    const Dtype* xw_norm_ratio_data = xw_norm_ratio_.cpu_data();
    caffe_set(M_ * K_, (Dtype)0., bottom_diff);
    switch (type_) {
    case LargeMarginInnerProductParameter_LargeMarginType_SINGLE: {
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1.,
          top_diff, this->blobs_[0]->cpu_data(), (Dtype)0., bottom[0]->mutable_cpu_diff());
      break;
    }
    case LargeMarginInnerProductParameter_LargeMarginType_DOUBLE: {
      const Dtype* sign_0_data = sign_0_.cpu_data();
      const Dtype* cos_theta_data = cos_theta_.cpu_data();
      const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
      for (int i = 0; i < M_; i++) {
        const int label_value = static_cast<int>(label[i]);
        // dL/dx_ij = sum_n { dL/df_in * df_in/dx_ij }, summing over n in [0, N_)
        for (int j = 0; j < N_; j++) {
          if (label_value != j) {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j],
                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
          } else {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] *
                (Dtype)4. * sign_0_data[i * N_ + j] * cos_theta_data[i * N_ + j],
                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] /
                (-xw_norm_ratio_data[i * N_ + j]) *
                ((Dtype)2. * sign_0_data[i * N_ + j] *
                 cos_theta_quadratic_data[i * N_ + j] + (Dtype)1.),
                bottom_data + i * K_, (Dtype)1., bottom_diff + i * K_);
          }
        }
      }
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_,
          lambda_/((Dtype)1. + lambda_), top_diff, this->blobs_[0]->cpu_data(),
          (Dtype)1., bottom[0]->mutable_cpu_diff());
      break;
    }
    case LargeMarginInnerProductParameter_LargeMarginType_TRIPLE: {
      const Dtype* sign_1_data = sign_1_.cpu_data();
      const Dtype* sign_2_data = sign_2_.cpu_data();
      const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
      const Dtype* cos_theta_cubic_data = cos_theta_cubic_.cpu_data();
      for (int i = 0; i < M_; i++) {
        const int label_value = static_cast<int>(label[i]);
        for (int j = 0; j < N_; j++) {
          if (label_value != j) {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j],
                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
          } else {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] *
                sign_1_data[i * N_ + j] *
                ((Dtype)12. * cos_theta_quadratic_data[i * N_ + j] - (Dtype)3.),
                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] /
                (-xw_norm_ratio_data[i * N_ + j]) *
                ((Dtype)8. * sign_1_data[i * N_ + j] * cos_theta_cubic_data[i * N_ + j] -
                 sign_2_data[i * N_ + j]),
                bottom_data + i * K_, (Dtype)1., bottom_diff + i * K_);
          }
        }
      }
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_,
          lambda_/((Dtype)1. + lambda_), top_diff, this->blobs_[0]->cpu_data(),
          (Dtype)1., bottom[0]->mutable_cpu_diff());
      break;
    }
    case LargeMarginInnerProductParameter_LargeMarginType_QUADRUPLE: {
      const Dtype* sign_3_data = sign_3_.cpu_data();
      const Dtype* sign_4_data = sign_4_.cpu_data();
      const Dtype* cos_theta_data = cos_theta_.cpu_data();
      const Dtype* cos_theta_quadratic_data = cos_theta_quadratic_.cpu_data();
      const Dtype* cos_theta_cubic_data = cos_theta_cubic_.cpu_data();
      const Dtype* cos_theta_quartic_data = cos_theta_quartic_.cpu_data();
      for (int i = 0; i < M_; i++) {
        const int label_value = static_cast<int>(label[i]);
        for (int j = 0; j < N_; j++) {
          if (label_value != j) {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j],
                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
          } else {
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] *
                sign_3_data[i * N_ + j] *
                ((Dtype)32. * cos_theta_cubic_data[i * N_ + j] -
                 (Dtype)16. * cos_theta_data[i * N_ + j]),
                weight + j * K_, (Dtype)1., bottom_diff + i * K_);
            caffe_cpu_axpby(K_,
                (Dtype)1. / ((Dtype)1. + lambda_) * top_diff[i * N_ + j] /
                (-xw_norm_ratio_data[i * N_ + j]) *
                (sign_3_data[i * N_ + j] *
                 ((Dtype)24. * cos_theta_quartic_data[i * N_ + j] -
                  (Dtype)8. * cos_theta_quadratic_data[i * N_ + j] - (Dtype)1.) -
                 sign_4_data[i * N_ + j]),
                bottom_data + i * K_, (Dtype)1., bottom_diff + i * K_);
          }
        }
      }
      caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, K_, N_,
          lambda_/((Dtype)1. + lambda_), top_diff, this->blobs_[0]->cpu_data(),
          (Dtype)1., bottom[0]->mutable_cpu_diff());
      break;
    }
    default: {
      LOG(FATAL) << "Unknown L-Softmax type.";
    }
    }
  }
}

#ifdef CPU_ONLY
STUB_GPU(LargeMarginInnerProductLayer);
#endif

INSTANTIATE_CLASS(LargeMarginInnerProductLayer);
REGISTER_LAYER_CLASS(LargeMarginInnerProduct);

}  // namespace caffe
```
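To make the DOUBLE-case forward pass concrete, here is a minimal standalone C++ sketch (my own toy code, independent of Caffe) that computes the modified target logit for one sample: f = ||x||·||w||·(2·sign(cosθ)·cos²θ − 1), then blends the plain inner product back in with lambda, exactly as Forward_cpu does for the label column:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Toy reproduction of the DOUBLE branch of Forward_cpu for one
// sample/label pair; x and w are K-dimensional vectors.
double double_margin_logit(const std::vector<double>& x,
                           const std::vector<double>& w,
                           double lambda) {
  double xx = 0., ww = 0., xw = 0.;
  for (size_t k = 0; k < x.size(); ++k) {
    xx += x[k] * x[k];
    ww += w[k] * w[k];
    xw += x[k] * w[k];
  }
  const double x_norm = std::sqrt(xx), w_norm = std::sqrt(ww);
  const double cos_theta = xw / (x_norm * w_norm + 1e-9);  // guarded denominator
  const double sign_0 = cos_theta >= 0. ? 1. : -1.;
  // psi(theta) = 2 * sign(cos) * cos^2 - 1 (equals cos(2*theta) when cos >= 0)
  const double psi = 2. * sign_0 * cos_theta * cos_theta - 1.;
  const double f_margin = x_norm * w_norm * psi;
  // Forward_cpu then mixes the plain logit back in: (f + lambda * x.w) / (1 + lambda)
  return (f_margin + lambda * xw) / (1. + lambda);
}

int main() {
  std::vector<double> x = {1.0, 0.5};
  std::vector<double> w = {0.8, 0.6};
  // With lambda = 0 the margin is fully active; a large lambda recovers x.w.
  std::printf("lambda = 0   : %f\n", double_margin_logit(x, w, 0.));
  std::printf("lambda = 1000: %f\n", double_margin_logit(x, w, 1000.));
  return 0;
}
```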
For backpropagation I only illustrated the chain rule (d stands for the partial-derivative symbol ∂). If you find lambda distracting, you can simply treat it as 0, which simplifies the reasoning considerably.
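If you want to convince yourself that the analytic gradients in Backward_cpu really follow from the chain rule, a finite-difference check on the lambda = 0 DOUBLE case is a quick exercise. Below is a minimal sketch (again my own code, not the author's); it assumes the gradient formula that the DOUBLE branch of Backward_cpu implements for the label column, df/dx = 4·sign(c)·c·w − (||w||/||x||)·(2·sign(c)·c² + 1)·x, where c = cosθ:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Target logit f(x) for the DOUBLE case with lambda = 0 (see the sketch above).
static double f_double(const std::vector<double>& x, const std::vector<double>& w) {
  double xx = 0., ww = 0., xw = 0.;
  for (size_t k = 0; k < x.size(); ++k) { xx += x[k]*x[k]; ww += w[k]*w[k]; xw += x[k]*w[k]; }
  const double xn = std::sqrt(xx), wn = std::sqrt(ww);
  const double c = xw / (xn * wn + 1e-12);
  const double s = c >= 0. ? 1. : -1.;
  return xn * wn * (2. * s * c * c - 1.);
}

int main() {
  std::vector<double> x = {0.3, -1.2, 0.7}, w = {0.9, 0.1, -0.4};
  double xx = 0., ww = 0., xw = 0.;
  for (size_t k = 0; k < x.size(); ++k) { xx += x[k]*x[k]; ww += w[k]*w[k]; xw += x[k]*w[k]; }
  const double xn = std::sqrt(xx), wn = std::sqrt(ww);
  const double c = xw / (xn * wn), s = c >= 0. ? 1. : -1.;
  const double eps = 1e-6;
  for (size_t k = 0; k < x.size(); ++k) {
    // Analytic gradient, mirroring the DOUBLE branch of Backward_cpu at lambda = 0.
    const double analytic = 4. * s * c * w[k] - (wn / xn) * (2. * s * c * c + 1.) * x[k];
    // Central finite difference on f_double.
    std::vector<double> xp = x, xm = x;
    xp[k] += eps; xm[k] -= eps;
    const double numeric = (f_double(xp, w) - f_double(xm, w)) / (2. * eps);
    std::printf("dim %zu: analytic %.6f vs numeric %.6f\n", k, analytic, numeric);
  }
  return 0;
}
```

The two columns should agree to several decimal places, which is a good sanity check that the per-column coefficients in the backward code are the chain-rule derivatives of the forward formula.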
The GPU implementation follows essentially the same principles and differs only in how it is realized, so I will not analyze it here. For the basic ideas behind the paper, you can refer to this write-up: https://mp.weixin.qq.com/s?__biz=MzA3Mjk0OTgyMg==&mid=2651123524&idx=1&sn=0546ceca3d88e2ff1e66fbecc99bd6a7&chksm=84e6c615b3914f03bec98f22eefb00da5b30a82866c068cc4045e3ee9d0a31366f2f8bb5fec1&scene=0&ptlang=2052&source&ADUIN=1184611233&ADSESSION=1496371074&ADTAG=CLIENT.QQ.5527_.0&ADPUBNO=26632#rd
Summary: by reading the source code, I saw how the chain rule is applied in an actual implementation, and also picked up a trick for speeding up network convergence (introducing an exponentially decaying term).