背景: 項目中需要在 caffe 中增加 binary convolution layer, 所以先單步調試了 MNIST 的訓練,大致看了一下流程,然後再詳細閱讀 convolution layer 的實現。
1、數據結構
caffe 的基本數據結構是 Blob,也就是數據流的基本結構。
2、網絡結構
Net 是 Layer 構造出來的,Layer 包括了數據和運算(Blob input, Blob output, operation)。
3、卷積
3.1 卷積意義
卷積實際上是一種線性運算,只不過用卷積核移動的方式實現,會達到提取特征的效果。對於卷積層來說,輸入是一個特征圖(Feature Map), 輸出是通過卷積核的提取得到的新特征圖。之所以叫特征圖,是因為在卷積核作用下得到的結果,反映了輸入與卷積核所代表特征的相關程度。
如上圖的卷積核,當卷積核在輸入上移動時,提取到的特征是和對角線相關的特征,對於輸入來說,對角線的值越大,得到的特征值越大。
3.2 im2col
對於神經網絡來說,只考慮計算機視覺,也就是二維圖片的情況,原始輸入是 batch_size x channels x height x width。那麼在計算的時候,每個卷積核在每張圖上移動的次數大約是 height x width(步長為 1 且有相應填充時)。caffe 的卷積計算通過轉換為矩陣乘法來獲得效率,即用空間換時間,將圖片按卷積核展開(im2col)。
賈揚凊大神在知乎的回答鏈接:在 caffe 中如何計算卷積?
(其他的實現方法可以參考 github 上 ncnn 的實現)
Image 和 Filters 都是實際所需要的內容, 後面的 Feature Matrix 和 Filter Matrix 是計算過程中產生的中間數據。
3.3 caffe 實現
實現部分在 base_conv_layer.cpp, 這種設計結構是為了讓 deconvolution 和 convolution 共用代碼。Convolution 和 Deconvolution 都通過 ConvolutionParameter 配置基本參數,也就是說,在 Conv 和 Deconv 裡的參數是一樣的,但是實際上兩個模塊輸入輸出的 shape 和計算都是相反的,實現的原理是重寫 base_conv_layer 的虛函數,實現不同的 shape 定義和計算。除此之外,還有一個翻轉標誌 reverse_dimensions(), 它也是虛函數,在兩種層裡分別重寫為返回 true 或者 false。下面貼一下 LayerSetUp 的注釋,建立完卷積層之後就是計算的部分了,那部分代碼比較好讀,這裡就不寫了。
/**
 * @brief One-time configuration of a (de)convolution layer.
 *
 * Reads the layer's ConvolutionParameter and derives from it:
 *   - the channel axis and the number of spatial axes of the input,
 *   - per-spatial-axis kernel size, stride, padding and dilation,
 *   - the is_1x1_ fast-path flag,
 *   - input/output channel counts and the group count,
 *   - the weight blob (blobs_[0]) and optional bias blob (blobs_[1]),
 *     which are either validated (if already present) or allocated and
 *     filled with the configured fillers.
 *
 * @param bottom  Input blobs; only bottom[0] is inspected here.
 * @param top     Output blobs; not referenced in this function.
 *
 * Any violated CHECK / LOG(FATAL) below aborts via the logging macros.
 */
template <typename Dtype>
void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  // Configure the kernel size, padding, stride, and inputs.
  // Bound by const reference: the parameter is only read in this function.
  const ConvolutionParameter& conv_param =
      this->layer_param_.convolution_param();
  force_nd_im2col_ = conv_param.force_nd_im2col();
  // Original author's notes (helpers are not visible here -- confirm against
  // the Blob class): CanonicalAxisIndex normalizes the user-supplied axis
  // index into a valid one, and Blob::count(int) returns the number of
  // elements from the given axis through the last axis. The first axis is
  // the number of samples; the axes after it describe the input features.
  channel_axis_ = bottom[0]->CanonicalAxisIndex(conv_param.axis());
  const int first_spatial_axis = channel_axis_ + 1;
  const int num_axes = bottom[0]->num_axes();
  // Number of spatial axes = every axis after the channel axis.
  num_spatial_axes_ = num_axes - first_spatial_axis;
  // Non-negative means the blob has at least enough axes to reach the
  // channel axis (whose position comes from conv_param.axis()).
  CHECK_GE(num_spatial_axes_, 0);
  // 1-D shape with one entry per spatial axis (at least one entry, so the
  // per-axis blobs below are never reshaped to zero length).
  vector<int> spatial_dim_blob_shape(1, std::max(num_spatial_axes_, 1));

  // Setup filter kernel dimensions (kernel_shape_).
  kernel_shape_.Reshape(spatial_dim_blob_shape);
  int* kernel_shape_data = kernel_shape_.mutable_cpu_data();
  if (conv_param.has_kernel_h() || conv_param.has_kernel_w()) {
    // Explicit kernel_h / kernel_w implies a 2-D convolution, and is
    // mutually exclusive with the repeated kernel_size field.
    CHECK_EQ(num_spatial_axes_, 2)
        << "kernel_h & kernel_w can only be used for 2D convolution.";
    CHECK_EQ(0, conv_param.kernel_size_size())
        << "Either kernel_size or kernel_h/w should be specified; not both.";
    kernel_shape_data[0] = conv_param.kernel_h();
    kernel_shape_data[1] = conv_param.kernel_w();
  } else {
    // N-D case: kernel_size is given either once (broadcast to every
    // spatial axis) or once per spatial axis.
    const int num_kernel_dims = conv_param.kernel_size_size();
    CHECK(num_kernel_dims == 1 || num_kernel_dims == num_spatial_axes_)
        << "kernel_size must be specified once, or once per spatial dimension "
        << "(kernel_size specified " << num_kernel_dims << " times; "
        << num_spatial_axes_ << " spatial dims).";
    for (int i = 0; i < num_spatial_axes_; ++i) {
      kernel_shape_data[i] =
          conv_param.kernel_size((num_kernel_dims == 1) ? 0 : i);
    }
  }
  // Every kernel extent must be strictly positive.
  for (int i = 0; i < num_spatial_axes_; ++i) {
    CHECK_GT(kernel_shape_data[i], 0) << "Filter dimensions must be nonzero.";
  }

  // Setup stride dimensions (stride_). Same scheme as the kernel: explicit
  // stride_h / stride_w for 2-D, otherwise the repeated stride field, and a
  // default of 1 when nothing is specified.
  stride_.Reshape(spatial_dim_blob_shape);
  int* stride_data = stride_.mutable_cpu_data();
  if (conv_param.has_stride_h() || conv_param.has_stride_w()) {
    CHECK_EQ(num_spatial_axes_, 2)
        << "stride_h & stride_w can only be used for 2D convolution.";
    CHECK_EQ(0, conv_param.stride_size())
        << "Either stride or stride_h/w should be specified; not both.";
    stride_data[0] = conv_param.stride_h();
    stride_data[1] = conv_param.stride_w();
  } else {
    const int num_stride_dims = conv_param.stride_size();
    CHECK(num_stride_dims == 0 || num_stride_dims == 1 ||
          num_stride_dims == num_spatial_axes_)
        << "stride must be specified once, or once per spatial dimension "
        << "(stride specified " << num_stride_dims << " times; "
        << num_spatial_axes_ << " spatial dims).";
    const int kDefaultStride = 1;
    for (int i = 0; i < num_spatial_axes_; ++i) {
      stride_data[i] = (num_stride_dims == 0) ? kDefaultStride :
          conv_param.stride((num_stride_dims == 1) ? 0 : i);
      CHECK_GT(stride_data[i], 0) << "Stride dimensions must be nonzero.";
    }
  }

  // Setup pad dimensions (pad_). Same scheme as the kernel: explicit
  // pad_h / pad_w for 2-D, otherwise the repeated pad field, and a default
  // of 0 when nothing is specified.
  pad_.Reshape(spatial_dim_blob_shape);
  int* pad_data = pad_.mutable_cpu_data();
  if (conv_param.has_pad_h() || conv_param.has_pad_w()) {
    CHECK_EQ(num_spatial_axes_, 2)
        << "pad_h & pad_w can only be used for 2D convolution.";
    CHECK_EQ(0, conv_param.pad_size())
        << "Either pad or pad_h/w should be specified; not both.";
    pad_data[0] = conv_param.pad_h();
    pad_data[1] = conv_param.pad_w();
  } else {
    const int num_pad_dims = conv_param.pad_size();
    CHECK(num_pad_dims == 0 || num_pad_dims == 1 ||
          num_pad_dims == num_spatial_axes_)
        << "pad must be specified once, or once per spatial dimension "
        << "(pad specified " << num_pad_dims << " times; "
        << num_spatial_axes_ << " spatial dims).";
    const int kDefaultPad = 0;
    for (int i = 0; i < num_spatial_axes_; ++i) {
      pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
          conv_param.pad((num_pad_dims == 1) ? 0 : i);
    }
  }

  // Setup dilation dimensions (dilation_).
  // Original author's note: dilated (a.k.a. atrous) convolution enlarges the
  // receptive field and is used to compensate for information lost by
  // pooling. Only the repeated dilation field exists (no _h/_w variant is
  // read here); the default is 1.
  dilation_.Reshape(spatial_dim_blob_shape);
  int* dilation_data = dilation_.mutable_cpu_data();
  const int num_dilation_dims = conv_param.dilation_size();
  CHECK(num_dilation_dims == 0 || num_dilation_dims == 1 ||
        num_dilation_dims == num_spatial_axes_)
      << "dilation must be specified once, or once per spatial dimension "
      << "(dilation specified " << num_dilation_dims << " times; "
      << num_spatial_axes_ << " spatial dims).";
  const int kDefaultDilation = 1;
  for (int i = 0; i < num_spatial_axes_; ++i) {
    dilation_data[i] = (num_dilation_dims == 0) ? kDefaultDilation :
        conv_param.dilation((num_dilation_dims == 1) ? 0 : i);
  }

  // Special case: im2col is the identity for 1x1 convolution with stride 1
  // and no padding, so flag for skipping the buffer and transformation.
  is_1x1_ = true;
  for (int i = 0; i < num_spatial_axes_; ++i) {
    is_1x1_ &=
        kernel_shape_data[i] == 1 && stride_data[i] == 1 && pad_data[i] == 0;
    if (!is_1x1_) { break; }
  }

  // Configure output channels and groups.
  channels_ = bottom[0]->shape(channel_axis_);  // input channel count
  // num_output from the parameter, i.e. the number of output "channels".
  num_output_ = this->layer_param_.convolution_param().num_output();
  CHECK_GT(num_output_, 0);  // at least one output
  group_ = this->layer_param_.convolution_param().group();
  // Guard the modulo operations below against division by zero.
  CHECK_GT(group_, 0) << "Number of groups must be positive.";
  CHECK_EQ(channels_ % group_, 0);
  CHECK_EQ(num_output_ % group_, 0)
      << "Number of output should be multiples of group.";
  // reverse_dimensions() is overridden by the concrete layer; when it
  // returns true the roles of input and output channels are swapped.
  if (reverse_dimensions()) {
    conv_out_channels_ = channels_;
    conv_in_channels_ = num_output_;
  } else {
    conv_out_channels_ = num_output_;
    conv_in_channels_ = channels_;
  }

  // Handle the parameters: weights and biases.
  // - blobs_[0] holds the filter weights
  // - blobs_[1] holds the biases (optional)
  vector<int> weight_shape(2);
  weight_shape[0] = conv_out_channels_;
  weight_shape[1] = conv_in_channels_ / group_;
  for (int i = 0; i < num_spatial_axes_; ++i) {
    weight_shape.push_back(kernel_shape_data[i]);
  }
  bias_term_ = this->layer_param_.convolution_param().bias_term();
  // bias_term_ is used as the element count: an empty shape when there is
  // no bias, otherwise the single-entry shape {num_output_}.
  vector<int> bias_shape(bias_term_, num_output_);
  if (this->blobs_.size() > 0) {
    // Parameters already exist: only verify that their shapes match.
    CHECK_EQ(1 + bias_term_, this->blobs_.size())
        << "Incorrect number of weight blobs.";
    if (weight_shape != this->blobs_[0]->shape()) {
      Blob<Dtype> weight_shaped_blob(weight_shape);
      LOG(FATAL) << "Incorrect weight shape: expected shape "
          << weight_shaped_blob.shape_string() << "; instead, shape was "
          << this->blobs_[0]->shape_string();
    }
    if (bias_term_ && bias_shape != this->blobs_[1]->shape()) {
      Blob<Dtype> bias_shaped_blob(bias_shape);
      LOG(FATAL) << "Incorrect bias shape: expected shape "
          << bias_shaped_blob.shape_string() << "; instead, shape was "
          << this->blobs_[1]->shape_string();
    }
    LOG(INFO) << "Skipping parameter initialization";
  } else {
    if (bias_term_) {
      this->blobs_.resize(2);
    } else {
      this->blobs_.resize(1);
    }
    // Initialize and fill the weights:
    // output channels x input channels per-group x kernel height x kernel width
    this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
    shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
        this->layer_param_.convolution_param().weight_filler()));
    weight_filler->Fill(this->blobs_[0].get());
    // If necessary, initialize and fill the biases.
    if (bias_term_) {
      this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
      shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
          this->layer_param_.convolution_param().bias_filler()));
      bias_filler->Fill(this->blobs_[1].get());
    }
  }
  // Element count of the weight blob from axis 1 onward (see the count()
  // note above -- confirm against the Blob class).
  kernel_dim_ = this->blobs_[0]->count(1);
  // Original author's note: this reads more intuitively as
  // (conv_out_channels_ / group_) * kernel_dim_; the offset is the weight
  // stride between consecutive groups.
  weight_offset_ = conv_out_channels_ * kernel_dim_ / group_;
  // Propagate gradients to the parameters (as directed by backward pass).
  this->param_propagate_down_.resize(this->blobs_.size(), true);
}