Windows10 下RTX30系列Caffe安裝教程

本文轉載自查看原文 2021-09-14 14:42 165 C/C++

1. 前言
2. 環境准備
3. 具體步驟
4. 參考

1. 前言

最近復現馬普所的RGB攝像頭的動作捕捉論文 Vnect 和 Xnect ，需要配置Caffe的環境，由於是RTX30系列顯卡是Ampere架構，Caffe中默認的一些編譯配置沒有包含（因為Caffe很多年沒有更新了），再加上Caffe不支持cudnn8，因此也需要對Caffe源碼部分做一些修改。以下就是編譯測試Caffe的具體過程。

2. 環境准備

Windows 10
RTX3060 12G
Anaconda Python3.5
Visual Studio 2015
CUDA 11.0
CUDNN 8.0.3
CMake 3.21.1
Git

3. 具體步驟

3.1 安裝CUDA和CUDNN

cuda版本：cuda_11.0.3_451.82_win10.exe

cudnn版本：cudnn-11.0-windows-x64-v8.0.3.33.zip

3.2 安裝caffe

(1) 下載caffe源碼

輸入命令行 git clone https://github.com/BVLC/caffe.git
進入caffe文件 cd caffe
進入windows分支 git check windows

(2) 最關鍵的一步，修改path/caffe/scripts/build_win.cmd文件

開始修改build_win.cmd ，首先需要確定 Visual Studio 編譯的版本MSVC_VERSION = 13 表示 VS2013， MSVC_VERSION = 14 表示 VS2015, MSVC_VERSION = 15 表示 VS2017；然后確定GPU顯卡的架構，每種架構對應的cuda版本是不同，cudnn接口也有差異，下圖是目前的顯卡的[架構表](Matching CUDA arch and CUDA gencode for various NVIDIA architectures - Arnon Shimoni)

Kepler (GTX-7XX)	Maxwell (GTX-9xx)	Pascal (GTX-10xx)	Volta (Tesla Titan)	Turing (RTX-20xx)	Ampere (RTX-30xx）
sm_30, compute_30	sm_50, compute_50	sm_60, compute_60	sm_70 compute_70	sm_75 compute_75	sm_80 compute_80
sm_35, compute_35	sm_52, compute_52	sm_61, compute_61	sm_72 compute_72		sm_86 compute_86
sm_37, compute_37	sm_53, compute_53	sm_62, compute_62

主要修改以下文件：

修改caffe源碼中./scripts/build_win.cmd :

修改caffe源碼中./cmake/Cuda.cmake :

因為caffe之類的代碼很久不更新了，只支持到了使用cudnn7.x，在使用了cudnn8的環境下編譯caffe時，會在src/caffe/layers/cudnn_conv_layer.cpp等文件里出錯，

error: identifier "CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT" is undefined
error: identifier "cudnnGetConvolutionForwardAlgorithm" is undefined

這是因為cudnn8里沒有cudnnGetConvolutionForwardAlgorithm()這個函數了，改成了cudnnGetConvolutionForwardAlgorithm_v7()，也沒了CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT這個宏定義，這些都是API不兼容，但是NVIDIA聲明cudnn8不支持了，caffe的代碼也沒人去更新了，所以不能指望NVIDIA或者berkeley，只能自行修改。將cudnn_conv_layer.cpp文件替換成如下:

#ifdef USE_CUDNN
#include <algorithm>
#include <vector>

#include "caffe/layers/cudnn_conv_layer.hpp"

namespace caffe {

// Set to three for the benefit of the backward pass, which
// can use separate streams for calculating the gradient w.r.t.
// bias, filter weights, and bottom data for each group independently
#define CUDNN_STREAMS_PER_GROUP 3

/**
 * TODO(dox) explain cuDNN interface
 */
template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];

  // Initialize algorithm arrays
  fwd_algo_       = new cudnnConvolutionFwdAlgo_t[bottom.size()];
  bwd_filter_algo_= new cudnnConvolutionBwdFilterAlgo_t[bottom.size()];
  bwd_data_algo_  = new cudnnConvolutionBwdDataAlgo_t[bottom.size()];

  // initialize size arrays
  workspace_fwd_sizes_ = new size_t[bottom.size()];
  workspace_bwd_filter_sizes_ = new size_t[bottom.size()];
  workspace_bwd_data_sizes_ = new size_t[bottom.size()];

  // workspace data
  workspaceSizeInBytes = 0;
  workspaceData = NULL;
  workspace = new void*[this->group_ * CUDNN_STREAMS_PER_GROUP];

  for (size_t i = 0; i < bottom.size(); ++i) {
    // initialize all to default algorithms
    fwd_algo_[i] = (cudnnConvolutionFwdAlgo_t)0;
    bwd_filter_algo_[i] = (cudnnConvolutionBwdFilterAlgo_t)0;
    bwd_data_algo_[i] = (cudnnConvolutionBwdDataAlgo_t)0;
    // default algorithms don't require workspace
    workspace_fwd_sizes_[i] = 0;
    workspace_bwd_data_sizes_[i] = 0;
    workspace_bwd_filter_sizes_[i] = 0;
  }

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
    workspace[g] = NULL;
  }

  // Set the indexing parameters.
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  const int* kernel_shape_data = this->kernel_shape_.cpu_data();
  const int kernel_h = kernel_shape_data[0];
  const int kernel_w = kernel_shape_data[1];
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      kernel_h, kernel_w);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensorDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensorDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }

  handles_setup_ = true;
}

template <typename Dtype>
void CuDNNConvolutionLayer<Dtype>::Reshape(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::Reshape(bottom, top);
  CHECK_EQ(2, this->num_spatial_axes_)
      << "CuDNNConvolution input must have 2 spatial axes "
      << "(e.g., height and width). "
      << "Use 'engine: CAFFE' for general ND convolution.";
  bottom_offset_ = this->bottom_dim_ / this->group_;
  top_offset_ = this->top_dim_ / this->group_;
  const int height = bottom[0]->shape(this->channel_axis_ + 1);
  const int width = bottom[0]->shape(this->channel_axis_ + 2);
  const int height_out = top[0]->shape(this->channel_axis_ + 1);
  const int width_out = top[0]->shape(this->channel_axis_ + 2);
  const int* pad_data = this->pad_.cpu_data();
  const int pad_h = pad_data[0];
  const int pad_w = pad_data[1];
  const int* stride_data = this->stride_.cpu_data();
  const int stride_h = stride_data[0];
  const int stride_w = stride_data[1];
#if CUDNN_VERSION_MIN(8, 0, 0)
  int RetCnt;
  bool found_conv_algorithm;
  size_t free_memory, total_memory;
  cudnnConvolutionFwdAlgoPerf_t     fwd_algo_pref_[4];
  cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo_pref_[4];

  //get memory sizes
  cudaMemGetInfo(&free_memory, &total_memory);
#else
  // Specify workspace limit for kernels directly until we have a
  // planning strategy and a rewrite of Caffe's GPU memory mangagement
  size_t workspace_limit_bytes = 8*1024*1024;
#endif
  for (int i = 0; i < bottom.size(); i++) {
    cudnn::setTensor4dDesc<Dtype>(&bottom_descs_[i],
        this->num_,
        this->channels_ / this->group_, height, width,
        this->channels_ * height * width,
        height * width, width, 1);
    cudnn::setTensor4dDesc<Dtype>(&top_descs_[i],
        this->num_,
        this->num_output_ / this->group_, height_out, width_out,
        this->num_output_ * this->out_spatial_dim_,
        this->out_spatial_dim_, width_out, 1);
    cudnn::setConvolutionDesc<Dtype>(&conv_descs_[i], bottom_descs_[i],
        filter_desc_, pad_h, pad_w,
        stride_h, stride_w);

#if CUDNN_VERSION_MIN(8, 0, 0)
    // choose forward algorithm for filter
    // in forward filter the CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED is not implemented in cuDNN 8
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(handle_[0],
      bottom_descs_[i],
      filter_desc_,
      conv_descs_[i],
      top_descs_[i],
      4,
      &RetCnt,
      fwd_algo_pref_));
		
    found_conv_algorithm = false;
    for(int n=0;n<RetCnt;n++){
      if (fwd_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
          fwd_algo_pref_[n].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED &&
          fwd_algo_pref_[n].memory < free_memory){
        found_conv_algorithm = true;
	fwd_algo_[i]                   = fwd_algo_pref_[n].algo;
 	workspace_fwd_sizes_[i]        = fwd_algo_pref_[n].memory;
        break;
      }
    }
    if(!found_conv_algorithm) LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
    else{
	// choose backward algorithm for filter
        // for better or worse, just a fixed constant due to the missing 
        // cudnnGetConvolutionBackwardFilterAlgorithm in cuDNN version 8.0
	bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
	//twice the amount of the forward search to be save     
        workspace_bwd_filter_sizes_[i] = 2*workspace_fwd_sizes_[i];
    }

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle_[0],
      filter_desc_, 
      top_descs_[i], 
      conv_descs_[i], 
      bottom_descs_[i],
      4,
      &RetCnt,
      bwd_data_algo_pref_));

    found_conv_algorithm = false;
    for(int n=0;n<RetCnt;n++){
      if (bwd_data_algo_pref_[n].status == CUDNN_STATUS_SUCCESS &&
          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD &&
          bwd_data_algo_pref_[n].algo != CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED &&
          bwd_data_algo_pref_[n].memory < free_memory){
        found_conv_algorithm = true;
	bwd_data_algo_[i]              = bwd_data_algo_pref_[n].algo;
 	workspace_bwd_data_sizes_[i]   = bwd_data_algo_pref_[n].memory;
        break;
      }
    }
    if(!found_conv_algorithm) LOG(ERROR) << "cuDNN did not return a suitable algorithm for convolution.";
#else
    // choose forward and backward algorithms + workspace(s)
    CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm(handle_[0],
      bottom_descs_[i],
      filter_desc_,
      conv_descs_[i],
      top_descs_[i],
      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
      workspace_limit_bytes,
      &fwd_algo_[i]));

    CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(handle_[0],
      bottom_descs_[i],
      filter_desc_,
      conv_descs_[i],
      top_descs_[i],
      fwd_algo_[i],
      &(workspace_fwd_sizes_[i])));

    // choose backward algorithm for filter
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm(handle_[0],
          bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
          workspace_limit_bytes, &bwd_filter_algo_[i]) );

    // get workspace for backwards filter algorithm
    CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(handle_[0],
          bottom_descs_[i], top_descs_[i], conv_descs_[i], filter_desc_,
          bwd_filter_algo_[i], &workspace_bwd_filter_sizes_[i]));

    // choose backward algo for data
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm(handle_[0],
          filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
        workspace_limit_bytes, &bwd_data_algo_[i]));

    // get workspace size
    CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(handle_[0],
          filter_desc_, top_descs_[i], conv_descs_[i], bottom_descs_[i],
          bwd_data_algo_[i], &workspace_bwd_data_sizes_[i]) );
#endif
  }
  // reduce over all workspace sizes to get a maximum to allocate / reallocate
  size_t total_workspace_fwd = 0;
  size_t total_workspace_bwd_data = 0;
  size_t total_workspace_bwd_filter = 0;

  for (size_t i = 0; i < bottom.size(); i++) {
    total_workspace_fwd        = std::max(total_workspace_fwd,
                                     workspace_fwd_sizes_[i]);
    total_workspace_bwd_data   = std::max(total_workspace_bwd_data,
                                     workspace_bwd_data_sizes_[i]);
    total_workspace_bwd_filter = std::max(total_workspace_bwd_filter,
                                     workspace_bwd_filter_sizes_[i]);
  }
  // get max over all operations
  size_t max_workspace = std::max(total_workspace_fwd,
                             total_workspace_bwd_data);
  max_workspace = std::max(max_workspace, total_workspace_bwd_filter);
  // ensure all groups have enough workspace
  size_t total_max_workspace = max_workspace *
                               (this->group_ * CUDNN_STREAMS_PER_GROUP);

  // this is the total amount of storage needed over all groups + streams
  if (total_max_workspace > workspaceSizeInBytes) {
    DLOG(INFO) << "Reallocating workspace storage: " << total_max_workspace;
    workspaceSizeInBytes = total_max_workspace;

    // free the existing workspace and allocate a new (larger) one
    cudaFree(this->workspaceData);

    cudaError_t err = cudaMalloc(&(this->workspaceData), workspaceSizeInBytes);
    if (err != cudaSuccess) {
      // force zero memory path
      for (int i = 0; i < bottom.size(); i++) {
        workspace_fwd_sizes_[i] = 0;
        workspace_bwd_filter_sizes_[i] = 0;
        workspace_bwd_data_sizes_[i] = 0;
        fwd_algo_[i] = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        bwd_filter_algo_[i] = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        bwd_data_algo_[i] = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
      }

      // NULL out all workspace pointers
      for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
        workspace[g] = NULL;
      }
      // NULL out underlying data
      workspaceData = NULL;
      workspaceSizeInBytes = 0;
    }

    // if we succeed in the allocation, set pointer aliases for workspaces
    for (int g = 0; g < (this->group_ * CUDNN_STREAMS_PER_GROUP); g++) {
      workspace[g] = reinterpret_cast<char *>(workspaceData) + g*max_workspace;
    }
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::setTensor4dDesc<Dtype>(&bias_desc_,
        1, this->num_output_ / this->group_, 1, 1);
  }
}

template <typename Dtype>
CuDNNConvolutionLayer<Dtype>::~CuDNNConvolutionLayer() {
  // Check that handles have been setup before destroying.
  if (!handles_setup_) { return; }

  for (int i = 0; i < bottom_descs_.size(); i++) {
    cudnnDestroyTensorDescriptor(bottom_descs_[i]);
    cudnnDestroyTensorDescriptor(top_descs_[i]);
    cudnnDestroyConvolutionDescriptor(conv_descs_[i]);
  }
  if (this->bias_term_) {
    cudnnDestroyTensorDescriptor(bias_desc_);
  }
  cudnnDestroyFilterDescriptor(filter_desc_);

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    cudaStreamDestroy(stream_[g]);
    cudnnDestroy(handle_[g]);
  }

  cudaFree(workspaceData);
  delete [] stream_;
  delete [] handle_;
  delete [] fwd_algo_;
  delete [] bwd_filter_algo_;
  delete [] bwd_data_algo_;
  delete [] workspace_fwd_sizes_;
  delete [] workspace_bwd_data_sizes_;
  delete [] workspace_bwd_filter_sizes_;
}

INSTANTIATE_CLASS(CuDNNConvolutionLayer);

}   // namespace caffe
#endif

在命令行窗口運行 scripts/build_win.cmd，等待運行，會下載一個文件libraries_v140_x64_py35_1.1.0.tar.bz2 建議最好掛上代理以免下載失敗，這個文件是caffe相關的依賴庫，此過程中編譯的時候會報一個boost相關的錯誤，對 C:\Users\Administrator\.caffe\dependencies\libraries_v140_x64_py35_1.1.0\libraries\include\boost-1_61\boost\config\compiler 路徑下的 nvcc.hpp 作如下修改，因為RTX3060的編譯器nvcc版本大於7.5：

之后刪除之前編譯的build文件夾，重新編譯一次，編譯過程中會出現較多警告可以不用理會，稍等一段時間后，最終會出現：

最后在build文件夾下找到Caffe.sln文件，用VS2015打開，然后右鍵ALL_BUILD進行生成，等幾分鍾后編譯完，

將caffe源碼下中python中的caffe文件夾粘貼到上面配置的python路徑中C:\python35\Lib\site-packages\，再將E:\caffe-windows-vs2015\build\install\bin路徑添加到環境變量中，然后pip安裝一些必要的庫 pip install numpy scipy protobuf six scikit-image pyyaml pydotplus graphviz , 最后打開python，測試一下（如果出現錯誤，更新一下scipy版本）：

3.3 Mnist GPU訓練測試

打開終端Windows PowerShell，加入caffe源碼目錄，先將mnist數據集轉化為LMDB格式，然后運行

.\build\install\bin\caffe.exe train -solver path\examples\mnist\lenet_solver.prototxt

4. 參考

超詳細win10+caffe+gpu (NVIDIA GeForce RTX 2060)+vs2015+CUDA10.0+cudnn7.4.1+anaconda+python3.5安裝

CUDA入門（一）：CUDA11.0+VS2019+WIN10環境配置

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 DeepFaceLab：快讓rtx30系列的提取速度翻倍吧！！！在windows10上安裝caffe和tensorflow Windows10下用Anaconda3安裝TensorFlow教程 windows10 下mysql 8.0.19安裝教程，自測成功 Windows10下安裝Git的詳細教程 windows10下git bash安裝教程 Windows10下JDK15的安裝教程 Windows10下用Anaconda3安裝TensorFlow教程在windows10上搭建caffe Pytorch windows10安裝教程