This section describes the prediction workflow. It consists of three main parts: preparing the input data, running the prediction, and fetching the output data.
Part 1: Feeding the input data
A minimal usage example looks like this:
std::vector<std::string> input_names = predictor->GetInputNames();
std::unique_ptr<paddle_infer::Tensor> input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape(input_shape);
input_t->CopyFromCpu(input.data());
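For context, predictor, input_shape and input are supplied by the caller. A minimal sketch of how they might be prepared (the 1x3x224x224 float shape is only an assumption for illustration):

// Hypothetical caller-side setup; shape and values are assumptions for illustration.
std::vector<int> input_shape = {1, 3, 224, 224};
std::vector<float> input(1 * 3 * 224 * 224);  // filled with real input data in practice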
Let us walk through this flow step by step.
1. GetInputNames
This call is a little indirect: the public header exposes the paddle_infer namespace, so the call first goes through the function in paddle_infer and then forwards to AnalysisPredictor::GetInputNames of the predictor that was actually created.
This step fetches the names of the input nodes. Here idx2feeds_ is a std::map<size_t, std::string> that stores the names of the ops in the model file whose op->Type == feed.
// Interface class implementation
namespace paddle_infer {
std::vector<std::string> Predictor::GetInputNames() {
  return predictor_->GetInputNames();
}
}

// Actual implementation
std::vector<std::string> AnalysisPredictor::GetInputNames() {
  std::vector<std::string> input_names;
  for (auto &item : idx2feeds_) {
    input_names.push_back(item.second);
  }
  return input_names;
}
2. GetInputHandle
Its purpose is to obtain the memory region that corresponds to a node name. As introduced earlier, the Scope holds the information of all nodes; here we get the memory region of the input node from it. Note that the scope held by the executor is the predictor's sub_scope.
namespace paddle_infer {
std::unique_ptr<Tensor> Predictor::GetInputHandle(const std::string &name) {
  return predictor_->GetInputTensor(name);
}
}
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
    const std::string &name) {
  PADDLE_ENFORCE_NOT_NULL(
      executor_->scope()->FindVar(name),
      platform::errors::PreconditionNotMet(
          "The variable named %s is not found in the scope of the exector.",
          name));
  // Get the scope
  std::unique_ptr<ZeroCopyTensor> res(
      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
  res->input_or_output_ = true;
  res->SetName(name);
  // Pick the corresponding place according to the device
  if (platform::is_cpu_place(place_)) {
    res->SetPlace(PaddlePlace::kCPU);
  } else if (platform::is_xpu_place(place_)) {
    if (config_.lite_engine_enabled()) {
      // Currently, Paddle-Lite's XPU user interface only supports the transfer
      // of host data pointers. If it is currently used as a subgraph, execution
      // efficiency will be sacrificed, so it is temporarily set to cpu place.
      // And, the current lite engine of xpu must execute all parts of the
      // model.
      res->SetPlace(PaddlePlace::kCPU);
    } else {
      auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
      res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
    }
  } else if (platform::is_npu_place(place_)) {
    auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
    res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
  } else {
    auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
  }
  return res;
}
3. ZeroCopyTensor::Reshape
This step operates on the input tensor and re-establishes the dimension information of the input data. Here we look at tensor operations in some detail.
3.1 The base class and interface is paddle_infer::Tensor (paddle_tensor.h / inference/api/details/zero_copy_tensor.cc). ZeroCopyTensor (paddle_api.h) is a subclass of paddle_infer::Tensor that mainly overrides the copy-related functions; it is covered in the next subsection.
3.2 The actual reshape happens in Tensor::Reshape. Its logic is to look up the Variable (framework/variable.h) with the given name in the sub_scope and operate on it.
void Tensor::Reshape(const std::vector<int> &shape) {
  // Check that a name has been set
  PADDLE_ENFORCE_EQ(
      name_.empty(), false,
      paddle::platform::errors::PreconditionNotMet(
          "Need to SetName first, so that the corresponding tensor can "
          "be retrieved."));
  // Check that this is an input tensor; only inputs may be reshaped
  PADDLE_ENFORCE_EQ(input_or_output_, true,
                    paddle::platform::errors::PermissionDenied(
                        "Can't reshape the output tensor, it is readonly"));
  // Get the scope, fetch the variable of the node with this name and resize it.
  // The scope used here is the sub_scope, which holds only non-persistent nodes.
  auto *scope = static_cast<paddle::framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE_NOT_NULL(
      var, paddle::platform::errors::PreconditionNotMet(
               "No tensor called [%s] in the runtime scope", name_));
  auto *tensor = var->GetMutable<paddle::framework::LoDTensor>();
  tensor->Resize(paddle::framework::make_ddim(shape));
}
3.3 var->GetMutable actually creates the storage of the corresponding type inside the Variable. The storage type is LoDTensor (lod_tensor.h): a LoDTensor object is created and kept in the Variable's holder_.
template <typename T>
T* GetMutable() {
  if (!holder_) {
    holder_.reset(new PlaceholderImpl<T>());
  } else {
    PADDLE_ENFORCE_EQ(
        holder_->Type(), VarTypeTrait<T>::kId,
        platform::errors::InvalidArgument(
            "The Variable type must be %s, but the type it holds is %s.",
            ToTypeName(VarTypeTrait<T>::kId), ToTypeName(holder_->Type())));
  }
  return static_cast<T*>(holder_->Ptr());
}
PlaceholderImpl is a template class that wraps T, so the Variable class itself does not need to be a template; it only keeps a Placeholder pointer as a member: std::shared_ptr<Placeholder> holder_;. When a PlaceholderImpl is constructed it stores the pointer to obj together with obj's type id; the ids are defined in proto::VarType, and the type-to-id mapping has already been registered.
// Placeholder hides type T, so it doesn't appear as a template
// parameter of Variable.
template <typename T>
struct PlaceholderImpl : public Placeholder {
  static_assert(
      IsRegisteredVarType<T>(),
      "Not registered type. Please register T inside var_type_traits.h");
  PlaceholderImpl() { this->Init(&obj_, VarTypeTrait<T>::kId); }

 private:
  T obj_;
};
Here it checks whether type T has been registered; the full registration list is in framework/var_type_traits.h:
REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR);
REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS);
REG_PROTO_VAR_TYPE_TRAIT(std::vector<Scope *>, proto::VarType::STEP_SCOPES);
REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE);
REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY);
REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST);
REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER);
REG_PROTO_VAR_TYPE_TRAIT(FeedList, proto::VarType::FEED_LIST);
REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST);
REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32);
REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32);
REG_PROTO_VAR_TYPE_TRAIT(Vocab, proto::VarType::VOCAB);
REG_PROTO_VAR_TYPE_TRAIT(String, proto::VarType::STRING);
REG_PROTO_VAR_TYPE_TRAIT(Strings, proto::VarType::STRINGS);
3.4 LoDTensor lives in the paddle::framework namespace; take care to distinguish it from the earlier paddle_infer::Tensor. The parent class of LoDTensor is paddle::framework::Tensor (framework/tensor.h), and Resize simply uses the parent-class function:
Tensor& Tensor::Resize(const DDim& dims) { dims_ = dims; return *this; }
paddle::framework::make_ddim(shape) creates a DDim (ddim.h), in which rank_ stores the number of dimensions and dim_ stores the actual dimension values; at most 9 dimensions are supported.
4. ZeroCopyTensor::CopyFromCpu
This step performs the actual memory copy. We go through it in four steps.
template <typename T>
void Tensor::CopyFromCpu(const T *data) {
  // 1
  EAGER_GET_TENSOR(paddle::framework::LoDTensor);
  PADDLE_ENFORCE_GE(tensor->numel(), 0,
                    paddle::platform::errors::PreconditionNotMet(
                        "You should call Tensor::Reshape(const "
                        "std::vector<int> &shape)"
                        "function before copying data from cpu."));
  // 2
  size_t ele_size = tensor->numel() * sizeof(T);
  // 3
  if (place_ == PlaceType::kCPU) {
    auto *t_data = tensor->mutable_data<T>(paddle::platform::CPUPlace());
    std::memcpy(static_cast<void *>(t_data), data, ele_size);
  } else if (place_ == PlaceType::kGPU) {
  // 4
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    paddle::platform::DeviceContextPool &pool =
        paddle::platform::DeviceContextPool::Instance();
    paddle::platform::CUDAPlace gpu_place(device_);
    auto *t_data = tensor->mutable_data<T>(gpu_place);
    auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
        pool.Get(gpu_place));
    paddle::memory::Copy(gpu_place, static_cast<void *>(t_data),
                         paddle::platform::CPUPlace(), data, ele_size,
                         dev_ctx->stream());
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with CUDA place because paddle is not compiled "
        "with CUDA."));
#endif
  } else if (place_ == PlaceType::kXPU) {
    ...  // Kunlun XPU path
  } else if (place_ == PlaceType::kNPU) {
    ...  // Huawei Ascend NPU path
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "The analysis predictor supports CPU, GPU, NPU and XPU now."));
  }
}
4.1 Fetch the LoDTensor pointer that was created in the corresponding var of the scope and store it in tensor_.
// 1. Call site
EAGER_GET_TENSOR(paddle::framework::LoDTensor);

// 2. The macro calls FindTensor to obtain the pointer
#define EAGER_GET_TENSOR(tensor_type)    \
  if (!tensor_) {                        \
    tensor_ = FindTensor<tensor_type>(); \
  }                                      \
  auto *tensor = static_cast<tensor_type *>(tensor_);

// 3. Actual logic: call GetMutable on the var in the scope. Since Reshape has
// already created the storage through this interface, and the type requested
// here matches the one created there, this simply returns the LoDTensor
// pointer created earlier.
template <typename T>
void *Tensor::FindTensor() const {
  PADDLE_ENFORCE_EQ(
      name_.empty(), false,
      paddle::platform::errors::PreconditionNotMet(
          "Need to SetName first, so that the corresponding tensor can "
          "be retrieved."));
  auto *scope = static_cast<paddle::framework::Scope *>(scope_);
  auto *var = scope->FindVar(name_);
  PADDLE_ENFORCE_NOT_NULL(
      var, paddle::platform::errors::PreconditionNotMet(
               "No tensor called [%s] in the runtime scope", name_));
  auto *tensor = var->GetMutable<T>();
  return tensor;
}
4.2 Compute the required memory size. It depends on the shape set by the earlier Reshape and on the type T; in the end it is dim0 * dim1 * ... * sizeof(T).
int64_t Tensor::numel() const { return product(dims_); }
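As a worked example, if Reshape set the shape to {1, 3, 224, 224} and T is float, then numel() = 1 * 3 * 224 * 224 = 150528 and ele_size = 150528 * sizeof(float) = 602112 bytes.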
product then computes the product of all dimension values through this recursive template instantiation:
template <size_t kStart, size_t kEnd, bool kStop>
struct UnrollProduct {
  template <typename T>
  HOSTDEVICE inline static T Run(const T *d) {
    return d[kStart] *
           UnrollProduct<kStart + 1, kEnd, kStart + 1 == kEnd>::Run(d);
  }
};

template <size_t kStart, size_t kEnd>
struct UnrollProduct<kStart, kEnd, true> {
  template <typename T>
  HOSTDEVICE inline constexpr static T Run(const T *d) {
    return 1;
  }
};
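To make the recursion concrete, for a 3-dimensional shape the instantiation unrolls roughly like this (a sketch of the compile-time expansion, not actual source):

// Sketch of the expansion for kEnd = 3 (not actual source code):
// UnrollProduct<0, 3, false>::Run(d)
//   = d[0] * UnrollProduct<1, 3, false>::Run(d)
//   = d[0] * d[1] * UnrollProduct<2, 3, false>::Run(d)
//   = d[0] * d[1] * d[2] * UnrollProduct<3, 3, true>::Run(d)
//   = d[0] * d[1] * d[2] * 1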
4.3 Data copy for the CPU place. The CPU case is straightforward: obtain the memory from the tensor, then copy the data into it.
4.3.1 Obtaining the memory pointer
// Note: tensor here is still a paddle::framework::Tensor
// 1. Obtain the memory
auto *t_data = tensor->mutable_data<T>(paddle::platform::CPUPlace());
4.3.2 Determining the data type of T
// 2.1 tensor_impl.h: first check that T is a POD type, then look up the type of T
template <typename T>
inline T* Tensor::mutable_data(const platform::Place& place,
                               size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
  return reinterpret_cast<T*>(
      mutable_data(place, DataTypeTrait<T>::DataType(), requested_size));
}

// 2.2 The concrete lookup is DataTypeTrait<T>::DataType() in data_type.h

// Helper macro used to iterate over all data types
#define _ForEachDataType_(callback)                                      \
  _ForEachDataTypeHelper_(callback, float, FP32);                        \
  _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16);  \
  _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \
  _ForEachDataTypeHelper_(callback, double, FP64);                       \
  _ForEachDataTypeHelper_(callback, int, INT32);                         \
  _ForEachDataTypeHelper_(callback, int64_t, INT64);                     \
  _ForEachDataTypeHelper_(callback, bool, BOOL);                         \
  _ForEachDataTypeHelper_(callback, uint8_t, UINT8);                     \
  _ForEachDataTypeHelper_(callback, int16_t, INT16);                     \
  _ForEachDataTypeHelper_(callback, int8_t, INT8);                       \
  _ForEachDataTypeHelper_(callback, ::paddle::platform::complex<float>,  \
                          COMPLEX64);                                    \
  _ForEachDataTypeHelper_(callback, ::paddle::platform::complex<double>, \
                          COMPLEX128);

// A map from C++ data type to framework.proto VarType::Type is built at
// initialization: the iteration macro calls RegisterType for every type, which
// writes the (type, proto_type) pair into the map.
static DataTypeMap* InitDataTypeMap() {
  auto retv = new DataTypeMap();
#define RegType(cc_type, proto_type) \
  RegisterType<cc_type>(retv, proto_type, #cc_type)
  _ForEachDataType_(RegType);
#undef RegType
  return retv;
}

// The specializations of DataTypeTrait are also generated by macros.
// Macro that defines one specialization:
#define DefineDataTypeTrait(cpp_type, proto_type)                           \
  template <>                                                               \
  struct DataTypeTrait<cpp_type> {                                          \
    constexpr static proto::VarType::Type DataType() { return proto_type; } \
  }

// The iteration macro instantiates it for every type:
_ForEachDataType_(DefineDataTypeTrait);
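After these macros have been expanded, the specialization for float is roughly equivalent to the following, so DataTypeTrait<float>::DataType() simply returns proto::VarType::FP32 (a sketch of the expansion result, not a hand-written source file):

// Rough result of expanding DefineDataTypeTrait(float, proto::VarType::FP32).
template <>
struct DataTypeTrait<float> {
  constexpr static proto::VarType::Type DataType() { return proto::VarType::FP32; }
};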
4.3.3 Allocating the memory
Once the concrete data type is known, framework::Tensor->mutable_data(place, type, requested_size = 0) is called. It again computes the data size as numel() * SizeOfType(type), then uses memory::AllocShared to obtain memory on the given place, stores it in the Tensor's holder_ (a memory::Allocation), and returns the memory pointer.
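A condensed sketch of that allocation logic, pieced together from the description above (simplified, not the verbatim Paddle source):

// Simplified sketch of framework::Tensor::mutable_data(place, type, requested_size);
// based on the description above, not the verbatim source.
void* Tensor::mutable_data(const platform::Place& place,
                           proto::VarType::Type type, size_t requested_size) {
  size_t size = numel() * SizeOfType(type);      // dim0 * dim1 * ... * element size
  if (requested_size > size) size = requested_size;
  // Allocate only when there is no holder yet or the cached one is too small.
  if (holder_ == nullptr || holder_->size() < size) {
    holder_ = memory::AllocShared(place, size);  // memory::Allocation bound to place
  }
  return holder_->ptr();
}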
4.4 Copying to GPU memory
Compared with the CPU copy, the GPU path differs in two ways. First, it uses the global DeviceContextPool, which stores the mapping from place to DeviceContext; the matching CUDADeviceContext is obtained from it. Second, after getting the memory pointer in the same way, the data is copied into device memory through paddle::memory::Copy on that context's CUDA stream rather than with std::memcpy.
Part 2: Running the prediction
Call Predictor::Run; what actually executes is AnalysisPredictor::ZeroCopyRun:
bool AnalysisPredictor::ZeroCopyRun() {
  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
#ifdef PADDLE_WITH_MKLDNN
  ...
#endif

  executor_->Run();

  if (config_.shape_range_info_collected()) {
    CollectShapeRangeInfo();
  }

  // Fix TensorArray reuse not cleaned bug.
  tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
  tensor_array_batch_cleaner_.ResetTensorArray();

  // recover the cpu_math_library_num_threads to 1, in order to avoid thread
  // conflict when integrating it into deployment service.
  paddle::platform::SetNumThreads(1);
#ifdef PADDLE_WITH_MKLDNN
  ...
#endif
#if defined(PADDLE_WITH_MKLML)
  ...
#endif
  return true;
}
1. NaiveExecutor::Run
The basic logic is to call op->Run on every op in order:
void NaiveExecutor::Run() {
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
  platform::ScopedFlushDenormal flush;
  for (auto &op : ops_) {
    VLOG(4) << std::this_thread::get_id() << " run "
            << op->DebugStringEx(scope_) << " on scope " << scope_;
    op->SetIsCalledByExecutor(false);
    op->Run(*scope_, place_);
  }
}
2. OP Run
This goes straight to OperatorBase::Run: it first obtains the device id from the place, then calls the RunImpl of the concrete op subclass. The resources it needs are the scope and the place.
void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
  try {
    VLOG(4) << place << " " << DebugStringEx(&scope);
    if (platform::is_gpu_place(place)) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
      PADDLE_THROW(platform::errors::Unavailable(
          "Cannot run operator on place %s, please recompile paddle or "
          "reinstall Paddle with CUDA support.",
          place));
#else
      auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
      platform::SetDeviceId(dev_id);
#endif
    } else if (platform::is_xpu_place(place)) {
      ...  // XPU path: get the device id
    } else if (platform::is_npu_place(place)) {
      ...  // NPU (Ascend) path: get the device id
    }

    {
      // TODO(wangchaochaohu) : refine code to use only one RecordEvent)
      // in order to record different op type cost time
      // and different op name cost time,we set two event.
      platform::RecordEvent op_type_record_event(Type());
      auto op_name = platform::OpName(outputs_, Type());
      platform::RecordEvent op_name_record_event(
          op_name, platform::EventRole::kUniqueOp);
      // Actual logic: call the subclass's RunImpl
      RunImpl(scope, place);
    }

    VLOG(3) << GetExecutionPlace(place) << " " << DebugStringEx(&scope);
  } catch (platform::EnforceNotMet& exception) {
    ...  // error handling
  }
}
Operators come in two kinds; see also 《PaddlePaddle inference 源碼分析(三)》.
One kind inherits directly from OperatorBase; the other inherits from OperatorWithKernel.
The difference is that ops inheriting from OperatorBase implement RunImpl themselves and run directly on the CPU, whereas ops inheriting from OperatorWithKernel run OperatorWithKernel::RunImpl, which then runs the matching kernel; the kernel is chosen according to the place and the op_device attribute. The skeleton below illustrates the two shapes.
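A simplified sketch of the two flavours (the class names MySimpleOp and MyComputeOp are hypothetical, not actual Paddle classes):

// Simplified sketch of the two operator flavours; class names are hypothetical.
class MySimpleOp : public framework::OperatorBase {
 private:
  // Implements RunImpl itself and always runs on the CPU.
  void RunImpl(const framework::Scope& scope,
               const platform::Place& place) const override;
};

class MyComputeOp : public framework::OperatorWithKernel {
  // No RunImpl here: OperatorWithKernel::RunImpl selects and runs a registered
  // kernel based on the place and the op_device attribute. The op itself only
  // provides pieces such as InferShape and GetExpectedKernelType plus its kernels.
};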
2.1 RunImpl for OperatorBase-type ops
Ops of this kind are usually functional (non-compute) ops.
Take assert_op as an example. Its basic logic is to fetch the input LoDTensor from the scope by name and read the condition value from it; if the assertion fails, it fetches the attached data, i.e. the error message, and prints it.
void RunImpl(const framework::Scope &scope,
             const platform::Place &dev_place) const override {
  // Fetch the tensor from the scope
  const framework::Variable *cond_var_ptr = scope.FindVar(Input(kCond));
  PADDLE_ENFORCE_NOT_NULL(cond_var_ptr,
                          platform::errors::NotFound(
                              "Input(Condition) of AssertOp is not found."));
  const LoDTensor &cond = cond_var_ptr->Get<LoDTensor>();
  PADDLE_ENFORCE_EQ(
      cond.dims(), paddle::framework::make_ddim({1}),
      platform::errors::InvalidArgument(
          "The numel of Input(Condition) of AssertOp must be 1. But now "
          "the Condition's shape is %s.",
          cond.dims().to_str()));

  // Read the condition value from the tensor
  bool cond_data = GetCondData(cond);
  if (cond_data) {
    return;
  }

  TensorFormatter formatter;
  formatter.SetSummarize(Attr<int64_t>(kSummarize));

  // Process the attached data, i.e. print the error information
  const std::vector<std::string> &x_names = Inputs(kData);
  for (const std::string &name : x_names) {
    const framework::Variable *x_var_ptr = scope.FindVar(name);
    const framework::LoDTensor &x_tensor = x_var_ptr->Get<LoDTensor>();
    formatter.Print(x_tensor, name);
  }

  PADDLE_THROW(platform::errors::InvalidArgument(
      "The condition variable '%s' of AssertOp must be "
      "true, but received false",
      Input(kCond)));
}
2.2 OperatorWithKernel RunImpl
The vast majority of compute ops are of the OperatorWithKernel type.
The call chain is: OperatorWithKernel::RunImpl(scope, place) -> OperatorWithKernel::RunImpl(scope, place, runtime_ctx) -> OperatorWithKernel::ChooseKernel -> the registered kernel_func_.
2.2.1 RunImpl(scope, place)
Obtain the RuntimeContext. There are two cases: if caching the RuntimeContext is allowed, the runtime_ctx_ stored on the op is reused; otherwise a new one is created.
2.2.2 RunImpl(scope, place, runtime_ctx)
Get the DeviceContext matching the place from the singleton DeviceContextPool.
If the kernel_type_ or kernel_func_ cached on the op object is empty, call ChooseKernel to obtain the kernel function.
Then call PrepareData to obtain the transfer_scope.
Call the InferShape implemented by the subclass to set the dimension information of the inputs and outputs.
Call kernel_func to do the actual computation. (A condensed sketch of these steps follows below.)
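Putting these steps together, the body of RunImpl(scope, place, runtime_ctx) behaves roughly as follows (a condensed sketch based on the steps above; intermediate variable names are approximations, not the verbatim source):

// Condensed sketch of OperatorWithKernel::RunImpl(scope, place, runtime_ctx);
// simplified, not the verbatim Paddle source.
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);  // step 1
if (!kernel_type_ || !kernel_func_) {
  ChooseKernel(*runtime_ctx, scope, place);  // step 2: fills kernel_type_ / kernel_func_
}
auto* transfer_scope = PrepareData(scope, *kernel_type_, /*...*/ runtime_ctx);  // step 3
this->InferShape(&infer_shape_ctx);          // step 4: subclass sets output dims
(*kernel_func_)(
    ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));  // step 5: compute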
2.2.3 ChooseKernel(runtime_ctx, scope, place)
AllOpKernels returns a global map of the form <key = op type, value = OpKernelMap>; every kernel implementation of an op lives in that op's OpKernelMap. OpKernelMap maps OpKernelType to OpKernelFunc, where OpKernelType carries the place, the data type, and so on.
First the OpKernelMap is looked up by op type. Then the place is taken from the DeviceContext, the data type is derived from the op and the RuntimeContext, and together they form the OpKernelType.
Next it checks whether the op has an op_device attribute; if so, the place is overwritten with that attribute's value. The matching Func is then taken from the map.
Finally, the OpKernelType and the retrieved OpKernelFunc are cached on the op as kernel_type_ and kernel_func_, as sketched below.
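A condensed sketch of that lookup (simplified from the description above; error handling and some details such as parsing "gpu:0"-style op_device strings are omitted):

// Condensed sketch of OperatorWithKernel::ChooseKernel; simplified, not verbatim.
auto& all_op_kernels = AllOpKernels();        // global map: op type -> OpKernelMap
auto& kernels = all_op_kernels.at(Type());    // all kernels registered for this op
// Build the expected kernel key from the DeviceContext's place plus the data
// type derived from the op and the RuntimeContext.
auto expected_kernel_key = this->GetExpectedKernelType(
    ExecutionContext(*this, scope, *dev_ctx, ctx));
// If the op carries a non-empty op_device attribute, it overrides the place in
// expected_kernel_key before the lookup.
auto kernel_iter = kernels.find(expected_kernel_key);
kernel_type_.reset(new OpKernelType(expected_kernel_key));
kernel_func_.reset(new OpKernelFunc(kernel_iter->second));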
2.2.4 PrepareData
Prepare the input data. A thread-local scope is created here and passed back out.
Then the place of every input tensor is compared with the place of kernel_type_. If they differ, a data transfer takes place: for example, if the tensor holds CPU data and the kernel runs on the GPU, the data is moved from CPU to GPU.
PS: OperatorWithKernel::GetKernelTypeForVar builds an OpKernelType variable kernel_type_for_var from a tensor, which is then compared against kernel_type_. Some OperatorWithKernel subclasses override GetKernelTypeForVar so that the returned kernel_type_for_var matches kernel_type_, thereby skipping the transfer. For example, range_op (https://github.com/PaddlePaddle/Paddle/pull/25810): its kernel runs on the CPU, and in GPU mode the inputs are guaranteed to stay on the CPU, so GetKernelTypeForVar is overridden to avoid a needless data copy.
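The per-input decision described above can be summarized as follows (an illustrative sketch; helper names such as NeedTransform are paraphrased from the description and may not match the source exactly):

// Illustrative sketch of the per-input check inside PrepareData; helper names
// are paraphrased and may not match the source exactly.
for (auto& var_name : input_var_names) {
  auto kernel_type_for_var =
      GetKernelTypeForVar(var_name, tensor, *kernel_type_);
  if (!NeedTransform(kernel_type_for_var, *kernel_type_)) {
    continue;  // same place/layout/dtype: use the tensor as-is, no copy
  }
  // Otherwise the tensor is transformed (e.g. copied from CPU to GPU) into a
  // variable inside the thread-local transfer scope, and the op's input is
  // redirected to that new variable.
}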
2.2.5 InferShape
Here the op's own InferShape implementation is called to set the dimension information of the outputs.
2.2.6 kernel_func
The kernel function registered for the matching type and place is invoked. Kernels are also registered as template classes that implement a Compute function, and at execution time this Compute function is run.
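As an illustration of that pattern, a compute kernel is typically written as a functor whose Compute method does the work, and registered per place and data type (the kernel below is a hypothetical example modeled on the usual style, not a specific Paddle file):

// Hypothetical kernel modeled on the usual Paddle style; not a specific source file.
template <typename DeviceContext, typename T>
class MyReluKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<framework::LoDTensor>("X");
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    auto* out_data = out->mutable_data<T>(ctx.GetPlace());  // allocate on the kernel's place
    const T* x_data = x->data<T>();
    for (int64_t i = 0; i < x->numel(); ++i) {
      out_data[i] = x_data[i] > static_cast<T>(0) ? x_data[i] : static_cast<T>(0);
    }
  }
};

// Registration ties (op type, place, data type) to the functor so that
// ChooseKernel can later find it in AllOpKernels().
REGISTER_OP_CPU_KERNEL(
    my_relu, ops::MyReluKernel<paddle::platform::CPUDeviceContext, float>);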
Part 3: Fetching the output data
A minimal usage example looks like this:
std::vector<float> out_data;  // output buffer on the caller side
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
                              std::multiplies<int>());
out_data.resize(out_num);
output_t->CopyToCpu(out_data.data());
1. GetOutputNames
Similar to the input side, the names are read from a map:
// Interface class implementation
std::vector<std::string> Predictor::GetOutputNames() {
  return predictor_->GetOutputNames();
}

// Actual implementation
std::vector<std::string> AnalysisPredictor::GetOutputNames() {
  std::vector<std::string> output_names;
  for (auto &item : idx2fetches_) {
    output_names.push_back(item.second);
  }
  return output_names;
}
2. GetOutputHandle
Get the tensor corresponding to the output from the sub_scope:
// Interface
std::unique_ptr<Tensor> Predictor::GetOutputHandle(const std::string &name) {
  return predictor_->GetOutputTensor(name);
}

// Actual implementation
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
    const std::string &name) {
  PADDLE_ENFORCE_NOT_NULL(
      executor_->scope()->FindVar(name),
      platform::errors::PreconditionNotMet(
          "he variable named %s is not found in the scope of the exector.",
          name));
  std::unique_ptr<ZeroCopyTensor> res(
      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
  res->input_or_output_ = false;
  res->SetName(name);
  if (platform::is_cpu_place(place_)) {
    res->SetPlace(PaddlePlace::kCPU);
  } else if (platform::is_xpu_place(place_)) {
    if (config_.lite_engine_enabled()) {
      // Currently, Paddle-Lite's XPU user interface only supports the transfer
      // of host data pointers. If it is currently used as a subgraph, execution
      // efficiency will be sacrificed, so it is temporarily set to cpu place.
      // And, the current lite engine of xpu must execute all parts of the
      // model.
      res->SetPlace(PaddlePlace::kCPU);
    } else {
      auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
      res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
    }
  } else if (platform::is_npu_place(place_)) {
    auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
    res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
  } else {
    auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
  }
  return res;
}
3. CopyToCpu
The actual logic is as follows:
First the stored LoDTensor is fetched; then an internal memory::Allocation is wrapped around the user-provided data buffer. Depending on the place, the source data lives on the CPU or the GPU, and it is copied accordingly.
template <typename T>
void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
                           void *cb_params) const {
  EAGER_GET_TENSOR(paddle::framework::LoDTensor);
  auto ele_num = tensor->numel();
  auto *t_data = tensor->data<T>();
  auto t_place = tensor->place();

  paddle::framework::Tensor out;
  auto mem_allocation = std::make_shared<paddle::memory::Allocation>(
      static_cast<void *>(data), ele_num * sizeof(T),
      paddle::platform::CPUPlace());
  out.ResetHolder(mem_allocation);

  if (paddle::platform::is_cpu_place(t_place)) {
#ifdef PADDLE_WITH_MKLDNN
    ...
#else
    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
#endif
  } else if (place_ == PlaceType::kGPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    paddle::platform::DeviceContextPool &pool =
        paddle::platform::DeviceContextPool::Instance();
    auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, t_place);
    auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
        pool.Get(gpu_place));
    paddle::memory::Copy(paddle::platform::CPUPlace(),
                         static_cast<void *>(data), gpu_place, t_data,
                         ele_num * sizeof(T), dev_ctx->stream());
#ifdef PADDLE_WITH_HIP
    hipStreamSynchronize(dev_ctx->stream());
#else
    // async, return stream
    if (nullptr != exec_stream) {
      *(static_cast<cudaStream_t *>(exec_stream)) = dev_ctx->stream();
      // async with callback
    } else if (cb) {
      cudaLaunchHostFunc(dev_ctx->stream(), cb, cb_params);
      // sync
    } else {
      cudaStreamSynchronize(dev_ctx->stream());
    }
#endif
#else
    PADDLE_THROW(paddle::platform::errors::Unavailable(
        "Can not create tensor with CUDA place because paddle is not compiled "
        "with CUDA."));
#endif
  } else if (place_ == PlaceType::kXPU) {
#ifdef PADDLE_WITH_XPU
    ...
  } else if (place_ == PlaceType::kNPU) {
#ifdef PADDLE_WITH_ASCEND_CL
    ...
  } else {
    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
        "The analysis predictor supports CPU, GPU, NPU and XPU now."));
  }
}
