Principles
- Why use INT8 inference: higher throughput (more frames processed per second) and a lower memory footprint (8-bit vs. 32-bit).
- The challenge in converting an FP32 model to INT8: a much smaller dynamic range and lower precision.
Consider that 32-bit floating-point can represent roughly 4 billion numbers in the interval [-3.4e38, 3.4e38]. This interval of representable numbers is also known as the dynamic-range. The distance between two neighboring representable numbers is the precision of the representation. (quoted from "Achieving FP32 Accuracy for INT8 Inference Using Quantization Aware Training with NVIDIA TensorRT")
- How to quantize FP32 to INT8: the simplest approach is symmetric linear quantization, in which every tensor is represented by its quantized INT8 values multiplied by a single scale factor associated with that tensor. So how is this scale factor determined? (See the sketch below.)
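As a rough, illustrative sketch (not TensorRT's internal implementation; the helper names below are made up for this example), symmetric linear quantization amounts to one scale factor per tensor, scale = |threshold| / 127, with values rounded and clamped to [-127, 127]:

#include <algorithm>
#include <cmath>
#include <cstdint>

// scale = |threshold| / 127; for weights the threshold is simply max(|w|),
// for activations it is chosen by calibration (see below)
inline float scaleFromThreshold(float absThreshold) {
    return absThreshold / 127.0f;
}

// q = clamp(round(x / scale), -127, 127)
inline int8_t quantize(float x, float scale) {
    float q = std::round(x / scale);
    return static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, q)));
}

// the FP32 value is recovered as q * scale, up to quantization error
inline float dequantize(int8_t q, float scale) {
    return static_cast<float>(q) * scale;
}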

For weights, TensorRT uses the unsaturated mapping (the maximum absolute value is mapped directly to 127), which does not cause an accuracy drop. For activations, TensorRT instead uses a saturated mapping: values beyond a chosen |threshold| are clipped to ±127. This raises a new question: how do we pick the best |threshold| for each activation tensor? (That is exactly the calibration process.)
Choosing a different threshold amounts to choosing a different encoding. From an information-theory point of view, we want the encoding that minimizes the information loss between the original and the quantized distributions, and this loss can be measured with the KL divergence.
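To make that concrete, here is a minimal sketch of the quantity being minimized, assuming both histograms have already been normalized into probability distributions (TensorRT's entropy calibrator additionally re-bins the quantized distribution, which is omitted here). The calibrator evaluates many candidate thresholds, builds Q for each, and keeps the threshold whose Q gives the smallest KL(P || Q):

#include <cmath>
#include <vector>

// KL(P || Q) = sum_i P(i) * log(P(i) / Q(i))
// P: normalized histogram of the FP32 activations (reference distribution)
// Q: normalized histogram obtained after quantizing with a candidate |threshold|
double klDivergence(const std::vector<double>& P, const std::vector<double>& Q) {
    double kl = 0.0;
    for (size_t i = 0; i < P.size() && i < Q.size(); ++i) {
        if (P[i] > 0.0 && Q[i] > 0.0)
            kl += P[i] * std::log(P[i] / Q[i]);
    }
    return kl;
}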

- Calibration of activations

Practice
To use TensorRT's INT8 inference, we need to write our own calibrator class and pass it to the builder via builder->setInt8Calibrator(calibrator), so that the builder can calibrate on real data and reduce the quantization error.
As for how the builder actually performs the calibration, it works as follows:
- The builder first calls the calibrator's getBatchSize() to get the size of an input batch.
- The builder then repeatedly calls getBatch() to fetch calibration input data; each batch read in must match the size reported by getBatchSize(), and getBatch() returns false once there is no more input data.
- Using these batches, the builder builds a 32-bit engine, runs forward inference over the calibration set, and records a histogram of each layer's activations.
- From these histograms it builds a calibration table.
- Finally, it builds the 8-bit engine from the calibration table and the network definition.
The calibration process is fairly time-consuming. By caching the calibration table, the same network can be rebuilt multiple times efficiently. To enable this cache, implement the writeCalibrationCache() and readCalibrationCache() functions in the calibrator class.
In summary, to build an INT8 engine the developer needs to implement a calibrator class that overrides the following functions:
- getBatchSize
- getBatch
- writeCalibrationCache (optional)
- readCalibrationCache (optional)
The calibrator is an IInt8Calibrator. TensorRT provides four derived interfaces (IInt8EntropyCalibrator, IInt8EntropyCalibrator2, IInt8MinMaxCalibrator, and IInt8LegacyCalibrator); the calibrator in our example inherits from IInt8EntropyCalibrator.
#include <algorithm>
#include <assert.h>
#include <cmath>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <sys/stat.h>
#include <time.h>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "logger.h"
#include "common.h"
#include "image.hpp"
#define DebugP(x) std::cout << "Line" << __LINE__ << " " << #x << "=" << x << std::endl
using namespace nvinfer1;
Logger gLogger;
// LogStreamConsumer gLogError;
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = 1000;
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
const std::string gSampleName = "TensorRT.sample_onnx_image";
const std::string onnxFile = "resnet50.onnx";
const std::string engineFile = "../data/resnet50_int8.trt";
const std::string calibFile = "../data/calibration_img.txt";
samplesCommon::Args gArgs;
std::vector<float> prepareImage(cv::Mat &img) {
int c = 3;
int h = INPUT_H;
int w = INPUT_W;
// 1. Resize the source image to the target size, keeping the original aspect ratio
float scale = std::min(float(w) / img.cols, float(h) / img.rows);
auto scaleSize = cv::Size(img.cols * scale, img.rows * scale);
// Convert BGR to RGB
cv::Mat rgb;
cv::cvtColor(img, rgb, cv::COLOR_BGR2RGB);
cv::Mat resized;
cv::resize(rgb, resized, scaleSize, 0, 0, cv::INTER_CUBIC);
// 2. Crop: place the resized image at the center of an (H, W, C) canvas padded with 127
cv::Mat cropped(h, w, CV_8UC3, cv::Scalar::all(127));
// Rect(left_top_x, left_top_y, width, height)
cv::Rect rect((w - scaleSize.width) / 2, (h - scaleSize.height) / 2, scaleSize.width, scaleSize.height);
resized.copyTo(cropped(rect));
// 3 Type conversion, convert unsigned int 8 to float 32
cv::Mat img_float;
cropped.convertTo(img_float, CV_32FC3, 1.f / 255.0);
// HWC to CHW, and convert Mat to std::vector<float>
std::vector<cv::Mat> input_channels(c);
cv::split(img_float, input_channels);
std::vector<float> result(h * w * c);
auto data = result.data();
int channelLength = h * w;
for (int i = 0; i < c; ++i) {
memcpy(data, input_channels[i].data, channelLength * sizeof(float));
data += channelLength;
}
return result;
}
// Implement our own calibrator class
namespace nvinfer1 {
class int8EntropyCalibrator: public nvinfer1::IInt8EntropyCalibrator {
public:
int8EntropyCalibrator(const int &batchSize,
const std::string &imgPath,
const std::string &calibTablePath);
virtual ~int8EntropyCalibrator();
int getBatchSize() const override { return batchSize; }
bool getBatch(void *bindings[], const char *names[], int nbBindings) override;
const void *readCalibrationCache(std::size_t &length) override;
void writeCalibrationCache(const void *ptr, std::size_t length) override;
private:
int batchSize;
size_t inputCount;
size_t imageIndex;
std::string calibTablePath;
std::vector<std::string> imgPaths;
float *batchData { nullptr };
void *deviceInput { nullptr };
bool readCache;
std::vector<char> calibrationCache;
};
int8EntropyCalibrator::int8EntropyCalibrator(const int &batchSize, const std::string &imgPath,
const std::string &calibTablePath) : batchSize(batchSize), imageIndex(0), calibTablePath(calibTablePath), readCache(true) {
int inputChannel = INPUT_C;
int inputH = INPUT_H; // must match the size produced by prepareImage()
int inputW = INPUT_W;
inputCount = batchSize * inputChannel * inputH * inputW;
std::fstream f(imgPath);
if (f.is_open()) {
std::string temp;
while( std::getline(f, temp) ) imgPaths.push_back(temp);
}
int len = imgPaths.size();
for( int i = 0; i < len; i++) {
std::cout << imgPaths[i] << std::endl;
}
// allocate memory for a batch of data: batchData lives on the CPU, deviceInput on the GPU
batchData = new float[inputCount];
CHECK(cudaMalloc(&deviceInput, inputCount * sizeof(float)));
}
int8EntropyCalibrator::~int8EntropyCalibrator() {
CHECK(cudaFree(deviceInput));
if (batchData) {
delete[] batchData;
}
}
bool int8EntropyCalibrator::getBatch(void **bindings, const char **names, int nbBindings) {
std::cout << imageIndex << " " << batchSize << std::endl;
std::cout << imgPaths.size() << std::endl;
if (imageIndex + batchSize > imgPaths.size())
return false;
// load batch
float *ptr = batchData;
for (size_t j = imageIndex; j < imageIndex + batchSize; ++j) {
cv::Mat img = cv::imread(imgPaths[j]);
std::vector<float> inputData = prepareImage(img);
// each image contributes inputCount / batchSize floats
if (inputData.size() != inputCount / batchSize) {
std::cout << "InputSize Error" << std::endl;
return false;
}
assert(inputData.size() == inputCount);
memcpy(ptr, inputData.data(), (int)(inputData.size()) * sizeof(float));
ptr += inputData.size();
std::cout << "load image " << imgPaths[j] << " " << (j + 1) * 100. / imgPaths.size() << "%" << std::endl;
}
imageIndex += batchSize;
// copy bytes from Host to Device
CHECK(cudaMemcpy(deviceInput, batchData, inputCount * sizeof(float), cudaMemcpyHostToDevice));
bindings[0] = deviceInput;
return true;
}
const void* int8EntropyCalibrator::readCalibrationCache(std::size_t &length) {
calibrationCache.clear();
std::ifstream input(calibTablePath, std::ios::binary);
input >> std::noskipws;
if (readCache && input.good()) {
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
std::back_inserter(calibrationCache));
}
length = calibrationCache.size();
return length ? &calibrationCache[0] : nullptr;
}
void int8EntropyCalibrator::writeCalibrationCache(const void *cache, std::size_t length) {
std::ofstream output(calibTablePath, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
}
bool onnxToTRTModel(const std::string& modelFile, // name of the onnx model
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory*& trtModelStream, // output buffer for the TensorRT model
const std::string& engineFile)
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
assert(builder != nullptr);
// create the config
auto config = builder->createBuilderConfig();
assert(config != nullptr);
if (! builder->platformHasFastInt8()) {
std::cout << "builder platform do not support Int8" << std::endl;
return false;
}
const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
std::cout << "explicitBatch is: " << explicitBatch << std::endl;
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());
//Optional - uncomment below lines to view network layer information
//config->setPrintLayerInfo(true);
//parser->reportParsingInfo();
if ( !parser->parseFromFile( locateFile(modelFile, gArgs.dataDirs).c_str(), static_cast<int>(gLogger.getReportableSeverity()) ) )
{
gLogError << "Failure while parsing ONNX file" << std::endl;
return false;
}
// config
config->setAvgTimingIterations(1);
config->setMinTimingIterations(1);
config->setMaxWorkspaceSize(1_GiB);
// Build the engine
builder->setMaxBatchSize(maxBatchSize);
//builder->setMaxWorkspaceSize(1 << 20);
//builder->setMaxWorkspaceSize(10 << 20); // workspace size is already set on the config above
nvinfer1::int8EntropyCalibrator *calibrator = nullptr;
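// note: the third constructor argument is the calibration-table path; pass a non-empty
// path if you want write/readCalibrationCache() to cache the table between builds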
if (calibFile.size() > 0 ) calibrator = new nvinfer1::int8EntropyCalibrator(maxBatchSize, calibFile, "");
// builder->setFp16Mode(gArgs.runInFp16);
// builder->setInt8Mode(gArgs.runInInt8);
// enable INT8 mode and attach the calibrator
// (legacy builder-level API; the IBuilderConfig flags below do the same thing)
builder->setInt8Mode(true);
builder->setInt8Calibrator(calibrator);
// if (gArgs.runInInt8)
// {
// samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
// }
config->setFlag(BuilderFlag::kINT8);
config->setInt8Calibrator(calibrator);
// Dynamic ranges: when a calibrator is used they are obtained by calibrating on the dataset
// (see https://github.com/enazoe/yolo-tensorrt/blob/dd4cb522625947bfe6bfbdfbb6890c3f7558864a/modules/yolo.cpp);
// otherwise set them manually with setAllTensorScales(), which is provided in the official TensorRT sample code.
if (!calibrator) samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
// samplesCommon::enableDLA(builder, gArgs.useDLACore);
// build with the config so that the INT8 flag and the calibrator actually take effect
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
assert(engine);
if (calibrator) {
delete calibrator;
calibrator = nullptr;
}
// we can destroy the parser
parser->destroy();
// serialize the engine, then close everything down
trtModelStream = engine->serialize();
std::ofstream file;
file.open(engineFile, std::ios::binary | std::ios::out);
file.write(reinterpret_cast<const char*>(trtModelStream->data()), trtModelStream->size());
file.close();
engine->destroy();
config->destroy();
network->destroy();
builder->destroy();
return true;
}
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
DebugP(inputIndex); DebugP(outputIndex);
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute asynchronously, and DMA the result back;
// an engine built from an explicit-batch network must be run with enqueueV2()
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueueV2(buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}
//!
//! \brief This function prints the help information for running this sample
//!
void printHelpInfo()
{
std::cout << "Usage: ./sample_onnx_mnist [-h or --help] [-d or --datadir=<path to data directory>] [--useDLACore=<int>]\n";
std::cout << "--help Display help information\n";
std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use (data/samples/mnist/, data/mnist/)" << std::endl;
std::cout << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform." << std::endl;
std::cout << "--int8 Run in Int8 mode.\n";
std::cout << "--fp16 Run in FP16 mode." << std::endl;
}
int main(int argc, char** argv)
{
bool argsOK = samplesCommon::parseArgs(gArgs, argc, argv);
if (gArgs.help)
{
printHelpInfo();
return EXIT_SUCCESS;
}
if (!argsOK)
{
std::cout << "Invalid arguments" << std::endl;
// gLogError << "Invalid arguments" << std::endl;
printHelpInfo();
return EXIT_FAILURE;
}
if (gArgs.dataDirs.empty())
{
gArgs.dataDirs = std::vector<std::string>{"data/"};
}
auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast<const char**>(argv));
gLogger.reportTestStart(sampleTest);
// create a TensorRT model from the onnx model and serialize it to a stream
nvinfer1::IHostMemory* trtModelStream{nullptr};
if (!onnxToTRTModel(onnxFile, 1, trtModelStream, engineFile))
gLogger.reportFail(sampleTest);
assert(trtModelStream != nullptr);
std::cout << "Successfully parsed ONNX file!!!!" << std::endl;
std::cout << "Start reading the input image!!!!" << std::endl;
cv::Mat image = cv::imread(locateFile("test.jpg", gArgs.dataDirs), cv::IMREAD_COLOR);
if (image.empty()) {
std::cout << "The input image is empty! Please check." << std::endl;
return EXIT_FAILURE;
}
DebugP(image.size());
cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
cv::Mat dst = cv::Mat::zeros(INPUT_H, INPUT_W, CV_32FC3);
cv::resize(image, dst, dst.size());
DebugP(dst.size());
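// normal() is assumed to be provided by image.hpp (included above) and to return
// a normalized CHW float buffer for the resized image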
float* data = normal(dst);
// deserialize the engine
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
if (gArgs.useDLACore >= 0)
{
runtime->setDLACore(gArgs.useDLACore);
}
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
assert(engine != nullptr);
trtModelStream->destroy();
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
float prob[OUTPUT_SIZE];
typedef std::chrono::high_resolution_clock Time;
typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
typedef std::chrono::duration<float> fsec;
double total = 0.0;
// run inference and cout time
auto t0 = Time::now();
doInference(*context, data, prob, 1);
auto t1 = Time::now();
fsec fs = t1 - t0;
ms d = std::chrono::duration_cast<ms>(fs);
total += d.count();
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
std::cout << std::endl << "Running time of one image is:" << total << "ms" << std::endl;
std::cout << "Output:\n";
for (int i = 0; i < OUTPUT_SIZE; i++)
{
gLogInfo << prob[i] << " ";
}
std::cout << std::endl;
return gLogger.reportTest(sampleTest, true);
}
Besides the implementation above, the official sampleINT8.cpp is also well worth studying.
References:
- Achieving FP32 Accuracy for INT8 Inference Using Quantization Aware Training with NVIDIA TensorRT (NVIDIA Developer Blog)
