Principles
- Why use INT8 inference: higher throughput (more frames processed per second) and a lower memory footprint (8-bit vs. 32-bit).
- The challenge in converting an FP32 model to INT8: a much smaller dynamic range and lower precision.
Consider that 32-bit floating-point can represent roughly 4 billion numbers in the interval [-3.4e38, 3.4e38]. This interval of representable numbers is also known as the dynamic-range. The distance between two neighboring representable numbers is the precision of the representation. (quoted from "Achieving FP32 Accuracy for INT8 Inference Using Quantization Aware Training with NVIDIA TensorRT")
- How to quantize FP32 to INT8: the simplest approach is symmetric linear quantization, in which every tensor is represented by its quantized INT8 values multiplied by a single scale factor associated with that tensor. So how is this scale factor determined? (See the sketch below.)
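As a rough, illustrative sketch (not TensorRT's internal implementation; the helper names below are made up for this example), symmetric linear quantization amounts to one scale factor per tensor, scale = |threshold| / 127, with values rounded and clamped to [-127, 127]:

#include <algorithm>
#include <cmath>
#include <cstdint>

// scale = |threshold| / 127; for weights the threshold is simply max(|w|),
// for activations it is chosen by calibration (see below)
inline float scaleFromThreshold(float absThreshold) {
    return absThreshold / 127.0f;
}

// q = clamp(round(x / scale), -127, 127)
inline int8_t quantize(float x, float scale) {
    float q = std::round(x / scale);
    return static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, q)));
}

// the FP32 value is recovered as q * scale, up to quantization error
inline float dequantize(int8_t q, float scale) {
    return static_cast<float>(q) * scale;
}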

For weights, TensorRT uses the unsaturated mapping (the maximum absolute value is mapped directly to 127), which does not cause an accuracy drop. For activations, TensorRT instead uses a saturated mapping: values beyond a chosen |threshold| are clipped to ±127. This raises a new question: how do we pick the best |threshold| for each activation tensor? (That is exactly the calibration process.)
Choosing a different threshold amounts to choosing a different encoding. From an information-theory point of view, we want the encoding that minimizes the information loss between the original and the quantized distributions, and this loss can be measured with the KL divergence.
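To make that concrete, here is a minimal sketch of the quantity being minimized, assuming both histograms have already been normalized into probability distributions (TensorRT's entropy calibrator additionally re-bins the quantized distribution, which is omitted here). The calibrator evaluates many candidate thresholds, builds Q for each, and keeps the threshold whose Q gives the smallest KL(P || Q):

#include <cmath>
#include <vector>

// KL(P || Q) = sum_i P(i) * log(P(i) / Q(i))
// P: normalized histogram of the FP32 activations (reference distribution)
// Q: normalized histogram obtained after quantizing with a candidate |threshold|
double klDivergence(const std::vector<double>& P, const std::vector<double>& Q) {
    double kl = 0.0;
    for (size_t i = 0; i < P.size() && i < Q.size(); ++i) {
        if (P[i] > 0.0 && Q[i] > 0.0)
            kl += P[i] * std::log(P[i] / Q[i]);
    }
    return kl;
}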

- Calibration of activations

Practice
To use TensorRT's INT8 inference, we need to write our own calibrator class and pass it to the builder via builder->setInt8Calibrator(calibrator), so that the builder can calibrate on real data and reduce the quantization error.
As for how the builder actually performs the calibration, it works as follows:
- The builder first calls the calibrator's getBatchSize() to get the size of an input batch.
- The builder then repeatedly calls getBatch() to fetch calibration input data; each batch read in must match the size reported by getBatchSize(), and getBatch() returns false once there is no more input data.
- Using these batches, the builder builds a 32-bit engine, runs forward inference over the calibration set, and records a histogram of each layer's activations.
- From these histograms it builds a calibration table.
- Finally, it builds the 8-bit engine from the calibration table and the network definition.
The calibration process is fairly time-consuming. By caching the calibration table, the same network can be rebuilt multiple times efficiently. To enable this cache, implement the writeCalibrationCache() and readCalibrationCache() functions in the calibrator class.
In summary, to build an INT8 engine the developer needs to implement a calibrator class that overrides the following functions:
- getBatchSize
- getBatch
- writeCalibrationCache (optional)
- readCalibrationCache (optional)
The calibrator is an IInt8Calibrator. TensorRT provides four derived interfaces (IInt8EntropyCalibrator, IInt8EntropyCalibrator2, IInt8MinMaxCalibrator, and IInt8LegacyCalibrator); the calibrator in our example inherits from IInt8EntropyCalibrator.
#include <algorithm>
#include <assert.h>
#include <cmath>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <sys/stat.h>
#include <time.h>
#include <opencv2/opencv.hpp>
#include "NvInfer.h"
#include "NvOnnxParser.h"
#include "argsParser.h"
#include "logger.h"
#include "common.h"
#include "image.hpp"
#define DebugP(x) std::cout << "Line" << __LINE__ << " " << #x << "=" << x << std::endl
using namespace nvinfer1;
Logger gLogger;
// LogStreamConsumer gLogError;
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int INPUT_C = 3;
static const int OUTPUT_SIZE = 1000;
const char* INPUT_BLOB_NAME = "input";
const char* OUTPUT_BLOB_NAME = "output";
const std::string gSampleName = "TensorRT.sample_onnx_image";
const std::string onnxFile = "resnet50.onnx";
const std::string engineFile = "../data/resnet50_int8.trt";
const std::string calibFile = "../data/calibration_img.txt";
samplesCommon::Args gArgs;
std::vector<float> prepareImage(cv::Mat &img) {
int c = 3;
int h = INPUT_H;
int w = INPUT_W;
// 1. Resize the source image to the target size, keeping the original aspect ratio
float scale = std::min(float(w) / img.cols, float(h) / img.rows);
auto scaleSize = cv::Size(img.cols * scale, img.rows * scale);
// Convert BGR to RGB
cv::Mat rgb;
cv::cvtColor(img, rgb, cv::COLOR_BGR2RGB);
cv::Mat resized;
cv::resize(rgb, resized, scaleSize, 0, 0, cv::INTER_CUBIC);
// 2. Crop: place the resized image at the center of an (H, W, C) canvas padded with 127
cv::Mat cropped(h, w, CV_8UC3, cv::Scalar::all(127));
// Rect(left_top_x, left_top_y, width, height)
cv::Rect rect((w - scaleSize.width) / 2, (h - scaleSize.height) / 2, scaleSize.width, scaleSize.height);
resized.copyTo(cropped(rect));
// 3 Type conversion, convert unsigned int 8 to float 32
cv::Mat img_float;
cropped.convertTo(img_float, CV_32FC3, 1.f / 255.0);
// HWC to CHW, and convert Mat to std::vector<float>
std::vector<cv::Mat> input_channels(c);
cv::split(img_float, input_channels);
std::vector<float> result(h * w * c);
auto data = result.data();
int channelLength = h * w;
for (int i = 0; i < c; ++i) {
memcpy(data, input_channels[i].data, channelLength * sizeof(float));
data += channelLength;
}
return result;
}
// Implement our own calibrator class
namespace nvinfer1 {
class int8EntropyCalibrator: public nvinfer1::IInt8EntropyCalibrator {
public:
int8EntropyCalibrator(const int &batchSize,
const std::string &imgPath,
const std::string &calibTablePath);
virtual ~int8EntropyCalibrator();
int getBatchSize() const override { return batchSize; }
bool getBatch(void *bindings[], const char *names[], int nbBindings) override;
const void *readCalibrationCache(std::size_t &length) override;
void writeCalibrationCache(const void *ptr, std::size_t length) override;
private:
int batchSize;
size_t inputCount;
size_t imageIndex;
std::string calibTablePath;
std::vector<std::string> imgPaths;
float *batchData { nullptr };
void *deviceInput { nullptr };
bool readCache;
std::vector<char> calibrationCache;
};
int8EntropyCalibrator::int8EntropyCalibrator(const int &batchSize, const std::string &imgPath,
const std::string &calibTablePath) : batchSize(batchSize), imageIndex(0), calibTablePath(calibTablePath), readCache(true) {
int inputChannel = INPUT_C;
int inputH = INPUT_H; // must match the size produced by prepareImage()
int inputW = INPUT_W;
inputCount = batchSize * inputChannel * inputH * inputW;
std::fstream f(imgPath);
if (f.is_open()) {
std::string temp;
while( std::getline(f, temp) ) imgPaths.push_back(temp);
}
int len = imgPaths.size();
for( int i = 0; i < len; i++) {
std::cout << imgPaths[i] << std::endl;
}
// allocate memory for a batch of data: batchData lives on the CPU, deviceInput on the GPU
batchData = new float[inputCount];
CHECK(cudaMalloc(&deviceInput, inputCount * sizeof(float)));
}
int8EntropyCalibrator::~int8EntropyCalibrator() {
CHECK(cudaFree(deviceInput));
if (batchData) {
delete[] batchData;
}
}
bool int8EntropyCalibrator::getBatch(void **bindings, const char **names, int nbBindings) {
std::cout << imageIndex << " " << batchSize << std::endl;
std::cout << imgPaths.size() << std::endl;
if (imageIndex + batchSize > imgPaths.size())
return false;
// load batch
float *ptr = batchData;
for (size_t j = imageIndex; j < imageIndex + batchSize; ++j) {
cv::Mat img = cv::imread(imgPaths[j]);
std::vector<float> inputData = prepareImage(img);
// each image contributes inputCount / batchSize floats
if (inputData.size() != inputCount / batchSize) {
std::cout << "InputSize Error" << std::endl;
return false;
}
assert(inputData.size() == inputCount);
memcpy(ptr, inputData.data(), (int)(inputData.size()) * sizeof(float));
ptr += inputData.size();
std::cout << "load image " << imgPaths[j] << " " << (j + 1) * 100. / imgPaths.size() << "%" << std::endl;
}
imageIndex += batchSize;
// copy bytes from Host to Device
CHECK(cudaMemcpy(deviceInput, batchData, inputCount * sizeof(float), cudaMemcpyHostToDevice));
bindings[0] = deviceInput;
return true;
}
const void* int8EntropyCalibrator::readCalibrationCache(std::size_t &length) {
calibrationCache.clear();
std::ifstream input(calibTablePath, std::ios::binary);
input >> std::noskipws;
if (readCache && input.good()) {
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
std::back_inserter(calibrationCache));
}
length = calibrationCache.size();
return length ? &calibrationCache[0] : nullptr;
}
void int8EntropyCalibrator::writeCalibrationCache(const void *cache, std::size_t length) {
std::ofstream output(calibTablePath, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
}
bool onnxToTRTModel(const std::string& modelFile, // name of the onnx model
unsigned int maxBatchSize, // batch size - NB must be at least as large as the batch we want to run with
IHostMemory*& trtModelStream, // output buffer for the TensorRT model
const std::string& engineFile)
{
// create the builder
IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
assert(builder != nullptr);
// create the config
auto config = builder->createBuilderConfig();
assert(config != nullptr);
if (! builder->platformHasFastInt8()) {
std::cout << "builder platform do not support Int8" << std::endl;
return false;
}
const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
std::cout << "explicitBatch is: " << explicitBatch << std::endl;
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
auto parser = nvonnxparser::createParser(*network, gLogger.getTRTLogger());
//Optional - uncomment below lines to view network layer information
//config->setPrintLayerInfo(true);
//parser->reportParsingInfo();
if ( !parser->parseFromFile( locateFile(modelFile, gArgs.dataDirs).c_str(), static_cast<int>(gLogger.getReportableSeverity()) ) )
{
gLogError << "Failure while parsing ONNX file" << std::endl;
return false;
}
// config
config->setAvgTimingIterations(1);
config->setMinTimingIterations(1);
config->setMaxWorkspaceSize(1_GiB);
// Build the engine
builder->setMaxBatchSize(maxBatchSize);
//builder->setMaxWorkspaceSize(1 << 20);
//builder->setMaxWorkspaceSize(10 << 20); // workspace size is already set on the config above
nvinfer1::int8EntropyCalibrator *calibrator = nullptr;
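// note: the third constructor argument is the calibration-table path; pass a non-empty
// path if you want write/readCalibrationCache() to cache the table between builds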
if (calibFile.size() > 0 ) calibrator = new nvinfer1::int8EntropyCalibrator(maxBatchSize, calibFile, "");
// builder->setFp16Mode(gArgs.runInFp16);
// builder->setInt8Mode(gArgs.runInInt8);
// enable INT8 mode and attach the calibrator
// (legacy builder-level API; the IBuilderConfig flags below do the same thing)
builder->setInt8Mode(true);
builder->setInt8Calibrator(calibrator);
// if (gArgs.runInInt8)
// {
// samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
// }
config->setFlag(BuilderFlag::kINT8);
config->setInt8Calibrator(calibrator);
// Dynamic ranges: when a calibrator is used they are obtained by calibrating on the dataset
// (see https://github.com/enazoe/yolo-tensorrt/blob/dd4cb522625947bfe6bfbdfbb6890c3f7558864a/modules/yolo.cpp);
// otherwise set them manually with setAllTensorScales(), which is provided in the official TensorRT sample code.
if (!calibrator) samplesCommon::setAllTensorScales(network, 127.0f, 127.0f);
// samplesCommon::enableDLA(builder, gArgs.useDLACore);
// build with the config so that the INT8 flag and the calibrator actually take effect
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
assert(engine);
if (calibrator) {
delete calibrator;
calibrator = nullptr;
}
// we can destroy the parser
parser->destroy();
// serialize the engine, then close everything down
trtModelStream = engine->serialize();
std::ofstream file;
file.open(engineFile, std::ios::binary | std::ios::out);
file.write(reinterpret_cast<const char*>(trtModelStream->data()), trtModelStream->size());
file.close();
engine->destroy();
config->destroy();
network->destroy();
builder->destroy();
return true;
}
void doInference(IExecutionContext& context, float* input, float* output, int batchSize)
{
const ICudaEngine& engine = context.getEngine();
// input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly one input and one output.
assert(engine.getNbBindings() == 2);
void* buffers[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
DebugP(inputIndex); DebugP(outputIndex);
// create GPU buffers and a stream
CHECK(cudaMalloc(&buffers[inputIndex], batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));
// DMA the input to the GPU, execute asynchronously, and DMA the result back;
// an engine built from an explicit-batch network must be run with enqueueV2()
CHECK(cudaMemcpyAsync(buffers[inputIndex], input, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueueV2(buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// release the stream and the buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
}
//!
//! \brief This function prints the help information for running this sample
//!
void printHelpInfo()
{
std::cout << "Usage: ./sample_onnx_mnist [-h or --help] [-d or --datadir=<path to data directory>] [--useDLACore=<int>]\n";
std::cout << "--help Display help information\n";
std::cout << "--datadir Specify path to a data directory, overriding the default. This option can be used multiple times to add multiple directories. If no data directories are given, the default is to use (data/samples/mnist/, data/mnist/)" << std::endl;
std::cout << "--useDLACore=N Specify a DLA engine for layers that support DLA. Value can range from 0 to n-1, where n is the number of DLA engines on the platform." << std::endl;
std::cout << "--int8 Run in Int8 mode.\n";
std::cout << "--fp16 Run in FP16 mode." << std::endl;
}
int main(int argc, char** argv)
{
bool argsOK = samplesCommon::parseArgs(gArgs, argc, argv);
if (gArgs.help)
{
printHelpInfo();
return EXIT_SUCCESS;
}
if (!argsOK)
{
std::cout << "Invalid arguments" << std::endl;
// gLogError << "Invalid arguments" << std::endl;
printHelpInfo();
return EXIT_FAILURE;
}
if (gArgs.dataDirs.empty())
{
gArgs.dataDirs = std::vector<std::string>{"data/"};
}
auto sampleTest = gLogger.defineTest(gSampleName, argc, const_cast<const char**>(argv));
gLogger.reportTestStart(sampleTest);
// create a TensorRT model from the onnx model and serialize it to a stream
nvinfer1::IHostMemory* trtModelStream{nullptr};
if (!onnxToTRTModel(onnxFile, 1, trtModelStream, engineFile))
gLogger.reportFail(sampleTest);
assert(trtModelStream != nullptr);
std::cout << "Successfully parsed ONNX file!!!!" << std::endl;
std::cout << "Start reading the input image!!!!" << std::endl;
cv::Mat image = cv::imread(locateFile("test.jpg", gArgs.dataDirs), cv::IMREAD_COLOR);
if (image.empty()) {
std::cout << "The input image is empty! Please check." << std::endl;
return EXIT_FAILURE;
}
DebugP(image.size());
cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
cv::Mat dst = cv::Mat::zeros(INPUT_H, INPUT_W, CV_32FC3);
cv::resize(image, dst, dst.size());
DebugP(dst.size());
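// normal() is assumed to be provided by image.hpp (included above) and to return
// a normalized CHW float buffer for the resized image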
float* data = normal(dst);
// deserialize the engine
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
if (gArgs.useDLACore >= 0)
{
runtime->setDLACore(gArgs.useDLACore);
}
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
assert(engine != nullptr);
trtModelStream->destroy();
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
float prob[OUTPUT_SIZE];
typedef std::chrono::high_resolution_clock Time;
typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
typedef std::chrono::duration<float> fsec;
double total = 0.0;
// run inference and cout time
auto t0 = Time::now();
doInference(*context, data, prob, 1);
auto t1 = Time::now();
fsec fs = t1 - t0;
ms d = std::chrono::duration_cast<ms>(fs);
total += d.count();
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
std::cout << std::endl << "Running time of one image is:" << total << "ms" << std::endl;
std::cout << "Output:\n";
for (int i = 0; i < OUTPUT_SIZE; i++)
{
gLogInfo << prob[i] << " ";
}
std::cout << std::endl;
return gLogger.reportTest(sampleTest, true);
}
Besides the implementation above, the official sampleINT8.cpp is also well worth studying.
References:
- Achieving FP32 Accuracy for INT8 Inference Using Quantization Aware Training with NVIDIA TensorRT (NVIDIA Developer Blog)
