TensorFlow 筆記04 - 使用類封裝寫好的 TensorRT 模型，包括 int8 優化要用的 calibrator

本文轉載自查看原文 2019-10-29 19:20 532 TensorRT/ Python/ TensorFlow

▶ 使用類封裝寫好的 TensorRT 模型，每個函數、類成員各司其職，而不是以前程序那樣純過程式，變量全部攤開

● 代碼，程序入口 enter.py

  1 import os
  2 import sys
  3 import numpy as np
  4 import tensorrt as trt
  5 import pycuda.autoinit
  6 import pycuda.driver as cuda
  7 from datetime import datetime as dt
  8 
  9 import loadPara as ld
 10 import calibrator
 11 
 12 DEBUG           = True                     
 13 testDataPath    = "./"
 14 calibDataPath   = "./"
 15 tempPath        = "./"
 16 paraFile        = tempPath + "para.h5"
 17 cacheFile       = tempPath + "calib.cache"
 18 outputFile      = tempPath + "output.txt"
 19             
 20 iGpu            = 0
 21 calibCount      = 10                        # int8 校正次數
 22 inputSize       = (1,1,1)                   # 輸入數據尺寸，CHW
 23 
 24 class TrtPredictor:
 25     def __init__(self, batchSize, dataType):
 26         self.logger     = trt.Logger(trt.Logger.ERROR)                      # 創建 logger
 27         self.batchSize  = batchSize
 28         self.dataType   = dataType
 29         self.h5f, ...   = fld.loadPara(paraFile)                            # 讀取訓練好的參數
 30         
 31         trtFilePath = tempPath + "engine-" + self.dataType + ".trt"         # 嘗試讀取創建好的引擎，沒有則現場創建引擎
 32         if os.path.isfile(trtFilePath) and not DEBUG:
 33             f =  open(trtFilePath, 'rb')
 34             engineStr = f.read()                                            # enginStr 不作為成員變量
 35             self.runtime = trt.Runtime(self.logger)                         # 運行時讀取文件中的引擎
 36             self.engine = self.runtime.deserialize_cuda_engine(engineStr)
 37             f.close()
 38             print("succeeded loading engine!")
 39         else:                             
 40             self.create_engine()                                            # 創建 engine，並寫入文件，方便下次調用
 41             if self.engine == None:
 42                 print("failed building engine!")
 43                 return
 44             engineStr = self.engine.serialize()
 45             f = open(trtFilePath, 'wb')
 46             f.write(engineStr)
 47             f.close()
 48             print("succeeded building engine!")
 49                 
 50         self.context = self.engine.create_execution_context()               # 創建 CUDA 上下文和流
 51         self.stream = cuda.Stream()
 52                 
 53     def __del__(self):
 54         self.context = None
 55         self.engine  = None
 56         ld.close(self.h5f)
 57   
 58     def create_engine(self):                                                # 構造引擎
 59         self.builder = trt.Builder(self.logger)
 60         self.builder.max_batch_size     = 16
 61         self.builder.max_workspace_size = 1 << 30
 62         self.builder.fp16_mode          = self.dataType == 'float16'
 63         self.builder.int8_mode          = self.dataType == 'int8'
 64         self.network                    = self.builder.create_network()
 65         self.builder.strict_type_constraints = True        
 66                
 67         h0 = self.network.add_input("h0", trt.DataType.FLOAT, (1,) + inputSize) # 強制 N 為 1，多的數據堆在更高維度上
 68         
 69         #...                                                                # 中間層
 70         
 71         self.network.mark_output(h0.get_output(0))                          # 標記輸出層
 72 
 73         if self.dataType == 'int8':                                         # int8 需要額外的校正，放到 builder 中
 74             self.builder.int8_calibrator = calibrator.MyCalibrator(calibCount, (self.batchSize,) + inputSize, calibDataPath, cacheFile)
 75 
 76         self.engine = self.builder.build_cuda_engine(self.network)          # 創建引擎（最容易失敗的地方，返回構造函數后要檢查是否成功）
 77     
 78     def infer(self, hInPart, dIn, dOut, hOut):                              # 推理
 79         cuda.memcpy_htod_async(dIn, hInPart, self.stream)
 80         self.context.execute_async(len(hInPart), [int(dIn), int(dOut)], self.stream.handle)
 81         cuda.memcpy_dtoh_async(hOut, dOut, self.stream)            
 82         self.stream.synchronize()
 83                
 84 def predict(hIn, batchSize, dataType):    
 85     predictor = TrtPredictor(batchSize, dataType)                           # 構造一個預測器
 86     
 87     dIn  = cuda.mem_alloc(hIn[0].nbytes * batchSize)                        # 准備主機和設備內存
 88     hOut = np.empty((batchSize,) + tuple(predictor.engine.get_binding_shape(1)), dtype = np.float32)
 89     dOut = cuda.mem_alloc(hOut.nbytes)                                      # dOut 和 hOut 的大小一定是相同的
 90     res=[]
 91     for i in range(0, len(hIn), batchSize):                                 # 分 batch 喂入數據
 92         predictor.infer(hIn[i:i+batchSize], dIn, dOut, hOut)                    
 93         res.append( hOut )
 94     
 95     return res
 96 
 97 if __name__ == "__main__":                                                  # main 函數負責管理 cuda.Device 和 cuda.Context
 98     _ = os.system("clear")
 99     batchSize = int(sys.argv[1])    if len(sys.argv) > 1 and sys.argv[1].isdigit()                         else 1
100     dataType  = sys.argv[2]         if len(sys.argv) > 2 and sys.argv[2] in ['float32', 'float16', 'int8'] else 'float32'
101     DEBUG     = int(sys.argv[3])>0  if len(sys.argv) > 3 and sys.argv[3].isdigit()                         else False
102     if DEBUG:                                                               # 清除建好的 engine 和 校正緩存，重頭開始建立                                
103         oldEngineEAndCache = glob(tempPath+"*.trt") + glob(tempPath+"*.cache")
104         [ os.remove(oldEngineEAndCache[i]) for i in range(len(oldEngineEAndCache))]
105     print( "%s, start! GPU =  %s, batchSize = %2d, dataType  = %s" %( dt.now(), cuda.Device(iGpu).name(), batchSize, dataType ) )    
106     
107     inputData = loadData(testDataPath)                                      # 讀取數據
108     oF = open(outputFile, 'w')
109     cuda.Device(iGpu).make_context()
110 
111     res = predict(inputData, batchSize, dataType)
112     for i in range(len(res)):
113         print( "%d -> %s" % (i,res[i]) )
114         oF.write(res[i] + '\n')
115         
116     oF.close()
117     cuda.Context.pop()
118     print( "%s, finish!" %(dt.now()) )

● 代碼，校正器 calibrator.py。核心思想是，手寫一個數據生成器供 TensorRT 調用，每次從校正數據集中抽取 batchSize 那麼多的數據，計算工作全部由 TensorRT 完成

 1 import os
 2 import numpy as np
 3 import tensorrt as trt
 4 import pycuda.driver as cuda
 5 import pycuda.autoinit
 6 
 7 class MyCalibrator(trt.IInt8EntropyCalibrator2):
 8     def __init__(self, calibCount, inputShape, calibDataPath, cacheFile):
 9         trt.IInt8EntropyCalibrator2.__init__(self)                                              # 基類默認構造函數                                        
10         self.calibCount     = calibCount                
11         self.shape          = inputShape
12         self.calibDataSet   = self.laodData(calibDataPath)                                      # 需要自己實現一個讀數據的函數
13         self.cacheFile      = cacheFile
14         self.calibData      = np.zeros(self.shape, dtype=np.float32)        
15         self.dIn            = cuda.mem_alloc(trt.volume(self.shape) * trt.float32.itemsize)     # 准備好校正用的設備內存      
16         self.oneBatch       = self.batchGenerator()
17             
18     def batchGenerator(self):                                                                   # calibrator 的核心，一個提供數據的生成器
19         for i in range(self.calibCount):
20             print("> calibration ", i)
21             self.calibData = np.random.choice(self.calibDataSet, self.shape[0], replace=False)  # 隨機選取數據 
22             yield np.ascontiguousarray(self.calibData, dtype=np.float32)                        # 調整數據格式后拋出   
23     
24     def get_batch_size(self):                                                                   # TensorRT 會調用，不能改函數名
25         return self.shape[0]
26 
27     def get_batch(self, names):                                                                 # TensorRT 會調用，不能改函數名，老版本 TensorRT 的輸入參數個數可能不一樣
28         try:
29             data = next(self.oneBatch)                                                          # 生成下一組校正數據，拷貝到設備並返回設備地址，否則退出
30             cuda.memcpy_htod(self.dIn, data)
31             return [int(self.dIn)]
32         except StopIteration:
33             return None
34 
35     def read_calibration_cache(self):                                                           # TensorRT 會調用，不能改函數名
36         if os.path.exists(self.cacheFile):
37             print( "cahce file: %s" %(self.cacheFile) )
38             f = open(self.cacheFile, "rb")
39             cache = f.read()
40             f.close()
41             return cache              
42             
43     def write_calibration_cache(self, cache):                                                   # TensorRT 會調用，不能改函數名
44         print( "cahce file: %s" %(self.cacheFile) )
45         f = open(self.cacheFile, "wb")
46         f.write(cache)
47         f.close()

▶ 我的程序在 TensorRT 5 中 float32 和 float16 一切正常，int8 無法正確計算。具體表現為：calibrator 能正確加載並被調用，部分中間層計算結果與 float32 一模一樣（二進制位層次上的相同，顯然是採用了 float32 代替進行計算），部分層的所有計算結果與 float32 有分歧（10^-2 ~ 10^-3 量級），在之後的多層計算中誤差會逐漸放大，最終計算結果與 float32 大相徑庭。更新到 TensorRT 6 之後問題消失，int8 也能計算出正確結果並獲得加速。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 TensorRT int8 量化部署 yolov5s 5.0 模型從TensorRT看INT8量化原理 Tensorrt環境安裝及yolov5模型轉換以及量化部署INT8 網絡模型int8量化中使用的一些量化方法模型加速[tensorflow&tensorrt] Xilinx器件INT8優化方法的HLS示例 Pytorch學習筆記04----LSTM模型理解及入門使用 YOLOv3使用筆記——TensorRT加速 tensorflow學習筆記-SavedModel文件解釋及TFServing的模型加載、使用 TensorFlow對象檢測-1.0和2.0：訓練，導出，優化（TensorRT），推斷（Jetson Nano）