TensorFlow 筆記02-mnist 的 tensorRT 實現,從 .npz 文件中加載參數進行推理


● 代碼,tf 卷積神經網絡,將訓練好的參數保存為 .npz 文件給 tensorRT 用

1 # Model definition and training are identical to the previous post.
2 tfArg = {}
3 for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):      # iterate all global variables; store each value under its TF name
4     tfArg[i.name] = sess.run(i)
5 tfArg['testX']=mnist[2].images                                  # also stash the MNIST test images
6 tfArg['testY']=mnist[2].labels                                  # ... and the test labels
7 
8 np.savez(pbFilePath + 'tfArg.npz',**tfArg)                      # save everything as one .npz file for TensorRT to load
9 sess.close()

● 代碼,將前面一模一樣的神經網絡用 trt 重寫一遍,加載訓練好的參數來推理

 1 import numpy as np
 2 import tensorflow as tf
 3 import tensorrt as trt
 4 import pycuda.autoinit
 5 import pycuda.driver as cuda
 6 import input_data
 7 from datetime import datetime as dt
 8 
 9 pbFilePath = "tempFile/"
10 
11 # 網絡基礎設施
12 iGpu = 0
13 print("GPU in use:", cuda.Device(iGpu).name())
14 cuda.Device(iGpu).make_context()
15 logger = trt.Logger(trt.Logger.WARNING)
16 builder = trt.Builder(logger)
17 network = builder.create_network()
18 builder.max_batch_size = 64
19 builder.max_workspace_size = 0 << 20
20 
21 # 讀取參數
22 para = np.load(pbFilePath + 'tfArg.npz')
23 w1= para['w1:0'].transpose((3, 2, 0, 1)).reshape(-1)                            # NHWC -> NCHW,所有權重都要 reshape(-1) 壓成 1 維
24 b1 = para['b1:0']
25 w2 = para['w2:0'].transpose((3, 2, 0, 1)).reshape(-1)
26 b2 = para['b2:0']
27 w3 = para['w3:0'].reshape(7,7,64,1024).transpose((3, 2, 0, 1)).reshape(-1)
28 b3 = para['b3:0']
29 w4 = para['w4:0'].reshape(1024,10).transpose((1,0)).reshape(-1)
30 b4 = para['b4:0']
31 testX = para['testX']                                                           # 測試數據
32 testY = para['testY']
33 
34 # 建立網絡
35 batchSize = 64
36 data = network.add_input("data", trt.DataType.FLOAT, (batchSize, 1, 28, 28))    # 輸入層,batchSize 張 1 通道 28 行 28 列
37 
38 h1 = network.add_convolution(data, 32, (5, 5), w1, b1)                          # 卷積 1,指定輸出特征數,窗口高寬,權重值(隱式轉換為 trt.Weigfhts)
39 h1.stride = (1, 1)                                                              # 外側補充指定跨步和光環
40 h1.padding = (2, 2)
41 h1Act = network.add_activation(h1.get_output(0), trt.ActivationType.RELU)       # 激活層,指定激活類型
42 
43 h1Pool = network.add_pooling(h1Act.get_output(0), trt.PoolingType.MAX, (2, 2))  # 池化層,指定池化類型,窗口高寬
44 h1Pool.stride = (2, 2)
45 h1Pool.padding = (0, 0)
46                
47 h2 = network.add_convolution(h1Pool.get_output(0), 64, (5, 5), w2, b2)          # 卷積 2
48 h2.stride = (1, 1)
49 h2.padding = (2, 2)
50 h2Act = network.add_activation(h2.get_output(0), trt.ActivationType.RELU)
51 
52 h2Pool = network.add_pooling(h1Act.get_output(0), trt.PoolingType.MAX, (2, 2))  # 池化 2
53 h2Pool.stride = (2, 2)
54 h2Pool.padding = (0, 0)
55 
56 
57 h3 = network.add_fully_connected(h2Pool.get_output(0), 1024, w3, b3)            # 全連接層,指定輸出特征數,權重值
58 h3Act = network.add_activation(h3.get_output(0), trt.ActivationType.RELU)
59 
60 h4 = network.add_fully_connected(h3Act.get_output(0), 10, w4, b4)               # 全連接層 2
61 y = network.add_softmax(h4.get_output(0))                                       # softmax 層
62 
63 network.mark_output(y.get_output(0))                                            # 指定輸出層
64 engine = builder.build_cuda_engine(network)                                     # 建立 engine
65 
66 # 申請內存
67 h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=np.float32)  
68 h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=np.float32)
69 d_input = cuda.mem_alloc(h_input.nbytes)
70 d_output = cuda.mem_alloc(h_output.nbytes)
71 
72 # 流和上下文
73 stream = cuda.Stream()
74 context = engine.create_execution_context()
75 
76 # 測試
77 print( "%s, start!" %( dt.now()) )
78 acc = 0
79 nTest = len(para['testX'])
80 for i in range(nTest // batchSize):                                             # 向下取整,尾巴可能沒測完       
81     h_input = para['testX'][i*batchSize:(i+1)*batchSize].reshape(-1,1,28,28)
82     
83     cuda.memcpy_htod_async(d_input, h_input, stream)                            # 數據拷貝
84     
85     context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)  # 執行內核
86     
87     cuda.memcpy_dtoh_async(h_output, d_output, stream)
88     
89     stream.synchronize()                                                        # 同步,否則 yy 是 全零矩陣
90     
91     yy = np.argmax(h_output.reshape(engine.get_binding_shape(1)),1).reshape(-1)
92     label = np.argmax(para['testY'][i*batchSize:(i+1)*batchSize],1)    
93     acc += np.sum( ( yy == label ).astype(np.int) )
94     
95 cuda.Context.pop()                                                              # 刪除上下文
96 print( "%s, acc = %f" %( dt.now(), acc/(len(para['testX'])) ) )

 

▶ 總結 tensorRT 的關鍵步驟(包含 engine 的讀寫,避免每次都新建 engine 浪費時間)

 1 import tensorflow as tf
 2 import tensorrt as trt
 3 import pycuda.autoinit
 4 import pycuda.driver as cuda
 5 
 6 iGpu = 0
 7 cuda.Device(iGpu).make_context()                # 設備上下文
 8 logger = trt.Logger(trt.Logger.WARNING)         # 創建 logger
 9 
10 trtFilePath = "./densenetEngine.trt"            # 讀取現成的 engine 序列文件,否則現場生成一個 engine 並序列化保存為文件
11 if os.path.isfile(trtFilePath) and not DEBUG:
12     with open(trtFilePath, 'rb') as f:
13         engineStr = f.read()
14 else:
15     builder = trt.Builder(logger)               # 創建 builder
16     builder.max_batch_size     = 64
17     builder.max_workspace_size = 200 << 20
18     builder.fp16_mode          = True           # 是否使用 float16
19     
20     network = builder.create_network()          # 創建 network
21 
22     h0 = network.add_input("h0", ...)           # 開始建網
23     
24     ...
25 
26     y = network.add_...
27     
28     network.mark_output(y.get_output(0))        # 標記輸出節點
29     
30     engine = builder.build_cuda_engine(network) # 建立 engine,最容易失敗的位置
31 
32     if engine == None:
33         print("build engine failed!")
34         return None        
35     
36     engineStr = engine.serialize()              # 創建序列化的 engine 並寫入文件中,方便下次直接取用
37     with open(trtFilePath, 'wb') as f:
38         f.write(engineStr)
39 
40 runtime = trt.Runtime(logger)                                               # 利用運行時環境讀取序列化的 engine(現場創建 engine 的可以跳過這步)
41 engine  = runtime.deserialize_cuda_engine(engineStr)
42 context = engine.create_execution_context()                                 # 創建內核上下文(區別於設備上下文)
43 stream  = cuda.Stream()                                                     # 創建流(可選)
44 
45 hIn = cuda.pagelocked_empty(engine.get_binding_shape(0), dtype=np.float32)  # 使用無初始化的頁鎖定內存,指定尺寸(隱式轉換為 trt.volume)和數據類型,也可用 np.empty 等來申請一般內存
46 hOut = cuda.pagelocked_empty(engine.get_binding_shape(1), dtype=np.float32) # engine.get_binding_shape 的 (0) 和 (1) 分別等於network 的輸入和輸出節點尺寸
47 dIn = cuda.mem_alloc(h_input.nbytes)                                        # 申請設備內存,使用主機內存的大小
48 dOut = cuda.mem_alloc(h_output.nbytes)
49 
50 cuda.memcpy_htod_async(d_input, h_input, stream)                            # 異步數據拷貝
51 #cuda.memcpy_htod(d_input, data)                                            # 非異步數據拷貝
52 context.execute_async(batchSize, bindings=[int(dIn), int(dOut)], stream_handle=stream.handle)  # 異步執行內核
53 context.execute(batchSize, bindings=[int(dIn), int(dOut)])                  # 非異步執行內核
54 cuda.memcpy_dtoh_async(hOut, dOut, stream)
55 
56 stream.synchronize()                                                        # 同步
57 
58 context = None                                                              # 清空內核上下文和 engine
59 engine  = None
60 cuda.Context.pop()                                                          # 關閉設備上下文

 

▶ 留坑,使用 convert_to_uff.py 將保存的 .pb 模型轉化為 .uff 模型,方便 tensorRT 直接加載和使用,不用再在 tensorRT 中重建。中間遇到一些問題,尚未成功。


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM