一、运行环境
系统:Ubuntu 18.04.4
CUDA:cuda_11.0.2_450.51.05_linux
cuDNN:cudnn-11.1-linux-x64-v8.0.5.39
显卡驱动版本:450.80.02
TensorRT:TensorRT-8.4.0.6.Linux.x86_64-gnu.cuda-11.6.cudnn8.3
二、安装CUDA环境
三、安装TensorRT
1.下载地址: https://developer.nvidia.com/nvidia-tensorrt-8x-download,一定要下载TAR版本的
2.安装
tar zxvf TensorRT-8.4.0.6.Linux.x86_64-gnu.cuda-11.6.cudnn8.3.tar.gz cd TensorRT-8.4.0.6/python # 根据自己的python版本选择 pip install tensorrt-8.4.0.6-cp37-none-linux_x86_64.whl cd ../graphsurgeon pip install graphsurgeon-0.4.5-py2.py3-none-any.whl
3.配置环境变量,将/data/setup/TensorRT-8.4.0.6/lib加入环境变量
vi ~/.bashrc
四、代码验证
我用一个简单的facenet做例子,将pytorch转ONNX再转TensorRT,在验证的时候顺便跑了一下速度,可以看到ONNX速度比pytorch快一倍,TensorRT比ONNX快一倍,好像TensorRT没有传的这么神,我想应该还可以优化。
import torch from torch.autograd import Variable import onnx import traceback import os import tensorrt as trt from torch import nn # import utils.tensortrt_util as trtUtil # import pycuda.autoinit import pycuda.driver as cuda import cv2 import numpy as np import onnxruntime import time from nets.facenet import Facenet print(torch.__version__) print(onnx.__version__) def torch2onnx(src_path, target_path): ''' pytorch转换onnx :param src_path: :param target_path: :return: ''' input_name = ['input'] output_name = ['output'] # input = Variable(torch.randn(1, 3, 32, 32)).cuda() # model = torchvision.models.resnet18(pretrained=True).cuda() input = Variable(torch.randn(1, 3, 160, 160)) model = Facenet(backbone="inception_resnetv1", mode="predict").eval() state_dict = torch.load(src_path, map_location=torch.device('cuda')) for s_dict in state_dict: print(s_dict) model.load_state_dict(state_dict, strict=False) # torch.onnx.export(model, input, target_path, input_names=input_name, output_names=output_name, verbose=True, # dynamic_axes={'input' : {0 : 'batch_size'}, # 'output' : {0 : 'batch_size'}}) torch.onnx.export(model, input, target_path, input_names=input_name, output_names=output_name, verbose=True) test = onnx.load(target_path) onnx.checker.check_model(test) print('run success:', target_path) def run_onnx(model_path): ''' 验证onnx :param model_path: :return: ''' onnx_model = onnxruntime.InferenceSession(model_path, providers=onnxruntime.get_available_providers()) # onnx_model.get_modelmeta() img = cv2.imread(r'img/002.jpg') img = cv2.resize(img, (160, 160)) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img = np.transpose(img, (2, 1, 0)) img = img[np.newaxis, :, :, :] img = img / 255. img = img.astype(np.float32) img = torch.from_numpy(img) t = time.time() tt = 0 # for i in range(1): # img = np.random.rand(1, 3, 160, 160).astype(np.float32) # # img = torch.rand((1, 3, 224, 224)).cuda() # results = onnx_model.run(["output"], {"input": img, 'batch'})Z img = np.random.rand(1, 3, 160, 160).astype(np.float32) # img = torch.rand((1, 3, 224, 224)).cuda() results = onnx_model.run(["output"], {"input": img}) print('cost:', time.time() - t) for i in range(5000): img = np.random.rand(1, 3, 160, 160).astype(np.float32) t1 = time.time() results = onnx_model.run(["output"], {"input": img}) tt += time.time() - t1 # predict = torch.from_numpy(results[0]) print('onnx cost:', time.time() - t, tt) # print("predict:", results) def run_torch(src_path): model = Facenet(backbone="inception_resnetv1", mode="predict").eval() state_dict = torch.load(src_path, map_location=torch.device('cuda')) # for s_dict in state_dict: # print(s_dict) model.load_state_dict(state_dict, strict=False) model = model.eval() model = model.cuda() img = cv2.imread(r'img/002.jpg') img = cv2.resize(img, (160, 160)) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img = np.transpose(img, (2, 1, 0)) img = img[np.newaxis, :, :, :] img = img / 255. img = img.astype(np.float32) img = torch.from_numpy(img) t = time.time() tt = 0 for i in range(1): img = torch.rand((1, 3, 160, 160)).cuda() results = model(img) print('cost:', time.time() - t) for i in range(5000): img = torch.rand((1, 3, 160, 160)).cuda() t1 = time.time() results = model(img) tt += time.time() - t1 print('torch cost:', time.time() - t, tt) # print("predict:", results) def onnx2rt(onnx_file_path, engine_file_path): ''' ONNX转换TensorRT :param onnx_file_path: onnx文件路径 :param engine_file_path: TensorRT输出文件路径 :return: engine ''' # 打印日志 G_LOGGER = trt.Logger(trt.Logger.WARNING) explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) print('explicit_batch:', explicit_batch) with trt.Builder(G_LOGGER) as builder, builder.create_network(explicit_batch) as network, trt.OnnxParser(network, G_LOGGER) as parser: builder.max_batch_size = 100 builder.max_workspace_size = 1 << 20 print('Loading ONNX file from path {}...'.format(onnx_file_path)) with open(onnx_file_path, 'rb') as model: print('Beginning ONNX file parsing') parser.parse(model.read()) print('Completed parsing of ONNX file') print('Building an engine from file {}; this may take a while...'.format(onnx_file_path)) engine = builder.build_engine(network) print("Completed creating Engine") # 保存计划文件 with open(engine_file_path, "wb") as f: f.write(engine.serialize()) return engine def check_trt(model_path, image_size): # 必须导入包,import pycuda.autoinit,否则cuda.Stream()报错 import pycuda.autoinit """ 验证TensorRT结果 """ print('[Info] model_path: {}'.format(model_path)) img_shape = (1, 3, image_size, image_size) print('[Info] img_shape: {}'.format(img_shape)) trt_logger = trt.Logger(trt.Logger.WARNING) trt_path = model_path # TRT模型路径 with open(trt_path, 'rb') as f, trt.Runtime(trt_logger) as runtime: engine = runtime.deserialize_cuda_engine(f.read()) for binding in engine: binding_idx = engine.get_binding_index(binding) size = engine.get_binding_shape(binding_idx) dtype = trt.nptype(engine.get_binding_dtype(binding)) print("[Info] binding: {}, binding_idx: {}, size: {}, dtype: {}" .format(binding, binding_idx, size, dtype)) t = time.time() tt = 0 tt1 = 0 with engine.create_execution_context() as context: for i in range(5000): input_image = np.random.randn(*img_shape).astype(np.float32) # 图像尺寸 t1 = time.time() input_image = np.ascontiguousarray(input_image) tt1 += time.time() - t1 # print('[Info] input_image: {}'.format(input_image.shape)) stream = cuda.Stream() bindings = [0] * len(engine) for binding in engine: idx = engine.get_binding_index(binding) if engine.binding_is_input(idx): input_memory = cuda.mem_alloc(input_image.nbytes) bindings[idx] = int(input_memory) cuda.memcpy_htod_async(input_memory, input_image, stream) else: dtype = trt.nptype(engine.get_binding_dtype(binding)) shape = context.get_binding_shape(idx) output_buffer = np.empty(shape, dtype=dtype) output_buffer = np.ascontiguousarray(output_buffer) output_memory = cuda.mem_alloc(output_buffer.nbytes) bindings[idx] = int(output_memory) context.execute_async_v2(bindings, stream.handle) stream.synchronize() cuda.memcpy_dtoh(output_buffer, output_memory) tt += time.time() - t1 print('trt cost:', time.time() - t, tt, tt1) # print("[Info] output_buffer: {}".format(output_buffer)) if __name__ == '__main__': torch2onnx(r"model_data/facenet_inception_resnetv1.pth", r"model_data/facenet_inception_resnetv1.onnx") onnx2rt(r"model_data/facenet_inception_resnetv1.onnx", r"model_data/facenet_inception_resnetv1.trt") run_onnx(r"model_data/facenet_inception_resnetv1.onnx") run_torch(r"model_data/facenet_inception_resnetv1.pth") check_trt(r"model_data/facenet_inception_resnetv1.trt", 160)
运行结果:ONNX速度比pytorch快一倍,TensorRT比ONNX快一倍
torch cost: 95.99401450157166 onnx cost: 56.98542881011963 trt cost: 26.91579008102417