From b26ec998b3bd53e09fba910a2ae7fa9a0d477bd5 Mon Sep 17 00:00:00 2001
From: Daniel
Date: Wed, 18 Dec 2024 12:10:45 +0100
Subject: [PATCH] Using tensorrt runtime directly.

---
 extras/mmdeploy/README.md    |  22 ++++++
 scripts/utils_2d_pose_ort.py | 142 ++++++++++++++++++++++++++++++-----
 2 files changed, 145 insertions(+), 19 deletions(-)

diff --git a/extras/mmdeploy/README.md b/extras/mmdeploy/README.md
index 84dbdd4..2abffa9 100644
--- a/extras/mmdeploy/README.md
+++ b/extras/mmdeploy/README.md
@@ -49,6 +49,28 @@ python3 /RapidPoseTriangulation/extras/mmdeploy/add_extra_steps.py
+## TensorRT
+
+Run this directly in the inference container (the TensorRT version used to build an engine must match the version used to run it):
+
+```bash
+export withFP16="_fp16"
+
+trtexec --fp16 \
+    --onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320"$withFP16"_extra-steps.onnx \
+    --saveEngine=end2end.engine
+
+mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3"$withFP16"_extra-steps.engine
+
+trtexec --fp16 \
+    --onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288"$withFP16"_extra-steps.onnx \
+    --saveEngine=end2end.engine
+
+mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3"$withFP16"_extra-steps.engine
+```
+
+
+
 
 ## Benchmark
 
 ```bash
diff --git a/scripts/utils_2d_pose_ort.py b/scripts/utils_2d_pose_ort.py
index 8c336f0..3287412 100644
--- a/scripts/utils_2d_pose_ort.py
+++ b/scripts/utils_2d_pose_ort.py
@@ -1,19 +1,45 @@
 import math
+import os
 from abc import ABC, abstractmethod
 from typing import List
 
 import cv2
 import numpy as np
 import onnxruntime as ort
+import pycuda.autoinit  # noqa: F401
+import pycuda.driver as cuda
+import tensorrt as trt
 from tqdm import tqdm
 
 
 # ==================================================================================================
 class BaseModel(ABC):
-    def __init__(
-        self, model_path: str, warmup: int, usetrt: bool = True, usegpu: bool = True
-    ):
+    def __init__(self, model_path: str, warmup: int):
+        self.model_path = model_path
+        self.runtime = ""
+
+        if not os.path.exists(model_path):
+            raise FileNotFoundError("File not found:", model_path)
+
+        if model_path.endswith(".engine"):
+            self.init_trt_engine(model_path)
+            self.runtime = "trt"
+        elif model_path.endswith(".onnx"):
+            self.init_onnxruntime(model_path)
+            self.runtime = "ort"
+        else:
+            raise ValueError("Unsupported model format:", model_path)
+
+        if warmup > 0:
+            print("Running warmup for '{}' ...".format(self.__class__.__name__))
+            self.warmup(warmup // 2)
+            self.warmup(warmup // 2)
+
+    def init_onnxruntime(self, model_path):
+        usetrt = True
+        usegpu = True
+
         self.opt = ort.SessionOptions()
         providers = ort.get_available_providers()
         # ort.set_default_logger_severity(1)
@@ -49,8 +75,50 @@ class BaseModel(ABC):
                 raise ValueError("Undefined input type:", input_type)
             self.input_types.append(itype)
 
-        if warmup > 0:
-            self.warmup(warmup)
+    def init_trt_engine(self, engine_path):
+        # https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_topics
+        # https://stackoverflow.com/a/79076885
+
+        self.trt_logger = trt.Logger(trt.Logger.WARNING)
+        with open(engine_path, "rb") as f:
+            runtime = trt.Runtime(self.trt_logger)
+            self.engine = runtime.deserialize_cuda_engine(f.read())
+        self.context = self.engine.create_execution_context()
+        self.stream = cuda.Stream()
+
+        self.inputs, self.outputs, self.bindings = [], [], []
+        self.input_names = []
+        self.input_shapes = []
+        self.input_types = []
+
+        for i in range(self.engine.num_io_tensors):
+            tensor_name = self.engine.get_tensor_name(i)
+            shape = self.engine.get_tensor_shape(tensor_name)
+            dtype = trt.nptype(self.engine.get_tensor_dtype(tensor_name))
+
+            if -1 in shape:
+                print("WARNING: Replacing dynamic shape with fixed for:", tensor_name)
+                shape[list(shape).index(-1)] = 10
+
+            # Allocate host and device buffers
+            size = trt.volume(shape)
+            host_mem = cuda.pagelocked_empty(size, dtype)
+            device_mem = cuda.mem_alloc(host_mem.nbytes)
+            self.bindings.append(int(device_mem))
+
+            # Append to the appropriate input/output list
+            if self.engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
+                self.inputs.append((host_mem, device_mem, shape))
+                self.input_names.append(tensor_name)
+                self.input_shapes.append(shape)
+                self.input_types.append(dtype)
+            else:
+                self.outputs.append((host_mem, device_mem, shape))
+
+            # Set tensor address
+            self.context.set_tensor_address(
+                self.engine.get_tensor_name(i), self.bindings[i]
+            )
 
     @abstractmethod
     def preprocess(self, **kwargs):
@@ -63,14 +131,13 @@ class BaseModel(ABC):
 
     def warmup(self, epoch: int):
         np.random.seed(42)
-        print("Running warmup for '{}' ...".format(self.__class__.__name__))
 
         for _ in tqdm(range(epoch)):
             inputs = {}
             for i in range(len(self.input_names)):
                 iname = self.input_names[i]
                 if "image" in iname:
-                    ishape = self.input_shapes[i]
+                    ishape = list(self.input_shapes[i])
                     if "batch_size" in ishape:
                         if "TensorrtExecutionProvider" in self.providers:
                             # Using different images sizes for TensorRT warmup takes too long
@@ -101,15 +168,55 @@ class BaseModel(ABC):
                 tensor = tensor.astype(self.input_types[i])
                 inputs[iname] = tensor
 
-            self.session.run(None, inputs)
+            self.call_model(list(inputs.values()))
 
-    def __call__(self, **kwargs):
-        tensor = self.preprocess(**kwargs)
+    def call_model_ort(self, tensor):
         inputs = {}
         for i in range(len(self.input_names)):
             iname = self.input_names[i]
             inputs[iname] = tensor[i]
         result = self.session.run(None, inputs)
+        return result
+
+    def call_model_trt(self, tensor):
+        # Transfer input data to device
+        for i, input_data in enumerate(tensor):
+            np.copyto(self.inputs[i][0], input_data.ravel())
+            cuda.memcpy_htod_async(self.inputs[i][1], self.inputs[i][0], self.stream)
+
+        # Empty the output buffers
+        for i in range(len(self.outputs)):
+            self.outputs[i][0].fill(0)
+            cuda.memcpy_htod_async(self.outputs[i][1], self.outputs[i][0], self.stream)
+
+        # Run inference
+        self.context.execute_async_v3(stream_handle=self.stream.handle)
+
+        # Transfer predictions back
+        for i in range(len(self.outputs)):
+            cuda.memcpy_dtoh_async(self.outputs[i][0], self.outputs[i][1], self.stream)
+
+        # Synchronize the stream
+        self.stream.synchronize()
+
+        # Un-flatten the outputs
+        outputs = []
+        for i in range(len(self.outputs)):
+            output = self.outputs[i][0].reshape(self.outputs[i][2])
+            outputs.append(output)
+
+        return outputs
+
+    def call_model(self, tensor):
+        if self.runtime == "trt":
+            result = self.call_model_trt(tensor)
+        elif self.runtime == "ort":
+            result = self.call_model_ort(tensor)
+        return result
+
+    def __call__(self, **kwargs):
+        tensor = self.preprocess(**kwargs)
+        result = self.call_model(tensor)
         output = self.postprocess(result=result, **kwargs)
 
         return output
@@ -416,11 +523,6 @@ class TopDown:
         box_min_area: float,
         warmup: int = 30,
     ):
-        if (not det_model_path.endswith(".onnx")) or (
-            not pose_model_path.endswith(".onnx")
-        ):
-            raise ValueError("Only ONNX models are supported.")
-
         self.det_model = RTMDet(
             det_model_path, box_conf_threshold, box_min_area, warmup
         )
@@ -439,17 +541,19 @@ class TopDown:
 
 
 def load_model(min_bbox_score=0.3, min_bbox_area=0.1 * 0.1):
-    print("Loading onnx model ...")
+    print("Loading 2D model ...")
 
     model = TopDown(
-        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
-        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
+        # "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
+        # "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
+        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3_fp16_extra-steps.engine",
+        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3_fp16_extra-steps.engine",
        box_conf_threshold=min_bbox_score,
         box_min_area=min_bbox_area,
         warmup=30,
     )
 
-    print("Loaded onnx model")
+    print("Loaded 2D model")
 
     return model
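
Note for reviewers: the engines produced by the README commands are only loadable by the same TensorRT version that built them, which is why the patch builds them inside the inference container. A minimal sanity-check sketch (not part of the patch), assuming the rtmdet engine path used in `load_model` above and only the TensorRT API calls already used in this patch plus `trt.__version__`:

```python
# Illustrative sanity check: print the TensorRT version inside the inference
# container and verify that the serialized engine can be deserialized by it.
# deserialize_cuda_engine() returns None (and logs an error) when the engine
# was built with an incompatible TensorRT version.
import tensorrt as trt

engine_path = (
    "/RapidPoseTriangulation/extras/mmdeploy/exports/"
    "rtmdet-nano_1x320x320x3_fp16_extra-steps.engine"
)

print("TensorRT version:", trt.__version__)

logger = trt.Logger(trt.Logger.WARNING)
with open(engine_path, "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

assert engine is not None, "Engine incompatible with this TensorRT version, rebuild it"

# List the I/O tensors the BaseModel wrapper will bind buffers to.
for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    print(name, engine.get_tensor_mode(name), engine.get_tensor_shape(name))
```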