From 7b8d2096017fea041e6059733b70be6e3f9231f5 Mon Sep 17 00:00:00 2001
From: Daniel
Date: Wed, 18 Dec 2024 12:33:43 +0100
Subject: [PATCH] Cache onnx-trt models; they are faster than using two
 trt-engines. Removed trt-runtime again.

---
 data/.gitignore              |   1 +
 scripts/utils_2d_pose_ort.py | 109 +++++------------------------------
 2 files changed, 15 insertions(+), 95 deletions(-)

diff --git a/data/.gitignore b/data/.gitignore
index 8975752..4ef61ca 100644
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -5,3 +5,4 @@
 *.json
 !*/*.json
 testoutput/
+trt_cache/
diff --git a/scripts/utils_2d_pose_ort.py b/scripts/utils_2d_pose_ort.py
index 3287412..1944983 100644
--- a/scripts/utils_2d_pose_ort.py
+++ b/scripts/utils_2d_pose_ort.py
@@ -6,9 +6,6 @@ from typing import List
 import cv2
 import numpy as np
 import onnxruntime as ort
-import pycuda.autoinit  # noqa: F401
-import pycuda.driver as cuda
-import tensorrt as trt
 from tqdm import tqdm
 
 # ==================================================================================================
@@ -22,10 +19,7 @@ class BaseModel(ABC):
         if not os.path.exists(model_path):
             raise FileNotFoundError("File not found:", model_path)
 
-        if model_path.endswith(".engine"):
-            self.init_trt_engine(model_path)
-            self.runtime = "trt"
-        elif model_path.endswith(".onnx"):
+        if model_path.endswith(".onnx"):
             self.init_onnxruntime(model_path)
             self.runtime = "ort"
         else:
@@ -46,7 +40,15 @@ class BaseModel(ABC):
 
         self.providers = []
         if usetrt and "TensorrtExecutionProvider" in providers:
-            self.providers.append("TensorrtExecutionProvider")
+            self.providers.append(
+                (
+                    "TensorrtExecutionProvider",
+                    {
+                        "trt_engine_cache_enable": True,
+                        "trt_engine_cache_path": "/RapidPoseTriangulation/data/trt_cache/",
+                    },
+                )
+            )
         if usegpu and "CUDAExecutionProvider" in providers:
             self.providers.append("CUDAExecutionProvider")
         self.providers.append("CPUExecutionProvider")
@@ -75,51 +77,6 @@ class BaseModel(ABC):
                 raise ValueError("Undefined input type:", input_type)
             self.input_types.append(itype)
 
-    def init_trt_engine(self, engine_path):
-        # https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_topics
-        # https://stackoverflow.com/a/79076885
-
-        self.trt_logger = trt.Logger(trt.Logger.WARNING)
-        with open(engine_path, "rb") as f:
-            runtime = trt.Runtime(self.trt_logger)
-            self.engine = runtime.deserialize_cuda_engine(f.read())
-        self.context = self.engine.create_execution_context()
-        self.stream = cuda.Stream()
-
-        self.inputs, self.outputs, self.bindings = [], [], []
-        self.input_names = []
-        self.input_shapes = []
-        self.input_types = []
-
-        for i in range(self.engine.num_io_tensors):
-            tensor_name = self.engine.get_tensor_name(i)
-            shape = self.engine.get_tensor_shape(tensor_name)
-            dtype = trt.nptype(self.engine.get_tensor_dtype(tensor_name))
-
-            if -1 in shape:
-                print("WARNING: Replacing dynamic shape with fixed for:", tensor_name)
-                shape[list(shape).index(-1)] = 10
-
-            # Allocate host and device buffers
-            size = trt.volume(shape)
-            host_mem = cuda.pagelocked_empty(size, dtype)
-            device_mem = cuda.mem_alloc(host_mem.nbytes)
-            self.bindings.append(int(device_mem))
-
-            # Append to the appropriate input/output list
-            if self.engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
-                self.inputs.append((host_mem, device_mem, shape))
-                self.input_names.append(tensor_name)
-                self.input_shapes.append(shape)
-                self.input_types.append(dtype)
-            else:
-                self.outputs.append((host_mem, device_mem, shape))
-
-            # Set tensor address
-            self.context.set_tensor_address(
-                self.engine.get_tensor_name(i), self.bindings[i]
-            )
-
     @abstractmethod
     def preprocess(self, **kwargs):
         pass
@@ -168,7 +125,7 @@ class BaseModel(ABC):
                 tensor = tensor.astype(self.input_types[i])
             inputs[iname] = tensor
 
-        self.call_model(list(inputs.values()))
+        self.call_model_ort(list(inputs.values()))
 
     def call_model_ort(self, tensor):
         inputs = {}
@@ -178,45 +135,9 @@ class BaseModel(ABC):
         result = self.session.run(None, inputs)
         return result
 
-    def call_model_trt(self, tensor):
-        # Transfer input data to device
-        for i, input_data in enumerate(tensor):
-            np.copyto(self.inputs[i][0], input_data.ravel())
-            cuda.memcpy_htod_async(self.inputs[i][1], self.inputs[i][0], self.stream)
-
-        # Empty the output buffers
-        for i in range(len(self.outputs)):
-            self.outputs[i][0].fill(0)
-            cuda.memcpy_htod_async(self.outputs[i][1], self.outputs[i][0], self.stream)
-
-        # Run inference
-        self.context.execute_async_v3(stream_handle=self.stream.handle)
-
-        # Transfer predictions back
-        for i in range(len(self.outputs)):
-            cuda.memcpy_dtoh_async(self.outputs[i][0], self.outputs[i][1], self.stream)
-
-        # Synchronize the stream
-        self.stream.synchronize()
-
-        # Un-flatten the outputs
-        outputs = []
-        for i in range(len(self.outputs)):
-            output = self.outputs[i][0].reshape(self.outputs[i][2])
-            outputs.append(output)
-
-        return outputs
-
-    def call_model(self, tensor):
-        if self.runtime == "trt":
-            result = self.call_model_trt(tensor)
-        elif self.runtime == "ort":
-            result = self.call_model_ort(tensor)
-        return result
-
     def __call__(self, **kwargs):
         tensor = self.preprocess(**kwargs)
-        result = self.call_model(tensor)
+        result = self.call_model_ort(tensor)
         output = self.postprocess(result=result, **kwargs)
         return output
 
@@ -544,10 +465,8 @@ def load_model(min_bbox_score=0.3, min_bbox_area=0.1 * 0.1):
     print("Loading 2D model ...")
 
     model = TopDown(
-        # "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
-        # "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
-        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3_fp16_extra-steps.engine",
-        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3_fp16_extra-steps.engine",
+        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
+        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
         box_conf_threshold=min_bbox_score,
         box_min_area=min_bbox_area,
         warmup=30,
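Background on the change: instead of driving two hand-built TensorRT engines through pycuda, TensorRT is now used indirectly through ONNX Runtime's TensorrtExecutionProvider, and "trt_engine_cache_enable" writes the engine built on the first run to disk so it is reused instead of being rebuilt at every startup. Below is a minimal standalone sketch of how such a provider list is consumed; the InferenceSession call itself sits outside the hunks above, so the session setup shown here is an assumption based on standard onnxruntime usage, not code from this patch.

    import onnxruntime as ort

    # TensorRT first (with engine caching enabled, as in the patch above),
    # then CUDA and CPU as fallbacks for any nodes the TensorRT provider
    # cannot handle.
    providers = [
        (
            "TensorrtExecutionProvider",
            {
                "trt_engine_cache_enable": True,
                "trt_engine_cache_path": "/RapidPoseTriangulation/data/trt_cache/",
            },
        ),
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ]

    # Hypothetical session setup; the model path is taken from load_model() above.
    session = ort.InferenceSession(
        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
        providers=providers,
    )

The cache path matches the trt_cache/ entry added to data/.gitignore in the first hunk. Note that ONNX Runtime's documentation advises clearing this cache whenever the model, the ONNX Runtime or TensorRT version, or the GPU changes, since cached engines are specific to that combination.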