diff --git a/extras/mmdeploy/README.md b/extras/mmdeploy/README.md
index 84dbdd4..2abffa9 100644
--- a/extras/mmdeploy/README.md
+++ b/extras/mmdeploy/README.md
@@ -49,6 +49,28 @@ python3 /RapidPoseTriangulation/extras/mmdeploy/add_extra_steps.py
+## TensorRT
+
+Run this directly inside the inference container, since a TensorRT engine must be built with the same TensorRT version that will later run it.
+
+```bash
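+# Note: to build FP32 engines instead, set withFP16="" and drop the --fp16 flags (assumes matching FP32 ONNX exports exist)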
+export withFP16="_fp16"
+
+trtexec --fp16 \
+ --onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320"$withFP16"_extra-steps.onnx \
+ --saveEngine=end2end.engine
+
+mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3"$withFP16"_extra-steps.engine
+
+trtexec --fp16 \
+ --onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288"$withFP16"_extra-steps.onnx \
+ --saveEngine=end2end.engine
+
+mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3"$withFP16"_extra-steps.engine
+```
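+
+As an optional sanity check, trtexec can also load and time a finished engine (shown for the detector; the pose engine works the same way):
+
+```bash
+trtexec --loadEngine=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3"$withFP16"_extra-steps.engine
+```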
+
## Benchmark
```bash
diff --git a/scripts/utils_2d_pose_ort.py b/scripts/utils_2d_pose_ort.py
index 8c336f0..3287412 100644
--- a/scripts/utils_2d_pose_ort.py
+++ b/scripts/utils_2d_pose_ort.py
@@ -1,19 +1,45 @@
import math
+import os
from abc import ABC, abstractmethod
from typing import List
import cv2
import numpy as np
import onnxruntime as ort
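+# pycuda.autoinit creates and activates a CUDA context on import (needed for the pycuda.driver calls below)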
+import pycuda.autoinit # noqa: F401
+import pycuda.driver as cuda
+import tensorrt as trt
from tqdm import tqdm
# ==================================================================================================
class BaseModel(ABC):
- def __init__(
- self, model_path: str, warmup: int, usetrt: bool = True, usegpu: bool = True
- ):
+ def __init__(self, model_path: str, warmup: int):
+ self.model_path = model_path
+ self.runtime = ""
+
+ if not os.path.exists(model_path):
+            raise FileNotFoundError("File not found: {}".format(model_path))
+
+ if model_path.endswith(".engine"):
+ self.init_trt_engine(model_path)
+ self.runtime = "trt"
+ elif model_path.endswith(".onnx"):
+ self.init_onnxruntime(model_path)
+ self.runtime = "ort"
+ else:
+ raise ValueError("Unsupported model format:", model_path)
+
+ if warmup > 0:
+ print("Running warmup for '{}' ...".format(self.__class__.__name__))
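+            # warmup() reseeds NumPy on each call, so the two halves run over identical random inputs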
+ self.warmup(warmup // 2)
+ self.warmup(warmup // 2)
+
+ def init_onnxruntime(self, model_path):
+ usetrt = True
+ usegpu = True
+
self.opt = ort.SessionOptions()
providers = ort.get_available_providers()
# ort.set_default_logger_severity(1)
@@ -49,8 +75,50 @@ class BaseModel(ABC):
raise ValueError("Undefined input type:", input_type)
self.input_types.append(itype)
- if warmup > 0:
- self.warmup(warmup)
+ def init_trt_engine(self, engine_path):
+ # https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_topics
+ # https://stackoverflow.com/a/79076885
+
+ self.trt_logger = trt.Logger(trt.Logger.WARNING)
+ with open(engine_path, "rb") as f:
+ runtime = trt.Runtime(self.trt_logger)
+ self.engine = runtime.deserialize_cuda_engine(f.read())
+ self.context = self.engine.create_execution_context()
+ self.stream = cuda.Stream()
+
+ self.inputs, self.outputs, self.bindings = [], [], []
+ self.input_names = []
+ self.input_shapes = []
+ self.input_types = []
+
+ for i in range(self.engine.num_io_tensors):
+ tensor_name = self.engine.get_tensor_name(i)
+ shape = self.engine.get_tensor_shape(tensor_name)
+ dtype = trt.nptype(self.engine.get_tensor_dtype(tensor_name))
+
+            shape = list(shape)
+            if -1 in shape:
+                print("WARNING: Replacing dynamic shape with fixed for:", tensor_name)
+                shape = [10 if dim == -1 else dim for dim in shape]
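+                # NOTE: 10 is an arbitrary fallback size; a truly dynamic engine would also need context.set_input_shape() before execution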
+
+ # Allocate host and device buffers
+ size = trt.volume(shape)
+ host_mem = cuda.pagelocked_empty(size, dtype)
+ device_mem = cuda.mem_alloc(host_mem.nbytes)
+ self.bindings.append(int(device_mem))
+
+ # Append to the appropriate input/output list
+ if self.engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
+ self.inputs.append((host_mem, device_mem, shape))
+ self.input_names.append(tensor_name)
+ self.input_shapes.append(shape)
+ self.input_types.append(dtype)
+ else:
+ self.outputs.append((host_mem, device_mem, shape))
+
+ # Set tensor address
+            self.context.set_tensor_address(tensor_name, self.bindings[i])
@abstractmethod
def preprocess(self, **kwargs):
@@ -63,14 +131,13 @@ class BaseModel(ABC):
def warmup(self, epoch: int):
np.random.seed(42)
- print("Running warmup for '{}' ...".format(self.__class__.__name__))
for _ in tqdm(range(epoch)):
inputs = {}
for i in range(len(self.input_names)):
iname = self.input_names[i]
if "image" in iname:
- ishape = self.input_shapes[i]
+ ishape = list(self.input_shapes[i])
if "batch_size" in ishape:
if "TensorrtExecutionProvider" in self.providers:
# Using different images sizes for TensorRT warmup takes too long
@@ -101,15 +168,55 @@ class BaseModel(ABC):
tensor = tensor.astype(self.input_types[i])
inputs[iname] = tensor
- self.session.run(None, inputs)
+ self.call_model(list(inputs.values()))
- def __call__(self, **kwargs):
- tensor = self.preprocess(**kwargs)
+ def call_model_ort(self, tensor):
inputs = {}
for i in range(len(self.input_names)):
iname = self.input_names[i]
inputs[iname] = tensor[i]
result = self.session.run(None, inputs)
+ return result
+
+ def call_model_trt(self, tensor):
+ # Transfer input data to device
+ for i, input_data in enumerate(tensor):
+ np.copyto(self.inputs[i][0], input_data.ravel())
+ cuda.memcpy_htod_async(self.inputs[i][1], self.inputs[i][0], self.stream)
+
+        # Zero the output buffers so stale results from a previous call cannot leak through
+ for i in range(len(self.outputs)):
+ self.outputs[i][0].fill(0)
+ cuda.memcpy_htod_async(self.outputs[i][1], self.outputs[i][0], self.stream)
+
+ # Run inference
+ self.context.execute_async_v3(stream_handle=self.stream.handle)
+
+ # Transfer predictions back
+ for i in range(len(self.outputs)):
+ cuda.memcpy_dtoh_async(self.outputs[i][0], self.outputs[i][1], self.stream)
+
+ # Synchronize the stream
+ self.stream.synchronize()
+
+ # Un-flatten the outputs
+ outputs = []
+ for i in range(len(self.outputs)):
+ output = self.outputs[i][0].reshape(self.outputs[i][2])
+ outputs.append(output)
+
+ return outputs
+
+    def call_model(self, tensor):
+        if self.runtime == "trt":
+            result = self.call_model_trt(tensor)
+        elif self.runtime == "ort":
+            result = self.call_model_ort(tensor)
+        else:
+            raise ValueError("Unknown runtime:", self.runtime)
+        return result
+
+ def __call__(self, **kwargs):
+ tensor = self.preprocess(**kwargs)
+ result = self.call_model(tensor)
output = self.postprocess(result=result, **kwargs)
return output
@@ -416,11 +523,6 @@ class TopDown:
box_min_area: float,
warmup: int = 30,
):
- if (not det_model_path.endswith(".onnx")) or (
- not pose_model_path.endswith(".onnx")
- ):
- raise ValueError("Only ONNX models are supported.")
-
self.det_model = RTMDet(
det_model_path, box_conf_threshold, box_min_area, warmup
)
@@ -439,17 +541,19 @@ class TopDown:
def load_model(min_bbox_score=0.3, min_bbox_area=0.1 * 0.1):
- print("Loading onnx model ...")
+ print("Loading 2D model ...")
model = TopDown(
- "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
- "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
+ # "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
+ # "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
+ "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3_fp16_extra-steps.engine",
+ "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3_fp16_extra-steps.engine",
box_conf_threshold=min_bbox_score,
box_min_area=min_bbox_area,
warmup=30,
)
- print("Loaded onnx model")
+ print("Loaded 2D model")
return model