Using the TensorRT runtime directly.

Daniel
2024-12-18 12:10:45 +01:00
parent 6e8f6a22ba
commit b26ec998b3
2 changed files with 145 additions and 19 deletions

View File

@@ -49,6 +49,28 @@ python3 /RapidPoseTriangulation/extras/mmdeploy/add_extra_steps.py
 <br>
+## TensorRT
+Run this directly in the inference container (the TensorRT version used to build the engine must match the one used at inference time):
+```bash
+export withFP16="_fp16"
+trtexec --fp16 \
+    --onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320"$withFP16"_extra-steps.onnx \
+    --saveEngine=end2end.engine
+mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3"$withFP16"_extra-steps.engine
+trtexec --fp16 \
+    --onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288"$withFP16"_extra-steps.onnx \
+    --saveEngine=end2end.engine
+mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3"$withFP16"_extra-steps.engine
+```
+<br>
 ## Benchmark
 ```bash
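The README note above hinges on that version match. As a quick sanity check (a minimal sketch, assuming the engine path produced by the trtexec step above), the Python runtime can try to deserialize the engine; deserialize_cuda_engine returns None when the engine cannot be loaded, for example on a builder/runtime version mismatch:
```python
# Minimal sketch: confirm the exported engine deserializes with the TensorRT
# runtime in this container (returns None on version mismatch or corruption).
import tensorrt as trt

engine_path = ("/RapidPoseTriangulation/extras/mmdeploy/exports/"
               "rtmdet-nano_1x320x320x3_fp16_extra-steps.engine")
logger = trt.Logger(trt.Logger.WARNING)
with open(engine_path, "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
print("TensorRT", trt.__version__, "- engine OK:", engine is not None)
```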

View File

@@ -1,19 +1,45 @@
 import math
+import os
 from abc import ABC, abstractmethod
 from typing import List
 import cv2
 import numpy as np
 import onnxruntime as ort
+import pycuda.autoinit  # noqa: F401
+import pycuda.driver as cuda
+import tensorrt as trt
 from tqdm import tqdm
 # ==================================================================================================
 class BaseModel(ABC):
-    def __init__(
-        self, model_path: str, warmup: int, usetrt: bool = True, usegpu: bool = True
-    ):
+    def __init__(self, model_path: str, warmup: int):
+        self.model_path = model_path
+        self.runtime = ""
+        if not os.path.exists(model_path):
+            raise FileNotFoundError("File not found:", model_path)
+        if model_path.endswith(".engine"):
+            self.init_trt_engine(model_path)
+            self.runtime = "trt"
+        elif model_path.endswith(".onnx"):
+            self.init_onnxruntime(model_path)
+            self.runtime = "ort"
+        else:
+            raise ValueError("Unsupported model format:", model_path)
+        if warmup > 0:
+            print("Running warmup for '{}' ...".format(self.__class__.__name__))
+            self.warmup(warmup // 2)
+            self.warmup(warmup // 2)
+
+    def init_onnxruntime(self, model_path):
+        usetrt = True
+        usegpu = True
         self.opt = ort.SessionOptions()
         providers = ort.get_available_providers()
         # ort.set_default_logger_severity(1)
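The rest of init_onnxruntime is truncated in this view. For orientation, here is an assumed sketch (not the repo's exact code) of how usetrt/usegpu flags are commonly mapped onto onnxruntime execution providers; select_providers is an illustrative name:
```python
# Assumed sketch (the real provider-selection code is cut off in this diff):
# a common way to map usetrt/usegpu flags onto onnxruntime providers.
import onnxruntime as ort

def select_providers(usetrt: bool, usegpu: bool) -> list:
    available = ort.get_available_providers()
    providers = []
    if usetrt and "TensorrtExecutionProvider" in available:
        providers.append("TensorrtExecutionProvider")
    if usegpu and "CUDAExecutionProvider" in available:
        providers.append("CUDAExecutionProvider")
    providers.append("CPUExecutionProvider")  # always available as fallback
    return providers
```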
@@ -49,8 +75,50 @@ class BaseModel(ABC):
                 raise ValueError("Undefined input type:", input_type)
             self.input_types.append(itype)
-        if warmup > 0:
-            self.warmup(warmup)
+    def init_trt_engine(self, engine_path):
+        # https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_topics
+        # https://stackoverflow.com/a/79076885
+        self.trt_logger = trt.Logger(trt.Logger.WARNING)
+        with open(engine_path, "rb") as f:
+            runtime = trt.Runtime(self.trt_logger)
+            self.engine = runtime.deserialize_cuda_engine(f.read())
+        self.context = self.engine.create_execution_context()
+        self.stream = cuda.Stream()
+        self.inputs, self.outputs, self.bindings = [], [], []
+        self.input_names = []
+        self.input_shapes = []
+        self.input_types = []
+        for i in range(self.engine.num_io_tensors):
+            tensor_name = self.engine.get_tensor_name(i)
+            shape = self.engine.get_tensor_shape(tensor_name)
+            dtype = trt.nptype(self.engine.get_tensor_dtype(tensor_name))
+            if -1 in shape:
+                print("WARNING: Replacing dynamic shape with fixed for:", tensor_name)
+                shape[list(shape).index(-1)] = 10
+            # Allocate host and device buffers
+            size = trt.volume(shape)
+            host_mem = cuda.pagelocked_empty(size, dtype)
+            device_mem = cuda.mem_alloc(host_mem.nbytes)
+            self.bindings.append(int(device_mem))
+            # Append to the appropriate input/output list
+            if self.engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
+                self.inputs.append((host_mem, device_mem, shape))
+                self.input_names.append(tensor_name)
+                self.input_shapes.append(shape)
+                self.input_types.append(dtype)
+            else:
+                self.outputs.append((host_mem, device_mem, shape))
+            # Set tensor address
+            self.context.set_tensor_address(
+                self.engine.get_tensor_name(i), self.bindings[i]
+            )
     @abstractmethod
     def preprocess(self, **kwargs):
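For reference, the introspection calls used by init_trt_engine can be pulled into a standalone helper. A sketch against the same TensorRT Python API; describe_engine is an illustrative name, not something from the repo:
```python
# Sketch of a standalone helper (illustrative, not in the repo) mirroring
# the IO-tensor loop in init_trt_engine above.
import tensorrt as trt

def describe_engine(engine: "trt.ICudaEngine") -> None:
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        mode = engine.get_tensor_mode(name)    # trt.TensorIOMode.INPUT or .OUTPUT
        shape = engine.get_tensor_shape(name)  # -1 marks a dynamic dimension
        dtype = trt.nptype(engine.get_tensor_dtype(name))
        print(f"{mode.name:>6} {name}: shape={tuple(shape)}, dtype={dtype}")
```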
@@ -63,14 +131,13 @@ class BaseModel(ABC):
     def warmup(self, epoch: int):
         np.random.seed(42)
-        print("Running warmup for '{}' ...".format(self.__class__.__name__))
         for _ in tqdm(range(epoch)):
             inputs = {}
             for i in range(len(self.input_names)):
                 iname = self.input_names[i]
                 if "image" in iname:
-                    ishape = self.input_shapes[i]
+                    ishape = list(self.input_shapes[i])
                     if "batch_size" in ishape:
                         if "TensorrtExecutionProvider" in self.providers:
                             # Using different image sizes for TensorRT warmup takes too long
@@ -101,15 +168,55 @@ class BaseModel(ABC):
                     tensor = tensor.astype(self.input_types[i])
                 inputs[iname] = tensor
-            self.session.run(None, inputs)
+            self.call_model(list(inputs.values()))

-    def __call__(self, **kwargs):
-        tensor = self.preprocess(**kwargs)
+    def call_model_ort(self, tensor):
         inputs = {}
         for i in range(len(self.input_names)):
             iname = self.input_names[i]
             inputs[iname] = tensor[i]
         result = self.session.run(None, inputs)
+        return result
+
+    def call_model_trt(self, tensor):
+        # Transfer input data to device
+        for i, input_data in enumerate(tensor):
+            np.copyto(self.inputs[i][0], input_data.ravel())
+            cuda.memcpy_htod_async(self.inputs[i][1], self.inputs[i][0], self.stream)
+        # Empty the output buffers
+        for i in range(len(self.outputs)):
+            self.outputs[i][0].fill(0)
+            cuda.memcpy_htod_async(self.outputs[i][1], self.outputs[i][0], self.stream)
+        # Run inference
+        self.context.execute_async_v3(stream_handle=self.stream.handle)
+        # Transfer predictions back
+        for i in range(len(self.outputs)):
+            cuda.memcpy_dtoh_async(self.outputs[i][0], self.outputs[i][1], self.stream)
+        # Synchronize the stream
+        self.stream.synchronize()
+        # Un-flatten the outputs
+        outputs = []
+        for i in range(len(self.outputs)):
+            output = self.outputs[i][0].reshape(self.outputs[i][2])
+            outputs.append(output)
+        return outputs
+
+    def call_model(self, tensor):
+        if self.runtime == "trt":
+            result = self.call_model_trt(tensor)
+        elif self.runtime == "ort":
+            result = self.call_model_ort(tensor)
+        return result
+
+    def __call__(self, **kwargs):
+        tensor = self.preprocess(**kwargs)
+        result = self.call_model(tensor)
         output = self.postprocess(result=result, **kwargs)
         return output
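To see the new dispatch end to end: __call__ feeds the preprocess output into call_model, which routes to call_model_trt or call_model_ort depending on the suffix-derived self.runtime. A hypothetical minimal subclass (IdentityModel and its image keyword are illustrative, not from the repo):
```python
# Hypothetical minimal subclass: only preprocess/postprocess are supplied,
# so the same class runs either a .engine or a .onnx file.
import numpy as np

class IdentityModel(BaseModel):
    def preprocess(self, image=None, **kwargs):
        # call_model expects one array per model input, in input_names order
        return [image.astype(self.input_types[0])]

    def postprocess(self, result=None, **kwargs):
        return result[0]

model = IdentityModel("model.engine", warmup=0)  # or "model.onnx"
dummy = np.zeros(tuple(model.input_shapes[0]), dtype=np.float32)
out = model(image=dummy)
```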
@@ -416,11 +523,6 @@ class TopDown:
         box_min_area: float,
         warmup: int = 30,
     ):
-        if (not det_model_path.endswith(".onnx")) or (
-            not pose_model_path.endswith(".onnx")
-        ):
-            raise ValueError("Only ONNX models are supported.")
         self.det_model = RTMDet(
             det_model_path, box_conf_threshold, box_min_area, warmup
         )
@@ -439,17 +541,19 @@
 def load_model(min_bbox_score=0.3, min_bbox_area=0.1 * 0.1):
-    print("Loading onnx model ...")
+    print("Loading 2D model ...")
     model = TopDown(
-        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
-        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
+        # "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
+        # "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
+        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3_fp16_extra-steps.engine",
+        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3_fp16_extra-steps.engine",
         box_conf_threshold=min_bbox_score,
         box_min_area=min_bbox_area,
         warmup=30,
     )
-    print("Loaded onnx model")
+    print("Loaded 2D model")
     return model
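With the defaults above, load_model() now expects the two .engine files from the trtexec step to exist at the hard-coded export paths; switching back to ONNX only means swapping the commented paths back in. A usage sketch (assuming RTMDet derives from BaseModel, as the diff suggests):
```python
# Usage sketch: runtime selection is transparent to the caller, since
# BaseModel picks "trt" or "ort" from the model file suffix.
model = load_model(min_bbox_score=0.3, min_bbox_area=0.1 * 0.1)
print(model.det_model.runtime)  # "trt" for the .engine defaults above
```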