Using tensorrt runtime directly.
This commit is contained in:
@ -49,6 +49,28 @@ python3 /RapidPoseTriangulation/extras/mmdeploy/add_extra_steps.py
|
|||||||
|
|
||||||
<br>
|
<br>
|
||||||
|
|
||||||
|
## TensorRT
|
||||||
|
|
||||||
|
Run this directly in the inference container, because the TensorRT version that builds the engine must be the same as the one that loads it at inference time.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export withFP16="_fp16"
|
||||||
|
|
||||||
|
trtexec --fp16 \
|
||||||
|
--onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320"$withFP16"_extra-steps.onnx \
|
||||||
|
--saveEngine=end2end.engine
|
||||||
|
|
||||||
|
mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3"$withFP16"_extra-steps.engine
|
||||||
|
|
||||||
|
trtexec --fp16 \
|
||||||
|
--onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288"$withFP16"_extra-steps.onnx \
|
||||||
|
--saveEngine=end2end.engine
|
||||||
|
|
||||||
|
mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3"$withFP16"_extra-steps.engine
|
||||||
|
```
|
||||||
|
|
||||||
|
<br>
|
||||||
|
|
||||||
## Benchmark
|
## Benchmark
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@ -1,19 +1,45 @@
|
|||||||
import math
|
import math
|
||||||
|
import os
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import onnxruntime as ort
|
import onnxruntime as ort
|
||||||
|
import pycuda.autoinit # noqa: F401
|
||||||
|
import pycuda.driver as cuda
|
||||||
|
import tensorrt as trt
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
# ==================================================================================================
|
# ==================================================================================================
|
||||||
|
|
||||||
|
|
||||||
class BaseModel(ABC):
|
class BaseModel(ABC):
|
||||||
def __init__(
|
def __init__(self, model_path: str, warmup: int):
|
||||||
self, model_path: str, warmup: int, usetrt: bool = True, usegpu: bool = True
|
self.model_path = model_path
|
||||||
):
|
self.runtime = ""
|
||||||
|
|
||||||
|
if not os.path.exists(model_path):
|
||||||
|
raise FileNotFoundError("File not found:", model_path)
|
||||||
|
|
||||||
|
if model_path.endswith(".engine"):
|
||||||
|
self.init_trt_engine(model_path)
|
||||||
|
self.runtime = "trt"
|
||||||
|
elif model_path.endswith(".onnx"):
|
||||||
|
self.init_onnxruntime(model_path)
|
||||||
|
self.runtime = "ort"
|
||||||
|
else:
|
||||||
|
raise ValueError("Unsupported model format:", model_path)
|
||||||
|
|
||||||
|
if warmup > 0:
|
||||||
|
print("Running warmup for '{}' ...".format(self.__class__.__name__))
|
||||||
|
self.warmup(warmup // 2)
|
||||||
|
self.warmup(warmup // 2)
|
||||||
|
|
||||||
|
def init_onnxruntime(self, model_path):
|
||||||
|
usetrt = True
|
||||||
|
usegpu = True
|
||||||
|
|
||||||
self.opt = ort.SessionOptions()
|
self.opt = ort.SessionOptions()
|
||||||
providers = ort.get_available_providers()
|
providers = ort.get_available_providers()
|
||||||
# ort.set_default_logger_severity(1)
|
# ort.set_default_logger_severity(1)
|
||||||
@ -49,8 +75,50 @@ class BaseModel(ABC):
|
|||||||
raise ValueError("Undefined input type:", input_type)
|
raise ValueError("Undefined input type:", input_type)
|
||||||
self.input_types.append(itype)
|
self.input_types.append(itype)
|
||||||
|
|
||||||
if warmup > 0:
|
def init_trt_engine(self, engine_path):
|
||||||
self.warmup(warmup)
|
# https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_topics
|
||||||
|
# https://stackoverflow.com/a/79076885
|
||||||
|
|
||||||
|
self.trt_logger = trt.Logger(trt.Logger.WARNING)
|
||||||
|
with open(engine_path, "rb") as f:
|
||||||
|
runtime = trt.Runtime(self.trt_logger)
|
||||||
|
self.engine = runtime.deserialize_cuda_engine(f.read())
|
||||||
|
self.context = self.engine.create_execution_context()
|
||||||
|
self.stream = cuda.Stream()
|
||||||
|
|
||||||
|
self.inputs, self.outputs, self.bindings = [], [], []
|
||||||
|
self.input_names = []
|
||||||
|
self.input_shapes = []
|
||||||
|
self.input_types = []
|
||||||
|
|
||||||
|
for i in range(self.engine.num_io_tensors):
|
||||||
|
tensor_name = self.engine.get_tensor_name(i)
|
||||||
|
shape = self.engine.get_tensor_shape(tensor_name)
|
||||||
|
dtype = trt.nptype(self.engine.get_tensor_dtype(tensor_name))
|
||||||
|
|
||||||
|
if -1 in shape:
|
||||||
|
print("WARNING: Replacing dynamic shape with fixed for:", tensor_name)
|
||||||
|
shape[list(shape).index(-1)] = 10
|
||||||
|
|
||||||
|
# Allocate host and device buffers
|
||||||
|
size = trt.volume(shape)
|
||||||
|
host_mem = cuda.pagelocked_empty(size, dtype)
|
||||||
|
device_mem = cuda.mem_alloc(host_mem.nbytes)
|
||||||
|
self.bindings.append(int(device_mem))
|
||||||
|
|
||||||
|
# Append to the appropriate input/output list
|
||||||
|
if self.engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
|
||||||
|
self.inputs.append((host_mem, device_mem, shape))
|
||||||
|
self.input_names.append(tensor_name)
|
||||||
|
self.input_shapes.append(shape)
|
||||||
|
self.input_types.append(dtype)
|
||||||
|
else:
|
||||||
|
self.outputs.append((host_mem, device_mem, shape))
|
||||||
|
|
||||||
|
# Set tensor address
|
||||||
|
self.context.set_tensor_address(
|
||||||
|
self.engine.get_tensor_name(i), self.bindings[i]
|
||||||
|
)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def preprocess(self, **kwargs):
|
def preprocess(self, **kwargs):
|
||||||
@ -63,14 +131,13 @@ class BaseModel(ABC):
|
|||||||
def warmup(self, epoch: int):
|
def warmup(self, epoch: int):
|
||||||
np.random.seed(42)
|
np.random.seed(42)
|
||||||
|
|
||||||
print("Running warmup for '{}' ...".format(self.__class__.__name__))
|
|
||||||
for _ in tqdm(range(epoch)):
|
for _ in tqdm(range(epoch)):
|
||||||
inputs = {}
|
inputs = {}
|
||||||
for i in range(len(self.input_names)):
|
for i in range(len(self.input_names)):
|
||||||
iname = self.input_names[i]
|
iname = self.input_names[i]
|
||||||
|
|
||||||
if "image" in iname:
|
if "image" in iname:
|
||||||
ishape = self.input_shapes[i]
|
ishape = list(self.input_shapes[i])
|
||||||
if "batch_size" in ishape:
|
if "batch_size" in ishape:
|
||||||
if "TensorrtExecutionProvider" in self.providers:
|
if "TensorrtExecutionProvider" in self.providers:
|
||||||
# Using different image sizes for TensorRT warmup takes too long
|
# Using different image sizes for TensorRT warmup takes too long
|
||||||
@ -101,15 +168,55 @@ class BaseModel(ABC):
|
|||||||
tensor = tensor.astype(self.input_types[i])
|
tensor = tensor.astype(self.input_types[i])
|
||||||
inputs[iname] = tensor
|
inputs[iname] = tensor
|
||||||
|
|
||||||
self.session.run(None, inputs)
|
self.call_model(list(inputs.values()))
|
||||||
|
|
||||||
def __call__(self, **kwargs):
|
def call_model_ort(self, tensor):
|
||||||
tensor = self.preprocess(**kwargs)
|
|
||||||
inputs = {}
|
inputs = {}
|
||||||
for i in range(len(self.input_names)):
|
for i in range(len(self.input_names)):
|
||||||
iname = self.input_names[i]
|
iname = self.input_names[i]
|
||||||
inputs[iname] = tensor[i]
|
inputs[iname] = tensor[i]
|
||||||
result = self.session.run(None, inputs)
|
result = self.session.run(None, inputs)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def call_model_trt(self, tensor):
|
||||||
|
# Transfer input data to device
|
||||||
|
for i, input_data in enumerate(tensor):
|
||||||
|
np.copyto(self.inputs[i][0], input_data.ravel())
|
||||||
|
cuda.memcpy_htod_async(self.inputs[i][1], self.inputs[i][0], self.stream)
|
||||||
|
|
||||||
|
# Empty the output buffers
|
||||||
|
for i in range(len(self.outputs)):
|
||||||
|
self.outputs[i][0].fill(0)
|
||||||
|
cuda.memcpy_htod_async(self.outputs[i][1], self.outputs[i][0], self.stream)
|
||||||
|
|
||||||
|
# Run inference
|
||||||
|
self.context.execute_async_v3(stream_handle=self.stream.handle)
|
||||||
|
|
||||||
|
# Transfer predictions back
|
||||||
|
for i in range(len(self.outputs)):
|
||||||
|
cuda.memcpy_dtoh_async(self.outputs[i][0], self.outputs[i][1], self.stream)
|
||||||
|
|
||||||
|
# Synchronize the stream
|
||||||
|
self.stream.synchronize()
|
||||||
|
|
||||||
|
# Un-flatten the outputs
|
||||||
|
outputs = []
|
||||||
|
for i in range(len(self.outputs)):
|
||||||
|
output = self.outputs[i][0].reshape(self.outputs[i][2])
|
||||||
|
outputs.append(output)
|
||||||
|
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
def call_model(self, tensor):
|
||||||
|
if self.runtime == "trt":
|
||||||
|
result = self.call_model_trt(tensor)
|
||||||
|
elif self.runtime == "ort":
|
||||||
|
result = self.call_model_ort(tensor)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def __call__(self, **kwargs):
|
||||||
|
tensor = self.preprocess(**kwargs)
|
||||||
|
result = self.call_model(tensor)
|
||||||
output = self.postprocess(result=result, **kwargs)
|
output = self.postprocess(result=result, **kwargs)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
@ -416,11 +523,6 @@ class TopDown:
|
|||||||
box_min_area: float,
|
box_min_area: float,
|
||||||
warmup: int = 30,
|
warmup: int = 30,
|
||||||
):
|
):
|
||||||
if (not det_model_path.endswith(".onnx")) or (
|
|
||||||
not pose_model_path.endswith(".onnx")
|
|
||||||
):
|
|
||||||
raise ValueError("Only ONNX models are supported.")
|
|
||||||
|
|
||||||
self.det_model = RTMDet(
|
self.det_model = RTMDet(
|
||||||
det_model_path, box_conf_threshold, box_min_area, warmup
|
det_model_path, box_conf_threshold, box_min_area, warmup
|
||||||
)
|
)
|
||||||
@ -439,17 +541,19 @@ class TopDown:
|
|||||||
|
|
||||||
|
|
||||||
def load_model(min_bbox_score=0.3, min_bbox_area=0.1 * 0.1):
|
def load_model(min_bbox_score=0.3, min_bbox_area=0.1 * 0.1):
|
||||||
print("Loading onnx model ...")
|
print("Loading 2D model ...")
|
||||||
|
|
||||||
model = TopDown(
|
model = TopDown(
|
||||||
"/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
|
# "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
|
||||||
"/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
|
# "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
|
||||||
|
"/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3_fp16_extra-steps.engine",
|
||||||
|
"/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3_fp16_extra-steps.engine",
|
||||||
box_conf_threshold=min_bbox_score,
|
box_conf_threshold=min_bbox_score,
|
||||||
box_min_area=min_bbox_area,
|
box_min_area=min_bbox_area,
|
||||||
warmup=30,
|
warmup=30,
|
||||||
)
|
)
|
||||||
|
|
||||||
print("Loaded onnx model")
|
print("Loaded 2D model")
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user