Cache ONNX-TRT models; they are faster than using two TRT engines. Removed the TRT runtime again.
data/.gitignore (vendored)
@@ -5,3 +5,4 @@
 *.json
 !*/*.json
 testoutput/
+trt_cache/
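Note: trt_cache/ is the directory where onnxruntime's TensorRT execution provider serializes its built engines (it matches the trt_engine_cache_path set below), so it is kept out of version control.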
@@ -6,9 +6,6 @@ from typing import List
 import cv2
 import numpy as np
 import onnxruntime as ort
-import pycuda.autoinit # noqa: F401
-import pycuda.driver as cuda
-import tensorrt as trt
 from tqdm import tqdm
 
 # ==================================================================================================
@@ -22,10 +19,7 @@ class BaseModel(ABC):
         if not os.path.exists(model_path):
             raise FileNotFoundError("File not found:", model_path)
 
-        if model_path.endswith(".engine"):
-            self.init_trt_engine(model_path)
-            self.runtime = "trt"
-        elif model_path.endswith(".onnx"):
+        if model_path.endswith(".onnx"):
             self.init_onnxruntime(model_path)
             self.runtime = "ort"
         else:
@@ -46,7 +40,15 @@ class BaseModel(ABC):
 
         self.providers = []
         if usetrt and "TensorrtExecutionProvider" in providers:
-            self.providers.append("TensorrtExecutionProvider")
+            self.providers.append(
+                (
+                    "TensorrtExecutionProvider",
+                    {
+                        "trt_engine_cache_enable": True,
+                        "trt_engine_cache_path": "/RapidPoseTriangulation/data/trt_cache/",
+                    },
+                )
+            )
         if usegpu and "CUDAExecutionProvider" in providers:
            self.providers.append("CUDAExecutionProvider")
         self.providers.append("CPUExecutionProvider")
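Note: passing the execution provider as a (name, options) tuple is what lets onnxruntime build each TensorRT engine once and reload it from trt_engine_cache_path on later runs, which is what makes the cached .onnx route faster here than shipping two prebuilt .engine files. A minimal, self-contained sketch of how such a provider list is consumed (the model path is a placeholder, not a file from this repo):

import onnxruntime as ort

providers = [
    (
        "TensorrtExecutionProvider",
        {
            "trt_engine_cache_enable": True,          # serialize built engines ...
            "trt_engine_cache_path": "./trt_cache/",  # ... into this directory
        },
    ),
    "CUDAExecutionProvider",  # fallback for nodes TensorRT cannot take
    "CPUExecutionProvider",   # final fallback
]

session = ort.InferenceSession("model.onnx", providers=providers)  # placeholder path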
@@ -75,51 +77,6 @@ class BaseModel(ABC):
                 raise ValueError("Undefined input type:", input_type)
             self.input_types.append(itype)
 
-    def init_trt_engine(self, engine_path):
-        # https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#python_topics
-        # https://stackoverflow.com/a/79076885
-
-        self.trt_logger = trt.Logger(trt.Logger.WARNING)
-        with open(engine_path, "rb") as f:
-            runtime = trt.Runtime(self.trt_logger)
-            self.engine = runtime.deserialize_cuda_engine(f.read())
-        self.context = self.engine.create_execution_context()
-        self.stream = cuda.Stream()
-
-        self.inputs, self.outputs, self.bindings = [], [], []
-        self.input_names = []
-        self.input_shapes = []
-        self.input_types = []
-
-        for i in range(self.engine.num_io_tensors):
-            tensor_name = self.engine.get_tensor_name(i)
-            shape = self.engine.get_tensor_shape(tensor_name)
-            dtype = trt.nptype(self.engine.get_tensor_dtype(tensor_name))
-
-            if -1 in shape:
-                print("WARNING: Replacing dynamic shape with fixed for:", tensor_name)
-                shape[list(shape).index(-1)] = 10
-
-            # Allocate host and device buffers
-            size = trt.volume(shape)
-            host_mem = cuda.pagelocked_empty(size, dtype)
-            device_mem = cuda.mem_alloc(host_mem.nbytes)
-            self.bindings.append(int(device_mem))
-
-            # Append to the appropriate input/output list
-            if self.engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT:
-                self.inputs.append((host_mem, device_mem, shape))
-                self.input_names.append(tensor_name)
-                self.input_shapes.append(shape)
-                self.input_types.append(dtype)
-            else:
-                self.outputs.append((host_mem, device_mem, shape))
-
-            # Set tensor address
-            self.context.set_tensor_address(
-                self.engine.get_tensor_name(i), self.bindings[i]
-            )
-
     @abstractmethod
     def preprocess(self, **kwargs):
         pass
@@ -168,7 +125,7 @@ class BaseModel(ABC):
             tensor = tensor.astype(self.input_types[i])
             inputs[iname] = tensor
 
-        self.call_model(list(inputs.values()))
+        self.call_model_ort(list(inputs.values()))
 
     def call_model_ort(self, tensor):
         inputs = {}
@@ -178,45 +135,9 @@ class BaseModel(ABC):
         result = self.session.run(None, inputs)
         return result
 
-    def call_model_trt(self, tensor):
-        # Transfer input data to device
-        for i, input_data in enumerate(tensor):
-            np.copyto(self.inputs[i][0], input_data.ravel())
-            cuda.memcpy_htod_async(self.inputs[i][1], self.inputs[i][0], self.stream)
-
-        # Empty the output buffers
-        for i in range(len(self.outputs)):
-            self.outputs[i][0].fill(0)
-            cuda.memcpy_htod_async(self.outputs[i][1], self.outputs[i][0], self.stream)
-
-        # Run inference
-        self.context.execute_async_v3(stream_handle=self.stream.handle)
-
-        # Transfer predictions back
-        for i in range(len(self.outputs)):
-            cuda.memcpy_dtoh_async(self.outputs[i][0], self.outputs[i][1], self.stream)
-
-        # Synchronize the stream
-        self.stream.synchronize()
-
-        # Un-flatten the outputs
-        outputs = []
-        for i in range(len(self.outputs)):
-            output = self.outputs[i][0].reshape(self.outputs[i][2])
-            outputs.append(output)
-
-        return outputs
-
-    def call_model(self, tensor):
-        if self.runtime == "trt":
-            result = self.call_model_trt(tensor)
-        elif self.runtime == "ort":
-            result = self.call_model_ort(tensor)
-        return result
-
     def __call__(self, **kwargs):
         tensor = self.preprocess(**kwargs)
-        result = self.call_model(tensor)
+        result = self.call_model_ort(tensor)
         output = self.postprocess(result=result, **kwargs)
         return output
 
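Note: with the TensorRT-specific path gone, every inference now goes through session.run. A rough sketch of the remaining call path, assuming the same input-name bookkeeping this class does (class name and structure are simplified, not copied from the repo):

import onnxruntime as ort

class OrtOnlyModel:
    def __init__(self, model_path, providers):
        self.session = ort.InferenceSession(model_path, providers=providers)
        # Read input names once so call_model_ort can zip tensors to them
        self.input_names = [inp.name for inp in self.session.get_inputs()]

    def call_model_ort(self, tensors):
        inputs = dict(zip(self.input_names, tensors))
        return self.session.run(None, inputs)  # None = fetch every output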
@@ -544,10 +465,8 @@ def load_model(min_bbox_score=0.3, min_bbox_area=0.1 * 0.1):
     print("Loading 2D model ...")
 
     model = TopDown(
-        # "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
-        # "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
-        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3_fp16_extra-steps.engine",
-        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3_fp16_extra-steps.engine",
+        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
+        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
         box_conf_threshold=min_bbox_score,
         box_min_area=min_bbox_area,
         warmup=30,
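Note: the first session built from each .onnx file still pays the full TensorRT engine-build cost (during session construction, or at the first inference for dynamic shapes) before the serialized engine lands in trt_cache/; every later process start deserializes it instead. A hypothetical way to observe the difference (the model path and timing harness are illustrative, not from this repo):

import time

import onnxruntime as ort

providers = [
    (
        "TensorrtExecutionProvider",
        {
            "trt_engine_cache_enable": True,
            "trt_engine_cache_path": "./trt_cache/",
        },
    ),
    "CPUExecutionProvider",
]

start = time.perf_counter()
session = ort.InferenceSession("model.onnx", providers=providers)  # placeholder model
# Slow on the first run while TensorRT builds; fast once the cache is populated
print(f"Session init took {time.perf_counter() - start:.1f}s")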