diff --git a/extras/mmdeploy/README.md b/extras/mmdeploy/README.md
index 2abffa9..fb30c9f 100644
--- a/extras/mmdeploy/README.md
+++ b/extras/mmdeploy/README.md
@@ -11,39 +11,53 @@ docker build --progress=plain -f extras/mmdeploy/dockerfile -t rpt_mmdeploy .
 ## ONNX
 
 ```bash
-export withFP16="_fp16"
-cp /RapidPoseTriangulation/extras/mmdeploy/configs/detection_onnxruntime_static-320x320$withFP16.py configs/mmdet/detection/
-
 cd /mmdeploy/
+export withFP16="_fp16"
+cp /RapidPoseTriangulation/extras/mmdeploy/configs/detection_onnxruntime_static-320x320"$withFP16".py configs/mmdet/detection/
+
 python3 ./tools/deploy.py \
-    configs/mmdet/detection/detection_onnxruntime_static-320x320$withFP16.py \
+    configs/mmdet/detection/detection_onnxruntime_static-320x320"$withFP16".py \
     /mmpose/projects/rtmpose/rtmdet/person/rtmdet_nano_320-8xb32_coco-person.py \
     https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_nano_8xb32-100e_coco-obj365-person-05d8511e.pth \
     /mmpose/projects/rtmpose/examples/onnxruntime/human-pose.jpeg \
     --work-dir work_dir \
     --show
 
-mv /mmdeploy/work_dir/end2end.onnx /RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320$withFP16.onnx
+mv /mmdeploy/work_dir/end2end.onnx /RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x3x320x320"$withFP16".onnx
 ```
 
 ```bash
-export withFP16="_fp16"
-cp /RapidPoseTriangulation/extras/mmdeploy/configs/pose-detection_simcc_onnxruntime_static-384x288$withFP16.py configs/mmpose/
-
 cd /mmdeploy/
+export withFP16="_fp16"
+cp /RapidPoseTriangulation/extras/mmdeploy/configs/pose-detection_simcc_onnxruntime_static-384x288"$withFP16".py configs/mmpose/
+cp /RapidPoseTriangulation/extras/mmdeploy/configs/pose-detection_simcc_onnxruntime_dynamic-384x288"$withFP16".py configs/mmpose/
+
 python3 ./tools/deploy.py \
-    configs/mmpose/pose-detection_simcc_onnxruntime_static-384x288$withFP16.py \
+    configs/mmpose/pose-detection_simcc_onnxruntime_static-384x288"$withFP16".py \
     /mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py \
     https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-384x288-65e718c4_20230504.pth \
     /mmpose/projects/rtmpose/examples/onnxruntime/human-pose.jpeg \
     --work-dir work_dir \
     --show
 
-mv /mmdeploy/work_dir/end2end.onnx /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288$withFP16.onnx
+mv /mmdeploy/work_dir/end2end.onnx /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x3x384x288"$withFP16".onnx
+
+python3 ./tools/deploy.py \
+    configs/mmpose/pose-detection_simcc_onnxruntime_dynamic-384x288"$withFP16".py \
+    /mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py \
+    https://download.openmmlab.com/mmpose/v1/projects/rtmposev1/rtmpose-m_simcc-body7_pt-body7_420e-384x288-65e718c4_20230504.pth \
+    /mmpose/projects/rtmpose/examples/onnxruntime/human-pose.jpeg \
+    --work-dir work_dir \
+    --show
+
+mv /mmdeploy/work_dir/end2end.onnx /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_Bx3x384x288"$withFP16".onnx
 ```
 
 ```bash
 python3 /RapidPoseTriangulation/extras/mmdeploy/make_extra_graphs.py
+```
+
+```bash
 python3 /RapidPoseTriangulation/extras/mmdeploy/add_extra_steps.py
 ```
 
@@ -57,14 +71,17 @@ Run this directly in the inference container (the TensorRT versions need to be t
 export withFP16="_fp16"
 
 trtexec --fp16 \
-    --onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320"$withFP16"_extra-steps.onnx \
+    --onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3"$withFP16"_extra-steps.onnx \
     --saveEngine=end2end.engine
 
 mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3"$withFP16"_extra-steps.engine
 
 trtexec --fp16 \
-    --onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288"$withFP16"_extra-steps.onnx \
-    --saveEngine=end2end.engine
+    --onnx=/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_Bx384x288x3"$withFP16"_extra-steps.onnx \
+    --saveEngine=end2end.engine \
+    --minShapes=image_input:1x384x288x3 \
+    --optShapes=image_input:1x384x288x3 \
+    --maxShapes=image_input:1x384x288x3
 
 mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x384x288x3"$withFP16"_extra-steps.engine
 ```
@@ -74,14 +91,14 @@ mv ./end2end.engine /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x
 
 ## Benchmark
 
 ```bash
+cd /mmdeploy/
 export withFP16="_fp16"
-cd /mmdeploy/
 python3 ./tools/profiler.py \
-    configs/mmpose/pose-detection_simcc_onnxruntime_static-384x288$withFP16.py \
+    configs/mmpose/pose-detection_simcc_onnxruntime_static-384x288"$withFP16".py \
     /mmpose/projects/rtmpose/rtmpose/body_2d_keypoint/rtmpose-m_8xb256-420e_coco-384x288.py \
     /RapidPoseTriangulation/extras/mmdeploy/testimages/ \
-    --model /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288$withFP16.onnx \
+    --model /RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_1x3x384x288"$withFP16".onnx \
     --shape 384x288 \
     --device cuda \
     --warmup 50 \
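The new file names encode the input shape (`1x3x320x320` for the static detector, `Bx3x384x288` for the dynamic-batch pose model). A quick way to sanity-check that each export really matches its name is to inspect the graph inputs; the sketch below is illustrative and not part of the patch, and assumes the `exports/` paths used throughout the README:

```python
import onnx

# Not part of the patch: print each input's name and dimensions so the shape
# suffix in the file name can be checked against the actual graph.
for name in [
    "rtmdet-nano_1x3x320x320.onnx",
    "rtmpose-m_1x3x384x288.onnx",
    "rtmpose-m_Bx3x384x288.onnx",
]:
    model = onnx.load("/RapidPoseTriangulation/extras/mmdeploy/exports/" + name)
    for inp in model.graph.input:
        # dim_param holds the symbolic name of a dynamic axis (e.g. "batch"),
        # dim_value the fixed size of a static one
        dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
        print(name, inp.name, dims)
```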
diff --git a/extras/mmdeploy/add_extra_steps.py b/extras/mmdeploy/add_extra_steps.py
index 8bd7a08..6ab35fa 100644
--- a/extras/mmdeploy/add_extra_steps.py
+++ b/extras/mmdeploy/add_extra_steps.py
@@ -1,12 +1,15 @@
+import re
+
 import numpy as np
 import onnx
-from onnx import TensorProto, compose, helper, numpy_helper
+from onnx import TensorProto, helper, numpy_helper
 
 # ==================================================================================================
 
 base_path = "/RapidPoseTriangulation/extras/mmdeploy/exports/"
-pose_model_path = base_path + "rtmpose-m_384x288.onnx"
-det_model_path = base_path + "rtmdet-nano_320x320.onnx"
+det_model_path = base_path + "rtmdet-nano_1x3x320x320.onnx"
+pose_model_path1 = base_path + "rtmpose-m_Bx3x384x288.onnx"
+pose_model_path2 = base_path + "rtmpose-m_1x3x384x288.onnx"
 
 norm_mean = -1 * (np.array([0.485, 0.456, 0.406]) * 255)
 norm_std = 1.0 / (np.array([0.229, 0.224, 0.225]) * 255)
@@ -97,6 +100,11 @@ def add_steps_to_onnx(model_path):
     for i, j in enumerate([0, 3, 1, 2]):
         input_shape[j].dim_value = dims[i]
 
+    # Set the batch size to a defined string
+    input_shape = graph.input[0].type.tensor_type.shape.dim
+    if input_shape[0].dim_value == 0:
+        input_shape[0].dim_param = "batch_size"
+
     # Rename the input tensor
     main_input_image_name = model.graph.input[0].name
     for node in model.graph.node:
@@ -108,7 +116,8 @@ def add_steps_to_onnx(model_path):
     # Set input image type to int8
     model.graph.input[0].type.tensor_type.elem_type = TensorProto.UINT8
 
-    path = model_path.replace(".onnx", "_extra-steps.onnx")
+    path = re.sub(r"(x)(\d+)x(\d+)x(\d+)", r"\1\3x\4x\2", model_path)
+    path = path.replace(".onnx", "_extra-steps.onnx")
     onnx.save(model, path)
 
 
@@ -116,10 +125,12 @@
 
 
 def main():
-    add_steps_to_onnx(pose_model_path)
     add_steps_to_onnx(det_model_path)
+    add_steps_to_onnx(pose_model_path1)
+    add_steps_to_onnx(pose_model_path2)
     add_steps_to_onnx(det_model_path.replace(".onnx", "_fp16.onnx"))
-    add_steps_to_onnx(pose_model_path.replace(".onnx", "_fp16.onnx"))
+    add_steps_to_onnx(pose_model_path1.replace(".onnx", "_fp16.onnx"))
+    add_steps_to_onnx(pose_model_path2.replace(".onnx", "_fp16.onnx"))
 
 
 # ==================================================================================================
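The new `re.sub` call is what renames the exports: the extra steps fold the layout transpose into the graph, so the `_extra-steps` files take NHWC input, and the shape suffix in the file name is reordered to match. A standalone worked example of the substitution:

```python
import re

# The channel count (first "x<digits>" group) is moved behind the spatial dims,
# turning the NCHW suffix into the NHWC one used for the *_extra-steps files.
for path in ["rtmdet-nano_1x3x320x320.onnx", "rtmpose-m_Bx3x384x288.onnx"]:
    renamed = re.sub(r"(x)(\d+)x(\d+)x(\d+)", r"\1\3x\4x\2", path)
    print(path, "->", renamed)
# rtmdet-nano_1x3x320x320.onnx -> rtmdet-nano_1x320x320x3.onnx
# rtmpose-m_Bx3x384x288.onnx   -> rtmpose-m_Bx384x288x3.onnx
```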
diff --git a/extras/mmdeploy/configs/detection_onnxruntime_static-320x320.py b/extras/mmdeploy/configs/detection_onnxruntime_static-320x320.py
index 3918d65..d7d5b57 100644
--- a/extras/mmdeploy/configs/detection_onnxruntime_static-320x320.py
+++ b/extras/mmdeploy/configs/detection_onnxruntime_static-320x320.py
@@ -5,7 +5,7 @@ onnx_config = dict(
 )
 
 codebase_config = dict(
-    # For later TensorRT inference, the number of output boxes needs to be as stable as possible, 
+    # For later TensorRT inference, the number of output boxes needs to be as stable as possible,
     # because a drop in the box count leads to a re-optimization which takes a lot of time,
     # therefore reduce the maximum number of output boxes to the smallest usable value and sort out
     # low confidence boxes outside the model.
diff --git a/extras/mmdeploy/configs/detection_onnxruntime_static-320x320_fp16.py b/extras/mmdeploy/configs/detection_onnxruntime_static-320x320_fp16.py
index a724f53..1dd243b 100644
--- a/extras/mmdeploy/configs/detection_onnxruntime_static-320x320_fp16.py
+++ b/extras/mmdeploy/configs/detection_onnxruntime_static-320x320_fp16.py
@@ -5,7 +5,7 @@ onnx_config = dict(
 )
 
 codebase_config = dict(
-    # For later TensorRT inference, the number of output boxes needs to be as stable as possible, 
+    # For later TensorRT inference, the number of output boxes needs to be as stable as possible,
     # because a drop in the box count leads to a re-optimization which takes a lot of time,
     # therefore reduce the maximum number of output boxes to the smallest usable value and sort out
     # low confidence boxes outside the model.
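Both config hunks only strip trailing whitespace from the comment, but the comment itself describes the design: the graph always emits its small, fixed maximum of boxes, and low-confidence rows are dropped by the caller so the output shape never changes under TensorRT. A minimal sketch of that caller-side filtering, with illustrative shapes and a hypothetical threshold:

```python
import numpy as np

def filter_boxes(dets: np.ndarray, min_score: float = 0.4) -> np.ndarray:
    # dets: (max_output_boxes, 5) rows of [x1, y1, x2, y2, score]; padded or
    # low-confidence rows carry low scores and are sorted out here, outside
    # the model, keeping the model's output count stable
    return dets[dets[:, 4] >= min_score]
```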
diff --git a/extras/mmdeploy/configs/pose-detection_simcc_onnxruntime_dynamic-384x288.py b/extras/mmdeploy/configs/pose-detection_simcc_onnxruntime_dynamic-384x288.py
new file mode 100644
index 0000000..3d52547
--- /dev/null
+++ b/extras/mmdeploy/configs/pose-detection_simcc_onnxruntime_dynamic-384x288.py
@@ -0,0 +1,19 @@
+_base_ = ["./pose-detection_static.py", "../_base_/backends/onnxruntime.py"]
+
+onnx_config = dict(
+    input_shape=[288, 384],
+    output_names=["kpts", "scores"],
+    dynamic_axes={
+        "input": {
+            0: "batch",
+        },
+        "kpts": {
+            0: "batch",
+        },
+        "scores": {
+            0: "batch",
+        },
+    },
+)
+
+codebase_config = dict(export_postprocess=True)  # export get_simcc_maximum
diff --git a/extras/mmdeploy/configs/pose-detection_simcc_onnxruntime_dynamic-384x288_fp16.py b/extras/mmdeploy/configs/pose-detection_simcc_onnxruntime_dynamic-384x288_fp16.py
new file mode 100644
index 0000000..fe0ca45
--- /dev/null
+++ b/extras/mmdeploy/configs/pose-detection_simcc_onnxruntime_dynamic-384x288_fp16.py
@@ -0,0 +1,19 @@
+_base_ = ["./pose-detection_static.py", "../_base_/backends/onnxruntime-fp16.py"]
+
+onnx_config = dict(
+    input_shape=[288, 384],
+    output_names=["kpts", "scores"],
+    dynamic_axes={
+        "input": {
+            0: "batch",
+        },
+        "kpts": {
+            0: "batch",
+        },
+        "scores": {
+            0: "batch",
+        },
+    },
+)
+
+codebase_config = dict(export_postprocess=True)  # export get_simcc_maximum
diff --git a/scripts/test_skelda_dataset.py b/scripts/test_skelda_dataset.py
index cc0b61c..aa6a148 100644
--- a/scripts/test_skelda_dataset.py
+++ b/scripts/test_skelda_dataset.py
@@ -53,6 +53,9 @@ default_min_match_score = 0.94
 # If the number of cameras is high, and the views are not occluded, use a higher value
 default_min_group_size = 1
 
+# Batch poses per image for faster processing
+# If images mostly contain only one person, disable it, since batching is slightly slower in that case
+default_batch_poses = True
 
 datasets = {
     "human36m": {
@@ -62,6 +65,7 @@ datasets = {
         "min_group_size": 1,
         "min_bbox_score": 0.4,
         "min_bbox_area": 0.1 * 0.1,
+        "batch_poses": False,
     },
     "panoptic": {
         "path": "/datasets/panoptic/skelda/test.json",
@@ -310,13 +314,14 @@ def main():
     min_group_size = datasets[dataset_use].get("min_group_size", default_min_group_size)
     min_bbox_score = datasets[dataset_use].get("min_bbox_score", default_min_bbox_score)
     min_bbox_area = datasets[dataset_use].get("min_bbox_area", default_min_bbox_area)
+    batch_poses = datasets[dataset_use].get("batch_poses", default_batch_poses)
 
     # Load 2D pose model
     whole_body = test_triangulate.whole_body
     if any((whole_body[k] for k in whole_body)):
         kpt_model = utils_2d_pose.load_wb_model()
     else:
-        kpt_model = utils_2d_pose.load_model(min_bbox_score, min_bbox_area)
+        kpt_model = utils_2d_pose.load_model(min_bbox_score, min_bbox_area, batch_poses)
 
     # Manually set matplotlib backend
     try:
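The `batch_poses` flag follows the same per-dataset override pattern as the other thresholds in `test_skelda_dataset.py`: a dataset entry can pin the value, everything else falls back to the default. A reduced illustration (the `human36m` override is from the patch; the rest of the entries are trimmed):

```python
default_batch_poses = True  # batching helps when images contain several people

datasets = {
    "human36m": {"batch_poses": False},  # mostly a single person per image
    "panoptic": {},                      # no override, falls back to True
}

for name, cfg in datasets.items():
    batch_poses = cfg.get("batch_poses", default_batch_poses)
    print(name, "->", batch_poses)
```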
diff --git a/scripts/utils_2d_pose_ort.py b/scripts/utils_2d_pose_ort.py
index 1944983..7d4a91b 100644
--- a/scripts/utils_2d_pose_ort.py
+++ b/scripts/utils_2d_pose_ort.py
@@ -96,29 +96,12 @@ class BaseModel(ABC):
             if "image" in iname:
                 ishape = list(self.input_shapes[i])
                 if "batch_size" in ishape:
-                    if "TensorrtExecutionProvider" in self.providers:
-                        # Using different images sizes for TensorRT warmup takes too long
-                        ishape = [1, 1000, 1000, 3]
-                    else:
-                        ishape = [
-                            1,
-                            np.random.randint(300, 1000),
-                            np.random.randint(300, 1000),
-                            3,
-                        ]
+                    max_batch_size = 10
+                    ishape[0] = np.random.choice(
+                        list(range(1, max_batch_size + 1))
+                    )
                 tensor = np.random.random(ishape)
                 tensor = tensor * 255
-            elif "bbox" in iname:
-                tensor = np.array(
-                    [
-                        [
-                            np.random.randint(30, 100),
-                            np.random.randint(30, 100),
-                            np.random.randint(200, 300),
-                            np.random.randint(200, 300),
-                        ]
-                    ]
-                )
             else:
                 raise ValueError("Undefined input type:", iname)
 
@@ -401,35 +384,48 @@ class RTMPose(BaseModel):
         self.target_size = (384, 288)
         self.boxcrop = BoxCrop(self.target_size, padding_scale=1.25, fill_value=0)
 
-    def preprocess(self, image: np.ndarray, bbox: np.ndarray):
-        bbox = np.asarray(bbox)[0:4]
-        bbox += np.array([-0.5, -0.5, 0.5 - 1e-8, 0.5 - 1e-8])
-        bbox = bbox.round().astype(np.int32)
-        region = self.boxcrop.crop_resize_box(image, bbox)
-        tensor = np.asarray(region).astype(self.input_types[0], copy=False)
-        tensor = np.expand_dims(tensor, axis=0)
-        tensor = [tensor]
+    def preprocess(self, image: np.ndarray, bboxes: np.ndarray):
+        cutouts = []
+        for i in range(len(bboxes)):
+            bbox = np.asarray(bboxes[i])[0:4]
+            bbox += np.array([-0.5, -0.5, 0.5 - 1e-8, 0.5 - 1e-8])
+            bbox = bbox.round().astype(np.int32)
+            region = self.boxcrop.crop_resize_box(image, bbox)
+            tensor = np.asarray(region).astype(self.input_types[0], copy=False)
+            cutouts.append(tensor)
+
+        if len(bboxes) == 1:
+            cutouts = np.expand_dims(cutouts[0], axis=0)
+        else:
+            cutouts = np.stack(cutouts, axis=0)
+
+        tensor = [cutouts]
 
         return tensor
 
     def postprocess(
-        self, result: List[np.ndarray], image: np.ndarray, bbox: np.ndarray
+        self, result: List[np.ndarray], image: np.ndarray, bboxes: np.ndarray
     ):
-        scores = np.clip(result[1][0], 0, 1)
-        kp = np.concatenate([result[0][0], np.expand_dims(scores, axis=-1)], axis=-1)
+        kpts = []
+        for i in range(len(bboxes)):
+            scores = np.clip(result[1][i], 0, 1)
+            kp = np.concatenate(
+                [result[0][i], np.expand_dims(scores, axis=-1)], axis=-1
+            )
 
-        paddings, scale, bbox, _ = self.boxcrop.calc_params(image.shape, bbox)
-        kp[:, 0] -= paddings[0]
-        kp[:, 1] -= paddings[2]
-        kp[:, 0:2] /= scale
-        kp[:, 0] += bbox[0]
-        kp[:, 1] += bbox[1]
-        kp[:, 0:2] = np.maximum(kp[:, 0:2], 0)
-        max_w = image.shape[1] - 1
-        max_h = image.shape[0] - 1
-        kp[:, 0] = np.minimum(kp[:, 0], max_w)
-        kp[:, 1] = np.minimum(kp[:, 1], max_h)
+            paddings, scale, bbox, _ = self.boxcrop.calc_params(image.shape, bboxes[i])
+            kp[:, 0] -= paddings[0]
+            kp[:, 1] -= paddings[2]
+            kp[:, 0:2] /= scale
+            kp[:, 0] += bbox[0]
+            kp[:, 1] += bbox[1]
+            kp[:, 0:2] = np.maximum(kp[:, 0:2], 0)
+            max_w = image.shape[1] - 1
+            max_h = image.shape[0] - 1
+            kp[:, 0] = np.minimum(kp[:, 0], max_w)
+            kp[:, 1] = np.minimum(kp[:, 1], max_h)
+            kpts.append(kp)
 
-        return kp
+        return kpts
 
 
 # ==================================================================================================
@@ -444,6 +440,8 @@ class TopDown:
         box_min_area: float,
         warmup: int = 30,
     ):
+        self.batch_poses = bool("Bx" in pose_model_path)
+
         self.det_model = RTMDet(
             det_model_path, box_conf_threshold, box_min_area, warmup
         )
@@ -451,22 +449,29 @@ class TopDown:
     def predict(self, image):
        boxes = self.det_model(image=image)
 
+        if len(boxes) == 0:
+            return []
+
         results = []
-        for i in range(boxes.shape[0]):
-            kp = self.pose_model(image=image, bbox=boxes[i])
-            results.append(kp)
+        if self.batch_poses:
+            results = self.pose_model(image=image, bboxes=boxes)
+        else:
+            for i in range(boxes.shape[0]):
+                kp = self.pose_model(image=image, bboxes=[boxes[i]])
+                results.append(kp[0])
+
         return results
 
 
 # ==================================================================================================
 
 
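Since `preprocess` and `postprocess` now simply loop over the boxes internally, the batched path and the per-box path should produce the same keypoints. A hypothetical consistency check, not part of the patch (`pose_model` is an `RTMPose` instance; the tolerance allows for small fp16 differences between batched and single-box kernels):

```python
import numpy as np

def batched_matches_looped(pose_model, image, boxes, atol=1e-3):
    # One call with all boxes vs. one call per box; both return a list of
    # (num_keypoints, 3) arrays in the same order as the input boxes
    batched = pose_model(image=image, bboxes=boxes)
    looped = [pose_model(image=image, bboxes=[box])[0] for box in boxes]
    return all(np.allclose(a, b, atol=atol) for a, b in zip(batched, looped))
```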
-def load_model(min_bbox_score=0.3, min_bbox_area=0.1 * 0.1):
+def load_model(min_bbox_score=0.3, min_bbox_area=0.1 * 0.1, batch_poses=False):
     print("Loading 2D model ...")
 
     model = TopDown(
-        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_320x320_fp16_extra-steps.onnx",
-        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_384x288_fp16_extra-steps.onnx",
+        "/RapidPoseTriangulation/extras/mmdeploy/exports/rtmdet-nano_1x320x320x3_fp16_extra-steps.onnx",
+        f"/RapidPoseTriangulation/extras/mmdeploy/exports/rtmpose-m_{'B' if batch_poses else '1'}x384x288x3_fp16_extra-steps.onnx",
         box_conf_threshold=min_bbox_score,
         box_min_area=min_bbox_area,
         warmup=30,
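The diff is cut off mid-call above. For context, a usage sketch of the updated loader, assuming `load_model` still returns the `TopDown` instance (as the call site in `test_skelda_dataset.py` suggests) and importing the module the way that script does:

```python
import numpy as np

import utils_2d_pose_ort as utils_2d_pose

# batch_poses=True selects the Bx384x288x3 export and runs all detected
# people through the pose model in a single batched call
kpt_model = utils_2d_pose.load_model(
    min_bbox_score=0.3, min_bbox_area=0.1 * 0.1, batch_poses=True
)

image = np.zeros((720, 1280, 3), dtype=np.uint8)  # placeholder frame
poses = kpt_model.predict(image)  # one (num_keypoints, 3) array per person
```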