From 97ff32b9ce81d0a9658acf4593928a8b336c7146 Mon Sep 17 00:00:00 2001 From: Daniel Date: Tue, 3 Dec 2024 17:59:09 +0100 Subject: [PATCH] Moved pose pre/post-processing into onnx graph. --- extras/mmdeploy/add_extra_steps.py | 47 ++++- extras/mmdeploy/make_extra_graphs.py | 161 ++++++++++++-- media/RESULTS.md | 304 +++++++++++++-------------- scripts/utils_2d_pose_ort.py | 183 ++++++---------- 4 files changed, 411 insertions(+), 284 deletions(-) diff --git a/extras/mmdeploy/add_extra_steps.py b/extras/mmdeploy/add_extra_steps.py index 6d7054e..42d9547 100644 --- a/extras/mmdeploy/add_extra_steps.py +++ b/extras/mmdeploy/add_extra_steps.py @@ -121,14 +121,55 @@ def add_steps_to_onnx(model_path): # Update nodes from postprocess model to use the input of the main network pp2_input_image_name = pp2_model.graph.input[0].name - main_input_name = model.graph.input[0].name + main_input_image_name = model.graph.input[0].name for node in model.graph.node: for idx, name in enumerate(node.input): if name == pp2_input_image_name: - node.input[idx] = main_input_name + node.input[idx] = main_input_image_name model.graph.input.pop(1) - # Set input type to int8 + if "pose" in model_path: + # Add preprocess model to main network + pp1_model = onnx.load(base_path + "pose_preprocess.onnx") + model = compose.add_prefix(model, prefix="main_") + pp1_model = compose.add_prefix(pp1_model, prefix="preprocess_") + model = compose.merge_models( + pp1_model, + model, + io_map=[ + (pp1_model.graph.output[0].name, model.graph.input[0].name), + ], + ) + + # Add postprocess model + pp2_model = onnx.load(base_path + "pose_postprocess.onnx") + pp2_model = compose.add_prefix(pp2_model, prefix="postprocess_") + model = compose.merge_models( + model, + pp2_model, + io_map=[ + (model.graph.output[0].name, pp2_model.graph.input[2].name), + ], + ) + + # Update nodes from postprocess model to use the input of the main network + pp2_input_image_name = pp2_model.graph.input[0].name + pp2_input_bbox_name = pp2_model.graph.input[1].name + main_input_image_name = model.graph.input[0].name + main_input_bbox_name = model.graph.input[1].name + for node in model.graph.node: + for idx, name in enumerate(node.input): + if name == pp2_input_image_name: + node.input[idx] = main_input_image_name + if name == pp2_input_bbox_name: + node.input[idx] = main_input_bbox_name + model.graph.input.pop(2) + model.graph.input.pop(2) + + # Set input box type to int32 + model.graph.input[1].type.tensor_type.elem_type = TensorProto.INT32 + + # Set input image type to int8 model.graph.input[0].type.tensor_type.elem_type = TensorProto.UINT8 path = model_path.replace(".onnx", "_extra-steps.onnx") diff --git a/extras/mmdeploy/make_extra_graphs.py b/extras/mmdeploy/make_extra_graphs.py index a961032..1242531 100644 --- a/extras/mmdeploy/make_extra_graphs.py +++ b/extras/mmdeploy/make_extra_graphs.py @@ -7,6 +7,7 @@ import torch.nn.functional as F base_path = "/RapidPoseTriangulation/extras/mmdeploy/exports/" det_target_size = (320, 320) +pose_target_size = (384, 288) # ================================================================================================== @@ -19,10 +20,37 @@ class Letterbox(nn.Module): self.target_size = target_size self.fill_value = fill_value - def calc_params(self, img): - ih, iw = img.shape[1:3] + def calc_params_and_crop(self, img, bbox=None): + ih0, iw0 = img.shape[1:3] th, tw = self.target_size + if bbox is not None: + bbox = bbox[0].float() + x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3] + + # Slightly increase bbox size 
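+            # (each side grows by (factor - 1) / 2 of the box extent, i.e. 12.5% per side
+            #  for factor = 1.25, matching the padding_scale of the removed region_of_interest_warped)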
+ factor = 1.25 + w = x2 - x1 + h = y2 - y1 + x1 -= w * (factor - 1) / 2 + x2 += w * (factor - 1) / 2 + y1 -= h * (factor - 1) / 2 + y2 += h * (factor - 1) / 2 + + zero = torch.tensor(0) + x1 = torch.max(x1, zero).to(torch.int64) + y1 = torch.max(y1, zero).to(torch.int64) + x2 = torch.min(x2, iw0).to(torch.int64) + y2 = torch.min(y2, ih0).to(torch.int64) + bbox = torch.stack((x1, y1, x2, y2), dim=0).unsqueeze(0) + + img = img.to(torch.float32) + img = img[:, y1:y2, x1:x2, :] + ih = y2 - y1 + iw = x2 - x1 + else: + ih, iw = ih0, iw0 + scale = torch.min(tw / iw, th / ih) nw = torch.round(iw * scale) nh = torch.round(ih * scale) @@ -35,15 +63,18 @@ class Letterbox(nn.Module): pad_bottom = pad_h - pad_top paddings = (pad_left, pad_right, pad_top, pad_bottom) - return paddings, scale, (nw, nh) + return img, paddings, scale, (nw, nh), bbox - def forward(self, img): - paddings, _, (nw, nh) = self.calc_params(img) + def forward(self, img, bbox=None): + img, paddings, _, (nw, nh), _ = self.calc_params_and_crop(img, bbox) # Resize the image img = img.to(torch.float32) img = F.interpolate( - img.permute(0, 3, 1, 2), size=(nh, nw), mode="bilinear", align_corners=False + img.permute(0, 3, 1, 2), + size=(nh, nw), + mode="bilinear", + align_corners=False, ) img = img.permute(0, 2, 3, 1) img = img.round() @@ -71,7 +102,7 @@ class DetPreprocess(nn.Module): def forward(self, img): # img: torch.Tensor of shape [batch, H, W, C], dtype=torch.uint8 - img = self.letterbox(img) + img = self.letterbox(img, None) return img @@ -81,36 +112,97 @@ class DetPreprocess(nn.Module): class DetPostprocess(nn.Module): def __init__(self, target_size): super(DetPostprocess, self).__init__() + + self.target_size = target_size self.letterbox = Letterbox(target_size) def forward(self, img, boxes): - paddings, scale, _ = self.letterbox.calc_params(img) + _, paddings, scale, _, _ = self.letterbox.calc_params_and_crop(img, None) boxes = boxes.float() boxes[:, :, 0] -= paddings[0] boxes[:, :, 2] -= paddings[0] boxes[:, :, 1] -= paddings[2] boxes[:, :, 3] -= paddings[2] - boxes[:, :, 0:4] /= scale - ih, iw = img.shape[1:3] - boxes = torch.max(boxes, torch.tensor(0)) + zero = torch.tensor(0) + boxes = torch.max(boxes, zero) + + th, tw = self.target_size + pad_w = paddings[0] + paddings[1] + pad_h = paddings[2] + paddings[3] + max_w = tw - pad_w - 1 + max_h = th - pad_h - 1 b0 = boxes[:, :, 0] b1 = boxes[:, :, 1] b2 = boxes[:, :, 2] b3 = boxes[:, :, 3] - b0 = torch.min(b0, iw - 1) - b1 = torch.min(b1, ih - 1) - b2 = torch.min(b2, iw - 1) - b3 = torch.min(b3, ih - 1) + b0 = torch.min(b0, max_w) + b1 = torch.min(b1, max_h) + b2 = torch.min(b2, max_w) + b3 = torch.min(b3, max_h) boxes = torch.stack((b0, b1, b2, b3, boxes[:, :, 4]), dim=2) + boxes[:, :, 0:4] /= scale return boxes # ================================================================================================== +class PosePreprocess(nn.Module): + def __init__(self, target_size, fill_value=114): + super(PosePreprocess, self).__init__() + self.letterbox = Letterbox(target_size, fill_value) + + def forward(self, img, bbox): + # img: torch.Tensor of shape [1, H, W, C], dtype=torch.uint8 + # bbox: torch.Tensor of shape [1, 4], dtype=torch.float32 + img = self.letterbox(img, bbox) + return img + + +# ================================================================================================== + + +class PosePostprocess(nn.Module): + def __init__(self, target_size): + super(PosePostprocess, self).__init__() + + self.target_size = target_size + self.letterbox = 
Letterbox(target_size) + + def forward(self, img, bbox, keypoints): + _, paddings, scale, _, bbox = self.letterbox.calc_params_and_crop(img, bbox) + + kp = keypoints.float() + kp[:, :, 0] -= paddings[0] + kp[:, :, 1] -= paddings[2] + + zero = torch.tensor(0) + kp = torch.max(kp, zero) + + th, tw = self.target_size + pad_w = paddings[0] + paddings[1] + pad_h = paddings[2] + paddings[3] + max_w = tw - pad_w - 1 + max_h = th - pad_h - 1 + k0 = kp[:, :, 0] + k1 = kp[:, :, 1] + k0 = torch.min(k0, max_w) + k1 = torch.min(k1, max_h) + kp = torch.stack((k0, k1), dim=2) + + kp[:, :, 0:2] /= scale + + kp[:, :, 0] += bbox[0, 0] + kp[:, :, 1] += bbox[0, 1] + return kp + + +# ================================================================================================== + + def main(): img_path = "/RapidPoseTriangulation/scripts/../data/h1/54138969-img_003201.jpg" @@ -154,6 +246,45 @@ def main(): }, ) + # Initialize the PosePreprocess module + preprocess_model = PosePreprocess(target_size=pose_target_size) + det_dummy_input_c0 = torch.from_numpy(image).unsqueeze(0) + det_dummy_input_c1 = torch.tensor([[10, 10, 90, 40]]) + + # Export to ONNX + torch.onnx.export( + preprocess_model, + (det_dummy_input_c0, det_dummy_input_c1), + base_path + "pose_preprocess.onnx", + opset_version=11, + input_names=["input_image", "bbox"], + output_names=["preprocessed_image"], + dynamic_axes={ + "input_image": {0: "batch_size", 1: "height", 2: "width"}, + "preprocessed_image": {0: "batch_size"}, + }, + ) + + # Initialize the PosePostprocess module + postprocess_model = PosePostprocess(target_size=pose_target_size) + det_dummy_input_d0 = torch.from_numpy(image).unsqueeze(0) + det_dummy_input_d1 = torch.tensor([[10, 10, 90, 40]]) + det_dummy_input_d2 = torch.rand(1, 17, 3) + + # Export to ONNX + torch.onnx.export( + postprocess_model, + (det_dummy_input_d0, det_dummy_input_d1, det_dummy_input_d2), + base_path + "pose_postprocess.onnx", + opset_version=11, + input_names=["input_image", "bbox", "keypoints"], + output_names=["output_keypoints"], + dynamic_axes={ + "input_image": {0: "batch_size", 1: "height", 2: "width"}, + "output_keypoints": {0: "batch_size"}, + }, + ) + # ================================================================================================== diff --git a/media/RESULTS.md b/media/RESULTS.md index 577d94a..5aa26bf 100644 --- a/media/RESULTS.md +++ b/media/RESULTS.md @@ -6,9 +6,9 @@ Results of the model in various experiments on different datasets. ```json { - "avg_time_2d": 0.01254632634631658, - "avg_time_3d": 0.00036295955463991325, - "avg_fps": 77.4636186441503 + "avg_time_2d": 0.02244777841083074, + "avg_time_3d": 0.0003828315411583852, + "avg_fps": 43.800844659994496 } { "person_nums": { @@ -27,101 +27,101 @@ Results of the model in various experiments on different datasets. 
}, "mpjpe": { "count": 600, - "mean": 0.066275, - "median": 0.058426, - "std": 0.02768, - "sem": 0.001131, - "min": 0.040807, - "max": 0.188876, + "mean": 0.067837, + "median": 0.059973, + "std": 0.027729, + "sem": 0.001133, + "min": 0.044125, + "max": 0.191545, "recall-0.025": 0.0, - "recall-0.05": 0.083333, - "recall-0.1": 0.938333, + "recall-0.05": 0.035, + "recall-0.1": 0.931667, "recall-0.15": 0.95, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600, "ap-0.025": 0.0, - "ap-0.05": 0.011533, - "ap-0.1": 0.899113, - "ap-0.15": 0.915362, + "ap-0.05": 0.003097, + "ap-0.1": 0.889734, + "ap-0.15": 0.915055, "ap-0.25": 1.0, "ap-0.5": 1.0 }, "nose": { "count": 600, - "mean": 0.115024, - "median": 0.099737, - "std": 0.041, - "sem": 0.001675, - "min": 0.02644, - "max": 0.261025, + "mean": 0.116272, + "median": 0.09953, + "std": 0.042967, + "sem": 0.001756, + "min": 0.033845, + "max": 0.263303, "recall-0.025": 0.0, "recall-0.05": 0.008333, - "recall-0.1": 0.501667, - "recall-0.15": 0.808333, - "recall-0.25": 0.998333, + "recall-0.1": 0.503333, + "recall-0.15": 0.815, + "recall-0.25": 0.993333, "recall-0.5": 1.0, "num_labels": 600 }, "shoulder_left": { "count": 600, - "mean": 0.034317, - "median": 0.026768, - "std": 0.031799, - "sem": 0.001299, - "min": 0.001234, - "max": 0.178357, - "recall-0.025": 0.456667, + "mean": 0.034881, + "median": 0.027327, + "std": 0.031594, + "sem": 0.001291, + "min": 0.002162, + "max": 0.178271, + "recall-0.025": 0.438333, "recall-0.05": 0.863333, "recall-0.1": 0.946667, - "recall-0.15": 0.966667, + "recall-0.15": 0.963333, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "shoulder_right": { "count": 600, - "mean": 0.047981, - "median": 0.034263, - "std": 0.039767, - "sem": 0.001625, - "min": 0.005363, - "max": 0.24597, - "recall-0.025": 0.226667, - "recall-0.05": 0.743333, - "recall-0.1": 0.916667, - "recall-0.15": 0.948333, + "mean": 0.050288, + "median": 0.03555, + "std": 0.042274, + "sem": 0.001727, + "min": 0.003983, + "max": 0.238328, + "recall-0.025": 0.176667, + "recall-0.05": 0.748333, + "recall-0.1": 0.9, + "recall-0.15": 0.94, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "elbow_left": { "count": 600, - "mean": 0.043526, - "median": 0.034276, - "std": 0.034786, - "sem": 0.001421, - "min": 0.003312, - "max": 0.198715, - "recall-0.025": 0.24, - "recall-0.05": 0.781667, - "recall-0.1": 0.943333, - "recall-0.15": 0.958333, + "mean": 0.044326, + "median": 0.035816, + "std": 0.034654, + "sem": 0.001416, + "min": 0.001741, + "max": 0.198882, + "recall-0.025": 0.226667, + "recall-0.05": 0.776667, + "recall-0.1": 0.946667, + "recall-0.15": 0.955, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "elbow_right": { "count": 600, - "mean": 0.04376, - "median": 0.033219, - "std": 0.037126, - "sem": 0.001517, - "min": 0.006159, - "max": 0.314756, - "recall-0.025": 0.245, - "recall-0.05": 0.803333, - "recall-0.1": 0.933333, + "mean": 0.044545, + "median": 0.033152, + "std": 0.037755, + "sem": 0.001543, + "min": 0.008169, + "max": 0.338555, + "recall-0.025": 0.218333, + "recall-0.05": 0.798333, + "recall-0.1": 0.928333, "recall-0.15": 0.943333, "recall-0.25": 0.996667, "recall-0.5": 1.0, @@ -129,146 +129,146 @@ Results of the model in various experiments on different datasets. 
}, "wrist_left": { "count": 600, - "mean": 0.044151, - "median": 0.026578, - "std": 0.047109, - "sem": 0.001925, - "min": 0.002328, - "max": 0.288425, - "recall-0.025": 0.478333, - "recall-0.05": 0.736667, - "recall-0.1": 0.883333, - "recall-0.15": 0.918333, - "recall-0.25": 0.998333, + "mean": 0.044896, + "median": 0.025929, + "std": 0.048601, + "sem": 0.001986, + "min": 0.002701, + "max": 0.326901, + "recall-0.025": 0.476667, + "recall-0.05": 0.735, + "recall-0.1": 0.885, + "recall-0.15": 0.913333, + "recall-0.25": 0.996667, "recall-0.5": 1.0, "num_labels": 600 }, "wrist_right": { "count": 600, - "mean": 0.045218, - "median": 0.026994, - "std": 0.050213, - "sem": 0.002052, - "min": 0.002207, - "max": 0.291549, - "recall-0.025": 0.471667, - "recall-0.05": 0.766667, - "recall-0.1": 0.876667, - "recall-0.15": 0.908333, - "recall-0.25": 0.998333, + "mean": 0.045586, + "median": 0.027856, + "std": 0.048323, + "sem": 0.001974, + "min": 0.001841, + "max": 0.229728, + "recall-0.025": 0.436667, + "recall-0.05": 0.751667, + "recall-0.1": 0.881667, + "recall-0.15": 0.916667, + "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "hip_left": { "count": 600, - "mean": 0.085085, - "median": 0.079726, - "std": 0.032918, - "sem": 0.001345, - "min": 0.020039, - "max": 0.232252, - "recall-0.025": 0.006667, - "recall-0.05": 0.055, - "recall-0.1": 0.853333, - "recall-0.15": 0.95, + "mean": 0.087757, + "median": 0.083491, + "std": 0.032627, + "sem": 0.001333, + "min": 0.004177, + "max": 0.235198, + "recall-0.025": 0.008333, + "recall-0.05": 0.031667, + "recall-0.1": 0.851667, + "recall-0.15": 0.953333, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "hip_right": { "count": 600, - "mean": 0.108514, - "median": 0.106487, - "std": 0.025557, - "sem": 0.001044, - "min": 0.043182, - "max": 0.228959, + "mean": 0.112758, + "median": 0.11174, + "std": 0.025369, + "sem": 0.001037, + "min": 0.057593, + "max": 0.231402, "recall-0.025": 0.0, - "recall-0.05": 0.003333, - "recall-0.1": 0.358333, - "recall-0.15": 0.948333, + "recall-0.05": 0.0, + "recall-0.1": 0.251667, + "recall-0.15": 0.945, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "knee_left": { - "count": 600, - "mean": 0.063226, - "median": 0.046357, - "std": 0.059854, - "sem": 0.002446, - "min": 0.016702, - "max": 0.474089, - "recall-0.025": 0.035, - "recall-0.05": 0.563333, - "recall-0.1": 0.881667, + "count": 599, + "mean": 0.063316, + "median": 0.047979, + "std": 0.058253, + "sem": 0.002382, + "min": 0.019525, + "max": 0.476803, + "recall-0.025": 0.033333, + "recall-0.05": 0.546667, + "recall-0.1": 0.89, "recall-0.15": 0.925, - "recall-0.25": 0.98, - "recall-0.5": 1.0, + "recall-0.25": 0.978333, + "recall-0.5": 0.998333, "num_labels": 600 }, "knee_right": { "count": 600, - "mean": 0.050723, - "median": 0.041264, - "std": 0.036826, - "sem": 0.001505, - "min": 0.013747, - "max": 0.274, - "recall-0.025": 0.043333, - "recall-0.05": 0.745, - "recall-0.1": 0.945, + "mean": 0.050955, + "median": 0.041526, + "std": 0.037031, + "sem": 0.001513, + "min": 0.005291, + "max": 0.27011, + "recall-0.025": 0.035, + "recall-0.05": 0.746667, + "recall-0.1": 0.943333, "recall-0.15": 0.945, - "recall-0.25": 0.996667, + "recall-0.25": 0.998333, "recall-0.5": 1.0, "num_labels": 600 }, "ankle_left": { - "count": 600, - "mean": 0.097084, - "median": 0.085682, - "std": 0.046353, - "sem": 0.001894, - "min": 0.045955, - "max": 0.492226, + "count": 598, + "mean": 0.097897, + "median": 0.086817, + "std": 0.048343, + "sem": 0.001979, + "min": 0.048922, + 
"max": 0.493127, "recall-0.025": 0.0, - "recall-0.05": 0.001667, - "recall-0.1": 0.833333, + "recall-0.05": 0.003333, + "recall-0.1": 0.83, "recall-0.15": 0.933333, - "recall-0.25": 0.985, - "recall-0.5": 1.0, + "recall-0.25": 0.98, + "recall-0.5": 0.996667, "num_labels": 600 }, "ankle_right": { "count": 599, - "mean": 0.082224, - "median": 0.068812, - "std": 0.047465, - "sem": 0.001941, - "min": 0.029154, - "max": 0.404964, + "mean": 0.084814, + "median": 0.07029, + "std": 0.053839, + "sem": 0.002202, + "min": 0.025955, + "max": 0.384465, "recall-0.025": 0.0, - "recall-0.05": 0.026667, - "recall-0.1": 0.888333, - "recall-0.15": 0.91, - "recall-0.25": 0.985, + "recall-0.05": 0.02, + "recall-0.1": 0.886667, + "recall-0.15": 0.908333, + "recall-0.25": 0.973333, "recall-0.5": 0.998333, "num_labels": 600 }, "joint_recalls": { "num_labels": 7800, - "recall-0.025": 0.16923, - "recall-0.05": 0.46833, - "recall-0.1": 0.82692, - "recall-0.15": 0.92692, - "recall-0.25": 0.99487, - "recall-0.5": 0.99974 + "recall-0.025": 0.15718, + "recall-0.05": 0.46321, + "recall-0.1": 0.81846, + "recall-0.15": 0.92654, + "recall-0.25": 0.99308, + "recall-0.5": 0.99923 } } { "total_parts": 8400, - "correct_parts": 8097, - "pcp": 0.963929 + "correct_parts": 8077, + "pcp": 0.961548 } ``` diff --git a/scripts/utils_2d_pose_ort.py b/scripts/utils_2d_pose_ort.py index 2213970..2bba486 100644 --- a/scripts/utils_2d_pose_ort.py +++ b/scripts/utils_2d_pose_ort.py @@ -1,7 +1,6 @@ from abc import ABC, abstractmethod from typing import List -import cv2 import numpy as np import onnxruntime as ort from tqdm import tqdm @@ -16,12 +15,11 @@ class BaseModel(ABC): # ort.set_default_logger_severity(1) provider = "" - if "TensorrtExecutionProvider" in providers: - provider = "TensorrtExecutionProvider" - elif "CUDAExecutionProvider" in providers: + if "CUDAExecutionProvider" in providers: provider = "CUDAExecutionProvider" else: provider = "CPUExecutionProvider" + self.provider = provider print("Found providers:", providers) print("Using:", provider) @@ -29,18 +27,22 @@ class BaseModel(ABC): model_path, providers=[provider], sess_options=self.opt ) - self.input_name = self.session.get_inputs()[0].name - self.input_shape = self.session.get_inputs()[0].shape - if "batch_size" in self.input_shape: - self.input_shape = [1, 500, 500, 3] + self.input_names = [input.name for input in self.session.get_inputs()] + self.input_shapes = [input.shape for input in self.session.get_inputs()] - input_type = self.session.get_inputs()[0].type - if input_type == "tensor(float16)": - self.input_type = np.float16 - elif input_type == "tensor(uint8)": - self.input_type = np.uint8 - else: - self.input_type = np.float32 + input_types = [input.type for input in self.session.get_inputs()] + self.input_types = [] + for i in range(len(input_types)): + input_type = input_types[i] + if input_type == "tensor(float16)": + itype = np.float16 + elif input_type == "tensor(uint8)": + itype = np.uint8 + elif input_type == "tensor(int32)": + itype = np.int32 + else: + itype = np.float32 + self.input_types.append(itype) if warmup > 0: self.warmup(warmup) @@ -56,12 +58,51 @@ class BaseModel(ABC): def warmup(self, epoch: int): print("Running warmup for '{}' ...".format(self.__class__.__name__)) for _ in tqdm(range(epoch)): - tensor = np.random.random(self.input_shape).astype(self.input_type) - self.session.run(None, {self.input_name: tensor}) + inputs = {} + for i in range(len(self.input_names)): + iname = self.input_names[i] + + if "image" in iname: + ishape = 
self.input_shapes[i] + if "batch_size" in ishape: + if self.provider == "TensorrtExecutionProvider": + # Using different images sizes for TensorRT warmup takes too long + ishape = [1, 1000, 1000, 3] + else: + ishape = [ + 1, + np.random.randint(300, 1000), + np.random.randint(300, 1000), + 3, + ] + tensor = np.random.random(ishape) + tensor = tensor * 255 + elif "bbox" in iname: + tensor = np.array( + [ + [ + np.random.randint(30, 100), + np.random.randint(30, 100), + np.random.randint(200, 300), + np.random.randint(200, 300), + ] + ] + ) + else: + raise ValueError("Undefined input type") + + tensor = tensor.astype(self.input_types[i]) + inputs[iname] = tensor + + self.session.run(None, inputs) def __call__(self, image: np.ndarray, *args, **kwargs): tensor = self.preprocess(image, *args, **kwargs) - result = self.session.run(None, {self.input_name: tensor}) + inputs = {} + for i in range(len(self.input_names)): + iname = self.input_names[i] + inputs[iname] = tensor[i] + result = self.session.run(None, inputs) output = self.postprocess(result, *args, **kwargs) return output @@ -80,8 +121,9 @@ class RTMDet(BaseModel): self.conf_threshold = conf_threshold def preprocess(self, image: np.ndarray): - tensor = np.asarray(image).astype(self.input_type, copy=False) + tensor = np.asarray(image).astype(self.input_types[0], copy=False) tensor = np.expand_dims(tensor, axis=0) + tensor = [tensor] return tensor def postprocess(self, tensor: List[np.ndarray]): @@ -105,106 +147,19 @@ class RTMPose(BaseModel): super(RTMPose, self).__init__(model_path, warmup) self.bbox = None - def region_of_interest_warped( - self, - image: np.ndarray, - box: np.ndarray, - target_size: List[int], - padding_scale: float = 1.25, - ): - start_x, start_y, end_x, end_y = box[0:4] - target_w, target_h = target_size - - # Calculate original bounding box width and height - bbox_w = end_x - start_x - bbox_h = end_y - start_y - - if bbox_w <= 0 or bbox_h <= 0: - raise ValueError("Invalid bounding box!") - - # Calculate the aspect ratios - bbox_aspect = bbox_w / bbox_h - target_aspect = target_w / target_h - - # Adjust the scaled bounding box to match the target aspect ratio - if bbox_aspect > target_aspect: - adjusted_h = bbox_w / target_aspect - adjusted_w = bbox_w - else: - adjusted_w = bbox_h * target_aspect - adjusted_h = bbox_h - - # Scale the bounding box by the padding_scale - scaled_bbox_w = adjusted_w * padding_scale - scaled_bbox_h = adjusted_h * padding_scale - - # Calculate the center of the original box - center_x = (start_x + end_x) / 2.0 - center_y = (start_y + end_y) / 2.0 - - # Calculate scaled bounding box coordinates - new_start_x = center_x - scaled_bbox_w / 2.0 - new_start_y = center_y - scaled_bbox_h / 2.0 - new_end_x = center_x + scaled_bbox_w / 2.0 - new_end_y = center_y + scaled_bbox_h / 2.0 - - # Define the new box coordinates - new_box = np.array( - [new_start_x, new_start_y, new_end_x, new_end_y], dtype=np.float32 - ) - scale = target_w / scaled_bbox_w - - # Define source and destination points for affine transformation - # See: /mmpose/structures/bbox/transforms.py - src_pts = np.array( - [ - [center_x, center_y], - [new_start_x, center_y], - [new_start_x, center_y + (center_x - new_start_x)], - ], - dtype=np.float32, - ) - dst_pts = np.array( - [ - [target_w * 0.5, target_h * 0.5], - [0, target_h * 0.5], - [0, target_h * 0.5 + (target_w * 0.5 - 0)], - ], - dtype=np.float32, - ) - - # Compute the affine transformation matrix - M = cv2.getAffineTransform(src_pts, dst_pts) - - # Apply affine transformation 
with border filling - extracted_region = cv2.warpAffine( - image, - M, - target_size, - flags=cv2.INTER_LINEAR, - ) - - return extracted_region, new_box, scale - def preprocess(self, image: np.ndarray, bbox: np.ndarray): - th, tw = self.input_shape[1:3] - region, self.bbox, _ = self.region_of_interest_warped(image, bbox, (tw, th)) - tensor = np.asarray(region).astype(self.input_type, copy=False) + tensor = np.asarray(image).astype(self.input_types[0], copy=False) tensor = np.expand_dims(tensor, axis=0) + bbox = np.asarray(bbox)[0:4] + bbox += np.array([-0.5, -0.5, 0.5 - 1e-8, 0.5 - 1e-8]) + bbox = bbox.round().astype(np.int32) + bbox = np.expand_dims(bbox, axis=0) + tensor = [tensor, bbox] return tensor def postprocess(self, tensor: List[np.ndarray], **kwargs): - scores = np.clip(tensor[1][0], 0, 1) - kp = np.concatenate([tensor[0][0], np.expand_dims(scores, axis=-1)], axis=-1) - - # See: /mmpose/models/pose_estimators/topdown.py - add_pred_to_datasample() - th, tw = self.input_shape[1:3] - bw, bh = [self.bbox[2] - self.bbox[0], self.bbox[3] - self.bbox[1]] - kp[:, :2] /= np.array([tw, th]) - kp[:, :2] *= np.array([bw, bh]) - kp[:, :2] += np.array([self.bbox[0] + bw / 2, self.bbox[1] + bh / 2]) - kp[:, :2] -= 0.5 * np.array([bw, bh]) - + scores = np.clip(tensor[0][0], 0, 1) + kp = np.concatenate([tensor[1][0], np.expand_dims(scores, axis=-1)], axis=-1) return kp
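
For context, a minimal sketch (not part of the patch) of how the merged pose graph produced by `add_steps_to_onnx()` could be driven directly with ONNX Runtime after this change. The model file name and bounding box values are placeholders; input names are read from the session rather than hard-coded, since `onnx.compose.add_prefix()` renames them, and the output indexing follows `RTMPose.postprocess` above.

```python
# Hedged usage sketch. Assumes the merged pose model was written by add_steps_to_onnx()
# (placeholder file name) with a uint8 image input and an int32 bbox input, as set up
# in this patch.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession(
    "rtmpose_extra-steps.onnx",  # placeholder path to the merged graph
    providers=["CPUExecutionProvider"],
)

# Input order follows add_steps_to_onnx(): image first (uint8), bbox second (int32).
image_input, bbox_input = session.get_inputs()

image = np.zeros((1, 720, 1280, 3), dtype=np.uint8)     # [1, H, W, C] raw pixels
bbox = np.array([[100, 80, 400, 560]], dtype=np.int32)  # [1, 4] person box in image coords

outputs = session.run(None, {image_input.name: image, bbox_input.name: bbox})

# Following RTMPose.postprocess: outputs[0] holds the keypoint scores and outputs[1]
# the (x, y) coordinates, which the in-graph postprocess has already mapped back to
# the original image frame.
scores = np.clip(outputs[0][0], 0, 1)
keypoints = np.concatenate([outputs[1][0], scores[..., None]], axis=-1)
```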