diff --git a/extras/mmdeploy/add_extra_steps.py b/extras/mmdeploy/add_extra_steps.py
index 42d9547..8bd7a08 100644
--- a/extras/mmdeploy/add_extra_steps.py
+++ b/extras/mmdeploy/add_extra_steps.py
@@ -97,77 +97,13 @@ def add_steps_to_onnx(model_path):
     for i, j in enumerate([0, 3, 1, 2]):
         input_shape[j].dim_value = dims[i]

-    if "det" in model_path:
-        # Add preprocess model to main network
-        pp1_model = onnx.load(base_path + "det_preprocess.onnx")
-        model = compose.add_prefix(model, prefix="main_")
-        pp1_model = compose.add_prefix(pp1_model, prefix="preprocess_")
-        model = compose.merge_models(
-            pp1_model,
-            model,
-            io_map=[(pp1_model.graph.output[0].name, model.graph.input[0].name)],
-        )
-
-        # Add postprocess model
-        pp2_model = onnx.load(base_path + "det_postprocess.onnx")
-        pp2_model = compose.add_prefix(pp2_model, prefix="postprocess_")
-        model = compose.merge_models(
-            model,
-            pp2_model,
-            io_map=[
-                (model.graph.output[0].name, pp2_model.graph.input[1].name),
-            ],
-        )
-
-        # Update nodes from postprocess model to use the input of the main network
-        pp2_input_image_name = pp2_model.graph.input[0].name
-        main_input_image_name = model.graph.input[0].name
-        for node in model.graph.node:
-            for idx, name in enumerate(node.input):
-                if name == pp2_input_image_name:
-                    node.input[idx] = main_input_image_name
-        model.graph.input.pop(1)
-
-    if "pose" in model_path:
-        # Add preprocess model to main network
-        pp1_model = onnx.load(base_path + "pose_preprocess.onnx")
-        model = compose.add_prefix(model, prefix="main_")
-        pp1_model = compose.add_prefix(pp1_model, prefix="preprocess_")
-        model = compose.merge_models(
-            pp1_model,
-            model,
-            io_map=[
-                (pp1_model.graph.output[0].name, model.graph.input[0].name),
-            ],
-        )
-
-        # Add postprocess model
-        pp2_model = onnx.load(base_path + "pose_postprocess.onnx")
-        pp2_model = compose.add_prefix(pp2_model, prefix="postprocess_")
-        model = compose.merge_models(
-            model,
-            pp2_model,
-            io_map=[
-                (model.graph.output[0].name, pp2_model.graph.input[2].name),
-            ],
-        )
-
-        # Update nodes from postprocess model to use the input of the main network
-        pp2_input_image_name = pp2_model.graph.input[0].name
-        pp2_input_bbox_name = pp2_model.graph.input[1].name
-        main_input_image_name = model.graph.input[0].name
-        main_input_bbox_name = model.graph.input[1].name
-        for node in model.graph.node:
-            for idx, name in enumerate(node.input):
-                if name == pp2_input_image_name:
-                    node.input[idx] = main_input_image_name
-                if name == pp2_input_bbox_name:
-                    node.input[idx] = main_input_bbox_name
-        model.graph.input.pop(2)
-        model.graph.input.pop(2)
-
-        # Set input box type to int32
-        model.graph.input[1].type.tensor_type.elem_type = TensorProto.INT32
+    # Rename the input tensor
+    main_input_image_name = model.graph.input[0].name
+    for node in model.graph.node:
+        for idx, name in enumerate(node.input):
+            if name == main_input_image_name:
+                node.input[idx] = "image_input"
+    model.graph.input[0].name = "image_input"

     # Set input image type to int8
     model.graph.input[0].type.tensor_type.elem_type = TensorProto.UINT8
diff --git a/media/RESULTS.md b/media/RESULTS.md
index a5fce36..288b348 100644
--- a/media/RESULTS.md
+++ b/media/RESULTS.md
@@ -6,269 +6,269 @@ Results of the model in various experiments on different datasets.
 ```json
 {
-    "avg_time_2d": 0.01303539154893261,
-    "avg_time_3d": 0.00036579309883764233,
-    "avg_fps": 74.62026875112002
+    "avg_time_2d": 0.01109659348504018,
+    "avg_time_3d": 0.00034234281313621394,
+    "avg_fps": 87.4207158719313
 }
 {
     "person_nums": {
         "total_frames": 600,
         "total_labels": 600,
-        "total_preds": 600,
+        "total_preds": 601,
         "considered_empty": 0,
         "valid_preds": 600,
-        "invalid_preds": 0,
+        "invalid_preds": 1,
         "missing": 0,
-        "invalid_fraction": 0.0,
-        "precision": 1.0,
+        "invalid_fraction": 0.00166,
+        "precision": 0.99834,
         "recall": 1.0,
-        "f1": 1.0,
-        "non_empty": 600
+        "f1": 0.99917,
+        "non_empty": 601
     },
     "mpjpe": {
         "count": 600,
-        "mean": 0.06664,
-        "median": 0.05883,
-        "std": 0.027642,
-        "sem": 0.001129,
-        "min": 0.037832,
-        "max": 0.189745,
+        "mean": 0.06621,
+        "median": 0.058297,
+        "std": 0.027913,
+        "sem": 0.00114,
+        "min": 0.04047,
+        "max": 0.189061,
         "recall-0.025": 0.0,
-        "recall-0.05": 0.1,
+        "recall-0.05": 0.098333,
         "recall-0.1": 0.941667,
         "recall-0.15": 0.95,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600,
         "ap-0.025": 0.0,
-        "ap-0.05": 0.018725,
-        "ap-0.1": 0.902023,
-        "ap-0.15": 0.914628,
+        "ap-0.05": 0.018429,
+        "ap-0.1": 0.901756,
+        "ap-0.15": 0.913878,
         "ap-0.25": 1.0,
         "ap-0.5": 1.0
     },
     "nose": {
         "count": 600,
-        "mean": 0.114935,
-        "median": 0.099561,
-        "std": 0.042845,
-        "sem": 0.001751,
-        "min": 0.029831,
-        "max": 0.268342,
+        "mean": 0.113174,
+        "median": 0.098547,
+        "std": 0.041425,
+        "sem": 0.001693,
+        "min": 0.029421,
+        "max": 0.27266,
         "recall-0.025": 0.0,
-        "recall-0.05": 0.015,
-        "recall-0.1": 0.506667,
-        "recall-0.15": 0.803333,
-        "recall-0.25": 0.995,
+        "recall-0.05": 0.01,
+        "recall-0.1": 0.515,
+        "recall-0.15": 0.81,
+        "recall-0.25": 0.991667,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "shoulder_left": {
         "count": 600,
-        "mean": 0.036888,
-        "median": 0.028719,
-        "std": 0.031747,
-        "sem": 0.001297,
-        "min": 0.004721,
-        "max": 0.182985,
-        "recall-0.025": 0.401667,
-        "recall-0.05": 0.833333,
-        "recall-0.1": 0.948333,
-        "recall-0.15": 0.963333,
+        "mean": 0.034727,
+        "median": 0.026049,
+        "std": 0.031822,
+        "sem": 0.0013,
+        "min": 0.002176,
+        "max": 0.183422,
+        "recall-0.025": 0.471667,
+        "recall-0.05": 0.855,
+        "recall-0.1": 0.95,
+        "recall-0.15": 0.965,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "shoulder_right": {
         "count": 600,
-        "mean": 0.050032,
-        "median": 0.036552,
-        "std": 0.040712,
-        "sem": 0.001663,
-        "min": 0.006749,
-        "max": 0.239156,
-        "recall-0.025": 0.201667,
-        "recall-0.05": 0.708333,
-        "recall-0.1": 0.915,
-        "recall-0.15": 0.945,
+        "mean": 0.04794,
+        "median": 0.034508,
+        "std": 0.039316,
+        "sem": 0.001606,
+        "min": 0.004604,
+        "max": 0.218143,
+        "recall-0.025": 0.211667,
+        "recall-0.05": 0.76,
+        "recall-0.1": 0.918333,
+        "recall-0.15": 0.946667,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "elbow_left": {
         "count": 600,
-        "mean": 0.045586,
-        "median": 0.037313,
-        "std": 0.034633,
-        "sem": 0.001415,
-        "min": 0.003768,
-        "max": 0.200457,
-        "recall-0.025": 0.216667,
-        "recall-0.05": 0.746667,
-        "recall-0.1": 0.946667,
-        "recall-0.15": 0.955,
+        "mean": 0.044638,
+        "median": 0.036326,
+        "std": 0.034761,
+        "sem": 0.00142,
+        "min": 0.003696,
+        "max": 0.196813,
+        "recall-0.025": 0.226667,
+        "recall-0.05": 0.778333,
+        "recall-0.1": 0.941667,
+        "recall-0.15": 0.953333,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "elbow_right": {
         "count": 600,
-        "mean": 0.04539,
-        "median": 0.035591,
-        "std": 0.036356,
-        "sem": 0.001485,
-        "min": 0.007803,
-        "max": 0.281955,
-        "recall-0.025": 0.245,
-        "recall-0.05": 0.773333,
-        "recall-0.1": 0.923333,
-        "recall-0.15": 0.941667,
+        "mean": 0.044037,
+        "median": 0.033739,
+        "std": 0.036263,
+        "sem": 0.001482,
+        "min": 0.007995,
+        "max": 0.351118,
+        "recall-0.025": 0.251667,
+        "recall-0.05": 0.788333,
+        "recall-0.1": 0.931667,
+        "recall-0.15": 0.945,
         "recall-0.25": 0.998333,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "wrist_left": {
         "count": 600,
-        "mean": 0.046389,
-        "median": 0.029742,
-        "std": 0.04752,
-        "sem": 0.001942,
-        "min": 0.00236,
-        "max": 0.287479,
-        "recall-0.025": 0.426667,
-        "recall-0.05": 0.728333,
-        "recall-0.1": 0.888333,
-        "recall-0.15": 0.91,
-        "recall-0.25": 0.996667,
+        "mean": 0.043333,
+        "median": 0.027284,
+        "std": 0.044655,
+        "sem": 0.001825,
+        "min": 0.002741,
+        "max": 0.185438,
+        "recall-0.025": 0.458333,
+        "recall-0.05": 0.745,
+        "recall-0.1": 0.891667,
+        "recall-0.15": 0.923333,
+        "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "wrist_right": {
         "count": 600,
-        "mean": 0.046403,
-        "median": 0.028916,
-        "std": 0.046566,
-        "sem": 0.001903,
-        "min": 0.002735,
-        "max": 0.236808,
-        "recall-0.025": 0.428333,
-        "recall-0.05": 0.731667,
-        "recall-0.1": 0.87,
-        "recall-0.15": 0.926667,
-        "recall-0.25": 1.0,
+        "mean": 0.047488,
+        "median": 0.027367,
+        "std": 0.053442,
+        "sem": 0.002184,
+        "min": 0.001357,
+        "max": 0.465438,
+        "recall-0.025": 0.446667,
+        "recall-0.05": 0.738333,
+        "recall-0.1": 0.868333,
+        "recall-0.15": 0.898333,
+        "recall-0.25": 0.998333,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "hip_left": {
         "count": 600,
-        "mean": 0.079732,
-        "median": 0.072175,
-        "std": 0.034532,
-        "sem": 0.001411,
-        "min": 0.013963,
-        "max": 0.24229,
-        "recall-0.025": 0.013333,
-        "recall-0.05": 0.081667,
-        "recall-0.1": 0.875,
-        "recall-0.15": 0.945,
+        "mean": 0.084262,
+        "median": 0.078071,
+        "std": 0.032944,
+        "sem": 0.001346,
+        "min": 0.022541,
+        "max": 0.239428,
+        "recall-0.025": 0.003333,
+        "recall-0.05": 0.055,
+        "recall-0.1": 0.851667,
+        "recall-0.15": 0.951667,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "hip_right": {
         "count": 600,
-        "mean": 0.101424,
-        "median": 0.099206,
-        "std": 0.02636,
-        "sem": 0.001077,
-        "min": 0.032964,
-        "max": 0.226018,
+        "mean": 0.106676,
+        "median": 0.103778,
+        "std": 0.025796,
+        "sem": 0.001054,
+        "min": 0.042573,
+        "max": 0.242475,
         "recall-0.025": 0.0,
-        "recall-0.05": 0.008333,
-        "recall-0.1": 0.52,
-        "recall-0.15": 0.946667,
+        "recall-0.05": 0.003333,
+        "recall-0.1": 0.421667,
+        "recall-0.15": 0.948333,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "knee_left": {
-        "count": 600,
-        "mean": 0.06299,
-        "median": 0.047078,
-        "std": 0.055676,
-        "sem": 0.002275,
-        "min": 0.013748,
-        "max": 0.412425,
-        "recall-0.025": 0.03,
-        "recall-0.05": 0.548333,
-        "recall-0.1": 0.89,
-        "recall-0.15": 0.926667,
-        "recall-0.25": 0.983333,
-        "recall-0.5": 1.0,
+        "count": 598,
+        "mean": 0.062386,
+        "median": 0.046647,
+        "std": 0.055624,
+        "sem": 0.002277,
+        "min": 0.012414,
+        "max": 0.399633,
+        "recall-0.025": 0.045,
+        "recall-0.05": 0.555,
+        "recall-0.1": 0.885,
+        "recall-0.15": 0.925,
+        "recall-0.25": 0.978333,
+        "recall-0.5": 0.996667,
         "num_labels": 600
     },
     "knee_right": {
         "count": 600,
-        "mean": 0.053303,
-        "median": 0.039785,
-        "std": 0.048089,
-        "sem": 0.001965,
-        "min": 0.009094,
-        "max": 0.470447,
-        "recall-0.025": 0.06,
-        "recall-0.05": 0.736667,
-        "recall-0.1": 0.923333,
-        "recall-0.15": 0.926667,
-        "recall-0.25": 0.988333,
+        "mean": 0.050939,
+        "median": 0.041387,
+        "std": 0.037661,
+        "sem": 0.001539,
+        "min": 0.006788,
+        "max": 0.268559,
+        "recall-0.025": 0.045,
+        "recall-0.05": 0.73,
+        "recall-0.1": 0.941667,
+        "recall-0.15": 0.943333,
+        "recall-0.25": 0.996667,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "ankle_left": {
         "count": 600,
-        "mean": 0.097848,
-        "median": 0.087393,
-        "std": 0.039465,
-        "sem": 0.001613,
-        "min": 0.049149,
-        "max": 0.49791,
+        "mean": 0.096519,
+        "median": 0.085325,
+        "std": 0.043518,
+        "sem": 0.001778,
+        "min": 0.049769,
+        "max": 0.494823,
         "recall-0.025": 0.0,
-        "recall-0.05": 0.005,
-        "recall-0.1": 0.805,
-        "recall-0.15": 0.923333,
-        "recall-0.25": 0.99,
+        "recall-0.05": 0.001667,
+        "recall-0.1": 0.828333,
+        "recall-0.15": 0.935,
+        "recall-0.25": 0.988333,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "ankle_right": {
         "count": 600,
-        "mean": 0.085394,
-        "median": 0.070638,
-        "std": 0.050932,
-        "sem": 0.002081,
-        "min": 0.027674,
-        "max": 0.441898,
+        "mean": 0.082453,
+        "median": 0.068627,
+        "std": 0.050525,
+        "sem": 0.002064,
+        "min": 0.026098,
+        "max": 0.482397,
         "recall-0.025": 0.0,
-        "recall-0.05": 0.023333,
-        "recall-0.1": 0.876667,
-        "recall-0.15": 0.9,
-        "recall-0.25": 0.983333,
+        "recall-0.05": 0.035,
+        "recall-0.1": 0.896667,
+        "recall-0.15": 0.915,
+        "recall-0.25": 0.981667,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "joint_recalls": {
         "num_labels": 7800,
-        "recall-0.025": 0.15538,
-        "recall-0.05": 0.45603,
-        "recall-0.1": 0.83705,
-        "recall-0.15": 0.92372,
-        "recall-0.25": 0.99449,
-        "recall-0.5": 1.0
+        "recall-0.025": 0.1659,
+        "recall-0.05": 0.46526,
+        "recall-0.1": 0.83359,
+        "recall-0.15": 0.92705,
+        "recall-0.25": 0.99436,
+        "recall-0.5": 0.99974
     }
 }
 {
     "total_parts": 8400,
-    "correct_parts": 8090,
-    "pcp": 0.963095
+    "correct_parts": 8113,
+    "pcp": 0.965833
 }
 ```
diff --git a/scripts/utils_2d_pose_ort.py b/scripts/utils_2d_pose_ort.py
index 72cb80d..33640d4 100644
--- a/scripts/utils_2d_pose_ort.py
+++ b/scripts/utils_2d_pose_ort.py
@@ -1,6 +1,8 @@
+import math
 from abc import ABC, abstractmethod
 from typing import List

+import cv2
 import numpy as np
 import onnxruntime as ort
 from tqdm import tqdm
@@ -49,11 +51,11 @@ class BaseModel(ABC):
         self.warmup(warmup)

     @abstractmethod
-    def preprocess(self, image: np.ndarray, *args, **kwargs):
+    def preprocess(self, **kwargs):
         pass

     @abstractmethod
-    def postprocess(self, tensor: List[np.ndarray], *args, **kwargs):
+    def postprocess(self, **kwargs):
         pass

     def warmup(self, epoch: int):
@@ -97,20 +99,178 @@ class BaseModel(ABC):
         self.session.run(None, inputs)

-    def __call__(self, image: np.ndarray, *args, **kwargs):
-        tensor = self.preprocess(image, *args, **kwargs)
+    def __call__(self, **kwargs):
+        tensor = self.preprocess(**kwargs)
         inputs = {}
         for i in range(len(self.input_names)):
             iname = self.input_names[i]
             inputs[iname] = tensor[i]
         result = self.session.run(None, inputs)

-        output = self.postprocess(result, *args, **kwargs)
+        output = self.postprocess(result=result, **kwargs)
         return output


 # ==================================================================================================


+class LetterBox:
+    def __init__(self, target_size, fill_value=0):
+        self.target_size = target_size
+        self.fill_value = fill_value
+
+    def calc_params(self, ishape):
+        img_h, img_w = ishape[:2]
+        target_h, target_w = self.target_size
+
+        scale = min(target_w / img_w, target_h / img_h)
+        new_w = round(img_w * scale)
+        new_h = round(img_h * scale)
+
+        pad_w = target_w - new_w
+        pad_h = target_h - new_h
+        pad_left = pad_w // 2
+        pad_top = pad_h // 2
+        pad_right = pad_w - pad_left
+        pad_bottom = pad_h - pad_top
+        paddings = (pad_left, pad_right, pad_top, pad_bottom)
+
+        return paddings, scale, (new_w, new_h)
+
+    def resize_image(self, image):
+        paddings, _, new_size = self.calc_params(image.shape)
+
+        target_h, target_w = self.target_size
+        canvas = np.full(
+            (target_h, target_w, image.shape[2]),
+            self.fill_value,
+            dtype=image.dtype,
+        )
+
+        new_w, new_h = new_size
+        dx, dy = paddings[0], paddings[2]
+        canvas[dy : dy + new_h, dx : dx + new_w, :] = cv2.resize(
+            image, (new_w, new_h), interpolation=cv2.INTER_LINEAR
+        )
+
+        return canvas
+
+
+# ==================================================================================================
+
+
+class BoxCrop:
+    def __init__(self, target_size, padding_scale=1.0, fill_value=0):
+        self.target_size = target_size
+        self.padding_scale = padding_scale
+        self.fill_value = fill_value
+
+    def calc_params(self, ishape, bbox):
+        start_x, start_y, end_x, end_y = bbox[0], bbox[1], bbox[2], bbox[3]
+        target_h, target_w = self.target_size
+
+        # Calculate original bounding box center
+        center_x = (start_x + end_x) / 2.0
+        center_y = (start_y + end_y) / 2.0
+
+        # Scale the bounding box by the padding_scale
+        bbox_w = end_x - start_x
+        bbox_h = end_y - start_y
+        scaled_w = bbox_w * self.padding_scale
+        scaled_h = bbox_h * self.padding_scale
+
+        # Calculate the aspect ratios
+        bbox_aspect = scaled_w / scaled_h
+        target_aspect = target_w / target_h
+
+        # Adjust the scaled bounding box to match the target aspect ratio
+        if bbox_aspect > target_aspect:
+            adjusted_h = scaled_w / target_aspect
+            adjusted_w = scaled_w
+        else:
+            adjusted_w = scaled_h * target_aspect
+            adjusted_h = scaled_h
+
+        # Calculate scaled bounding box coordinates
+        bbox_w = adjusted_w
+        bbox_h = adjusted_h
+        new_start_x = center_x - bbox_w / 2.0
+        new_start_y = center_y - bbox_h / 2.0
+        new_end_x = center_x + bbox_w / 2.0
+        new_end_y = center_y + bbox_h / 2.0
+
+        # Round the box coordinates
+        start_x = int(math.floor(new_start_x))
+        start_y = int(math.floor(new_start_y))
+        end_x = int(math.ceil(new_end_x))
+        end_y = int(math.ceil(new_end_y))
+
+        # Define the new box coordinates
+        new_start_x = max(0, start_x)
+        new_start_y = max(0, start_y)
+        new_end_x = min(ishape[1] - 1, end_x)
+        new_end_y = min(ishape[0] - 1, end_y)
+        new_box = [new_start_x, new_start_y, new_end_x, new_end_y]
+
+        bbox_w = new_box[2] - new_box[0]
+        bbox_h = new_box[3] - new_box[1]
+        scale = min(target_w / bbox_w, target_h / bbox_h)
+        new_w = round(bbox_w * scale)
+        new_h = round(bbox_h * scale)
+
+        # Calculate paddings
+        pad_w = target_w - new_w
+        pad_h = target_h - new_h
+        pad_left, pad_right, pad_top, pad_bottom = 0, 0, 0, 0
+        if pad_w > 0:
+            if start_x < 0:
+                pad_left = pad_w
+                pad_right = 0
+            elif end_x > ishape[1]:
+                pad_left = 0
+                pad_right = pad_w
+            else:
+                # Can be caused by bbox rounding
+                pad_left = pad_w // 2
+                pad_right = pad_w - pad_left
+        if pad_h > 0:
+            if start_y < 0:
+                pad_top = pad_h
+                pad_bottom = 0
+            elif end_y > ishape[0]:
+                pad_top = 0
+                pad_bottom = pad_h
+            else:
+                # Can be caused by bbox rounding
+                pad_top = pad_h // 2
+                pad_bottom = pad_h - pad_top
+        paddings = (pad_left, pad_right, pad_top, pad_bottom)
+
+        return paddings, scale, new_box, (new_w, new_h)
+
+    def crop_resize_box(self, image, bbox):
+        paddings, _, new_box, new_size = self.calc_params(image.shape, bbox)
+
+        image = image[new_box[1] : new_box[3], new_box[0] : new_box[2]]
+
+        th, tw = self.target_size
+        canvas = np.full(
+            (th, tw, image.shape[2]),
+            self.fill_value,
+            dtype=image.dtype,
+        )
+
+        nw, nh = new_size
+        dx, dy = paddings[0], paddings[2]
+        canvas[dy : dy + nh, dx : dx + nw, :] = cv2.resize(
+            image, (nw, nh), interpolation=cv2.INTER_LINEAR
+        )
+
+        return canvas
+
+
+# ==================================================================================================
+
+
 class RTMDet(BaseModel):
     def __init__(
         self,
@@ -119,17 +279,20 @@ class RTMDet(BaseModel):
         warmup: int = 30,
     ):
         super(RTMDet, self).__init__(model_path, warmup)
+        self.target_size = (320, 320)
         self.conf_threshold = conf_threshold
+        self.letterbox = LetterBox(self.target_size, fill_value=114)

     def preprocess(self, image: np.ndarray):
+        image = self.letterbox.resize_image(image)
         tensor = np.asarray(image).astype(self.input_types[0], copy=False)
         tensor = np.expand_dims(tensor, axis=0)
         tensor = [tensor]
         return tensor

-    def postprocess(self, tensor: List[np.ndarray]):
-        boxes = np.squeeze(tensor[1], axis=0)
-        classes = np.squeeze(tensor[0], axis=0)
+    def postprocess(self, result: List[np.ndarray], image: np.ndarray):
+        boxes = np.squeeze(result[0], axis=0)
+        classes = np.squeeze(result[1], axis=0)

         human_class = classes[:] == 0
         boxes = boxes[human_class]
@@ -137,6 +300,35 @@ class RTMDet(BaseModel):
         keep = boxes[:, 4] > self.conf_threshold
         boxes = boxes[keep]

+        paddings, scale, _ = self.letterbox.calc_params(image.shape)
+
+        boxes[:, 0] -= paddings[0]
+        boxes[:, 2] -= paddings[0]
+        boxes[:, 1] -= paddings[2]
+        boxes[:, 3] -= paddings[2]
+
+        boxes = np.maximum(boxes, 0)
+
+        th, tw = self.target_size
+        pad_w = paddings[0] + paddings[1]
+        pad_h = paddings[2] + paddings[3]
+        max_w = tw - pad_w - 1
+        max_h = th - pad_h - 1
+        b0 = boxes[:, 0]
+        b1 = boxes[:, 1]
+        b2 = boxes[:, 2]
+        b3 = boxes[:, 3]
+        b0 = np.minimum(b0, max_w)
+        b1 = np.minimum(b1, max_h)
+        b2 = np.minimum(b2, max_w)
+        b3 = np.minimum(b3, max_h)
+        boxes[:, 0] = b0
+        boxes[:, 1] = b1
+        boxes[:, 2] = b2
+        boxes[:, 3] = b3
+
+        boxes[:, 0:4] /= scale
+
         return boxes


@@ -146,7 +338,8 @@ class RTMDet(BaseModel):
 class RTMPose(BaseModel):
     def __init__(self, model_path: str, warmup: int = 30):
         super(RTMPose, self).__init__(model_path, warmup)
-        self.bbox = None
+        self.target_size = (384, 288)
+        self.boxcrop = BoxCrop(self.target_size, padding_scale=1.25, fill_value=0)

     def preprocess(self, image: np.ndarray, bbox: np.ndarray):
         tensor = np.asarray(image).astype(self.input_types[0], copy=False)
@@ -154,13 +347,34 @@ class RTMPose(BaseModel):
         bbox = np.asarray(bbox)[0:4]
         bbox += np.array([-0.5, -0.5, 0.5 - 1e-8, 0.5 - 1e-8])
         bbox = bbox.round().astype(np.int32)
-        bbox = np.expand_dims(bbox, axis=0)
-        tensor = [tensor, bbox]
+        region = self.boxcrop.crop_resize_box(image, bbox)
+        tensor = np.asarray(region).astype(self.input_types[0], copy=False)
+        tensor = np.expand_dims(tensor, axis=0)
+        tensor = [tensor]
         return tensor

-    def postprocess(self, tensor: List[np.ndarray], **kwargs):
-        scores = np.clip(tensor[0][0], 0, 1)
-        kp = np.concatenate([tensor[1][0], np.expand_dims(scores, axis=-1)], axis=-1)
+    def postprocess(
+        self, result: List[np.ndarray], image: np.ndarray, bbox: np.ndarray
+    ):
+        scores = np.clip(result[1][0], 0, 1)
+        kp = np.concatenate([result[0][0], np.expand_dims(scores, axis=-1)], axis=-1)
+
+        paddings, scale, bbox, _ = self.boxcrop.calc_params(image.shape, bbox)
+        kp[:, 0] -= paddings[0]
+        kp[:, 1] -= paddings[2]
+        kp[:, 0:2] /= scale
+        kp[:, 0] += bbox[0]
+        kp[:, 1] += bbox[1]
+        kp[:, 0:2] = np.maximum(kp[:, 0:2], 0)
+        max_w = image.shape[1] - 1
+        max_h = image.shape[0] - 1
+        b0 = kp[:, 0]
+        b1 = kp[:, 1]
+        b0 = np.minimum(b0, max_w)
+        b1 = np.minimum(b1, max_h)
+        kp[:, 0] = b0
+        kp[:, 1] = b1
+
         return kp


@@ -184,10 +398,10 @@ class TopDown:
         self.pose_model = RTMPose(pose_model_path, warmup)

     def predict(self, image):
-        boxes = self.det_model(image)
+        boxes = self.det_model(image=image)

         results = []
         for i in range(boxes.shape[0]):
-            kp = self.pose_model(image, bbox=boxes[i])
+            kp = self.pose_model(image=image, bbox=boxes[i])
             results.append(kp)
         return results
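
For reference, the refactored classes above are now driven purely by keyword arguments (`det_model(image=image)`, `pose_model(image=image, bbox=...)`). Below is a minimal usage sketch of `TopDown` under that convention; the import path, the model file names, and the `det_model_path` parameter name are assumptions for illustration and are not part of the diff:

```python
import cv2

from utils_2d_pose_ort import TopDown  # assumed import path for scripts/utils_2d_pose_ort.py

# Hypothetical ONNX files patched by add_extra_steps.py (UINT8 "image_input")
pipeline = TopDown(
    det_model_path="models/rtmdet.onnx",   # assumed parameter name
    pose_model_path="models/rtmpose.onnx",
    warmup=10,
)

image = cv2.imread("example.jpg")  # HWC uint8 image, matching the UINT8 graph input
results = pipeline.predict(image)  # one (num_keypoints, 3) array of [x, y, score] per detected person
for kp in results:
    print(kp.shape)
```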