diff --git a/extras/mmdeploy/make_extra_graphs.py b/extras/mmdeploy/make_extra_graphs.py index 0afcaf8..0a920f7 100644 --- a/extras/mmdeploy/make_extra_graphs.py +++ b/extras/mmdeploy/make_extra_graphs.py @@ -2,6 +2,7 @@ import cv2 import torch import torch.nn as nn import torch.nn.functional as F +from torchvision.ops import roi_align # ================================================================================================== @@ -20,35 +21,10 @@ class Letterbox(nn.Module): self.target_size = target_size self.fill_value = fill_value - def calc_params_and_crop(self, ishape, bbox=None): - ih0, iw0 = ishape[1], ishape[2] + def calc_params(self, ishape): + ih, iw = ishape[1], ishape[2] th, tw = self.target_size - if bbox is not None: - bbox = bbox[0].float() - x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3] - - # Slightly increase bbox size - factor = 1.25 - w = x2 - x1 - h = y2 - y1 - x1 -= w * (factor - 1) / 2 - x2 += w * (factor - 1) / 2 - y1 -= h * (factor - 1) / 2 - y2 += h * (factor - 1) / 2 - - zero = torch.tensor(0) - x1 = torch.max(x1, zero).to(torch.int64) - y1 = torch.max(y1, zero).to(torch.int64) - x2 = torch.min(x2, iw0).to(torch.int64) - y2 = torch.min(y2, ih0).to(torch.int64) - bbox = torch.stack((x1, y1, x2, y2), dim=0).unsqueeze(0) - - ih = y2 - y1 - iw = x2 - x1 - else: - ih, iw = ih0, iw0 - scale = torch.min(tw / iw, th / ih) nw = torch.round(iw * scale) nh = torch.round(ih * scale) @@ -61,21 +37,16 @@ class Letterbox(nn.Module): pad_bottom = pad_h - pad_top paddings = (pad_left, pad_right, pad_top, pad_bottom) - return paddings, scale, (nw, nh), bbox + return paddings, scale, (nw, nh) - def forward(self, img, bbox=None): - paddings, _, (nw, nh), bbox = self.calc_params_and_crop(img.shape, bbox) - - # Optional: Crop the image - if bbox is not None: - x1, y1, x2, y2 = bbox[0, 0], bbox[0, 1], bbox[0, 2], bbox[0, 3] - img = img.to(torch.float32) - img = img[:, y1:y2, x1:x2, :] + def forward(self, img): + paddings, _, (nw, nh) = self.calc_params(img.shape) # Resize the image img = img.to(torch.float32) + img = img.permute(0, 3, 1, 2) img = F.interpolate( - img.permute(0, 3, 1, 2), + img, size=(nh, nw), mode="bilinear", align_corners=False, @@ -91,9 +62,82 @@ class Letterbox(nn.Module): value=self.fill_value, ) img = img.permute(0, 2, 3, 1) - canvas = img - return canvas + return img + + +# ================================================================================================== + + +class BoxCrop(nn.Module): + def __init__(self, target_size): + """Crop bounding box from image""" + super(BoxCrop, self).__init__() + + self.target_size = target_size + self.padding_scale = 1.25 + + def calc_params(self, bbox): + start_x, start_y, end_x, end_y = bbox[0, 0], bbox[0, 1], bbox[0, 2], bbox[0, 3] + target_h, target_w = self.target_size + + # Calculate original bounding box width, height and center + bbox_w = end_x - start_x + bbox_h = end_y - start_y + center_x = (start_x + end_x) / 2.0 + center_y = (start_y + end_y) / 2.0 + + # Calculate the aspect ratios + bbox_aspect = bbox_w / bbox_h + target_aspect = target_w / target_h + + # Adjust the scaled bounding box to match the target aspect ratio + if bbox_aspect > target_aspect: + adjusted_h = bbox_w / target_aspect + adjusted_w = bbox_w + else: + adjusted_w = bbox_h * target_aspect + adjusted_h = bbox_h + + # Scale the bounding box by the padding_scale + scaled_bbox_w = adjusted_w * self.padding_scale + scaled_bbox_h = adjusted_h * self.padding_scale + + # Calculate scaled bounding box coordinates + new_start_x = center_x - scaled_bbox_w / 2.0 + new_start_y = center_y - scaled_bbox_h / 2.0 + new_end_x = center_x + scaled_bbox_w / 2.0 + new_end_y = center_y + scaled_bbox_h / 2.0 + + # Define the new box coordinates + new_box = torch.stack((new_start_x, new_start_y, new_end_x, new_end_y), dim=0) + new_box = new_box.unsqueeze(0) + scale = torch.stack( + ((target_w / scaled_bbox_w), (target_h / scaled_bbox_h)), dim=0 + ) + + return scale, new_box + + def forward(self, img, bbox): + _, bbox = self.calc_params(bbox) + + batch_indices = torch.zeros(bbox.shape[0], 1) + rois = torch.cat([batch_indices, bbox], dim=1) + + # Resize and crop + img = img.to(torch.float32) + img = img.permute(0, 3, 1, 2) + img = roi_align( + img, + rois, + output_size=self.target_size, + spatial_scale=1.0, + sampling_ratio=0, + ) + img = img.permute(0, 2, 3, 1) + img = img.round() + + return img # ================================================================================================== @@ -106,7 +150,7 @@ class DetPreprocess(nn.Module): def forward(self, img): # img: torch.Tensor of shape [batch, H, W, C], dtype=torch.uint8 - img = self.letterbox(img, None) + img = self.letterbox(img) return img @@ -121,7 +165,7 @@ class DetPostprocess(nn.Module): self.letterbox = Letterbox(target_size) def forward(self, img, boxes): - paddings, scale, _, _ = self.letterbox.calc_params_and_crop(img.shape, None) + paddings, scale, _ = self.letterbox.calc_params(img.shape) boxes = boxes.float() boxes[:, :, 0] -= paddings[0] @@ -160,12 +204,12 @@ class DetPostprocess(nn.Module): class PosePreprocess(nn.Module): def __init__(self, target_size, fill_value=114): super(PosePreprocess, self).__init__() - self.letterbox = Letterbox(target_size, fill_value) + self.boxcrop = BoxCrop(target_size) def forward(self, img, bbox): # img: torch.Tensor of shape [1, H, W, C], dtype=torch.uint8 # bbox: torch.Tensor of shape [1, 4], dtype=torch.float32 - img = self.letterbox(img, bbox) + img = self.boxcrop(img, bbox) return img @@ -175,25 +219,22 @@ class PosePreprocess(nn.Module): class PosePostprocess(nn.Module): def __init__(self, target_size): super(PosePostprocess, self).__init__() - + self.boxcrop = BoxCrop(target_size) self.target_size = target_size - self.letterbox = Letterbox(target_size) def forward(self, img, bbox, keypoints): - paddings, scale, _, bbox = self.letterbox.calc_params_and_crop(img.shape, bbox) + scale, bbox = self.boxcrop.calc_params(bbox) kp = keypoints.float() - kp[:, :, 0] -= paddings[0] - kp[:, :, 1] -= paddings[2] + kp[:, :, 0:2] /= scale + kp[:, :, 0] += bbox[0, 0] + kp[:, :, 1] += bbox[0, 1] zero = torch.tensor(0) kp = torch.max(kp, zero) - th, tw = self.target_size - pad_w = paddings[0] + paddings[1] - pad_h = paddings[2] + paddings[3] - max_w = tw - pad_w - 1 - max_h = th - pad_h - 1 + max_w = img.shape[2] - 1 + max_h = img.shape[1] - 1 k0 = kp[:, :, 0] k1 = kp[:, :, 1] k0 = torch.min(k0, max_w) @@ -201,10 +242,6 @@ class PosePostprocess(nn.Module): kp[:, :, 0] = k0 kp[:, :, 1] = k1 - kp[:, :, 0:2] /= scale - - kp[:, :, 0] += bbox[0, 0] - kp[:, :, 1] += bbox[0, 1] return kp @@ -215,6 +252,7 @@ def main(): img_path = "/RapidPoseTriangulation/scripts/../data/h1/54138969-img_003201.jpg" image = cv2.imread(img_path, 3) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # Initialize the DetPreprocess module preprocess_model = DetPreprocess(target_size=det_target_size) @@ -257,7 +295,7 @@ def main(): # Initialize the PosePreprocess module preprocess_model = PosePreprocess(target_size=pose_target_size) det_dummy_input_c0 = torch.from_numpy(image).unsqueeze(0) - det_dummy_input_c1 = torch.tensor([[10, 10, 90, 40]]).to(torch.int32) + det_dummy_input_c1 = torch.tensor([[352, 339, 518, 594]]).to(torch.int32) # Export to ONNX torch.onnx.export( @@ -276,8 +314,8 @@ def main(): # Initialize the PosePostprocess module postprocess_model = PosePostprocess(target_size=pose_target_size) det_dummy_input_d0 = torch.from_numpy(image).unsqueeze(0) - det_dummy_input_d1 = torch.tensor([[10, 10, 90, 40]]).to(torch.int32) - det_dummy_input_d2 = torch.rand(1, 17, 3) + det_dummy_input_d1 = torch.tensor([[352, 339, 518, 594]]).to(torch.int32) + det_dummy_input_d2 = torch.rand(1, 17, 2) # Export to ONNX torch.onnx.export( diff --git a/media/RESULTS.md b/media/RESULTS.md index 5aa26bf..a5fce36 100644 --- a/media/RESULTS.md +++ b/media/RESULTS.md @@ -6,9 +6,9 @@ Results of the model in various experiments on different datasets. ```json { - "avg_time_2d": 0.02244777841083074, - "avg_time_3d": 0.0003828315411583852, - "avg_fps": 43.800844659994496 + "avg_time_2d": 0.01303539154893261, + "avg_time_3d": 0.00036579309883764233, + "avg_fps": 74.62026875112002 } { "person_nums": { @@ -27,53 +27,53 @@ Results of the model in various experiments on different datasets. }, "mpjpe": { "count": 600, - "mean": 0.067837, - "median": 0.059973, - "std": 0.027729, - "sem": 0.001133, - "min": 0.044125, - "max": 0.191545, + "mean": 0.06664, + "median": 0.05883, + "std": 0.027642, + "sem": 0.001129, + "min": 0.037832, + "max": 0.189745, "recall-0.025": 0.0, - "recall-0.05": 0.035, - "recall-0.1": 0.931667, + "recall-0.05": 0.1, + "recall-0.1": 0.941667, "recall-0.15": 0.95, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600, "ap-0.025": 0.0, - "ap-0.05": 0.003097, - "ap-0.1": 0.889734, - "ap-0.15": 0.915055, + "ap-0.05": 0.018725, + "ap-0.1": 0.902023, + "ap-0.15": 0.914628, "ap-0.25": 1.0, "ap-0.5": 1.0 }, "nose": { "count": 600, - "mean": 0.116272, - "median": 0.09953, - "std": 0.042967, - "sem": 0.001756, - "min": 0.033845, - "max": 0.263303, + "mean": 0.114935, + "median": 0.099561, + "std": 0.042845, + "sem": 0.001751, + "min": 0.029831, + "max": 0.268342, "recall-0.025": 0.0, - "recall-0.05": 0.008333, - "recall-0.1": 0.503333, - "recall-0.15": 0.815, - "recall-0.25": 0.993333, + "recall-0.05": 0.015, + "recall-0.1": 0.506667, + "recall-0.15": 0.803333, + "recall-0.25": 0.995, "recall-0.5": 1.0, "num_labels": 600 }, "shoulder_left": { "count": 600, - "mean": 0.034881, - "median": 0.027327, - "std": 0.031594, - "sem": 0.001291, - "min": 0.002162, - "max": 0.178271, - "recall-0.025": 0.438333, - "recall-0.05": 0.863333, - "recall-0.1": 0.946667, + "mean": 0.036888, + "median": 0.028719, + "std": 0.031747, + "sem": 0.001297, + "min": 0.004721, + "max": 0.182985, + "recall-0.025": 0.401667, + "recall-0.05": 0.833333, + "recall-0.1": 0.948333, "recall-0.15": 0.963333, "recall-0.25": 1.0, "recall-0.5": 1.0, @@ -81,30 +81,30 @@ Results of the model in various experiments on different datasets. }, "shoulder_right": { "count": 600, - "mean": 0.050288, - "median": 0.03555, - "std": 0.042274, - "sem": 0.001727, - "min": 0.003983, - "max": 0.238328, - "recall-0.025": 0.176667, - "recall-0.05": 0.748333, - "recall-0.1": 0.9, - "recall-0.15": 0.94, + "mean": 0.050032, + "median": 0.036552, + "std": 0.040712, + "sem": 0.001663, + "min": 0.006749, + "max": 0.239156, + "recall-0.025": 0.201667, + "recall-0.05": 0.708333, + "recall-0.1": 0.915, + "recall-0.15": 0.945, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "elbow_left": { "count": 600, - "mean": 0.044326, - "median": 0.035816, - "std": 0.034654, - "sem": 0.001416, - "min": 0.001741, - "max": 0.198882, - "recall-0.025": 0.226667, - "recall-0.05": 0.776667, + "mean": 0.045586, + "median": 0.037313, + "std": 0.034633, + "sem": 0.001415, + "min": 0.003768, + "max": 0.200457, + "recall-0.025": 0.216667, + "recall-0.05": 0.746667, "recall-0.1": 0.946667, "recall-0.15": 0.955, "recall-0.25": 1.0, @@ -113,162 +113,162 @@ Results of the model in various experiments on different datasets. }, "elbow_right": { "count": 600, - "mean": 0.044545, - "median": 0.033152, - "std": 0.037755, - "sem": 0.001543, - "min": 0.008169, - "max": 0.338555, - "recall-0.025": 0.218333, - "recall-0.05": 0.798333, - "recall-0.1": 0.928333, - "recall-0.15": 0.943333, - "recall-0.25": 0.996667, + "mean": 0.04539, + "median": 0.035591, + "std": 0.036356, + "sem": 0.001485, + "min": 0.007803, + "max": 0.281955, + "recall-0.025": 0.245, + "recall-0.05": 0.773333, + "recall-0.1": 0.923333, + "recall-0.15": 0.941667, + "recall-0.25": 0.998333, "recall-0.5": 1.0, "num_labels": 600 }, "wrist_left": { "count": 600, - "mean": 0.044896, - "median": 0.025929, - "std": 0.048601, - "sem": 0.001986, - "min": 0.002701, - "max": 0.326901, - "recall-0.025": 0.476667, - "recall-0.05": 0.735, - "recall-0.1": 0.885, - "recall-0.15": 0.913333, + "mean": 0.046389, + "median": 0.029742, + "std": 0.04752, + "sem": 0.001942, + "min": 0.00236, + "max": 0.287479, + "recall-0.025": 0.426667, + "recall-0.05": 0.728333, + "recall-0.1": 0.888333, + "recall-0.15": 0.91, "recall-0.25": 0.996667, "recall-0.5": 1.0, "num_labels": 600 }, "wrist_right": { "count": 600, - "mean": 0.045586, - "median": 0.027856, - "std": 0.048323, - "sem": 0.001974, - "min": 0.001841, - "max": 0.229728, - "recall-0.025": 0.436667, - "recall-0.05": 0.751667, - "recall-0.1": 0.881667, - "recall-0.15": 0.916667, + "mean": 0.046403, + "median": 0.028916, + "std": 0.046566, + "sem": 0.001903, + "min": 0.002735, + "max": 0.236808, + "recall-0.025": 0.428333, + "recall-0.05": 0.731667, + "recall-0.1": 0.87, + "recall-0.15": 0.926667, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "hip_left": { "count": 600, - "mean": 0.087757, - "median": 0.083491, - "std": 0.032627, - "sem": 0.001333, - "min": 0.004177, - "max": 0.235198, - "recall-0.025": 0.008333, - "recall-0.05": 0.031667, - "recall-0.1": 0.851667, - "recall-0.15": 0.953333, + "mean": 0.079732, + "median": 0.072175, + "std": 0.034532, + "sem": 0.001411, + "min": 0.013963, + "max": 0.24229, + "recall-0.025": 0.013333, + "recall-0.05": 0.081667, + "recall-0.1": 0.875, + "recall-0.15": 0.945, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "hip_right": { "count": 600, - "mean": 0.112758, - "median": 0.11174, - "std": 0.025369, - "sem": 0.001037, - "min": 0.057593, - "max": 0.231402, + "mean": 0.101424, + "median": 0.099206, + "std": 0.02636, + "sem": 0.001077, + "min": 0.032964, + "max": 0.226018, "recall-0.025": 0.0, - "recall-0.05": 0.0, - "recall-0.1": 0.251667, - "recall-0.15": 0.945, + "recall-0.05": 0.008333, + "recall-0.1": 0.52, + "recall-0.15": 0.946667, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "knee_left": { - "count": 599, - "mean": 0.063316, - "median": 0.047979, - "std": 0.058253, - "sem": 0.002382, - "min": 0.019525, - "max": 0.476803, - "recall-0.025": 0.033333, - "recall-0.05": 0.546667, + "count": 600, + "mean": 0.06299, + "median": 0.047078, + "std": 0.055676, + "sem": 0.002275, + "min": 0.013748, + "max": 0.412425, + "recall-0.025": 0.03, + "recall-0.05": 0.548333, "recall-0.1": 0.89, - "recall-0.15": 0.925, - "recall-0.25": 0.978333, - "recall-0.5": 0.998333, + "recall-0.15": 0.926667, + "recall-0.25": 0.983333, + "recall-0.5": 1.0, "num_labels": 600 }, "knee_right": { "count": 600, - "mean": 0.050955, - "median": 0.041526, - "std": 0.037031, - "sem": 0.001513, - "min": 0.005291, - "max": 0.27011, - "recall-0.025": 0.035, - "recall-0.05": 0.746667, - "recall-0.1": 0.943333, - "recall-0.15": 0.945, - "recall-0.25": 0.998333, + "mean": 0.053303, + "median": 0.039785, + "std": 0.048089, + "sem": 0.001965, + "min": 0.009094, + "max": 0.470447, + "recall-0.025": 0.06, + "recall-0.05": 0.736667, + "recall-0.1": 0.923333, + "recall-0.15": 0.926667, + "recall-0.25": 0.988333, "recall-0.5": 1.0, "num_labels": 600 }, "ankle_left": { - "count": 598, - "mean": 0.097897, - "median": 0.086817, - "std": 0.048343, - "sem": 0.001979, - "min": 0.048922, - "max": 0.493127, + "count": 600, + "mean": 0.097848, + "median": 0.087393, + "std": 0.039465, + "sem": 0.001613, + "min": 0.049149, + "max": 0.49791, "recall-0.025": 0.0, - "recall-0.05": 0.003333, - "recall-0.1": 0.83, - "recall-0.15": 0.933333, - "recall-0.25": 0.98, - "recall-0.5": 0.996667, + "recall-0.05": 0.005, + "recall-0.1": 0.805, + "recall-0.15": 0.923333, + "recall-0.25": 0.99, + "recall-0.5": 1.0, "num_labels": 600 }, "ankle_right": { - "count": 599, - "mean": 0.084814, - "median": 0.07029, - "std": 0.053839, - "sem": 0.002202, - "min": 0.025955, - "max": 0.384465, + "count": 600, + "mean": 0.085394, + "median": 0.070638, + "std": 0.050932, + "sem": 0.002081, + "min": 0.027674, + "max": 0.441898, "recall-0.025": 0.0, - "recall-0.05": 0.02, - "recall-0.1": 0.886667, - "recall-0.15": 0.908333, - "recall-0.25": 0.973333, - "recall-0.5": 0.998333, + "recall-0.05": 0.023333, + "recall-0.1": 0.876667, + "recall-0.15": 0.9, + "recall-0.25": 0.983333, + "recall-0.5": 1.0, "num_labels": 600 }, "joint_recalls": { "num_labels": 7800, - "recall-0.025": 0.15718, - "recall-0.05": 0.46321, - "recall-0.1": 0.81846, - "recall-0.15": 0.92654, - "recall-0.25": 0.99308, - "recall-0.5": 0.99923 + "recall-0.025": 0.15538, + "recall-0.05": 0.45603, + "recall-0.1": 0.83705, + "recall-0.15": 0.92372, + "recall-0.25": 0.99449, + "recall-0.5": 1.0 } } { "total_parts": 8400, - "correct_parts": 8077, - "pcp": 0.961548 + "correct_parts": 8090, + "pcp": 0.963095 } ```