Improved box cropping with always-fixed tensor shapes.

Daniel
2024-12-04 17:54:57 +01:00
parent 6452d20ec8
commit acf1d19b64
2 changed files with 254 additions and 216 deletions


@@ -2,6 +2,7 @@ import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.ops import roi_align
# ==================================================================================================
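The newly imported `roi_align` is what delivers the "fixed tensor shapes" from the commit title: it samples an arbitrary box region onto a grid of a given `output_size`, so the crop shape no longer depends on the box. A minimal standalone demonstration (the sizes here are illustrative, not from this repository):

```python
import torch
from torchvision.ops import roi_align

# Dummy NCHW float image, as roi_align expects.
img = torch.rand(1, 3, 64, 64)

# Each ROI is (batch_index, x1, y1, x2, y2); two boxes of different sizes.
rois = torch.tensor([[0.0, 4.0, 4.0, 20.0, 36.0],
                     [0.0, 10.0, 10.0, 50.0, 30.0]])

out = roi_align(img, rois, output_size=(24, 16), spatial_scale=1.0, sampling_ratio=0)
print(out.shape)  # torch.Size([2, 3, 24, 16]) -- same shape for both boxes
```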
@@ -20,35 +21,10 @@ class Letterbox(nn.Module):
self.target_size = target_size
self.fill_value = fill_value
def calc_params_and_crop(self, ishape, bbox=None):
ih0, iw0 = ishape[1], ishape[2]
def calc_params(self, ishape):
ih, iw = ishape[1], ishape[2]
th, tw = self.target_size
if bbox is not None:
bbox = bbox[0].float()
x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
# Slightly increase bbox size
factor = 1.25
w = x2 - x1
h = y2 - y1
x1 -= w * (factor - 1) / 2
x2 += w * (factor - 1) / 2
y1 -= h * (factor - 1) / 2
y2 += h * (factor - 1) / 2
zero = torch.tensor(0)
x1 = torch.max(x1, zero).to(torch.int64)
y1 = torch.max(y1, zero).to(torch.int64)
x2 = torch.min(x2, iw0).to(torch.int64)
y2 = torch.min(y2, ih0).to(torch.int64)
bbox = torch.stack((x1, y1, x2, y2), dim=0).unsqueeze(0)
ih = y2 - y1
iw = x2 - x1
else:
ih, iw = ih0, iw0
scale = torch.min(tw / iw, th / ih)
nw = torch.round(iw * scale)
nh = torch.round(ih * scale)
@@ -61,21 +37,16 @@ class Letterbox(nn.Module):
pad_bottom = pad_h - pad_top
paddings = (pad_left, pad_right, pad_top, pad_bottom)
return paddings, scale, (nw, nh), bbox
return paddings, scale, (nw, nh)
def forward(self, img, bbox=None):
paddings, _, (nw, nh), bbox = self.calc_params_and_crop(img.shape, bbox)
# Optional: Crop the image
if bbox is not None:
x1, y1, x2, y2 = bbox[0, 0], bbox[0, 1], bbox[0, 2], bbox[0, 3]
img = img.to(torch.float32)
img = img[:, y1:y2, x1:x2, :]
def forward(self, img):
paddings, _, (nw, nh) = self.calc_params(img.shape)
# Resize the image
img = img.to(torch.float32)
img = img.permute(0, 3, 1, 2)
img = F.interpolate(
img.permute(0, 3, 1, 2),
img,
size=(nh, nw),
mode="bilinear",
align_corners=False,
@@ -91,9 +62,82 @@ class Letterbox(nn.Module):
value=self.fill_value,
)
img = img.permute(0, 2, 3, 1)
canvas = img
return canvas
return img
# ==================================================================================================
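For reference, the simplified `Letterbox` now does exactly two things: scale the image so it fits inside the target while preserving aspect ratio, then pad the remainder with `fill_value`. A condensed eager-mode sketch of the same logic, using plain Python scalars in place of the traced tensor ops above:

```python
import torch
import torch.nn.functional as F

def letterbox(img, target_size, fill_value=114):
    """img: [B, H, W, C] tensor; returns a [B, th, tw, C] float tensor."""
    th, tw = target_size
    ih, iw = img.shape[1], img.shape[2]
    scale = min(tw / iw, th / ih)
    nh, nw = round(ih * scale), round(iw * scale)
    img = img.to(torch.float32).permute(0, 3, 1, 2)   # NHWC -> NCHW
    img = F.interpolate(img, size=(nh, nw), mode="bilinear",
                        align_corners=False)
    pad_w, pad_h = tw - nw, th - nh
    # (left, right, top, bottom), split as evenly as possible
    img = F.pad(img, (pad_w // 2, pad_w - pad_w // 2,
                      pad_h // 2, pad_h - pad_h // 2), value=fill_value)
    return img.permute(0, 2, 3, 1)                    # back to NHWC
```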
class BoxCrop(nn.Module):
def __init__(self, target_size):
"""Crop bounding box from image"""
super(BoxCrop, self).__init__()
self.target_size = target_size
self.padding_scale = 1.25
def calc_params(self, bbox):
start_x, start_y, end_x, end_y = bbox[0, 0], bbox[0, 1], bbox[0, 2], bbox[0, 3]
target_h, target_w = self.target_size
# Calculate original bounding box width, height and center
bbox_w = end_x - start_x
bbox_h = end_y - start_y
center_x = (start_x + end_x) / 2.0
center_y = (start_y + end_y) / 2.0
# Calculate the aspect ratios
bbox_aspect = bbox_w / bbox_h
target_aspect = target_w / target_h
# Adjust the scaled bounding box to match the target aspect ratio
if bbox_aspect > target_aspect:
adjusted_h = bbox_w / target_aspect
adjusted_w = bbox_w
else:
adjusted_w = bbox_h * target_aspect
adjusted_h = bbox_h
# Scale the bounding box by the padding_scale
scaled_bbox_w = adjusted_w * self.padding_scale
scaled_bbox_h = adjusted_h * self.padding_scale
# Calculate scaled bounding box coordinates
new_start_x = center_x - scaled_bbox_w / 2.0
new_start_y = center_y - scaled_bbox_h / 2.0
new_end_x = center_x + scaled_bbox_w / 2.0
new_end_y = center_y + scaled_bbox_h / 2.0
# Define the new box coordinates
new_box = torch.stack((new_start_x, new_start_y, new_end_x, new_end_y), dim=0)
new_box = new_box.unsqueeze(0)
scale = torch.stack(
((target_w / scaled_bbox_w), (target_h / scaled_bbox_h)), dim=0
)
return scale, new_box
def forward(self, img, bbox):
_, bbox = self.calc_params(bbox)
batch_indices = torch.zeros(bbox.shape[0], 1)
rois = torch.cat([batch_indices, bbox], dim=1)
# Resize and crop
img = img.to(torch.float32)
img = img.permute(0, 3, 1, 2)
img = roi_align(
img,
rois,
output_size=self.target_size,
spatial_scale=1.0,
sampling_ratio=0,
)
img = img.permute(0, 2, 3, 1)
img = img.round()
return img
# ==================================================================================================
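`BoxCrop.calc_params` grows the detector box in two steps: first widen or heighten it to the target aspect ratio so the crop is not distorted, then multiply both sides by `padding_scale` for context; `roi_align` then samples that region straight to `target_size`, so the output shape is fixed regardless of the person's size. The same geometry on plain floats, with an assumed 256x192 (h x w) target for the dummy box used in the export code below:

```python
def expand_box(bbox, target_hw, padding_scale=1.25):
    """Mirror of BoxCrop.calc_params on plain floats: (x1, y1, x2, y2) -> expanded box."""
    x1, y1, x2, y2 = bbox
    th, tw = target_hw
    w, h = x2 - x1, y2 - y1
    cx, cy = (x1 + x2) / 2.0, (y1 + y2) / 2.0
    if w / h > tw / th:              # box wider than target: grow the height
        h = w / (tw / th)
    else:                            # box taller than target: grow the width
        w = h * (tw / th)
    w, h = w * padding_scale, h * padding_scale
    return (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)

# Assumed 256x192 target, box (352, 339, 518, 594): 166x255 is widened to
# 191.25x255 at aspect 0.75, then padded 1.25x -> about (315.5, 307.1, 554.5, 625.9).
print(expand_box((352.0, 339.0, 518.0, 594.0), (256, 192)))
```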
@@ -106,7 +150,7 @@ class DetPreprocess(nn.Module):
def forward(self, img):
# img: torch.Tensor of shape [batch, H, W, C], dtype=torch.uint8
img = self.letterbox(img, None)
img = self.letterbox(img)
return img
@@ -121,7 +165,7 @@ class DetPostprocess(nn.Module):
self.letterbox = Letterbox(target_size)
def forward(self, img, boxes):
paddings, scale, _, _ = self.letterbox.calc_params_and_crop(img.shape, None)
paddings, scale, _ = self.letterbox.calc_params(img.shape)
boxes = boxes.float()
boxes[:, :, 0] -= paddings[0]
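`DetPostprocess` undoes the letterbox for the detector output: each box coordinate has the left/top padding subtracted and is then divided by the scale factor, which lands it back in original-image pixels. Sketched standalone (assuming boxes shaped [batch, N, 4] as x1, y1, x2, y2, matching the indexing above):

```python
def unletterbox_boxes(boxes, paddings, scale):
    """paddings = (left, right, top, bottom) as produced by Letterbox.calc_params."""
    boxes = boxes.float()
    boxes[:, :, 0] -= paddings[0]   # x1: remove left padding
    boxes[:, :, 1] -= paddings[2]   # y1: remove top padding
    boxes[:, :, 2] -= paddings[0]   # x2
    boxes[:, :, 3] -= paddings[2]   # y2
    return boxes / scale            # undo the resize
```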
@@ -160,12 +204,12 @@ class DetPostprocess(nn.Module):
class PosePreprocess(nn.Module):
def __init__(self, target_size, fill_value=114):
super(PosePreprocess, self).__init__()
self.letterbox = Letterbox(target_size, fill_value)
self.boxcrop = BoxCrop(target_size)
def forward(self, img, bbox):
# img: torch.Tensor of shape [1, H, W, C], dtype=torch.uint8
# bbox: torch.Tensor of shape [1, 4], dtype=torch.float32
img = self.letterbox(img, bbox)
img = self.boxcrop(img, bbox)
return img
@@ -175,25 +219,22 @@ class PosePreprocess(nn.Module):
class PosePostprocess(nn.Module):
def __init__(self, target_size):
super(PosePostprocess, self).__init__()
self.boxcrop = BoxCrop(target_size)
self.target_size = target_size
self.letterbox = Letterbox(target_size)
def forward(self, img, bbox, keypoints):
paddings, scale, _, bbox = self.letterbox.calc_params_and_crop(img.shape, bbox)
scale, bbox = self.boxcrop.calc_params(bbox)
kp = keypoints.float()
kp[:, :, 0] -= paddings[0]
kp[:, :, 1] -= paddings[2]
kp[:, :, 0:2] /= scale
kp[:, :, 0] += bbox[0, 0]
kp[:, :, 1] += bbox[0, 1]
zero = torch.tensor(0)
kp = torch.max(kp, zero)
th, tw = self.target_size
pad_w = paddings[0] + paddings[1]
pad_h = paddings[2] + paddings[3]
max_w = tw - pad_w - 1
max_h = th - pad_h - 1
max_w = img.shape[2] - 1
max_h = img.shape[1] - 1
k0 = kp[:, :, 0]
k1 = kp[:, :, 1]
k0 = torch.min(k0, max_w)
@@ -201,10 +242,6 @@ class PosePostprocess(nn.Module):
kp[:, :, 0] = k0
kp[:, :, 1] = k1
kp[:, :, 0:2] /= scale
kp[:, :, 0] += bbox[0, 0]
kp[:, :, 1] += bbox[0, 1]
return kp
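The pose postprocessing is now the inverse of `BoxCrop`: keypoints predicted in crop coordinates are divided by the per-axis crop scale (sx, sy) and shifted by the expanded box origin to land in original-image pixels, with clamping so nothing falls outside the frame. A hypothetical standalone version of that core mapping:

```python
import torch

def crop_to_image_coords(kp, scale, box):
    """kp: [1, K, 2+] with x, y first; scale: (sx, sy) from BoxCrop.calc_params;
    box: [1, 4] expanded (x1, y1, x2, y2) the crop was taken from."""
    kp = kp.clone().float()
    kp[:, :, 0] = kp[:, :, 0] / scale[0] + box[0, 0]   # x: undo resize, add box origin
    kp[:, :, 1] = kp[:, :, 1] / scale[1] + box[0, 1]   # y
    return kp
```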
@@ -215,6 +252,7 @@ def main():
img_path = "/RapidPoseTriangulation/scripts/../data/h1/54138969-img_003201.jpg"
image = cv2.imread(img_path, 3)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Initialize the DetPreprocess module
preprocess_model = DetPreprocess(target_size=det_target_size)
@@ -257,7 +295,7 @@ def main():
# Initialize the PosePreprocess module
preprocess_model = PosePreprocess(target_size=pose_target_size)
det_dummy_input_c0 = torch.from_numpy(image).unsqueeze(0)
det_dummy_input_c1 = torch.tensor([[10, 10, 90, 40]]).to(torch.int32)
det_dummy_input_c1 = torch.tensor([[352, 339, 518, 594]]).to(torch.int32)
# Export to ONNX
torch.onnx.export(
@@ -276,8 +314,8 @@ def main():
# Initialize the PosePostprocess module
postprocess_model = PosePostprocess(target_size=pose_target_size)
det_dummy_input_d0 = torch.from_numpy(image).unsqueeze(0)
det_dummy_input_d1 = torch.tensor([[10, 10, 90, 40]]).to(torch.int32)
det_dummy_input_d2 = torch.rand(1, 17, 3)
det_dummy_input_d1 = torch.tensor([[352, 339, 518, 594]]).to(torch.int32)
det_dummy_input_d2 = torch.rand(1, 17, 2)
# Export to ONNX
torch.onnx.export(
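The `torch.onnx.export` calls are truncated in this view; for reference, a typical invocation for one of these modules could look like the sketch below (the file name, tensor names, and opset are placeholders, not taken from the commit):

```python
model = PosePreprocess(target_size=pose_target_size)

torch.onnx.export(
    model,
    (det_dummy_input_c0, det_dummy_input_c1),  # image [1, H, W, C], bbox [1, 4]
    "pose_preprocess.onnx",                    # placeholder output path
    input_names=["image", "bbox"],             # placeholder names
    output_names=["crop"],
    opset_version=16,                          # assumed, not from the commit
)
```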


@@ -6,9 +6,9 @@ Results of the model in various experiments on different datasets.
```json
{
"avg_time_2d": 0.02244777841083074,
"avg_time_3d": 0.0003828315411583852,
"avg_fps": 43.800844659994496
"avg_time_2d": 0.01303539154893261,
"avg_time_3d": 0.00036579309883764233,
"avg_fps": 74.62026875112002
}
{
"person_nums": {
@@ -27,53 +27,53 @@ Results of the model in various experiments on different datasets.
},
"mpjpe": {
"count": 600,
"mean": 0.067837,
"median": 0.059973,
"std": 0.027729,
"sem": 0.001133,
"min": 0.044125,
"max": 0.191545,
"mean": 0.06664,
"median": 0.05883,
"std": 0.027642,
"sem": 0.001129,
"min": 0.037832,
"max": 0.189745,
"recall-0.025": 0.0,
"recall-0.05": 0.035,
"recall-0.1": 0.931667,
"recall-0.05": 0.1,
"recall-0.1": 0.941667,
"recall-0.15": 0.95,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
"num_labels": 600,
"ap-0.025": 0.0,
"ap-0.05": 0.003097,
"ap-0.1": 0.889734,
"ap-0.15": 0.915055,
"ap-0.05": 0.018725,
"ap-0.1": 0.902023,
"ap-0.15": 0.914628,
"ap-0.25": 1.0,
"ap-0.5": 1.0
},
"nose": {
"count": 600,
"mean": 0.116272,
"median": 0.09953,
"std": 0.042967,
"sem": 0.001756,
"min": 0.033845,
"max": 0.263303,
"mean": 0.114935,
"median": 0.099561,
"std": 0.042845,
"sem": 0.001751,
"min": 0.029831,
"max": 0.268342,
"recall-0.025": 0.0,
"recall-0.05": 0.008333,
"recall-0.1": 0.503333,
"recall-0.15": 0.815,
"recall-0.25": 0.993333,
"recall-0.05": 0.015,
"recall-0.1": 0.506667,
"recall-0.15": 0.803333,
"recall-0.25": 0.995,
"recall-0.5": 1.0,
"num_labels": 600
},
"shoulder_left": {
"count": 600,
"mean": 0.034881,
"median": 0.027327,
"std": 0.031594,
"sem": 0.001291,
"min": 0.002162,
"max": 0.178271,
"recall-0.025": 0.438333,
"recall-0.05": 0.863333,
"recall-0.1": 0.946667,
"mean": 0.036888,
"median": 0.028719,
"std": 0.031747,
"sem": 0.001297,
"min": 0.004721,
"max": 0.182985,
"recall-0.025": 0.401667,
"recall-0.05": 0.833333,
"recall-0.1": 0.948333,
"recall-0.15": 0.963333,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
@@ -81,30 +81,30 @@ Results of the model in various experiments on different datasets.
},
"shoulder_right": {
"count": 600,
"mean": 0.050288,
"median": 0.03555,
"std": 0.042274,
"sem": 0.001727,
"min": 0.003983,
"max": 0.238328,
"recall-0.025": 0.176667,
"recall-0.05": 0.748333,
"recall-0.1": 0.9,
"recall-0.15": 0.94,
"mean": 0.050032,
"median": 0.036552,
"std": 0.040712,
"sem": 0.001663,
"min": 0.006749,
"max": 0.239156,
"recall-0.025": 0.201667,
"recall-0.05": 0.708333,
"recall-0.1": 0.915,
"recall-0.15": 0.945,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
"num_labels": 600
},
"elbow_left": {
"count": 600,
"mean": 0.044326,
"median": 0.035816,
"std": 0.034654,
"sem": 0.001416,
"min": 0.001741,
"max": 0.198882,
"recall-0.025": 0.226667,
"recall-0.05": 0.776667,
"mean": 0.045586,
"median": 0.037313,
"std": 0.034633,
"sem": 0.001415,
"min": 0.003768,
"max": 0.200457,
"recall-0.025": 0.216667,
"recall-0.05": 0.746667,
"recall-0.1": 0.946667,
"recall-0.15": 0.955,
"recall-0.25": 1.0,
@@ -113,162 +113,162 @@ Results of the model in various experiments on different datasets.
},
"elbow_right": {
"count": 600,
"mean": 0.044545,
"median": 0.033152,
"std": 0.037755,
"sem": 0.001543,
"min": 0.008169,
"max": 0.338555,
"recall-0.025": 0.218333,
"recall-0.05": 0.798333,
"recall-0.1": 0.928333,
"recall-0.15": 0.943333,
"recall-0.25": 0.996667,
"mean": 0.04539,
"median": 0.035591,
"std": 0.036356,
"sem": 0.001485,
"min": 0.007803,
"max": 0.281955,
"recall-0.025": 0.245,
"recall-0.05": 0.773333,
"recall-0.1": 0.923333,
"recall-0.15": 0.941667,
"recall-0.25": 0.998333,
"recall-0.5": 1.0,
"num_labels": 600
},
"wrist_left": {
"count": 600,
"mean": 0.044896,
"median": 0.025929,
"std": 0.048601,
"sem": 0.001986,
"min": 0.002701,
"max": 0.326901,
"recall-0.025": 0.476667,
"recall-0.05": 0.735,
"recall-0.1": 0.885,
"recall-0.15": 0.913333,
"mean": 0.046389,
"median": 0.029742,
"std": 0.04752,
"sem": 0.001942,
"min": 0.00236,
"max": 0.287479,
"recall-0.025": 0.426667,
"recall-0.05": 0.728333,
"recall-0.1": 0.888333,
"recall-0.15": 0.91,
"recall-0.25": 0.996667,
"recall-0.5": 1.0,
"num_labels": 600
},
"wrist_right": {
"count": 600,
"mean": 0.045586,
"median": 0.027856,
"std": 0.048323,
"sem": 0.001974,
"min": 0.001841,
"max": 0.229728,
"recall-0.025": 0.436667,
"recall-0.05": 0.751667,
"recall-0.1": 0.881667,
"recall-0.15": 0.916667,
"mean": 0.046403,
"median": 0.028916,
"std": 0.046566,
"sem": 0.001903,
"min": 0.002735,
"max": 0.236808,
"recall-0.025": 0.428333,
"recall-0.05": 0.731667,
"recall-0.1": 0.87,
"recall-0.15": 0.926667,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
"num_labels": 600
},
"hip_left": {
"count": 600,
"mean": 0.087757,
"median": 0.083491,
"std": 0.032627,
"sem": 0.001333,
"min": 0.004177,
"max": 0.235198,
"recall-0.025": 0.008333,
"recall-0.05": 0.031667,
"recall-0.1": 0.851667,
"recall-0.15": 0.953333,
"mean": 0.079732,
"median": 0.072175,
"std": 0.034532,
"sem": 0.001411,
"min": 0.013963,
"max": 0.24229,
"recall-0.025": 0.013333,
"recall-0.05": 0.081667,
"recall-0.1": 0.875,
"recall-0.15": 0.945,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
"num_labels": 600
},
"hip_right": {
"count": 600,
"mean": 0.112758,
"median": 0.11174,
"std": 0.025369,
"sem": 0.001037,
"min": 0.057593,
"max": 0.231402,
"mean": 0.101424,
"median": 0.099206,
"std": 0.02636,
"sem": 0.001077,
"min": 0.032964,
"max": 0.226018,
"recall-0.025": 0.0,
"recall-0.05": 0.0,
"recall-0.1": 0.251667,
"recall-0.15": 0.945,
"recall-0.05": 0.008333,
"recall-0.1": 0.52,
"recall-0.15": 0.946667,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
"num_labels": 600
},
"knee_left": {
"count": 599,
"mean": 0.063316,
"median": 0.047979,
"std": 0.058253,
"sem": 0.002382,
"min": 0.019525,
"max": 0.476803,
"recall-0.025": 0.033333,
"recall-0.05": 0.546667,
"count": 600,
"mean": 0.06299,
"median": 0.047078,
"std": 0.055676,
"sem": 0.002275,
"min": 0.013748,
"max": 0.412425,
"recall-0.025": 0.03,
"recall-0.05": 0.548333,
"recall-0.1": 0.89,
"recall-0.15": 0.925,
"recall-0.25": 0.978333,
"recall-0.5": 0.998333,
"recall-0.15": 0.926667,
"recall-0.25": 0.983333,
"recall-0.5": 1.0,
"num_labels": 600
},
"knee_right": {
"count": 600,
"mean": 0.050955,
"median": 0.041526,
"std": 0.037031,
"sem": 0.001513,
"min": 0.005291,
"max": 0.27011,
"recall-0.025": 0.035,
"recall-0.05": 0.746667,
"recall-0.1": 0.943333,
"recall-0.15": 0.945,
"recall-0.25": 0.998333,
"mean": 0.053303,
"median": 0.039785,
"std": 0.048089,
"sem": 0.001965,
"min": 0.009094,
"max": 0.470447,
"recall-0.025": 0.06,
"recall-0.05": 0.736667,
"recall-0.1": 0.923333,
"recall-0.15": 0.926667,
"recall-0.25": 0.988333,
"recall-0.5": 1.0,
"num_labels": 600
},
"ankle_left": {
"count": 598,
"mean": 0.097897,
"median": 0.086817,
"std": 0.048343,
"sem": 0.001979,
"min": 0.048922,
"max": 0.493127,
"count": 600,
"mean": 0.097848,
"median": 0.087393,
"std": 0.039465,
"sem": 0.001613,
"min": 0.049149,
"max": 0.49791,
"recall-0.025": 0.0,
"recall-0.05": 0.003333,
"recall-0.1": 0.83,
"recall-0.15": 0.933333,
"recall-0.25": 0.98,
"recall-0.5": 0.996667,
"recall-0.05": 0.005,
"recall-0.1": 0.805,
"recall-0.15": 0.923333,
"recall-0.25": 0.99,
"recall-0.5": 1.0,
"num_labels": 600
},
"ankle_right": {
"count": 599,
"mean": 0.084814,
"median": 0.07029,
"std": 0.053839,
"sem": 0.002202,
"min": 0.025955,
"max": 0.384465,
"count": 600,
"mean": 0.085394,
"median": 0.070638,
"std": 0.050932,
"sem": 0.002081,
"min": 0.027674,
"max": 0.441898,
"recall-0.025": 0.0,
"recall-0.05": 0.02,
"recall-0.1": 0.886667,
"recall-0.15": 0.908333,
"recall-0.25": 0.973333,
"recall-0.5": 0.998333,
"recall-0.05": 0.023333,
"recall-0.1": 0.876667,
"recall-0.15": 0.9,
"recall-0.25": 0.983333,
"recall-0.5": 1.0,
"num_labels": 600
},
"joint_recalls": {
"num_labels": 7800,
"recall-0.025": 0.15718,
"recall-0.05": 0.46321,
"recall-0.1": 0.81846,
"recall-0.15": 0.92654,
"recall-0.25": 0.99308,
"recall-0.5": 0.99923
"recall-0.025": 0.15538,
"recall-0.05": 0.45603,
"recall-0.1": 0.83705,
"recall-0.15": 0.92372,
"recall-0.25": 0.99449,
"recall-0.5": 1.0
}
}
{
"total_parts": 8400,
"correct_parts": 8077,
"pcp": 0.961548
"correct_parts": 8090,
"pcp": 0.963095
}
```
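As a quick consistency check on the timing numbers above, `avg_fps` is the reciprocal of the summed per-frame 2D and 3D times:

```python
# before: 1 / (0.02244777841083074 + 0.0003828315411583852)  -> 43.8008...
# after:  1 / (0.01303539154893261 + 0.00036579309883764233) -> 74.6203...
print(1.0 / (0.01303539154893261 + 0.00036579309883764233))  # 74.62026875112002
```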