Improved box cropping with always-fixed tensor shapes.

Daniel
2024-12-04 17:54:57 +01:00
parent 6452d20ec8
commit acf1d19b64
2 changed files with 254 additions and 216 deletions


@@ -2,6 +2,7 @@ import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.ops import roi_align
# ==================================================================================================
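The newly imported `roi_align` is what delivers the "fixed tensor shapes" from the commit title: it samples an arbitrary box region onto a grid of a given `output_size`, so the crop shape no longer depends on the box. A minimal standalone demonstration (the sizes here are illustrative, not from this repository):

```python
import torch
from torchvision.ops import roi_align

# Dummy NCHW float image, as roi_align expects.
img = torch.rand(1, 3, 64, 64)

# Each ROI is (batch_index, x1, y1, x2, y2); two boxes of different sizes.
rois = torch.tensor([[0.0, 4.0, 4.0, 20.0, 36.0],
                     [0.0, 10.0, 10.0, 50.0, 30.0]])

out = roi_align(img, rois, output_size=(24, 16), spatial_scale=1.0, sampling_ratio=0)
print(out.shape)  # torch.Size([2, 3, 24, 16]) -- same shape for both boxes
```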
@@ -20,35 +21,10 @@ class Letterbox(nn.Module):
self.target_size = target_size
self.fill_value = fill_value
def calc_params_and_crop(self, ishape, bbox=None):
ih0, iw0 = ishape[1], ishape[2]
def calc_params(self, ishape):
ih, iw = ishape[1], ishape[2]
th, tw = self.target_size
if bbox is not None:
bbox = bbox[0].float()
x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
# Slightly increase bbox size
factor = 1.25
w = x2 - x1
h = y2 - y1
x1 -= w * (factor - 1) / 2
x2 += w * (factor - 1) / 2
y1 -= h * (factor - 1) / 2
y2 += h * (factor - 1) / 2
zero = torch.tensor(0)
x1 = torch.max(x1, zero).to(torch.int64)
y1 = torch.max(y1, zero).to(torch.int64)
x2 = torch.min(x2, iw0).to(torch.int64)
y2 = torch.min(y2, ih0).to(torch.int64)
bbox = torch.stack((x1, y1, x2, y2), dim=0).unsqueeze(0)
ih = y2 - y1
iw = x2 - x1
else:
ih, iw = ih0, iw0
scale = torch.min(tw / iw, th / ih)
nw = torch.round(iw * scale)
nh = torch.round(ih * scale)
@@ -61,21 +37,16 @@ class Letterbox(nn.Module):
pad_bottom = pad_h - pad_top
paddings = (pad_left, pad_right, pad_top, pad_bottom)
return paddings, scale, (nw, nh), bbox
return paddings, scale, (nw, nh)
def forward(self, img, bbox=None):
paddings, _, (nw, nh), bbox = self.calc_params_and_crop(img.shape, bbox)
# Optional: Crop the image
if bbox is not None:
x1, y1, x2, y2 = bbox[0, 0], bbox[0, 1], bbox[0, 2], bbox[0, 3]
img = img.to(torch.float32)
img = img[:, y1:y2, x1:x2, :]
def forward(self, img):
paddings, _, (nw, nh) = self.calc_params(img.shape)
# Resize the image
img = img.to(torch.float32)
img = img.permute(0, 3, 1, 2)
img = F.interpolate(
img.permute(0, 3, 1, 2),
img,
size=(nh, nw),
mode="bilinear",
align_corners=False,
@@ -91,9 +62,82 @@ class Letterbox(nn.Module):
value=self.fill_value,
)
img = img.permute(0, 2, 3, 1)
canvas = img
return canvas
return img
# ==================================================================================================
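For reference, the simplified `Letterbox` now does exactly two things: scale the image so it fits inside the target while preserving aspect ratio, then pad the remainder with `fill_value`. A condensed eager-mode sketch of the same logic, using plain Python scalars in place of the traced tensor ops above:

```python
import torch
import torch.nn.functional as F

def letterbox(img, target_size, fill_value=114):
    """img: [B, H, W, C] tensor; returns a [B, th, tw, C] float tensor."""
    th, tw = target_size
    ih, iw = img.shape[1], img.shape[2]
    scale = min(tw / iw, th / ih)
    nh, nw = round(ih * scale), round(iw * scale)
    img = img.to(torch.float32).permute(0, 3, 1, 2)   # NHWC -> NCHW
    img = F.interpolate(img, size=(nh, nw), mode="bilinear",
                        align_corners=False)
    pad_w, pad_h = tw - nw, th - nh
    # (left, right, top, bottom), split as evenly as possible
    img = F.pad(img, (pad_w // 2, pad_w - pad_w // 2,
                      pad_h // 2, pad_h - pad_h // 2), value=fill_value)
    return img.permute(0, 2, 3, 1)                    # back to NHWC
```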
class BoxCrop(nn.Module):
def __init__(self, target_size):
"""Crop bounding box from image"""
super(BoxCrop, self).__init__()
self.target_size = target_size
self.padding_scale = 1.25
def calc_params(self, bbox):
start_x, start_y, end_x, end_y = bbox[0, 0], bbox[0, 1], bbox[0, 2], bbox[0, 3]
target_h, target_w = self.target_size
# Calculate original bounding box width, height and center
bbox_w = end_x - start_x
bbox_h = end_y - start_y
center_x = (start_x + end_x) / 2.0
center_y = (start_y + end_y) / 2.0
# Calculate the aspect ratios
bbox_aspect = bbox_w / bbox_h
target_aspect = target_w / target_h
# Adjust the scaled bounding box to match the target aspect ratio
if bbox_aspect > target_aspect:
adjusted_h = bbox_w / target_aspect
adjusted_w = bbox_w
else:
adjusted_w = bbox_h * target_aspect
adjusted_h = bbox_h
# Scale the bounding box by the padding_scale
scaled_bbox_w = adjusted_w * self.padding_scale
scaled_bbox_h = adjusted_h * self.padding_scale
# Calculate scaled bounding box coordinates
new_start_x = center_x - scaled_bbox_w / 2.0
new_start_y = center_y - scaled_bbox_h / 2.0
new_end_x = center_x + scaled_bbox_w / 2.0
new_end_y = center_y + scaled_bbox_h / 2.0
# Define the new box coordinates
new_box = torch.stack((new_start_x, new_start_y, new_end_x, new_end_y), dim=0)
new_box = new_box.unsqueeze(0)
scale = torch.stack(
((target_w / scaled_bbox_w), (target_h / scaled_bbox_h)), dim=0
)
return scale, new_box
def forward(self, img, bbox):
_, bbox = self.calc_params(bbox)
batch_indices = torch.zeros(bbox.shape[0], 1)
rois = torch.cat([batch_indices, bbox], dim=1)
# Resize and crop
img = img.to(torch.float32)
img = img.permute(0, 3, 1, 2)
img = roi_align(
img,
rois,
output_size=self.target_size,
spatial_scale=1.0,
sampling_ratio=0,
)
img = img.permute(0, 2, 3, 1)
img = img.round()
return img
# ==================================================================================================
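`BoxCrop.calc_params` grows the detector box in two steps: first widen or heighten it to the target aspect ratio so the crop is not distorted, then multiply both sides by `padding_scale` for context; `roi_align` then samples that region straight to `target_size`, so the output shape is fixed regardless of the person's size. The same geometry on plain floats, with an assumed 256x192 (h x w) target for the dummy box used in the export code below:

```python
def expand_box(bbox, target_hw, padding_scale=1.25):
    """Mirror of BoxCrop.calc_params on plain floats: (x1, y1, x2, y2) -> expanded box."""
    x1, y1, x2, y2 = bbox
    th, tw = target_hw
    w, h = x2 - x1, y2 - y1
    cx, cy = (x1 + x2) / 2.0, (y1 + y2) / 2.0
    if w / h > tw / th:              # box wider than target: grow the height
        h = w / (tw / th)
    else:                            # box taller than target: grow the width
        w = h * (tw / th)
    w, h = w * padding_scale, h * padding_scale
    return (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)

# Assumed 256x192 target, box (352, 339, 518, 594): 166x255 is widened to
# 191.25x255 at aspect 0.75, then padded 1.25x -> about (315.5, 307.1, 554.5, 625.9).
print(expand_box((352.0, 339.0, 518.0, 594.0), (256, 192)))
```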
@@ -106,7 +150,7 @@ class DetPreprocess(nn.Module):
def forward(self, img):
# img: torch.Tensor of shape [batch, H, W, C], dtype=torch.uint8
img = self.letterbox(img, None)
img = self.letterbox(img)
return img
@@ -121,7 +165,7 @@ class DetPostprocess(nn.Module):
self.letterbox = Letterbox(target_size)
def forward(self, img, boxes):
paddings, scale, _, _ = self.letterbox.calc_params_and_crop(img.shape, None)
paddings, scale, _ = self.letterbox.calc_params(img.shape)
boxes = boxes.float()
boxes[:, :, 0] -= paddings[0]
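`DetPostprocess` undoes the letterbox for the detector output: each box coordinate has the left/top padding subtracted and is then divided by the scale factor, which lands it back in original-image pixels. Sketched standalone (assuming boxes shaped [batch, N, 4] as x1, y1, x2, y2, matching the indexing above):

```python
def unletterbox_boxes(boxes, paddings, scale):
    """paddings = (left, right, top, bottom) as produced by Letterbox.calc_params."""
    boxes = boxes.float()
    boxes[:, :, 0] -= paddings[0]   # x1: remove left padding
    boxes[:, :, 1] -= paddings[2]   # y1: remove top padding
    boxes[:, :, 2] -= paddings[0]   # x2
    boxes[:, :, 3] -= paddings[2]   # y2
    return boxes / scale            # undo the resize
```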
@@ -160,12 +204,12 @@ class DetPostprocess(nn.Module):
class PosePreprocess(nn.Module):
def __init__(self, target_size, fill_value=114):
super(PosePreprocess, self).__init__()
self.letterbox = Letterbox(target_size, fill_value)
self.boxcrop = BoxCrop(target_size)
def forward(self, img, bbox):
# img: torch.Tensor of shape [1, H, W, C], dtype=torch.uint8
# bbox: torch.Tensor of shape [1, 4], dtype=torch.float32
img = self.letterbox(img, bbox)
img = self.boxcrop(img, bbox)
return img
@@ -175,25 +219,22 @@ class PosePreprocess(nn.Module):
class PosePostprocess(nn.Module):
def __init__(self, target_size):
super(PosePostprocess, self).__init__()
self.boxcrop = BoxCrop(target_size)
self.target_size = target_size
self.letterbox = Letterbox(target_size)
def forward(self, img, bbox, keypoints):
paddings, scale, _, bbox = self.letterbox.calc_params_and_crop(img.shape, bbox)
scale, bbox = self.boxcrop.calc_params(bbox)
kp = keypoints.float()
kp[:, :, 0] -= paddings[0]
kp[:, :, 1] -= paddings[2]
kp[:, :, 0:2] /= scale
kp[:, :, 0] += bbox[0, 0]
kp[:, :, 1] += bbox[0, 1]
zero = torch.tensor(0)
kp = torch.max(kp, zero)
th, tw = self.target_size
pad_w = paddings[0] + paddings[1]
pad_h = paddings[2] + paddings[3]
max_w = tw - pad_w - 1
max_h = th - pad_h - 1
max_w = img.shape[2] - 1
max_h = img.shape[1] - 1
k0 = kp[:, :, 0]
k1 = kp[:, :, 1]
k0 = torch.min(k0, max_w)
@@ -201,10 +242,6 @@ class PosePostprocess(nn.Module):
kp[:, :, 0] = k0
kp[:, :, 1] = k1
kp[:, :, 0:2] /= scale
kp[:, :, 0] += bbox[0, 0]
kp[:, :, 1] += bbox[0, 1]
return kp
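The pose postprocessing is now the inverse of `BoxCrop`: keypoints predicted in crop coordinates are divided by the per-axis crop scale (sx, sy) and shifted by the expanded box origin to land in original-image pixels, with clamping so nothing falls outside the frame. A hypothetical standalone version of that core mapping:

```python
import torch

def crop_to_image_coords(kp, scale, box):
    """kp: [1, K, 2+] with x, y first; scale: (sx, sy) from BoxCrop.calc_params;
    box: [1, 4] expanded (x1, y1, x2, y2) the crop was taken from."""
    kp = kp.clone().float()
    kp[:, :, 0] = kp[:, :, 0] / scale[0] + box[0, 0]   # x: undo resize, add box origin
    kp[:, :, 1] = kp[:, :, 1] / scale[1] + box[0, 1]   # y
    return kp
```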
@@ -215,6 +252,7 @@ def main():
img_path = "/RapidPoseTriangulation/scripts/../data/h1/54138969-img_003201.jpg"
image = cv2.imread(img_path, 3)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Initialize the DetPreprocess module
preprocess_model = DetPreprocess(target_size=det_target_size)
@@ -257,7 +295,7 @@ def main():
# Initialize the PosePreprocess module
preprocess_model = PosePreprocess(target_size=pose_target_size)
det_dummy_input_c0 = torch.from_numpy(image).unsqueeze(0)
det_dummy_input_c1 = torch.tensor([[10, 10, 90, 40]]).to(torch.int32)
det_dummy_input_c1 = torch.tensor([[352, 339, 518, 594]]).to(torch.int32)
# Export to ONNX
torch.onnx.export(
@@ -276,8 +314,8 @@ def main():
# Initialize the PosePostprocess module
postprocess_model = PosePostprocess(target_size=pose_target_size)
det_dummy_input_d0 = torch.from_numpy(image).unsqueeze(0)
det_dummy_input_d1 = torch.tensor([[10, 10, 90, 40]]).to(torch.int32)
det_dummy_input_d2 = torch.rand(1, 17, 3)
det_dummy_input_d1 = torch.tensor([[352, 339, 518, 594]]).to(torch.int32)
det_dummy_input_d2 = torch.rand(1, 17, 2)
# Export to ONNX
torch.onnx.export(
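The `torch.onnx.export` calls are truncated in this view; for reference, a typical invocation for one of these modules could look like the sketch below (the file name, tensor names, and opset are placeholders, not taken from the commit):

```python
model = PosePreprocess(target_size=pose_target_size)

torch.onnx.export(
    model,
    (det_dummy_input_c0, det_dummy_input_c1),  # image [1, H, W, C], bbox [1, 4]
    "pose_preprocess.onnx",                    # placeholder output path
    input_names=["image", "bbox"],             # placeholder names
    output_names=["crop"],
    opset_version=16,                          # assumed, not from the commit
)
```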


@@ -6,9 +6,9 @@ Results of the model in various experiments on different datasets.
```json
{
"avg_time_2d": 0.02244777841083074,
"avg_time_3d": 0.0003828315411583852,
"avg_fps": 43.800844659994496
"avg_time_2d": 0.01303539154893261,
"avg_time_3d": 0.00036579309883764233,
"avg_fps": 74.62026875112002
}
{
"person_nums": {
@@ -27,53 +27,53 @@ Results of the model in various experiments on different datasets.
},
"mpjpe": {
"count": 600,
"mean": 0.067837,
"median": 0.059973,
"std": 0.027729,
"sem": 0.001133,
"min": 0.044125,
"max": 0.191545,
"mean": 0.06664,
"median": 0.05883,
"std": 0.027642,
"sem": 0.001129,
"min": 0.037832,
"max": 0.189745,
"recall-0.025": 0.0,
"recall-0.05": 0.035,
"recall-0.1": 0.931667,
"recall-0.05": 0.1,
"recall-0.1": 0.941667,
"recall-0.15": 0.95,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
"num_labels": 600,
"ap-0.025": 0.0,
"ap-0.05": 0.003097,
"ap-0.1": 0.889734,
"ap-0.15": 0.915055,
"ap-0.05": 0.018725,
"ap-0.1": 0.902023,
"ap-0.15": 0.914628,
"ap-0.25": 1.0,
"ap-0.5": 1.0
},
"nose": {
"count": 600,
"mean": 0.116272,
"median": 0.09953,
"std": 0.042967,
"sem": 0.001756,
"min": 0.033845,
"max": 0.263303,
"mean": 0.114935,
"median": 0.099561,
"std": 0.042845,
"sem": 0.001751,
"min": 0.029831,
"max": 0.268342,
"recall-0.025": 0.0,
"recall-0.05": 0.008333,
"recall-0.1": 0.503333,
"recall-0.15": 0.815,
"recall-0.25": 0.993333,
"recall-0.05": 0.015,
"recall-0.1": 0.506667,
"recall-0.15": 0.803333,
"recall-0.25": 0.995,
"recall-0.5": 1.0,
"num_labels": 600
},
"shoulder_left": {
"count": 600,
"mean": 0.034881,
"median": 0.027327,
"std": 0.031594,
"sem": 0.001291,
"min": 0.002162,
"max": 0.178271,
"recall-0.025": 0.438333,
"recall-0.05": 0.863333,
"recall-0.1": 0.946667,
"mean": 0.036888,
"median": 0.028719,
"std": 0.031747,
"sem": 0.001297,
"min": 0.004721,
"max": 0.182985,
"recall-0.025": 0.401667,
"recall-0.05": 0.833333,
"recall-0.1": 0.948333,
"recall-0.15": 0.963333,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
@@ -81,30 +81,30 @@ Results of the model in various experiments on different datasets.
},
"shoulder_right": {
"count": 600,
"mean": 0.050288,
"median": 0.03555,
"std": 0.042274,
"sem": 0.001727,
"min": 0.003983,
"max": 0.238328,
"recall-0.025": 0.176667,
"recall-0.05": 0.748333,
"recall-0.1": 0.9,
"recall-0.15": 0.94,
"mean": 0.050032,
"median": 0.036552,
"std": 0.040712,
"sem": 0.001663,
"min": 0.006749,
"max": 0.239156,
"recall-0.025": 0.201667,
"recall-0.05": 0.708333,
"recall-0.1": 0.915,
"recall-0.15": 0.945,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
"num_labels": 600
},
"elbow_left": {
"count": 600,
"mean": 0.044326,
"median": 0.035816,
"std": 0.034654,
"sem": 0.001416,
"min": 0.001741,
"max": 0.198882,
"recall-0.025": 0.226667,
"recall-0.05": 0.776667,
"mean": 0.045586,
"median": 0.037313,
"std": 0.034633,
"sem": 0.001415,
"min": 0.003768,
"max": 0.200457,
"recall-0.025": 0.216667,
"recall-0.05": 0.746667,
"recall-0.1": 0.946667,
"recall-0.15": 0.955,
"recall-0.25": 1.0,
@@ -113,162 +113,162 @@ Results of the model in various experiments on different datasets.
},
"elbow_right": {
"count": 600,
"mean": 0.044545,
"median": 0.033152,
"std": 0.037755,
"sem": 0.001543,
"min": 0.008169,
"max": 0.338555,
"recall-0.025": 0.218333,
"recall-0.05": 0.798333,
"recall-0.1": 0.928333,
"recall-0.15": 0.943333,
"recall-0.25": 0.996667,
"mean": 0.04539,
"median": 0.035591,
"std": 0.036356,
"sem": 0.001485,
"min": 0.007803,
"max": 0.281955,
"recall-0.025": 0.245,
"recall-0.05": 0.773333,
"recall-0.1": 0.923333,
"recall-0.15": 0.941667,
"recall-0.25": 0.998333,
"recall-0.5": 1.0,
"num_labels": 600
},
"wrist_left": {
"count": 600,
"mean": 0.044896,
"median": 0.025929,
"std": 0.048601,
"sem": 0.001986,
"min": 0.002701,
"max": 0.326901,
"recall-0.025": 0.476667,
"recall-0.05": 0.735,
"recall-0.1": 0.885,
"recall-0.15": 0.913333,
"mean": 0.046389,
"median": 0.029742,
"std": 0.04752,
"sem": 0.001942,
"min": 0.00236,
"max": 0.287479,
"recall-0.025": 0.426667,
"recall-0.05": 0.728333,
"recall-0.1": 0.888333,
"recall-0.15": 0.91,
"recall-0.25": 0.996667,
"recall-0.5": 1.0,
"num_labels": 600
},
"wrist_right": {
"count": 600,
"mean": 0.045586,
"median": 0.027856,
"std": 0.048323,
"sem": 0.001974,
"min": 0.001841,
"max": 0.229728,
"recall-0.025": 0.436667,
"recall-0.05": 0.751667,
"recall-0.1": 0.881667,
"recall-0.15": 0.916667,
"mean": 0.046403,
"median": 0.028916,
"std": 0.046566,
"sem": 0.001903,
"min": 0.002735,
"max": 0.236808,
"recall-0.025": 0.428333,
"recall-0.05": 0.731667,
"recall-0.1": 0.87,
"recall-0.15": 0.926667,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
"num_labels": 600
},
"hip_left": {
"count": 600,
"mean": 0.087757,
"median": 0.083491,
"std": 0.032627,
"sem": 0.001333,
"min": 0.004177,
"max": 0.235198,
"recall-0.025": 0.008333,
"recall-0.05": 0.031667,
"recall-0.1": 0.851667,
"recall-0.15": 0.953333,
"mean": 0.079732,
"median": 0.072175,
"std": 0.034532,
"sem": 0.001411,
"min": 0.013963,
"max": 0.24229,
"recall-0.025": 0.013333,
"recall-0.05": 0.081667,
"recall-0.1": 0.875,
"recall-0.15": 0.945,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
"num_labels": 600
},
"hip_right": {
"count": 600,
"mean": 0.112758,
"median": 0.11174,
"std": 0.025369,
"sem": 0.001037,
"min": 0.057593,
"max": 0.231402,
"mean": 0.101424,
"median": 0.099206,
"std": 0.02636,
"sem": 0.001077,
"min": 0.032964,
"max": 0.226018,
"recall-0.025": 0.0,
"recall-0.05": 0.0,
"recall-0.1": 0.251667,
"recall-0.15": 0.945,
"recall-0.05": 0.008333,
"recall-0.1": 0.52,
"recall-0.15": 0.946667,
"recall-0.25": 1.0,
"recall-0.5": 1.0,
"num_labels": 600
},
"knee_left": {
"count": 599,
"mean": 0.063316,
"median": 0.047979,
"std": 0.058253,
"sem": 0.002382,
"min": 0.019525,
"max": 0.476803,
"recall-0.025": 0.033333,
"recall-0.05": 0.546667,
"count": 600,
"mean": 0.06299,
"median": 0.047078,
"std": 0.055676,
"sem": 0.002275,
"min": 0.013748,
"max": 0.412425,
"recall-0.025": 0.03,
"recall-0.05": 0.548333,
"recall-0.1": 0.89,
"recall-0.15": 0.925,
"recall-0.25": 0.978333,
"recall-0.5": 0.998333,
"recall-0.15": 0.926667,
"recall-0.25": 0.983333,
"recall-0.5": 1.0,
"num_labels": 600
},
"knee_right": {
"count": 600,
"mean": 0.050955,
"median": 0.041526,
"std": 0.037031,
"sem": 0.001513,
"min": 0.005291,
"max": 0.27011,
"recall-0.025": 0.035,
"recall-0.05": 0.746667,
"recall-0.1": 0.943333,
"recall-0.15": 0.945,
"recall-0.25": 0.998333,
"mean": 0.053303,
"median": 0.039785,
"std": 0.048089,
"sem": 0.001965,
"min": 0.009094,
"max": 0.470447,
"recall-0.025": 0.06,
"recall-0.05": 0.736667,
"recall-0.1": 0.923333,
"recall-0.15": 0.926667,
"recall-0.25": 0.988333,
"recall-0.5": 1.0,
"num_labels": 600
},
"ankle_left": {
"count": 598,
"mean": 0.097897,
"median": 0.086817,
"std": 0.048343,
"sem": 0.001979,
"min": 0.048922,
"max": 0.493127,
"count": 600,
"mean": 0.097848,
"median": 0.087393,
"std": 0.039465,
"sem": 0.001613,
"min": 0.049149,
"max": 0.49791,
"recall-0.025": 0.0,
"recall-0.05": 0.003333,
"recall-0.1": 0.83,
"recall-0.15": 0.933333,
"recall-0.25": 0.98,
"recall-0.5": 0.996667,
"recall-0.05": 0.005,
"recall-0.1": 0.805,
"recall-0.15": 0.923333,
"recall-0.25": 0.99,
"recall-0.5": 1.0,
"num_labels": 600
},
"ankle_right": {
"count": 599,
"mean": 0.084814,
"median": 0.07029,
"std": 0.053839,
"sem": 0.002202,
"min": 0.025955,
"max": 0.384465,
"count": 600,
"mean": 0.085394,
"median": 0.070638,
"std": 0.050932,
"sem": 0.002081,
"min": 0.027674,
"max": 0.441898,
"recall-0.025": 0.0,
"recall-0.05": 0.02,
"recall-0.1": 0.886667,
"recall-0.15": 0.908333,
"recall-0.25": 0.973333,
"recall-0.5": 0.998333,
"recall-0.05": 0.023333,
"recall-0.1": 0.876667,
"recall-0.15": 0.9,
"recall-0.25": 0.983333,
"recall-0.5": 1.0,
"num_labels": 600
},
"joint_recalls": {
"num_labels": 7800,
"recall-0.025": 0.15718,
"recall-0.05": 0.46321,
"recall-0.1": 0.81846,
"recall-0.15": 0.92654,
"recall-0.25": 0.99308,
"recall-0.5": 0.99923
"recall-0.025": 0.15538,
"recall-0.05": 0.45603,
"recall-0.1": 0.83705,
"recall-0.15": 0.92372,
"recall-0.25": 0.99449,
"recall-0.5": 1.0
}
}
{
"total_parts": 8400,
"correct_parts": 8077,
"pcp": 0.961548
"correct_parts": 8090,
"pcp": 0.963095
}
```
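As a quick consistency check on the timing numbers above, `avg_fps` is the reciprocal of the summed per-frame 2D and 3D times:

```python
# before: 1 / (0.02244777841083074 + 0.0003828315411583852)  -> 43.8008...
# after:  1 / (0.01303539154893261 + 0.00036579309883764233) -> 74.6203...
print(1.0 / (0.01303539154893261 + 0.00036579309883764233))  # 74.62026875112002
```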