Moved pose pre/post-processing into the ONNX graph.

Author: Daniel
Date: 2024-12-03 17:59:09 +01:00
Parent: 2e5c3399ed
Commit: 97ff32b9ce
4 changed files with 411 additions and 284 deletions


@@ -121,14 +121,55 @@ def add_steps_to_onnx(model_path):
     # Update nodes from postprocess model to use the input of the main network
     pp2_input_image_name = pp2_model.graph.input[0].name
-    main_input_name = model.graph.input[0].name
+    main_input_image_name = model.graph.input[0].name
     for node in model.graph.node:
         for idx, name in enumerate(node.input):
             if name == pp2_input_image_name:
-                node.input[idx] = main_input_name
+                node.input[idx] = main_input_image_name
     model.graph.input.pop(1)
-    # Set input type to int8
+    if "pose" in model_path:
+        # Add preprocess model to main network
+        pp1_model = onnx.load(base_path + "pose_preprocess.onnx")
+        model = compose.add_prefix(model, prefix="main_")
+        pp1_model = compose.add_prefix(pp1_model, prefix="preprocess_")
+        model = compose.merge_models(
+            pp1_model,
+            model,
+            io_map=[
+                (pp1_model.graph.output[0].name, model.graph.input[0].name),
+            ],
+        )
+        # Add postprocess model
+        pp2_model = onnx.load(base_path + "pose_postprocess.onnx")
+        pp2_model = compose.add_prefix(pp2_model, prefix="postprocess_")
+        model = compose.merge_models(
+            model,
+            pp2_model,
+            io_map=[
+                (model.graph.output[0].name, pp2_model.graph.input[2].name),
+            ],
+        )
+        # Update nodes from postprocess model to use the input of the main network
+        pp2_input_image_name = pp2_model.graph.input[0].name
+        pp2_input_bbox_name = pp2_model.graph.input[1].name
+        main_input_image_name = model.graph.input[0].name
+        main_input_bbox_name = model.graph.input[1].name
+        for node in model.graph.node:
+            for idx, name in enumerate(node.input):
+                if name == pp2_input_image_name:
+                    node.input[idx] = main_input_image_name
+                if name == pp2_input_bbox_name:
+                    node.input[idx] = main_input_bbox_name
+        model.graph.input.pop(2)
+        model.graph.input.pop(2)
+        # Set input box type to int32
+        model.graph.input[1].type.tensor_type.elem_type = TensorProto.INT32
+    # Set input image type to uint8
     model.graph.input[0].type.tensor_type.elem_type = TensorProto.UINT8
     path = model_path.replace(".onnx", "_extra-steps.onnx")
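For reference, a minimal standalone sketch of the composition pattern used above, assuming the same onnx.compose API; the function name and file paths here are illustrative placeholders, not part of the repository:

```python
# Sketch: chain a preprocessing graph in front of a main model with onnx.compose,
# mirroring how add_steps_to_onnx merges the pose pre/postprocess graphs above.
import onnx
from onnx import compose

def prepend_preprocess(main_path: str, pre_path: str, out_path: str) -> None:
    main = onnx.load(main_path)
    pre = onnx.load(pre_path)
    # Prefix both graphs so node/tensor names cannot collide after merging.
    main = compose.add_prefix(main, prefix="main_")
    pre = compose.add_prefix(pre, prefix="preprocess_")
    # io_map wires the preprocess output tensor to the main model's input tensor.
    merged = compose.merge_models(
        pre,
        main,
        io_map=[(pre.graph.output[0].name, main.graph.input[0].name)],
    )
    onnx.checker.check_model(merged)
    onnx.save(merged, out_path)
```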


@@ -7,6 +7,7 @@ import torch.nn.functional as F
 base_path = "/RapidPoseTriangulation/extras/mmdeploy/exports/"
 det_target_size = (320, 320)
+pose_target_size = (384, 288)

 # ==================================================================================================
@@ -19,10 +20,37 @@ class Letterbox(nn.Module):
         self.target_size = target_size
         self.fill_value = fill_value

-    def calc_params(self, img):
-        ih, iw = img.shape[1:3]
+    def calc_params_and_crop(self, img, bbox=None):
+        ih0, iw0 = img.shape[1:3]
         th, tw = self.target_size

+        if bbox is not None:
+            bbox = bbox[0].float()
+            x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
+            # Slightly increase bbox size
+            factor = 1.25
+            w = x2 - x1
+            h = y2 - y1
+            x1 -= w * (factor - 1) / 2
+            x2 += w * (factor - 1) / 2
+            y1 -= h * (factor - 1) / 2
+            y2 += h * (factor - 1) / 2
+            zero = torch.tensor(0)
+            x1 = torch.max(x1, zero).to(torch.int64)
+            y1 = torch.max(y1, zero).to(torch.int64)
+            x2 = torch.min(x2, iw0).to(torch.int64)
+            y2 = torch.min(y2, ih0).to(torch.int64)
+            bbox = torch.stack((x1, y1, x2, y2), dim=0).unsqueeze(0)
+            img = img.to(torch.float32)
+            img = img[:, y1:y2, x1:x2, :]
+            ih = y2 - y1
+            iw = x2 - x1
+        else:
+            ih, iw = ih0, iw0
+
         scale = torch.min(tw / iw, th / ih)
         nw = torch.round(iw * scale)
         nh = torch.round(ih * scale)
@@ -35,15 +63,18 @@ class Letterbox(nn.Module):
         pad_bottom = pad_h - pad_top
         paddings = (pad_left, pad_right, pad_top, pad_bottom)
-        return paddings, scale, (nw, nh)
+        return img, paddings, scale, (nw, nh), bbox

-    def forward(self, img):
-        paddings, _, (nw, nh) = self.calc_params(img)
+    def forward(self, img, bbox=None):
+        img, paddings, _, (nw, nh), _ = self.calc_params_and_crop(img, bbox)
         # Resize the image
         img = img.to(torch.float32)
         img = F.interpolate(
-            img.permute(0, 3, 1, 2), size=(nh, nw), mode="bilinear", align_corners=False
+            img.permute(0, 3, 1, 2),
+            size=(nh, nw),
+            mode="bilinear",
+            align_corners=False,
         )
         img = img.permute(0, 2, 3, 1)
         img = img.round()
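As a quick illustration of the parameters computed by calc_params_and_crop: scale by the limiting side, then pad the leftover space. The sketch below assumes the (unshown) padding split is symmetric, which the pad_bottom = pad_h - pad_top line suggests; the function name is illustrative:

```python
# Illustrative letterbox math: scale by the limiting side, pad the rest symmetrically.
def letterbox_params(ih, iw, th, tw):
    scale = min(tw / iw, th / ih)                   # limiting side decides the scale
    nw, nh = round(iw * scale), round(ih * scale)   # resized size before padding
    pad_w, pad_h = tw - nw, th - nh
    pad_left, pad_top = pad_w // 2, pad_h // 2
    pad_right, pad_bottom = pad_w - pad_left, pad_h - pad_top
    return scale, (nw, nh), (pad_left, pad_right, pad_top, pad_bottom)

# Example: a 720x1280 frame letterboxed to the (384, 288) pose input:
# scale = 0.225, resized to 288x162, padded by 111 px at top and bottom.
print(letterbox_params(720, 1280, 384, 288))
```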
@@ -71,7 +102,7 @@ class DetPreprocess(nn.Module):
     def forward(self, img):
         # img: torch.Tensor of shape [batch, H, W, C], dtype=torch.uint8
-        img = self.letterbox(img)
+        img = self.letterbox(img, None)
         return img
@@ -81,36 +112,97 @@ class DetPreprocess(nn.Module):
 class DetPostprocess(nn.Module):
     def __init__(self, target_size):
         super(DetPostprocess, self).__init__()
+        self.target_size = target_size
         self.letterbox = Letterbox(target_size)

     def forward(self, img, boxes):
-        paddings, scale, _ = self.letterbox.calc_params(img)
+        _, paddings, scale, _, _ = self.letterbox.calc_params_and_crop(img, None)
         boxes = boxes.float()
         boxes[:, :, 0] -= paddings[0]
         boxes[:, :, 2] -= paddings[0]
         boxes[:, :, 1] -= paddings[2]
         boxes[:, :, 3] -= paddings[2]
-        boxes[:, :, 0:4] /= scale
-        ih, iw = img.shape[1:3]
-        boxes = torch.max(boxes, torch.tensor(0))
+        zero = torch.tensor(0)
+        boxes = torch.max(boxes, zero)
+        th, tw = self.target_size
+        pad_w = paddings[0] + paddings[1]
+        pad_h = paddings[2] + paddings[3]
+        max_w = tw - pad_w - 1
+        max_h = th - pad_h - 1
         b0 = boxes[:, :, 0]
         b1 = boxes[:, :, 1]
         b2 = boxes[:, :, 2]
         b3 = boxes[:, :, 3]
-        b0 = torch.min(b0, iw - 1)
-        b1 = torch.min(b1, ih - 1)
-        b2 = torch.min(b2, iw - 1)
-        b3 = torch.min(b3, ih - 1)
+        b0 = torch.min(b0, max_w)
+        b1 = torch.min(b1, max_h)
+        b2 = torch.min(b2, max_w)
+        b3 = torch.min(b3, max_h)
         boxes = torch.stack((b0, b1, b2, b3, boxes[:, :, 4]), dim=2)
+        boxes[:, :, 0:4] /= scale
         return boxes

 # ==================================================================================================
+class PosePreprocess(nn.Module):
+    def __init__(self, target_size, fill_value=114):
+        super(PosePreprocess, self).__init__()
+        self.letterbox = Letterbox(target_size, fill_value)
+
+    def forward(self, img, bbox):
+        # img: torch.Tensor of shape [1, H, W, C], dtype=torch.uint8
+        # bbox: torch.Tensor of shape [1, 4], dtype=torch.float32
+        img = self.letterbox(img, bbox)
+        return img
+
+
+# ==================================================================================================
+class PosePostprocess(nn.Module):
+    def __init__(self, target_size):
+        super(PosePostprocess, self).__init__()
+        self.target_size = target_size
+        self.letterbox = Letterbox(target_size)
+
+    def forward(self, img, bbox, keypoints):
+        _, paddings, scale, _, bbox = self.letterbox.calc_params_and_crop(img, bbox)
+        kp = keypoints.float()
+        kp[:, :, 0] -= paddings[0]
+        kp[:, :, 1] -= paddings[2]
+        zero = torch.tensor(0)
+        kp = torch.max(kp, zero)
+        th, tw = self.target_size
+        pad_w = paddings[0] + paddings[1]
+        pad_h = paddings[2] + paddings[3]
+        max_w = tw - pad_w - 1
+        max_h = th - pad_h - 1
+        k0 = kp[:, :, 0]
+        k1 = kp[:, :, 1]
+        k0 = torch.min(k0, max_w)
+        k1 = torch.min(k1, max_h)
+        kp = torch.stack((k0, k1), dim=2)
+        kp[:, :, 0:2] /= scale
+        kp[:, :, 0] += bbox[0, 0]
+        kp[:, :, 1] += bbox[0, 1]
+        return kp
+
+
+# ==================================================================================================
 def main():
     img_path = "/RapidPoseTriangulation/scripts/../data/h1/54138969-img_003201.jpg"
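The PosePostprocess module above is the inverse of that letterboxing applied to the predicted keypoints: remove the padding, undo the scale, then shift by the crop origin. A minimal NumPy sketch of the same mapping, with illustrative names:

```python
import numpy as np

def keypoints_to_image_coords(kp_xy, paddings, scale, crop_origin):
    # kp_xy: [N, 2] keypoints in letterboxed-crop pixel coordinates
    pad_left, _, pad_top, _ = paddings
    kp = np.asarray(kp_xy, dtype=np.float32).copy()
    kp[:, 0] -= pad_left        # remove horizontal padding
    kp[:, 1] -= pad_top         # remove vertical padding
    kp /= scale                 # undo the letterbox scaling
    kp[:, 0] += crop_origin[0]  # shift back by the crop origin (x1, y1)
    kp[:, 1] += crop_origin[1]
    return kp
```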
@@ -154,6 +246,45 @@ def main():
         },
     )

+    # Initialize the PosePreprocess module
+    preprocess_model = PosePreprocess(target_size=pose_target_size)
+    det_dummy_input_c0 = torch.from_numpy(image).unsqueeze(0)
+    det_dummy_input_c1 = torch.tensor([[10, 10, 90, 40]])
+
+    # Export to ONNX
+    torch.onnx.export(
+        preprocess_model,
+        (det_dummy_input_c0, det_dummy_input_c1),
+        base_path + "pose_preprocess.onnx",
+        opset_version=11,
+        input_names=["input_image", "bbox"],
+        output_names=["preprocessed_image"],
+        dynamic_axes={
+            "input_image": {0: "batch_size", 1: "height", 2: "width"},
+            "preprocessed_image": {0: "batch_size"},
+        },
+    )
+
+    # Initialize the PosePostprocess module
+    postprocess_model = PosePostprocess(target_size=pose_target_size)
+    det_dummy_input_d0 = torch.from_numpy(image).unsqueeze(0)
+    det_dummy_input_d1 = torch.tensor([[10, 10, 90, 40]])
+    det_dummy_input_d2 = torch.rand(1, 17, 3)
+
+    # Export to ONNX
+    torch.onnx.export(
+        postprocess_model,
+        (det_dummy_input_d0, det_dummy_input_d1, det_dummy_input_d2),
+        base_path + "pose_postprocess.onnx",
+        opset_version=11,
+        input_names=["input_image", "bbox", "keypoints"],
+        output_names=["output_keypoints"],
+        dynamic_axes={
+            "input_image": {0: "batch_size", 1: "height", 2: "width"},
+            "output_keypoints": {0: "batch_size"},
+        },
+    )
+

 # ==================================================================================================
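A quick way to sanity-check an exported graph, assuming the export above succeeded and onnxruntime is installed; note that the bbox dtype has to match the int64 dummy tensor used during export:

```python
# Sketch: run the exported pose_preprocess.onnx once and inspect the output shape.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession(
    "/RapidPoseTriangulation/extras/mmdeploy/exports/pose_preprocess.onnx",
    providers=["CPUExecutionProvider"],
)
img = np.random.randint(0, 256, (1, 480, 640, 3), dtype=np.uint8)
bbox = np.array([[10, 10, 90, 40]], dtype=np.int64)
(out,) = sess.run(None, {"input_image": img, "bbox": bbox})
print(out.shape)  # expected (1, 384, 288, 3) for pose_target_size = (384, 288)
```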


@@ -6,9 +6,9 @@ Results of the model in various experiments on different datasets.
 ```json
 {
-    "avg_time_2d": 0.01254632634631658,
+    "avg_time_2d": 0.02244777841083074,
-    "avg_time_3d": 0.00036295955463991325,
+    "avg_time_3d": 0.0003828315411583852,
-    "avg_fps": 77.4636186441503
+    "avg_fps": 43.800844659994496
 }
 {
     "person_nums": {
@@ -27,101 +27,101 @@ Results of the model in various experiments on different datasets.
     },
     "mpjpe": {
         "count": 600,
-        "mean": 0.066275,
+        "mean": 0.067837,
-        "median": 0.058426,
+        "median": 0.059973,
-        "std": 0.02768,
+        "std": 0.027729,
-        "sem": 0.001131,
+        "sem": 0.001133,
-        "min": 0.040807,
+        "min": 0.044125,
-        "max": 0.188876,
+        "max": 0.191545,
         "recall-0.025": 0.0,
-        "recall-0.05": 0.083333,
+        "recall-0.05": 0.035,
-        "recall-0.1": 0.938333,
+        "recall-0.1": 0.931667,
         "recall-0.15": 0.95,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600,
         "ap-0.025": 0.0,
-        "ap-0.05": 0.011533,
+        "ap-0.05": 0.003097,
-        "ap-0.1": 0.899113,
+        "ap-0.1": 0.889734,
-        "ap-0.15": 0.915362,
+        "ap-0.15": 0.915055,
         "ap-0.25": 1.0,
         "ap-0.5": 1.0
     },
     "nose": {
         "count": 600,
-        "mean": 0.115024,
+        "mean": 0.116272,
-        "median": 0.099737,
+        "median": 0.09953,
-        "std": 0.041,
+        "std": 0.042967,
-        "sem": 0.001675,
+        "sem": 0.001756,
-        "min": 0.02644,
+        "min": 0.033845,
-        "max": 0.261025,
+        "max": 0.263303,
         "recall-0.025": 0.0,
         "recall-0.05": 0.008333,
-        "recall-0.1": 0.501667,
+        "recall-0.1": 0.503333,
-        "recall-0.15": 0.808333,
+        "recall-0.15": 0.815,
-        "recall-0.25": 0.998333,
+        "recall-0.25": 0.993333,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "shoulder_left": {
         "count": 600,
-        "mean": 0.034317,
+        "mean": 0.034881,
-        "median": 0.026768,
+        "median": 0.027327,
-        "std": 0.031799,
+        "std": 0.031594,
-        "sem": 0.001299,
+        "sem": 0.001291,
-        "min": 0.001234,
+        "min": 0.002162,
-        "max": 0.178357,
+        "max": 0.178271,
-        "recall-0.025": 0.456667,
+        "recall-0.025": 0.438333,
         "recall-0.05": 0.863333,
         "recall-0.1": 0.946667,
-        "recall-0.15": 0.966667,
+        "recall-0.15": 0.963333,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "shoulder_right": {
         "count": 600,
-        "mean": 0.047981,
+        "mean": 0.050288,
-        "median": 0.034263,
+        "median": 0.03555,
-        "std": 0.039767,
+        "std": 0.042274,
-        "sem": 0.001625,
+        "sem": 0.001727,
-        "min": 0.005363,
+        "min": 0.003983,
-        "max": 0.24597,
+        "max": 0.238328,
-        "recall-0.025": 0.226667,
+        "recall-0.025": 0.176667,
-        "recall-0.05": 0.743333,
+        "recall-0.05": 0.748333,
-        "recall-0.1": 0.916667,
+        "recall-0.1": 0.9,
-        "recall-0.15": 0.948333,
+        "recall-0.15": 0.94,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "elbow_left": {
         "count": 600,
-        "mean": 0.043526,
+        "mean": 0.044326,
-        "median": 0.034276,
+        "median": 0.035816,
-        "std": 0.034786,
+        "std": 0.034654,
-        "sem": 0.001421,
+        "sem": 0.001416,
-        "min": 0.003312,
+        "min": 0.001741,
-        "max": 0.198715,
+        "max": 0.198882,
-        "recall-0.025": 0.24,
+        "recall-0.025": 0.226667,
-        "recall-0.05": 0.781667,
+        "recall-0.05": 0.776667,
-        "recall-0.1": 0.943333,
+        "recall-0.1": 0.946667,
-        "recall-0.15": 0.958333,
+        "recall-0.15": 0.955,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "elbow_right": {
         "count": 600,
-        "mean": 0.04376,
+        "mean": 0.044545,
-        "median": 0.033219,
+        "median": 0.033152,
-        "std": 0.037126,
+        "std": 0.037755,
-        "sem": 0.001517,
+        "sem": 0.001543,
-        "min": 0.006159,
+        "min": 0.008169,
-        "max": 0.314756,
+        "max": 0.338555,
-        "recall-0.025": 0.245,
+        "recall-0.025": 0.218333,
-        "recall-0.05": 0.803333,
+        "recall-0.05": 0.798333,
-        "recall-0.1": 0.933333,
+        "recall-0.1": 0.928333,
         "recall-0.15": 0.943333,
         "recall-0.25": 0.996667,
         "recall-0.5": 1.0,
@@ -129,146 +129,146 @@ Results of the model in various experiments on different datasets.
     },
     "wrist_left": {
         "count": 600,
-        "mean": 0.044151,
+        "mean": 0.044896,
-        "median": 0.026578,
+        "median": 0.025929,
-        "std": 0.047109,
+        "std": 0.048601,
-        "sem": 0.001925,
+        "sem": 0.001986,
-        "min": 0.002328,
+        "min": 0.002701,
-        "max": 0.288425,
+        "max": 0.326901,
-        "recall-0.025": 0.478333,
+        "recall-0.025": 0.476667,
-        "recall-0.05": 0.736667,
+        "recall-0.05": 0.735,
-        "recall-0.1": 0.883333,
+        "recall-0.1": 0.885,
-        "recall-0.15": 0.918333,
+        "recall-0.15": 0.913333,
-        "recall-0.25": 0.998333,
+        "recall-0.25": 0.996667,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "wrist_right": {
         "count": 600,
-        "mean": 0.045218,
+        "mean": 0.045586,
-        "median": 0.026994,
+        "median": 0.027856,
-        "std": 0.050213,
+        "std": 0.048323,
-        "sem": 0.002052,
+        "sem": 0.001974,
-        "min": 0.002207,
+        "min": 0.001841,
-        "max": 0.291549,
+        "max": 0.229728,
-        "recall-0.025": 0.471667,
+        "recall-0.025": 0.436667,
-        "recall-0.05": 0.766667,
+        "recall-0.05": 0.751667,
-        "recall-0.1": 0.876667,
+        "recall-0.1": 0.881667,
-        "recall-0.15": 0.908333,
+        "recall-0.15": 0.916667,
-        "recall-0.25": 0.998333,
+        "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "hip_left": {
         "count": 600,
-        "mean": 0.085085,
+        "mean": 0.087757,
-        "median": 0.079726,
+        "median": 0.083491,
-        "std": 0.032918,
+        "std": 0.032627,
-        "sem": 0.001345,
+        "sem": 0.001333,
-        "min": 0.020039,
+        "min": 0.004177,
-        "max": 0.232252,
+        "max": 0.235198,
-        "recall-0.025": 0.006667,
+        "recall-0.025": 0.008333,
-        "recall-0.05": 0.055,
+        "recall-0.05": 0.031667,
-        "recall-0.1": 0.853333,
+        "recall-0.1": 0.851667,
-        "recall-0.15": 0.95,
+        "recall-0.15": 0.953333,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "hip_right": {
         "count": 600,
-        "mean": 0.108514,
+        "mean": 0.112758,
-        "median": 0.106487,
+        "median": 0.11174,
-        "std": 0.025557,
+        "std": 0.025369,
-        "sem": 0.001044,
+        "sem": 0.001037,
-        "min": 0.043182,
+        "min": 0.057593,
-        "max": 0.228959,
+        "max": 0.231402,
         "recall-0.025": 0.0,
-        "recall-0.05": 0.003333,
+        "recall-0.05": 0.0,
-        "recall-0.1": 0.358333,
+        "recall-0.1": 0.251667,
-        "recall-0.15": 0.948333,
+        "recall-0.15": 0.945,
         "recall-0.25": 1.0,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "knee_left": {
-        "count": 600,
+        "count": 599,
-        "mean": 0.063226,
+        "mean": 0.063316,
-        "median": 0.046357,
+        "median": 0.047979,
-        "std": 0.059854,
+        "std": 0.058253,
-        "sem": 0.002446,
+        "sem": 0.002382,
-        "min": 0.016702,
+        "min": 0.019525,
-        "max": 0.474089,
+        "max": 0.476803,
-        "recall-0.025": 0.035,
+        "recall-0.025": 0.033333,
-        "recall-0.05": 0.563333,
+        "recall-0.05": 0.546667,
-        "recall-0.1": 0.881667,
+        "recall-0.1": 0.89,
         "recall-0.15": 0.925,
-        "recall-0.25": 0.98,
+        "recall-0.25": 0.978333,
-        "recall-0.5": 1.0,
+        "recall-0.5": 0.998333,
         "num_labels": 600
     },
     "knee_right": {
         "count": 600,
-        "mean": 0.050723,
+        "mean": 0.050955,
-        "median": 0.041264,
+        "median": 0.041526,
-        "std": 0.036826,
+        "std": 0.037031,
-        "sem": 0.001505,
+        "sem": 0.001513,
-        "min": 0.013747,
+        "min": 0.005291,
-        "max": 0.274,
+        "max": 0.27011,
-        "recall-0.025": 0.043333,
+        "recall-0.025": 0.035,
-        "recall-0.05": 0.745,
+        "recall-0.05": 0.746667,
-        "recall-0.1": 0.945,
+        "recall-0.1": 0.943333,
         "recall-0.15": 0.945,
-        "recall-0.25": 0.996667,
+        "recall-0.25": 0.998333,
         "recall-0.5": 1.0,
         "num_labels": 600
     },
     "ankle_left": {
-        "count": 600,
+        "count": 598,
-        "mean": 0.097084,
+        "mean": 0.097897,
-        "median": 0.085682,
+        "median": 0.086817,
-        "std": 0.046353,
+        "std": 0.048343,
-        "sem": 0.001894,
+        "sem": 0.001979,
-        "min": 0.045955,
+        "min": 0.048922,
-        "max": 0.492226,
+        "max": 0.493127,
         "recall-0.025": 0.0,
-        "recall-0.05": 0.001667,
+        "recall-0.05": 0.003333,
-        "recall-0.1": 0.833333,
+        "recall-0.1": 0.83,
         "recall-0.15": 0.933333,
-        "recall-0.25": 0.985,
+        "recall-0.25": 0.98,
-        "recall-0.5": 1.0,
+        "recall-0.5": 0.996667,
         "num_labels": 600
     },
     "ankle_right": {
         "count": 599,
-        "mean": 0.082224,
+        "mean": 0.084814,
-        "median": 0.068812,
+        "median": 0.07029,
-        "std": 0.047465,
+        "std": 0.053839,
-        "sem": 0.001941,
+        "sem": 0.002202,
-        "min": 0.029154,
+        "min": 0.025955,
-        "max": 0.404964,
+        "max": 0.384465,
         "recall-0.025": 0.0,
-        "recall-0.05": 0.026667,
+        "recall-0.05": 0.02,
-        "recall-0.1": 0.888333,
+        "recall-0.1": 0.886667,
-        "recall-0.15": 0.91,
+        "recall-0.15": 0.908333,
-        "recall-0.25": 0.985,
+        "recall-0.25": 0.973333,
         "recall-0.5": 0.998333,
         "num_labels": 600
     },
     "joint_recalls": {
         "num_labels": 7800,
-        "recall-0.025": 0.16923,
+        "recall-0.025": 0.15718,
-        "recall-0.05": 0.46833,
+        "recall-0.05": 0.46321,
-        "recall-0.1": 0.82692,
+        "recall-0.1": 0.81846,
-        "recall-0.15": 0.92692,
+        "recall-0.15": 0.92654,
-        "recall-0.25": 0.99487,
+        "recall-0.25": 0.99308,
-        "recall-0.5": 0.99974
+        "recall-0.5": 0.99923
     }
 }
 {
     "total_parts": 8400,
-    "correct_parts": 8097,
+    "correct_parts": 8077,
-    "pcp": 0.963929
+    "pcp": 0.961548
 }
 ```


@@ -1,7 +1,6 @@
 from abc import ABC, abstractmethod
 from typing import List

-import cv2
 import numpy as np
 import onnxruntime as ort
 from tqdm import tqdm
@@ -16,12 +15,11 @@ class BaseModel(ABC):
         # ort.set_default_logger_severity(1)

         provider = ""
-        if "TensorrtExecutionProvider" in providers:
-            provider = "TensorrtExecutionProvider"
-        elif "CUDAExecutionProvider" in providers:
+        if "CUDAExecutionProvider" in providers:
             provider = "CUDAExecutionProvider"
         else:
             provider = "CPUExecutionProvider"
+        self.provider = provider

         print("Found providers:", providers)
         print("Using:", provider)
@@ -29,18 +27,22 @@ class BaseModel(ABC):
             model_path, providers=[provider], sess_options=self.opt
         )

-        self.input_name = self.session.get_inputs()[0].name
-        self.input_shape = self.session.get_inputs()[0].shape
-        if "batch_size" in self.input_shape:
-            self.input_shape = [1, 500, 500, 3]
-        input_type = self.session.get_inputs()[0].type
-        if input_type == "tensor(float16)":
-            self.input_type = np.float16
-        elif input_type == "tensor(uint8)":
-            self.input_type = np.uint8
-        else:
-            self.input_type = np.float32
+        self.input_names = [input.name for input in self.session.get_inputs()]
+        self.input_shapes = [input.shape for input in self.session.get_inputs()]
+        input_types = [input.type for input in self.session.get_inputs()]
+        self.input_types = []
+        for i in range(len(input_types)):
+            input_type = input_types[i]
+            if input_type == "tensor(float16)":
+                itype = np.float16
+            elif input_type == "tensor(uint8)":
+                itype = np.uint8
+            elif input_type == "tensor(int32)":
+                itype = np.int32
+            else:
+                itype = np.float32
+            self.input_types.append(itype)

         if warmup > 0:
             self.warmup(warmup)
@@ -56,12 +58,51 @@ class BaseModel(ABC):
     def warmup(self, epoch: int):
         print("Running warmup for '{}' ...".format(self.__class__.__name__))
         for _ in tqdm(range(epoch)):
-            tensor = np.random.random(self.input_shape).astype(self.input_type)
-            self.session.run(None, {self.input_name: tensor})
+            inputs = {}
+            for i in range(len(self.input_names)):
+                iname = self.input_names[i]
+                if "image" in iname:
+                    ishape = self.input_shapes[i]
+                    if "batch_size" in ishape:
+                        if self.provider == "TensorrtExecutionProvider":
+                            # Varying image sizes makes TensorRT warmup take too long, so use a fixed size
+                            ishape = [1, 1000, 1000, 3]
+                        else:
+                            ishape = [
+                                1,
+                                np.random.randint(300, 1000),
+                                np.random.randint(300, 1000),
+                                3,
+                            ]
+                    tensor = np.random.random(ishape)
+                    tensor = tensor * 255
+                elif "bbox" in iname:
+                    tensor = np.array(
+                        [
+                            [
+                                np.random.randint(30, 100),
+                                np.random.randint(30, 100),
+                                np.random.randint(200, 300),
+                                np.random.randint(200, 300),
+                            ]
+                        ]
+                    )
+                else:
+                    raise ValueError("Undefined input type")
+                tensor = tensor.astype(self.input_types[i])
+                inputs[iname] = tensor
+            self.session.run(None, inputs)

     def __call__(self, image: np.ndarray, *args, **kwargs):
         tensor = self.preprocess(image, *args, **kwargs)
-        result = self.session.run(None, {self.input_name: tensor})
+        inputs = {}
+        for i in range(len(self.input_names)):
+            iname = self.input_names[i]
+            inputs[iname] = tensor[i]
+        result = self.session.run(None, inputs)
         output = self.postprocess(result, *args, **kwargs)
         return output
@@ -80,8 +121,9 @@ class RTMDet(BaseModel):
         self.conf_threshold = conf_threshold

     def preprocess(self, image: np.ndarray):
-        tensor = np.asarray(image).astype(self.input_type, copy=False)
+        tensor = np.asarray(image).astype(self.input_types[0], copy=False)
         tensor = np.expand_dims(tensor, axis=0)
+        tensor = [tensor]
         return tensor

     def postprocess(self, tensor: List[np.ndarray]):
@@ -105,106 +147,19 @@ class RTMPose(BaseModel):
         super(RTMPose, self).__init__(model_path, warmup)
         self.bbox = None

-    def region_of_interest_warped(
-        self,
-        image: np.ndarray,
-        box: np.ndarray,
-        target_size: List[int],
-        padding_scale: float = 1.25,
-    ):
-        start_x, start_y, end_x, end_y = box[0:4]
-        target_w, target_h = target_size
-
-        # Calculate original bounding box width and height
-        bbox_w = end_x - start_x
-        bbox_h = end_y - start_y
-        if bbox_w <= 0 or bbox_h <= 0:
-            raise ValueError("Invalid bounding box!")
-
-        # Calculate the aspect ratios
-        bbox_aspect = bbox_w / bbox_h
-        target_aspect = target_w / target_h
-
-        # Adjust the scaled bounding box to match the target aspect ratio
-        if bbox_aspect > target_aspect:
-            adjusted_h = bbox_w / target_aspect
-            adjusted_w = bbox_w
-        else:
-            adjusted_w = bbox_h * target_aspect
-            adjusted_h = bbox_h
-
-        # Scale the bounding box by the padding_scale
-        scaled_bbox_w = adjusted_w * padding_scale
-        scaled_bbox_h = adjusted_h * padding_scale
-
-        # Calculate the center of the original box
-        center_x = (start_x + end_x) / 2.0
-        center_y = (start_y + end_y) / 2.0
-
-        # Calculate scaled bounding box coordinates
-        new_start_x = center_x - scaled_bbox_w / 2.0
-        new_start_y = center_y - scaled_bbox_h / 2.0
-        new_end_x = center_x + scaled_bbox_w / 2.0
-        new_end_y = center_y + scaled_bbox_h / 2.0
-
-        # Define the new box coordinates
-        new_box = np.array(
-            [new_start_x, new_start_y, new_end_x, new_end_y], dtype=np.float32
-        )
-        scale = target_w / scaled_bbox_w
-
-        # Define source and destination points for affine transformation
-        # See: /mmpose/structures/bbox/transforms.py
-        src_pts = np.array(
-            [
-                [center_x, center_y],
-                [new_start_x, center_y],
-                [new_start_x, center_y + (center_x - new_start_x)],
-            ],
-            dtype=np.float32,
-        )
-        dst_pts = np.array(
-            [
-                [target_w * 0.5, target_h * 0.5],
-                [0, target_h * 0.5],
-                [0, target_h * 0.5 + (target_w * 0.5 - 0)],
-            ],
-            dtype=np.float32,
-        )
-
-        # Compute the affine transformation matrix
-        M = cv2.getAffineTransform(src_pts, dst_pts)
-
-        # Apply affine transformation with border filling
-        extracted_region = cv2.warpAffine(
-            image,
-            M,
-            target_size,
-            flags=cv2.INTER_LINEAR,
-        )
-        return extracted_region, new_box, scale
-
     def preprocess(self, image: np.ndarray, bbox: np.ndarray):
-        th, tw = self.input_shape[1:3]
-        region, self.bbox, _ = self.region_of_interest_warped(image, bbox, (tw, th))
-        tensor = np.asarray(region).astype(self.input_type, copy=False)
+        tensor = np.asarray(image).astype(self.input_types[0], copy=False)
         tensor = np.expand_dims(tensor, axis=0)
+        bbox = np.asarray(bbox)[0:4]
+        bbox += np.array([-0.5, -0.5, 0.5 - 1e-8, 0.5 - 1e-8])
+        bbox = bbox.round().astype(np.int32)
+        bbox = np.expand_dims(bbox, axis=0)
+        tensor = [tensor, bbox]
         return tensor

     def postprocess(self, tensor: List[np.ndarray], **kwargs):
-        scores = np.clip(tensor[1][0], 0, 1)
-        kp = np.concatenate([tensor[0][0], np.expand_dims(scores, axis=-1)], axis=-1)
-
-        # See: /mmpose/models/pose_estimators/topdown.py - add_pred_to_datasample()
-        th, tw = self.input_shape[1:3]
-        bw, bh = [self.bbox[2] - self.bbox[0], self.bbox[3] - self.bbox[1]]
-        kp[:, :2] /= np.array([tw, th])
-        kp[:, :2] *= np.array([bw, bh])
-        kp[:, :2] += np.array([self.bbox[0] + bw / 2, self.bbox[1] + bh / 2])
-        kp[:, :2] -= 0.5 * np.array([bw, bh])
+        scores = np.clip(tensor[0][0], 0, 1)
+        kp = np.concatenate([tensor[1][0], np.expand_dims(scores, axis=-1)], axis=-1)
         return kp
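With the extra steps merged into the graph, the pose model can be driven with just the raw image and an integer bbox; a rough usage sketch follows. The file name is an assumption (add_steps_to_onnx appends "_extra-steps" to whatever model it is given), and the input order mirrors RTMPose.preprocess, which returns [image, bbox]:

```python
# Sketch: call the merged pose model directly; pre/post-processing runs inside the graph.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession(
    "/path/to/pose_end2end_extra-steps.onnx",  # assumed output name of add_steps_to_onnx
    providers=["CPUExecutionProvider"],
)
image = np.random.randint(0, 256, (1, 720, 1280, 3), dtype=np.uint8)  # [1, H, W, C], uint8
bbox = np.array([[100, 100, 300, 400]], dtype=np.int32)               # [1, 4], int32 per TensorProto.INT32
inputs = {inp.name: arr for inp, arr in zip(sess.get_inputs(), [image, bbox])}
outputs = sess.run(None, inputs)
# RTMPose.postprocess above then only clips the scores and concatenates them onto the
# keypoints, since the mapping back to image coordinates now happens inside the graph.
```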