Moved pose pre/post-processing into the ONNX graph.
@@ -121,14 +121,55 @@ def add_steps_to_onnx(model_path):
     # Update nodes from postprocess model to use the input of the main network
     pp2_input_image_name = pp2_model.graph.input[0].name
-    main_input_name = model.graph.input[0].name
+    main_input_image_name = model.graph.input[0].name
     for node in model.graph.node:
         for idx, name in enumerate(node.input):
             if name == pp2_input_image_name:
-                node.input[idx] = main_input_name
+                node.input[idx] = main_input_image_name
     model.graph.input.pop(1)

-    # Set input type to int8
+    if "pose" in model_path:
+        # Add preprocess model to main network
+        pp1_model = onnx.load(base_path + "pose_preprocess.onnx")
+        model = compose.add_prefix(model, prefix="main_")
+        pp1_model = compose.add_prefix(pp1_model, prefix="preprocess_")
+        model = compose.merge_models(
+            pp1_model,
+            model,
+            io_map=[
+                (pp1_model.graph.output[0].name, model.graph.input[0].name),
+            ],
+        )
+
+        # Add postprocess model
+        pp2_model = onnx.load(base_path + "pose_postprocess.onnx")
+        pp2_model = compose.add_prefix(pp2_model, prefix="postprocess_")
+        model = compose.merge_models(
+            model,
+            pp2_model,
+            io_map=[
+                (model.graph.output[0].name, pp2_model.graph.input[2].name),
+            ],
+        )
+
+        # Update nodes from postprocess model to use the input of the main network
+        pp2_input_image_name = pp2_model.graph.input[0].name
+        pp2_input_bbox_name = pp2_model.graph.input[1].name
+        main_input_image_name = model.graph.input[0].name
+        main_input_bbox_name = model.graph.input[1].name
+        for node in model.graph.node:
+            for idx, name in enumerate(node.input):
+                if name == pp2_input_image_name:
+                    node.input[idx] = main_input_image_name
+                if name == pp2_input_bbox_name:
+                    node.input[idx] = main_input_bbox_name
+        model.graph.input.pop(2)
+        model.graph.input.pop(2)
+
+        # Set input box type to int32
+        model.graph.input[1].type.tensor_type.elem_type = TensorProto.INT32
+
+    # Set input image type to uint8
     model.graph.input[0].type.tensor_type.elem_type = TensorProto.UINT8

     path = model_path.replace(".onnx", "_extra-steps.onnx")
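The merge above relies on `onnx.compose.merge_models`, which wires one graph's output to another's input via `io_map` (with `add_prefix` applied first so node and tensor names cannot collide). A minimal self-contained sketch of that mechanism; the model and tensor names here are illustrative, not the ones from this commit:

```python
import onnx
from onnx import TensorProto, compose, helper

def identity_model(name, in_name, out_name):
    # One-node graph that copies its input to its output.
    node = helper.make_node("Identity", [in_name], [out_name])
    graph = helper.make_graph(
        [node],
        name,
        [helper.make_tensor_value_info(in_name, TensorProto.FLOAT, [1])],
        [helper.make_tensor_value_info(out_name, TensorProto.FLOAT, [1])],
    )
    return helper.make_model(graph)

pre = identity_model("pre", "raw", "prepped")
net = identity_model("net", "x", "y")
# Route "prepped" into "x"; the merged graph exposes "raw" in and "y" out.
merged = compose.merge_models(pre, net, io_map=[("prepped", "x")])
print([i.name for i in merged.graph.input])   # ['raw']
print([o.name for o in merged.graph.output])  # ['y']
```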
@@ -7,6 +7,7 @@ import torch.nn.functional as F

 base_path = "/RapidPoseTriangulation/extras/mmdeploy/exports/"
 det_target_size = (320, 320)
+pose_target_size = (384, 288)

 # ==================================================================================================

@@ -19,10 +20,37 @@ class Letterbox(nn.Module):
         self.target_size = target_size
         self.fill_value = fill_value

-    def calc_params(self, img):
-        ih, iw = img.shape[1:3]
+    def calc_params_and_crop(self, img, bbox=None):
+        ih0, iw0 = img.shape[1:3]
         th, tw = self.target_size

+        if bbox is not None:
+            bbox = bbox[0].float()
+            x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]
+
+            # Slightly increase bbox size
+            factor = 1.25
+            w = x2 - x1
+            h = y2 - y1
+            x1 -= w * (factor - 1) / 2
+            x2 += w * (factor - 1) / 2
+            y1 -= h * (factor - 1) / 2
+            y2 += h * (factor - 1) / 2
+
+            zero = torch.tensor(0)
+            x1 = torch.max(x1, zero).to(torch.int64)
+            y1 = torch.max(y1, zero).to(torch.int64)
+            x2 = torch.min(x2, iw0).to(torch.int64)
+            y2 = torch.min(y2, ih0).to(torch.int64)
+            bbox = torch.stack((x1, y1, x2, y2), dim=0).unsqueeze(0)
+
+            img = img.to(torch.float32)
+            img = img[:, y1:y2, x1:x2, :]
+            ih = y2 - y1
+            iw = x2 - x1
+        else:
+            ih, iw = ih0, iw0
+
         scale = torch.min(tw / iw, th / ih)
         nw = torch.round(iw * scale)
         nh = torch.round(ih * scale)
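The bbox branch above grows the crop by `factor = 1.25` before clamping it to the image. A quick numeric check of that arithmetic, with a made-up box:

```python
import torch

# Made-up box (x1, y1, x2, y2) = (100, 50, 200, 130), factor 1.25:
# each side grows by 12.5% of the box width/height.
x1, y1, x2, y2 = torch.tensor([100.0, 50.0, 200.0, 130.0])
factor = 1.25
w, h = x2 - x1, y2 - y1     # 100, 80
x1 -= w * (factor - 1) / 2  # 100 - 12.5 = 87.5
x2 += w * (factor - 1) / 2  # 200 + 12.5 = 212.5
y1 -= h * (factor - 1) / 2  # 50 - 10 = 40
y2 += h * (factor - 1) / 2  # 130 + 10 = 140
print(x1.item(), y1.item(), x2.item(), y2.item())  # 87.5 40.0 212.5 140.0
```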
@@ -35,15 +63,18 @@ class Letterbox(nn.Module):
         pad_bottom = pad_h - pad_top
         paddings = (pad_left, pad_right, pad_top, pad_bottom)

-        return paddings, scale, (nw, nh)
+        return img, paddings, scale, (nw, nh), bbox

-    def forward(self, img):
-        paddings, _, (nw, nh) = self.calc_params(img)
+    def forward(self, img, bbox=None):
+        img, paddings, _, (nw, nh), _ = self.calc_params_and_crop(img, bbox)

         # Resize the image
         img = img.to(torch.float32)
         img = F.interpolate(
-            img.permute(0, 3, 1, 2), size=(nh, nw), mode="bilinear", align_corners=False
+            img.permute(0, 3, 1, 2),
+            size=(nh, nw),
+            mode="bilinear",
+            align_corners=False,
         )
         img = img.permute(0, 2, 3, 1)
         img = img.round()
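For intuition on the resize the forward pass performs: letterboxing keeps the aspect ratio, so only one side needs padding. The same formulas for a 480x640 frame into the (384, 288) pose input, in plain Python with illustrative numbers:

```python
# Plain-Python version of the scale/padding math in calc_params_and_crop.
ih, iw = 480, 640  # input frame (height, width)
th, tw = 384, 288  # pose_target_size (height, width)
scale = min(tw / iw, th / ih)                  # min(0.45, 0.8) = 0.45
nw, nh = round(iw * scale), round(ih * scale)  # 288, 216
pad_w, pad_h = tw - nw, th - nh                # 0, 168
pad_top, pad_bottom = pad_h // 2, pad_h - pad_h // 2  # 84, 84
print(scale, (nw, nh), (pad_top, pad_bottom))
```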
@@ -71,7 +102,7 @@ class DetPreprocess(nn.Module):

     def forward(self, img):
         # img: torch.Tensor of shape [batch, H, W, C], dtype=torch.uint8
-        img = self.letterbox(img)
+        img = self.letterbox(img, None)
         return img


@@ -81,36 +112,97 @@ class DetPreprocess(nn.Module):
 class DetPostprocess(nn.Module):
     def __init__(self, target_size):
         super(DetPostprocess, self).__init__()

+        self.target_size = target_size
         self.letterbox = Letterbox(target_size)

     def forward(self, img, boxes):
-        paddings, scale, _ = self.letterbox.calc_params(img)
+        _, paddings, scale, _, _ = self.letterbox.calc_params_and_crop(img, None)

         boxes = boxes.float()
         boxes[:, :, 0] -= paddings[0]
         boxes[:, :, 2] -= paddings[0]
         boxes[:, :, 1] -= paddings[2]
         boxes[:, :, 3] -= paddings[2]
-        boxes[:, :, 0:4] /= scale

-        ih, iw = img.shape[1:3]
-        boxes = torch.max(boxes, torch.tensor(0))
+        zero = torch.tensor(0)
+        boxes = torch.max(boxes, zero)
+
+        th, tw = self.target_size
+        pad_w = paddings[0] + paddings[1]
+        pad_h = paddings[2] + paddings[3]
+        max_w = tw - pad_w - 1
+        max_h = th - pad_h - 1
         b0 = boxes[:, :, 0]
         b1 = boxes[:, :, 1]
         b2 = boxes[:, :, 2]
         b3 = boxes[:, :, 3]
-        b0 = torch.min(b0, iw - 1)
-        b1 = torch.min(b1, ih - 1)
-        b2 = torch.min(b2, iw - 1)
-        b3 = torch.min(b3, ih - 1)
+        b0 = torch.min(b0, max_w)
+        b1 = torch.min(b1, max_h)
+        b2 = torch.min(b2, max_w)
+        b3 = torch.min(b3, max_h)
         boxes = torch.stack((b0, b1, b2, b3, boxes[:, :, 4]), dim=2)

+        boxes[:, :, 0:4] /= scale
         return boxes


 # ==================================================================================================


+class PosePreprocess(nn.Module):
+    def __init__(self, target_size, fill_value=114):
+        super(PosePreprocess, self).__init__()
+        self.letterbox = Letterbox(target_size, fill_value)
+
+    def forward(self, img, bbox):
+        # img: torch.Tensor of shape [1, H, W, C], dtype=torch.uint8
+        # bbox: torch.Tensor of shape [1, 4], dtype=torch.float32
+        img = self.letterbox(img, bbox)
+        return img
+
+
+# ==================================================================================================
+
+
+class PosePostprocess(nn.Module):
+    def __init__(self, target_size):
+        super(PosePostprocess, self).__init__()
+
+        self.target_size = target_size
+        self.letterbox = Letterbox(target_size)
+
+    def forward(self, img, bbox, keypoints):
+        _, paddings, scale, _, bbox = self.letterbox.calc_params_and_crop(img, bbox)
+
+        kp = keypoints.float()
+        kp[:, :, 0] -= paddings[0]
+        kp[:, :, 1] -= paddings[2]
+
+        zero = torch.tensor(0)
+        kp = torch.max(kp, zero)
+
+        th, tw = self.target_size
+        pad_w = paddings[0] + paddings[1]
+        pad_h = paddings[2] + paddings[3]
+        max_w = tw - pad_w - 1
+        max_h = th - pad_h - 1
+        k0 = kp[:, :, 0]
+        k1 = kp[:, :, 1]
+        k0 = torch.min(k0, max_w)
+        k1 = torch.min(k1, max_h)
+        kp = torch.stack((k0, k1), dim=2)
+
+        kp[:, :, 0:2] /= scale
+
+        kp[:, :, 0] += bbox[0, 0]
+        kp[:, :, 1] += bbox[0, 1]
+        return kp
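`PosePostprocess` inverts the letterbox in three steps: strip the padding, divide by the resize scale, then translate by the crop origin. A tiny numeric check with made-up values:

```python
# Made-up numbers: a keypoint at (150, 100) in the 384x288 letterboxed crop,
# produced with paddings (left=10, top=0), scale 0.5, and a crop origin (87, 40).
x, y = 150.0, 100.0
x, y = x - 10, y - 0     # remove left/top padding -> (140, 100)
x, y = x / 0.5, y / 0.5  # undo the resize scale   -> (280, 200)
x, y = x + 87, y + 40    # back to full-image coordinates -> (367, 240)
print(x, y)
```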
+
+
+# ==================================================================================================
+
+
 def main():

     img_path = "/RapidPoseTriangulation/scripts/../data/h1/54138969-img_003201.jpg"
@@ -154,6 +246,45 @@ def main():
         },
     )

+    # Initialize the PosePreprocess module
+    preprocess_model = PosePreprocess(target_size=pose_target_size)
+    det_dummy_input_c0 = torch.from_numpy(image).unsqueeze(0)
+    det_dummy_input_c1 = torch.tensor([[10, 10, 90, 40]])
+
+    # Export to ONNX
+    torch.onnx.export(
+        preprocess_model,
+        (det_dummy_input_c0, det_dummy_input_c1),
+        base_path + "pose_preprocess.onnx",
+        opset_version=11,
+        input_names=["input_image", "bbox"],
+        output_names=["preprocessed_image"],
+        dynamic_axes={
+            "input_image": {0: "batch_size", 1: "height", 2: "width"},
+            "preprocessed_image": {0: "batch_size"},
+        },
+    )
+
+    # Initialize the PosePostprocess module
+    postprocess_model = PosePostprocess(target_size=pose_target_size)
+    det_dummy_input_d0 = torch.from_numpy(image).unsqueeze(0)
+    det_dummy_input_d1 = torch.tensor([[10, 10, 90, 40]])
+    det_dummy_input_d2 = torch.rand(1, 17, 3)
+
+    # Export to ONNX
+    torch.onnx.export(
+        postprocess_model,
+        (det_dummy_input_d0, det_dummy_input_d1, det_dummy_input_d2),
+        base_path + "pose_postprocess.onnx",
+        opset_version=11,
+        input_names=["input_image", "bbox", "keypoints"],
+        output_names=["output_keypoints"],
+        dynamic_axes={
+            "input_image": {0: "batch_size", 1: "height", 2: "width"},
+            "output_keypoints": {0: "batch_size"},
+        },
+    )


 # ==================================================================================================
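With these exports in place, `add_steps_to_onnx` fuses preprocess, network, and postprocess into one graph, so inference needs only a raw frame and a box. A hedged sketch of driving the merged model with ONNX Runtime; the path is a placeholder, and since `compose.add_prefix` renames the inputs, the names are read from the session rather than hard-coded:

```python
import numpy as np
import onnxruntime as ort

# Placeholder path: the "_extra-steps.onnx" file written by add_steps_to_onnx.
sess = ort.InferenceSession("pose_extra-steps.onnx", providers=["CPUExecutionProvider"])
names = [i.name for i in sess.get_inputs()]  # prefixed names from compose.add_prefix
frame = np.zeros((1, 480, 640, 3), dtype=np.uint8)      # raw uint8 frame, NHWC
bbox = np.array([[100, 50, 300, 400]], dtype=np.int32)  # person box, x1 y1 x2 y2
# Assumes the image input comes first and the box second; check get_inputs().
outputs = sess.run(None, {names[0]: frame, names[1]: bbox})
print([o.shape for o in outputs])
```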
media/RESULTS.md
@@ -6,9 +6,9 @@ Results of the model in various experiments on different datasets.

 ```json
 {
-  "avg_time_2d": 0.01254632634631658,
-  "avg_time_3d": 0.00036295955463991325,
-  "avg_fps": 77.4636186441503
+  "avg_time_2d": 0.02244777841083074,
+  "avg_time_3d": 0.0003828315411583852,
+  "avg_fps": 43.800844659994496
 }
 {
   "person_nums": {
@@ -27,101 +27,101 @@ Results of the model in various experiments on different datasets.
   },
   "mpjpe": {
     "count": 600,
-    "mean": 0.066275,
-    "median": 0.058426,
-    "std": 0.02768,
-    "sem": 0.001131,
-    "min": 0.040807,
-    "max": 0.188876,
+    "mean": 0.067837,
+    "median": 0.059973,
+    "std": 0.027729,
+    "sem": 0.001133,
+    "min": 0.044125,
+    "max": 0.191545,
     "recall-0.025": 0.0,
-    "recall-0.05": 0.083333,
-    "recall-0.1": 0.938333,
+    "recall-0.05": 0.035,
+    "recall-0.1": 0.931667,
     "recall-0.15": 0.95,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600,
     "ap-0.025": 0.0,
-    "ap-0.05": 0.011533,
-    "ap-0.1": 0.899113,
-    "ap-0.15": 0.915362,
+    "ap-0.05": 0.003097,
+    "ap-0.1": 0.889734,
+    "ap-0.15": 0.915055,
     "ap-0.25": 1.0,
     "ap-0.5": 1.0
   },
   "nose": {
     "count": 600,
-    "mean": 0.115024,
-    "median": 0.099737,
-    "std": 0.041,
-    "sem": 0.001675,
-    "min": 0.02644,
-    "max": 0.261025,
+    "mean": 0.116272,
+    "median": 0.09953,
+    "std": 0.042967,
+    "sem": 0.001756,
+    "min": 0.033845,
+    "max": 0.263303,
     "recall-0.025": 0.0,
     "recall-0.05": 0.008333,
-    "recall-0.1": 0.501667,
-    "recall-0.15": 0.808333,
-    "recall-0.25": 0.998333,
+    "recall-0.1": 0.503333,
+    "recall-0.15": 0.815,
+    "recall-0.25": 0.993333,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "shoulder_left": {
     "count": 600,
-    "mean": 0.034317,
-    "median": 0.026768,
-    "std": 0.031799,
-    "sem": 0.001299,
-    "min": 0.001234,
-    "max": 0.178357,
-    "recall-0.025": 0.456667,
+    "mean": 0.034881,
+    "median": 0.027327,
+    "std": 0.031594,
+    "sem": 0.001291,
+    "min": 0.002162,
+    "max": 0.178271,
+    "recall-0.025": 0.438333,
     "recall-0.05": 0.863333,
     "recall-0.1": 0.946667,
-    "recall-0.15": 0.966667,
+    "recall-0.15": 0.963333,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "shoulder_right": {
     "count": 600,
-    "mean": 0.047981,
-    "median": 0.034263,
-    "std": 0.039767,
-    "sem": 0.001625,
-    "min": 0.005363,
-    "max": 0.24597,
-    "recall-0.025": 0.226667,
-    "recall-0.05": 0.743333,
-    "recall-0.1": 0.916667,
-    "recall-0.15": 0.948333,
+    "mean": 0.050288,
+    "median": 0.03555,
+    "std": 0.042274,
+    "sem": 0.001727,
+    "min": 0.003983,
+    "max": 0.238328,
+    "recall-0.025": 0.176667,
+    "recall-0.05": 0.748333,
+    "recall-0.1": 0.9,
+    "recall-0.15": 0.94,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "elbow_left": {
     "count": 600,
-    "mean": 0.043526,
-    "median": 0.034276,
-    "std": 0.034786,
-    "sem": 0.001421,
-    "min": 0.003312,
-    "max": 0.198715,
-    "recall-0.025": 0.24,
-    "recall-0.05": 0.781667,
-    "recall-0.1": 0.943333,
-    "recall-0.15": 0.958333,
+    "mean": 0.044326,
+    "median": 0.035816,
+    "std": 0.034654,
+    "sem": 0.001416,
+    "min": 0.001741,
+    "max": 0.198882,
+    "recall-0.025": 0.226667,
+    "recall-0.05": 0.776667,
+    "recall-0.1": 0.946667,
+    "recall-0.15": 0.955,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "elbow_right": {
     "count": 600,
-    "mean": 0.04376,
-    "median": 0.033219,
-    "std": 0.037126,
-    "sem": 0.001517,
-    "min": 0.006159,
-    "max": 0.314756,
-    "recall-0.025": 0.245,
-    "recall-0.05": 0.803333,
-    "recall-0.1": 0.933333,
+    "mean": 0.044545,
+    "median": 0.033152,
+    "std": 0.037755,
+    "sem": 0.001543,
+    "min": 0.008169,
+    "max": 0.338555,
+    "recall-0.025": 0.218333,
+    "recall-0.05": 0.798333,
+    "recall-0.1": 0.928333,
     "recall-0.15": 0.943333,
     "recall-0.25": 0.996667,
     "recall-0.5": 1.0,
@@ -129,146 +129,146 @@ Results of the model in various experiments on different datasets.
   },
   "wrist_left": {
     "count": 600,
-    "mean": 0.044151,
-    "median": 0.026578,
-    "std": 0.047109,
-    "sem": 0.001925,
-    "min": 0.002328,
-    "max": 0.288425,
-    "recall-0.025": 0.478333,
-    "recall-0.05": 0.736667,
-    "recall-0.1": 0.883333,
-    "recall-0.15": 0.918333,
-    "recall-0.25": 0.998333,
+    "mean": 0.044896,
+    "median": 0.025929,
+    "std": 0.048601,
+    "sem": 0.001986,
+    "min": 0.002701,
+    "max": 0.326901,
+    "recall-0.025": 0.476667,
+    "recall-0.05": 0.735,
+    "recall-0.1": 0.885,
+    "recall-0.15": 0.913333,
+    "recall-0.25": 0.996667,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "wrist_right": {
     "count": 600,
-    "mean": 0.045218,
-    "median": 0.026994,
-    "std": 0.050213,
-    "sem": 0.002052,
-    "min": 0.002207,
-    "max": 0.291549,
-    "recall-0.025": 0.471667,
-    "recall-0.05": 0.766667,
-    "recall-0.1": 0.876667,
-    "recall-0.15": 0.908333,
-    "recall-0.25": 0.998333,
+    "mean": 0.045586,
+    "median": 0.027856,
+    "std": 0.048323,
+    "sem": 0.001974,
+    "min": 0.001841,
+    "max": 0.229728,
+    "recall-0.025": 0.436667,
+    "recall-0.05": 0.751667,
+    "recall-0.1": 0.881667,
+    "recall-0.15": 0.916667,
+    "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "hip_left": {
     "count": 600,
-    "mean": 0.085085,
-    "median": 0.079726,
-    "std": 0.032918,
-    "sem": 0.001345,
-    "min": 0.020039,
-    "max": 0.232252,
-    "recall-0.025": 0.006667,
-    "recall-0.05": 0.055,
-    "recall-0.1": 0.853333,
-    "recall-0.15": 0.95,
+    "mean": 0.087757,
+    "median": 0.083491,
+    "std": 0.032627,
+    "sem": 0.001333,
+    "min": 0.004177,
+    "max": 0.235198,
+    "recall-0.025": 0.008333,
+    "recall-0.05": 0.031667,
+    "recall-0.1": 0.851667,
+    "recall-0.15": 0.953333,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "hip_right": {
     "count": 600,
-    "mean": 0.108514,
-    "median": 0.106487,
-    "std": 0.025557,
-    "sem": 0.001044,
-    "min": 0.043182,
-    "max": 0.228959,
+    "mean": 0.112758,
+    "median": 0.11174,
+    "std": 0.025369,
+    "sem": 0.001037,
+    "min": 0.057593,
+    "max": 0.231402,
     "recall-0.025": 0.0,
-    "recall-0.05": 0.003333,
-    "recall-0.1": 0.358333,
-    "recall-0.15": 0.948333,
+    "recall-0.05": 0.0,
+    "recall-0.1": 0.251667,
+    "recall-0.15": 0.945,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "knee_left": {
-    "count": 600,
-    "mean": 0.063226,
-    "median": 0.046357,
-    "std": 0.059854,
-    "sem": 0.002446,
-    "min": 0.016702,
-    "max": 0.474089,
-    "recall-0.025": 0.035,
-    "recall-0.05": 0.563333,
-    "recall-0.1": 0.881667,
+    "count": 599,
+    "mean": 0.063316,
+    "median": 0.047979,
+    "std": 0.058253,
+    "sem": 0.002382,
+    "min": 0.019525,
+    "max": 0.476803,
+    "recall-0.025": 0.033333,
+    "recall-0.05": 0.546667,
+    "recall-0.1": 0.89,
     "recall-0.15": 0.925,
-    "recall-0.25": 0.98,
-    "recall-0.5": 1.0,
+    "recall-0.25": 0.978333,
+    "recall-0.5": 0.998333,
     "num_labels": 600
   },
   "knee_right": {
     "count": 600,
-    "mean": 0.050723,
-    "median": 0.041264,
-    "std": 0.036826,
-    "sem": 0.001505,
-    "min": 0.013747,
-    "max": 0.274,
-    "recall-0.025": 0.043333,
-    "recall-0.05": 0.745,
-    "recall-0.1": 0.945,
+    "mean": 0.050955,
+    "median": 0.041526,
+    "std": 0.037031,
+    "sem": 0.001513,
+    "min": 0.005291,
+    "max": 0.27011,
+    "recall-0.025": 0.035,
+    "recall-0.05": 0.746667,
+    "recall-0.1": 0.943333,
     "recall-0.15": 0.945,
-    "recall-0.25": 0.996667,
+    "recall-0.25": 0.998333,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "ankle_left": {
-    "count": 600,
-    "mean": 0.097084,
-    "median": 0.085682,
-    "std": 0.046353,
-    "sem": 0.001894,
-    "min": 0.045955,
-    "max": 0.492226,
+    "count": 598,
+    "mean": 0.097897,
+    "median": 0.086817,
+    "std": 0.048343,
+    "sem": 0.001979,
+    "min": 0.048922,
+    "max": 0.493127,
     "recall-0.025": 0.0,
-    "recall-0.05": 0.001667,
-    "recall-0.1": 0.833333,
+    "recall-0.05": 0.003333,
+    "recall-0.1": 0.83,
     "recall-0.15": 0.933333,
-    "recall-0.25": 0.985,
-    "recall-0.5": 1.0,
+    "recall-0.25": 0.98,
+    "recall-0.5": 0.996667,
    "num_labels": 600
   },
   "ankle_right": {
     "count": 599,
-    "mean": 0.082224,
-    "median": 0.068812,
-    "std": 0.047465,
-    "sem": 0.001941,
-    "min": 0.029154,
-    "max": 0.404964,
+    "mean": 0.084814,
+    "median": 0.07029,
+    "std": 0.053839,
+    "sem": 0.002202,
+    "min": 0.025955,
+    "max": 0.384465,
     "recall-0.025": 0.0,
-    "recall-0.05": 0.026667,
-    "recall-0.1": 0.888333,
-    "recall-0.15": 0.91,
-    "recall-0.25": 0.985,
+    "recall-0.05": 0.02,
+    "recall-0.1": 0.886667,
+    "recall-0.15": 0.908333,
+    "recall-0.25": 0.973333,
     "recall-0.5": 0.998333,
     "num_labels": 600
   },
   "joint_recalls": {
     "num_labels": 7800,
-    "recall-0.025": 0.16923,
-    "recall-0.05": 0.46833,
-    "recall-0.1": 0.82692,
-    "recall-0.15": 0.92692,
-    "recall-0.25": 0.99487,
-    "recall-0.5": 0.99974
+    "recall-0.025": 0.15718,
+    "recall-0.05": 0.46321,
+    "recall-0.1": 0.81846,
+    "recall-0.15": 0.92654,
+    "recall-0.25": 0.99308,
+    "recall-0.5": 0.99923
   }
 }
 {
   "total_parts": 8400,
-  "correct_parts": 8097,
-  "pcp": 0.963929
+  "correct_parts": 8077,
+  "pcp": 0.961548
 }
 ```
@@ -1,7 +1,6 @@
 from abc import ABC, abstractmethod
 from typing import List

-import cv2
 import numpy as np
 import onnxruntime as ort
 from tqdm import tqdm
@@ -16,12 +15,11 @@ class BaseModel(ABC):
         # ort.set_default_logger_severity(1)

         provider = ""
-        if "TensorrtExecutionProvider" in providers:
-            provider = "TensorrtExecutionProvider"
-        elif "CUDAExecutionProvider" in providers:
+        if "CUDAExecutionProvider" in providers:
             provider = "CUDAExecutionProvider"
         else:
             provider = "CPUExecutionProvider"
+        self.provider = provider
         print("Found providers:", providers)
         print("Using:", provider)
@@ -29,18 +27,22 @@ class BaseModel(ABC):
             model_path, providers=[provider], sess_options=self.opt
         )

-        self.input_name = self.session.get_inputs()[0].name
-        self.input_shape = self.session.get_inputs()[0].shape
-        if "batch_size" in self.input_shape:
-            self.input_shape = [1, 500, 500, 3]
+        self.input_names = [input.name for input in self.session.get_inputs()]
+        self.input_shapes = [input.shape for input in self.session.get_inputs()]

-        input_type = self.session.get_inputs()[0].type
-        if input_type == "tensor(float16)":
-            self.input_type = np.float16
-        elif input_type == "tensor(uint8)":
-            self.input_type = np.uint8
-        else:
-            self.input_type = np.float32
+        input_types = [input.type for input in self.session.get_inputs()]
+        self.input_types = []
+        for i in range(len(input_types)):
+            input_type = input_types[i]
+            if input_type == "tensor(float16)":
+                itype = np.float16
+            elif input_type == "tensor(uint8)":
+                itype = np.uint8
+            elif input_type == "tensor(int32)":
+                itype = np.int32
+            else:
+                itype = np.float32
+            self.input_types.append(itype)

         if warmup > 0:
             self.warmup(warmup)
@@ -56,12 +58,51 @@ class BaseModel(ABC):
     def warmup(self, epoch: int):
         print("Running warmup for '{}' ...".format(self.__class__.__name__))
         for _ in tqdm(range(epoch)):
-            tensor = np.random.random(self.input_shape).astype(self.input_type)
-            self.session.run(None, {self.input_name: tensor})
+            inputs = {}
+            for i in range(len(self.input_names)):
+                iname = self.input_names[i]
+
+                if "image" in iname:
+                    ishape = self.input_shapes[i]
+                    if "batch_size" in ishape:
+                        if self.provider == "TensorrtExecutionProvider":
+                            # Using different image sizes for TensorRT warmup takes too long
+                            ishape = [1, 1000, 1000, 3]
+                        else:
+                            ishape = [
+                                1,
+                                np.random.randint(300, 1000),
+                                np.random.randint(300, 1000),
+                                3,
+                            ]
+                    tensor = np.random.random(ishape)
+                    tensor = tensor * 255
+                elif "bbox" in iname:
+                    tensor = np.array(
+                        [
+                            [
+                                np.random.randint(30, 100),
+                                np.random.randint(30, 100),
+                                np.random.randint(200, 300),
+                                np.random.randint(200, 300),
+                            ]
+                        ]
+                    )
+                else:
+                    raise ValueError("Undefined input type")
+
+                tensor = tensor.astype(self.input_types[i])
+                inputs[iname] = tensor
+
+            self.session.run(None, inputs)

     def __call__(self, image: np.ndarray, *args, **kwargs):
         tensor = self.preprocess(image, *args, **kwargs)
-        result = self.session.run(None, {self.input_name: tensor})
+        inputs = {}
+        for i in range(len(self.input_names)):
+            iname = self.input_names[i]
+            inputs[iname] = tensor[i]
+        result = self.session.run(None, inputs)
         output = self.postprocess(result, *args, **kwargs)
         return output
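The refactor above generalizes `BaseModel` from a single input to per-input names, shapes, and dtypes, all read off the ONNX Runtime session; `preprocess` is now expected to return one array per input, ordered like `self.input_names`. A small sketch of the introspection this relies on (placeholder model path):

```python
import onnxruntime as ort

# Placeholder path; any ONNX model works for this inspection.
sess = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
for inp in sess.get_inputs():
    # e.g. name='input_image', shape=['batch_size', 'height', 'width', 3],
    #      type='tensor(uint8)'
    print(inp.name, inp.shape, inp.type)
```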
@@ -80,8 +121,9 @@ class RTMDet(BaseModel):
         self.conf_threshold = conf_threshold

     def preprocess(self, image: np.ndarray):
-        tensor = np.asarray(image).astype(self.input_type, copy=False)
+        tensor = np.asarray(image).astype(self.input_types[0], copy=False)
         tensor = np.expand_dims(tensor, axis=0)
+        tensor = [tensor]
         return tensor

     def postprocess(self, tensor: List[np.ndarray]):
@@ -105,106 +147,19 @@ class RTMPose(BaseModel):
         super(RTMPose, self).__init__(model_path, warmup)
         self.bbox = None

-    def region_of_interest_warped(
-        self,
-        image: np.ndarray,
-        box: np.ndarray,
-        target_size: List[int],
-        padding_scale: float = 1.25,
-    ):
-        start_x, start_y, end_x, end_y = box[0:4]
-        target_w, target_h = target_size
-
-        # Calculate original bounding box width and height
-        bbox_w = end_x - start_x
-        bbox_h = end_y - start_y
-
-        if bbox_w <= 0 or bbox_h <= 0:
-            raise ValueError("Invalid bounding box!")
-
-        # Calculate the aspect ratios
-        bbox_aspect = bbox_w / bbox_h
-        target_aspect = target_w / target_h
-
-        # Adjust the scaled bounding box to match the target aspect ratio
-        if bbox_aspect > target_aspect:
-            adjusted_h = bbox_w / target_aspect
-            adjusted_w = bbox_w
-        else:
-            adjusted_w = bbox_h * target_aspect
-            adjusted_h = bbox_h
-
-        # Scale the bounding box by the padding_scale
-        scaled_bbox_w = adjusted_w * padding_scale
-        scaled_bbox_h = adjusted_h * padding_scale
-
-        # Calculate the center of the original box
-        center_x = (start_x + end_x) / 2.0
-        center_y = (start_y + end_y) / 2.0
-
-        # Calculate scaled bounding box coordinates
-        new_start_x = center_x - scaled_bbox_w / 2.0
-        new_start_y = center_y - scaled_bbox_h / 2.0
-        new_end_x = center_x + scaled_bbox_w / 2.0
-        new_end_y = center_y + scaled_bbox_h / 2.0
-
-        # Define the new box coordinates
-        new_box = np.array(
-            [new_start_x, new_start_y, new_end_x, new_end_y], dtype=np.float32
-        )
-        scale = target_w / scaled_bbox_w
-
-        # Define source and destination points for affine transformation
-        # See: /mmpose/structures/bbox/transforms.py
-        src_pts = np.array(
-            [
-                [center_x, center_y],
-                [new_start_x, center_y],
-                [new_start_x, center_y + (center_x - new_start_x)],
-            ],
-            dtype=np.float32,
-        )
-        dst_pts = np.array(
-            [
-                [target_w * 0.5, target_h * 0.5],
-                [0, target_h * 0.5],
-                [0, target_h * 0.5 + (target_w * 0.5 - 0)],
-            ],
-            dtype=np.float32,
-        )
-
-        # Compute the affine transformation matrix
-        M = cv2.getAffineTransform(src_pts, dst_pts)
-
-        # Apply affine transformation with border filling
-        extracted_region = cv2.warpAffine(
-            image,
-            M,
-            target_size,
-            flags=cv2.INTER_LINEAR,
-        )
-
-        return extracted_region, new_box, scale
-
     def preprocess(self, image: np.ndarray, bbox: np.ndarray):
-        th, tw = self.input_shape[1:3]
-        region, self.bbox, _ = self.region_of_interest_warped(image, bbox, (tw, th))
-        tensor = np.asarray(region).astype(self.input_type, copy=False)
+        tensor = np.asarray(image).astype(self.input_types[0], copy=False)
         tensor = np.expand_dims(tensor, axis=0)
+        bbox = np.asarray(bbox)[0:4]
+        bbox += np.array([-0.5, -0.5, 0.5 - 1e-8, 0.5 - 1e-8])
+        bbox = bbox.round().astype(np.int32)
+        bbox = np.expand_dims(bbox, axis=0)
+        tensor = [tensor, bbox]
         return tensor

     def postprocess(self, tensor: List[np.ndarray], **kwargs):
-        scores = np.clip(tensor[1][0], 0, 1)
-        kp = np.concatenate([tensor[0][0], np.expand_dims(scores, axis=-1)], axis=-1)
-
-        # See: /mmpose/models/pose_estimators/topdown.py - add_pred_to_datasample()
-        th, tw = self.input_shape[1:3]
-        bw, bh = [self.bbox[2] - self.bbox[0], self.bbox[3] - self.bbox[1]]
-        kp[:, :2] /= np.array([tw, th])
-        kp[:, :2] *= np.array([bw, bh])
-        kp[:, :2] += np.array([self.bbox[0] + bw / 2, self.bbox[1] + bh / 2])
-        kp[:, :2] -= 0.5 * np.array([bw, bh])
-
-        scores = np.clip(tensor[0][0], 0, 1)
-        kp = np.concatenate([tensor[1][0], np.expand_dims(scores, axis=-1)], axis=-1)
+        scores = np.clip(tensor[0][0], 0, 1)
+        kp = np.concatenate([tensor[1][0], np.expand_dims(scores, axis=-1)], axis=-1)
         return kp
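With the cv2 warp and the coordinate unmapping gone from the wrapper, client code shrinks to raw inputs. An illustrative call; the path, the `warmup` keyword, and the output shape are assumptions:

```python
import numpy as np

# Hypothetical usage of the refactored RTMPose wrapper.
pose = RTMPose("rtmpose_extra-steps.onnx", warmup=0)  # placeholder path
frame = np.zeros((480, 640, 3), dtype=np.uint8)       # HWC uint8 frame
kp = pose(frame, np.array([100, 50, 300, 400]))       # assumed (17, 3): x, y, score
```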