Moved image cropping out of the graph again.

Daniel
2024-12-06 12:32:51 +01:00
parent acf1d19b64
commit 7a253cd615
3 changed files with 399 additions and 249 deletions


@@ -97,77 +97,13 @@ def add_steps_to_onnx(model_path):
     for i, j in enumerate([0, 3, 1, 2]):
         input_shape[j].dim_value = dims[i]
-    if "det" in model_path:
-        # Add preprocess model to main network
-        pp1_model = onnx.load(base_path + "det_preprocess.onnx")
-        model = compose.add_prefix(model, prefix="main_")
-        pp1_model = compose.add_prefix(pp1_model, prefix="preprocess_")
-        model = compose.merge_models(
-            pp1_model,
-            model,
-            io_map=[(pp1_model.graph.output[0].name, model.graph.input[0].name)],
-        )
-        # Add postprocess model
-        pp2_model = onnx.load(base_path + "det_postprocess.onnx")
-        pp2_model = compose.add_prefix(pp2_model, prefix="postprocess_")
-        model = compose.merge_models(
-            model,
-            pp2_model,
-            io_map=[
-                (model.graph.output[0].name, pp2_model.graph.input[1].name),
-            ],
-        )
-        # Update nodes from postprocess model to use the input of the main network
-        pp2_input_image_name = pp2_model.graph.input[0].name
-        main_input_image_name = model.graph.input[0].name
-        for node in model.graph.node:
-            for idx, name in enumerate(node.input):
-                if name == pp2_input_image_name:
-                    node.input[idx] = main_input_image_name
-        model.graph.input.pop(1)
-    if "pose" in model_path:
-        # Add preprocess model to main network
-        pp1_model = onnx.load(base_path + "pose_preprocess.onnx")
-        model = compose.add_prefix(model, prefix="main_")
-        pp1_model = compose.add_prefix(pp1_model, prefix="preprocess_")
-        model = compose.merge_models(
-            pp1_model,
-            model,
-            io_map=[
-                (pp1_model.graph.output[0].name, model.graph.input[0].name),
-            ],
-        )
-        # Add postprocess model
-        pp2_model = onnx.load(base_path + "pose_postprocess.onnx")
-        pp2_model = compose.add_prefix(pp2_model, prefix="postprocess_")
-        model = compose.merge_models(
-            model,
-            pp2_model,
-            io_map=[
-                (model.graph.output[0].name, pp2_model.graph.input[2].name),
-            ],
-        )
-        # Update nodes from postprocess model to use the input of the main network
-        pp2_input_image_name = pp2_model.graph.input[0].name
-        pp2_input_bbox_name = pp2_model.graph.input[1].name
-        main_input_image_name = model.graph.input[0].name
-        main_input_bbox_name = model.graph.input[1].name
-        for node in model.graph.node:
-            for idx, name in enumerate(node.input):
-                if name == pp2_input_image_name:
-                    node.input[idx] = main_input_image_name
-                if name == pp2_input_bbox_name:
-                    node.input[idx] = main_input_bbox_name
-        model.graph.input.pop(2)
-        model.graph.input.pop(2)
-        # Set input box type to int32
-        model.graph.input[1].type.tensor_type.elem_type = TensorProto.INT32
+    # Rename the input tensor
+    main_input_image_name = model.graph.input[0].name
+    for node in model.graph.node:
+        for idx, name in enumerate(node.input):
+            if name == main_input_image_name:
+                node.input[idx] = "image_input"
+    model.graph.input[0].name = "image_input"
     # Set input image type to int8
     model.graph.input[0].type.tensor_type.elem_type = TensorProto.UINT8
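
For reference, the deleted branches relied on `onnx.compose` to splice standalone pre/post-processing graphs into the main network. A minimal sketch of that pattern with toy one-node graphs (names and shapes here are illustrative, not this repo's actual det/pose models):

```python
import onnx
from onnx import TensorProto, compose, helper

def tiny_model(graph_name, in_name, out_name):
    # One-node graph: out = Relu(in); the shape is an arbitrary stand-in.
    vi = lambda n: helper.make_tensor_value_info(n, TensorProto.FLOAT, [1, 3, 320, 320])
    node = helper.make_node("Relu", [in_name], [out_name])
    return helper.make_model(helper.make_graph([node], graph_name, [vi(in_name)], [vi(out_name)]))

# Prefixing avoids tensor-name collisions between the graphs before merging.
pre = compose.add_prefix(tiny_model("pre", "raw", "out"), prefix="preprocess_")
main = compose.add_prefix(tiny_model("net", "x", "y"), prefix="main_")

# io_map wires the preprocess output to the main input, yielding one graph,
# which is the shape of the merge_models calls removed above.
merged = compose.merge_models(pre, main, io_map=[("preprocess_out", "main_x")])
onnx.checker.check_model(merged)
print([i.name for i in merged.graph.input])  # ['preprocess_raw']
```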


@@ -6,269 +6,269 @@ Results of the model in various experiments on different datasets.
 ```json
 {
-  "avg_time_2d": 0.01303539154893261,
+  "avg_time_2d": 0.01109659348504018,
-  "avg_time_3d": 0.00036579309883764233,
+  "avg_time_3d": 0.00034234281313621394,
-  "avg_fps": 74.62026875112002
+  "avg_fps": 87.4207158719313
 }
 {
   "person_nums": {
     "total_frames": 600,
     "total_labels": 600,
-    "total_preds": 600,
+    "total_preds": 601,
     "considered_empty": 0,
     "valid_preds": 600,
-    "invalid_preds": 0,
+    "invalid_preds": 1,
     "missing": 0,
-    "invalid_fraction": 0.0,
+    "invalid_fraction": 0.00166,
-    "precision": 1.0,
+    "precision": 0.99834,
     "recall": 1.0,
-    "f1": 1.0,
+    "f1": 0.99917,
-    "non_empty": 600
+    "non_empty": 601
   },
   "mpjpe": {
     "count": 600,
-    "mean": 0.06664,
+    "mean": 0.06621,
-    "median": 0.05883,
+    "median": 0.058297,
-    "std": 0.027642,
+    "std": 0.027913,
-    "sem": 0.001129,
+    "sem": 0.00114,
-    "min": 0.037832,
+    "min": 0.04047,
-    "max": 0.189745,
+    "max": 0.189061,
     "recall-0.025": 0.0,
-    "recall-0.05": 0.1,
+    "recall-0.05": 0.098333,
     "recall-0.1": 0.941667,
     "recall-0.15": 0.95,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600,
     "ap-0.025": 0.0,
-    "ap-0.05": 0.018725,
+    "ap-0.05": 0.018429,
-    "ap-0.1": 0.902023,
+    "ap-0.1": 0.901756,
-    "ap-0.15": 0.914628,
+    "ap-0.15": 0.913878,
     "ap-0.25": 1.0,
     "ap-0.5": 1.0
   },
   "nose": {
     "count": 600,
-    "mean": 0.114935,
+    "mean": 0.113174,
-    "median": 0.099561,
+    "median": 0.098547,
-    "std": 0.042845,
+    "std": 0.041425,
-    "sem": 0.001751,
+    "sem": 0.001693,
-    "min": 0.029831,
+    "min": 0.029421,
-    "max": 0.268342,
+    "max": 0.27266,
     "recall-0.025": 0.0,
-    "recall-0.05": 0.015,
+    "recall-0.05": 0.01,
-    "recall-0.1": 0.506667,
+    "recall-0.1": 0.515,
-    "recall-0.15": 0.803333,
+    "recall-0.15": 0.81,
-    "recall-0.25": 0.995,
+    "recall-0.25": 0.991667,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "shoulder_left": {
     "count": 600,
-    "mean": 0.036888,
+    "mean": 0.034727,
-    "median": 0.028719,
+    "median": 0.026049,
-    "std": 0.031747,
+    "std": 0.031822,
-    "sem": 0.001297,
+    "sem": 0.0013,
-    "min": 0.004721,
+    "min": 0.002176,
-    "max": 0.182985,
+    "max": 0.183422,
-    "recall-0.025": 0.401667,
+    "recall-0.025": 0.471667,
-    "recall-0.05": 0.833333,
+    "recall-0.05": 0.855,
-    "recall-0.1": 0.948333,
+    "recall-0.1": 0.95,
-    "recall-0.15": 0.963333,
+    "recall-0.15": 0.965,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "shoulder_right": {
     "count": 600,
-    "mean": 0.050032,
+    "mean": 0.04794,
-    "median": 0.036552,
+    "median": 0.034508,
-    "std": 0.040712,
+    "std": 0.039316,
-    "sem": 0.001663,
+    "sem": 0.001606,
-    "min": 0.006749,
+    "min": 0.004604,
-    "max": 0.239156,
+    "max": 0.218143,
-    "recall-0.025": 0.201667,
+    "recall-0.025": 0.211667,
-    "recall-0.05": 0.708333,
+    "recall-0.05": 0.76,
-    "recall-0.1": 0.915,
+    "recall-0.1": 0.918333,
-    "recall-0.15": 0.945,
+    "recall-0.15": 0.946667,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "elbow_left": {
     "count": 600,
-    "mean": 0.045586,
+    "mean": 0.044638,
-    "median": 0.037313,
+    "median": 0.036326,
-    "std": 0.034633,
+    "std": 0.034761,
-    "sem": 0.001415,
+    "sem": 0.00142,
-    "min": 0.003768,
+    "min": 0.003696,
-    "max": 0.200457,
+    "max": 0.196813,
-    "recall-0.025": 0.216667,
+    "recall-0.025": 0.226667,
-    "recall-0.05": 0.746667,
+    "recall-0.05": 0.778333,
-    "recall-0.1": 0.946667,
+    "recall-0.1": 0.941667,
-    "recall-0.15": 0.955,
+    "recall-0.15": 0.953333,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "elbow_right": {
     "count": 600,
-    "mean": 0.04539,
+    "mean": 0.044037,
-    "median": 0.035591,
+    "median": 0.033739,
-    "std": 0.036356,
+    "std": 0.036263,
-    "sem": 0.001485,
+    "sem": 0.001482,
-    "min": 0.007803,
+    "min": 0.007995,
-    "max": 0.281955,
+    "max": 0.351118,
-    "recall-0.025": 0.245,
+    "recall-0.025": 0.251667,
-    "recall-0.05": 0.773333,
+    "recall-0.05": 0.788333,
-    "recall-0.1": 0.923333,
+    "recall-0.1": 0.931667,
-    "recall-0.15": 0.941667,
+    "recall-0.15": 0.945,
     "recall-0.25": 0.998333,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "wrist_left": {
     "count": 600,
-    "mean": 0.046389,
+    "mean": 0.043333,
-    "median": 0.029742,
+    "median": 0.027284,
-    "std": 0.04752,
+    "std": 0.044655,
-    "sem": 0.001942,
+    "sem": 0.001825,
-    "min": 0.00236,
+    "min": 0.002741,
-    "max": 0.287479,
+    "max": 0.185438,
-    "recall-0.025": 0.426667,
+    "recall-0.025": 0.458333,
-    "recall-0.05": 0.728333,
+    "recall-0.05": 0.745,
-    "recall-0.1": 0.888333,
+    "recall-0.1": 0.891667,
-    "recall-0.15": 0.91,
+    "recall-0.15": 0.923333,
-    "recall-0.25": 0.996667,
+    "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "wrist_right": {
     "count": 600,
-    "mean": 0.046403,
+    "mean": 0.047488,
-    "median": 0.028916,
+    "median": 0.027367,
-    "std": 0.046566,
+    "std": 0.053442,
-    "sem": 0.001903,
+    "sem": 0.002184,
-    "min": 0.002735,
+    "min": 0.001357,
-    "max": 0.236808,
+    "max": 0.465438,
-    "recall-0.025": 0.428333,
+    "recall-0.025": 0.446667,
-    "recall-0.05": 0.731667,
+    "recall-0.05": 0.738333,
-    "recall-0.1": 0.87,
+    "recall-0.1": 0.868333,
-    "recall-0.15": 0.926667,
+    "recall-0.15": 0.898333,
-    "recall-0.25": 1.0,
+    "recall-0.25": 0.998333,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "hip_left": {
     "count": 600,
-    "mean": 0.079732,
+    "mean": 0.084262,
-    "median": 0.072175,
+    "median": 0.078071,
-    "std": 0.034532,
+    "std": 0.032944,
-    "sem": 0.001411,
+    "sem": 0.001346,
-    "min": 0.013963,
+    "min": 0.022541,
-    "max": 0.24229,
+    "max": 0.239428,
-    "recall-0.025": 0.013333,
+    "recall-0.025": 0.003333,
-    "recall-0.05": 0.081667,
+    "recall-0.05": 0.055,
-    "recall-0.1": 0.875,
+    "recall-0.1": 0.851667,
-    "recall-0.15": 0.945,
+    "recall-0.15": 0.951667,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "hip_right": {
     "count": 600,
-    "mean": 0.101424,
+    "mean": 0.106676,
-    "median": 0.099206,
+    "median": 0.103778,
-    "std": 0.02636,
+    "std": 0.025796,
-    "sem": 0.001077,
+    "sem": 0.001054,
-    "min": 0.032964,
+    "min": 0.042573,
-    "max": 0.226018,
+    "max": 0.242475,
     "recall-0.025": 0.0,
-    "recall-0.05": 0.008333,
+    "recall-0.05": 0.003333,
-    "recall-0.1": 0.52,
+    "recall-0.1": 0.421667,
-    "recall-0.15": 0.946667,
+    "recall-0.15": 0.948333,
     "recall-0.25": 1.0,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "knee_left": {
-    "count": 600,
+    "count": 598,
-    "mean": 0.06299,
+    "mean": 0.062386,
-    "median": 0.047078,
+    "median": 0.046647,
-    "std": 0.055676,
+    "std": 0.055624,
-    "sem": 0.002275,
+    "sem": 0.002277,
-    "min": 0.013748,
+    "min": 0.012414,
-    "max": 0.412425,
+    "max": 0.399633,
-    "recall-0.025": 0.03,
+    "recall-0.025": 0.045,
-    "recall-0.05": 0.548333,
+    "recall-0.05": 0.555,
-    "recall-0.1": 0.89,
+    "recall-0.1": 0.885,
-    "recall-0.15": 0.926667,
+    "recall-0.15": 0.925,
-    "recall-0.25": 0.983333,
+    "recall-0.25": 0.978333,
-    "recall-0.5": 1.0,
+    "recall-0.5": 0.996667,
     "num_labels": 600
   },
   "knee_right": {
     "count": 600,
-    "mean": 0.053303,
+    "mean": 0.050939,
-    "median": 0.039785,
+    "median": 0.041387,
-    "std": 0.048089,
+    "std": 0.037661,
-    "sem": 0.001965,
+    "sem": 0.001539,
-    "min": 0.009094,
+    "min": 0.006788,
-    "max": 0.470447,
+    "max": 0.268559,
-    "recall-0.025": 0.06,
+    "recall-0.025": 0.045,
-    "recall-0.05": 0.736667,
+    "recall-0.05": 0.73,
-    "recall-0.1": 0.923333,
+    "recall-0.1": 0.941667,
-    "recall-0.15": 0.926667,
+    "recall-0.15": 0.943333,
-    "recall-0.25": 0.988333,
+    "recall-0.25": 0.996667,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "ankle_left": {
     "count": 600,
-    "mean": 0.097848,
+    "mean": 0.096519,
-    "median": 0.087393,
+    "median": 0.085325,
-    "std": 0.039465,
+    "std": 0.043518,
-    "sem": 0.001613,
+    "sem": 0.001778,
-    "min": 0.049149,
+    "min": 0.049769,
-    "max": 0.49791,
+    "max": 0.494823,
     "recall-0.025": 0.0,
-    "recall-0.05": 0.005,
+    "recall-0.05": 0.001667,
-    "recall-0.1": 0.805,
+    "recall-0.1": 0.828333,
-    "recall-0.15": 0.923333,
+    "recall-0.15": 0.935,
-    "recall-0.25": 0.99,
+    "recall-0.25": 0.988333,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "ankle_right": {
     "count": 600,
-    "mean": 0.085394,
+    "mean": 0.082453,
-    "median": 0.070638,
+    "median": 0.068627,
-    "std": 0.050932,
+    "std": 0.050525,
-    "sem": 0.002081,
+    "sem": 0.002064,
-    "min": 0.027674,
+    "min": 0.026098,
-    "max": 0.441898,
+    "max": 0.482397,
     "recall-0.025": 0.0,
-    "recall-0.05": 0.023333,
+    "recall-0.05": 0.035,
-    "recall-0.1": 0.876667,
+    "recall-0.1": 0.896667,
-    "recall-0.15": 0.9,
+    "recall-0.15": 0.915,
-    "recall-0.25": 0.983333,
+    "recall-0.25": 0.981667,
     "recall-0.5": 1.0,
     "num_labels": 600
   },
   "joint_recalls": {
     "num_labels": 7800,
-    "recall-0.025": 0.15538,
+    "recall-0.025": 0.1659,
-    "recall-0.05": 0.45603,
+    "recall-0.05": 0.46526,
-    "recall-0.1": 0.83705,
+    "recall-0.1": 0.83359,
-    "recall-0.15": 0.92372,
+    "recall-0.15": 0.92705,
-    "recall-0.25": 0.99449,
+    "recall-0.25": 0.99436,
-    "recall-0.5": 1.0
+    "recall-0.5": 0.99974
   }
 }
 {
   "total_parts": 8400,
-  "correct_parts": 8090,
+  "correct_parts": 8113,
-  "pcp": 0.963095
+  "pcp": 0.965833
 }
 ```


@@ -1,6 +1,8 @@
+import math
 from abc import ABC, abstractmethod
 from typing import List
+import cv2
 import numpy as np
 import onnxruntime as ort
 from tqdm import tqdm
@@ -49,11 +51,11 @@ class BaseModel(ABC):
         self.warmup(warmup)
 
     @abstractmethod
-    def preprocess(self, image: np.ndarray, *args, **kwargs):
+    def preprocess(self, **kwargs):
         pass
 
     @abstractmethod
-    def postprocess(self, tensor: List[np.ndarray], *args, **kwargs):
+    def postprocess(self, **kwargs):
         pass
 
     def warmup(self, epoch: int):
@@ -97,20 +99,178 @@ class BaseModel(ABC):
         self.session.run(None, inputs)
 
-    def __call__(self, image: np.ndarray, *args, **kwargs):
-        tensor = self.preprocess(image, *args, **kwargs)
+    def __call__(self, **kwargs):
+        tensor = self.preprocess(**kwargs)
         inputs = {}
         for i in range(len(self.input_names)):
             iname = self.input_names[i]
             inputs[iname] = tensor[i]
         result = self.session.run(None, inputs)
-        output = self.postprocess(result, *args, **kwargs)
+        output = self.postprocess(result=result, **kwargs)
         return output
 
 
 # ==================================================================================================
+class LetterBox:
+    def __init__(self, target_size, fill_value=0):
+        self.target_size = target_size
+        self.fill_value = fill_value
+
+    def calc_params(self, ishape):
+        img_h, img_w = ishape[:2]
+        target_h, target_w = self.target_size
+        scale = min(target_w / img_w, target_h / img_h)
+        new_w = round(img_w * scale)
+        new_h = round(img_h * scale)
+        pad_w = target_w - new_w
+        pad_h = target_h - new_h
+        pad_left = pad_w // 2
+        pad_top = pad_h // 2
+        pad_right = pad_w - pad_left
+        pad_bottom = pad_h - pad_top
+        paddings = (pad_left, pad_right, pad_top, pad_bottom)
+        return paddings, scale, (new_w, new_h)
+
+    def resize_image(self, image):
+        paddings, _, new_size = self.calc_params(image.shape)
+        target_h, target_w = self.target_size
+        canvas = np.full(
+            (target_h, target_w, image.shape[2]),
+            self.fill_value,
+            dtype=image.dtype,
+        )
+        new_w, new_h = new_size
+        dx, dy = paddings[0], paddings[2]
+        canvas[dy : dy + new_h, dx : dx + new_w, :] = cv2.resize(
+            image, (new_w, new_h), interpolation=cv2.INTER_LINEAR
+        )
+        return canvas
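
For intuition, a quick worked check of the letterbox arithmetic in `calc_params` above. The 320x320 target matches the detector further down; the 480x640 frame size is an assumed example, not taken from this repo:

```python
# Assumed example: a 480x640 (h, w) frame letterboxed into a 320x320 input.
lb = LetterBox(target_size=(320, 320), fill_value=114)
paddings, scale, new_size = lb.calc_params((480, 640, 3))
print(scale)     # 0.5 -> min(320/640, 320/480)
print(new_size)  # (320, 240) -> resized (w, h) before padding
print(paddings)  # (0, 0, 40, 40) -> (left, right, top, bottom): fill bars top and bottom
```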
+
+
+# ==================================================================================================
+class BoxCrop:
+    def __init__(self, target_size, padding_scale=1.0, fill_value=0):
+        self.target_size = target_size
+        self.padding_scale = padding_scale
+        self.fill_value = fill_value
+
+    def calc_params(self, ishape, bbox):
+        start_x, start_y, end_x, end_y = bbox[0], bbox[1], bbox[2], bbox[3]
+        target_h, target_w = self.target_size
+        # Calculate original bounding box center
+        center_x = (start_x + end_x) / 2.0
+        center_y = (start_y + end_y) / 2.0
+        # Scale the bounding box by the padding_scale
+        bbox_w = end_x - start_x
+        bbox_h = end_y - start_y
+        scaled_w = bbox_w * self.padding_scale
+        scaled_h = bbox_h * self.padding_scale
+        # Calculate the aspect ratios
+        bbox_aspect = scaled_w / scaled_h
+        target_aspect = target_w / target_h
+        # Adjust the scaled bounding box to match the target aspect ratio
+        if bbox_aspect > target_aspect:
+            adjusted_h = scaled_w / target_aspect
+            adjusted_w = scaled_w
+        else:
+            adjusted_w = scaled_h * target_aspect
+            adjusted_h = scaled_h
+        # Calculate scaled bounding box coordinates
+        bbox_w = adjusted_w
+        bbox_h = adjusted_h
+        new_start_x = center_x - bbox_w / 2.0
+        new_start_y = center_y - bbox_h / 2.0
+        new_end_x = center_x + bbox_w / 2.0
+        new_end_y = center_y + bbox_h / 2.0
+        # Round the box coordinates
+        start_x = int(math.floor(new_start_x))
+        start_y = int(math.floor(new_start_y))
+        end_x = int(math.ceil(new_end_x))
+        end_y = int(math.ceil(new_end_y))
+        # Define the new box coordinates
+        new_start_x = max(0, start_x)
+        new_start_y = max(0, start_y)
+        new_end_x = min(ishape[1] - 1, end_x)
+        new_end_y = min(ishape[0] - 1, end_y)
+        new_box = [new_start_x, new_start_y, new_end_x, new_end_y]
+        bbox_w = new_box[2] - new_box[0]
+        bbox_h = new_box[3] - new_box[1]
+        scale = min(target_w / bbox_w, target_h / bbox_h)
+        new_w = round(bbox_w * scale)
+        new_h = round(bbox_h * scale)
+        # Calculate paddings
+        pad_w = target_w - new_w
+        pad_h = target_h - new_h
+        pad_left, pad_right, pad_top, pad_bottom = 0, 0, 0, 0
+        if pad_w > 0:
+            if start_x < 0:
+                pad_left = pad_w
+                pad_right = 0
+            elif end_x > ishape[1]:
+                pad_left = 0
+                pad_right = pad_w
+            else:
+                # Can be caused by bbox rounding
+                pad_left = pad_w // 2
+                pad_right = pad_w - pad_left
+        if pad_h > 0:
+            if start_y < 0:
+                pad_top = pad_h
+                pad_bottom = 0
+            elif end_y > ishape[0]:
+                pad_top = 0
+                pad_bottom = pad_h
+            else:
+                # Can be caused by bbox rounding
+                pad_top = pad_h // 2
+                pad_bottom = pad_h - pad_top
+        paddings = (pad_left, pad_right, pad_top, pad_bottom)
+        return paddings, scale, new_box, (new_w, new_h)
+
+    def crop_resize_box(self, image, bbox):
+        paddings, _, new_box, new_size = self.calc_params(image.shape, bbox)
+        image = image[new_box[1] : new_box[3], new_box[0] : new_box[2]]
+        th, tw = self.target_size
+        canvas = np.full(
+            (th, tw, image.shape[2]),
+            self.fill_value,
+            dtype=image.dtype,
+        )
+        nw, nh = new_size
+        dx, dy = paddings[0], paddings[2]
+        canvas[dy : dy + nh, dx : dx + nw, :] = cv2.resize(
+            image, (nw, nh), interpolation=cv2.INTER_LINEAR
+        )
+        return canvas
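
Likewise, a hedged worked example of the crop geometry, with assumed numbers: a 100x200 person box at (100, 50) in a 480x640 frame, prepared for the 384x288 pose input used below:

```python
bc = BoxCrop(target_size=(384, 288), padding_scale=1.25, fill_value=0)
paddings, scale, new_box, new_size = bc.calc_params((480, 640, 3), [100, 50, 200, 250])
print(new_box)   # [56, 25, 244, 275] -> box grown 1.25x, widened to 3:4, clamped to the frame
print(new_size)  # (288, 383)         -> crop resized to (w, h) inside the 288x384 canvas
print(paddings)  # (0, 0, 0, 1)       -> a single bottom padding row from rounding
```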
+
+
+# ==================================================================================================
 class RTMDet(BaseModel):
     def __init__(
         self,
@@ -119,17 +279,20 @@ class RTMDet(BaseModel):
         warmup: int = 30,
     ):
         super(RTMDet, self).__init__(model_path, warmup)
+        self.target_size = (320, 320)
         self.conf_threshold = conf_threshold
+        self.letterbox = LetterBox(self.target_size, fill_value=114)
 
     def preprocess(self, image: np.ndarray):
+        image = self.letterbox.resize_image(image)
         tensor = np.asarray(image).astype(self.input_types[0], copy=False)
         tensor = np.expand_dims(tensor, axis=0)
         tensor = [tensor]
         return tensor
 
-    def postprocess(self, tensor: List[np.ndarray]):
-        boxes = np.squeeze(tensor[1], axis=0)
-        classes = np.squeeze(tensor[0], axis=0)
+    def postprocess(self, result: List[np.ndarray], image: np.ndarray):
+        boxes = np.squeeze(result[0], axis=0)
+        classes = np.squeeze(result[1], axis=0)
 
         human_class = classes[:] == 0
         boxes = boxes[human_class]
@@ -137,6 +300,35 @@ class RTMDet(BaseModel):
         keep = boxes[:, 4] > self.conf_threshold
         boxes = boxes[keep]
 
+        paddings, scale, _ = self.letterbox.calc_params(image.shape)
+        boxes[:, 0] -= paddings[0]
+        boxes[:, 2] -= paddings[0]
+        boxes[:, 1] -= paddings[2]
+        boxes[:, 3] -= paddings[2]
+        boxes = np.maximum(boxes, 0)
+
+        th, tw = self.target_size
+        pad_w = paddings[0] + paddings[1]
+        pad_h = paddings[2] + paddings[3]
+        max_w = tw - pad_w - 1
+        max_h = th - pad_h - 1
+        b0 = boxes[:, 0]
+        b1 = boxes[:, 1]
+        b2 = boxes[:, 2]
+        b3 = boxes[:, 3]
+        b0 = np.minimum(b0, max_w)
+        b1 = np.minimum(b1, max_h)
+        b2 = np.minimum(b2, max_w)
+        b3 = np.minimum(b3, max_h)
+        boxes[:, 0] = b0
+        boxes[:, 1] = b1
+        boxes[:, 2] = b2
+        boxes[:, 3] = b3
+        boxes[:, 0:4] /= scale
+
         return boxes
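
A quick numeric check of this un-letterboxing, reusing the assumed 480x640 frame from the LetterBox sketch above:

```python
lb = LetterBox((320, 320), fill_value=114)
paddings, scale, _ = lb.calc_params((480, 640, 3))
# A box corner at (10, 60) in letterbox space: subtract the top pad, undo the scale.
x = (10 - paddings[0]) / scale   # 10 / 0.5 = 20.0
y = (60 - paddings[2]) / scale   # 20 / 0.5 = 40.0
print(x, y)  # 20.0 40.0 in original-frame pixels
```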
@@ -146,7 +338,8 @@
 
 class RTMPose(BaseModel):
     def __init__(self, model_path: str, warmup: int = 30):
         super(RTMPose, self).__init__(model_path, warmup)
-        self.bbox = None
+        self.target_size = (384, 288)
+        self.boxcrop = BoxCrop(self.target_size, padding_scale=1.25, fill_value=0)
 
     def preprocess(self, image: np.ndarray, bbox: np.ndarray):
@@ -154,13 +347,34 @@ class RTMPose(BaseModel):
         tensor = np.asarray(image).astype(self.input_types[0], copy=False)
 
         bbox = np.asarray(bbox)[0:4]
         bbox += np.array([-0.5, -0.5, 0.5 - 1e-8, 0.5 - 1e-8])
         bbox = bbox.round().astype(np.int32)
-        bbox = np.expand_dims(bbox, axis=0)
-        tensor = [tensor, bbox]
+        region = self.boxcrop.crop_resize_box(image, bbox)
+        tensor = np.asarray(region).astype(self.input_types[0], copy=False)
+        tensor = np.expand_dims(tensor, axis=0)
+        tensor = [tensor]
         return tensor
 
-    def postprocess(self, tensor: List[np.ndarray], **kwargs):
-        scores = np.clip(tensor[0][0], 0, 1)
-        kp = np.concatenate([tensor[1][0], np.expand_dims(scores, axis=-1)], axis=-1)
+    def postprocess(
+        self, result: List[np.ndarray], image: np.ndarray, bbox: np.ndarray
+    ):
+        scores = np.clip(result[1][0], 0, 1)
+        kp = np.concatenate([result[0][0], np.expand_dims(scores, axis=-1)], axis=-1)
+
+        paddings, scale, bbox, _ = self.boxcrop.calc_params(image.shape, bbox)
+        kp[:, 0] -= paddings[0]
+        kp[:, 1] -= paddings[2]
+        kp[:, 0:2] /= scale
+        kp[:, 0] += bbox[0]
+        kp[:, 1] += bbox[1]
+
+        kp[:, 0:2] = np.maximum(kp[:, 0:2], 0)
+        max_w = image.shape[1] - 1
+        max_h = image.shape[0] - 1
+        b0 = kp[:, 0]
+        b1 = kp[:, 1]
+        b0 = np.minimum(b0, max_w)
+        b1 = np.minimum(b1, max_h)
+        kp[:, 0] = b0
+        kp[:, 1] = b1
+
         return kp
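
And a sanity check of the keypoint inverse mapping, reusing the assumed BoxCrop example: the centre of the 288x384 crop should land near the centre of the original person box.

```python
bc = BoxCrop(target_size=(384, 288), padding_scale=1.25, fill_value=0)
paddings, scale, box, _ = bc.calc_params((480, 640, 3), [100, 50, 200, 250])
x = (144 - paddings[0]) / scale + box[0]  # 144 * 188 / 288 + 56  = 150.0
y = (192 - paddings[2]) / scale + box[1]  # 192 * 188 / 288 + 25 ~= 150.33
print(x, y)  # close to the (150, 150) box centre
```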
@@ -184,10 +398,10 @@ class TopDown:
         self.pose_model = RTMPose(pose_model_path, warmup)
 
     def predict(self, image):
-        boxes = self.det_model(image)
+        boxes = self.det_model(image=image)
         results = []
         for i in range(boxes.shape[0]):
-            kp = self.pose_model(image, bbox=boxes[i])
+            kp = self.pose_model(image=image, bbox=boxes[i])
             results.append(kp)
         return results
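
Finally, a minimal usage sketch of the reworked pipeline. The model paths and the TopDown constructor signature are assumptions inferred from the hunks above, not confirmed by this diff:

```python
import cv2

# Hypothetical paths; the repo's real export names are not shown in this diff.
pipeline = TopDown("rtmdet.onnx", "rtmpose.onnx", warmup=10)

image = cv2.imread("frame.png")   # HWC uint8, matching the uint8 graph input
people = pipeline.predict(image)  # one (num_joints, 3) array [x, y, score] per person
for kp in people:
    print(kp[:, :2].round(1), kp[:, 2].mean())
```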