From 108937d96c69ccff7012d80d0771a470a29909c7 Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 6 Dec 2024 13:58:34 +0100 Subject: [PATCH] Some mixed improvements. --- media/RESULTS.md | 284 +++++++++++++++++------------------ scripts/utils_2d_pose_ort.py | 108 +++++++------ 2 files changed, 201 insertions(+), 191 deletions(-) diff --git a/media/RESULTS.md b/media/RESULTS.md index 288b348..9c13741 100644 --- a/media/RESULTS.md +++ b/media/RESULTS.md @@ -6,9 +6,9 @@ Results of the model in various experiments on different datasets. ```json { - "avg_time_2d": 0.01109659348504018, - "avg_time_3d": 0.00034234281313621394, - "avg_fps": 87.4207158719313 + "avg_time_2d": 0.010846347323918747, + "avg_time_3d": 0.0003320467674126059, + "avg_fps": 89.45828817893282 } { "person_nums": { @@ -27,149 +27,149 @@ Results of the model in various experiments on different datasets. }, "mpjpe": { "count": 600, - "mean": 0.06621, - "median": 0.058297, - "std": 0.027913, - "sem": 0.00114, - "min": 0.04047, - "max": 0.189061, + "mean": 0.066093, + "median": 0.058635, + "std": 0.027815, + "sem": 0.001136, + "min": 0.040333, + "max": 0.189198, "recall-0.025": 0.0, - "recall-0.05": 0.098333, - "recall-0.1": 0.941667, + "recall-0.05": 0.101667, + "recall-0.1": 0.938333, "recall-0.15": 0.95, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600, "ap-0.025": 0.0, - "ap-0.05": 0.018429, - "ap-0.1": 0.901756, - "ap-0.15": 0.913878, + "ap-0.05": 0.023002, + "ap-0.1": 0.897991, + "ap-0.15": 0.914985, "ap-0.25": 1.0, "ap-0.5": 1.0 }, "nose": { "count": 600, - "mean": 0.113174, - "median": 0.098547, - "std": 0.041425, - "sem": 0.001693, - "min": 0.029421, - "max": 0.27266, + "mean": 0.114181, + "median": 0.099121, + "std": 0.042396, + "sem": 0.001732, + "min": 0.029365, + "max": 0.287428, "recall-0.025": 0.0, - "recall-0.05": 0.01, - "recall-0.1": 0.515, - "recall-0.15": 0.81, + "recall-0.05": 0.011667, + "recall-0.1": 0.508333, + "recall-0.15": 0.801667, "recall-0.25": 0.991667, "recall-0.5": 1.0, "num_labels": 600 }, "shoulder_left": { "count": 600, - "mean": 0.034727, - "median": 0.026049, - "std": 0.031822, - "sem": 0.0013, - "min": 0.002176, - "max": 0.183422, - "recall-0.025": 0.471667, - "recall-0.05": 0.855, + "mean": 0.03478, + "median": 0.026496, + "std": 0.031647, + "sem": 0.001293, + "min": 0.003155, + "max": 0.183779, + "recall-0.025": 0.455, + "recall-0.05": 0.853333, "recall-0.1": 0.95, - "recall-0.15": 0.965, + "recall-0.15": 0.966667, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "shoulder_right": { "count": 600, - "mean": 0.04794, - "median": 0.034508, - "std": 0.039316, - "sem": 0.001606, - "min": 0.004604, - "max": 0.218143, - "recall-0.025": 0.211667, - "recall-0.05": 0.76, - "recall-0.1": 0.918333, - "recall-0.15": 0.946667, - "recall-0.25": 1.0, + "mean": 0.047867, + "median": 0.034293, + "std": 0.039619, + "sem": 0.001619, + "min": 0.005688, + "max": 0.254393, + "recall-0.025": 0.218333, + "recall-0.05": 0.751667, + "recall-0.1": 0.913333, + "recall-0.15": 0.95, + "recall-0.25": 0.998333, "recall-0.5": 1.0, "num_labels": 600 }, "elbow_left": { "count": 600, - "mean": 0.044638, - "median": 0.036326, - "std": 0.034761, - "sem": 0.00142, - "min": 0.003696, - "max": 0.196813, - "recall-0.025": 0.226667, - "recall-0.05": 0.778333, - "recall-0.1": 0.941667, - "recall-0.15": 0.953333, + "mean": 0.044022, + "median": 0.035159, + "std": 0.034701, + "sem": 0.001418, + "min": 0.002814, + "max": 0.194526, + "recall-0.025": 0.233333, + "recall-0.05": 0.771667, + "recall-0.1": 0.943333, + "recall-0.15": 0.958333, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "elbow_right": { "count": 600, - "mean": 0.044037, - "median": 0.033739, - "std": 0.036263, - "sem": 0.001482, - "min": 0.007995, - "max": 0.351118, - "recall-0.025": 0.251667, - "recall-0.05": 0.788333, - "recall-0.1": 0.931667, - "recall-0.15": 0.945, + "mean": 0.04408, + "median": 0.033951, + "std": 0.036319, + "sem": 0.001484, + "min": 0.008171, + "max": 0.360134, + "recall-0.025": 0.265, + "recall-0.05": 0.78, + "recall-0.1": 0.933333, + "recall-0.15": 0.946667, "recall-0.25": 0.998333, "recall-0.5": 1.0, "num_labels": 600 }, "wrist_left": { "count": 600, - "mean": 0.043333, - "median": 0.027284, - "std": 0.044655, + "mean": 0.043753, + "median": 0.027211, + "std": 0.044668, "sem": 0.001825, - "min": 0.002741, - "max": 0.185438, - "recall-0.025": 0.458333, - "recall-0.05": 0.745, + "min": 0.002715, + "max": 0.190751, + "recall-0.025": 0.46, + "recall-0.05": 0.74, "recall-0.1": 0.891667, - "recall-0.15": 0.923333, + "recall-0.15": 0.925, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "wrist_right": { "count": 600, - "mean": 0.047488, - "median": 0.027367, - "std": 0.053442, - "sem": 0.002184, - "min": 0.001357, - "max": 0.465438, - "recall-0.025": 0.446667, - "recall-0.05": 0.738333, - "recall-0.1": 0.868333, - "recall-0.15": 0.898333, - "recall-0.25": 0.998333, + "mean": 0.046553, + "median": 0.026979, + "std": 0.050263, + "sem": 0.002054, + "min": 0.003364, + "max": 0.244861, + "recall-0.025": 0.46, + "recall-0.05": 0.733333, + "recall-0.1": 0.87, + "recall-0.15": 0.906667, + "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "hip_left": { "count": 600, - "mean": 0.084262, - "median": 0.078071, - "std": 0.032944, - "sem": 0.001346, - "min": 0.022541, - "max": 0.239428, - "recall-0.025": 0.003333, + "mean": 0.08362, + "median": 0.077619, + "std": 0.032967, + "sem": 0.001347, + "min": 0.018157, + "max": 0.240771, + "recall-0.025": 0.005, "recall-0.05": 0.055, - "recall-0.1": 0.851667, + "recall-0.1": 0.848333, "recall-0.15": 0.951667, "recall-0.25": 1.0, "recall-0.5": 1.0, @@ -177,63 +177,63 @@ Results of the model in various experiments on different datasets. }, "hip_right": { "count": 600, - "mean": 0.106676, - "median": 0.103778, - "std": 0.025796, - "sem": 0.001054, - "min": 0.042573, - "max": 0.242475, + "mean": 0.106567, + "median": 0.104243, + "std": 0.026243, + "sem": 0.001072, + "min": 0.035565, + "max": 0.245341, "recall-0.025": 0.0, "recall-0.05": 0.003333, - "recall-0.1": 0.421667, - "recall-0.15": 0.948333, + "recall-0.1": 0.415, + "recall-0.15": 0.946667, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "knee_left": { - "count": 598, - "mean": 0.062386, - "median": 0.046647, - "std": 0.055624, - "sem": 0.002277, - "min": 0.012414, - "max": 0.399633, - "recall-0.025": 0.045, - "recall-0.05": 0.555, - "recall-0.1": 0.885, + "count": 599, + "mean": 0.063278, + "median": 0.047513, + "std": 0.056978, + "sem": 0.00233, + "min": 0.017587, + "max": 0.4004, + "recall-0.025": 0.038333, + "recall-0.05": 0.546667, + "recall-0.1": 0.883333, "recall-0.15": 0.925, "recall-0.25": 0.978333, - "recall-0.5": 0.996667, + "recall-0.5": 0.998333, "num_labels": 600 }, "knee_right": { "count": 600, - "mean": 0.050939, - "median": 0.041387, - "std": 0.037661, - "sem": 0.001539, - "min": 0.006788, - "max": 0.268559, - "recall-0.025": 0.045, - "recall-0.05": 0.73, + "mean": 0.050742, + "median": 0.041408, + "std": 0.037974, + "sem": 0.001552, + "min": 0.01394, + "max": 0.279839, + "recall-0.025": 0.053333, + "recall-0.05": 0.75, "recall-0.1": 0.941667, - "recall-0.15": 0.943333, + "recall-0.15": 0.941667, "recall-0.25": 0.996667, "recall-0.5": 1.0, "num_labels": 600 }, "ankle_left": { "count": 600, - "mean": 0.096519, - "median": 0.085325, - "std": 0.043518, - "sem": 0.001778, - "min": 0.049769, - "max": 0.494823, + "mean": 0.096717, + "median": 0.085484, + "std": 0.043279, + "sem": 0.001768, + "min": 0.050765, + "max": 0.49651, "recall-0.025": 0.0, - "recall-0.05": 0.001667, - "recall-0.1": 0.828333, + "recall-0.05": 0.0, + "recall-0.1": 0.825, "recall-0.15": 0.935, "recall-0.25": 0.988333, "recall-0.5": 1.0, @@ -241,34 +241,34 @@ Results of the model in various experiments on different datasets. }, "ankle_right": { "count": 600, - "mean": 0.082453, - "median": 0.068627, - "std": 0.050525, - "sem": 0.002064, - "min": 0.026098, - "max": 0.482397, + "mean": 0.08227, + "median": 0.068786, + "std": 0.049929, + "sem": 0.00204, + "min": 0.028705, + "max": 0.486848, "recall-0.025": 0.0, - "recall-0.05": 0.035, + "recall-0.05": 0.033333, "recall-0.1": 0.896667, - "recall-0.15": 0.915, - "recall-0.25": 0.981667, + "recall-0.15": 0.916667, + "recall-0.25": 0.985, "recall-0.5": 1.0, "num_labels": 600 }, "joint_recalls": { "num_labels": 7800, - "recall-0.025": 0.1659, - "recall-0.05": 0.46526, - "recall-0.1": 0.83359, - "recall-0.15": 0.92705, - "recall-0.25": 0.99436, + "recall-0.025": 0.16782, + "recall-0.05": 0.46333, + "recall-0.1": 0.83154, + "recall-0.15": 0.92846, + "recall-0.25": 0.99462, "recall-0.5": 0.99974 } } { "total_parts": 8400, - "correct_parts": 8113, - "pcp": 0.965833 + "correct_parts": 8111, + "pcp": 0.965595 } ``` diff --git a/scripts/utils_2d_pose_ort.py b/scripts/utils_2d_pose_ort.py index 33640d4..88787d9 100644 --- a/scripts/utils_2d_pose_ort.py +++ b/scripts/utils_2d_pose_ort.py @@ -37,14 +37,16 @@ class BaseModel(ABC): self.input_types = [] for i in range(len(input_types)): input_type = input_types[i] - if input_type == "tensor(float16)": + if input_type == "tensor(float32)": + itype = np.float32 + elif input_type == "tensor(float16)": itype = np.float16 - elif input_type == "tensor(uint8)": - itype = np.uint8 elif input_type == "tensor(int32)": itype = np.int32 + elif input_type == "tensor(uint8)": + itype = np.uint8 else: - itype = np.float32 + raise ValueError("Undefined input type:", input_type) self.input_types.append(itype) if warmup > 0: @@ -59,6 +61,8 @@ class BaseModel(ABC): pass def warmup(self, epoch: int): + np.random.seed(42) + print("Running warmup for '{}' ...".format(self.__class__.__name__)) for _ in tqdm(range(epoch)): inputs = {} @@ -139,20 +143,30 @@ class LetterBox: def resize_image(self, image): paddings, _, new_size = self.calc_params(image.shape) - target_h, target_w = self.target_size - canvas = np.full( - (target_h, target_w, image.shape[2]), - self.fill_value, - dtype=image.dtype, - ) - + # Resize the image new_w, new_h = new_size - dx, dy = paddings[0], paddings[2] - canvas[dy : dy + new_h, dx : dx + new_w, :] = cv2.resize( - image, (new_w, new_h), interpolation=cv2.INTER_LINEAR + resized_img = cv2.resize( + image, + (new_w, new_h), + interpolation=cv2.INTER_LINEAR, ) - return canvas + # Optionally pad the image + pad_left, pad_right, pad_top, pad_bottom = paddings + if pad_left == 0 and pad_right == 0 and pad_top == 0 and pad_bottom == 0: + final_img = resized_img + else: + final_img = cv2.copyMakeBorder( + resized_img, + pad_top, + pad_bottom, + pad_left, + pad_right, + borderType=cv2.BORDER_CONSTANT, + value=[self.fill_value, self.fill_value, self.fill_value], + ) + + return final_img # ================================================================================================== @@ -211,6 +225,7 @@ class BoxCrop: new_end_y = min(ishape[0] - 1, end_y) new_box = [new_start_x, new_start_y, new_end_x, new_end_y] + # Calculate resized crop size bbox_w = new_box[2] - new_box[0] bbox_h = new_box[3] - new_box[1] scale = min(target_w / bbox_w, target_h / bbox_h) @@ -250,22 +265,33 @@ class BoxCrop: def crop_resize_box(self, image, bbox): paddings, _, new_box, new_size = self.calc_params(image.shape, bbox) - image = image[new_box[1] : new_box[3], new_box[0] : new_box[2]] + # Extract the bounding box + cropped_img = image[new_box[1] : new_box[3], new_box[0] : new_box[2]] - th, tw = self.target_size - canvas = np.full( - (th, tw, image.shape[2]), - self.fill_value, - dtype=image.dtype, + # Resize the image + new_w, new_h = new_size + resized_img = cv2.resize( + cropped_img, + (new_w, new_h), + interpolation=cv2.INTER_LINEAR, ) - nw, nh = new_size - dx, dy = paddings[0], paddings[2] - canvas[dy : dy + nh, dx : dx + nw, :] = cv2.resize( - image, (nw, nh), interpolation=cv2.INTER_LINEAR - ) + # Optionally pad the image + pad_left, pad_right, pad_top, pad_bottom = paddings + if pad_left == 0 and pad_right == 0 and pad_top == 0 and pad_bottom == 0: + final_img = resized_img + else: + final_img = cv2.copyMakeBorder( + resized_img, + pad_top, + pad_bottom, + pad_left, + pad_right, + borderType=cv2.BORDER_CONSTANT, + value=[self.fill_value, self.fill_value, self.fill_value], + ) - return canvas + return final_img # ================================================================================================== @@ -308,27 +334,17 @@ class RTMDet(BaseModel): boxes[:, 3] -= paddings[2] boxes = np.maximum(boxes, 0) - th, tw = self.target_size pad_w = paddings[0] + paddings[1] pad_h = paddings[2] + paddings[3] max_w = tw - pad_w - 1 max_h = th - pad_h - 1 - b0 = boxes[:, 0] - b1 = boxes[:, 1] - b2 = boxes[:, 2] - b3 = boxes[:, 3] - b0 = np.minimum(b0, max_w) - b1 = np.minimum(b1, max_h) - b2 = np.minimum(b2, max_w) - b3 = np.minimum(b3, max_h) - boxes[:, 0] = b0 - boxes[:, 1] = b1 - boxes[:, 2] = b2 - boxes[:, 3] = b3 + boxes[:, 0] = np.minimum(boxes[:, 0], max_w) + boxes[:, 1] = np.minimum(boxes[:, 1], max_h) + boxes[:, 2] = np.minimum(boxes[:, 2], max_w) + boxes[:, 3] = np.minimum(boxes[:, 3], max_h) boxes[:, 0:4] /= scale - return boxes @@ -342,8 +358,6 @@ class RTMPose(BaseModel): self.boxcrop = BoxCrop(self.target_size, padding_scale=1.25, fill_value=0) def preprocess(self, image: np.ndarray, bbox: np.ndarray): - tensor = np.asarray(image).astype(self.input_types[0], copy=False) - tensor = np.expand_dims(tensor, axis=0) bbox = np.asarray(bbox)[0:4] bbox += np.array([-0.5, -0.5, 0.5 - 1e-8, 0.5 - 1e-8]) bbox = bbox.round().astype(np.int32) @@ -368,12 +382,8 @@ class RTMPose(BaseModel): kp[:, 0:2] = np.maximum(kp[:, 0:2], 0) max_w = image.shape[1] - 1 max_h = image.shape[0] - 1 - b0 = kp[:, 0] - b1 = kp[:, 1] - b0 = np.minimum(b0, max_w) - b1 = np.minimum(b1, max_h) - kp[:, 0] = b0 - kp[:, 1] = b1 + kp[:, 0] = np.minimum(kp[:, 0], max_w) + kp[:, 1] = np.minimum(kp[:, 1], max_h) return kp