diff --git a/extras/mmdeploy/configs/detection_onnxruntime_static-320x320.py b/extras/mmdeploy/configs/detection_onnxruntime_static-320x320.py index b9a9c4c..3918d65 100644 --- a/extras/mmdeploy/configs/detection_onnxruntime_static-320x320.py +++ b/extras/mmdeploy/configs/detection_onnxruntime_static-320x320.py @@ -7,12 +7,12 @@ onnx_config = dict( codebase_config = dict( # For later TensorRT inference, the number of output boxes needs to be as stable as possible, # because a drop in the box count leads to a re-optimization which takes a lot of time, - # therefore sort out low confidence boxes outside the model and reduce the maximum number - # of output boxes to the smallest usable value. + # therefore reduce the maximum number of output boxes to the smallest usable value and sort out + # low confidence boxes outside the model. post_processing=dict( score_threshold=0.0, confidence_threshold=0.0, - iou_threshold=0.3, + iou_threshold=0.5, max_output_boxes_per_class=10, ), ) diff --git a/extras/mmdeploy/configs/detection_onnxruntime_static-320x320_fp16.py b/extras/mmdeploy/configs/detection_onnxruntime_static-320x320_fp16.py index a5478be..a724f53 100644 --- a/extras/mmdeploy/configs/detection_onnxruntime_static-320x320_fp16.py +++ b/extras/mmdeploy/configs/detection_onnxruntime_static-320x320_fp16.py @@ -12,7 +12,7 @@ codebase_config = dict( post_processing=dict( score_threshold=0.0, confidence_threshold=0.0, - iou_threshold=0.3, + iou_threshold=0.5, max_output_boxes_per_class=10, ), ) diff --git a/media/RESULTS.md b/media/RESULTS.md index 9c13741..7b1b616 100644 --- a/media/RESULTS.md +++ b/media/RESULTS.md @@ -6,74 +6,74 @@ Results of the model in various experiments on different datasets. ```json { - "avg_time_2d": 0.010846347323918747, - "avg_time_3d": 0.0003320467674126059, - "avg_fps": 89.45828817893282 + "avg_time_2d": 0.010003441875263796, + "avg_time_3d": 0.0003245426436602059, + "avg_fps": 96.824312446218 } { "person_nums": { "total_frames": 600, "total_labels": 600, - "total_preds": 601, + "total_preds": 600, "considered_empty": 0, "valid_preds": 600, - "invalid_preds": 1, + "invalid_preds": 0, "missing": 0, - "invalid_fraction": 0.00166, - "precision": 0.99834, + "invalid_fraction": 0.0, + "precision": 1.0, "recall": 1.0, - "f1": 0.99917, - "non_empty": 601 + "f1": 1.0, + "non_empty": 600 }, "mpjpe": { "count": 600, - "mean": 0.066093, - "median": 0.058635, - "std": 0.027815, - "sem": 0.001136, - "min": 0.040333, - "max": 0.189198, + "mean": 0.067074, + "median": 0.058987, + "std": 0.027958, + "sem": 0.001142, + "min": 0.042414, + "max": 0.189648, "recall-0.025": 0.0, - "recall-0.05": 0.101667, - "recall-0.1": 0.938333, + "recall-0.05": 0.061667, + "recall-0.1": 0.93, "recall-0.15": 0.95, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600, "ap-0.025": 0.0, - "ap-0.05": 0.023002, - "ap-0.1": 0.897991, - "ap-0.15": 0.914985, + "ap-0.05": 0.00503, + "ap-0.1": 0.887557, + "ap-0.15": 0.913732, "ap-0.25": 1.0, "ap-0.5": 1.0 }, "nose": { "count": 600, - "mean": 0.114181, - "median": 0.099121, - "std": 0.042396, - "sem": 0.001732, - "min": 0.029365, - "max": 0.287428, + "mean": 0.114519, + "median": 0.097973, + "std": 0.044206, + "sem": 0.001806, + "min": 0.025858, + "max": 0.292026, "recall-0.025": 0.0, - "recall-0.05": 0.011667, - "recall-0.1": 0.508333, - "recall-0.15": 0.801667, - "recall-0.25": 0.991667, + "recall-0.05": 0.015, + "recall-0.1": 0.52, + "recall-0.15": 0.816667, + "recall-0.25": 0.988333, "recall-0.5": 1.0, "num_labels": 600 }, "shoulder_left": { "count": 600, - "mean": 0.03478, - "median": 0.026496, - "std": 0.031647, - "sem": 0.001293, - "min": 0.003155, - "max": 0.183779, - "recall-0.025": 0.455, - "recall-0.05": 0.853333, - "recall-0.1": 0.95, + "mean": 0.034466, + "median": 0.025369, + "std": 0.032528, + "sem": 0.001329, + "min": 0.002782, + "max": 0.182086, + "recall-0.025": 0.483333, + "recall-0.05": 0.863333, + "recall-0.1": 0.941667, "recall-0.15": 0.966667, "recall-0.25": 1.0, "recall-0.5": 1.0, @@ -81,95 +81,95 @@ Results of the model in various experiments on different datasets. }, "shoulder_right": { "count": 600, - "mean": 0.047867, - "median": 0.034293, - "std": 0.039619, - "sem": 0.001619, - "min": 0.005688, - "max": 0.254393, - "recall-0.025": 0.218333, + "mean": 0.048171, + "median": 0.03483, + "std": 0.040889, + "sem": 0.001671, + "min": 0.003841, + "max": 0.258489, + "recall-0.025": 0.221667, "recall-0.05": 0.751667, "recall-0.1": 0.913333, - "recall-0.15": 0.95, + "recall-0.15": 0.945, "recall-0.25": 0.998333, "recall-0.5": 1.0, "num_labels": 600 }, "elbow_left": { "count": 600, - "mean": 0.044022, - "median": 0.035159, - "std": 0.034701, - "sem": 0.001418, - "min": 0.002814, - "max": 0.194526, - "recall-0.025": 0.233333, - "recall-0.05": 0.771667, - "recall-0.1": 0.943333, - "recall-0.15": 0.958333, + "mean": 0.043039, + "median": 0.03493, + "std": 0.034865, + "sem": 0.001425, + "min": 0.002006, + "max": 0.197281, + "recall-0.025": 0.248333, + "recall-0.05": 0.805, + "recall-0.1": 0.941667, + "recall-0.15": 0.955, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "elbow_right": { "count": 600, - "mean": 0.04408, - "median": 0.033951, - "std": 0.036319, - "sem": 0.001484, - "min": 0.008171, - "max": 0.360134, - "recall-0.025": 0.265, - "recall-0.05": 0.78, - "recall-0.1": 0.933333, - "recall-0.15": 0.946667, - "recall-0.25": 0.998333, + "mean": 0.044694, + "median": 0.032396, + "std": 0.03821, + "sem": 0.001561, + "min": 0.005657, + "max": 0.367138, + "recall-0.025": 0.24, + "recall-0.05": 0.791667, + "recall-0.1": 0.928333, + "recall-0.15": 0.943333, + "recall-0.25": 0.996667, "recall-0.5": 1.0, "num_labels": 600 }, "wrist_left": { "count": 600, - "mean": 0.043753, - "median": 0.027211, - "std": 0.044668, - "sem": 0.001825, - "min": 0.002715, - "max": 0.190751, - "recall-0.025": 0.46, - "recall-0.05": 0.74, - "recall-0.1": 0.891667, - "recall-0.15": 0.925, - "recall-0.25": 1.0, + "mean": 0.043228, + "median": 0.024022, + "std": 0.047501, + "sem": 0.001941, + "min": 0.002332, + "max": 0.283113, + "recall-0.025": 0.52, + "recall-0.05": 0.746667, + "recall-0.1": 0.885, + "recall-0.15": 0.92, + "recall-0.25": 0.996667, "recall-0.5": 1.0, "num_labels": 600 }, "wrist_right": { - "count": 600, - "mean": 0.046553, - "median": 0.026979, - "std": 0.050263, - "sem": 0.002054, - "min": 0.003364, - "max": 0.244861, - "recall-0.025": 0.46, - "recall-0.05": 0.733333, - "recall-0.1": 0.87, + "count": 599, + "mean": 0.047526, + "median": 0.027369, + "std": 0.055131, + "sem": 0.002254, + "min": 0.001, + "max": 0.492857, + "recall-0.025": 0.451667, + "recall-0.05": 0.74, + "recall-0.1": 0.873333, "recall-0.15": 0.906667, - "recall-0.25": 1.0, - "recall-0.5": 1.0, + "recall-0.25": 0.991667, + "recall-0.5": 0.998333, "num_labels": 600 }, "hip_left": { "count": 600, - "mean": 0.08362, - "median": 0.077619, - "std": 0.032967, - "sem": 0.001347, - "min": 0.018157, - "max": 0.240771, - "recall-0.025": 0.005, - "recall-0.05": 0.055, - "recall-0.1": 0.848333, + "mean": 0.089504, + "median": 0.085316, + "std": 0.032919, + "sem": 0.001345, + "min": 0.011484, + "max": 0.236463, + "recall-0.025": 0.006667, + "recall-0.05": 0.031667, + "recall-0.1": 0.815, "recall-0.15": 0.951667, "recall-0.25": 1.0, "recall-0.5": 1.0, @@ -177,98 +177,98 @@ Results of the model in various experiments on different datasets. }, "hip_right": { "count": 600, - "mean": 0.106567, - "median": 0.104243, - "std": 0.026243, - "sem": 0.001072, - "min": 0.035565, - "max": 0.245341, + "mean": 0.112947, + "median": 0.112279, + "std": 0.026967, + "sem": 0.001102, + "min": 0.041373, + "max": 0.235641, "recall-0.025": 0.0, - "recall-0.05": 0.003333, - "recall-0.1": 0.415, + "recall-0.05": 0.01, + "recall-0.1": 0.245, "recall-0.15": 0.946667, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "knee_left": { - "count": 599, - "mean": 0.063278, - "median": 0.047513, - "std": 0.056978, - "sem": 0.00233, - "min": 0.017587, - "max": 0.4004, - "recall-0.025": 0.038333, - "recall-0.05": 0.546667, - "recall-0.1": 0.883333, - "recall-0.15": 0.925, - "recall-0.25": 0.978333, - "recall-0.5": 0.998333, + "count": 600, + "mean": 0.061189, + "median": 0.045843, + "std": 0.0566, + "sem": 0.002313, + "min": 0.012587, + "max": 0.400213, + "recall-0.025": 0.05, + "recall-0.05": 0.58, + "recall-0.1": 0.91, + "recall-0.15": 0.926667, + "recall-0.25": 0.981667, + "recall-0.5": 1.0, "num_labels": 600 }, "knee_right": { "count": 600, - "mean": 0.050742, - "median": 0.041408, - "std": 0.037974, - "sem": 0.001552, - "min": 0.01394, - "max": 0.279839, - "recall-0.025": 0.053333, - "recall-0.05": 0.75, - "recall-0.1": 0.941667, - "recall-0.15": 0.941667, - "recall-0.25": 0.996667, + "mean": 0.052612, + "median": 0.04423, + "std": 0.037278, + "sem": 0.001523, + "min": 0.01118, + "max": 0.249994, + "recall-0.025": 0.038333, + "recall-0.05": 0.736667, + "recall-0.1": 0.936667, + "recall-0.15": 0.94, + "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "ankle_left": { - "count": 600, - "mean": 0.096717, - "median": 0.085484, - "std": 0.043279, - "sem": 0.001768, - "min": 0.050765, - "max": 0.49651, + "count": 598, + "mean": 0.095824, + "median": 0.084767, + "std": 0.048441, + "sem": 0.001983, + "min": 0.045599, + "max": 0.496625, "recall-0.025": 0.0, - "recall-0.05": 0.0, - "recall-0.1": 0.825, - "recall-0.15": 0.935, - "recall-0.25": 0.988333, - "recall-0.5": 1.0, + "recall-0.05": 0.003333, + "recall-0.1": 0.843333, + "recall-0.15": 0.94, + "recall-0.25": 0.981667, + "recall-0.5": 0.996667, "num_labels": 600 }, "ankle_right": { - "count": 600, - "mean": 0.08227, - "median": 0.068786, - "std": 0.049929, - "sem": 0.00204, - "min": 0.028705, - "max": 0.486848, + "count": 598, + "mean": 0.080368, + "median": 0.067762, + "std": 0.045136, + "sem": 0.001847, + "min": 0.031319, + "max": 0.490733, "recall-0.025": 0.0, - "recall-0.05": 0.033333, - "recall-0.1": 0.896667, - "recall-0.15": 0.916667, - "recall-0.25": 0.985, - "recall-0.5": 1.0, + "recall-0.05": 0.028333, + "recall-0.1": 0.89, + "recall-0.15": 0.913333, + "recall-0.25": 0.983333, + "recall-0.5": 0.996667, "num_labels": 600 }, "joint_recalls": { "num_labels": 7800, - "recall-0.025": 0.16782, - "recall-0.05": 0.46333, - "recall-0.1": 0.83154, - "recall-0.15": 0.92846, - "recall-0.25": 0.99462, - "recall-0.5": 0.99974 + "recall-0.025": 0.17346, + "recall-0.05": 0.4691, + "recall-0.1": 0.81808, + "recall-0.15": 0.92833, + "recall-0.25": 0.99333, + "recall-0.5": 0.99923 } } { "total_parts": 8400, - "correct_parts": 8111, - "pcp": 0.965595 + "correct_parts": 8084, + "pcp": 0.962381 } ``` diff --git a/scripts/utils_2d_pose_ort.py b/scripts/utils_2d_pose_ort.py index 88787d9..0cbc875 100644 --- a/scripts/utils_2d_pose_ort.py +++ b/scripts/utils_2d_pose_ort.py @@ -148,7 +148,7 @@ class LetterBox: resized_img = cv2.resize( image, (new_w, new_h), - interpolation=cv2.INTER_LINEAR, + interpolation=cv2.INTER_NEAREST, ) # Optionally pad the image @@ -273,7 +273,7 @@ class BoxCrop: resized_img = cv2.resize( cropped_img, (new_w, new_h), - interpolation=cv2.INTER_LINEAR, + interpolation=cv2.INTER_NEAREST, ) # Optionally pad the image @@ -309,6 +309,10 @@ class RTMDet(BaseModel): self.conf_threshold = conf_threshold self.letterbox = LetterBox(self.target_size, fill_value=114) + min_area_scale = 0.025 * 0.025 + img_area = self.target_size[0] * self.target_size[1] + self.min_area = img_area * min_area_scale + def preprocess(self, image: np.ndarray): image = self.letterbox.resize_image(image) tensor = np.asarray(image).astype(self.input_types[0], copy=False) @@ -326,6 +330,11 @@ class RTMDet(BaseModel): keep = boxes[:, 4] > self.conf_threshold boxes = boxes[keep] + # Drop boxes with too small area + areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + keep = areas >= self.min_area + boxes = boxes[keep] + paddings, scale, _ = self.letterbox.calc_params(image.shape) boxes[:, 0] -= paddings[0]