From ff735759f74bd0d2a8c4347f200405f8f9d9efd7 Mon Sep 17 00:00:00 2001 From: Daniel Date: Thu, 3 Apr 2025 18:51:33 +0200 Subject: [PATCH] First tracker implementation. --- media/RESULTS.md | 591 ++++++++++++++++---------------- rpt/tracker.hpp | 289 ++++++++++++++++ scripts/test_skelda_dataset.cpp | 27 +- scripts/test_skelda_dataset.py | 4 + 4 files changed, 616 insertions(+), 295 deletions(-) create mode 100644 rpt/tracker.hpp diff --git a/media/RESULTS.md b/media/RESULTS.md index 1a23c56..bbd819b 100644 --- a/media/RESULTS.md +++ b/media/RESULTS.md @@ -7,26 +7,27 @@ Results of the model in various experiments on different datasets. \ ```json { - "img_loading": 0.00946685, - "demosaicing": 0.000433632, - "avg_time_2d": 0.00564251, - "avg_time_3d": 3.16907e-05, - "fps": 163.724 + "img_loading": 0.00954196, + "demosaicing": 0.000437263, + "avg_time_2d": 0.00563337, + "avg_time_3d": 3.22195e-05, + "time_tracks": 2.56698e-06, + "fps": 163.789 } { "triangulator_calls": 600, - "init_time": 2.2774e-06, - "undistort_time": 3.66277e-06, - "project_time": 6.4956e-07, - "match_time": 1.27751e-06, - "pairs_time": 1.83347e-07, - "pair_scoring_time": 5.04129e-06, - "grouping_time": 9.12377e-07, - "full_time": 6.56363e-06, - "merge_time": 2.93065e-06, - "post_time": 3.79765e-06, - "convert_time": 5.13667e-08, - "total_time": 2.75941e-05 + "init_time": 2.34263e-06, + "undistort_time": 3.69506e-06, + "project_time": 5.953e-07, + "match_time": 1.23644e-06, + "pairs_time": 1.78917e-07, + "pair_scoring_time": 5.10816e-06, + "grouping_time": 8.91258e-07, + "full_time": 6.67602e-06, + "merge_time": 2.95392e-06, + "post_time": 3.7758e-06, + "convert_time": 5.69e-08, + "total_time": 2.7759e-05 } { "person_nums": { @@ -45,23 +46,23 @@ Results of the model in various experiments on different datasets. \ }, "mpjpe": { "count": 600, - "mean": 0.06036, - "median": 0.052956, - "std": 0.028032, - "sem": 0.001145, + "mean": 0.060543, + "median": 0.053007, + "std": 0.028089, + "sem": 0.001148, "min": 0.037548, "max": 0.192385, "recall-0.025": 0.0, - "recall-0.05": 0.333333, - "recall-0.1": 0.948333, + "recall-0.05": 0.325, + "recall-0.1": 0.946667, "recall-0.15": 0.95, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600, "ap-0.025": 0.0, - "ap-0.05": 0.187797, - "ap-0.1": 0.910662, - "ap-0.15": 0.913398, + "ap-0.05": 0.179324, + "ap-0.1": 0.907003, + "ap-0.15": 0.91147, "ap-0.25": 1.0, "ap-0.5": 1.0 }, @@ -99,82 +100,82 @@ Results of the model in various experiments on different datasets. \ }, "shoulder_right": { "count": 600, - "mean": 0.045983, + "mean": 0.046158, "median": 0.034992, - "std": 0.039542, - "sem": 0.001616, + "std": 0.039877, + "sem": 0.001629, "min": 0.00206, "max": 0.240597, "recall-0.025": 0.298333, "recall-0.05": 0.753333, - "recall-0.1": 0.91, - "recall-0.15": 0.946667, + "recall-0.1": 0.908333, + "recall-0.15": 0.945, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "elbow_left": { "count": 600, - "mean": 0.045328, - "median": 0.034177, - "std": 0.03755, - "sem": 0.001534, + "mean": 0.045362, + "median": 0.034249, + "std": 0.037307, + "sem": 0.001524, "min": 0.002988, "max": 0.194229, - "recall-0.025": 0.26, - "recall-0.05": 0.763333, - "recall-0.1": 0.926667, - "recall-0.15": 0.953333, + "recall-0.025": 0.258333, + "recall-0.05": 0.761667, + "recall-0.1": 0.925, + "recall-0.15": 0.955, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 600 }, "elbow_right": { "count": 600, - "mean": 0.044168, + "mean": 0.04432, "median": 0.03193, - "std": 0.03869, - "sem": 0.001581, + "std": 0.038646, + "sem": 0.001579, "min": 0.005218, - "max": 0.427232, + "max": 0.383954, "recall-0.025": 0.268333, "recall-0.05": 0.783333, - "recall-0.1": 0.928333, - "recall-0.15": 0.941667, + "recall-0.1": 0.926667, + "recall-0.15": 0.94, "recall-0.25": 0.998333, "recall-0.5": 1.0, "num_labels": 600 }, "wrist_left": { "count": 600, - "mean": 0.039992, - "median": 0.022739, - "std": 0.044805, - "sem": 0.001831, + "mean": 0.04031, + "median": 0.022792, + "std": 0.044965, + "sem": 0.001837, "min": 0.002345, "max": 0.289452, - "recall-0.025": 0.553333, - "recall-0.05": 0.766667, - "recall-0.1": 0.908333, - "recall-0.15": 0.928333, + "recall-0.025": 0.551667, + "recall-0.05": 0.763333, + "recall-0.1": 0.906667, + "recall-0.15": 0.93, "recall-0.25": 0.998333, "recall-0.5": 1.0, "num_labels": 600 }, "wrist_right": { - "count": 599, - "mean": 0.040157, - "median": 0.026117, - "std": 0.044323, - "sem": 0.001812, + "count": 600, + "mean": 0.041059, + "median": 0.026221, + "std": 0.04525, + "sem": 0.001849, "min": 0.003181, "max": 0.296936, - "recall-0.025": 0.481667, - "recall-0.05": 0.806667, - "recall-0.1": 0.915, - "recall-0.15": 0.925, - "recall-0.25": 0.995, - "recall-0.5": 0.998333, + "recall-0.025": 0.476667, + "recall-0.05": 0.801667, + "recall-0.1": 0.91, + "recall-0.15": 0.921667, + "recall-0.25": 0.996667, + "recall-0.5": 1.0, "num_labels": 600 }, "hip_left": { @@ -211,31 +212,31 @@ Results of the model in various experiments on different datasets. \ }, "knee_left": { "count": 599, - "mean": 0.052251, + "mean": 0.05212, "median": 0.042367, - "std": 0.042631, - "sem": 0.001743, + "std": 0.039681, + "sem": 0.001623, "min": 0.014466, - "max": 0.482769, + "max": 0.303977, "recall-0.025": 0.06, - "recall-0.05": 0.698333, - "recall-0.1": 0.925, - "recall-0.15": 0.945, - "recall-0.25": 0.995, + "recall-0.05": 0.696667, + "recall-0.1": 0.921667, + "recall-0.15": 0.943333, + "recall-0.25": 0.996667, "recall-0.5": 0.998333, "num_labels": 600 }, "knee_right": { "count": 600, - "mean": 0.045065, - "median": 0.034037, - "std": 0.037143, - "sem": 0.001518, + "mean": 0.045269, + "median": 0.034108, + "std": 0.03723, + "sem": 0.001521, "min": 0.010961, "max": 0.293895, - "recall-0.025": 0.106667, - "recall-0.05": 0.825, - "recall-0.1": 0.931667, + "recall-0.025": 0.105, + "recall-0.05": 0.821667, + "recall-0.1": 0.93, "recall-0.15": 0.948333, "recall-0.25": 0.998333, "recall-0.5": 1.0, @@ -243,50 +244,50 @@ Results of the model in various experiments on different datasets. \ }, "ankle_left": { "count": 599, - "mean": 0.088513, - "median": 0.082493, - "std": 0.029511, - "sem": 0.001207, + "mean": 0.089795, + "median": 0.082637, + "std": 0.03326, + "sem": 0.00136, "min": 0.048202, - "max": 0.271976, + "max": 0.351142, "recall-0.025": 0.0, "recall-0.05": 0.001667, - "recall-0.1": 0.88, - "recall-0.15": 0.945, - "recall-0.25": 0.995, + "recall-0.1": 0.871667, + "recall-0.15": 0.94, + "recall-0.25": 0.991667, "recall-0.5": 0.998333, "num_labels": 600 }, "ankle_right": { "count": 600, - "mean": 0.077527, - "median": 0.06667, - "std": 0.038806, - "sem": 0.001586, - "min": 0.033001, + "mean": 0.077779, + "median": 0.066732, + "std": 0.039218, + "sem": 0.001602, + "min": 0.018484, "max": 0.281311, - "recall-0.025": 0.0, - "recall-0.05": 0.036667, - "recall-0.1": 0.9, - "recall-0.15": 0.926667, + "recall-0.025": 0.003333, + "recall-0.05": 0.041667, + "recall-0.1": 0.893333, + "recall-0.15": 0.923333, "recall-0.25": 0.991667, "recall-0.5": 1.0, "num_labels": 600 }, "joint_recalls": { "num_labels": 7800, - "recall-0.025": 0.18962, - "recall-0.05": 0.50128, - "recall-0.1": 0.87962, - "recall-0.15": 0.93526, + "recall-0.025": 0.18897, + "recall-0.05": 0.50064, + "recall-0.1": 0.87718, + "recall-0.15": 0.93423, "recall-0.25": 0.99603, - "recall-0.5": 0.99923 + "recall-0.5": 0.99949 } } { "total_parts": 8400, - "correct_parts": 8134, - "pcp": 0.968333 + "correct_parts": 8130, + "pcp": 0.967857 } ``` @@ -294,70 +295,71 @@ Results of the model in various experiments on different datasets. \ ```json { - "img_loading": 0.0431246, - "demosaicing": 0.000535209, - "avg_time_2d": 0.0104033, - "avg_time_3d": 0.000107023, - "fps": 90.5343 + "img_loading": 0.0415743, + "demosaicing": 0.000548337, + "avg_time_2d": 0.0104442, + "avg_time_3d": 0.000102827, + "time_tracks": 7.53299e-07, + "fps": 90.1218 } { "triangulator_calls": 301, - "init_time": 2.60675e-06, - "undistort_time": 1.42386e-05, - "project_time": 1.74955e-06, - "match_time": 7.80174e-06, - "pairs_time": 4.016e-06, - "pair_scoring_time": 2.31076e-05, - "grouping_time": 4.08468e-06, - "full_time": 2.34529e-05, - "merge_time": 1.1163e-05, - "post_time": 6.22243e-06, - "convert_time": 1.01126e-07, - "total_time": 9.87993e-05 + "init_time": 2.5757e-06, + "undistort_time": 1.38599e-05, + "project_time": 1.70299e-06, + "match_time": 7.42211e-06, + "pairs_time": 3.51909e-06, + "pair_scoring_time": 2.20828e-05, + "grouping_time": 4.00774e-06, + "full_time": 2.23789e-05, + "merge_time": 1.09032e-05, + "post_time": 6.10427e-06, + "convert_time": 1.11395e-07, + "total_time": 9.49145e-05 } { "person_nums": { "total_frames": 301, "total_labels": 477, - "total_preds": 828, + "total_preds": 847, "considered_empty": 0, "valid_preds": 477, - "invalid_preds": 351, + "invalid_preds": 370, "missing": 0, - "invalid_fraction": 0.42391, - "precision": 0.57609, + "invalid_fraction": 0.43684, + "precision": 0.56316, "recall": 1.0, - "f1": 0.73103, - "non_empty": 828 + "f1": 0.72054, + "non_empty": 847 }, "mpjpe": { "count": 477, - "mean": 0.047653, + "mean": 0.047597, "median": 0.042661, - "std": 0.015008, - "sem": 0.000688, + "std": 0.014947, + "sem": 0.000685, "min": 0.028932, - "max": 0.113187, + "max": 0.107384, "recall-0.025": 0.0, - "recall-0.05": 0.719078, - "recall-0.1": 0.985325, + "recall-0.05": 0.721174, + "recall-0.1": 0.987421, "recall-0.15": 1.0, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 477, "ap-0.025": 0.0, - "ap-0.05": 0.449379, - "ap-0.1": 0.750506, - "ap-0.15": 0.76101, - "ap-0.25": 0.76101, - "ap-0.5": 0.76101 + "ap-0.05": 0.456736, + "ap-0.1": 0.755053, + "ap-0.15": 0.764546, + "ap-0.25": 0.764546, + "ap-0.5": 0.764546 }, "head": { "count": 477, - "mean": 0.053389, + "mean": 0.053421, "median": 0.049509, - "std": 0.025237, - "sem": 0.001157, + "std": 0.025337, + "sem": 0.001161, "min": 0.005816, "max": 0.182842, "recall-0.025": 0.098532, @@ -402,17 +404,17 @@ Results of the model in various experiments on different datasets. \ }, "elbow_left": { "count": 477, - "mean": 0.040654, + "mean": 0.040136, "median": 0.032194, - "std": 0.02881, - "sem": 0.001321, + "std": 0.026056, + "sem": 0.001194, "min": 0.003888, - "max": 0.309836, + "max": 0.146492, "recall-0.025": 0.301887, "recall-0.05": 0.752621, - "recall-0.1": 0.951782, - "recall-0.15": 0.997904, - "recall-0.25": 0.997904, + "recall-0.1": 0.953878, + "recall-0.15": 1.0, + "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 477 }, @@ -434,33 +436,33 @@ Results of the model in various experiments on different datasets. \ }, "wrist_left": { "count": 477, - "mean": 0.059565, + "mean": 0.059143, "median": 0.05442, - "std": 0.039643, - "sem": 0.001817, + "std": 0.038602, + "sem": 0.001769, "min": 0.001595, "max": 0.314599, "recall-0.025": 0.155136, "recall-0.05": 0.410901, - "recall-0.1": 0.909853, - "recall-0.15": 0.966457, - "recall-0.25": 0.989518, + "recall-0.1": 0.91195, + "recall-0.15": 0.968553, + "recall-0.25": 0.991614, "recall-0.5": 1.0, "num_labels": 477 }, "wrist_right": { "count": 477, - "mean": 0.058378, + "mean": 0.05783, "median": 0.053422, - "std": 0.034091, - "sem": 0.001563, + "std": 0.030837, + "sem": 0.001413, "min": 0.008434, - "max": 0.381589, + "max": 0.207547, "recall-0.025": 0.109015, "recall-0.05": 0.442348, "recall-0.1": 0.907757, - "recall-0.15": 0.979036, - "recall-0.25": 0.997904, + "recall-0.15": 0.981132, + "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 477 }, @@ -514,15 +516,15 @@ Results of the model in various experiments on different datasets. \ }, "knee_right": { "count": 477, - "mean": 0.039775, + "mean": 0.039912, "median": 0.035222, - "std": 0.023097, - "sem": 0.001059, + "std": 0.023393, + "sem": 0.001072, "min": 0.006395, "max": 0.184949, "recall-0.025": 0.303983, "recall-0.05": 0.727463, - "recall-0.1": 0.974843, + "recall-0.1": 0.972746, "recall-0.15": 0.997904, "recall-0.25": 1.0, "recall-0.5": 1.0, @@ -530,10 +532,10 @@ Results of the model in various experiments on different datasets. \ }, "ankle_left": { "count": 477, - "mean": 0.036458, + "mean": 0.036485, "median": 0.028094, - "std": 0.03061, - "sem": 0.001403, + "std": 0.030728, + "sem": 0.001408, "min": 0.004185, "max": 0.222831, "recall-0.025": 0.429769, @@ -546,17 +548,17 @@ Results of the model in various experiments on different datasets. \ }, "ankle_right": { "count": 477, - "mean": 0.040799, + "mean": 0.041358, "median": 0.030877, - "std": 0.03753, - "sem": 0.00172, + "std": 0.040428, + "sem": 0.001853, "min": 0.002443, - "max": 0.287278, + "max": 0.355429, "recall-0.025": 0.30608, "recall-0.05": 0.813417, "recall-0.1": 0.928721, - "recall-0.15": 0.968553, - "recall-0.25": 0.997904, + "recall-0.15": 0.966457, + "recall-0.25": 0.993711, "recall-0.5": 1.0, "num_labels": 477 }, @@ -565,15 +567,15 @@ Results of the model in various experiments on different datasets. \ "recall-0.025": 0.21351, "recall-0.05": 0.62119, "recall-0.1": 0.94323, - "recall-0.15": 0.98645, + "recall-0.15": 0.98662, "recall-0.25": 0.99871, "recall-0.5": 1.0 } } { "total_parts": 6678, - "correct_parts": 6622, - "pcp": 0.991614 + "correct_parts": 6623, + "pcp": 0.991764 } ``` @@ -581,75 +583,76 @@ Results of the model in various experiments on different datasets. \ ```json { - "img_loading": 0.00517366, - "demosaicing": 0.000109466, - "avg_time_2d": 0.00478839, - "avg_time_3d": 3.38694e-05, - "fps": 202.769 + "img_loading": 0.00477884, + "demosaicing": 0.000105891, + "avg_time_2d": 0.00475547, + "avg_time_3d": 3.45411e-05, + "time_tracks": 8.63962e-07, + "fps": 204.216 } { "triangulator_calls": 222, - "init_time": 2.02099e-06, - "undistort_time": 5.6859e-06, - "project_time": 8.53198e-07, - "match_time": 2.58553e-06, - "pairs_time": 4.89811e-07, - "pair_scoring_time": 4.32314e-06, - "grouping_time": 8.76297e-07, - "full_time": 5.37456e-06, - "merge_time": 2.54044e-06, - "post_time": 4.92114e-06, - "convert_time": 7.91441e-08, - "total_time": 2.99961e-05 + "init_time": 1.99938e-06, + "undistort_time": 5.71187e-06, + "project_time": 8.61266e-07, + "match_time": 2.61942e-06, + "pairs_time": 4.20405e-07, + "pair_scoring_time": 4.35068e-06, + "grouping_time": 9.27387e-07, + "full_time": 5.49219e-06, + "merge_time": 2.62484e-06, + "post_time": 5.03611e-06, + "convert_time": 9.22072e-08, + "total_time": 3.03843e-05 } { "person_nums": { "total_frames": 222, "total_labels": 376, - "total_preds": 443, + "total_preds": 457, "considered_empty": 0, "valid_preds": 376, - "invalid_preds": 67, + "invalid_preds": 81, "missing": 0, - "invalid_fraction": 0.15124, - "precision": 0.84876, + "invalid_fraction": 0.17724, + "precision": 0.82276, "recall": 1.0, - "f1": 0.91819, - "non_empty": 443 + "f1": 0.90276, + "non_empty": 457 }, "mpjpe": { "count": 376, - "mean": 0.075573, - "median": 0.072487, - "std": 0.017073, - "sem": 0.000882, + "mean": 0.075498, + "median": 0.073144, + "std": 0.014424, + "sem": 0.000745, "min": 0.040047, - "max": 0.237124, + "max": 0.12107, "recall-0.025": 0.0, "recall-0.05": 0.007979, - "recall-0.1": 0.922872, - "recall-0.15": 0.99734, + "recall-0.1": 0.933511, + "recall-0.15": 1.0, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 376, "ap-0.025": 0.0, - "ap-0.05": 7.9e-05, - "ap-0.1": 0.836487, - "ap-0.15": 0.966449, - "ap-0.25": 0.968931, - "ap-0.5": 0.968931 + "ap-0.05": 9.4e-05, + "ap-0.1": 0.843551, + "ap-0.15": 0.959589, + "ap-0.25": 0.959589, + "ap-0.5": 0.959589 }, "head": { "count": 376, - "mean": 0.063145, + "mean": 0.063289, "median": 0.062777, - "std": 0.025859, - "sem": 0.001335, + "std": 0.02607, + "sem": 0.001346, "min": 0.007715, "max": 0.197424, "recall-0.025": 0.055851, "recall-0.05": 0.31117, - "recall-0.1": 0.930851, + "recall-0.1": 0.928191, "recall-0.15": 0.992021, "recall-0.25": 1.0, "recall-0.5": 1.0, @@ -657,97 +660,97 @@ Results of the model in various experiments on different datasets. \ }, "shoulder_left": { "count": 376, - "mean": 0.067151, + "mean": 0.066843, "median": 0.064619, - "std": 0.029971, - "sem": 0.001548, + "std": 0.029378, + "sem": 0.001517, "min": 0.016733, "max": 0.205046, "recall-0.025": 0.039894, "recall-0.05": 0.321809, - "recall-0.1": 0.890957, - "recall-0.15": 0.981383, + "recall-0.1": 0.893617, + "recall-0.15": 0.984043, "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 376 }, "shoulder_right": { "count": 376, - "mean": 0.070444, + "mean": 0.069399, "median": 0.062402, - "std": 0.038664, - "sem": 0.001997, + "std": 0.031883, + "sem": 0.001646, "min": 0.017401, - "max": 0.495275, + "max": 0.198707, "recall-0.025": 0.018617, "recall-0.05": 0.316489, "recall-0.1": 0.837766, - "recall-0.15": 0.968085, - "recall-0.25": 0.99734, + "recall-0.15": 0.970745, + "recall-0.25": 1.0, "recall-0.5": 1.0, "num_labels": 376 }, "elbow_left": { "count": 376, - "mean": 0.090384, + "mean": 0.089682, "median": 0.072996, - "std": 0.054936, - "sem": 0.002837, + "std": 0.053299, + "sem": 0.002752, "min": 0.009937, - "max": 0.344095, + "max": 0.283516, "recall-0.025": 0.029255, "recall-0.05": 0.242021, - "recall-0.1": 0.648936, - "recall-0.15": 0.864362, - "recall-0.25": 0.989362, + "recall-0.1": 0.651596, + "recall-0.15": 0.867021, + "recall-0.25": 0.992021, "recall-0.5": 1.0, "num_labels": 376 }, "elbow_right": { - "count": 375, - "mean": 0.077748, - "median": 0.070448, - "std": 0.037755, - "sem": 0.001952, + "count": 376, + "mean": 0.077757, + "median": 0.070519, + "std": 0.037705, + "sem": 0.001947, "min": 0.004749, "max": 0.194529, "recall-0.025": 0.042553, "recall-0.05": 0.276596, - "recall-0.1": 0.723404, - "recall-0.15": 0.941489, - "recall-0.25": 0.99734, - "recall-0.5": 0.99734, + "recall-0.1": 0.726064, + "recall-0.15": 0.944149, + "recall-0.25": 1.0, + "recall-0.5": 1.0, "num_labels": 376 }, "wrist_left": { "count": 376, - "mean": 0.128727, - "median": 0.113244, - "std": 0.06993, - "sem": 0.003611, + "mean": 0.128965, + "median": 0.113502, + "std": 0.068869, + "sem": 0.003556, "min": 0.015136, - "max": 0.458731, + "max": 0.352772, "recall-0.025": 0.013298, "recall-0.05": 0.079787, - "recall-0.1": 0.422872, - "recall-0.15": 0.672872, - "recall-0.25": 0.917553, + "recall-0.1": 0.417553, + "recall-0.15": 0.670213, + "recall-0.25": 0.920213, "recall-0.5": 1.0, "num_labels": 376 }, "wrist_right": { "count": 376, - "mean": 0.104259, - "median": 0.09734, - "std": 0.050125, - "sem": 0.002588, + "mean": 0.102037, + "median": 0.097009, + "std": 0.044155, + "sem": 0.00228, "min": 0.002425, - "max": 0.461713, + "max": 0.277303, "recall-0.025": 0.005319, - "recall-0.05": 0.093085, - "recall-0.1": 0.515957, - "recall-0.15": 0.853723, - "recall-0.25": 0.984043, + "recall-0.05": 0.095745, + "recall-0.1": 0.521277, + "recall-0.15": 0.864362, + "recall-0.25": 0.994681, "recall-0.5": 1.0, "num_labels": 376 }, @@ -769,17 +772,17 @@ Results of the model in various experiments on different datasets. \ }, "hip_right": { "count": 376, - "mean": 0.065474, + "mean": 0.065421, "median": 0.059695, - "std": 0.035357, - "sem": 0.001826, + "std": 0.034174, + "sem": 0.001765, "min": 0.005283, - "max": 0.302623, + "max": 0.259007, "recall-0.025": 0.053191, "recall-0.05": 0.359043, - "recall-0.1": 0.906915, - "recall-0.15": 0.976064, - "recall-0.25": 0.989362, + "recall-0.1": 0.904255, + "recall-0.15": 0.973404, + "recall-0.25": 0.992021, "recall-0.5": 1.0, "num_labels": 376 }, @@ -801,15 +804,15 @@ Results of the model in various experiments on different datasets. \ }, "knee_right": { "count": 376, - "mean": 0.053573, - "median": 0.049537, - "std": 0.02731, - "sem": 0.00141, + "mean": 0.053806, + "median": 0.049906, + "std": 0.027608, + "sem": 0.001426, "min": 0.008841, "max": 0.230822, "recall-0.025": 0.087766, - "recall-0.05": 0.50266, - "recall-0.1": 0.954787, + "recall-0.05": 0.5, + "recall-0.1": 0.952128, "recall-0.15": 0.992021, "recall-0.25": 1.0, "recall-0.5": 1.0, @@ -817,32 +820,32 @@ Results of the model in various experiments on different datasets. \ }, "ankle_left": { "count": 376, - "mean": 0.072656, - "median": 0.066373, - "std": 0.046322, - "sem": 0.002392, + "mean": 0.074289, + "median": 0.066969, + "std": 0.045644, + "sem": 0.002357, "min": 0.009841, - "max": 0.390551, + "max": 0.378337, "recall-0.025": 0.042553, - "recall-0.05": 0.329787, - "recall-0.1": 0.861702, - "recall-0.15": 0.962766, - "recall-0.25": 0.984043, + "recall-0.05": 0.327128, + "recall-0.1": 0.832447, + "recall-0.15": 0.944149, + "recall-0.25": 0.989362, "recall-0.5": 1.0, "num_labels": 376 }, "ankle_right": { "count": 376, - "mean": 0.063864, - "median": 0.052121, - "std": 0.048194, - "sem": 0.002489, + "mean": 0.066324, + "median": 0.052238, + "std": 0.048596, + "sem": 0.002509, "min": 0.007344, "max": 0.373408, "recall-0.025": 0.069149, - "recall-0.05": 0.43883, - "recall-0.1": 0.896277, - "recall-0.15": 0.960106, + "recall-0.05": 0.430851, + "recall-0.1": 0.859043, + "recall-0.15": 0.946809, "recall-0.25": 0.986702, "recall-0.5": 1.0, "num_labels": 376 @@ -850,17 +853,17 @@ Results of the model in various experiments on different datasets. \ "joint_recalls": { "num_labels": 4888, "recall-0.025": 0.04194, - "recall-0.05": 0.30176, - "recall-0.1": 0.80135, - "recall-0.15": 0.93269, - "recall-0.25": 0.98732, - "recall-0.5": 0.99959 + "recall-0.05": 0.30115, + "recall-0.1": 0.79705, + "recall-0.15": 0.93208, + "recall-0.25": 0.98977, + "recall-0.5": 1.0 } } { "total_parts": 5264, - "correct_parts": 4995, - "pcp": 0.948898 + "correct_parts": 5005, + "pcp": 0.950798 } ``` diff --git a/rpt/tracker.hpp b/rpt/tracker.hpp new file mode 100644 index 0000000..d15937b --- /dev/null +++ b/rpt/tracker.hpp @@ -0,0 +1,289 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// ================================================================================================= + +struct Track +{ + std::vector>> core_poses; + std::vector>> full_poses; + + std::vector timestamps; + size_t id; +}; + +// ================================================================================================= + +class PoseTracker +{ +public: + PoseTracker(float max_distance); + + std::vector>>> track_poses( + const std::vector>> &poses_3d, + const std::vector &joint_names, + const double timestamp); + + void reset(); + +private: + float max_distance; + size_t history_size = 3; + float max_movement_speed = 2.0; + + std::vector timestamps; + std::vector pose_tracks; + + const std::vector core_joints = { + "shoulder_left", + "shoulder_right", + "hip_left", + "hip_right", + "elbow_left", + "elbow_right", + "knee_left", + "knee_right", + "wrist_left", + "wrist_right", + "ankle_left", + "ankle_right", + }; + + int match_to_track(const std::vector> &core_pose_3d); + + std::vector> refine_pose(const Track &track); +}; + +// ================================================================================================= +// ================================================================================================= + +PoseTracker::PoseTracker(float max_distance) +{ + this->max_distance = max_distance; +} + +// ================================================================================================= + +void PoseTracker::reset() +{ + pose_tracks.clear(); + timestamps.clear(); +} + +// ================================================================================================= + +std::vector>>> PoseTracker::track_poses( + const std::vector>> &poses_3d, + const std::vector &joint_names, + const double timestamp) +{ + // Extract core joints + std::vector core_joint_idx; + for (const auto &joint : core_joints) + { + auto it = std::find(joint_names.begin(), joint_names.end(), joint); + core_joint_idx.push_back(std::distance(joint_names.begin(), it)); + } + std::vector>> core_poses; + core_poses.resize(poses_3d.size()); + for (size_t i = 0; i < poses_3d.size(); ++i) + { + core_poses[i].resize(core_joint_idx.size()); + for (size_t j = 0; j < core_joint_idx.size(); ++j) + { + for (size_t k = 0; k < 4; ++k) + { + core_poses[i][j][k] = poses_3d[i][core_joint_idx[j]][k]; + } + } + } + + // Match core poses to tracks + for (size_t i = 0; i < core_poses.size(); ++i) + { + int track_idx = match_to_track(core_poses[i]); + if (track_idx == -1) + { + // Create a new track + Track new_track; + new_track.core_poses.push_back(core_poses[i]); + new_track.full_poses.push_back(poses_3d[i]); + new_track.timestamps.push_back(timestamp); + new_track.id = pose_tracks.size(); + pose_tracks.push_back(new_track); + } + else + { + // Update existing track + auto &track = pose_tracks[track_idx]; + track.core_poses.push_back(core_poses[i]); + track.full_poses.push_back(poses_3d[i]); + track.timestamps.push_back(timestamp); + } + } + + // Remove old tracks + timestamps.push_back(timestamp); + if (timestamps.size() > history_size) + { + timestamps.erase(timestamps.begin()); + } + double max_age = timestamps.front(); + for (size_t i = 0; i < pose_tracks.size();) + { + auto &track = pose_tracks[i]; + double last_timestamp = track.timestamps.back(); + if (last_timestamp < max_age) + { + pose_tracks.erase(pose_tracks.begin() + i); + } + else + { + ++i; + } + } + + // Remove old poses from tracks + for (auto &track : pose_tracks) + { + while (track.core_poses.size() > history_size) + { + track.core_poses.erase(track.core_poses.begin()); + track.full_poses.erase(track.full_poses.begin()); + track.timestamps.erase(track.timestamps.begin()); + } + } + + // Refine poses + std::vector>>> tracked_poses; + for (size_t i = 0; i < pose_tracks.size(); ++i) + { + auto &track = pose_tracks[i]; + if (track.core_poses.size() > 0) + { + std::vector> refined_pose = refine_pose(track); + tracked_poses.emplace_back(track.id, refined_pose); + } + } + + return tracked_poses; +} + +// ================================================================================================= + +int PoseTracker::match_to_track(const std::vector> &core_pose_3d) +{ + int best_track = -1; + float best_distance_sq = max_distance * max_distance; + + for (size_t i = 0; i < pose_tracks.size(); ++i) + { + const auto &track = pose_tracks[i]; + if (track.core_poses.size() == 0) + continue; + + // Calculate distance to the last pose in the track + const auto &last_pose = track.core_poses.back(); + float distance_sq = 0.0; + for (size_t j = 0; j < core_pose_3d.size(); ++j) + { + float dx = core_pose_3d[j][0] - last_pose[j][0]; + float dy = core_pose_3d[j][1] - last_pose[j][1]; + float dz = core_pose_3d[j][2] - last_pose[j][2]; + distance_sq += dx * dx + dy * dy + dz * dz; + } + distance_sq /= core_pose_3d.size(); + + if (distance_sq < best_distance_sq) + { + best_distance_sq = distance_sq; + best_track = static_cast(i); + } + } + return best_track; +} + +// ================================================================================================= + +std::vector> PoseTracker::refine_pose(const Track &track) +{ + // Restrict maximum movement by physical constraints, by clipping the pose to the maximum + // movement distance from one of the track's history poses + // + // While advanced sensor filtering techniques, like using a Kalman-Filter, might improve the + // average accuracy of the pose, they introduce update delays on fast movement changes. For + // example, if a person stands still for a while and then suddenly moves, the first few frames + // will likely be treated as outliers, since the filter will not be able to adapt fast enough. + // This behaviour is not desired in a real-time critical applications, where the pose needs to + // be updated to the real physical position of the person as fast as possible. Therefore, only + // the movement speed is limited here. + + if (track.core_poses.size() < 2) + { + return track.full_poses.back(); + } + + // Calculate maximum possible movement distance from each history pose + std::vector max_movement_dists_sq; + max_movement_dists_sq.resize(track.core_poses.size() - 1); + double last_timestamp = track.timestamps.back(); + for (size_t i = 0; i < track.core_poses.size() - 1; ++i) + { + double ts = track.timestamps[i]; + float max_movement = max_movement_speed * (last_timestamp - ts); + max_movement_dists_sq[i] = max_movement * max_movement; + } + + // Clip joint if required + std::vector> refined_pose = track.full_poses.back(); + for (size_t i = 0; i < refined_pose.size(); ++i) + { + float min_dist_sq = std::numeric_limits::infinity(); + size_t closest_idx = 0; + bool clip = true; + + for (size_t j = 0; j < max_movement_dists_sq.size(); ++j) + { + + float dx = refined_pose[i][0] - track.full_poses[j][i][0]; + float dy = refined_pose[i][1] - track.full_poses[j][i][1]; + float dz = refined_pose[i][2] - track.full_poses[j][i][2]; + float dist_sq = dx * dx + dy * dy + dz * dz; + if (dist_sq < min_dist_sq) + { + min_dist_sq = dist_sq; + closest_idx = j; + } + if (dist_sq <= max_movement_dists_sq[j]) + { + clip = false; + break; + } + } + + if (clip) + { + float dist_sq = min_dist_sq; + float scale = max_movement_dists_sq[closest_idx] / dist_sq; + + float dx = refined_pose[i][0] - track.full_poses[closest_idx][i][0]; + float dy = refined_pose[i][1] - track.full_poses[closest_idx][i][1]; + float dz = refined_pose[i][2] - track.full_poses[closest_idx][i][2]; + refined_pose[i][0] = track.full_poses[closest_idx][i][0] + dx * scale; + refined_pose[i][1] = track.full_poses[closest_idx][i][1] + dy * scale; + refined_pose[i][2] = track.full_poses[closest_idx][i][2] + dz * scale; + + // Set confidence to a low value if the joint is clipped + refined_pose[i][3] = 0.1; + } + } + + return refined_pose; +} diff --git a/scripts/test_skelda_dataset.cpp b/scripts/test_skelda_dataset.cpp index 6760510..7875106 100644 --- a/scripts/test_skelda_dataset.cpp +++ b/scripts/test_skelda_dataset.cpp @@ -17,6 +17,7 @@ using json = nlohmann::json; #include "/RapidPoseTriangulation/rpt/camera.hpp" #include "/RapidPoseTriangulation/rpt/interface.hpp" +#include "/RapidPoseTriangulation/rpt/tracker.hpp" #include "/RapidPoseTriangulation/scripts/utils_2d_pose.hpp" #include "/RapidPoseTriangulation/scripts/utils_pipeline.hpp" @@ -121,6 +122,7 @@ int main(int argc, char **argv) const float min_match_score = config["min_match_score"]; const size_t min_group_size = config["min_group_size"]; const int take_interval = config["take_interval"]; + const float ifps = config["fps"]; // Load 2D model bool use_wb = utils_pipeline::use_whole_body(whole_body); @@ -131,6 +133,9 @@ int main(int argc, char **argv) // Load 3D model std::unique_ptr tri_model = std::make_unique( min_match_score, min_group_size); + const float max_distance = 0.3 + 2.0 / ifps; + std::unique_ptr pose_tracker = std::make_unique( + max_distance); // Timers size_t time_count = dataset.size(); @@ -138,10 +143,12 @@ int main(int argc, char **argv) std::vector times_debayer; std::vector times_pose2d; std::vector times_pose3d; + std::vector times_tracks; times_image.reserve(time_count); times_debayer.reserve(time_count); times_pose2d.reserve(time_count); times_pose3d.reserve(time_count); + times_tracks.reserve(time_count); size_t print_steps = (size_t)std::floor((float)time_count / 100.0f); print_steps = std::max((size_t)1, print_steps); @@ -214,6 +221,7 @@ int main(int argc, char **argv) { // Reset last poses if scene changes tri_model->reset(); + pose_tracker->reset(); old_scene = item["scene"]; } @@ -241,6 +249,19 @@ int main(int argc, char **argv) elapsed = std::chrono::high_resolution_clock::now() - stime; times_pose3d.push_back(elapsed.count()); + stime = std::chrono::high_resolution_clock::now(); + double ts = ((int)item["index"]) / ifps; + auto pose_tracks = pose_tracker->track_poses(poses_3d, joint_names_2d, ts); + std::vector>> poses_3d_refined; + for (size_t j = 0; j < pose_tracks.size(); j++) + { + auto &pose = std::get<1>(pose_tracks[j]); + poses_3d_refined.push_back(pose); + } + poses_3d = poses_3d_refined; + elapsed = std::chrono::high_resolution_clock::now() - stime; + times_tracks.push_back(elapsed.count()); + all_poses_3d.push_back(std::move(poses_3d)); all_ids.push_back(item["id"].get()); old_id = item["index"]; @@ -254,23 +275,27 @@ int main(int argc, char **argv) double time_debayer = 0.0; double time_pose2d = 0.0; double time_pose3d = 0.0; + double time_tracks = 0.0; for (size_t i = warmup; i < time_count; i++) { time_image += times_image[i]; time_debayer += times_debayer[i]; time_pose2d += times_pose2d[i]; time_pose3d += times_pose3d[i]; + time_tracks += times_tracks[i]; } double avg_time_image = time_image / (time_count - warmup); double avg_time_debayer = time_debayer / (time_count - warmup); double avg_time_pose2d = time_pose2d / (time_count - warmup); double avg_time_pose3d = time_pose3d / (time_count - warmup); - double fps = 1.0 / (avg_time_debayer + avg_time_pose2d + avg_time_pose3d); + double avg_time_tracks = time_tracks / (time_count - warmup); + double fps = 1.0 / (avg_time_debayer + avg_time_pose2d + avg_time_pose3d + avg_time_tracks); std::cout << "{\n" << " \"img_loading\": " << avg_time_image << ",\n" << " \"demosaicing\": " << avg_time_debayer << ",\n" << " \"avg_time_2d\": " << avg_time_pose2d << ",\n" << " \"avg_time_3d\": " << avg_time_pose3d << ",\n" + << " \"time_tracks\": " << avg_time_tracks << ",\n" << " \"fps\": " << fps << "\n" << "}" << std::endl; tri_model->print_stats(); diff --git a/scripts/test_skelda_dataset.py b/scripts/test_skelda_dataset.py index e445f8a..24c5b88 100644 --- a/scripts/test_skelda_dataset.py +++ b/scripts/test_skelda_dataset.py @@ -55,6 +55,7 @@ datasets = { "human36m": { "path": "/datasets/human36m/skelda/pose_test.json", "take_interval": 5, + "fps": 50, "min_match_score": 0.95, "min_group_size": 1, "min_bbox_score": 0.4, @@ -84,6 +85,7 @@ datasets = { }, "campus": { "path": "/datasets/campus/skelda/test.json", + "fps": 20, "take_interval": 1, "min_match_score": 0.92, "min_bbox_score": 0.5, @@ -91,6 +93,7 @@ datasets = { "shelf": { "path": "/datasets/shelf/skelda/test.json", "take_interval": 1, + "fps": 20, "min_match_score": 0.95, "min_group_size": 2, }, @@ -346,6 +349,7 @@ def main(): "batch_poses": batch_poses, "whole_body": whole_body, "take_interval": datasets[dataset_use]["take_interval"], + "fps": datasets[dataset_use]["fps"], } utils_pipeline.save_json(config, config_path)