From 103aeb5887afd09e03dd9a801ba0e22b03c9fc0b Mon Sep 17 00:00:00 2001 From: crosstyan Date: Wed, 11 Mar 2026 23:42:07 +0800 Subject: [PATCH] Refactor triangulation stages and camera model --- bindings/rpt_module.cpp | 298 +++++++++++++++++- rpt/cached_camera.hpp | 19 ++ rpt/camera.cpp | 175 ++++++----- rpt/camera.hpp | 44 +-- rpt/interface.cpp | 10 + rpt/interface.hpp | 151 ++++++++- rpt/triangulator.cpp | 663 +++++++++++++++++++++++++++++++++------- src/rpt/__init__.py | 31 +- src/rpt/_helpers.py | 38 ++- tests/test_interface.py | 71 ++++- 10 files changed, 1268 insertions(+), 232 deletions(-) create mode 100644 rpt/cached_camera.hpp diff --git a/bindings/rpt_module.cpp b/bindings/rpt_module.cpp index c1279df..8f94f47 100644 --- a/bindings/rpt_module.cpp +++ b/bindings/rpt_module.cpp @@ -1,6 +1,8 @@ #include +#include #include #include +#include #include #include @@ -19,7 +21,10 @@ using PoseArray2D = nb::ndarray, nb::c_contig>; using CountArray = nb::ndarray, nb::c_contig>; using RoomArray = nb::ndarray, nb::c_contig>; +using PoseArray3DConst = + nb::ndarray, nb::c_contig>; using PoseArray3D = nb::ndarray, nb::c_contig>; +using PoseArray2DOut = nb::ndarray, nb::c_contig>; PoseBatch2DView pose_batch_view_from_numpy(const PoseArray2D &poses_2d, const CountArray &person_counts) { @@ -45,6 +50,15 @@ PoseBatch2DView pose_batch_view_from_numpy(const PoseArray2D &poses_2d, const Co }; } +PoseBatch3DView pose_batch3d_view_from_numpy(const PoseArray3DConst &poses_3d) +{ + return PoseBatch3DView { + poses_3d.data(), + static_cast(poses_3d.shape(0)), + static_cast(poses_3d.shape(1)), + }; +} + std::array, 2> roomparams_from_numpy(const RoomArray &roomparams) { std::array, 2> result {}; @@ -69,10 +83,75 @@ PoseArray3D pose_batch_to_numpy(PoseBatch3D batch) const size_t shape[3] = {batch.num_persons, batch.num_joints, 4}; return PoseArray3D(storage->data(), 3, shape, owner); } + +PoseArray3D pose_batch_to_numpy_copy(const PoseBatch3D &batch) +{ + PoseBatch3D copy = batch; + return pose_batch_to_numpy(std::move(copy)); +} + +PoseArray2DOut pose_rows_to_numpy_copy(const std::vector> &rows) +{ + auto *storage = new std::vector(rows.size() * 4, 0.0f); + for (size_t row = 0; row < rows.size(); ++row) + { + for (size_t coord = 0; coord < 4; ++coord) + { + (*storage)[row * 4 + coord] = rows[row][coord]; + } + } + + nb::capsule owner(storage, [](void *value) noexcept + { + delete static_cast *>(value); + }); + + const size_t shape[2] = {rows.size(), 4}; + return PoseArray2DOut(storage->data(), 2, shape, owner); +} + +PoseArray3D merged_poses_to_numpy_copy(const std::vector>> &poses) +{ + size_t num_poses = poses.size(); + size_t num_joints = 0; + if (!poses.empty()) + { + num_joints = poses[0].size(); + } + + auto *storage = new std::vector(num_poses * num_joints * 4, 0.0f); + for (size_t pose = 0; pose < num_poses; ++pose) + { + if (poses[pose].size() != num_joints) + { + delete storage; + throw std::invalid_argument("Merged poses must use a consistent joint count."); + } + for (size_t joint = 0; joint < num_joints; ++joint) + { + for (size_t coord = 0; coord < 4; ++coord) + { + (*storage)[((pose * num_joints) + joint) * 4 + coord] = poses[pose][joint][coord]; + } + } + } + + nb::capsule owner(storage, [](void *value) noexcept + { + delete static_cast *>(value); + }); + + const size_t shape[3] = {num_poses, num_joints, 4}; + return PoseArray3D(storage->data(), 3, shape, owner); +} } // namespace NB_MODULE(_core, m) { + nb::enum_(m, "CameraModel") + .value("PINHOLE", CameraModel::Pinhole) + .value("FISHEYE", CameraModel::Fisheye); + nb::class_(m, "Camera") .def(nb::init<>()) .def_rw("name", &Camera::name) @@ -82,12 +161,184 @@ NB_MODULE(_core, m) .def_rw("T", &Camera::T) .def_rw("width", &Camera::width) .def_rw("height", &Camera::height) - .def_rw("type", &Camera::type) + .def_rw("model", &Camera::model) .def("__repr__", [](const Camera &camera) { return camera.to_string(); }); + nb::class_(m, "PairCandidate") + .def(nb::init<>()) + .def_rw("view1", &PairCandidate::view1) + .def_rw("view2", &PairCandidate::view2) + .def_rw("person1", &PairCandidate::person1) + .def_rw("person2", &PairCandidate::person2) + .def_rw("global_person1", &PairCandidate::global_person1) + .def_rw("global_person2", &PairCandidate::global_person2); + + nb::class_(m, "PreviousPoseMatch") + .def(nb::init<>()) + .def_rw("previous_pose_index", &PreviousPoseMatch::previous_pose_index) + .def_rw("score_view1", &PreviousPoseMatch::score_view1) + .def_rw("score_view2", &PreviousPoseMatch::score_view2) + .def_rw("matched_view1", &PreviousPoseMatch::matched_view1) + .def_rw("matched_view2", &PreviousPoseMatch::matched_view2) + .def_rw("kept", &PreviousPoseMatch::kept) + .def_rw("decision", &PreviousPoseMatch::decision); + + nb::class_(m, "PreviousPoseFilterDebug") + .def(nb::init<>()) + .def_rw("used_previous_poses", &PreviousPoseFilterDebug::used_previous_poses) + .def_rw("matches", &PreviousPoseFilterDebug::matches) + .def_rw("kept_pair_indices", &PreviousPoseFilterDebug::kept_pair_indices) + .def_rw("kept_pairs", &PreviousPoseFilterDebug::kept_pairs); + + nb::class_(m, "CoreProposalDebug") + .def(nb::init<>()) + .def_rw("pair_index", &CoreProposalDebug::pair_index) + .def_rw("pair", &CoreProposalDebug::pair) + .def_rw("score", &CoreProposalDebug::score) + .def_rw("kept", &CoreProposalDebug::kept) + .def_rw("drop_reason", &CoreProposalDebug::drop_reason) + .def_prop_ro("pose_3d", [](const CoreProposalDebug &proposal) + { + return pose_rows_to_numpy_copy(proposal.pose_3d); + }, nb::rv_policy::move); + + nb::class_(m, "ProposalGroupDebug") + .def(nb::init<>()) + .def_rw("center", &ProposalGroupDebug::center) + .def_rw("proposal_indices", &ProposalGroupDebug::proposal_indices) + .def_prop_ro("pose_3d", [](const ProposalGroupDebug &group) + { + return pose_rows_to_numpy_copy(group.pose_3d); + }, nb::rv_policy::move); + + nb::class_(m, "GroupingDebug") + .def(nb::init<>()) + .def_rw("initial_groups", &GroupingDebug::initial_groups) + .def_rw("duplicate_pair_drops", &GroupingDebug::duplicate_pair_drops) + .def_rw("groups", &GroupingDebug::groups); + + nb::class_(m, "FullProposalDebug") + .def(nb::init<>()) + .def_rw("source_core_proposal_index", &FullProposalDebug::source_core_proposal_index) + .def_rw("pair", &FullProposalDebug::pair) + .def_prop_ro("pose_3d", [](const FullProposalDebug &proposal) + { + return pose_rows_to_numpy_copy(proposal.pose_3d); + }, nb::rv_policy::move); + + nb::class_(m, "MergeDebug") + .def(nb::init<>()) + .def_rw("group_proposal_indices", &MergeDebug::group_proposal_indices) + .def_prop_ro("merged_poses", [](const MergeDebug &merge) + { + return merged_poses_to_numpy_copy(merge.merged_poses); + }, nb::rv_policy::move); + + nb::class_(m, "TriangulationTrace") + .def(nb::init<>()) + .def_rw("pairs", &TriangulationTrace::pairs) + .def_rw("previous_filter", &TriangulationTrace::previous_filter) + .def_rw("core_proposals", &TriangulationTrace::core_proposals) + .def_rw("grouping", &TriangulationTrace::grouping) + .def_rw("full_proposals", &TriangulationTrace::full_proposals) + .def_rw("merge", &TriangulationTrace::merge) + .def_prop_ro("final_poses", [](const TriangulationTrace &trace) + { + return pose_batch_to_numpy_copy(trace.final_poses); + }, nb::rv_policy::move); + + m.def( + "build_pair_candidates", + [](const PoseArray2D &poses_2d, const CountArray &person_counts) + { + return build_pair_candidates(pose_batch_view_from_numpy(poses_2d, person_counts)); + }, + "poses_2d"_a, + "person_counts"_a); + + m.def( + "filter_pairs_with_previous_poses", + [](const PoseArray2D &poses_2d, + const CountArray &person_counts, + const std::vector &cameras, + const std::vector &joint_names, + const PoseArray3DConst &previous_poses_3d, + float min_match_score) + { + return filter_pairs_with_previous_poses( + pose_batch_view_from_numpy(poses_2d, person_counts), + cameras, + joint_names, + pose_batch3d_view_from_numpy(previous_poses_3d), + min_match_score); + }, + "poses_2d"_a, + "person_counts"_a, + "cameras"_a, + "joint_names"_a, + "previous_poses_3d"_a, + "min_match_score"_a = 0.95f); + + m.def( + "triangulate_debug", + [](const PoseArray2D &poses_2d, + const CountArray &person_counts, + const std::vector &cameras, + const RoomArray &roomparams, + const std::vector &joint_names, + float min_match_score, + size_t min_group_size) + { + return triangulate_debug( + pose_batch_view_from_numpy(poses_2d, person_counts), + cameras, + roomparams_from_numpy(roomparams), + joint_names, + nullptr, + min_match_score, + min_group_size); + }, + "poses_2d"_a, + "person_counts"_a, + "cameras"_a, + "roomparams"_a, + "joint_names"_a, + "min_match_score"_a = 0.95f, + "min_group_size"_a = 1); + + m.def( + "triangulate_debug", + [](const PoseArray2D &poses_2d, + const CountArray &person_counts, + const std::vector &cameras, + const RoomArray &roomparams, + const std::vector &joint_names, + const PoseArray3DConst &previous_poses_3d, + float min_match_score, + size_t min_group_size) + { + const PoseBatch3DView previous_view = pose_batch3d_view_from_numpy(previous_poses_3d); + return triangulate_debug( + pose_batch_view_from_numpy(poses_2d, person_counts), + cameras, + roomparams_from_numpy(roomparams), + joint_names, + &previous_view, + min_match_score, + min_group_size); + }, + "poses_2d"_a, + "person_counts"_a, + "cameras"_a, + "roomparams"_a, + "joint_names"_a, + "previous_poses_3d"_a, + "min_match_score"_a = 0.95f, + "min_group_size"_a = 1); + m.def( "triangulate_poses", [](const PoseArray2D &poses_2d, @@ -98,11 +349,15 @@ NB_MODULE(_core, m) float min_match_score, size_t min_group_size) { - const PoseBatch2DView pose_batch = pose_batch_view_from_numpy(poses_2d, person_counts); - const auto room = roomparams_from_numpy(roomparams); - PoseBatch3D poses_3d = triangulate_poses( - pose_batch, cameras, room, joint_names, min_match_score, min_group_size); - return pose_batch_to_numpy(std::move(poses_3d)); + const PoseBatch3D poses_3d = triangulate_poses( + pose_batch_view_from_numpy(poses_2d, person_counts), + cameras, + roomparams_from_numpy(roomparams), + joint_names, + nullptr, + min_match_score, + min_group_size); + return pose_batch_to_numpy(poses_3d); }, "poses_2d"_a, "person_counts"_a, @@ -111,4 +366,35 @@ NB_MODULE(_core, m) "joint_names"_a, "min_match_score"_a = 0.95f, "min_group_size"_a = 1); + + m.def( + "triangulate_poses", + [](const PoseArray2D &poses_2d, + const CountArray &person_counts, + const std::vector &cameras, + const RoomArray &roomparams, + const std::vector &joint_names, + const PoseArray3DConst &previous_poses_3d, + float min_match_score, + size_t min_group_size) + { + const PoseBatch3DView previous_view = pose_batch3d_view_from_numpy(previous_poses_3d); + const PoseBatch3D poses_3d = triangulate_poses( + pose_batch_view_from_numpy(poses_2d, person_counts), + cameras, + roomparams_from_numpy(roomparams), + joint_names, + &previous_view, + min_match_score, + min_group_size); + return pose_batch_to_numpy(poses_3d); + }, + "poses_2d"_a, + "person_counts"_a, + "cameras"_a, + "roomparams"_a, + "joint_names"_a, + "previous_poses_3d"_a, + "min_match_score"_a = 0.95f, + "min_group_size"_a = 1); } diff --git a/rpt/cached_camera.hpp b/rpt/cached_camera.hpp new file mode 100644 index 0000000..f9d51dc --- /dev/null +++ b/rpt/cached_camera.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include + +#include "camera.hpp" + +struct CachedCamera +{ + const Camera cam; + const std::array, 3> invR; + const std::array center; + const std::array, 3> newK; + const std::array, 3> invK; +}; + +CachedCamera cache_camera(const Camera &camera); + +void undistort_point_pinhole(std::array &point, const std::array &distortion); +void undistort_point_fisheye(std::array &point, const std::array &distortion); diff --git a/rpt/camera.cpp b/rpt/camera.cpp index 026f91d..5b2ffad 100644 --- a/rpt/camera.cpp +++ b/rpt/camera.cpp @@ -1,11 +1,60 @@ #include #include #include +#include #include #include #include -#include "camera.hpp" +#include "cached_camera.hpp" + +namespace +{ +std::array, 3> transpose3x3(const std::array, 3> &M) +{ + return {{{M[0][0], M[1][0], M[2][0]}, + {M[0][1], M[1][1], M[2][1]}, + {M[0][2], M[1][2], M[2][2]}}}; +} + +std::array, 3> invert3x3(const std::array, 3> &M) +{ + std::array, 3> adj = { + {{ + M[1][1] * M[2][2] - M[1][2] * M[2][1], + M[0][2] * M[2][1] - M[0][1] * M[2][2], + M[0][1] * M[1][2] - M[0][2] * M[1][1], + }, + { + M[1][2] * M[2][0] - M[1][0] * M[2][2], + M[0][0] * M[2][2] - M[0][2] * M[2][0], + M[0][2] * M[1][0] - M[0][0] * M[1][2], + }, + { + M[1][0] * M[2][1] - M[1][1] * M[2][0], + M[0][1] * M[2][0] - M[0][0] * M[2][1], + M[0][0] * M[1][1] - M[0][1] * M[1][0], + }}}; + + float det = M[0][0] * adj[0][0] + M[0][1] * adj[1][0] + M[0][2] * adj[2][0]; + if (std::fabs(det) < 1e-6f) + { + return {{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}}; + } + + float idet = 1.0f / det; + return {{ + {{adj[0][0] * idet, adj[0][1] * idet, adj[0][2] * idet}}, + {{adj[1][0] * idet, adj[1][1] * idet, adj[1][2] * idet}}, + {{adj[2][0] * idet, adj[2][1] * idet, adj[2][2] * idet}}, + }}; +} + +std::array, 3> calc_optimal_camera_matrix_fisheye( + const Camera &cam, float balance, std::pair new_size); +std::array, 3> calc_optimal_camera_matrix_pinhole( + const Camera &cam, float alpha, std::pair new_size); +} // namespace // ================================================================================================= // ================================================================================================= @@ -63,7 +112,7 @@ std::string Camera::to_string() const out << "'width': " << width << ", "; out << "'height': " << height << ", "; - out << "'type': " << type; + out << "'model': '" << camera_model_name(model) << "'"; out << "}"; return out.str(); @@ -71,6 +120,33 @@ std::string Camera::to_string() const // ================================================================================================= +const char *camera_model_name(CameraModel model) +{ + switch (model) + { + case CameraModel::Pinhole: + return "pinhole"; + case CameraModel::Fisheye: + return "fisheye"; + } + return "unknown"; +} + +CameraModel parse_camera_model(const std::string &value) +{ + if (value == "pinhole") + { + return CameraModel::Pinhole; + } + if (value == "fisheye") + { + return CameraModel::Fisheye; + } + throw std::invalid_argument("Unsupported camera model: " + value); +} + +// ================================================================================================= + std::ostream &operator<<(std::ostream &out, const Camera &cam) { out << cam.to_string(); @@ -80,93 +156,33 @@ std::ostream &operator<<(std::ostream &out, const Camera &cam) // ================================================================================================= // ================================================================================================= -CameraInternal::CameraInternal(const Camera &cam) +CachedCamera cache_camera(const Camera &cam) { - this->cam = cam; - this->invR = transpose3x3(cam.R); + const std::array, 3> invR = transpose3x3(cam.R); // Camera center: // C = -(Rᵀ * t) = -(Rᵀ * (R * (T * -1))) = -(Rᵀ * (R * -T)) = -(Rᵀ * -R * T) = -(-T) = T - this->center = {cam.T[0][0], cam.T[1][0], cam.T[2][0]}; + const std::array center = {cam.T[0][0], cam.T[1][0], cam.T[2][0]}; // Undistort camera matrix // As with the undistortion, the own implementation avoids some overhead compared to OpenCV - if (cam.type == "fisheye") + std::array, 3> newK; + if (cam.model == CameraModel::Fisheye) { - newK = calc_optimal_camera_matrix_fisheye(1.0, {cam.width, cam.height}); + newK = calc_optimal_camera_matrix_fisheye(cam, 1.0f, {cam.width, cam.height}); } else { - newK = calc_optimal_camera_matrix_pinhole(1.0, {cam.width, cam.height}); + newK = calc_optimal_camera_matrix_pinhole(cam, 1.0f, {cam.width, cam.height}); } - this->invK = invert3x3(newK); + const std::array, 3> invK = invert3x3(newK); + + return CachedCamera {cam, invR, center, newK, invK}; } // ================================================================================================= -std::array, 3> CameraInternal::transpose3x3( - const std::array, 3> &M) -{ - return {{{M[0][0], M[1][0], M[2][0]}, - {M[0][1], M[1][1], M[2][1]}, - {M[0][2], M[1][2], M[2][2]}}}; -} - -// ================================================================================================= - -std::array, 3> CameraInternal::invert3x3( - const std::array, 3> &M) -{ - // Compute the inverse using the adjugate method - // See: https://scicomp.stackexchange.com/a/29206 - - std::array, 3> adj = { - {{ - M[1][1] * M[2][2] - M[1][2] * M[2][1], - M[0][2] * M[2][1] - M[0][1] * M[2][2], - M[0][1] * M[1][2] - M[0][2] * M[1][1], - }, - { - M[1][2] * M[2][0] - M[1][0] * M[2][2], - M[0][0] * M[2][2] - M[0][2] * M[2][0], - M[0][2] * M[1][0] - M[0][0] * M[1][2], - }, - { - M[1][0] * M[2][1] - M[1][1] * M[2][0], - M[0][1] * M[2][0] - M[0][0] * M[2][1], - M[0][0] * M[1][1] - M[0][1] * M[1][0], - }}}; - - float det = M[0][0] * adj[0][0] + M[0][1] * adj[1][0] + M[0][2] * adj[2][0]; - if (std::fabs(det) < 1e-6f) - { - return {{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}}; - } - - float idet = 1.0f / det; - std::array, 3> inv = { - {{ - adj[0][0] * idet, - adj[0][1] * idet, - adj[0][2] * idet, - }, - { - adj[1][0] * idet, - adj[1][1] * idet, - adj[1][2] * idet, - }, - { - adj[2][0] * idet, - adj[2][1] * idet, - adj[2][2] * idet, - }}}; - - return inv; -} - -// ================================================================================================= - -void CameraInternal::undistort_point_pinhole(std::array &p, const std::vector &k) +void undistort_point_pinhole(std::array &p, const std::array &k) { // Following: cv::cvUndistortPointsInternal // Uses only the distortion coefficients: [k1, k2, p1, p2, k3] @@ -202,7 +218,7 @@ void CameraInternal::undistort_point_pinhole(std::array &p, const std: // ================================================================================================= -void CameraInternal::undistort_point_fisheye(std::array &p, const std::vector &k) +void undistort_point_fisheye(std::array &p, const std::array &k) { // Following: cv::fisheye::undistortPoints // Uses only the distortion coefficients: [k1, k2, k3, k4] @@ -250,8 +266,10 @@ void CameraInternal::undistort_point_fisheye(std::array &p, const std: // ================================================================================================= -std::array, 3> CameraInternal::calc_optimal_camera_matrix_fisheye( - float balance, std::pair new_size) +namespace +{ +std::array, 3> calc_optimal_camera_matrix_fisheye( + const Camera &cam, float balance, std::pair new_size) { // Following: cv::fisheye::estimateNewCameraMatrixForUndistortRectify // https://github.com/opencv/opencv/blob/4.x/modules/calib3d/src/fisheye.cpp#L630 @@ -355,8 +373,8 @@ std::array, 3> CameraInternal::calc_optimal_camera_matrix_f // ================================================================================================= -std::array, 3> CameraInternal::calc_optimal_camera_matrix_pinhole( - float alpha, std::pair new_size) +std::array, 3> calc_optimal_camera_matrix_pinhole( + const Camera &cam, float alpha, std::pair new_size) { // Following: cv::getOptimalNewCameraMatrix // https://github.com/opencv/opencv/blob/4.x/modules/calib3d/src/calibration_base.cpp#L1565 @@ -479,3 +497,4 @@ std::array, 3> CameraInternal::calc_optimal_camera_matrix_p {0.0, 0.0, 1.0}}}; return newK; } +} // namespace diff --git a/rpt/camera.hpp b/rpt/camera.hpp index 15d5a45..3247b67 100644 --- a/rpt/camera.hpp +++ b/rpt/camera.hpp @@ -7,46 +7,28 @@ // ================================================================================================= +enum class CameraModel +{ + Pinhole, + Fisheye, +}; + +const char *camera_model_name(CameraModel model); +CameraModel parse_camera_model(const std::string &value); + +// ================================================================================================= + struct Camera { std::string name; std::array, 3> K; - std::vector DC; + std::array DC = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; std::array, 3> R; std::array, 3> T; int width; int height; - std::string type; + CameraModel model = CameraModel::Pinhole; friend std::ostream &operator<<(std::ostream &out, Camera const &camera); std::string to_string() const; }; - -// ================================================================================================= - -class CameraInternal -{ -public: - CameraInternal(const Camera &cam); - - Camera cam; - - std::array, 3> invR; - std::array center; - std::array, 3> newK; - std::array, 3> invK; - - static std::array, 3> transpose3x3( - const std::array, 3> &M); - - static std::array, 3> invert3x3( - const std::array, 3> &M); - - static void undistort_point_pinhole(std::array &p, const std::vector &k); - static void undistort_point_fisheye(std::array &p, const std::vector &k); - - std::array, 3> calc_optimal_camera_matrix_fisheye( - float balance, std::pair new_size); - std::array, 3> calc_optimal_camera_matrix_pinhole( - float alpha, std::pair new_size); -}; diff --git a/rpt/interface.cpp b/rpt/interface.cpp index 477cc69..40d8ff6 100644 --- a/rpt/interface.cpp +++ b/rpt/interface.cpp @@ -38,6 +38,11 @@ const float &PoseBatch2DView::at(size_t view, size_t person, size_t joint, size_ return data[pose2d_offset(view, person, joint, coord, max_persons, num_joints)]; } +const float &PoseBatch3DView::at(size_t person, size_t joint, size_t coord) const +{ + return data[pose3d_offset(person, joint, coord, num_joints)]; +} + const float &PoseBatch2D::at(size_t view, size_t person, size_t joint, size_t coord) const { return data[pose2d_offset(view, person, joint, coord, max_persons, num_joints)]; @@ -109,6 +114,11 @@ const float &PoseBatch3D::at(size_t person, size_t joint, size_t coord) const return data[pose3d_offset(person, joint, coord, num_joints)]; } +PoseBatch3DView PoseBatch3D::view() const +{ + return PoseBatch3DView {data.data(), num_persons, num_joints}; +} + NestedPoses3D PoseBatch3D::to_nested() const { NestedPoses3D poses_3d(num_persons); diff --git a/rpt/interface.hpp b/rpt/interface.hpp index f64f932..1d13bbf 100644 --- a/rpt/interface.hpp +++ b/rpt/interface.hpp @@ -25,6 +25,15 @@ struct PoseBatch2DView const float &at(size_t view, size_t person, size_t joint, size_t coord) const; }; +struct PoseBatch3DView +{ + const float *data = nullptr; + size_t num_persons = 0; + size_t num_joints = 0; + + const float &at(size_t person, size_t joint, size_t coord) const; +}; + struct PoseBatch2D { std::vector data; @@ -48,6 +57,7 @@ struct PoseBatch3D float &at(size_t person, size_t joint, size_t coord); const float &at(size_t person, size_t joint, size_t coord) const; + PoseBatch3DView view() const; NestedPoses3D to_nested() const; static PoseBatch3D from_nested(const NestedPoses3D &poses_3d); @@ -55,6 +65,136 @@ struct PoseBatch3D // ================================================================================================= +struct PairCandidate +{ + int view1 = -1; + int view2 = -1; + int person1 = -1; + int person2 = -1; + int global_person1 = -1; + int global_person2 = -1; +}; + +struct PreviousPoseMatch +{ + int previous_pose_index = -1; + float score_view1 = 0.0f; + float score_view2 = 0.0f; + bool matched_view1 = false; + bool matched_view2 = false; + bool kept = true; + std::string decision = "unfiltered"; +}; + +struct PreviousPoseFilterDebug +{ + bool used_previous_poses = false; + std::vector matches; + std::vector kept_pair_indices; + std::vector kept_pairs; +}; + +struct CoreProposalDebug +{ + int pair_index = -1; + PairCandidate pair; + std::vector> pose_3d; + float score = 0.0f; + bool kept = false; + std::string drop_reason = "uninitialized"; +}; + +struct ProposalGroupDebug +{ + std::array center = {0.0f, 0.0f, 0.0f}; + std::vector> pose_3d; + std::vector proposal_indices; +}; + +struct GroupingDebug +{ + std::vector initial_groups; + std::vector duplicate_pair_drops; + std::vector groups; +}; + +struct FullProposalDebug +{ + int source_core_proposal_index = -1; + PairCandidate pair; + std::vector> pose_3d; +}; + +struct MergeDebug +{ + std::vector>> merged_poses; + std::vector> group_proposal_indices; +}; + +struct TriangulationTrace +{ + std::vector pairs; + PreviousPoseFilterDebug previous_filter; + std::vector core_proposals; + GroupingDebug grouping; + std::vector full_proposals; + MergeDebug merge; + PoseBatch3D final_poses; +}; + +// ================================================================================================= + +std::vector build_pair_candidates(const PoseBatch2DView &poses_2d); + +PreviousPoseFilterDebug filter_pairs_with_previous_poses( + const PoseBatch2DView &poses_2d, + const std::vector &cameras, + const std::vector &joint_names, + const PoseBatch3DView &previous_poses_3d, + float min_match_score = 0.95f); + +inline PreviousPoseFilterDebug filter_pairs_with_previous_poses( + const PoseBatch2D &poses_2d, + const std::vector &cameras, + const std::vector &joint_names, + const PoseBatch3D &previous_poses_3d, + float min_match_score = 0.95f) +{ + return filter_pairs_with_previous_poses( + poses_2d.view(), cameras, joint_names, previous_poses_3d.view(), min_match_score); +} + +TriangulationTrace triangulate_debug( + const PoseBatch2DView &poses_2d, + const std::vector &cameras, + const std::array, 2> &roomparams, + const std::vector &joint_names, + const PoseBatch3DView *previous_poses_3d = nullptr, + float min_match_score = 0.95f, + size_t min_group_size = 1); + +inline TriangulationTrace triangulate_debug( + const PoseBatch2D &poses_2d, + const std::vector &cameras, + const std::array, 2> &roomparams, + const std::vector &joint_names, + const PoseBatch3D *previous_poses_3d = nullptr, + float min_match_score = 0.95f, + size_t min_group_size = 1) +{ + PoseBatch3DView previous_view_storage; + const PoseBatch3DView *previous_view = nullptr; + if (previous_poses_3d != nullptr) + { + previous_view_storage = previous_poses_3d->view(); + previous_view = &previous_view_storage; + } + return triangulate_debug( + poses_2d.view(), cameras, roomparams, joint_names, previous_view, min_match_score, min_group_size); +} + +// ================================================================================================= + /** * Calculate a triangulation using a padded pose tensor. * @@ -72,6 +212,7 @@ PoseBatch3D triangulate_poses( const std::vector &cameras, const std::array, 2> &roomparams, const std::vector &joint_names, + const PoseBatch3DView *previous_poses_3d = nullptr, float min_match_score = 0.95f, size_t min_group_size = 1); @@ -80,9 +221,17 @@ inline PoseBatch3D triangulate_poses( const std::vector &cameras, const std::array, 2> &roomparams, const std::vector &joint_names, + const PoseBatch3D *previous_poses_3d = nullptr, float min_match_score = 0.95f, size_t min_group_size = 1) { + PoseBatch3DView previous_view_storage; + const PoseBatch3DView *previous_view = nullptr; + if (previous_poses_3d != nullptr) + { + previous_view_storage = previous_poses_3d->view(); + previous_view = &previous_view_storage; + } return triangulate_poses( - poses_2d.view(), cameras, roomparams, joint_names, min_match_score, min_group_size); + poses_2d.view(), cameras, roomparams, joint_names, previous_view, min_match_score, min_group_size); } diff --git a/rpt/triangulator.cpp b/rpt/triangulator.cpp index e70140b..24ae9a7 100644 --- a/rpt/triangulator.cpp +++ b/rpt/triangulator.cpp @@ -10,6 +10,7 @@ #include #include "interface.hpp" +#include "cached_camera.hpp" namespace { @@ -83,7 +84,7 @@ size_t packed_pose_offset(size_t pose, size_t joint, size_t coord, size_t num_jo } std::array undistort_point( - const std::array &raw_point, const CameraInternal &icam) + const std::array &raw_point, const CachedCamera &icam) { std::array point = raw_point; const float ifx_old = 1.0f / icam.cam.K[0][0]; @@ -98,13 +99,13 @@ std::array undistort_point( point[0] = (point[0] - cx_old) * ifx_old; point[1] = (point[1] - cy_old) * ify_old; - if (icam.cam.type == "fisheye") + if (icam.cam.model == CameraModel::Fisheye) { - CameraInternal::undistort_point_fisheye(point, icam.cam.DC); + undistort_point_fisheye(point, icam.cam.DC); } else { - CameraInternal::undistort_point_pinhole(point, icam.cam.DC); + undistort_point_pinhole(point, icam.cam.DC); } point[0] = (point[0] * fx_new) + cx_new; @@ -125,7 +126,7 @@ class PackedPoseStore2D public: PackedPoseStore2D( const PoseBatch2DView &source, - const std::vector &internal_cameras) + const std::vector &internal_cameras) : person_counts(source.num_views), view_offsets(source.num_views), num_views(source.num_views), @@ -229,16 +230,75 @@ public: { } - PoseBatch3D triangulate_poses( + TriangulationTrace triangulate_debug( const PoseBatch2DView &poses_2d, const std::vector &cameras, const std::array, 2> &roomparams, - const std::vector &joint_names); + const std::vector &joint_names, + const PoseBatch3DView *previous_poses_3d); + + PreviousPoseFilterDebug filter_pairs_with_previous_poses( + const PoseBatch2DView &poses_2d, + const std::vector &cameras, + const std::vector &joint_names, + const PoseBatch3DView &previous_poses_3d); private: + struct PreparedInputs + { + std::vector internal_cameras; + std::vector core_joint_idx; + std::vector> core_limbs_idx; + PackedPoseStore2D packed_poses; + + PreparedInputs( + std::vector cameras_in, + std::vector core_joint_idx_in, + std::vector> core_limbs_idx_in, + PackedPoseStore2D packed_poses_in) + : internal_cameras(std::move(cameras_in)), + core_joint_idx(std::move(core_joint_idx_in)), + core_limbs_idx(std::move(core_limbs_idx_in)), + packed_poses(std::move(packed_poses_in)) + { + } + }; + + struct PreviousProjectionCache + { + std::vector> projected_poses; + std::vector>> projected_dists; + std::vector core_poses; + }; + + PreparedInputs prepare_inputs( + const PoseBatch2DView &poses_2d, + const std::vector &cameras, + const std::vector &joint_names); + + std::vector build_pair_candidates(const PackedPoseStore2D &packed_poses) const; + + PreviousProjectionCache project_previous_poses( + const PoseBatch3DView &previous_poses_3d, + const std::vector &internal_cameras, + const std::vector &core_joint_idx); + + PreviousPoseFilterDebug filter_pairs_with_previous_poses( + const PackedPoseStore2D &packed_poses, + const std::vector &internal_cameras, + const std::vector &core_joint_idx, + const std::vector &pairs, + const PoseBatch3DView &previous_poses_3d); + + float calc_pose_score( + const Pose2D &pose, + const Pose2D &reference_pose, + const std::vector &dists, + const CachedCamera &icam); + std::tuple, std::vector>> project_poses( const std::vector &poses_3d, - const CameraInternal &icam, + const CachedCamera &icam, bool calc_dists); std::vector score_projection( @@ -251,8 +311,8 @@ private: std::pair triangulate_and_score( const Pose2D &pose1, const Pose2D &pose2, - const CameraInternal &cam1, - const CameraInternal &cam2, + const CachedCamera &cam1, + const CachedCamera &cam2, const std::array, 2> &roomparams, const std::vector> &core_limbs_idx); @@ -313,17 +373,9 @@ private: }; }; -PoseBatch3D empty_pose_batch3d(size_t num_joints) -{ - PoseBatch3D poses_3d; - poses_3d.num_joints = num_joints; - return poses_3d; -} - -PoseBatch3D TriangulatorInternal::triangulate_poses( +TriangulatorInternal::PreparedInputs TriangulatorInternal::prepare_inputs( const PoseBatch2DView &poses_2d, const std::vector &cameras, - const std::array, 2> &roomparams, const std::vector &joint_names) { if (poses_2d.num_views == 0) @@ -342,6 +394,10 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( { throw std::invalid_argument("Number of joint names and 2D poses must be the same."); } + if (poses_2d.num_joints == 0) + { + throw std::invalid_argument("At least one joint is required."); + } if (poses_2d.max_persons > 0 && poses_2d.num_joints > 0 && poses_2d.data == nullptr) { throw std::invalid_argument("poses_2d data must not be null."); @@ -356,11 +412,11 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( num_cams = cameras.size(); - std::vector internal_cameras; + std::vector internal_cameras; internal_cameras.reserve(cameras.size()); for (const auto &camera : cameras) { - internal_cameras.emplace_back(camera); + internal_cameras.push_back(cache_camera(camera)); } std::vector core_joint_idx; @@ -383,64 +439,310 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( } PackedPoseStore2D packed_poses(poses_2d, internal_cameras); + return PreparedInputs( + std::move(internal_cameras), + std::move(core_joint_idx), + std::move(core_limbs_idx), + std::move(packed_poses)); +} - std::vector all_pairs; - for (size_t view1 = 0; view1 < cameras.size(); ++view1) +std::vector TriangulatorInternal::build_pair_candidates(const PackedPoseStore2D &packed_poses) const +{ + std::vector pairs; + for (size_t view1 = 0; view1 < packed_poses.num_views; ++view1) { - for (size_t view2 = view1 + 1; view2 < cameras.size(); ++view2) + for (size_t view2 = view1 + 1; view2 < packed_poses.num_views; ++view2) { for (size_t person1 = 0; person1 < packed_poses.person_counts[view1]; ++person1) { for (size_t person2 = 0; person2 < packed_poses.person_counts[view2]; ++person2) { - all_pairs.emplace_back( - std::make_tuple( - static_cast(view1), - static_cast(view2), - static_cast(person1), - static_cast(person2)), - std::make_pair( - static_cast(packed_poses.pose_index(view1, person1)), - static_cast(packed_poses.pose_index(view2, person2)))); + pairs.push_back(PairCandidate { + static_cast(view1), + static_cast(view2), + static_cast(person1), + static_cast(person2), + static_cast(packed_poses.pose_index(view1, person1)), + static_cast(packed_poses.pose_index(view2, person2)), + }); } } } } + return pairs; +} - if (all_pairs.empty()) +TriangulatorInternal::PreviousProjectionCache TriangulatorInternal::project_previous_poses( + const PoseBatch3DView &previous_poses_3d, + const std::vector &internal_cameras, + const std::vector &core_joint_idx) +{ + PreviousProjectionCache cache; + cache.core_poses.resize(previous_poses_3d.num_persons); + for (size_t person = 0; person < previous_poses_3d.num_persons; ++person) { - return empty_pose_batch3d(joint_names.size()); - } - - std::vector> all_scored_poses(all_pairs.size()); - for (size_t i = 0; i < all_pairs.size(); ++i) - { - const auto &pids = all_pairs[i].first; - const auto &cam1 = internal_cameras[std::get<0>(pids)]; - const auto &cam2 = internal_cameras[std::get<1>(pids)]; - Pose2D pose1_core = packed_poses.pose(std::get<0>(pids), std::get<2>(pids), core_joint_idx); - Pose2D pose2_core = packed_poses.pose(std::get<1>(pids), std::get<3>(pids), core_joint_idx); - - auto [pose3d, score] = triangulate_and_score( - pose1_core, pose2_core, cam1, cam2, roomparams, core_limbs_idx); - all_scored_poses[i] = std::make_pair(std::move(pose3d), score); - } - - for (size_t i = all_scored_poses.size(); i > 0; --i) - { - if (all_scored_poses[i - 1].second < min_match_score) + Pose3D pose(core_joint_idx.size(), {0.0f, 0.0f, 0.0f, 0.0f}); + for (size_t joint = 0; joint < core_joint_idx.size(); ++joint) { - all_scored_poses.erase(all_scored_poses.begin() + static_cast(i - 1)); - all_pairs.erase(all_pairs.begin() + static_cast(i - 1)); + const size_t source_joint = core_joint_idx[joint]; + for (size_t coord = 0; coord < 4; ++coord) + { + pose[joint][coord] = previous_poses_3d.at(person, source_joint, coord); + } + } + cache.core_poses[person] = std::move(pose); + } + + cache.projected_poses.resize(internal_cameras.size()); + cache.projected_dists.resize(internal_cameras.size()); + for (size_t view = 0; view < internal_cameras.size(); ++view) + { + auto [projected_poses, projected_dists] = project_poses(cache.core_poses, internal_cameras[view], true); + cache.projected_poses[view] = std::move(projected_poses); + cache.projected_dists[view] = std::move(projected_dists); + } + + return cache; +} + +float TriangulatorInternal::calc_pose_score( + const Pose2D &pose, + const Pose2D &reference_pose, + const std::vector &dists, + const CachedCamera &icam) +{ + const float min_score = 0.1f; + std::vector mask(pose.size(), false); + size_t valid_count = 0; + for (size_t joint = 0; joint < pose.size(); ++joint) + { + mask[joint] = pose[joint][2] > min_score && reference_pose[joint][2] > min_score; + if (mask[joint]) + { + ++valid_count; + } + } + if (valid_count < 3) + { + return 0.0f; + } + + const float iscale = (icam.cam.width + icam.cam.height) * 0.5f; + std::vector scores = score_projection(pose, reference_pose, dists, mask, iscale); + const size_t drop_k = static_cast(pose.size() * 0.2f); + if (drop_k > 0) + { + std::partial_sort(scores.begin(), scores.begin() + drop_k, scores.end()); + } + + float total = 0.0f; + for (size_t i = drop_k; i < scores.size(); ++i) + { + total += scores[i]; + } + return total / static_cast(scores.size() - drop_k); +} + +PreviousPoseFilterDebug TriangulatorInternal::filter_pairs_with_previous_poses( + const PackedPoseStore2D &packed_poses, + const std::vector &internal_cameras, + const std::vector &core_joint_idx, + const std::vector &pairs, + const PoseBatch3DView &previous_poses_3d) +{ + PreviousPoseFilterDebug debug; + debug.used_previous_poses = true; + debug.matches.resize(pairs.size()); + + if (previous_poses_3d.num_persons == 0) + { + debug.kept_pair_indices.resize(pairs.size()); + std::iota(debug.kept_pair_indices.begin(), debug.kept_pair_indices.end(), 0); + debug.kept_pairs = pairs; + for (auto &match : debug.matches) + { + match.decision = "no_previous_poses"; + } + return debug; + } + + const PreviousProjectionCache projected_previous = + project_previous_poses(previous_poses_3d, internal_cameras, core_joint_idx); + const float threshold = min_match_score; + + for (size_t pair_index = 0; pair_index < pairs.size(); ++pair_index) + { + const PairCandidate &pair = pairs[pair_index]; + const Pose2D pose1_core = packed_poses.pose( + static_cast(pair.view1), static_cast(pair.person1), core_joint_idx); + const Pose2D pose2_core = packed_poses.pose( + static_cast(pair.view2), static_cast(pair.person2), core_joint_idx); + + PreviousPoseMatch best_match; + bool found_decision = false; + for (size_t previous_index = 0; previous_index < previous_poses_3d.num_persons; ++previous_index) + { + const float score_view1 = calc_pose_score( + pose1_core, + projected_previous.projected_poses[static_cast(pair.view1)][previous_index], + projected_previous.projected_dists[static_cast(pair.view1)][previous_index], + internal_cameras[static_cast(pair.view1)]); + const float score_view2 = calc_pose_score( + pose2_core, + projected_previous.projected_poses[static_cast(pair.view2)][previous_index], + projected_previous.projected_dists[static_cast(pair.view2)][previous_index], + internal_cameras[static_cast(pair.view2)]); + const bool matched_view1 = score_view1 > threshold; + const bool matched_view2 = score_view2 > threshold; + + if (matched_view1 && matched_view2) + { + best_match = PreviousPoseMatch { + static_cast(previous_index), + score_view1, + score_view2, + true, + true, + true, + "matched_previous_pose", + }; + found_decision = true; + break; + } + + if (matched_view1 || matched_view2) + { + best_match = PreviousPoseMatch { + static_cast(previous_index), + score_view1, + score_view2, + matched_view1, + matched_view2, + false, + "dropped_single_view_match", + }; + found_decision = true; + break; + } + } + + if (!found_decision) + { + best_match.decision = "new_person_candidate"; + best_match.kept = true; + } + + debug.matches[pair_index] = best_match; + if (best_match.kept) + { + debug.kept_pair_indices.push_back(static_cast(pair_index)); + debug.kept_pairs.push_back(pair); } } - if (all_pairs.empty()) + return debug; +} + +TriangulationTrace TriangulatorInternal::triangulate_debug( + const PoseBatch2DView &poses_2d, + const std::vector &cameras, + const std::array, 2> &roomparams, + const std::vector &joint_names, + const PoseBatch3DView *previous_poses_3d) +{ + TriangulationTrace trace; + trace.final_poses.num_joints = joint_names.size(); + + if (previous_poses_3d != nullptr && previous_poses_3d->num_persons > 0 && + previous_poses_3d->num_joints != joint_names.size()) { - return empty_pose_batch3d(joint_names.size()); + throw std::invalid_argument("previous_poses_3d must use the same joint count as joint_names."); + } + + PreparedInputs prepared = prepare_inputs(poses_2d, cameras, joint_names); + trace.pairs = build_pair_candidates(prepared.packed_poses); + if (trace.pairs.empty()) + { + return trace; + } + + std::vector active_pair_indices(trace.pairs.size()); + std::iota(active_pair_indices.begin(), active_pair_indices.end(), 0); + std::vector active_pairs = trace.pairs; + + if (previous_poses_3d != nullptr) + { + trace.previous_filter = filter_pairs_with_previous_poses( + prepared.packed_poses, + prepared.internal_cameras, + prepared.core_joint_idx, + trace.pairs, + *previous_poses_3d); + active_pair_indices = trace.previous_filter.kept_pair_indices; + active_pairs = trace.previous_filter.kept_pairs; + if (active_pairs.empty()) + { + return trace; + } + } + + trace.core_proposals.reserve(active_pairs.size()); + std::vector kept_pose_pairs; + std::vector> kept_scored_poses; + std::vector kept_core_indices; + for (size_t i = 0; i < active_pairs.size(); ++i) + { + const PairCandidate &pair = active_pairs[i]; + const CachedCamera &cam1 = prepared.internal_cameras[static_cast(pair.view1)]; + const CachedCamera &cam2 = prepared.internal_cameras[static_cast(pair.view2)]; + const Pose2D pose1_core = prepared.packed_poses.pose( + static_cast(pair.view1), static_cast(pair.person1), prepared.core_joint_idx); + const Pose2D pose2_core = prepared.packed_poses.pose( + static_cast(pair.view2), static_cast(pair.person2), prepared.core_joint_idx); + + auto [pose3d, score] = triangulate_and_score( + pose1_core, pose2_core, cam1, cam2, roomparams, prepared.core_limbs_idx); + + CoreProposalDebug proposal; + proposal.pair_index = active_pair_indices[i]; + proposal.pair = pair; + proposal.pose_3d = pose3d; + proposal.score = score; + proposal.kept = score >= min_match_score; + proposal.drop_reason = proposal.kept ? "kept" : "below_min_match_score"; + trace.core_proposals.push_back(proposal); + + if (!proposal.kept) + { + continue; + } + + kept_core_indices.push_back(static_cast(trace.core_proposals.size() - 1)); + kept_scored_poses.emplace_back(std::move(pose3d), score); + kept_pose_pairs.emplace_back( + std::make_tuple(pair.view1, pair.view2, pair.person1, pair.person2), + std::make_pair(pair.global_person1, pair.global_person2)); + } + + if (kept_pose_pairs.empty()) + { + return trace; + } + + auto groups = calc_grouping(kept_pose_pairs, kept_scored_poses, min_match_score); + trace.grouping.initial_groups.reserve(groups.size()); + for (const auto &group : groups) + { + ProposalGroupDebug debug_group; + debug_group.center = std::get<0>(group); + debug_group.pose_3d = std::get<1>(group); + for (const int index : std::get<2>(group)) + { + debug_group.proposal_indices.push_back(kept_core_indices[static_cast(index)]); + } + trace.grouping.initial_groups.push_back(std::move(debug_group)); } - std::vector groups = calc_grouping(all_pairs, all_scored_poses, min_match_score); for (size_t i = groups.size(); i > 0; --i) { if (std::get<2>(groups[i - 1]).size() < min_group_size) @@ -448,16 +750,15 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( groups.erase(groups.begin() + static_cast(i - 1)); } } - if (groups.empty()) { - return empty_pose_batch3d(joint_names.size()); + return trace; } std::map, std::vector> pairs_map; - for (size_t i = 0; i < all_pairs.size(); ++i) + for (size_t i = 0; i < kept_pose_pairs.size(); ++i) { - const auto &pair = all_pairs[i]; + const auto &pair = kept_pose_pairs[i]; const auto &mid1 = std::make_tuple( std::get<0>(pair.first), std::get<1>(pair.first), std::get<0>(pair.second)); const auto &mid2 = std::make_tuple( @@ -466,19 +767,19 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( pairs_map[mid2].push_back(i); } - std::vector group_map(all_pairs.size()); + std::vector group_map(kept_pose_pairs.size()); for (size_t i = 0; i < groups.size(); ++i) { - for (const auto &idx : std::get<2>(groups[i])) + for (const int index : std::get<2>(groups[i])) { - group_map[idx] = i; + group_map[static_cast(index)] = i; } } std::set drop_indices; - for (auto &pair : pairs_map) + for (auto &pair_entry : pairs_map) { - auto &indices = pair.second; + auto &indices = pair_entry.second; if (indices.size() <= 1) { continue; @@ -488,10 +789,10 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( std::vector pair_scores; group_sizes.reserve(indices.size()); pair_scores.reserve(indices.size()); - for (auto idx : indices) + for (const size_t index : indices) { - group_sizes.push_back(std::get<2>(groups[group_map[idx]]).size()); - pair_scores.push_back(all_scored_poses[idx].second); + group_sizes.push_back(std::get<2>(groups[group_map[index]]).size()); + pair_scores.push_back(kept_scored_poses[index].second); } std::vector indices_sorted(indices.size()); @@ -514,10 +815,12 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( std::vector drop_list(drop_indices.begin(), drop_indices.end()); std::sort(drop_list.begin(), drop_list.end(), std::greater()); - for (size_t drop_idx : drop_list) + for (const size_t drop_idx : drop_list) { - all_scored_poses.erase(all_scored_poses.begin() + static_cast(drop_idx)); - all_pairs.erase(all_pairs.begin() + static_cast(drop_idx)); + trace.grouping.duplicate_pair_drops.push_back(kept_core_indices[drop_idx]); + kept_scored_poses.erase(kept_scored_poses.begin() + static_cast(drop_idx)); + kept_pose_pairs.erase(kept_pose_pairs.begin() + static_cast(drop_idx)); + kept_core_indices.erase(kept_core_indices.begin() + static_cast(drop_idx)); for (auto &group : groups) { @@ -527,11 +830,11 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( { indices.erase(it); } - for (auto &index : indices) + for (int &index : indices) { if (static_cast(index) > drop_idx) { - index -= 1; + --index; } } } @@ -544,24 +847,45 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( groups.erase(groups.begin() + static_cast(i - 1)); } } - if (groups.empty()) { - return empty_pose_batch3d(joint_names.size()); + return trace; } - std::vector all_full_poses(all_pairs.size()); - for (size_t i = 0; i < all_pairs.size(); ++i) + trace.grouping.groups.reserve(groups.size()); + for (const auto &group : groups) { - const auto &pids = all_pairs[i].first; - const auto &cam1 = internal_cameras[std::get<0>(pids)]; - const auto &cam2 = internal_cameras[std::get<1>(pids)]; - Pose2D pose1 = packed_poses.pose(std::get<0>(pids), std::get<2>(pids)); - Pose2D pose2 = packed_poses.pose(std::get<1>(pids), std::get<3>(pids)); + ProposalGroupDebug debug_group; + debug_group.center = std::get<0>(group); + debug_group.pose_3d = std::get<1>(group); + for (const int index : std::get<2>(group)) + { + debug_group.proposal_indices.push_back(kept_core_indices[static_cast(index)]); + } + trace.grouping.groups.push_back(std::move(debug_group)); + } - auto [pose3d, score] = triangulate_and_score( - pose1, pose2, cam1, cam2, roomparams, {}); + std::vector full_pose_buffer(kept_pose_pairs.size()); + std::vector full_proposal_index_by_core(trace.core_proposals.size(), -1); + trace.full_proposals.reserve(kept_pose_pairs.size()); + for (size_t i = 0; i < kept_pose_pairs.size(); ++i) + { + const auto &pair_ids = kept_pose_pairs[i].first; + const PairCandidate pair = { + std::get<0>(pair_ids), + std::get<1>(pair_ids), + std::get<2>(pair_ids), + std::get<3>(pair_ids), + kept_pose_pairs[i].second.first, + kept_pose_pairs[i].second.second, + }; + const CachedCamera &cam1 = prepared.internal_cameras[static_cast(pair.view1)]; + const CachedCamera &cam2 = prepared.internal_cameras[static_cast(pair.view2)]; + Pose2D pose1 = prepared.packed_poses.pose(static_cast(pair.view1), static_cast(pair.person1)); + Pose2D pose2 = prepared.packed_poses.pose(static_cast(pair.view2), static_cast(pair.person2)); + auto [pose3d, score] = triangulate_and_score(pose1, pose2, cam1, cam2, roomparams, {}); + (void)score; for (size_t joint = 0; joint < pose3d.size(); ++joint) { const float score1 = pose1[joint][2]; @@ -576,32 +900,43 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( } } - all_full_poses[i] = std::move(pose3d); + full_pose_buffer[i] = pose3d; + full_proposal_index_by_core[kept_core_indices[i]] = static_cast(trace.full_proposals.size()); + trace.full_proposals.push_back(FullProposalDebug { + kept_core_indices[i], + pair, + std::move(pose3d), + }); } std::vector final_poses_3d(groups.size()); + trace.merge.group_proposal_indices.reserve(groups.size()); + trace.merge.merged_poses.reserve(groups.size()); for (size_t i = 0; i < groups.size(); ++i) { std::vector poses; + std::vector source_core_indices; poses.reserve(std::get<2>(groups[i]).size()); - for (const auto &idx : std::get<2>(groups[i])) + source_core_indices.reserve(std::get<2>(groups[i]).size()); + for (const int index : std::get<2>(groups[i])) { - poses.push_back(all_full_poses[idx]); + poses.push_back(full_pose_buffer[static_cast(index)]); + source_core_indices.push_back(kept_core_indices[static_cast(index)]); } final_poses_3d[i] = merge_group(poses, min_match_score); + trace.merge.group_proposal_indices.push_back(std::move(source_core_indices)); + trace.merge.merged_poses.push_back(final_poses_3d[i]); } add_extra_joints(final_poses_3d, joint_names); - filter_poses(final_poses_3d, roomparams, core_joint_idx, core_limbs_idx); + filter_poses(final_poses_3d, roomparams, prepared.core_joint_idx, prepared.core_limbs_idx); add_missing_joints(final_poses_3d, joint_names); replace_far_joints(final_poses_3d, joint_names); - PoseBatch3D poses_3d; - poses_3d.num_joints = joint_names.size(); - + size_t valid_persons = 0; for (const auto &pose : final_poses_3d) { - const auto is_valid = std::any_of( + const bool is_valid = std::any_of( pose.begin(), pose.end(), [this](const std::array &joint) @@ -610,15 +945,17 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( }); if (is_valid) { - ++poses_3d.num_persons; + ++valid_persons; } } - poses_3d.data.resize(poses_3d.num_persons * poses_3d.num_joints * 4); + trace.final_poses.num_persons = valid_persons; + trace.final_poses.data.resize(valid_persons * trace.final_poses.num_joints * 4); + size_t person_index = 0; for (const auto &pose : final_poses_3d) { - const auto is_valid = std::any_of( + const bool is_valid = std::any_of( pose.begin(), pose.end(), [this](const std::array &joint) @@ -630,24 +967,57 @@ PoseBatch3D TriangulatorInternal::triangulate_poses( continue; } - for (size_t joint = 0; joint < poses_3d.num_joints; ++joint) + for (size_t joint = 0; joint < trace.final_poses.num_joints; ++joint) { for (size_t coord = 0; coord < 4; ++coord) { - poses_3d.at(person_index, joint, coord) = pose[joint][coord]; + trace.final_poses.at(person_index, joint, coord) = pose[joint][coord]; } } ++person_index; } - return poses_3d; + return trace; +} + +PreviousPoseFilterDebug TriangulatorInternal::filter_pairs_with_previous_poses( + const PoseBatch2DView &poses_2d, + const std::vector &cameras, + const std::vector &joint_names, + const PoseBatch3DView &previous_poses_3d) +{ + if (previous_poses_3d.num_persons > 0 && previous_poses_3d.num_joints != joint_names.size()) + { + throw std::invalid_argument("previous_poses_3d must use the same joint count as joint_names."); + } + + PreparedInputs prepared = prepare_inputs(poses_2d, cameras, joint_names); + const std::vector pairs = build_pair_candidates(prepared.packed_poses); + if (pairs.empty()) + { + PreviousPoseFilterDebug empty; + empty.used_previous_poses = true; + return empty; + } + + return filter_pairs_with_previous_poses( + prepared.packed_poses, + prepared.internal_cameras, + prepared.core_joint_idx, + pairs, + previous_poses_3d); } std::tuple, std::vector>> TriangulatorInternal::project_poses( const std::vector &poses_3d, - const CameraInternal &icam, + const CachedCamera &icam, bool calc_dists) { + if (poses_3d.empty()) + { + return std::make_tuple(std::vector {}, std::vector> {}); + } + const size_t num_persons = poses_3d.size(); const size_t num_joints = poses_3d[0].size(); @@ -813,7 +1183,7 @@ std::array mat_mul_vec( return res; } -std::array calc_ray_dir(const CameraInternal &icam, const std::array &pt) +std::array calc_ray_dir(const CachedCamera &icam, const std::array &pt) { // Compute normalized ray direction from the point std::array uv1 = {pt[0], pt[1], 1.0}; @@ -824,8 +1194,8 @@ std::array calc_ray_dir(const CameraInternal &icam, const std::array triangulate_midpoint( - const CameraInternal &icam1, - const CameraInternal &icam2, + const CachedCamera &icam1, + const CachedCamera &icam2, const std::array &pt1, const std::array &pt2) { @@ -864,8 +1234,8 @@ std::array triangulate_midpoint( std::pair>, float> TriangulatorInternal::triangulate_and_score( const std::vector> &pose1, const std::vector> &pose2, - const CameraInternal &cam1, - const CameraInternal &cam2, + const CachedCamera &cam1, + const CachedCamera &cam2, const std::array, 2> &roomparams, const std::vector> &core_limbs_idx) { @@ -2026,14 +2396,89 @@ void TriangulatorInternal::replace_far_joints( } // namespace +std::vector build_pair_candidates(const PoseBatch2DView &poses_2d) +{ + if (poses_2d.person_counts == nullptr) + { + throw std::invalid_argument("person_counts must not be null."); + } + + std::vector pairs; + std::vector offsets(poses_2d.num_views, 0); + int total = 0; + for (size_t view = 0; view < poses_2d.num_views; ++view) + { + if (poses_2d.person_counts[view] > poses_2d.max_persons) + { + throw std::invalid_argument("person_counts entries must not exceed the padded person dimension."); + } + offsets[view] = total; + total += static_cast(poses_2d.person_counts[view]); + } + + for (size_t view1 = 0; view1 < poses_2d.num_views; ++view1) + { + for (size_t view2 = view1 + 1; view2 < poses_2d.num_views; ++view2) + { + for (size_t person1 = 0; person1 < poses_2d.person_counts[view1]; ++person1) + { + for (size_t person2 = 0; person2 < poses_2d.person_counts[view2]; ++person2) + { + pairs.push_back(PairCandidate { + static_cast(view1), + static_cast(view2), + static_cast(person1), + static_cast(person2), + offsets[view1] + static_cast(person1), + offsets[view2] + static_cast(person2), + }); + } + } + } + } + return pairs; +} + +PreviousPoseFilterDebug filter_pairs_with_previous_poses( + const PoseBatch2DView &poses_2d, + const std::vector &cameras, + const std::vector &joint_names, + const PoseBatch3DView &previous_poses_3d, + float min_match_score) +{ + TriangulatorInternal triangulator(min_match_score, 1); + return triangulator.filter_pairs_with_previous_poses(poses_2d, cameras, joint_names, previous_poses_3d); +} + +TriangulationTrace triangulate_debug( + const PoseBatch2DView &poses_2d, + const std::vector &cameras, + const std::array, 2> &roomparams, + const std::vector &joint_names, + const PoseBatch3DView *previous_poses_3d, + float min_match_score, + size_t min_group_size) +{ + TriangulatorInternal triangulator(min_match_score, min_group_size); + return triangulator.triangulate_debug(poses_2d, cameras, roomparams, joint_names, previous_poses_3d); +} + PoseBatch3D triangulate_poses( const PoseBatch2DView &poses_2d, const std::vector &cameras, const std::array, 2> &roomparams, const std::vector &joint_names, + const PoseBatch3DView *previous_poses_3d, float min_match_score, size_t min_group_size) { - TriangulatorInternal triangulator(min_match_score, min_group_size); - return triangulator.triangulate_poses(poses_2d, cameras, roomparams, joint_names); + return triangulate_debug( + poses_2d, + cameras, + roomparams, + joint_names, + previous_poses_3d, + min_match_score, + min_group_size) + .final_poses; } diff --git a/src/rpt/__init__.py b/src/rpt/__init__.py index 093ba17..e8be5ed 100644 --- a/src/rpt/__init__.py +++ b/src/rpt/__init__.py @@ -3,7 +3,23 @@ from __future__ import annotations from collections.abc import Sequence from typing import TYPE_CHECKING -from ._core import Camera, triangulate_poses +from ._core import ( + Camera, + CameraModel, + CoreProposalDebug, + FullProposalDebug, + GroupingDebug, + MergeDebug, + PairCandidate, + PreviousPoseFilterDebug, + PreviousPoseMatch, + ProposalGroupDebug, + TriangulationTrace, + build_pair_candidates, + filter_pairs_with_previous_poses, + triangulate_debug, + triangulate_poses, +) if TYPE_CHECKING: import numpy as np @@ -28,7 +44,20 @@ def pack_poses_2d( __all__ = [ "Camera", + "CameraModel", + "CoreProposalDebug", + "FullProposalDebug", + "GroupingDebug", + "MergeDebug", + "PairCandidate", + "PreviousPoseFilterDebug", + "PreviousPoseMatch", + "ProposalGroupDebug", + "TriangulationTrace", + "build_pair_candidates", "convert_cameras", + "filter_pairs_with_previous_poses", "pack_poses_2d", + "triangulate_debug", "triangulate_poses", ] diff --git a/src/rpt/_helpers.py b/src/rpt/_helpers.py index 82c4505..e680865 100644 --- a/src/rpt/_helpers.py +++ b/src/rpt/_helpers.py @@ -1,12 +1,12 @@ from __future__ import annotations from collections.abc import Sequence -from typing import TypeAlias, TypedDict +from typing import Literal, TypeAlias, TypedDict import numpy as np import numpy.typing as npt -from ._core import Camera +from ._core import Camera, CameraModel Matrix3x3Like: TypeAlias = Sequence[Sequence[float]] VectorLike: TypeAlias = Sequence[float] @@ -21,12 +21,39 @@ class CameraDict(TypedDict, total=False): T: Sequence[Sequence[float]] width: int height: int - type: str + type: Literal["pinhole", "fisheye"] + model: Literal["pinhole", "fisheye"] | CameraModel +CameraModelLike: TypeAlias = CameraModel | Literal["pinhole", "fisheye"] CameraLike = Camera | CameraDict +def _coerce_camera_model(model: CameraModelLike) -> CameraModel: + if isinstance(model, CameraModel): + return model + if model == "pinhole": + return CameraModel.PINHOLE + if model == "fisheye": + return CameraModel.FISHEYE + raise ValueError(f"Unsupported camera model: {model}") + + +def _coerce_distortion(distortion: VectorLike, camera_model: CameraModel) -> tuple[float, float, float, float, float]: + values = tuple(float(value) for value in distortion) + expected = 4 if camera_model is CameraModel.FISHEYE else 5 + if len(values) not in {expected, 5}: + raise ValueError( + f"{camera_model.name.lower()} cameras require {expected} distortion coefficients" + + (" (or 5 with a trailing zero)." if camera_model is CameraModel.FISHEYE else ".") + ) + if camera_model is CameraModel.FISHEYE and len(values) == 4: + values = values + (0.0,) + if len(values) != 5: + raise ValueError("Distortion coefficients must normalize to exactly 5 values.") + return values + + def convert_cameras(cameras: Sequence[CameraLike]) -> list[Camera]: """Normalize mappings or existing Camera objects into bound Camera instances.""" @@ -39,12 +66,13 @@ def convert_cameras(cameras: Sequence[CameraLike]) -> list[Camera]: camera = Camera() camera.name = str(cam["name"]) camera.K = cam["K"] - camera.DC = cam["DC"] + camera_model = _coerce_camera_model(cam.get("model", cam.get("type", "pinhole"))) + camera.DC = _coerce_distortion(cam["DC"], camera_model) camera.R = cam["R"] camera.T = cam["T"] camera.width = int(cam["width"]) camera.height = int(cam["height"]) - camera.type = str(cam.get("type", "pinhole")) + camera.model = camera_model converted.append(camera) return converted diff --git a/tests/test_interface.py b/tests/test_interface.py index 27e7b65..49a3de5 100644 --- a/tests/test_interface.py +++ b/tests/test_interface.py @@ -52,7 +52,7 @@ def test_camera_structure_repr(): camera.T = [[1], [2], [3]] camera.width = 640 camera.height = 480 - camera.type = "pinhole" + camera.model = rpt.CameraModel.PINHOLE rendered = repr(camera) assert "Camera 1" in rendered @@ -100,6 +100,75 @@ def test_triangulate_repeatability(): np.testing.assert_allclose(first, second, rtol=1e-5, atol=1e-5) +def test_build_pair_candidates_exposes_cartesian_view_pairs(): + poses_2d, person_counts = rpt.pack_poses_2d( + [ + np.zeros((2, 2, 3), dtype=np.float32), + np.zeros((1, 2, 3), dtype=np.float32), + np.zeros((3, 2, 3), dtype=np.float32), + ], + joint_count=2, + ) + + pairs = rpt.build_pair_candidates(poses_2d, person_counts) + + assert len(pairs) == (2 * 1) + (2 * 3) + (1 * 3) + assert (pairs[0].view1, pairs[0].view2, pairs[0].person1, pairs[0].person2) == (0, 1, 0, 0) + assert pairs[-1].global_person2 == 5 + + +def test_triangulate_accepts_empty_previous_poses(): + poses_2d, person_counts, cameras = load_case("data/p1/sample.json", "tests/poses_p1.json") + roomparams = np.asarray([[5.6, 6.4, 2.4], [0.0, -0.5, 1.2]], dtype=np.float32) + empty_previous = np.zeros((0, len(JOINT_NAMES), 4), dtype=np.float32) + + baseline = rpt.triangulate_poses(poses_2d, person_counts, cameras, roomparams, JOINT_NAMES) + with_previous = rpt.triangulate_poses( + poses_2d, + person_counts, + cameras, + roomparams, + JOINT_NAMES, + empty_previous, + ) + + np.testing.assert_allclose(with_previous, baseline, rtol=1e-5, atol=1e-5) + + +def test_triangulate_debug_matches_final_output(): + poses_2d, person_counts, cameras = load_case("data/h1/sample.json", "tests/poses_h1.json") + roomparams = np.asarray([[4.8, 6.0, 2.0], [0.0, 0.0, 1.0]], dtype=np.float32) + + final_poses = rpt.triangulate_poses(poses_2d, person_counts, cameras, roomparams, JOINT_NAMES) + trace = rpt.triangulate_debug(poses_2d, person_counts, cameras, roomparams, JOINT_NAMES) + + np.testing.assert_allclose(trace.final_poses, final_poses, rtol=1e-5, atol=1e-5) + assert len(trace.pairs) >= len(trace.core_proposals) + for group in trace.grouping.groups: + assert all(0 <= index < len(trace.core_proposals) for index in group.proposal_indices) + for merge_indices in trace.merge.group_proposal_indices: + assert all(0 <= index < len(trace.core_proposals) for index in merge_indices) + + +def test_filter_pairs_with_previous_poses_returns_debug_matches(): + poses_2d, person_counts, cameras = load_case("data/p1/sample.json", "tests/poses_p1.json") + roomparams = np.asarray([[5.6, 6.4, 2.4], [0.0, -0.5, 1.2]], dtype=np.float32) + previous_poses = rpt.triangulate_poses(poses_2d, person_counts, cameras, roomparams, JOINT_NAMES) + + debug = rpt.filter_pairs_with_previous_poses( + poses_2d, + person_counts, + cameras, + JOINT_NAMES, + previous_poses, + ) + + assert debug.used_previous_poses is True + assert len(debug.matches) == len(rpt.build_pair_candidates(poses_2d, person_counts)) + assert len(debug.kept_pairs) == len(debug.kept_pair_indices) + assert any(match.matched_view1 or match.matched_view2 for match in debug.matches) + + def test_triangulate_does_not_mutate_inputs(): poses_2d, person_counts, cameras = load_case("data/h1/sample.json", "tests/poses_h1.json") roomparams = np.asarray([[4.8, 6.0, 2.0], [0.0, 0.0, 1.0]], dtype=np.float32)