diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a69096..5106171 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.18) project(RapidPoseTriangulation - VERSION 0.1.0 + VERSION 0.2.0 LANGUAGES CXX DESCRIPTION "Rapid Pose Triangulation library with Python bindings" ) diff --git a/bindings/rpt_module.cpp b/bindings/rpt_module.cpp index f1aba4d..bf94219 100644 --- a/bindings/rpt_module.cpp +++ b/bindings/rpt_module.cpp @@ -23,6 +23,8 @@ using CountArray = nb::ndarray, nb::c_c using TrackIdArray = nb::ndarray, nb::c_contig>; using PoseArray3DConst = nb::ndarray, nb::c_contig>; +using PoseArray3DByViewConst = + nb::ndarray, nb::c_contig>; using PoseArray3D = nb::ndarray, nb::c_contig>; using PoseArray2DOut = nb::ndarray, nb::c_contig>; @@ -59,6 +61,32 @@ PoseBatch3DView pose_batch3d_view_from_numpy(const PoseArray3DConst &poses_3d) }; } +PoseBatch3DByViewView pose_batch3d_by_view_from_numpy( + const PoseArray3DByViewConst &poses_3d, + const CountArray &person_counts) +{ + if (poses_3d.shape(0) != person_counts.shape(0)) + { + throw std::invalid_argument("poses_3d and person_counts must have the same number of views."); + } + + for (size_t i = 0; i < static_cast(person_counts.shape(0)); ++i) + { + if (person_counts(i) > poses_3d.shape(1)) + { + throw std::invalid_argument("person_counts entries must not exceed the padded person dimension."); + } + } + + return PoseBatch3DByViewView { + poses_3d.data(), + person_counts.data(), + static_cast(poses_3d.shape(0)), + static_cast(poses_3d.shape(1)), + static_cast(poses_3d.shape(2)), + }; +} + TrackedPoseBatch3DView tracked_pose_batch_view_from_numpy( const PoseArray3DConst &poses_3d, const TrackIdArray &track_ids) @@ -432,6 +460,24 @@ NB_MODULE(_core, m) "person_counts"_a, "config"_a); + m.def( + "merge_rgbd_views", + [](const PoseArray3DByViewConst &poses_3d, + const CountArray &person_counts, + const TriangulationConfig &config, + float max_distance) + { + const PoseBatch3D 
merged = merge_rgbd_views( + pose_batch3d_by_view_from_numpy(poses_3d, person_counts), + config, + max_distance); + return pose_batch_to_numpy(merged); + }, + "poses_3d"_a, + "person_counts"_a, + "config"_a, + "max_distance"_a = 0.5f); + m.def( "triangulate_with_report", [](const PoseArray2D &poses_2d, diff --git a/pyproject.toml b/pyproject.toml index c940f7e..94356a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build" [project] name = "rapid-pose-triangulation" -version = "0.1.0" +version = "0.2.0" description = "Rapid Pose Triangulation library with nanobind Python bindings" readme = "README.md" requires-python = ">=3.10" diff --git a/rpt/CMakeLists.txt b/rpt/CMakeLists.txt index 53820df..085c286 100644 --- a/rpt/CMakeLists.txt +++ b/rpt/CMakeLists.txt @@ -3,6 +3,7 @@ set(RPT_SOURCES camera.cpp interface.cpp + rgbd_merger.cpp triangulator.cpp ) diff --git a/rpt/interface.cpp b/rpt/interface.cpp index bc12869..38072f2 100644 --- a/rpt/interface.cpp +++ b/rpt/interface.cpp @@ -23,6 +23,17 @@ size_t pose3d_offset(size_t person, size_t joint, size_t coord, size_t num_joint { return (((person * num_joints) + joint) * 4) + coord; } + +size_t pose3d_by_view_offset( + size_t view, + size_t person, + size_t joint, + size_t coord, + size_t max_persons, + size_t num_joints) +{ + return ((((view * max_persons) + person) * num_joints) + joint) * 4 + coord; +} } // namespace // ================================================================================================= @@ -53,6 +64,11 @@ const float &TrackedPoseBatch3DView::at(size_t person, size_t joint, size_t coor return data[pose3d_offset(person, joint, coord, num_joints)]; } +const float &PoseBatch3DByViewView::at(size_t view, size_t person, size_t joint, size_t coord) const +{ + return data[pose3d_by_view_offset(view, person, joint, coord, max_persons, num_joints)]; +} + const float &PoseBatch2D::at(size_t view, size_t person, size_t joint, size_t coord) const 
{ return data[pose2d_offset(view, person, joint, coord, max_persons, num_joints)]; @@ -129,6 +145,27 @@ PoseBatch3DView PoseBatch3D::view() const return PoseBatch3DView {data.data(), num_persons, num_joints}; } +float &PoseBatch3DByView::at(size_t view, size_t person, size_t joint, size_t coord) +{ + return data[pose3d_by_view_offset(view, person, joint, coord, max_persons, num_joints)]; +} + +const float &PoseBatch3DByView::at(size_t view, size_t person, size_t joint, size_t coord) const +{ + return data[pose3d_by_view_offset(view, person, joint, coord, max_persons, num_joints)]; +} + +PoseBatch3DByViewView PoseBatch3DByView::view() const +{ + return PoseBatch3DByViewView { + data.data(), + person_counts.data(), + num_views, + max_persons, + num_joints, + }; +} + NestedPoses3D PoseBatch3D::to_nested() const { NestedPoses3D poses_3d(num_persons); diff --git a/rpt/interface.hpp b/rpt/interface.hpp index 422ee39..b674ba1 100644 --- a/rpt/interface.hpp +++ b/rpt/interface.hpp @@ -45,6 +45,17 @@ struct TrackedPoseBatch3DView const float &at(size_t person, size_t joint, size_t coord) const; }; +struct PoseBatch3DByViewView +{ + const float *data = nullptr; + const uint32_t *person_counts = nullptr; + size_t num_views = 0; + size_t max_persons = 0; + size_t num_joints = 0; + + const float &at(size_t view, size_t person, size_t joint, size_t coord) const; +}; + struct PoseBatch2D { std::vector data; @@ -74,6 +85,19 @@ struct PoseBatch3D static PoseBatch3D from_nested(const NestedPoses3D &poses_3d); }; +struct PoseBatch3DByView +{ + std::vector data; + std::vector person_counts; + size_t num_views = 0; + size_t max_persons = 0; + size_t num_joints = 0; + + float &at(size_t view, size_t person, size_t joint, size_t coord); + const float &at(size_t view, size_t person, size_t joint, size_t coord) const; + PoseBatch3DByViewView view() const; +}; + // ================================================================================================= struct PairCandidate @@ 
-242,6 +266,11 @@ PoseBatch3D triangulate_poses( const TriangulationConfig &config, const TriangulationOptions *options_override = nullptr); +PoseBatch3D merge_rgbd_views( + const PoseBatch3DByViewView &poses_3d, + const TriangulationConfig &config, + float max_distance = 0.5f); + TriangulationResult triangulate_with_report( const PoseBatch2DView &poses_2d, const TriangulationConfig &config, @@ -256,6 +285,14 @@ inline PoseBatch3D triangulate_poses( return triangulate_poses(poses_2d.view(), config, options_override); } +inline PoseBatch3D merge_rgbd_views( + const PoseBatch3DByView &poses_3d, + const TriangulationConfig &config, + float max_distance = 0.5f) +{ + return merge_rgbd_views(poses_3d.view(), config, max_distance); +} + inline TriangulationTrace triangulate_debug( const PoseBatch2D &poses_2d, const TriangulationConfig &config, diff --git a/rpt/rgbd_merger.cpp b/rpt/rgbd_merger.cpp new file mode 100644 index 0000000..3560b9b --- /dev/null +++ b/rpt/rgbd_merger.cpp @@ -0,0 +1,1327 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "interface.hpp" + +namespace +{ +using Pose3D = std::vector>; +using PoseList3D = std::vector; + +struct Track +{ + Pose3D skeleton; + std::vector last_detections; +}; + +std::vector solve_assignment_hungarian(const std::vector> &costs) +{ + const size_t rows = costs.size(); + const size_t cols = rows == 0 ? 
0 : costs[0].size(); + const size_t size = std::max(rows, cols); + if (size == 0) + { + return {}; + } + + const double inf = std::numeric_limits::infinity(); + std::vector> matrix(size + 1, std::vector(size + 1, 0.0)); + for (size_t row = 0; row < rows; ++row) + { + for (size_t col = 0; col < cols; ++col) + { + matrix[row + 1][col + 1] = costs[row][col]; + } + } + + std::vector u(size + 1, 0.0); + std::vector v(size + 1, 0.0); + std::vector p(size + 1, 0); + std::vector way(size + 1, 0); + + for (size_t row = 1; row <= rows; ++row) + { + p[0] = row; + size_t col0 = 0; + std::vector minv(size + 1, inf); + std::vector used(size + 1, false); + do + { + used[col0] = true; + const size_t row0 = p[col0]; + double delta = inf; + size_t col1 = 0; + for (size_t col = 1; col <= size; ++col) + { + if (used[col]) + { + continue; + } + const double current = matrix[row0][col] - u[row0] - v[col]; + if (current < minv[col]) + { + minv[col] = current; + way[col] = col0; + } + if (minv[col] < delta) + { + delta = minv[col]; + col1 = col; + } + } + for (size_t col = 0; col <= size; ++col) + { + if (used[col]) + { + u[p[col]] += delta; + v[col] -= delta; + } + else + { + minv[col] -= delta; + } + } + col0 = col1; + } while (p[col0] != 0); + + do + { + const size_t col1 = way[col0]; + p[col0] = p[col1]; + col0 = col1; + } while (col0 != 0); + } + + std::vector assignment(rows, -1); + for (size_t col = 1; col <= size; ++col) + { + const size_t row = p[col]; + if (row == 0 || row > rows || col > cols) + { + continue; + } + assignment[row - 1] = static_cast(col - 1); + } + return assignment; +} + +bool all_visible_joints_close( + const Pose3D &skel1, + const Pose3D &skel2, + float max_distance, + float vis_threshold) +{ + const float max_dist_sq = max_distance * max_distance; + bool any_visible = false; + for (size_t joint = 0; joint < skel1.size(); ++joint) + { + const bool visible1 = skel1[joint][3] > vis_threshold; + const bool visible2 = skel2[joint][3] > vis_threshold; + if 
(!visible1 || !visible2) + { + continue; + } + + any_visible = true; + const float dx = skel1[joint][0] - skel2[joint][0]; + const float dy = skel1[joint][1] - skel2[joint][1]; + const float dz = skel1[joint][2] - skel2[joint][2]; + const float distance_sq = dx * dx + dy * dy + dz * dz; + if (distance_sq > max_dist_sq) + { + return false; + } + } + return any_visible; +} + +void add_extra_joints(PoseList3D &poses, const std::vector &joint_names) +{ + const auto it_head = std::find(joint_names.begin(), joint_names.end(), "head"); + const auto it_ear_left = std::find(joint_names.begin(), joint_names.end(), "ear_left"); + const auto it_ear_right = std::find(joint_names.begin(), joint_names.end(), "ear_right"); + const auto it_nose = std::find(joint_names.begin(), joint_names.end(), "nose"); + + if (it_head == joint_names.end() || it_ear_left == joint_names.end() || + it_ear_right == joint_names.end() || it_nose == joint_names.end()) + { + return; + } + + const int idx_head = std::distance(joint_names.begin(), it_head); + const int idx_ear_left = std::distance(joint_names.begin(), it_ear_left); + const int idx_ear_right = std::distance(joint_names.begin(), it_ear_right); + const int idx_nose = std::distance(joint_names.begin(), it_nose); + + for (auto &pose : poses) + { + auto &joint_head = pose[static_cast(idx_head)]; + const auto &joint_ear_left = pose[static_cast(idx_ear_left)]; + const auto &joint_ear_right = pose[static_cast(idx_ear_right)]; + if (joint_ear_left[3] > 0.1f && joint_ear_right[3] > 0.1f) + { + joint_head[0] = (joint_ear_left[0] + joint_ear_right[0]) * 0.5f; + joint_head[1] = (joint_ear_left[1] + joint_ear_right[1]) * 0.5f; + joint_head[2] = (joint_ear_left[2] + joint_ear_right[2]) * 0.5f; + joint_head[3] = std::min(joint_ear_left[3], joint_ear_right[3]); + continue; + } + + const auto &joint_nose = pose[static_cast(idx_nose)]; + if (joint_nose[3] > 0.1f) + { + joint_head[0] = joint_nose[0]; + joint_head[1] = joint_nose[1]; + joint_head[2] = 
joint_nose[2]; + joint_head[3] = joint_nose[3]; + } + } +} + +void filter_poses(PoseList3D &poses, const std::array, 2> &roomparams) +{ + constexpr float min_score = 0.1f; + std::vector drop_indices; + drop_indices.reserve(poses.size()); + + for (size_t pose_index = 0; pose_index < poses.size(); ++pose_index) + { + auto &pose = poses[pose_index]; + std::vector valid_joint_idx; + valid_joint_idx.reserve(pose.size()); + for (size_t joint = 0; joint < pose.size(); ++joint) + { + if (pose[joint][3] > min_score) + { + valid_joint_idx.push_back(joint); + } + } + + if (valid_joint_idx.size() < 5) + { + drop_indices.push_back(pose_index); + continue; + } + + std::array mean = {0.0f, 0.0f, 0.0f}; + std::array mins = { + std::numeric_limits::max(), + std::numeric_limits::max(), + std::numeric_limits::max(), + }; + std::array maxs = { + std::numeric_limits::lowest(), + std::numeric_limits::lowest(), + std::numeric_limits::lowest(), + }; + + for (const size_t joint : valid_joint_idx) + { + for (size_t coord = 0; coord < 3; ++coord) + { + mins[coord] = std::min(mins[coord], pose[joint][coord]); + maxs[coord] = std::max(maxs[coord], pose[joint][coord]); + mean[coord] += pose[joint][coord]; + } + } + for (size_t coord = 0; coord < 3; ++coord) + { + mean[coord] /= static_cast(valid_joint_idx.size()); + } + + constexpr float max_size = 2.3f; + constexpr float min_size = 0.3f; + std::array diff = { + maxs[0] - mins[0], + maxs[1] - mins[1], + maxs[2] - mins[2], + }; + if (diff[0] > max_size || diff[1] > max_size || diff[2] > max_size) + { + drop_indices.push_back(pose_index); + continue; + } + if (diff[0] < min_size && diff[1] < min_size && diff[2] < min_size) + { + drop_indices.push_back(pose_index); + continue; + } + + constexpr float wdist = 0.1f; + const auto &room_size = roomparams[0]; + const auto &room_center = roomparams[1]; + const std::array room_half_size = { + room_size[0] / 2.0f, + room_size[1] / 2.0f, + room_size[2] / 2.0f, + }; + bool outside = false; + for (size_t 
coord = 0; coord < 3; ++coord) + { + if (mean[coord] > room_half_size[coord] + room_center[coord] || + mean[coord] < -room_half_size[coord] + room_center[coord]) + { + outside = true; + break; + } + } + for (size_t coord = 0; coord < 3; ++coord) + { + if (maxs[coord] > room_half_size[coord] + room_center[coord] + wdist || + mins[coord] < -room_half_size[coord] + room_center[coord] - wdist) + { + outside = true; + break; + } + } + if (outside) + { + drop_indices.push_back(pose_index); + } + } + + for (const size_t pose_index : drop_indices) + { + for (auto &joint : poses[pose_index]) + { + joint[3] = 0.001f; + } + } +} + +void add_missing_joints(PoseList3D &poses, const std::vector &joint_names, float vis_threshold) +{ + std::unordered_map joint_name_to_idx; + joint_name_to_idx.reserve(joint_names.size()); + for (size_t idx = 0; idx < joint_names.size(); ++idx) + { + joint_name_to_idx[joint_names[idx]] = idx; + } + + std::unordered_map> adjacents = { + {"hip_right", {"hip_middle", "hip_left"}}, + {"hip_left", {"hip_middle", "hip_right"}}, + {"knee_right", {"hip_right", "knee_left"}}, + {"knee_left", {"hip_left", "knee_right"}}, + {"ankle_right", {"knee_right", "ankle_left"}}, + {"ankle_left", {"knee_left", "ankle_right"}}, + {"shoulder_right", {"shoulder_middle", "shoulder_left"}}, + {"shoulder_left", {"shoulder_middle", "shoulder_right"}}, + {"elbow_right", {"shoulder_right", "hip_right"}}, + {"elbow_left", {"shoulder_left", "hip_left"}}, + {"wrist_right", {"elbow_right"}}, + {"wrist_left", {"elbow_left"}}, + {"nose", {"shoulder_middle", "shoulder_right", "shoulder_left"}}, + {"head", {"shoulder_middle", "shoulder_right", "shoulder_left"}}, + {"foot_*_left_*", {"ankle_left"}}, + {"foot_*_right_*", {"ankle_right"}}, + {"face_*", {"nose"}}, + {"hand_*_left_*", {"wrist_left"}}, + {"hand_*_right_*", {"wrist_right"}}, + }; + + for (auto &pose : poses) + { + std::vector valid_joint_idx; + valid_joint_idx.reserve(pose.size()); + for (size_t joint = 0; joint < pose.size(); 
++joint) + { + if (pose[joint][3] > vis_threshold) + { + valid_joint_idx.push_back(joint); + } + } + if (valid_joint_idx.empty()) + { + continue; + } + + std::array body_center = {0.0f, 0.0f, 0.0f}; + for (const size_t joint : valid_joint_idx) + { + body_center[0] += pose[joint][0]; + body_center[1] += pose[joint][1]; + body_center[2] += pose[joint][2]; + } + for (size_t coord = 0; coord < 3; ++coord) + { + body_center[coord] /= static_cast(valid_joint_idx.size()); + } + + for (size_t joint = 0; joint < joint_names.size(); ++joint) + { + std::string adjacent_name; + const std::string &joint_name = joint_names[joint]; + if (joint_name.starts_with("foot_") && joint_name.find("_left") != std::string::npos) + { + adjacent_name = "foot_*_left_*"; + } + else if ( + joint_name.starts_with("foot_") && + joint_name.find("_right") != std::string::npos) + { + adjacent_name = "foot_*_right_*"; + } + else if (joint_name.starts_with("face_")) + { + adjacent_name = "face_*"; + } + else if (joint_name.starts_with("hand_") && joint_name.find("_left") != std::string::npos) + { + adjacent_name = "hand_*_left_*"; + } + else if ( + joint_name.starts_with("hand_") && + joint_name.find("_right") != std::string::npos) + { + adjacent_name = "hand_*_right_*"; + } + else if (adjacents.contains(joint_name)) + { + adjacent_name = joint_name; + } + + if (adjacent_name.empty()) + { + continue; + } + + auto &joint_entry = pose[joint]; + if (joint_entry[3] != 0.0f) + { + continue; + } + + std::array adjacent_position = body_center; + auto adjacent_iter = adjacents.find(adjacent_name); + if (adjacent_iter != adjacents.end()) + { + adjacent_position = {0.0f, 0.0f, 0.0f}; + size_t adjacent_count = 0; + for (const std::string &adjacent_joint_name : adjacent_iter->second) + { + const auto mapped = joint_name_to_idx.find(adjacent_joint_name); + if (mapped == joint_name_to_idx.end()) + { + continue; + } + + const auto &adjacent_joint = pose[mapped->second]; + if (adjacent_joint[3] <= vis_threshold) + { + 
continue; + } + + adjacent_position[0] += adjacent_joint[0]; + adjacent_position[1] += adjacent_joint[1]; + adjacent_position[2] += adjacent_joint[2]; + ++adjacent_count; + } + if (adjacent_count > 0) + { + for (size_t coord = 0; coord < 3; ++coord) + { + adjacent_position[coord] /= static_cast(adjacent_count); + } + } + else + { + adjacent_position = body_center; + } + } + + joint_entry[0] = adjacent_position[0]; + joint_entry[1] = adjacent_position[1]; + joint_entry[2] = adjacent_position[2]; + joint_entry[3] = 0.1f; + } + } +} + +void replace_far_joints( + PoseList3D &poses, + const std::vector &joint_names, + float min_match_score) +{ + for (auto &pose : poses) + { + std::array center_head = {0.0f, 0.0f, 0.0f, 0.0f}; + std::array center_foot_left = {0.0f, 0.0f, 0.0f, 0.0f}; + std::array center_foot_right = {0.0f, 0.0f, 0.0f, 0.0f}; + std::array center_hand_left = {0.0f, 0.0f, 0.0f, 0.0f}; + std::array center_hand_right = {0.0f, 0.0f, 0.0f, 0.0f}; + + for (size_t joint = 0; joint < pose.size(); ++joint) + { + const std::string &joint_name = joint_names[joint]; + const float offset = (1.0f - min_match_score) * 2.0f; + const float min_score = min_match_score - offset; + if (pose[joint][3] <= min_score) + { + continue; + } + + if ( + joint_name.starts_with("face_") || joint_name == "nose" || joint_name == "eye_left" || + joint_name == "eye_right" || joint_name == "ear_left" || + joint_name == "ear_right") + { + center_head[0] += pose[joint][0]; + center_head[1] += pose[joint][1]; + center_head[2] += pose[joint][2]; + center_head[3] += 1.0f; + } + else if (joint_name.starts_with("foot_") || joint_name.starts_with("ankle_")) + { + if (joint_name.find("_left") != std::string::npos) + { + center_foot_left[0] += pose[joint][0]; + center_foot_left[1] += pose[joint][1]; + center_foot_left[2] += pose[joint][2]; + center_foot_left[3] += 1.0f; + } + else if (joint_name.find("_right") != std::string::npos) + { + center_foot_right[0] += pose[joint][0]; + center_foot_right[1] 
+= pose[joint][1]; + center_foot_right[2] += pose[joint][2]; + center_foot_right[3] += 1.0f; + } + } + else if (joint_name.starts_with("hand_") || joint_name.starts_with("wrist_")) + { + if (joint_name.find("_left") != std::string::npos) + { + center_hand_left[0] += pose[joint][0]; + center_hand_left[1] += pose[joint][1]; + center_hand_left[2] += pose[joint][2]; + center_hand_left[3] += 1.0f; + } + else if (joint_name.find("_right") != std::string::npos) + { + center_hand_right[0] += pose[joint][0]; + center_hand_right[1] += pose[joint][1]; + center_hand_right[2] += pose[joint][2]; + center_hand_right[3] += 1.0f; + } + } + } + + for (size_t coord = 0; coord < 3; ++coord) + { + if (center_head[3] > 0.0f) + { + center_head[coord] /= center_head[3]; + } + if (center_foot_left[3] > 0.0f) + { + center_foot_left[coord] /= center_foot_left[3]; + } + if (center_foot_right[3] > 0.0f) + { + center_foot_right[coord] /= center_foot_right[3]; + } + if (center_hand_left[3] > 0.0f) + { + center_hand_left[coord] /= center_hand_left[3]; + } + if (center_hand_right[3] > 0.0f) + { + center_hand_right[coord] /= center_hand_right[3]; + } + } + + constexpr float max_dist_head_sq = 0.20f * 0.20f; + constexpr float max_dist_foot_sq = 0.25f * 0.25f; + constexpr float max_dist_hand_sq = 0.20f * 0.20f; + + for (size_t joint = 0; joint < pose.size(); ++joint) + { + const std::string &joint_name = joint_names[joint]; + std::array center = {0.0f, 0.0f, 0.0f, 0.0f}; + float max_dist_sq = 0.0f; + if ( + joint_name.starts_with("face_") || joint_name == "nose" || joint_name == "eye_left" || + joint_name == "eye_right" || joint_name == "ear_left" || + joint_name == "ear_right") + { + center = center_head; + max_dist_sq = max_dist_head_sq; + } + else if (joint_name.starts_with("foot_") || joint_name.starts_with("ankle_")) + { + center = joint_name.find("_left") != std::string::npos ? 
center_foot_left : center_foot_right; + max_dist_sq = max_dist_foot_sq; + } + else if (joint_name.starts_with("hand_") || joint_name.starts_with("wrist_")) + { + center = joint_name.find("_left") != std::string::npos ? center_hand_left : center_hand_right; + max_dist_sq = max_dist_hand_sq; + } + else + { + continue; + } + + if (center[3] <= 0.0f) + { + continue; + } + + const float dx = pose[joint][0] - center[0]; + const float dy = pose[joint][1] - center[1]; + const float dz = pose[joint][2] - center[2]; + const float dist_sq = dx * dx + dy * dy + dz * dz; + if ((pose[joint][3] > 0.0f && dist_sq > max_dist_sq) || pose[joint][3] == 0.0f) + { + pose[joint][0] = center[0]; + pose[joint][1] = center[1]; + pose[joint][2] = center[2]; + pose[joint][3] = 0.1f; + } + } + } +} + +class RgbdViewMerger +{ +public: + RgbdViewMerger(const std::vector &joint_names_in, size_t num_views_in, float max_distance_in) + : joint_names(joint_names_in), + num_views(static_cast(num_views_in)), + max_distance(max_distance_in) + { + neighbor_joints = { + {"chest", { + "shoulder_left", + "shoulder_right", + "shoulder_middle", + "hip_left", + "hip_right", + "hip_middle", + }}, + {"head", { + "nose", + "eye_left", + "eye_right", + "ear_left", + "ear_right", + "shoulder_left", + "shoulder_right", + "shoulder_middle", + "neck", + }}, + {"nose", { + "eye_left", + "eye_right", + "ear_left", + "ear_right", + "shoulder_left", + "shoulder_right", + "shoulder_middle", + "neck", + }}, + {"eye_left", { + "nose", + "eye_right", + "ear_left", + "ear_right", + "shoulder_left", + "shoulder_middle", + }}, + {"eye_right", { + "nose", + "eye_left", + "ear_left", + "ear_right", + "shoulder_right", + "shoulder_middle", + }}, + {"ear_left", { + "nose", + "ear_right", + "eye_left", + "eye_right", + "shoulder_left", + "shoulder_middle", + }}, + {"ear_right", { + "nose", + "ear_left", + "eye_left", + "eye_right", + "shoulder_right", + "shoulder_middle", + }}, + {"shoulder_left", { + "nose", + "eye_left", + 
"ear_left", + "elbow_left", + "hip_left", + "shoulder_middle", + "neck", + }}, + {"shoulder_right", { + "nose", + "eye_right", + "ear_right", + "elbow_right", + "hip_right", + "shoulder_middle", + "neck", + }}, + {"shoulder_middle", { + "nose", + "eye_left", + "eye_right", + "ear_left", + "ear_right", + "elbow_left", + "elbow_right", + "hip_middle", + "shoulder_left", + "shoulder_right", + "neck", + }}, + {"neck", { + "nose", + "eye_left", + "eye_right", + "shoulder_left", + "shoulder_right", + "shoulder_middle", + }}, + {"elbow_left", { + "shoulder_left", + "wrist_left", + "shoulder_middle", + "neck", + }}, + {"elbow_right", { + "shoulder_right", + "wrist_right", + "shoulder_middle", + "neck", + }}, + {"wrist_left", { + "elbow_left", + }}, + {"wrist_right", { + "elbow_right", + }}, + {"hip_left", { + "hip_right", + "knee_left", + "shoulder_left", + "hip_middle", + }}, + {"hip_right", { + "hip_left", + "knee_right", + "shoulder_right", + "hip_middle", + }}, + {"hip_middle", { + "hip_left", + "hip_right", + "knee_right", + "knee_left", + "shoulder_middle", + "neck", + }}, + {"knee_left", { + "hip_left", + "hip_middle", + "hip_right", + "ankle_left", + }}, + {"knee_right", { + "hip_right", + "hip_middle", + "hip_left", + "ankle_right", + }}, + {"ankle_left", { + "knee_left", + }}, + {"ankle_right", { + "knee_right", + }}, + }; + + for (const auto &entry : neighbor_joints) + { + std::vector ids; + ids.reserve(entry.second.size()); + for (const auto &joint_name : entry.second) + { + const auto it = std::find(joint_names.begin(), joint_names.end(), joint_name); + if (it != joint_names.end()) + { + ids.push_back(static_cast(std::distance(joint_names.begin(), it))); + } + } + neighbor_joints_ids[entry.first] = std::move(ids); + } + } + + void consume_view(const PoseList3D &view_poses) + { + if (view_poses.empty()) + { + return; + } + + const PoseList3D filtered_detections = drop_outlier_joints(view_poses); + auto [matches, new_detection_indices, outdated_track_indices] = 
data_association(filtered_detections); + (void)outdated_track_indices; + + for (const auto &[track_index, detection_index] : matches) + { + auto &track = tracks[track_index]; + while (track.last_detections.size() >= static_cast(num_views)) + { + track.last_detections.erase(track.last_detections.begin()); + } + track.last_detections.push_back(filtered_detections[detection_index]); + } + + for (auto &track : tracks) + { + update_track_position(track); + } + + for (const size_t detection_index : new_detection_indices) + { + Track track; + track.skeleton = filtered_detections[detection_index]; + track.last_detections.push_back(filtered_detections[detection_index]); + tracks.push_back(std::move(track)); + } + + merge_tracks(merge_distance); + for (auto &track : tracks) + { + update_track_position(track); + } + merge_tracks(merge_distance); + for (auto &track : tracks) + { + update_track_position(track); + } + } + + PoseList3D finalize() const + { + PoseList3D poses; + poses.reserve(tracks.size()); + for (const auto &track : tracks) + { + size_t visible_joints = 0; + for (const auto &joint : track.skeleton) + { + if (joint[3] > vis_threshold) + { + ++visible_joints; + } + } + if (visible_joints >= min_num_kpts) + { + poses.push_back(track.skeleton); + } + } + return poses; + } + +private: + std::array neighbor_center(const Pose3D &skeleton, const std::string &joint_name) const + { + std::array mean = {0.0f, 0.0f, 0.0f, 0.0f}; + const auto neighbor_iter = neighbor_joints_ids.find(joint_name); + if (neighbor_iter == neighbor_joints_ids.end()) + { + return mean; + } + + const auto &neighbor_ids = neighbor_iter->second; + size_t count = 0; + for (const size_t idx : neighbor_ids) + { + const auto &joint = skeleton[idx]; + if (joint[3] > vis_threshold) + { + mean[0] += joint[0]; + mean[1] += joint[1]; + mean[2] += joint[2]; + mean[3] += joint[3]; + ++count; + } + } + if (count > 0) + { + const float inv = 1.0f / static_cast(count); + mean[0] *= inv; + mean[1] *= inv; + mean[2] 
*= inv; + mean[3] *= inv; + } + return mean; + } + + std::array centroid(const Pose3D &skeleton) const + { + const auto it = std::find(joint_names.begin(), joint_names.end(), "chest"); + if (it != joint_names.end()) + { + const size_t chest_index = static_cast(std::distance(joint_names.begin(), it)); + if (skeleton[chest_index][3] > 0.0f) + { + return {skeleton[chest_index][0], skeleton[chest_index][1], skeleton[chest_index][2]}; + } + } + const auto center = neighbor_center(skeleton, "chest"); + return {center[0], center[1], center[2]}; + } + + PoseList3D drop_outlier_joints(const PoseList3D &detections) const + { + PoseList3D filtered; + filtered.reserve(detections.size()); + for (Pose3D pose : detections) + { + std::array hip_center = neighbor_center(pose, "hip_middle"); + if (hip_center[3] > 0.0f) + { + const float max_dist_sq = max_distance * 3.5f * max_distance * 3.5f; + for (auto &joint : pose) + { + const float dx = joint[0] - hip_center[0]; + const float dy = joint[1] - hip_center[1]; + const float dz = joint[2] - hip_center[2]; + const float distance_sq = dx * dx + dy * dy + dz * dz; + if (distance_sq > max_dist_sq) + { + joint[3] = 0.0f; + } + } + } + + hip_center = neighbor_center(pose, "hip_middle"); + if (hip_center[3] > 0.0f) + { + const float max_dist_sq = max_distance * 2.5f * max_distance * 2.5f; + for (auto &joint : pose) + { + const float dx = joint[0] - hip_center[0]; + const float dy = joint[1] - hip_center[1]; + const float dz = joint[2] - hip_center[2]; + const float distance_sq = dx * dx + dy * dy + dz * dz; + if (distance_sq > max_dist_sq) + { + joint[3] = 0.0f; + } + } + } + + const float max_dist_sq = max_distance * max_distance; + for (size_t joint_index = 0; joint_index < pose.size(); ++joint_index) + { + const auto center = neighbor_center(pose, joint_names[joint_index]); + if (center[3] <= 0.0f) + { + continue; + } + const float dx = pose[joint_index][0] - center[0]; + const float dy = pose[joint_index][1] - center[1]; + const float 
dz = pose[joint_index][2] - center[2]; + const float distance_sq = dx * dx + dy * dy + dz * dz; + if (distance_sq > max_dist_sq) + { + pose[joint_index][3] = 0.0f; + } + } + + float sum_scores = 0.0f; + for (const auto &joint : pose) + { + sum_scores += joint[3]; + } + if (sum_scores > 0.0f) + { + filtered.push_back(std::move(pose)); + } + } + return filtered; + } + + std::tuple>, std::vector, std::vector> + data_association(const PoseList3D &new_poses) const + { + std::vector> matches; + std::vector new_detection_indices; + std::vector outdated_track_indices; + if (tracks.empty()) + { + new_detection_indices.resize(new_poses.size()); + std::iota(new_detection_indices.begin(), new_detection_indices.end(), 0); + return {matches, new_detection_indices, outdated_track_indices}; + } + + std::vector> track_centroids(tracks.size()); + for (size_t track_index = 0; track_index < tracks.size(); ++track_index) + { + track_centroids[track_index] = centroid(tracks[track_index].skeleton); + } + + std::vector> detection_centroids(new_poses.size()); + for (size_t detection_index = 0; detection_index < new_poses.size(); ++detection_index) + { + detection_centroids[detection_index] = centroid(new_poses[detection_index]); + } + + std::vector> cost_matrix( + tracks.size(), + std::vector(new_poses.size(), 0.0)); + for (size_t track_index = 0; track_index < tracks.size(); ++track_index) + { + for (size_t detection_index = 0; detection_index < new_poses.size(); ++detection_index) + { + const float dx = + track_centroids[track_index][0] - detection_centroids[detection_index][0]; + const float dy = + track_centroids[track_index][1] - detection_centroids[detection_index][1]; + const float dz = + track_centroids[track_index][2] - detection_centroids[detection_index][2]; + cost_matrix[track_index][detection_index] = std::sqrt(dx * dx + dy * dy + dz * dz); + } + } + + const std::vector assignment = solve_assignment_hungarian(cost_matrix); + for (size_t track_index = 0; track_index < 
assignment.size(); ++track_index) + { + const int detection_index = assignment[track_index]; + if (detection_index < 0) + { + continue; + } + const double cost = cost_matrix[track_index][static_cast(detection_index)]; + if (cost < max_distance) + { + matches.emplace_back(track_index, static_cast(detection_index)); + } + } + + std::unordered_set matched_tracks; + std::unordered_set matched_detections; + for (const auto &[track_index, detection_index] : matches) + { + matched_tracks.insert(track_index); + matched_detections.insert(detection_index); + } + for (size_t detection_index = 0; detection_index < new_poses.size(); ++detection_index) + { + if (!matched_detections.contains(detection_index)) + { + new_detection_indices.push_back(detection_index); + } + } + for (size_t track_index = 0; track_index < tracks.size(); ++track_index) + { + if (!matched_tracks.contains(track_index)) + { + outdated_track_indices.push_back(track_index); + } + } + return {matches, new_detection_indices, outdated_track_indices}; + } + + void update_track_position(Track &track) const + { + const Pose3D ¤t_pose = track.skeleton; + const size_t num_joints = current_pose.size(); + const size_t num_detections = track.last_detections.size(); + + std::vector> neighbor_centers(num_joints, {0.0f, 0.0f, 0.0f}); + for (size_t joint_index = 0; joint_index < num_joints; ++joint_index) + { + const auto neighbor_iter = neighbor_joints_ids.find(joint_names[joint_index]); + if (neighbor_iter == neighbor_joints_ids.end()) + { + continue; + } + + const auto &neighbor_ids = neighbor_iter->second; + size_t count = 0; + for (const size_t neighbor_index : neighbor_ids) + { + const auto &neighbor_joint = current_pose[neighbor_index]; + if (neighbor_joint[3] > vis_threshold) + { + neighbor_centers[joint_index][0] += neighbor_joint[0]; + neighbor_centers[joint_index][1] += neighbor_joint[1]; + neighbor_centers[joint_index][2] += neighbor_joint[2]; + ++count; + } + } + if (count > 0) + { + const float inv = 1.0f / 
static_cast(count); + neighbor_centers[joint_index][0] *= inv; + neighbor_centers[joint_index][1] *= inv; + neighbor_centers[joint_index][2] *= inv; + } + } + + Pose3D new_pose(num_joints, {0.0f, 0.0f, 0.0f, 0.0f}); + const float max_dist_sq = max_distance * max_distance; + for (size_t joint_index = 0; joint_index < num_joints; ++joint_index) + { + std::array mean = {0.0f, 0.0f, 0.0f, 0.0f}; + size_t count = 0; + for (size_t detection_index = 0; detection_index < num_detections; ++detection_index) + { + const auto &joint = track.last_detections[detection_index][joint_index]; + if (joint[3] <= 0.0f) + { + continue; + } + const float dx = joint[0] - neighbor_centers[joint_index][0]; + const float dy = joint[1] - neighbor_centers[joint_index][1]; + const float dz = joint[2] - neighbor_centers[joint_index][2]; + const float distance_sq = dx * dx + dy * dy + dz * dz; + if (distance_sq <= max_dist_sq) + { + mean[0] += joint[0]; + mean[1] += joint[1]; + mean[2] += joint[2]; + mean[3] += joint[3]; + ++count; + } + } + if (count > 0) + { + const float inv = 1.0f / static_cast(count); + new_pose[joint_index][0] = mean[0] * inv; + new_pose[joint_index][1] = mean[1] * inv; + new_pose[joint_index][2] = mean[2] * inv; + new_pose[joint_index][3] = mean[3] * inv; + } + else + { + const float factor = (num_views - 1.0f) / num_views; + new_pose[joint_index] = current_pose[joint_index]; + new_pose[joint_index][3] *= factor; + } + } + + constexpr int topk = 3; + for (size_t joint_index = 0; joint_index < num_joints; ++joint_index) + { + std::vector> valid_detections; + valid_detections.reserve(num_detections); + for (size_t detection_index = 0; detection_index < num_detections; ++detection_index) + { + const auto &joint = track.last_detections[detection_index][joint_index]; + if (joint[3] <= 0.0f) + { + continue; + } + const float dx = joint[0] - new_pose[joint_index][0]; + const float dy = joint[1] - new_pose[joint_index][1]; + const float dz = joint[2] - new_pose[joint_index][2]; + 
const float distance_sq = dx * dx + dy * dy + dz * dz; + if (distance_sq <= max_dist_sq) + { + valid_detections.emplace_back(distance_sq, detection_index); + } + } + const size_t num_to_drop = std::max( + static_cast<int>(valid_detections.size()) - topk, + 0); + if (num_to_drop == 0) + { + continue; + } + + std::sort(valid_detections.begin(), valid_detections.end()); + valid_detections.resize(valid_detections.size() - num_to_drop); + std::array<float, 4> mean = {0.0f, 0.0f, 0.0f, 0.0f}; + for (const auto &[distance_sq, detection_index] : valid_detections) + { + (void)distance_sq; + const auto &joint = track.last_detections[detection_index][joint_index]; + mean[0] += joint[0]; + mean[1] += joint[1]; + mean[2] += joint[2]; + mean[3] += joint[3]; + } + const float inv = 1.0f / static_cast<float>(valid_detections.size()); + new_pose[joint_index][0] = mean[0] * inv; + new_pose[joint_index][1] = mean[1] * inv; + new_pose[joint_index][2] = mean[2] * inv; + new_pose[joint_index][3] = mean[3] * inv; + } + + track.skeleton = std::move(new_pose); + } + + void merge_tracks(float distance_threshold) + { + const size_t count = tracks.size(); + std::unordered_set<size_t> merged_indices; + std::vector<Track> merged_tracks; + std::vector<Pose3D> skeletons(count); + for (size_t idx = 0; idx < count; ++idx) + { + skeletons[idx] = tracks[idx].skeleton; + } + + for (size_t first = 0; first < count; ++first) + { + if (merged_indices.contains(first)) + { + continue; + } + + merged_tracks.push_back(tracks[first]); + const auto &current_skeleton = skeletons[first]; + for (size_t second = first + 1; second < count; ++second) + { + if (merged_indices.contains(second)) + { + continue; + } + const bool close = all_visible_joints_close( + current_skeleton, + skeletons[second], + distance_threshold, + vis_threshold); + if (!close) + { + continue; + } + + auto &merged_track = merged_tracks.back(); + const size_t num_last = merged_track.last_detections.size(); + const size_t num_current = tracks[second].last_detections.size(); + for (size_t joint 
= 0; joint < current_skeleton.size(); ++joint) + { + const auto &joint1 = current_skeleton[joint]; + const auto &joint2 = skeletons[second][joint]; + const float total_score = joint1[3] + joint2[3]; + if (total_score <= 0.0f) + { + continue; + } + const float inv_score = 1.0f / total_score; + const float inv_count = 1.0f / static_cast<float>(num_last + num_current); + float w1 = (joint1[3] * inv_score) * (static_cast<float>(num_last) * inv_count); + float w2 = (joint2[3] * inv_score) * (static_cast<float>(num_current) * inv_count); + const float inv_weight = 1.0f / (w1 + w2); + w1 *= inv_weight; + w2 *= inv_weight; + + merged_track.skeleton[joint][0] = joint1[0] * w1 + joint2[0] * w2; + merged_track.skeleton[joint][1] = joint1[1] * w1 + joint2[1] * w2; + merged_track.skeleton[joint][2] = joint1[2] * w1 + joint2[2] * w2; + merged_track.skeleton[joint][3] = joint1[3] * w1 + joint2[3] * w2; + } + merged_track.last_detections.insert( + merged_track.last_detections.end(), + tracks[second].last_detections.begin(), + tracks[second].last_detections.end()); + merged_indices.insert(second); + } + } + + tracks = std::move(merged_tracks); + } + + std::vector<std::string> joint_names; + float num_views = 1.0f; + float max_distance = 0.5f; + const size_t min_num_kpts = 7; + const float merge_distance = 0.3f; + const float vis_threshold = 0.1f; + std::unordered_map<std::string, std::vector<std::string>> neighbor_joints; + std::unordered_map<size_t, std::vector<size_t>> neighbor_joints_ids; + std::vector<Track> tracks; +}; + +PoseList3D view_to_nested(const PoseBatch3DByViewView &poses_3d, size_t view_index) +{ + PoseList3D view_poses; + view_poses.reserve(poses_3d.person_counts[view_index]); + for (size_t person = 0; person < poses_3d.person_counts[view_index]; ++person) + { + Pose3D pose(poses_3d.num_joints, {0.0f, 0.0f, 0.0f, 0.0f}); + for (size_t joint = 0; joint < poses_3d.num_joints; ++joint) + { + for (size_t coord = 0; coord < 4; ++coord) + { + pose[joint][coord] = poses_3d.at(view_index, person, joint, coord); + } + } + view_poses.push_back(std::move(pose)); + } + return 
view_poses; +} +} // namespace + +PoseBatch3D merge_rgbd_views( + const PoseBatch3DByViewView &poses_3d, + const TriangulationConfig &config, + float max_distance) +{ + if (poses_3d.person_counts == nullptr) + { + throw std::invalid_argument("person_counts must not be null."); + } + if (poses_3d.num_views == 0) + { + throw std::invalid_argument("No 3D views provided."); + } + if (poses_3d.num_views != config.cameras.size()) + { + throw std::invalid_argument("Number of cameras and 3D views must be the same."); + } + if (poses_3d.num_joints != config.joint_names.size()) + { + throw std::invalid_argument("Number of joint names and 3D poses must be the same."); + } + if (poses_3d.max_persons > 0 && poses_3d.num_joints > 0 && poses_3d.data == nullptr) + { + throw std::invalid_argument("poses_3d data must not be null."); + } + for (size_t view = 0; view < poses_3d.num_views; ++view) + { + if (poses_3d.person_counts[view] > poses_3d.max_persons) + { + throw std::invalid_argument( + "person_counts entries must not exceed the padded person dimension."); + } + } + + RgbdViewMerger merger(config.joint_names, poses_3d.num_views, max_distance); + for (size_t view = 0; view < poses_3d.num_views; ++view) + { + merger.consume_view(view_to_nested(poses_3d, view)); + } + + PoseList3D merged = merger.finalize(); + add_extra_joints(merged, config.joint_names); + filter_poses(merged, config.roomparams); + add_missing_joints(merged, config.joint_names, 0.1f); + replace_far_joints(merged, config.joint_names, config.options.min_match_score); + return PoseBatch3D::from_nested(merged); +} diff --git a/src/rpt/__init__.py b/src/rpt/__init__.py index 82975bb..891fd51 100644 --- a/src/rpt/__init__.py +++ b/src/rpt/__init__.py @@ -24,6 +24,7 @@ from ._core import ( build_pair_candidates as _build_pair_candidates, filter_pairs_with_previous_poses as _filter_pairs_with_previous_poses, make_camera as _make_camera, + merge_rgbd_views as _merge_rgbd_views, triangulate_debug as _triangulate_debug, 
triangulate_poses as _triangulate_poses, triangulate_with_report as _triangulate_with_report, @@ -33,10 +34,11 @@ if TYPE_CHECKING: import numpy as np import numpy.typing as npt - from ._helpers import CameraLike, CameraModelLike, Matrix3x3Like, PoseViewLike, VectorLike + from ._helpers import CameraLike, CameraModelLike, DepthImageLike, Matrix3x3Like, PoseViewLike, VectorLike PoseArray2D = npt.NDArray[np.float32] PoseArray3D = npt.NDArray[np.float32] + PoseArray3DByView = npt.NDArray[np.float32] PersonCountArray = npt.NDArray[np.uint32] TrackIdArray = npt.NDArray[np.int64] @@ -103,6 +105,42 @@ def pack_poses_2d( return _pack_poses_2d(views, joint_count=joint_count) +def sample_depth_for_poses( + poses_2d: "PoseArray2D", + person_counts: "PersonCountArray", + depth_images: "Sequence[DepthImageLike]", + *, + window_size: int = 7, +) -> "PoseArray3D": + """Sample aligned depth for visible 2D joints and return `[u, v, d, score]` rows.""" + + from ._helpers import sample_depth_for_poses as _sample_depth_for_poses + + return _sample_depth_for_poses(poses_2d, person_counts, depth_images, window_size=window_size) + + +def apply_depth_offsets( + poses_uvd: "PoseArray3D", + joint_names: "Sequence[str]", +) -> "PoseArray3D": + """Apply the SimpleDepthPose per-joint depth offsets to `[u, v, d, score]` rows.""" + + from ._helpers import apply_depth_offsets as _apply_depth_offsets + + return _apply_depth_offsets(poses_uvd, joint_names) + + +def lift_depth_poses_to_world( + poses_uvd: "PoseArray3D", + cameras: "Sequence[CameraLike]", +) -> "PoseArray3DByView": + """Lift `[u, v, d, score]` joints into world-space `[x, y, z, score]` poses.""" + + from ._helpers import lift_depth_poses_to_world as _lift_depth_poses_to_world + + return _lift_depth_poses_to_world(poses_uvd, cameras) + + def make_triangulation_config( cameras: "Sequence[CameraLike]", roomparams: "npt.NDArray[np.generic] | Sequence[Sequence[float]]", @@ -172,6 +210,36 @@ def triangulate_poses( return 
_triangulate_poses(poses_2d, person_counts, config) +def merge_rgbd_views( + poses_3d: "PoseArray3DByView", + person_counts: "PersonCountArray", + config: TriangulationConfig, + *, + max_distance: float = 0.5, +) -> "PoseArray3D": + """Merge per-view world-space RGBD pose proposals into final 3D poses.""" + return _merge_rgbd_views(poses_3d, person_counts, config, float(max_distance)) + + +def reconstruct_rgbd( + poses_2d: "PoseArray2D", + person_counts: "PersonCountArray", + depth_images: "Sequence[DepthImageLike]", + config: TriangulationConfig, + *, + use_depth_offsets: bool = True, + window_size: int = 7, + max_distance: float = 0.5, +) -> "PoseArray3D": + """Reconstruct per-frame RGBD poses from calibrated detections and aligned depth images.""" + + poses_uvd = sample_depth_for_poses(poses_2d, person_counts, depth_images, window_size=window_size) + if use_depth_offsets: + poses_uvd = apply_depth_offsets(poses_uvd, config.joint_names) + poses_3d = lift_depth_poses_to_world(poses_uvd, config.cameras) + return merge_rgbd_views(poses_3d, person_counts, config, max_distance=max_distance) + + def triangulate_with_report( poses_2d: "PoseArray2D", person_counts: "PersonCountArray", @@ -200,6 +268,7 @@ __all__ = [ "CameraModel", "AssociationReport", "AssociationStatus", + "apply_depth_offsets", "FinalPoseAssociationDebug", "TriangulationConfig", "TriangulationOptions", @@ -216,9 +285,13 @@ __all__ = [ "build_pair_candidates", "convert_cameras", "filter_pairs_with_previous_poses", + "lift_depth_poses_to_world", "make_camera", "make_triangulation_config", + "merge_rgbd_views", "pack_poses_2d", + "reconstruct_rgbd", + "sample_depth_for_poses", "triangulate_debug", "triangulate_poses", "triangulate_with_report", diff --git a/src/rpt/__init__.pyi b/src/rpt/__init__.pyi index 16e2968..e1b2be1 100644 --- a/src/rpt/__init__.pyi +++ b/src/rpt/__init__.pyi @@ -23,10 +23,11 @@ from ._core import ( TriangulationResult, TriangulationTrace, ) -from ._helpers import CameraLike, 
CameraModelLike, Matrix3x3Like, PoseViewLike, RoomParamsLike, VectorLike +from ._helpers import CameraLike, CameraModelLike, DepthImageLike, Matrix3x3Like, PoseViewLike, RoomParamsLike, VectorLike PoseArray2D: TypeAlias = npt.NDArray[np.float32] PoseArray3D: TypeAlias = npt.NDArray[np.float32] +PoseArray3DByView: TypeAlias = npt.NDArray[np.float32] PersonCountArray: TypeAlias = npt.NDArray[np.uint32] TrackIdArray: TypeAlias = npt.NDArray[np.int64] @@ -59,6 +60,27 @@ def pack_poses_2d( ) -> tuple[npt.NDArray[np.float32], npt.NDArray[np.uint32]]: ... +def sample_depth_for_poses( + poses_2d: PoseArray2D, + person_counts: PersonCountArray, + depth_images: Sequence[DepthImageLike], + *, + window_size: int = 7, +) -> PoseArray3D: ... + + +def apply_depth_offsets( + poses_uvd: PoseArray3D, + joint_names: Sequence[str], +) -> PoseArray3D: ... + + +def lift_depth_poses_to_world( + poses_uvd: PoseArray3D, + cameras: Sequence[CameraLike], +) -> PoseArray3DByView: ... + + def make_triangulation_config( cameras: Sequence[CameraLike], roomparams: RoomParamsLike, @@ -103,6 +125,27 @@ def triangulate_poses( ) -> PoseArray3D: ... +def merge_rgbd_views( + poses_3d: PoseArray3DByView, + person_counts: PersonCountArray, + config: TriangulationConfig, + *, + max_distance: float = 0.5, +) -> PoseArray3D: ... + + +def reconstruct_rgbd( + poses_2d: PoseArray2D, + person_counts: PersonCountArray, + depth_images: Sequence[DepthImageLike], + config: TriangulationConfig, + *, + use_depth_offsets: bool = True, + window_size: int = 7, + max_distance: float = 0.5, +) -> PoseArray3D: ... 
+ + def triangulate_with_report( poses_2d: PoseArray2D, person_counts: PersonCountArray, diff --git a/src/rpt/_helpers.py b/src/rpt/_helpers.py index 76b213e..f170d93 100644 --- a/src/rpt/_helpers.py +++ b/src/rpt/_helpers.py @@ -12,6 +12,7 @@ Matrix3x3Like: TypeAlias = Sequence[Sequence[float]] VectorLike: TypeAlias = Sequence[float] RoomParamsLike: TypeAlias = npt.NDArray[np.generic] | Sequence[Sequence[float]] PoseViewLike: TypeAlias = npt.NDArray[np.generic] | Sequence[Sequence[Sequence[float]]] | Sequence[Sequence[float]] +DepthImageLike: TypeAlias = npt.NDArray[np.generic] | Sequence[Sequence[float]] class CameraDict(TypedDict, total=False): @@ -29,6 +30,29 @@ class CameraDict(TypedDict, total=False): CameraModelLike: TypeAlias = CameraModel | Literal["pinhole", "fisheye"] CameraLike = Camera | CameraDict +DEFAULT_DEPTH_OFFSETS_METERS: dict[str, float] = { + "nose": 0.005, + "eye_left": 0.005, + "eye_right": 0.005, + "ear_left": 0.005, + "ear_right": 0.005, + "shoulder_left": 0.03, + "shoulder_right": 0.03, + "elbow_left": 0.02, + "elbow_right": 0.02, + "wrist_left": 0.01, + "wrist_right": 0.01, + "hip_left": 0.04, + "hip_right": 0.04, + "knee_left": 0.03, + "knee_right": 0.03, + "ankle_left": 0.03, + "ankle_right": 0.03, + "hip_middle": 0.04, + "shoulder_middle": 0.03, + "head": 0.0, +} + def _coerce_camera_model(model: CameraModelLike) -> CameraModel: if isinstance(model, CameraModel): @@ -55,6 +79,15 @@ def _coerce_distortion(distortion: VectorLike, camera_model: CameraModel) -> tup return values +def _coerce_depth_image(depth_image: DepthImageLike) -> npt.NDArray[np.float32]: + array = np.asarray(depth_image, dtype=np.float32) + if array.ndim == 3 and array.shape[-1] == 1: + array = np.squeeze(array, axis=-1) + if array.ndim != 2: + raise ValueError("Each depth image must have shape [height, width] or [height, width, 1].") + return np.ascontiguousarray(array, dtype=np.float32) + + def convert_cameras(cameras: Sequence[CameraLike]) -> list[Camera]: 
"""Normalize mappings or existing Camera objects into bound Camera instances.""" @@ -157,3 +190,136 @@ def make_triangulation_config( options.min_group_size = int(min_group_size) config.options = options return config + + +def sample_depth_for_poses( + poses_2d: npt.NDArray[np.generic], + person_counts: npt.NDArray[np.generic], + depth_images: Sequence[DepthImageLike], + *, + window_size: int = 7, +) -> npt.NDArray[np.float32]: + """Sample aligned depth for each visible 2D joint and return `[u, v, d, score]` rows.""" + + poses = np.asarray(poses_2d, dtype=np.float32) + counts = np.asarray(person_counts, dtype=np.uint32) + if poses.ndim != 4 or poses.shape[-1] != 3: + raise ValueError("poses_2d must have shape [views, max_persons, joints, 3].") + if counts.ndim != 1 or counts.shape[0] != poses.shape[0]: + raise ValueError("person_counts must be a 1D array aligned with the pose views.") + if len(depth_images) != poses.shape[0]: + raise ValueError("depth_images must have the same number of views as poses_2d.") + if window_size <= 0: + raise ValueError("window_size must be positive.") + radius = window_size // 2 + + poses_uvd = np.zeros((poses.shape[0], poses.shape[1], poses.shape[2], 4), dtype=np.float32) + for view_idx, depth_image in enumerate(depth_images): + depth = _coerce_depth_image(depth_image) + poses_uvd[view_idx, :, :, :2] = poses[view_idx, :, :, :2] + poses_uvd[view_idx, :, :, 3] = poses[view_idx, :, :, 2] + + valid_persons = int(counts[view_idx]) + if valid_persons == 0: + continue + + joints = poses[view_idx, :valid_persons, :, :2].astype(np.int32, copy=False).reshape(-1, 2) + scores = poses[view_idx, :valid_persons, :, 2:3].reshape(-1, 1) + + depth_padded = np.pad(depth, radius, mode="constant", constant_values=0) + offsets = np.arange(-radius, radius + 1, dtype=np.int32) + valid_xy = ( + (joints[:, 0] >= 0) + & (joints[:, 0] < depth.shape[1]) + & (joints[:, 1] >= 0) + & (joints[:, 1] < depth.shape[0]) + ) + clamped_x = np.clip(joints[:, 0], 0, 
depth.shape[1] - 1) + clamped_y = np.clip(joints[:, 1], 0, depth.shape[0] - 1) + center_x = clamped_x[:, None] + radius + center_y = clamped_y[:, None] + radius + vertical_grid = np.clip(np.add.outer(clamped_y, offsets) + radius, 0, depth_padded.shape[0] - 1) + horizontal_grid = np.clip( + np.add.outer(clamped_x, offsets) + radius, 0, depth_padded.shape[1] - 1 + ) + + vertical_depths = depth_padded[vertical_grid, center_x] + horizontal_depths = depth_padded[center_y, horizontal_grid] + all_depths = np.concatenate((vertical_depths, horizontal_depths), axis=1).astype(np.float32) + all_depths[~valid_xy] = np.nan + all_depths[all_depths <= 0] = np.nan + + valid_depth_rows = ~np.isnan(all_depths).all(axis=1) + sampled_depths = np.zeros((all_depths.shape[0],), dtype=np.float32) + if np.any(valid_depth_rows): + with np.errstate(all="ignore"): + sampled_depths[valid_depth_rows] = np.nanmedian(all_depths[valid_depth_rows], axis=1) + + valid_mask = ((sampled_depths > 0.0).astype(np.float32)[:, None] * (scores > 0.0).astype(np.float32)) + sampled_depths = sampled_depths.reshape(valid_persons, poses.shape[2], 1) + valid_mask = valid_mask.reshape(valid_persons, poses.shape[2], 1) + + poses_uvd[view_idx, :valid_persons, :, 2:3] = sampled_depths + poses_uvd[view_idx, :valid_persons] *= np.concatenate((valid_mask, valid_mask, valid_mask, valid_mask), axis=-1) + + return poses_uvd + + +def apply_depth_offsets( + poses_uvd: npt.NDArray[np.generic], + joint_names: Sequence[str], +) -> npt.NDArray[np.float32]: + """Apply the SimpleDepthPose per-joint depth offsets in meters.""" + + poses = np.asarray(poses_uvd, dtype=np.float32) + if poses.ndim != 4 or poses.shape[-1] != 4: + raise ValueError("poses_uvd must have shape [views, max_persons, joints, 4].") + if len(joint_names) != poses.shape[2]: + raise ValueError("joint_names must have the same number of joints as poses_uvd.") + + result = poses.copy() + offsets = np.asarray( + [DEFAULT_DEPTH_OFFSETS_METERS.get(str(joint_name), 0.0) 
for joint_name in joint_names], + dtype=np.float32, + ) + depth_mask = (result[:, :, :, 2:3] > 0.0).astype(np.float32) + result[:, :, :, 2:3] += depth_mask * offsets[np.newaxis, np.newaxis, :, np.newaxis] * 1000.0 + return result + + +def lift_depth_poses_to_world( + poses_uvd: npt.NDArray[np.generic], + cameras: Sequence[CameraLike], +) -> npt.NDArray[np.float32]: + """Lift `[u, v, d, score]` joints into world-space `[x, y, z, score]` poses.""" + + poses = np.asarray(poses_uvd, dtype=np.float32) + if poses.ndim != 4 or poses.shape[-1] != 4: + raise ValueError("poses_uvd must have shape [views, max_persons, joints, 4].") + + converted_cameras = convert_cameras(cameras) + if len(converted_cameras) != poses.shape[0]: + raise ValueError("cameras must have the same number of views as poses_uvd.") + + result = np.zeros_like(poses, dtype=np.float32) + for view_idx, camera in enumerate(converted_cameras): + uv = poses[view_idx, :, :, :2].reshape(-1, 2) + depth_mm = poses[view_idx, :, :, 2:3].reshape(-1, 1) + scores = poses[view_idx, :, :, 3:4].reshape(-1, 1) + + depth_m = depth_mm * 0.001 + uv_ones = np.concatenate((uv, np.ones((uv.shape[0], 1), dtype=np.float32)), axis=1) + k_inv = np.linalg.inv(np.asarray(camera.K, dtype=np.float32)) + xyz_cam = depth_m * (uv_ones @ k_inv.T) + + rotation = np.asarray(camera.R, dtype=np.float32) + translation = np.asarray(camera.T, dtype=np.float32).reshape(1, 3) + xyz_world = (rotation @ xyz_cam.T).T + translation + + pose_world = np.concatenate((xyz_world, scores), axis=1).reshape( + poses.shape[1], poses.shape[2], 4 + ) + pose_world *= (pose_world[:, :, 3:4] > 0.0).astype(np.float32) + result[view_idx] = pose_world + + return result diff --git a/tests/test_rgbd.py b/tests/test_rgbd.py new file mode 100644 index 0000000..fac13a3 --- /dev/null +++ b/tests/test_rgbd.py @@ -0,0 +1,197 @@ +import numpy as np + +import rpt + +JOINT_NAMES = [ + "nose", + "eye_left", + "eye_right", + "ear_left", + "ear_right", + "shoulder_left", + 
"shoulder_right", + "elbow_left", + "elbow_right", + "wrist_left", + "wrist_right", + "hip_left", + "hip_right", + "knee_left", + "knee_right", + "ankle_left", + "ankle_right", + "hip_middle", + "shoulder_middle", + "head", +] + + +def make_camera(name: str) -> rpt.Camera: + return rpt.make_camera( + name, + [[1000, 0, 0], [0, 1000, 0], [0, 0, 1]], + [0, 0, 0, 0, 0], + [[1, 0, 0], [0, 1, 0], [0, 0, 1]], + [[0], [0], [0]], + 256, + 256, + rpt.CameraModel.PINHOLE, + ) + + +def make_config(num_views: int) -> rpt.TriangulationConfig: + return rpt.make_triangulation_config( + [make_camera(f"Camera {idx}") for idx in range(num_views)], + np.asarray([[10.0, 10.0, 10.0], [0.0, 0.0, 0.0]], dtype=np.float32), + JOINT_NAMES, + ) + + +def make_body_2d() -> np.ndarray: + return np.asarray( + [ + [150, 50, 1.0], + [145, 48, 1.0], + [155, 48, 1.0], + [138, 50, 1.0], + [162, 50, 1.0], + [135, 80, 1.0], + [165, 80, 1.0], + [125, 115, 1.0], + [175, 115, 1.0], + [115, 150, 1.0], + [185, 150, 1.0], + [145, 130, 1.0], + [155, 130, 1.0], + [145, 175, 1.0], + [155, 175, 1.0], + [145, 220, 1.0], + [155, 220, 1.0], + [150, 130, 1.0], + [150, 80, 1.0], + [150, 50, 1.0], + ], + dtype=np.float32, + ) + + +def test_sample_depth_for_poses_respects_person_counts_and_scores(): + poses_2d = np.zeros((1, 2, 2, 3), dtype=np.float32) + poses_2d[0, 0, 0] = [5, 6, 0.8] + poses_2d[0, 0, 1] = [7, 8, 0.0] + person_counts = np.asarray([1], dtype=np.uint32) + + depth_image = np.full((16, 16), 3000, dtype=np.float32) + depth_image[0, 0] = 1234 + + poses_uvd = rpt.sample_depth_for_poses(poses_2d, person_counts, [depth_image]) + + np.testing.assert_allclose(poses_uvd[0, 0, 0], [5.0, 6.0, 3000.0, 0.8], rtol=1e-6, atol=1e-6) + np.testing.assert_array_equal(poses_uvd[0, 0, 1], np.zeros((4,), dtype=np.float32)) + np.testing.assert_array_equal(poses_uvd[0, 1], np.zeros((2, 4), dtype=np.float32)) + + +def test_sample_depth_for_poses_uses_symmetric_window(): + poses_2d = np.zeros((1, 1, 1, 3), dtype=np.float32) + 
poses_2d[0, 0, 0] = [5, 5, 1.0] + person_counts = np.asarray([1], dtype=np.uint32) + + depth_image = np.zeros((16, 16), dtype=np.float32) + depth_image[5, 5] = 1000.0 + depth_image[3, 5] = 5000.0 + depth_image[5, 2] = 5000.0 + depth_image[5, 3] = 5000.0 + depth_image[5, 7] = 5000.0 + depth_image[5, 8] = 5000.0 + + poses_uvd = rpt.sample_depth_for_poses(poses_2d, person_counts, [depth_image], window_size=3) + + np.testing.assert_allclose(poses_uvd[0, 0, 0], [5.0, 5.0, 1000.0, 1.0], rtol=1e-6, atol=1e-6) + + +def test_sample_depth_for_poses_ignores_out_of_bounds_joints(): + poses_2d = np.zeros((1, 1, 1, 3), dtype=np.float32) + poses_2d[0, 0, 0] = [99, -4, 0.7] + person_counts = np.asarray([1], dtype=np.uint32) + + poses_uvd = rpt.sample_depth_for_poses( + poses_2d, + person_counts, + [np.full((16, 16), 3000, dtype=np.float32)], + ) + + np.testing.assert_array_equal(poses_uvd[0, 0, 0], np.zeros((4,), dtype=np.float32)) + + +def test_apply_depth_offsets_uses_joint_name_mapping(): + poses_uvd = np.zeros((1, 1, 3, 4), dtype=np.float32) + poses_uvd[0, 0, :, 2] = 3000.0 + poses_uvd[0, 0, :, 3] = 1.0 + + adjusted = rpt.apply_depth_offsets(poses_uvd, ["nose", "shoulder_left", "unknown_joint"]) + + np.testing.assert_allclose(adjusted[0, 0, :, 2], [3005.0, 3030.0, 3000.0], rtol=1e-6, atol=1e-6) + np.testing.assert_allclose(poses_uvd[0, 0, :, 2], [3000.0, 3000.0, 3000.0], rtol=1e-6, atol=1e-6) + + +def test_lift_depth_poses_to_world_matches_camera_projection(): + poses_uvd = np.zeros((1, 1, 2, 4), dtype=np.float32) + poses_uvd[0, 0, 0] = [100.0, 200.0, 3000.0, 0.9] + poses_uvd[0, 0, 1] = [0.0, 0.0, 0.0, 0.0] + + lifted = rpt.lift_depth_poses_to_world(poses_uvd, [make_camera("Camera 1")]) + + np.testing.assert_allclose(lifted[0, 0, 0], [0.3, 0.6, 3.0, 0.9], rtol=1e-6, atol=1e-6) + np.testing.assert_array_equal(lifted[0, 0, 1], np.zeros((4,), dtype=np.float32)) + + +def test_merge_rgbd_views_merges_identical_world_poses(): + config = make_config(2) + body_2d = make_body_2d() + + 
poses_2d = np.zeros((2, 1, len(JOINT_NAMES), 3), dtype=np.float32) + poses_2d[0, 0] = body_2d + poses_2d[1, 0] = body_2d + person_counts = np.asarray([1, 1], dtype=np.uint32) + depth_images = [np.full((256, 256), 3000, dtype=np.float32) for _ in range(2)] + + poses_uvd = rpt.sample_depth_for_poses(poses_2d, person_counts, depth_images) + poses_uvd = rpt.apply_depth_offsets(poses_uvd, JOINT_NAMES) + poses_3d_by_view = rpt.lift_depth_poses_to_world(poses_uvd, config.cameras) + merged = rpt.merge_rgbd_views(poses_3d_by_view, person_counts, config) + + assert merged.shape == (1, len(JOINT_NAMES), 4) + np.testing.assert_allclose(merged[0, :-1], poses_3d_by_view[0, 0, :-1], rtol=1e-5, atol=1e-5) + expected_head = (poses_3d_by_view[0, 0, 3] + poses_3d_by_view[0, 0, 4]) * 0.5 + expected_head[3] = min(poses_3d_by_view[0, 0, 3, 3], poses_3d_by_view[0, 0, 4, 3]) + np.testing.assert_allclose(merged[0, -1], expected_head, rtol=1e-5, atol=1e-5) + + +def test_reconstruct_rgbd_matches_manual_pipeline_and_single_view_person(): + config = make_config(2) + body_2d = make_body_2d() + + poses_2d = np.zeros((2, 1, len(JOINT_NAMES), 3), dtype=np.float32) + poses_2d[0, 0] = body_2d + person_counts = np.asarray([1, 0], dtype=np.uint32) + depth_images = [ + np.full((256, 256), 3000, dtype=np.float32), + np.zeros((256, 256), dtype=np.float32), + ] + + manual = rpt.merge_rgbd_views( + rpt.lift_depth_poses_to_world( + rpt.apply_depth_offsets( + rpt.sample_depth_for_poses(poses_2d, person_counts, depth_images), + JOINT_NAMES, + ), + config.cameras, + ), + person_counts, + config, + ) + reconstructed = rpt.reconstruct_rgbd(poses_2d, person_counts, depth_images, config) + + assert reconstructed.shape == (1, len(JOINT_NAMES), 4) + np.testing.assert_allclose(reconstructed, manual, rtol=1e-5, atol=1e-5) + assert np.count_nonzero(reconstructed[0, :, 3] > 0.0) >= 7 diff --git a/uv.lock b/uv.lock index a054597..f92b472 100644 --- a/uv.lock +++ b/uv.lock @@ -230,7 +230,7 @@ wheels = [ [[package]] name = 
"rapid-pose-triangulation" -version = "0.1.0" +version = "0.2.0" source = { editable = "." } dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" }, marker = "python_full_version < '3.11'" },