#include #include #include #include #include #include #include #include "camera.hpp" #include "triangulator.hpp" // ================================================================================================= // ================================================================================================= [[maybe_unused]] static void print_2d_poses(const std::vector> &poses) { std::cout << "Poses: (" << poses.size() << ", 3)[" << std::endl; for (const auto &pose : poses) { std::cout << " ["; for (size_t i = 0; i < 3; ++i) { std::cout << std::fixed << std::setprecision(3) << pose[i]; if (i < 2) { std::cout << ", "; } } std::cout << "]" << std::endl; } std::cout << "]" << std::endl; } // ================================================================================================= [[maybe_unused]] static void print_3d_poses(const std::vector> &poses) { std::cout << "Poses: (" << poses.size() << ", 4)[" << std::endl; for (const auto &pose : poses) { std::cout << " ["; for (size_t i = 0; i < 4; ++i) { std::cout << std::fixed << std::setprecision(3) << pose[i]; if (i < 3) { std::cout << ", "; } } std::cout << "]" << std::endl; } std::cout << "]" << std::endl; } // ================================================================================================= [[maybe_unused]] static void print_allpairs( const std::vector, std::pair>> &all_pairs) { std::cout << "All pairs: [" << std::endl; for (const auto &pair : all_pairs) { const auto &tuple_part = pair.first; const auto &pair_part = pair.second; // Print the tuple part std::cout << "(" << std::get<0>(tuple_part) << ", " << std::get<1>(tuple_part) << ", " << std::get<2>(tuple_part) << ", " << std::get<3>(tuple_part) << ")"; // Print the pair part std::cout << ", (" << pair_part.first << ", " << pair_part.second << ")" << std::endl; } std::cout << "]" << std::endl; } // ================================================================================================= // ================================================================================================= CameraInternal::CameraInternal(const Camera &cam) { this->cam = cam; this->invR = transpose3x3(cam.R); // Camera center: // C = -(Rᵀ * t) = -(Rᵀ * (R * (T * -1))) = -(Rᵀ * (R * -T)) = -(Rᵀ * -R * T) = -(-T) = T this->center = {cam.T[0][0], cam.T[1][0], cam.T[2][0]}; // Undistort camera matrix // As with the undistortion, the own implementation avoids some overhead compared to OpenCV if (cam.type == "fisheye") { newK = calc_optimal_camera_matrix_fisheye(1.0, {cam.width, cam.height}); } else { newK = calc_optimal_camera_matrix_pinhole(1.0, {cam.width, cam.height}); } this->invK = invert3x3(newK); } // ================================================================================================= std::array, 3> CameraInternal::transpose3x3( const std::array, 3> &M) { return {{{M[0][0], M[1][0], M[2][0]}, {M[0][1], M[1][1], M[2][1]}, {M[0][2], M[1][2], M[2][2]}}}; } // ================================================================================================= std::array, 3> CameraInternal::invert3x3( const std::array, 3> &M) { // Compute the inverse using the adjugate method // See: https://scicomp.stackexchange.com/a/29206 std::array, 3> adj = { {{ M[1][1] * M[2][2] - M[1][2] * M[2][1], M[0][2] * M[2][1] - M[0][1] * M[2][2], M[0][1] * M[1][2] - M[0][2] * M[1][1], }, { M[1][2] * M[2][0] - M[1][0] * M[2][2], M[0][0] * M[2][2] - M[0][2] * M[2][0], M[0][2] * M[1][0] - M[0][0] * M[1][2], }, { M[1][0] * M[2][1] - M[1][1] * M[2][0], M[0][1] * M[2][0] - M[0][0] * M[2][1], M[0][0] * M[1][1] - M[0][1] * M[1][0], }}}; float det = M[0][0] * adj[0][0] + M[0][1] * adj[1][0] + M[0][2] * adj[2][0]; if (std::fabs(det) < 1e-6f) { return {{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}}; } float idet = 1.0f / det; std::array, 3> inv = { {{ adj[0][0] * idet, adj[0][1] * idet, adj[0][2] * idet, }, { adj[1][0] * idet, adj[1][1] * idet, adj[1][2] * idet, }, { adj[2][0] * idet, adj[2][1] * idet, adj[2][2] * idet, }}}; return inv; } // ================================================================================================= void CameraInternal::undistort_point_pinhole(std::array &p, const std::vector &k) { // Following: cv::cvUndistortPointsInternal // Uses only the distortion coefficients: [k1, k2, p1, p2, k3] // https://github.com/opencv/opencv/blob/4.x/modules/calib3d/src/undistort.dispatch.cpp#L432 float x0 = p[0]; float y0 = p[1]; float x = x0; float y = y0; // Iteratively refine the estimate for the undistorted point. int max_iterations = 5; for (int iter = 0; iter < max_iterations; ++iter) { float r2 = x * x + y * y; double icdist = 1.0 / (1 + ((k[4] * r2 + k[1]) * r2 + k[0]) * r2); if (icdist < 0) { x = x0; y = y0; break; } float deltaX = 2 * k[2] * x * y + k[3] * (r2 + 2 * x * x); float deltaY = k[2] * (r2 + 2 * y * y) + 2 * k[3] * x * y; x = (x0 - deltaX) * icdist; y = (y0 - deltaY) * icdist; } p[0] = x; p[1] = y; } // ================================================================================================= void CameraInternal::undistort_point_fisheye(std::array &p, const std::vector &k) { // Following: cv::fisheye::undistortPoints // Uses only the distortion coefficients: [k1, k2, k3, k4] // https://github.com/opencv/opencv/blob/4.x/modules/calib3d/src/fisheye.cpp#L429 float theta_d = std::sqrt(p[0] * p[0] + p[1] * p[1]); float pi_half = std::numbers::pi * 0.5; theta_d = std::min(std::max(-pi_half, theta_d), pi_half); if (theta_d < 1e-6) { return; } float scale = 0.0; float theta = theta_d; int max_iterations = 5; for (int iter = 0; iter < max_iterations; ++iter) { float theta2 = theta * theta; float theta4 = theta2 * theta2; float theta6 = theta4 * theta2; float theta8 = theta4 * theta4; float k0_theta2 = k[0] * theta2; float k1_theta4 = k[1] * theta4; float k2_theta6 = k[2] * theta6; float k3_theta8 = k[3] * theta8; float theta_fix = (theta * (1 + k0_theta2 + k1_theta4 + k2_theta6 + k3_theta8) - theta_d) / (1 + 3 * k0_theta2 + 5 * k1_theta4 + 7 * k2_theta6 + 9 * k3_theta8); theta = theta - theta_fix; if (std::fabs(theta_fix) < 1e-6) { break; } } scale = std::tan(theta) / theta_d; p[0] *= scale; p[1] *= scale; } // ================================================================================================= std::array, 3> CameraInternal::calc_optimal_camera_matrix_fisheye( float balance, std::pair new_size) { // Following: cv::fisheye::estimateNewCameraMatrixForUndistortRectify // https://github.com/opencv/opencv/blob/4.x/modules/calib3d/src/fisheye.cpp#L630 float fov_scale = 1.0; float w = static_cast(cam.width); float h = static_cast(cam.height); balance = std::min(std::max(balance, 0.0f), 1.0f); // Define four key points at the middle of each edge std::vector> pts = { {w * 0.5f, 0.0}, {w, h * 0.5f}, {w * 0.5f, h}, {0.0, h * 0.5f}}; // Extract intrinsic parameters float fx = cam.K[0][0]; float fy = cam.K[1][1]; float cx = cam.K[0][2]; float cy = cam.K[1][2]; // Undistort the edge points for (auto &pt : pts) { std::array p_normed = {(pt[0] - cx) / fx, (pt[1] - cy) / fy, 1.0}; undistort_point_fisheye(p_normed, cam.DC); pt[0] = p_normed[0]; pt[1] = p_normed[1]; } // Compute center mass of the undistorted edge points float sum_x = 0.0, sum_y = 0.0; for (const auto &pt : pts) { sum_x += pt[0]; sum_y += pt[1]; } float cn_x = sum_x / pts.size(); float cn_y = sum_y / pts.size(); // Convert to identity ratio float aspect_ratio = fx / fy; cn_y *= aspect_ratio; for (auto &pt : pts) pt[1] *= aspect_ratio; // Find the bounding box of the undistorted points float minx = std::numeric_limits::max(); float miny = std::numeric_limits::max(); float maxx = -std::numeric_limits::max(); float maxy = -std::numeric_limits::max(); for (const auto &pt : pts) { minx = std::min(minx, pt[0]); maxx = std::max(maxx, pt[0]); miny = std::min(miny, pt[1]); maxy = std::max(maxy, pt[1]); } // Calculate candidate focal lengths float f1 = (w * 0.5) / (cn_x - minx); float f2 = (w * 0.5) / (maxx - cn_x); float f3 = (h * 0.5 * aspect_ratio) / (cn_y - miny); float f4 = (h * 0.5 * aspect_ratio) / (maxy - cn_y); float fmin = std::min({f1, f2, f3, f4}); float fmax = std::max({f1, f2, f3, f4}); // Blend the candidate focal lengths float f_val = balance * fmin + (1.0f - balance) * fmax; if (fov_scale > 0.0f) f_val /= fov_scale; // Compute new intrinsic parameters float new_fx = f_val; float new_fy = f_val; float new_cx = -cn_x * f_val + (w * 0.5); float new_cy = -cn_y * f_val + (h * aspect_ratio * 0.5); // Restore aspect ratio new_fy /= aspect_ratio; new_cy /= aspect_ratio; // Optionally scale parameters to new new image size if (new_size.first > 0 && new_size.second > 0) { float rx = static_cast(new_size.first) / w; float ry = static_cast(new_size.second) / h; new_fx *= rx; new_fy *= ry; new_cx *= rx; new_cy *= ry; } std::array, 3> newK = { {{new_fx, 0.0, new_cx}, {0.0, new_fy, new_cy}, {0.0, 0.0, 1.0}}}; return newK; } // ================================================================================================= std::array, 3> CameraInternal::calc_optimal_camera_matrix_pinhole( float alpha, std::pair new_size) { // Following: cv::getOptimalNewCameraMatrix // https://github.com/opencv/opencv/blob/4.x/modules/calib3d/src/calibration_base.cpp#L1565 bool center_principal_point = false; bool use_pix_roi = false; float w = static_cast(cam.width); float h = static_cast(cam.height); alpha = std::min(std::max(alpha, 0.0f), 1.0f); if (center_principal_point || use_pix_roi) { // Not implemented exit(1); } // Define key points // Calculate only the contour points of the image, and use less points, // the edges and centers should be enough if the camera has no strange distortions const size_t N = 3; std::vector> pts; pts.reserve(4 * (N - 1)); for (size_t y = 0; y < N; ++y) { for (size_t x = 0; x < N; ++x) { if (x != 0 && x != N - 1 && y != 0 && y != N - 1) { continue; } pts.push_back({x * (w - 1) / (N - 1), y * (h - 1) / (N - 1)}); } } // Extract intrinsic parameters float fx = cam.K[0][0]; float fy = cam.K[1][1]; float cx = cam.K[0][2]; float cy = cam.K[1][2]; // Undistort the key points for (auto &pt : pts) { std::array p_normed = {(pt[0] - cx) / fx, (pt[1] - cy) / fy, 1.0}; undistort_point_pinhole(p_normed, cam.DC); pt[0] = p_normed[0]; pt[1] = p_normed[1]; } // Get inscribed and circumscribed rectangles in normalized coordinates float iX0 = -std::numeric_limits::max(); float iX1 = std::numeric_limits::max(); float iY0 = -std::numeric_limits::max(); float iY1 = std::numeric_limits::max(); float oX0 = std::numeric_limits::max(); float oX1 = -std::numeric_limits::max(); float oY0 = std::numeric_limits::max(); float oY1 = -std::numeric_limits::max(); size_t k = 0; for (size_t y = 0; y < N; ++y) { for (size_t x = 0; x < N; ++x) { if (x != 0 && x != N - 1 && y != 0 && y != N - 1) { continue; } auto &pt = pts[k]; k += 1; if (x == 0) { oX0 = std::min(oX0, pt[0]); iX0 = std::max(iX0, pt[0]); } if (x == N - 1) { oX1 = std::max(oX1, pt[0]); iX1 = std::min(iX1, pt[0]); } if (y == 0) { oY0 = std::min(oY0, pt[1]); iY0 = std::max(iY0, pt[1]); } if (y == N - 1) { oY1 = std::max(oY1, pt[1]); iY1 = std::min(iY1, pt[1]); } } } float inner_width = iX1 - iX0; float inner_height = iY1 - iY0; float outer_width = oX1 - oX0; float outer_height = oY1 - oY0; // Projection mapping inner rectangle to viewport float fx0 = (new_size.first - 1) / inner_width; float fy0 = (new_size.second - 1) / inner_height; float cx0 = -fx0 * iX0; float cy0 = -fy0 * iY0; // Projection mapping outer rectangle to viewport float fx1 = (new_size.first - 1) / outer_width; float fy1 = (new_size.second - 1) / outer_height; float cx1 = -fx1 * oX0; float cy1 = -fy1 * oY0; // Interpolate between the two optimal projections float new_fx = fx0 * (1 - alpha) + fx1 * alpha; float new_fy = fy0 * (1 - alpha) + fy1 * alpha; float new_cx = cx0 * (1 - alpha) + cx1 * alpha; float new_cy = cy0 * (1 - alpha) + cy1 * alpha; std::array, 3> newK = { {{new_fx, 0.0, new_cx}, {0.0, new_fy, new_cy}, {0.0, 0.0, 1.0}}}; return newK; } // ================================================================================================= // ================================================================================================= TriangulatorInternal::TriangulatorInternal(float min_match_score, size_t min_group_size) { this->min_match_score = min_match_score; this->min_group_size = min_group_size; } // ================================================================================================= std::vector>> TriangulatorInternal::triangulate_poses( const std::vector>>> &poses_2d, const std::vector &cameras, const std::array, 2> &roomparams, const std::vector &joint_names) { auto ttime = std::chrono::steady_clock::now(); auto stime = std::chrono::steady_clock::now(); std::chrono::duration elapsed; // Check inputs if (poses_2d.empty()) { throw std::invalid_argument("No 2D poses provided."); } if (poses_2d.size() != cameras.size()) { throw std::invalid_argument("Number of cameras and 2D poses must be the same."); } if (joint_names.size() != poses_2d[0][0].size()) { throw std::invalid_argument("Number of joint names and 2D poses must be the same."); } for (const auto &joint : core_joints) { if (std::find(joint_names.begin(), joint_names.end(), joint) == joint_names.end()) { throw std::invalid_argument("Core joint '" + joint + "' not found in joint names."); } } // Convert inputs to internal formats std::vector>>> i_poses_2d = poses_2d; std::vector internal_cameras; for (size_t i = 0; i < cameras.size(); ++i) { internal_cameras.emplace_back(cameras[i]); } std::vector core_joint_idx; for (const auto &joint : core_joints) { auto it = std::find(joint_names.begin(), joint_names.end(), joint); core_joint_idx.push_back(std::distance(joint_names.begin(), it)); } std::vector> core_limbs_idx; for (const auto &limb : core_limbs) { auto it1 = std::find(core_joints.begin(), core_joints.end(), limb.first); auto it2 = std::find(core_joints.begin(), core_joints.end(), limb.second); core_limbs_idx.push_back({(size_t)std::distance(core_joints.begin(), it1), (size_t)std::distance(core_joints.begin(), it2)}); } elapsed = std::chrono::steady_clock::now() - stime; init_time += elapsed.count(); stime = std::chrono::steady_clock::now(); // Undistort 2D poses for (size_t i = 0; i < cameras.size(); ++i) { undistort_poses(i_poses_2d[i], internal_cameras[i]); } elapsed = std::chrono::steady_clock::now() - stime; undistort_time += elapsed.count(); stime = std::chrono::steady_clock::now(); // Project last 3D poses to 2D std::vector>>, std::vector>>> last_poses_2d; if (!last_poses_3d.empty()) { // Select core joints std::vector>> last_core_poses; last_core_poses.resize(last_poses_3d.size()); for (size_t i = 0; i < last_poses_3d.size(); ++i) { last_core_poses[i].resize(core_joint_idx.size()); for (size_t j = 0; j < core_joint_idx.size(); ++j) { for (size_t k = 0; k < 4; ++k) { last_core_poses[i][j][k] = last_poses_3d[i][core_joint_idx[j]][k]; } } } // Project last_poses_2d.resize(cameras.size()); for (size_t i = 0; i < cameras.size(); ++i) { auto [poses2d, dists] = project_poses(last_core_poses, internal_cameras[i], true); last_poses_2d[i] = std::make_tuple(poses2d, dists); } } elapsed = std::chrono::steady_clock::now() - stime; project_time += elapsed.count(); stime = std::chrono::steady_clock::now(); // Check matches to old poses float threshold = min_match_score - 0.2; std::map>> scored_pasts; if (!last_poses_3d.empty()) { // Calculate index pairs and initialize vectors std::vector> indices_ijk; for (size_t i = 0; i < cameras.size(); ++i) { size_t num_last_persons = std::get<0>(last_poses_2d[i]).size(); scored_pasts[i] = std::map>(); for (size_t j = 0; j < num_last_persons; ++j) { size_t num_new_persons = poses_2d[i].size(); scored_pasts[i][j] = std::vector(); for (size_t k = 0; k < num_new_persons; ++k) { indices_ijk.push_back({i, j, k}); } } } std::vector> indices_ik; for (size_t i = 0; i < cameras.size(); ++i) { size_t num_new_persons = poses_2d[i].size(); for (size_t k = 0; k < num_new_persons; ++k) { indices_ik.push_back({i, k}); } } // Precalculate core poses std::vector>> poses_2d_core_list; poses_2d_core_list.resize(indices_ik.size()); std::vector> mats_core_map; mats_core_map.resize(cameras.size()); for (size_t i = 0; i < cameras.size(); ++i) { size_t num_new_persons = poses_2d[i].size(); for (size_t k = 0; k < num_new_persons; ++k) { mats_core_map[i].push_back(0); } } for (size_t e = 0; e < indices_ik.size(); ++e) { const auto [i, k] = indices_ik[e]; const auto &pose = i_poses_2d[i][k]; std::vector> pose_core; pose_core.resize(core_joint_idx.size()); for (size_t j = 0; j < core_joint_idx.size(); ++j) { size_t idx = core_joint_idx[j]; pose_core[j] = pose[idx]; } poses_2d_core_list[e] = pose_core; mats_core_map[i][k] = e; } // Calculate matching score for (size_t e = 0; e < indices_ijk.size(); ++e) { const auto [i, j, k] = indices_ijk[e]; const auto &last_pose = std::get<0>(last_poses_2d[i])[j]; const auto &last_dist = std::get<1>(last_poses_2d[i])[j]; const auto &new_pose = poses_2d_core_list[mats_core_map[i][k]]; float score = calc_pose_score(new_pose, last_pose, last_dist, internal_cameras[i]); if (score > threshold) { { scored_pasts[i][j].push_back(k); } } } } elapsed = std::chrono::steady_clock::now() - stime; match_time += elapsed.count(); stime = std::chrono::steady_clock::now(); // Create pairs of persons // Checks if the person was already matched to the last frame and if so only creates pairs // with those, else it creates all possible pairs std::vector num_persons_sum; for (size_t i = 0; i < cameras.size(); ++i) { int nsum = poses_2d[i].size(); if (i > 0) { nsum += num_persons_sum[i - 1]; } num_persons_sum.push_back(nsum); } std::vector, std::pair>> all_pairs; std::vector> indices; for (size_t i = 0; i < cameras.size(); ++i) { for (size_t j = i + 1; j < cameras.size(); ++j) { for (size_t k = 0; k < poses_2d[i].size(); ++k) { for (size_t l = 0; l < poses_2d[j].size(); ++l) { indices.push_back({i, j, k, l}); } } } } for (size_t e = 0; e < indices.size(); ++e) { const auto [i, j, k, l] = indices[e]; int pid1 = num_persons_sum[i] + k; int pid2 = num_persons_sum[k] + l; bool match = false; if (!last_poses_3d.empty()) { for (size_t m = 0; m < last_poses_3d.size(); ++m) { auto &smi = scored_pasts[i][m]; auto &smj = scored_pasts[j][m]; bool in_smi = std::find(smi.begin(), smi.end(), k) != smi.end(); bool in_smj = std::find(smj.begin(), smj.end(), l) != smj.end(); if (in_smi && in_smj) { match = true; auto item = std::make_pair( std::make_tuple(i, j, k, l), std::make_pair(pid1, pid2)); all_pairs.push_back(item); } else if (in_smi || in_smj) { match = true; } } } if (!match) { auto item = std::make_pair( std::make_tuple(i, j, k, l), std::make_pair(pid1, pid2)); all_pairs.push_back(item); } } elapsed = std::chrono::steady_clock::now() - stime; pairs_time += elapsed.count(); stime = std::chrono::steady_clock::now(); // Calculate pair scores std::vector>, float>> all_scored_poses; all_scored_poses.resize(all_pairs.size()); for (size_t i = 0; i < all_pairs.size(); ++i) { const auto &pids = all_pairs[i].first; // Extract camera parameters const auto &cam1 = internal_cameras[std::get<0>(pids)]; const auto &cam2 = internal_cameras[std::get<1>(pids)]; // Extract 2D poses const auto &pose1 = i_poses_2d[std::get<0>(pids)][std::get<2>(pids)]; const auto &pose2 = i_poses_2d[std::get<1>(pids)][std::get<3>(pids)]; // Select core joints std::vector> pose1_core, pose2_core; pose1_core.resize(core_joint_idx.size()); pose2_core.resize(core_joint_idx.size()); for (size_t j = 0; j < core_joint_idx.size(); ++j) { size_t idx = core_joint_idx[j]; pose1_core[j] = pose1[idx]; pose2_core[j] = pose2[idx]; } // Triangulate and score auto [pose3d, score] = triangulate_and_score( pose1_core, pose2_core, cam1, cam2, roomparams, core_limbs_idx); all_scored_poses[i] = std::move(std::make_pair(pose3d, score)); } elapsed = std::chrono::steady_clock::now() - stime; pair_scoring_time += elapsed.count(); stime = std::chrono::steady_clock::now(); // Drop low scoring poses size_t num_poses = all_scored_poses.size(); for (size_t i = num_poses; i > 0; --i) { if (all_scored_poses[i - 1].second < min_match_score) { all_scored_poses.erase(all_scored_poses.begin() + i - 1); all_pairs.erase(all_pairs.begin() + i - 1); } } // Group pairs that share a person std::vector, std::vector>, std::vector>> groups = calc_grouping(all_pairs, all_scored_poses, min_match_score); // Drop groups with too few matches size_t num_groups = groups.size(); for (size_t i = num_groups; i > 0; --i) { if (std::get<2>(groups[i - 1]).size() < this->min_group_size) { groups.erase(groups.begin() + i - 1); } } elapsed = std::chrono::steady_clock::now() - stime; grouping_time += elapsed.count(); stime = std::chrono::steady_clock::now(); // Calculate full 3D poses std::vector>> all_full_poses; all_full_poses.resize(all_pairs.size()); for (size_t i = 0; i < all_pairs.size(); ++i) { const auto &pids = all_pairs[i].first; const auto &cam1 = internal_cameras[std::get<0>(pids)]; const auto &cam2 = internal_cameras[std::get<1>(pids)]; const auto &pose1 = i_poses_2d[std::get<0>(pids)][std::get<2>(pids)]; const auto &pose2 = i_poses_2d[std::get<1>(pids)][std::get<3>(pids)]; auto [pose3d, score] = triangulate_and_score( pose1, pose2, cam1, cam2, roomparams, {}); all_full_poses[i] = std::move(pose3d); } elapsed = std::chrono::steady_clock::now() - stime; full_time += elapsed.count(); stime = std::chrono::steady_clock::now(); // Merge groups std::vector>> all_merged_poses; all_merged_poses.resize(groups.size()); for (size_t i = 0; i < groups.size(); ++i) { const auto &group = groups[i]; std::vector>> poses; poses.reserve(std::get<2>(group).size()); for (const auto &idx : std::get<2>(group)) { poses.push_back(all_full_poses[idx]); } auto merged_pose = merge_group(poses, min_match_score); all_merged_poses[i] = std::move(merged_pose); } elapsed = std::chrono::steady_clock::now() - stime; merge_time += elapsed.count(); stime = std::chrono::steady_clock::now(); // Run post-processing steps std::vector>> final_poses_3d = std::move(all_merged_poses); add_extra_joints(final_poses_3d, joint_names); filter_poses(final_poses_3d, roomparams, core_joint_idx, core_limbs_idx); add_missing_joints(final_poses_3d, joint_names); // Store final result for use in the next frame. last_poses_3d = final_poses_3d; elapsed = std::chrono::steady_clock::now() - stime; post_time += elapsed.count(); stime = std::chrono::steady_clock::now(); // Convert to output format std::vector>> poses_3d; poses_3d.reserve(final_poses_3d.size()); for (size_t i = 0; i < final_poses_3d.size(); ++i) { const size_t num_joints = final_poses_3d[i].size(); std::vector> pose; pose.reserve(num_joints); size_t num_valid = 0; for (size_t j = 0; j < num_joints; ++j) { std::array point; for (size_t k = 0; k < 4; ++k) { point[k] = final_poses_3d[i][j][k]; } pose.push_back(point); if (point[3] > min_match_score) { num_valid++; } } if (num_valid > 0) { poses_3d.push_back(std::move(pose)); } } elapsed = std::chrono::steady_clock::now() - stime; convert_time += elapsed.count(); elapsed = std::chrono::steady_clock::now() - ttime; total_time += elapsed.count(); num_calls++; return poses_3d; } // ================================================================================================= void TriangulatorInternal::reset() { last_poses_3d.clear(); } // ================================================================================================= void TriangulatorInternal::print_stats() { std::cout << "{" << std::endl; std::cout << " \"triangulator_calls\": " << num_calls << "," << std::endl; std::cout << " \"init_time\": " << init_time / num_calls << "," << std::endl; std::cout << " \"undistort_time\": " << undistort_time / num_calls << "," << std::endl; std::cout << " \"project_time\": " << project_time / num_calls << "," << std::endl; std::cout << " \"match_time\": " << match_time / num_calls << "," << std::endl; std::cout << " \"pairs_time\": " << pairs_time / num_calls << "," << std::endl; std::cout << " \"pair_scoring_time\": " << pair_scoring_time / num_calls << "," << std::endl; std::cout << " \"grouping_time\": " << grouping_time / num_calls << "," << std::endl; std::cout << " \"full_time\": " << full_time / num_calls << "," << std::endl; std::cout << " \"merge_time\": " << merge_time / num_calls << "," << std::endl; std::cout << " \"post_time\": " << post_time / num_calls << "," << std::endl; std::cout << " \"convert_time\": " << convert_time / num_calls << "," << std::endl; std::cout << " \"total_time\": " << total_time / num_calls << std::endl; std::cout << "}" << std::endl; } // ================================================================================================= void TriangulatorInternal::undistort_poses( std::vector>> &poses_2d, CameraInternal &icam) { float ifx_old = 1.0 / icam.cam.K[0][0]; float ify_old = 1.0 / icam.cam.K[1][1]; float cx_old = icam.cam.K[0][2]; float cy_old = icam.cam.K[1][2]; float fx_new = icam.newK[0][0]; float fy_new = icam.newK[1][1]; float cx_new = icam.newK[0][2]; float cy_new = icam.newK[1][2]; // Undistort all the points size_t num_persons = poses_2d.size(); size_t num_joints = poses_2d[0].size(); for (size_t i = 0; i < num_persons; ++i) { for (size_t j = 0; j < num_joints; ++j) { // Normalize the point using the original camera matrix poses_2d[i][j][0] = (poses_2d[i][j][0] - cx_old) * ifx_old; poses_2d[i][j][1] = (poses_2d[i][j][1] - cy_old) * ify_old; // Undistort // Using own implementation is faster than using OpenCV, because it avoids the // overhead of creating cv::Mat objects and further unnecessary calculations for // additional distortion parameters and identity rotations in this usecase. if (icam.cam.type == "fisheye") { CameraInternal::undistort_point_fisheye(poses_2d[i][j], icam.cam.DC); } else { CameraInternal::undistort_point_pinhole(poses_2d[i][j], icam.cam.DC); } // Map the undistorted normalized point to the new image coordinates poses_2d[i][j][0] = (poses_2d[i][j][0] * fx_new) + cx_new; poses_2d[i][j][1] = (poses_2d[i][j][1] * fy_new) + cy_new; } } // Mask out points that are far outside the image (points slightly outside are still valid) int width = icam.cam.width; int height = icam.cam.height; float mask_offset = (width + height) / 20.0; for (size_t i = 0; i < num_persons; ++i) { for (size_t j = 0; j < num_joints; ++j) { if (poses_2d[i][j][0] < -mask_offset || poses_2d[i][j][0] >= width + mask_offset || poses_2d[i][j][1] < -mask_offset || poses_2d[i][j][1] >= height + mask_offset) { poses_2d[i][j][0] = 0.0; poses_2d[i][j][1] = 0.0; poses_2d[i][j][2] = 0.0; } } } } // ================================================================================================= std::tuple>>, std::vector>> TriangulatorInternal::project_poses( const std::vector>> &poses_3d, const CameraInternal &icam, bool calc_dists) { const size_t num_persons = poses_3d.size(); const size_t num_joints = poses_3d[0].size(); // Prepare output vectors std::vector>> poses_2d; std::vector> all_dists; poses_2d.resize(num_persons); all_dists.resize(num_persons); // Get camera parameters const std::array, 3> &K = icam.newK; const std::array, 3> &R = icam.invR; const std::array, 3> &T = icam.cam.T; for (size_t i = 0; i < num_persons; ++i) { const std::vector> &body3D = poses_3d[i]; std::vector> body2D(num_joints, std::array{0.0, 0.0, 0.0}); std::vector dists(num_joints, 0.0); for (size_t j = 0; j < num_joints; ++j) { float x = body3D[j][0]; float y = body3D[j][1]; float z = body3D[j][2]; float score = body3D[j][3]; // Project from world to camera coordinate system float xt = x - T[0][0]; float yt = y - T[1][0]; float zt = z - T[2][0]; float x_cam = xt * R[0][0] + yt * R[0][1] + zt * R[0][2]; float y_cam = xt * R[1][0] + yt * R[1][1] + zt * R[1][2]; float z_cam = xt * R[2][0] + yt * R[2][1] + zt * R[2][2]; // Set points behind the camera to zero if (z_cam <= 0.0) { x_cam = y_cam = z_cam = 0.0; } // Calculate distance from camera center if required if (calc_dists) dists[j] = std::sqrt(x_cam * x_cam + y_cam * y_cam + z_cam * z_cam); else dists[j] = 0.0; // Project points to image plane // Since images are already undistorted, use the pinhole projection for all camera types float u = 0.0, v = 0.0; if (z_cam > 0.0) { u = (K[0][0] * x_cam + K[0][1] * y_cam + K[0][2] * z_cam) / z_cam; v = (K[1][0] * x_cam + K[1][1] * y_cam + K[1][2] * z_cam) / z_cam; } // Filter invalid projections if (u < 0.0 || u >= icam.cam.width || v < 0.0 || v >= icam.cam.height) { u = 0.0; v = 0.0; score = 0.0; } body2D[j] = {u, v, score}; } // Store results poses_2d[i] = std::move(body2D); all_dists[i] = std::move(dists); } return std::make_tuple(poses_2d, all_dists); } // ================================================================================================= float TriangulatorInternal::calc_pose_score( const std::vector> &pose1, const std::vector> &pose2, const std::vector &dist1, const CameraInternal &icam) { const float min_score = 0.1; const size_t num_joints = pose1.size(); // Create mask for valid points std::vector mask; mask.resize(num_joints); for (size_t i = 0; i < num_joints; ++i) { if (pose1[i][2] <= min_score || pose2[i][2] <= min_score) { mask[i] = false; } else { mask[i] = true; } } // Drop if not enough valid points int valid_count = 0; for (size_t i = 0; i < num_joints; ++i) { if (mask[i]) { valid_count++; } } if (valid_count < 3) { return 0.0; } // Calculate scores float iscale = (icam.cam.width + icam.cam.height) / 2; std::vector scores = score_projection(pose1, pose2, dist1, mask, iscale); // Drop lowest scores size_t drop_k = static_cast(num_joints * 0.2); const size_t min_k = 3; std::vector scores_vec; scores_vec.reserve(valid_count); for (size_t i = 0; i < scores.size(); ++i) { if (mask[i] > 0) { scores_vec.push_back(scores[i]); } } size_t scores_size = scores_vec.size(); if (scores_size >= min_k) { drop_k = std::min(drop_k, scores_size - min_k); std::partial_sort(scores_vec.begin(), scores_vec.begin() + drop_k, scores_vec.end()); scores_vec.erase(scores_vec.begin(), scores_vec.begin() + drop_k); } // Calculate final score float score = 0.0; size_t n_items = scores_vec.size(); if (n_items > 0) { float sum_scores = std::accumulate(scores_vec.begin(), scores_vec.end(), 0.0); score = sum_scores / static_cast(n_items); } return score; } // ================================================================================================= std::vector TriangulatorInternal::score_projection( const std::vector> &pose, const std::vector> &repro, const std::vector &dists, const std::vector &mask, float iscale) { const float min_score = 0.1; const size_t num_joints = pose.size(); // Calculate error std::vector error(num_joints, 0.0); for (size_t i = 0; i < num_joints; ++i) { if (mask[i]) { float dx = pose[i][0] - repro[i][0]; float dy = pose[i][1] - repro[i][1]; float err = std::sqrt(dx * dx + dy * dy); // Set errors of invisible reprojections to a high value if (repro[i][2] < min_score) { err = iscale; } error[i] = err; } } // Scale error by image size const float inv_iscale = 1.0 / iscale; const float iscale_quarter = iscale / 4.0; for (size_t i = 0; i < num_joints; ++i) { float err = error[i]; err = std::max(0.0f, std::min(err, iscale_quarter)) * inv_iscale; error[i] = err; } // Scale error by distance to camera float mean_dist = 0.0; int count = 0; for (size_t i = 0; i < num_joints; ++i) { if (mask[i]) { mean_dist += dists[i]; ++count; } } if (count > 0) { mean_dist /= count; } const float dscale = std::sqrt(mean_dist / 3.5); for (size_t i = 0; i < num_joints; ++i) { error[i] *= dscale; } // Convert error to score std::vector score(num_joints, 0.0); for (size_t i = 0; i < num_joints; ++i) { score[i] = 1.0 / (1.0 + error[i] * 10.0); } return score; } // ================================================================================================= float dot(const std::array &a, const std::array &b) { return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; } std::array cross(const std::array &a, const std::array &b) { return {a[1] * b[2] - a[2] * b[1], a[2] * b[0] - a[0] * b[2], a[0] * b[1] - a[1] * b[0]}; } std::array add(const std::array &a, const std::array &b) { return {a[0] + b[0], a[1] + b[1], a[2] + b[2]}; } std::array subtract(const std::array &a, const std::array &b) { return {a[0] - b[0], a[1] - b[1], a[2] - b[2]}; } std::array multiply(const std::array &a, float s) { return {a[0] * s, a[1] * s, a[2] * s}; } std::array normalize(const std::array &v) { float norm = std::sqrt(dot(v, v)); if (norm < 1e-8f) return v; return multiply(v, 1.0f / norm); } std::array mat_mul_vec( const std::array, 3> &M, const std::array &v) { std::array res = {M[0][0] * v[0] + M[0][1] * v[1] + M[0][2] * v[2], M[1][0] * v[0] + M[1][1] * v[1] + M[1][2] * v[2], M[2][0] * v[0] + M[2][1] * v[1] + M[2][2] * v[2]}; return res; } std::array calc_ray_dir(const CameraInternal &icam, const std::array &pt) { // Compute normalized ray direction from the point std::array uv1 = {pt[0], pt[1], 1.0}; auto d = mat_mul_vec(icam.cam.R, mat_mul_vec(icam.invK, uv1)); auto ray_dir = normalize(d); return ray_dir; } std::array triangulate_midpoint( const CameraInternal &icam1, const CameraInternal &icam2, const std::array &pt1, const std::array &pt2) { // Triangulate two points by computing their two rays and the midpoint of their closest approach // See: https://en.wikipedia.org/wiki/Skew_lines#Nearest_points // Obtain the camera centers and ray directions for both views std::array p1 = icam1.center; std::array p2 = icam2.center; std::array d1 = calc_ray_dir(icam1, pt1); std::array d2 = calc_ray_dir(icam2, pt2); // Compute the perpendicular plane vectors std::array n = cross(d1, d2); std::array n1 = cross(d1, n); std::array n2 = cross(d2, n); // Calculate point on Line 1 nearest to Line 2 float t1 = dot(subtract(p2, p1), n2) / dot(d1, n2); std::array c1 = add(p1, multiply(d1, t1)); // Calculate point on Line 2 nearest to Line 1 float t2 = dot(subtract(p1, p2), n1) / dot(d2, n1); std::array c2 = add(p2, multiply(d2, t2)); // Compute midpoint between c1 and c2. std::array midpoint = multiply(add(c1, c2), 0.5); return midpoint; } // ================================================================================================= std::pair>, float> TriangulatorInternal::triangulate_and_score( const std::vector> &pose1, const std::vector> &pose2, const CameraInternal &cam1, const CameraInternal &cam2, const std::array, 2> &roomparams, const std::vector> &core_limbs_idx) { const float min_score = 0.1; const size_t num_joints = pose1.size(); // Create mask for valid points std::vector mask; mask.resize(num_joints); for (size_t i = 0; i < num_joints; ++i) { if (pose1[i][2] <= min_score || pose2[i][2] <= min_score) { mask[i] = false; } else { mask[i] = true; } } // If too few joints are visible, return a low score int num_visible = 0; for (size_t i = 0; i < num_joints; ++i) { if (mask[i]) { num_visible++; } } if (num_visible < 3) { std::vector> empty(num_joints, {0.0, 0.0, 0.0, 0.0}); float score = 0.0; return std::make_pair(empty, score); } // Use midpoint triangulation instead of cv::triangulatePoints because it is much faster, // while having almost the same accuracy. std::vector> pose3d(num_joints, {0.0, 0.0, 0.0, 0.0}); for (size_t i = 0; i < num_joints; ++i) { if (mask[i]) { auto &pt1 = pose1[i]; auto &pt2 = pose2[i]; std::array pt3d = triangulate_midpoint( cam1, cam2, {pt1[0], pt1[1]}, {pt2[0], pt2[1]}); pose3d[i] = {pt3d[0], pt3d[1], pt3d[2], 1.0}; } } // Check if mean of the points is inside the room bounds std::array mean = {0.0, 0.0, 0.0}; for (size_t i = 0; i < num_joints; ++i) { if (mask[i]) { mean[0] += pose3d[i][0]; mean[1] += pose3d[i][1]; mean[2] += pose3d[i][2]; } } float inv_num_vis = 1.0 / num_visible; for (int j = 0; j < 3; ++j) { mean[j] *= inv_num_vis; } const std::array &room_size = roomparams[0]; const std::array &room_center = roomparams[1]; for (int j = 0; j < 3; ++j) { if (mean[j] > room_center[j] + room_size[j] / 2.0 || mean[j] < room_center[j] - room_size[j] / 2.0) { // Very low score if outside room for (size_t i = 0; i < num_joints; ++i) { if (mask[i]) { pose3d[i][3] = 0.001; } } return std::make_pair(pose3d, 0.001); } } // Calculate reprojections std::vector>> poses_3d = {pose3d}; auto [repro1s, dists1s] = project_poses(poses_3d, cam1, true); auto [repro2s, dists2s] = project_poses(poses_3d, cam2, true); auto repro1 = repro1s[0]; auto dists1 = dists1s[0]; auto repro2 = repro2s[0]; auto dists2 = dists2s[0]; // Calculate scores for each view float iscale = (cam1.cam.width + cam1.cam.height) / 2.0; std::vector score1 = score_projection(pose1, repro1, dists1, mask, iscale); std::vector score2 = score_projection(pose2, repro2, dists2, mask, iscale); // Add scores to 3D pose for (size_t i = 0; i < num_joints; ++i) { if (mask[i]) { float score = 0.5 * (score1[i] + score2[i]); pose3d[i][3] = score; } } // Set scores outside the room to a low value const float wdist = 0.1; for (size_t i = 0; i < num_joints; ++i) { if (mask[i]) { for (int j = 0; j < 3; ++j) { if (pose3d[i][j] > room_center[j] + room_size[j] / 2.0 + wdist || pose3d[i][j] < room_center[j] - room_size[j] / 2.0 - wdist) { pose3d[i][3] = 0.001; break; } } } } // Filter clearly wrong limbs if (!core_limbs_idx.empty()) { const float max_length_sq = 0.9 * 0.9; for (size_t i = 0; i < core_limbs_idx.size(); ++i) { size_t limb1 = core_limbs_idx[i][0]; size_t limb2 = core_limbs_idx[i][1]; if (pose3d[limb1][3] > min_score && pose3d[limb2][3] > min_score) { float dx = pose3d[limb1][0] - pose3d[limb2][0]; float dy = pose3d[limb1][1] - pose3d[limb2][1]; float dz = pose3d[limb1][2] - pose3d[limb2][2]; float length_sq = dx * dx + dy * dy + dz * dz; if (length_sq > max_length_sq) { pose3d[limb2][3] = 0.001; } } } } // Drop lowest scores size_t drop_k = static_cast(num_joints * 0.2); const size_t min_k = 3; std::vector valid_scores; for (size_t i = 0; i < num_joints; ++i) { if (pose3d[i][3] > min_score) { valid_scores.push_back(pose3d[i][3]); } } size_t scores_size = valid_scores.size(); if (scores_size >= min_k) { drop_k = std::min(drop_k, scores_size - min_k); std::partial_sort(valid_scores.begin(), valid_scores.begin() + drop_k, valid_scores.end()); valid_scores.erase(valid_scores.begin(), valid_scores.begin() + drop_k); } // Calculate final score float final_score = 0.0; if (!valid_scores.empty()) { float sum_scores = std::accumulate(valid_scores.begin(), valid_scores.end(), 0.0); final_score = sum_scores / static_cast(valid_scores.size()); } return std::make_pair(pose3d, final_score); } // ================================================================================================= std::vector, std::vector>, std::vector>> TriangulatorInternal::calc_grouping( const std::vector, std::pair>> &all_pairs, const std::vector>, float>> &all_scored_poses, float min_score) { float max_center_dist_sq = 0.75 * 0.75; float max_joint_avg_dist = 0.25; size_t num_pairs = all_pairs.size(); // Calculate pose centers std::vector> centers; centers.resize(num_pairs); for (size_t i = 0; i < num_pairs; ++i) { const auto &pose_3d = all_scored_poses[i].first; const size_t num_joints = pose_3d.size(); std::array center = {0.0, 0.0, 0.0}; size_t num_valid = 0; for (size_t j = 0; j < num_joints; ++j) { float score = pose_3d[j][3]; if (score > min_score) { center[0] += pose_3d[j][0]; center[1] += pose_3d[j][1]; center[2] += pose_3d[j][2]; ++num_valid; } } if (num_valid > 0) { float inv = 1.0 / num_valid; center[0] *= inv; center[1] *= inv; center[2] *= inv; } centers[i] = center; } // Calculate Groups // defined as a tuple of center, pose, and all-pairs-indices of members std::vector, std::vector>, std::vector>> groups; std::vector> per_group_visible_counts; for (size_t i = 0; i < num_pairs; ++i) { const auto &pose_3d = all_scored_poses[i].first; size_t num_joints = pose_3d.size(); const std::array ¢er = centers[i]; float best_dist = std::numeric_limits::infinity(); int best_group = -1; for (size_t j = 0; j < groups.size(); ++j) { auto &group = groups[j]; std::array &group_center = std::get<0>(group); // Check if the center is close enough float dx = group_center[0] - center[0]; float dy = group_center[1] - center[1]; float dz = group_center[2] - center[2]; float center_dist_sq = dx * dx + dy * dy + dz * dz; if (center_dist_sq < max_center_dist_sq) { const auto &group_pose = std::get<1>(group); // Calculate average joint distance std::vector dists; for (size_t row = 0; row < num_joints; ++row) { float score1 = pose_3d[row][3]; float score2 = group_pose[row][3]; if (score1 > min_score && score2 > min_score) { float dx = pose_3d[row][0] - group_pose[row][0]; float dy = pose_3d[row][1] - group_pose[row][1]; float dz = pose_3d[row][2] - group_pose[row][2]; float dist = std::sqrt(dx * dx + dy * dy + dz * dz); dists.push_back(dist); } } if (dists.size() >= 5) { // Drop highest value to reduce influence of outliers auto max_it = std::max_element(dists.begin(), dists.end()); dists.erase(max_it); } if (dists.size() > 0) { // Check if the average joint distance is close enough float avg_dist = std::accumulate(dists.begin(), dists.end(), 0.0); avg_dist /= static_cast(dists.size()); if (avg_dist < max_joint_avg_dist && avg_dist < best_dist) { best_dist = avg_dist; best_group = static_cast(j); } } } } if (best_group == -1) { // Create a new group std::vector new_indices{static_cast(i)}; std::vector> group_pose = pose_3d; groups.emplace_back(center, group_pose, new_indices); // Update per joint counts std::vector new_valid_joint_counts(num_joints, 0); for (size_t row = 0; row < num_joints; ++row) { if (pose_3d[row][3] > min_score) { new_valid_joint_counts[row] = 1; } } per_group_visible_counts.push_back(std::move(new_valid_joint_counts)); } else { // Update existing group auto &group = groups[best_group]; std::array &group_center = std::get<0>(group); std::vector> &group_pose = std::get<1>(group); std::vector &group_indices = std::get<2>(group); float n_elems = static_cast(group_indices.size()); float inv_n = 1.0 / (n_elems + 1.0); // Update group center group_center[0] = (group_center[0] * n_elems + center[0]) * inv_n; group_center[1] = (group_center[1] * n_elems + center[1]) * inv_n; group_center[2] = (group_center[2] * n_elems + center[2]) * inv_n; // Update group pose for (size_t row = 0; row < num_joints; ++row) { if (pose_3d[row][3] > min_score) { float j_elems = static_cast(per_group_visible_counts[best_group][row]); float inv_j = 1.0 / (j_elems + 1.0); group_pose[row][0] = (group_pose[row][0] * j_elems + pose_3d[row][0]) * inv_j; group_pose[row][1] = (group_pose[row][1] * j_elems + pose_3d[row][1]) * inv_j; group_pose[row][2] = (group_pose[row][2] * j_elems + pose_3d[row][2]) * inv_j; group_pose[row][3] = (group_pose[row][3] * j_elems + pose_3d[row][3]) * inv_j; per_group_visible_counts[best_group][row]++; } } group_indices.push_back(static_cast(i)); } } // Merge close groups // Depending on the inital group creation, one or more groups can be created that in the end // share the same persons, even if they had a larger distance at the beginning // So merge them similar to the group assignment before std::vector, std::vector>, std::vector>> merged_groups; for (size_t i = 0; i < groups.size(); ++i) { size_t num_joints = std::get<1>(groups[i]).size(); auto &group = groups[i]; auto &group_visible_counts = per_group_visible_counts[i]; float best_dist = std::numeric_limits::infinity(); int best_group = -1; for (size_t j = 0; j < merged_groups.size(); ++j) { const auto &group_pose = std::get<1>(group); const auto &merged_pose = std::get<1>(merged_groups[j]); // Calculate average joint distance std::vector dists; for (size_t row = 0; row < num_joints; ++row) { float score1 = group_pose[row][3]; float score2 = merged_pose[row][3]; if (score1 > min_score && score2 > min_score) { float dx = group_pose[row][0] - merged_pose[row][0]; float dy = group_pose[row][1] - merged_pose[row][1]; float dz = group_pose[row][2] - merged_pose[row][2]; float dist = std::sqrt(dx * dx + dy * dy + dz * dz); dists.push_back(dist); } } if (dists.size() >= 5) { // Drop highest value to reduce influence of outliers auto max_it = std::max_element(dists.begin(), dists.end()); dists.erase(max_it); } if (dists.size() > 0) { // Check if the average joint distance is close enough float avg_dist = std::accumulate(dists.begin(), dists.end(), 0.0); avg_dist /= static_cast(dists.size()); if (avg_dist < max_joint_avg_dist && avg_dist < best_dist) { best_dist = avg_dist; best_group = static_cast(j); } } } if (best_group == -1) { // Create a new group merged_groups.push_back(group); } else { // Update existing group auto &merged_group = merged_groups[best_group]; std::array &merged_center = std::get<0>(merged_group); std::array &group_center = std::get<0>(group); std::vector &merged_group_indices = std::get<2>(merged_group); float n_elems1 = static_cast(merged_group_indices.size()); float n_elems2 = static_cast(std::get<2>(group).size()); float inv1 = n_elems1 / (n_elems1 + n_elems2); float inv2 = n_elems2 / (n_elems1 + n_elems2); // Update group center merged_center[0] = merged_center[0] * inv1 + group_center[0] * inv2; merged_center[1] = merged_center[1] * inv1 + group_center[1] * inv2; merged_center[2] = merged_center[2] * inv1 + group_center[2] * inv2; // Update group pose for (size_t row = 0; row < num_joints; ++row) { auto &group_pose = std::get<1>(group); auto &merged_pose = std::get<1>(merged_group); if (group_pose[row][3] > min_score) { float j_elems1 = static_cast(group_visible_counts[row]); float j_elems2 = static_cast(per_group_visible_counts[best_group][row]); float inv1 = j_elems1 / (j_elems1 + j_elems2); float inv2 = j_elems2 / (j_elems1 + j_elems2); merged_pose[row][0] = merged_pose[row][0] * inv1 + group_pose[row][0] * inv2; merged_pose[row][1] = merged_pose[row][1] * inv1 + group_pose[row][1] * inv2; merged_pose[row][2] = merged_pose[row][2] * inv1 + group_pose[row][2] * inv2; merged_pose[row][3] = merged_pose[row][3] * inv1 + group_pose[row][3] * inv2; group_visible_counts[row]++; } } // Merge indices merged_group_indices.insert( merged_group_indices.end(), std::get<2>(group).begin(), std::get<2>(group).end()); } } return merged_groups; } // ================================================================================================= std::vector> TriangulatorInternal::merge_group( const std::vector>> &poses_3d, float min_score) { size_t num_poses = poses_3d.size(); size_t num_joints = poses_3d[0].size(); // Merge poses to create initial pose // Use only those triangulations with a high score std::vector> sum_poses(num_joints, {0.0, 0.0, 0.0, 0.0}); std::vector sum_mask1(num_joints, 0); for (size_t i = 0; i < num_poses; ++i) { const auto &pose = poses_3d[i]; for (size_t j = 0; j < num_joints; ++j) { float score = pose[j][3]; if (score > min_score) { sum_poses[j][0] += pose[j][0]; sum_poses[j][1] += pose[j][1]; sum_poses[j][2] += pose[j][2]; sum_poses[j][3] += score; sum_mask1[j]++; } } } std::vector> initial_pose_3d(num_joints, {0.0, 0.0, 0.0, 0.0}); for (size_t j = 0; j < num_joints; ++j) { if (sum_mask1[j] > 0) { float inv = 1.0 / sum_mask1[j]; initial_pose_3d[j][0] = sum_poses[j][0] * inv; initial_pose_3d[j][1] = sum_poses[j][1] * inv; initial_pose_3d[j][2] = sum_poses[j][2] * inv; initial_pose_3d[j][3] = sum_poses[j][3] * inv; } } // Use center as default if the initial pose is empty std::vector jmask(num_joints, false); std::array center = {0.0, 0.0, 0.0}; int valid_joints = 0; for (size_t j = 0; j < num_joints; ++j) { float score = initial_pose_3d[j][3]; if (score > min_score) { jmask[j] = true; center[0] += initial_pose_3d[j][0]; center[1] += initial_pose_3d[j][1]; center[2] += initial_pose_3d[j][2]; valid_joints++; } } if (valid_joints > 0) { float inv_valid = 1.0 / valid_joints; center[0] *= inv_valid; center[1] *= inv_valid; center[2] *= inv_valid; } for (size_t j = 0; j < num_joints; ++j) { if (!jmask[j]) { initial_pose_3d[j][0] = center[0]; initial_pose_3d[j][1] = center[1]; initial_pose_3d[j][2] = center[2]; initial_pose_3d[j][3] = 0.0; } } // Drop joints with low scores and filter outlying joints using distance threshold float offset = 0.1; float max_dist_sq = 1.2 * 1.2; std::vector> mask(num_poses, std::vector(num_joints, false)); std::vector> distances(num_poses, std::vector(num_joints, 0.0)); for (size_t i = 0; i < num_poses; ++i) { const auto &pose = poses_3d[i]; for (size_t j = 0; j < num_joints; ++j) { float dx = pose[j][0] - initial_pose_3d[j][0]; float dy = pose[j][1] - initial_pose_3d[j][1]; float dz = pose[j][2] - initial_pose_3d[j][2]; float dist_sq = dx * dx + dy * dy + dz * dz; distances[i][j] = dist_sq; float score = pose[j][3]; if (dist_sq <= max_dist_sq && score > (min_score - offset)) { mask[i][j] = true; } } } // Select the best-k proposals for each joint that are closest to the initial pose int keep_best = std::max(3, (int)num_poses / 3); std::vector> best_k_mask(num_poses, std::vector(num_joints, false)); for (size_t j = 0; j < num_joints; ++j) { std::vector> valid_indices; valid_indices.reserve(num_poses); for (size_t i = 0; i < num_poses; ++i) { if (mask[i][j]) { valid_indices.emplace_back(distances[i][j], i); } } int num_valid = std::min(keep_best, static_cast(valid_indices.size())); if (num_valid > 0) { std::partial_sort(valid_indices.begin(), valid_indices.begin() + num_valid, valid_indices.end()); for (int k = 0; k < num_valid; ++k) { int idx = valid_indices[k].second; best_k_mask[idx][j] = true; } } } // Combine masks for (size_t i = 0; i < num_poses; ++i) { for (size_t j = 0; j < num_joints; ++j) { mask[i][j] = mask[i][j] && best_k_mask[i][j]; } } // Compute the final pose std::vector> sum_poses2(num_joints, {0.0, 0.0, 0.0, 0.0}); std::vector sum_mask2(num_joints, 0); for (size_t i = 0; i < num_poses; ++i) { const auto &pose = poses_3d[i]; for (size_t j = 0; j < num_joints; ++j) { if (mask[i][j]) { sum_poses2[j][0] += pose[j][0]; sum_poses2[j][1] += pose[j][1]; sum_poses2[j][2] += pose[j][2]; sum_poses2[j][3] += pose[j][3]; sum_mask2[j]++; } } } std::vector> final_pose_3d(num_joints, {0.0, 0.0, 0.0, 0.0}); for (size_t j = 0; j < num_joints; ++j) { if (sum_mask2[j] > 0) { float inv = 1.0 / sum_mask2[j]; final_pose_3d[j][0] = sum_poses2[j][0] * inv; final_pose_3d[j][1] = sum_poses2[j][1] * inv; final_pose_3d[j][2] = sum_poses2[j][2] * inv; final_pose_3d[j][3] = sum_poses2[j][3] * inv; } } return final_pose_3d; } // ================================================================================================= void TriangulatorInternal::add_extra_joints( std::vector>> &poses, const std::vector &joint_names) { // Find indices of "head", "ear_left", and "ear_right" auto it_head = std::find(joint_names.begin(), joint_names.end(), "head"); auto it_ear_left = std::find(joint_names.begin(), joint_names.end(), "ear_left"); auto it_ear_right = std::find(joint_names.begin(), joint_names.end(), "ear_right"); int idx_h = std::distance(joint_names.begin(), it_head); int idx_el = std::distance(joint_names.begin(), it_ear_left); int idx_er = std::distance(joint_names.begin(), it_ear_right); // If the confidence score of "head" is zero compute it as the average of the ears for (size_t i = 0; i < poses.size(); ++i) { auto &pose = poses[i]; if (pose[idx_h][3] == 0.0) { if (pose[idx_el][3] > 0.0 && pose[idx_er][3] > 0.0) { pose[idx_h][0] = (pose[idx_el][0] + pose[idx_er][0]) * 0.5; pose[idx_h][1] = (pose[idx_el][1] + pose[idx_er][1]) * 0.5; pose[idx_h][2] = (pose[idx_el][2] + pose[idx_er][2]) * 0.5; pose[idx_h][3] = std::min(pose[idx_el][3], pose[idx_er][3]); } } } } // ================================================================================================= void TriangulatorInternal::filter_poses( std::vector>> &poses, const std::array, 2> &roomparams, const std::vector &core_joint_idx, const std::vector> &core_limbs_idx) { const float min_score = 0.1; std::vector drop_indices; for (size_t i = 0; i < poses.size(); ++i) { auto &pose = poses[i]; size_t num_joints = pose.size(); // Collect valid joint indices std::vector valid_joint_idx; for (size_t j = 0; j < num_joints; ++j) { if (pose[j][3] > min_score) { valid_joint_idx.push_back(j); } } // Drop poses with too few joints if (valid_joint_idx.size() < 5) { drop_indices.push_back(i); continue; } // Compute min, max and mean coordinates std::array mean = {0.0, 0.0, 0.0}; std::array mins = { std::numeric_limits::max(), std::numeric_limits::max(), std::numeric_limits::max()}; std::array maxs = { std::numeric_limits::lowest(), std::numeric_limits::lowest(), std::numeric_limits::lowest()}; for (size_t j = 0; j < valid_joint_idx.size(); ++j) { size_t idx = valid_joint_idx[j]; for (size_t k = 0; k < 3; ++k) { mins[k] = std::min(mins[k], pose[idx][k]); maxs[k] = std::max(maxs[k], pose[idx][k]); mean[k] += pose[idx][k]; } } for (size_t k = 0; k < 3; ++k) { mean[k] /= static_cast(valid_joint_idx.size()); } // Drop poses that are too large or too small or too flat float min_flatness = 0.2; float max_size = 2.3; float min_size = 0.3; std::array diff; for (int j = 0; j < 3; ++j) { diff[j] = maxs[j] - mins[j]; } if (diff[0] > max_size || diff[1] > max_size || diff[2] > max_size) { drop_indices.push_back(i); continue; } if (diff[0] < min_size && diff[1] < min_size && diff[2] < min_size) { drop_indices.push_back(i); continue; } if ((diff[0] < min_flatness && diff[1] < min_flatness) || (diff[1] < min_flatness && diff[2] < min_flatness) || (diff[2] < min_flatness && diff[0] < min_flatness)) { drop_indices.push_back(i); continue; } // Drop persons outside room const float wdist = 0.1; const auto &room_size = roomparams[0]; const auto &room_center = roomparams[1]; const std::array room_half_size = { room_size[0] / 2.0f, room_size[1] / 2.0f, room_size[2] / 2.0f}; bool outside = false; for (int j = 0; j < 3; ++j) { // Check if the mean position is outside the room boundaries if (mean[j] > room_half_size[j] + room_center[j] || mean[j] < -room_half_size[j] + room_center[j]) { outside = true; break; } } for (int j = 0; j < 3; ++j) { // Check if any limb is too far outside the room if (maxs[j] > room_half_size[j] + room_center[j] + wdist || mins[j] < -room_half_size[j] + room_center[j] - wdist) { outside = true; break; } } if (outside) { drop_indices.push_back(i); continue; } // Calculate total limb length and average limb length const float max_avg_length = 0.5; const float min_avg_length = 0.1; float total_length = 0.0; int total_limbs = 0; for (size_t j = 0; j < core_limbs_idx.size(); ++j) { size_t start_idx = core_joint_idx[core_limbs_idx[j][0]]; size_t end_idx = core_joint_idx[core_limbs_idx[j][1]]; if (pose[start_idx][3] < min_score || pose[end_idx][3] < min_score) { continue; } float dx = pose[end_idx][0] - pose[start_idx][0]; float dy = pose[end_idx][1] - pose[start_idx][1]; float dz = pose[end_idx][2] - pose[start_idx][2]; float length = std::sqrt(dx * dx + dy * dy + dz * dz); total_length += length; total_limbs++; } if (total_limbs == 0) { drop_indices.push_back(i); continue; } float average_length = total_length / static_cast(total_limbs); if (average_length < min_avg_length) { drop_indices.push_back(i); continue; } if (total_limbs >= 5 && average_length > max_avg_length) { drop_indices.push_back(i); continue; } } // Set confidences of invalid poses to a low value for (size_t i = 0; i < drop_indices.size(); ++i) { auto &pose = poses[drop_indices[i]]; for (size_t j = 0; j < pose.size(); ++j) { pose[j][3] = 0.001; } } } // ================================================================================================= void TriangulatorInternal::add_missing_joints( std::vector>> &poses, const std::vector &joint_names) { // Map joint names to their indices for quick lookup std::unordered_map joint_name_to_idx; for (size_t idx = 0; idx < joint_names.size(); ++idx) { joint_name_to_idx[joint_names[idx]] = idx; } // Adjacent mapping std::unordered_map> adjacents = { {"hip_right", {"hip_middle", "hip_left"}}, {"hip_left", {"hip_middle", "hip_right"}}, {"knee_right", {"hip_right", "knee_left"}}, {"knee_left", {"hip_left", "knee_right"}}, {"ankle_right", {"knee_right", "ankle_left"}}, {"ankle_left", {"knee_left", "ankle_right"}}, {"shoulder_right", {"shoulder_middle", "shoulder_left"}}, {"shoulder_left", {"shoulder_middle", "shoulder_right"}}, {"elbow_right", {"shoulder_right", "hip_right"}}, {"elbow_left", {"shoulder_left", "hip_left"}}, {"wrist_right", {"elbow_right"}}, {"wrist_left", {"elbow_left"}}, {"nose", {"shoulder_middle", "shoulder_right", "shoulder_left"}}, {"head", {"shoulder_middle", "shoulder_right", "shoulder_left"}}, {"foot_*_left_*", {"ankle_left"}}, {"foot_*_right_*", {"ankle_right"}}, {"face_*", {"nose"}}, {"hand_*_left_*", {"wrist_left"}}, {"hand_*_right_*", {"wrist_right"}}}; for (size_t i = 0; i < poses.size(); ++i) { auto &pose = poses[i]; size_t num_joints = pose.size(); // Collect valid joint indices std::vector valid_joint_idx; for (size_t j = 0; j < num_joints; ++j) { if (pose[j][3] > min_match_score) { valid_joint_idx.push_back(j); } } if (valid_joint_idx.empty()) { continue; } // Compute body center as the mean of valid joints std::array body_center = {0.0, 0.0, 0.0}; for (size_t idx : valid_joint_idx) { body_center[0] += pose[idx][0]; body_center[1] += pose[idx][1]; body_center[2] += pose[idx][2]; } for (int j = 0; j < 3; ++j) { body_center[j] /= static_cast(valid_joint_idx.size()); } // Iterate over each joint for (size_t j = 0; j < joint_names.size(); ++j) { std::string adname = ""; const std::string &jname = joint_names[j]; // Determine adjacent joint based on name patterns if (jname.substr(0, 5) == "foot_" && jname.find("_left") != std::string::npos) { adname = "foot_*_left_*"; } else if (jname.substr(0, 5) == "foot_" && jname.find("_right") != std::string::npos) { adname = "foot_*_right_*"; } else if (jname.substr(0, 5) == "face_") { adname = "face_*"; } else if (jname.substr(0, 5) == "hand_" && jname.find("_left") != std::string::npos) { adname = "hand_*_left_*"; } else if (jname.substr(0, 5) == "hand_" && jname.find("_right") != std::string::npos) { adname = "hand_*_right_*"; } else if (adjacents.find(jname) != adjacents.end()) { adname = jname; } if (adname == "") { // No adjacent joints defined for this joint continue; } if (pose[j][3] == 0.0) { if (adjacents.find(adname) != adjacents.end()) { // Joint is missing; attempt to estimate its position based on adjacent joints std::array adjacent_position = {0.0, 0.0, 0.0}; size_t adjacent_count = 0; const std::vector &adjacent_joint_names = adjacents[adname]; for (const std::string &adj_name : adjacent_joint_names) { auto it = joint_name_to_idx.find(adj_name); if (it != joint_name_to_idx.end()) { size_t adj_idx = it->second; if (adj_idx < pose.size() && pose[adj_idx][3] > 0.1) { adjacent_position[0] += pose[adj_idx][0]; adjacent_position[1] += pose[adj_idx][1]; adjacent_position[2] += pose[adj_idx][2]; ++adjacent_count; } } } if (adjacent_count > 0) { for (size_t k = 0; k < 3; ++k) { adjacent_position[k] /= static_cast(adjacent_count); } // Update the missing joint's position pose[j][0] = adjacent_position[0]; pose[j][1] = adjacent_position[1]; pose[j][2] = adjacent_position[2]; } else { // No valid adjacent joints, use body center pose[j][0] = body_center[0]; pose[j][1] = body_center[1]; pose[j][2] = body_center[2]; } } else { // Adjacent joints not defined, use body center pose[j][0] = body_center[0]; pose[j][1] = body_center[1]; pose[j][2] = body_center[2]; } // Set a low confidence score pose[j][3] = 0.1; } } } }