RapidPoseTriangulation/rpt/triangulator.cpp

#include <algorithm>
#include <chrono>
#include <cmath>
#include <iomanip>
#include <map>
#include <numeric>
#include <unordered_map>

#include "camera.hpp"
#include "triangulator.hpp"

// =================================================================================================
// =================================================================================================

[[maybe_unused]] static void print_2d_poses(const std::vector<std::array<float, 3>> &poses)
{
    std::cout << "Poses: (" << poses.size() << ", 3)[" << std::endl;
    for (const auto &pose : poses)
    {
        std::cout << "  [";
        for (size_t i = 0; i < 3; ++i)
        {
            std::cout << std::fixed << std::setprecision(3) << pose[i];
            if (i < 2)
            {
                std::cout << ", ";
            }
        }
        std::cout << "]" << std::endl;
    }
    std::cout << "]" << std::endl;
}

// =================================================================================================

[[maybe_unused]] static void print_3d_poses(const std::vector<std::array<float, 4>> &poses)
{
    std::cout << "Poses: (" << poses.size() << ", 4)[" << std::endl;
    for (const auto &pose : poses)
    {
        std::cout << "  [";
        for (size_t i = 0; i < 4; ++i)
        {
            std::cout << std::fixed << std::setprecision(3) << pose[i];
            if (i < 3)
            {
                std::cout << ", ";
            }
        }
        std::cout << "]" << std::endl;
    }
    std::cout << "]" << std::endl;
}

// =================================================================================================

[[maybe_unused]] static void print_allpairs(
    const std::vector<std::pair<std::tuple<int, int, int, int>, std::pair<int, int>>> &all_pairs)
{
    std::cout << "All pairs: [" << std::endl;
    for (const auto &pair : all_pairs)
    {
        const auto &tuple_part = pair.first;
        const auto &pair_part = pair.second;

        // Print the tuple part
        std::cout << "("
                  << std::get<0>(tuple_part) << ", "
                  << std::get<1>(tuple_part) << ", "
                  << std::get<2>(tuple_part) << ", "
                  << std::get<3>(tuple_part) << ")";

        // Print the pair part
        std::cout << ", ("
                  << pair_part.first << ", "
                  << pair_part.second << ")"
                  << std::endl;
    }
    std::cout << "]" << std::endl;
}

// =================================================================================================
// =================================================================================================

CameraInternal::CameraInternal(const Camera &cam)
{
    this->cam = cam;
    this->invR = transpose3x3(cam.R);

    // Camera center:
    // C = -(Rᵀ * t) = -(Rᵀ * (R * (T * -1))) = -(Rᵀ * (R * -T)) = -(Rᵀ * -R * T) = -(-T) = T
    this->center = {cam.T[0][0], cam.T[1][0], cam.T[2][0]};

    // Undistort camera matrix
    // As with the undistortion, the own implementation avoids some overhead compared to OpenCV
    if (cam.type == "fisheye")
    {
        newK = calc_optimal_camera_matrix_fisheye(1.0, {cam.width, cam.height});
    }
    else
    {
        newK = calc_optimal_camera_matrix_pinhole(1.0, {cam.width, cam.height});
    }
    this->invK = invert3x3(newK);
}

// =================================================================================================

std::array<std::array<float, 3>, 3> CameraInternal::transpose3x3(
    const std::array<std::array<float, 3>, 3> &M)
{
    return {{{M[0][0], M[1][0], M[2][0]},
             {M[0][1], M[1][1], M[2][1]},
             {M[0][2], M[1][2], M[2][2]}}};
}

// =================================================================================================

std::array<std::array<float, 3>, 3> CameraInternal::invert3x3(
    const std::array<std::array<float, 3>, 3> &M)
{
    // Compute the inverse using the adjugate method
    // See: https://scicomp.stackexchange.com/a/29206

    std::array<std::array<float, 3>, 3> adj = {
        {{
             M[1][1] * M[2][2] - M[1][2] * M[2][1],
             M[0][2] * M[2][1] - M[0][1] * M[2][2],
             M[0][1] * M[1][2] - M[0][2] * M[1][1],
         },
         {
             M[1][2] * M[2][0] - M[1][0] * M[2][2],
             M[0][0] * M[2][2] - M[0][2] * M[2][0],
             M[0][2] * M[1][0] - M[0][0] * M[1][2],
         },
         {
             M[1][0] * M[2][1] - M[1][1] * M[2][0],
             M[0][1] * M[2][0] - M[0][0] * M[2][1],
             M[0][0] * M[1][1] - M[0][1] * M[1][0],
         }}};

    float det = M[0][0] * adj[0][0] + M[0][1] * adj[1][0] + M[0][2] * adj[2][0];
    if (std::fabs(det) < 1e-6f)
    {
        return {{{0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}, {0.0, 0.0, 0.0}}};
    }

    float idet = 1.0f / det;
    std::array<std::array<float, 3>, 3> inv = {
        {{
             adj[0][0] * idet,
             adj[0][1] * idet,
             adj[0][2] * idet,
         },
         {
             adj[1][0] * idet,
             adj[1][1] * idet,
             adj[1][2] * idet,
         },
         {
             adj[2][0] * idet,
             adj[2][1] * idet,
             adj[2][2] * idet,
         }}};

    return inv;
}

// =================================================================================================

void CameraInternal::undistort_point_pinhole(std::array<float, 3> &p, const std::vector<float> &k)
{
    // Following: cv::cvUndistortPointsInternal
    // Uses only the distortion coefficients: [k1, k2, p1, p2, k3]
    // https://github.com/opencv/opencv/blob/4.x/modules/calib3d/src/undistort.dispatch.cpp#L432

    float x0 = p[0];
    float y0 = p[1];
    float x = x0;
    float y = y0;

    // Iteratively refine the estimate for the undistorted point.
    int max_iterations = 5;
    for (int iter = 0; iter < max_iterations; ++iter)
    {
        float r2 = x * x + y * y;
        double icdist = 1.0 / (1 + ((k[4] * r2 + k[1]) * r2 + k[0]) * r2);
        if (icdist < 0)
        {
            x = x0;
            y = y0;
            break;
        }

        float deltaX = 2 * k[2] * x * y + k[3] * (r2 + 2 * x * x);
        float deltaY = k[2] * (r2 + 2 * y * y) + 2 * k[3] * x * y;
        x = (x0 - deltaX) * icdist;
        y = (y0 - deltaY) * icdist;
    }

    p[0] = x;
    p[1] = y;
}

// =================================================================================================

void CameraInternal::undistort_point_fisheye(std::array<float, 3> &p, const std::vector<float> &k)
{
    // Following: cv::fisheye::undistortPoints
    // Uses only the distortion coefficients: [k1, k2, k3, k4]
    // https://github.com/opencv/opencv/blob/4.x/modules/calib3d/src/fisheye.cpp#L429

    float theta_d = std::sqrt(p[0] * p[0] + p[1] * p[1]);
    float pi_half = std::numbers::pi * 0.5;
    theta_d = std::min(std::max(-pi_half, theta_d), pi_half);

    if (theta_d < 1e-6)
    {
        return;
    }

    float scale = 0.0;
    float theta = theta_d;

    int max_iterations = 5;
    for (int iter = 0; iter < max_iterations; ++iter)
    {
        float theta2 = theta * theta;
        float theta4 = theta2 * theta2;
        float theta6 = theta4 * theta2;
        float theta8 = theta4 * theta4;

        float k0_theta2 = k[0] * theta2;
        float k1_theta4 = k[1] * theta4;
        float k2_theta6 = k[2] * theta6;
        float k3_theta8 = k[3] * theta8;

        float theta_fix = (theta * (1 + k0_theta2 + k1_theta4 + k2_theta6 + k3_theta8) - theta_d) /
                          (1 + 3 * k0_theta2 + 5 * k1_theta4 + 7 * k2_theta6 + 9 * k3_theta8);
        theta = theta - theta_fix;

        if (std::fabs(theta_fix) < 1e-6)
        {
            break;
        }
    }

    scale = std::tan(theta) / theta_d;
    p[0] *= scale;
    p[1] *= scale;
}

// =================================================================================================

std::array<std::array<float, 3>, 3> CameraInternal::calc_optimal_camera_matrix_fisheye(
    float balance, std::pair<int, int> new_size)
{
    // Following: cv::fisheye::estimateNewCameraMatrixForUndistortRectify
    // https://github.com/opencv/opencv/blob/4.x/modules/calib3d/src/fisheye.cpp#L630

    float fov_scale = 1.0;
    float w = static_cast<float>(cam.width);
    float h = static_cast<float>(cam.height);
    balance = std::min(std::max(balance, 0.0f), 1.0f);

    // Define four key points at the middle of each edge
    std::vector<std ::array<float, 2>> pts = {
        {w * 0.5f, 0.0},
        {w, h * 0.5f},
        {w * 0.5f, h},
        {0.0, h * 0.5f}};

    // Extract intrinsic parameters
    float fx = cam.K[0][0];
    float fy = cam.K[1][1];
    float cx = cam.K[0][2];
    float cy = cam.K[1][2];

    // Undistort the edge points
    for (auto &pt : pts)
    {
        std::array<float, 3> p_normed = {(pt[0] - cx) / fx, (pt[1] - cy) / fy, 1.0};
        undistort_point_fisheye(p_normed, cam.DC);
        pt[0] = p_normed[0];
        pt[1] = p_normed[1];
    }

    // Compute center mass of the undistorted edge points
    float sum_x = 0.0, sum_y = 0.0;
    for (const auto &pt : pts)
    {
        sum_x += pt[0];
        sum_y += pt[1];
    }
    float cn_x = sum_x / pts.size();
    float cn_y = sum_y / pts.size();

    // Convert to identity ratio
    float aspect_ratio = fx / fy;
    cn_y *= aspect_ratio;
    for (auto &pt : pts)
        pt[1] *= aspect_ratio;

    // Find the bounding box of the undistorted points
    float minx = std::numeric_limits<float>::max();
    float miny = std::numeric_limits<float>::max();
    float maxx = -std::numeric_limits<float>::max();
    float maxy = -std::numeric_limits<float>::max();
    for (const auto &pt : pts)
    {
        minx = std::min(minx, pt[0]);
        maxx = std::max(maxx, pt[0]);
        miny = std::min(miny, pt[1]);
        maxy = std::max(maxy, pt[1]);
    }

    // Calculate candidate focal lengths
    float f1 = (w * 0.5) / (cn_x - minx);
    float f2 = (w * 0.5) / (maxx - cn_x);
    float f3 = (h * 0.5 * aspect_ratio) / (cn_y - miny);
    float f4 = (h * 0.5 * aspect_ratio) / (maxy - cn_y);
    float fmin = std::min({f1, f2, f3, f4});
    float fmax = std::max({f1, f2, f3, f4});

    // Blend the candidate focal lengths
    float f_val = balance * fmin + (1.0f - balance) * fmax;
    if (fov_scale > 0.0f)
        f_val /= fov_scale;

    // Compute new intrinsic parameters
    float new_fx = f_val;
    float new_fy = f_val;
    float new_cx = -cn_x * f_val + (w * 0.5);
    float new_cy = -cn_y * f_val + (h * aspect_ratio * 0.5);

    // Restore aspect ratio
    new_fy /= aspect_ratio;
    new_cy /= aspect_ratio;

    // Optionally scale parameters to new new image size
    if (new_size.first > 0 && new_size.second > 0)
    {
        float rx = static_cast<float>(new_size.first) / w;
        float ry = static_cast<float>(new_size.second) / h;
        new_fx *= rx;
        new_fy *= ry;
        new_cx *= rx;
        new_cy *= ry;
    }

    std::array<std::array<float, 3>, 3> newK = {
        {{new_fx, 0.0, new_cx},
         {0.0, new_fy, new_cy},
         {0.0, 0.0, 1.0}}};
    return newK;
}

// =================================================================================================

std::array<std::array<float, 3>, 3> CameraInternal::calc_optimal_camera_matrix_pinhole(
    float alpha, std::pair<int, int> new_size)
{
    // Following: cv::getOptimalNewCameraMatrix
    // https://github.com/opencv/opencv/blob/4.x/modules/calib3d/src/calibration_base.cpp#L1565

    bool center_principal_point = false;
    bool use_pix_roi = false;
    float w = static_cast<float>(cam.width);
    float h = static_cast<float>(cam.height);
    alpha = std::min(std::max(alpha, 0.0f), 1.0f);

    if (center_principal_point || use_pix_roi)
    {
        // Not implemented
        exit(1);
    }

    // Define key points
    // Calculate only the contour points of the image, and use less points,
    // the edges and centers should be enough if the camera has no strange distortions
    const size_t N = 3;
    std::vector<std ::array<float, 2>> pts;
    pts.reserve(4 * (N - 1));
    for (size_t y = 0; y < N; ++y)
    {
        for (size_t x = 0; x < N; ++x)
        {
            if (x != 0 && x != N - 1 && y != 0 && y != N - 1)
            {
                continue;
            }
            pts.push_back({x * (w - 1) / (N - 1), y * (h - 1) / (N - 1)});
        }
    }

    // Extract intrinsic parameters
    float fx = cam.K[0][0];
    float fy = cam.K[1][1];
    float cx = cam.K[0][2];
    float cy = cam.K[1][2];

    // Undistort the key points
    for (auto &pt : pts)
    {
        std::array<float, 3> p_normed = {(pt[0] - cx) / fx, (pt[1] - cy) / fy, 1.0};
        undistort_point_pinhole(p_normed, cam.DC);
        pt[0] = p_normed[0];
        pt[1] = p_normed[1];
    }

    // Get inscribed and circumscribed rectangles in normalized coordinates
    float iX0 = -std::numeric_limits<float>::max();
    float iX1 = std::numeric_limits<float>::max();
    float iY0 = -std::numeric_limits<float>::max();
    float iY1 = std::numeric_limits<float>::max();
    float oX0 = std::numeric_limits<float>::max();
    float oX1 = -std::numeric_limits<float>::max();
    float oY0 = std::numeric_limits<float>::max();
    float oY1 = -std::numeric_limits<float>::max();
    size_t k = 0;
    for (size_t y = 0; y < N; ++y)
    {
        for (size_t x = 0; x < N; ++x)
        {
            if (x != 0 && x != N - 1 && y != 0 && y != N - 1)
            {
                continue;
            }

            auto &pt = pts[k];
            k += 1;

            if (x == 0)
            {
                oX0 = std::min(oX0, pt[0]);
                iX0 = std::max(iX0, pt[0]);
            }
            if (x == N - 1)
            {
                oX1 = std::max(oX1, pt[0]);
                iX1 = std::min(iX1, pt[0]);
            }
            if (y == 0)
            {
                oY0 = std::min(oY0, pt[1]);
                iY0 = std::max(iY0, pt[1]);
            }
            if (y == N - 1)
            {
                oY1 = std::max(oY1, pt[1]);
                iY1 = std::min(iY1, pt[1]);
            }
        }
    }
    float inner_width = iX1 - iX0;
    float inner_height = iY1 - iY0;
    float outer_width = oX1 - oX0;
    float outer_height = oY1 - oY0;

    // Projection mapping inner rectangle to viewport
    float fx0 = (new_size.first - 1) / inner_width;
    float fy0 = (new_size.second - 1) / inner_height;
    float cx0 = -fx0 * iX0;
    float cy0 = -fy0 * iY0;

    // Projection mapping outer rectangle to viewport
    float fx1 = (new_size.first - 1) / outer_width;
    float fy1 = (new_size.second - 1) / outer_height;
    float cx1 = -fx1 * oX0;
    float cy1 = -fy1 * oY0;

    // Interpolate between the two optimal projections
    float new_fx = fx0 * (1 - alpha) + fx1 * alpha;
    float new_fy = fy0 * (1 - alpha) + fy1 * alpha;
    float new_cx = cx0 * (1 - alpha) + cx1 * alpha;
    float new_cy = cy0 * (1 - alpha) + cy1 * alpha;

    std::array<std::array<float, 3>, 3> newK = {
        {{new_fx, 0.0, new_cx},
         {0.0, new_fy, new_cy},
         {0.0, 0.0, 1.0}}};
    return newK;
}

// =================================================================================================
// =================================================================================================

TriangulatorInternal::TriangulatorInternal(float min_match_score, size_t min_group_size)
{
    this->min_match_score = min_match_score;
    this->min_group_size = min_group_size;
}

// =================================================================================================

std::vector<std::vector<std::array<float, 4>>> TriangulatorInternal::triangulate_poses(
    const std::vector<std::vector<std::vector<std::array<float, 3>>>> &poses_2d,
    const std::vector<Camera> &cameras,
    const std::array<std::array<float, 3>, 2> &roomparams,
    const std::vector<std::string> &joint_names)
{
    auto ttime = std::chrono::steady_clock::now();
    auto stime = std::chrono::steady_clock::now();
    std::chrono::duration<double> elapsed;

    // Check inputs
    if (poses_2d.empty())
    {
        throw std::invalid_argument("No 2D poses provided.");
    }
    if (poses_2d.size() != cameras.size())
    {
        throw std::invalid_argument("Number of cameras and 2D poses must be the same.");
    }
    if (joint_names.size() != poses_2d[0][0].size())
    {
        throw std::invalid_argument("Number of joint names and 2D poses must be the same.");
    }
    for (const auto &joint : core_joints)
    {
        if (std::find(joint_names.begin(), joint_names.end(), joint) == joint_names.end())
        {
            throw std::invalid_argument("Core joint '" + joint + "' not found in joint names.");
        }
    }

    // Convert inputs to internal formats
    std::vector<std::vector<std::vector<std::array<float, 3>>>> i_poses_2d = poses_2d;
    std::vector<CameraInternal> internal_cameras;
    for (size_t i = 0; i < cameras.size(); ++i)
    {
        internal_cameras.emplace_back(cameras[i]);
    }
    std::vector<size_t> core_joint_idx;
    for (const auto &joint : core_joints)
    {
        auto it = std::find(joint_names.begin(), joint_names.end(), joint);
        core_joint_idx.push_back(std::distance(joint_names.begin(), it));
    }
    std::vector<std::array<size_t, 2>> core_limbs_idx;
    for (const auto &limb : core_limbs)
    {
        auto it1 = std::find(core_joints.begin(), core_joints.end(), limb.first);
        auto it2 = std::find(core_joints.begin(), core_joints.end(), limb.second);
        core_limbs_idx.push_back({(size_t)std::distance(core_joints.begin(), it1),
                                  (size_t)std::distance(core_joints.begin(), it2)});
    }

    elapsed = std::chrono::steady_clock::now() - stime;
    init_time += elapsed.count();
    stime = std::chrono::steady_clock::now();

    // Undistort 2D poses
    for (size_t i = 0; i < cameras.size(); ++i)
    {
        undistort_poses(i_poses_2d[i], internal_cameras[i]);
    }

    elapsed = std::chrono::steady_clock::now() - stime;
    undistort_time += elapsed.count();
    stime = std::chrono::steady_clock::now();

    // Project last 3D poses to 2D
    std::vector<std::tuple<std::vector<std::vector<std::array<float, 3>>>,
                           std::vector<std::vector<float>>>>
        last_poses_2d;
    if (!last_poses_3d.empty())
    {
        // Select core joints
        std::vector<std::vector<std::array<float, 4>>> last_core_poses;
        last_core_poses.resize(last_poses_3d.size());
        for (size_t i = 0; i < last_poses_3d.size(); ++i)
        {
            last_core_poses[i].resize(core_joint_idx.size());
            for (size_t j = 0; j < core_joint_idx.size(); ++j)
            {
                for (size_t k = 0; k < 4; ++k)
                {
                    last_core_poses[i][j][k] = last_poses_3d[i][core_joint_idx[j]][k];
                }
            }
        }

        // Project
        last_poses_2d.resize(cameras.size());
        for (size_t i = 0; i < cameras.size(); ++i)
        {
            auto [poses2d, dists] = project_poses(last_core_poses, internal_cameras[i], true);
            last_poses_2d[i] = std::make_tuple(poses2d, dists);
        }
    }

    elapsed = std::chrono::steady_clock::now() - stime;
    project_time += elapsed.count();
    stime = std::chrono::steady_clock::now();

    // Check matches to old poses
    float threshold = min_match_score - 0.2;
    std::map<size_t, std::map<size_t, std::vector<size_t>>> scored_pasts;
    if (!last_poses_3d.empty())
    {
        // Calculate index pairs and initialize vectors
        std::vector<std::array<size_t, 3>> indices_ijk;
        for (size_t i = 0; i < cameras.size(); ++i)
        {
            size_t num_last_persons = std::get<0>(last_poses_2d[i]).size();
            scored_pasts[i] = std::map<size_t, std::vector<size_t>>();

            for (size_t j = 0; j < num_last_persons; ++j)
            {
                size_t num_new_persons = poses_2d[i].size();
                scored_pasts[i][j] = std::vector<size_t>();

                for (size_t k = 0; k < num_new_persons; ++k)
                {
                    indices_ijk.push_back({i, j, k});
                }
            }
        }
        std::vector<std::array<size_t, 2>> indices_ik;
        for (size_t i = 0; i < cameras.size(); ++i)
        {
            size_t num_new_persons = poses_2d[i].size();
            for (size_t k = 0; k < num_new_persons; ++k)
            {
                indices_ik.push_back({i, k});
            }
        }

        // Precalculate core poses
        std::vector<std::vector<std::array<float, 3>>> poses_2d_core_list;
        poses_2d_core_list.resize(indices_ik.size());
        std::vector<std::vector<size_t>> mats_core_map;
        mats_core_map.resize(cameras.size());
        for (size_t i = 0; i < cameras.size(); ++i)
        {
            size_t num_new_persons = poses_2d[i].size();
            for (size_t k = 0; k < num_new_persons; ++k)
            {
                mats_core_map[i].push_back(0);
            }
        }
        for (size_t e = 0; e < indices_ik.size(); ++e)
        {
            const auto [i, k] = indices_ik[e];
            const auto &pose = i_poses_2d[i][k];
            std::vector<std::array<float, 3>> pose_core;
            pose_core.resize(core_joint_idx.size());
            for (size_t j = 0; j < core_joint_idx.size(); ++j)
            {
                size_t idx = core_joint_idx[j];
                pose_core[j] = pose[idx];
            }
            poses_2d_core_list[e] = pose_core;
            mats_core_map[i][k] = e;
        }

        // Calculate matching score
        for (size_t e = 0; e < indices_ijk.size(); ++e)
        {
            const auto [i, j, k] = indices_ijk[e];

            const auto &last_pose = std::get<0>(last_poses_2d[i])[j];
            const auto &last_dist = std::get<1>(last_poses_2d[i])[j];
            const auto &new_pose = poses_2d_core_list[mats_core_map[i][k]];

            float score = calc_pose_score(new_pose, last_pose, last_dist, internal_cameras[i]);
            if (score > threshold)
            {
                {
                    scored_pasts[i][j].push_back(k);
                }
            }
        }
    }

    elapsed = std::chrono::steady_clock::now() - stime;
    match_time += elapsed.count();
    stime = std::chrono::steady_clock::now();

    // Create pairs of persons
    // Checks if the person was already matched to the last frame and if so only creates pairs
    // with those, else it creates all possible pairs
    std::vector<int> num_persons_sum;
    for (size_t i = 0; i < cameras.size(); ++i)
    {
        int nsum = poses_2d[i].size();
        if (i > 0)
        {
            nsum += num_persons_sum[i - 1];
        }
        num_persons_sum.push_back(nsum);
    }
    std::vector<std::pair<std::tuple<int, int, int, int>, std::pair<int, int>>> all_pairs;
    std::vector<std::array<size_t, 4>> indices;
    for (size_t i = 0; i < cameras.size(); ++i)
    {
        for (size_t j = i + 1; j < cameras.size(); ++j)
        {
            for (size_t k = 0; k < poses_2d[i].size(); ++k)
            {
                for (size_t l = 0; l < poses_2d[j].size(); ++l)
                {
                    indices.push_back({i, j, k, l});
                }
            }
        }
    }
    for (size_t e = 0; e < indices.size(); ++e)
    {
        const auto [i, j, k, l] = indices[e];

        int pid1 = num_persons_sum[i] + k;
        int pid2 = num_persons_sum[k] + l;
        bool match = false;

        if (!last_poses_3d.empty())
        {
            for (size_t m = 0; m < last_poses_3d.size(); ++m)
            {
                auto &smi = scored_pasts[i][m];
                auto &smj = scored_pasts[j][m];
                bool in_smi = std::find(smi.begin(), smi.end(), k) != smi.end();
                bool in_smj = std::find(smj.begin(), smj.end(), l) != smj.end();

                if (in_smi && in_smj)
                {
                    match = true;
                    auto item = std::make_pair(
                        std::make_tuple(i, j, k, l), std::make_pair(pid1, pid2));
                    all_pairs.push_back(item);
                }
                else if (in_smi || in_smj)
                {
                    match = true;
                }
            }
        }

        if (!match)
        {
            auto item = std::make_pair(
                std::make_tuple(i, j, k, l), std::make_pair(pid1, pid2));
            all_pairs.push_back(item);
        }
    }

    elapsed = std::chrono::steady_clock::now() - stime;
    pairs_time += elapsed.count();
    stime = std::chrono::steady_clock::now();

    // Calculate pair scores
    std::vector<std::pair<std::vector<std::array<float, 4>>, float>> all_scored_poses;
    all_scored_poses.resize(all_pairs.size());
    for (size_t i = 0; i < all_pairs.size(); ++i)
    {
        const auto &pids = all_pairs[i].first;

        // Extract camera parameters
        const auto &cam1 = internal_cameras[std::get<0>(pids)];
        const auto &cam2 = internal_cameras[std::get<1>(pids)];

        // Extract 2D poses
        const auto &pose1 = i_poses_2d[std::get<0>(pids)][std::get<2>(pids)];
        const auto &pose2 = i_poses_2d[std::get<1>(pids)][std::get<3>(pids)];

        // Select core joints
        std::vector<std::array<float, 3>> pose1_core, pose2_core;
        pose1_core.resize(core_joint_idx.size());
        pose2_core.resize(core_joint_idx.size());
        for (size_t j = 0; j < core_joint_idx.size(); ++j)
        {
            size_t idx = core_joint_idx[j];
            pose1_core[j] = pose1[idx];
            pose2_core[j] = pose2[idx];
        }

        // Triangulate and score
        auto [pose3d, score] = triangulate_and_score(
            pose1_core, pose2_core, cam1, cam2, roomparams, core_limbs_idx);
        all_scored_poses[i] = std::move(std::make_pair(pose3d, score));
    }

    // Drop low scoring poses
    size_t num_poses = all_scored_poses.size();
    for (size_t i = num_poses; i > 0; --i)
    {
        if (all_scored_poses[i - 1].second < min_match_score)
        {
            all_scored_poses.erase(all_scored_poses.begin() + i - 1);
            all_pairs.erase(all_pairs.begin() + i - 1);
        }
    }

    elapsed = std::chrono::steady_clock::now() - stime;
    pair_scoring_time += elapsed.count();
    stime = std::chrono::steady_clock::now();

    // Group pairs that share a person
    std::vector<std::tuple<
        std::array<float, 3>, std::vector<std::array<float, 4>>, std::vector<int>>>
        groups = calc_grouping(all_pairs, all_scored_poses, min_match_score);

    // Drop groups with too few matches
    size_t num_groups = groups.size();
    for (size_t i = num_groups; i > 0; --i)
    {
        if (std::get<2>(groups[i - 1]).size() < this->min_group_size)
        {
            groups.erase(groups.begin() + i - 1);
        }
    }

    elapsed = std::chrono::steady_clock::now() - stime;
    grouping_time += elapsed.count();
    stime = std::chrono::steady_clock::now();

    // Calculate full 3D poses
    std::vector<std::vector<std::array<float, 4>>> all_full_poses;
    all_full_poses.resize(all_pairs.size());
    for (size_t i = 0; i < all_pairs.size(); ++i)
    {
        const auto &pids = all_pairs[i].first;
        const auto &cam1 = internal_cameras[std::get<0>(pids)];
        const auto &cam2 = internal_cameras[std::get<1>(pids)];
        const auto &pose1 = i_poses_2d[std::get<0>(pids)][std::get<2>(pids)];
        const auto &pose2 = i_poses_2d[std::get<1>(pids)][std::get<3>(pids)];

        auto [pose3d, score] = triangulate_and_score(
            pose1, pose2, cam1, cam2, roomparams, {});

        // Scale scores with 2D confidences
        // They can improve the merge weighting, but especially the earlier step of pair-filtering
        // works better if only per-view consistency is used, so they are not included before.
        for (size_t j = 0; j < pose3d.size(); ++j)
        {
            float score1 = pose1[j][2];
            float score2 = pose2[j][2];
            float min_score = 0.1;

            if (score1 > min_score && score2 > min_score)
            {
                float scoreP = (score1 + score2) * 0.5;
                float scoreT = pose3d[j][3];

                // Since the triangulation score is less sensitive and generally higher,
                // weight it stronger to balance the two scores.
                pose3d[j][3] = 0.9 * scoreT + 0.1 * scoreP;
            }
        }

        all_full_poses[i] = std::move(pose3d);
    }

    elapsed = std::chrono::steady_clock::now() - stime;
    full_time += elapsed.count();
    stime = std::chrono::steady_clock::now();

    // Merge groups
    std::vector<std::vector<std::array<float, 4>>> all_merged_poses;
    all_merged_poses.resize(groups.size());
    for (size_t i = 0; i < groups.size(); ++i)
    {
        const auto &group = groups[i];
        std::vector<std::vector<std::array<float, 4>>> poses;
        poses.reserve(std::get<2>(group).size());

        for (const auto &idx : std::get<2>(group))
        {
            poses.push_back(all_full_poses[idx]);
        }

        auto merged_pose = merge_group(poses, min_match_score);
        all_merged_poses[i] = std::move(merged_pose);
    }

    elapsed = std::chrono::steady_clock::now() - stime;
    merge_time += elapsed.count();
    stime = std::chrono::steady_clock::now();

    // Run post-processing steps
    std::vector<std::vector<std::array<float, 4>>> final_poses_3d = std::move(all_merged_poses);
    add_extra_joints(final_poses_3d, joint_names);
    filter_poses(final_poses_3d, roomparams, core_joint_idx, core_limbs_idx);
    add_missing_joints(final_poses_3d, joint_names);

    // Store final result for use in the next frame.
    last_poses_3d = final_poses_3d;

    elapsed = std::chrono::steady_clock::now() - stime;
    post_time += elapsed.count();
    stime = std::chrono::steady_clock::now();

    // Convert to output format
    std::vector<std::vector<std::array<float, 4>>> poses_3d;
    poses_3d.reserve(final_poses_3d.size());
    for (size_t i = 0; i < final_poses_3d.size(); ++i)
    {
        const size_t num_joints = final_poses_3d[i].size();
        std::vector<std::array<float, 4>> pose;
        pose.reserve(num_joints);
        size_t num_valid = 0;

        for (size_t j = 0; j < num_joints; ++j)
        {
            std::array<float, 4> point;
            for (size_t k = 0; k < 4; ++k)
            {
                point[k] = final_poses_3d[i][j][k];
            }
            pose.push_back(point);

            if (point[3] > min_match_score)
            {
                num_valid++;
            }
        }

        if (num_valid > 0)
        {
            poses_3d.push_back(std::move(pose));
        }
    }

    elapsed = std::chrono::steady_clock::now() - stime;
    convert_time += elapsed.count();

    elapsed = std::chrono::steady_clock::now() - ttime;
    total_time += elapsed.count();
    num_calls++;

    return poses_3d;
}

// =================================================================================================

void TriangulatorInternal::reset()
{
    last_poses_3d.clear();
}

// =================================================================================================

void TriangulatorInternal::print_stats()
{
    std::cout << "{" << std::endl;
    std::cout << "  \"triangulator_calls\": " << num_calls << "," << std::endl;
    std::cout << "  \"init_time\": " << init_time / num_calls << "," << std::endl;
    std::cout << "  \"undistort_time\": " << undistort_time / num_calls << "," << std::endl;
    std::cout << "  \"project_time\": " << project_time / num_calls << "," << std::endl;
    std::cout << "  \"match_time\": " << match_time / num_calls << "," << std::endl;
    std::cout << "  \"pairs_time\": " << pairs_time / num_calls << "," << std::endl;
    std::cout << "  \"pair_scoring_time\": " << pair_scoring_time / num_calls << "," << std::endl;
    std::cout << "  \"grouping_time\": " << grouping_time / num_calls << "," << std::endl;
    std::cout << "  \"full_time\": " << full_time / num_calls << "," << std::endl;
    std::cout << "  \"merge_time\": " << merge_time / num_calls << "," << std::endl;
    std::cout << "  \"post_time\": " << post_time / num_calls << "," << std::endl;
    std::cout << "  \"convert_time\": " << convert_time / num_calls << "," << std::endl;
    std::cout << "  \"total_time\": " << total_time / num_calls << std::endl;
    std::cout << "}" << std::endl;
}

// =================================================================================================

void TriangulatorInternal::undistort_poses(
    std::vector<std::vector<std::array<float, 3>>> &poses_2d, CameraInternal &icam)
{
    float ifx_old = 1.0 / icam.cam.K[0][0];
    float ify_old = 1.0 / icam.cam.K[1][1];
    float cx_old = icam.cam.K[0][2];
    float cy_old = icam.cam.K[1][2];
    float fx_new = icam.newK[0][0];
    float fy_new = icam.newK[1][1];
    float cx_new = icam.newK[0][2];
    float cy_new = icam.newK[1][2];

    // Undistort all the points
    size_t num_persons = poses_2d.size();
    size_t num_joints = poses_2d[0].size();
    for (size_t i = 0; i < num_persons; ++i)
    {
        for (size_t j = 0; j < num_joints; ++j)
        {
            // Normalize the point using the original camera matrix
            poses_2d[i][j][0] = (poses_2d[i][j][0] - cx_old) * ifx_old;
            poses_2d[i][j][1] = (poses_2d[i][j][1] - cy_old) * ify_old;

            // Undistort
            // Using own implementation is faster than using OpenCV, because it avoids the
            // overhead of creating cv::Mat objects and further unnecessary calculations for
            // additional distortion parameters and identity rotations in this usecase.
            if (icam.cam.type == "fisheye")
            {
                CameraInternal::undistort_point_fisheye(poses_2d[i][j], icam.cam.DC);
            }
            else
            {
                CameraInternal::undistort_point_pinhole(poses_2d[i][j], icam.cam.DC);
            }

            // Map the undistorted normalized point to the new image coordinates
            poses_2d[i][j][0] = (poses_2d[i][j][0] * fx_new) + cx_new;
            poses_2d[i][j][1] = (poses_2d[i][j][1] * fy_new) + cy_new;
        }
    }

    // Mask out points that are far outside the image (points slightly outside are still valid)
    int width = icam.cam.width;
    int height = icam.cam.height;
    float mask_offset = (width + height) / 20.0;
    for (size_t i = 0; i < num_persons; ++i)
    {
        for (size_t j = 0; j < num_joints; ++j)
        {
            if (poses_2d[i][j][0] < -mask_offset || poses_2d[i][j][0] >= width + mask_offset ||
                poses_2d[i][j][1] < -mask_offset || poses_2d[i][j][1] >= height + mask_offset)
            {
                poses_2d[i][j][0] = 0.0;
                poses_2d[i][j][1] = 0.0;
                poses_2d[i][j][2] = 0.0;
            }
        }
    }
}

// =================================================================================================

std::tuple<std::vector<std::vector<std::array<float, 3>>>, std::vector<std::vector<float>>>
TriangulatorInternal::project_poses(
    const std::vector<std::vector<std::array<float, 4>>> &poses_3d,
    const CameraInternal &icam,
    bool calc_dists)
{
    const size_t num_persons = poses_3d.size();
    const size_t num_joints = poses_3d[0].size();

    // Prepare output vectors
    std::vector<std::vector<std::array<float, 3>>> poses_2d;
    std::vector<std::vector<float>> all_dists;
    poses_2d.resize(num_persons);
    all_dists.resize(num_persons);

    // Get camera parameters
    const std::array<std::array<float, 3>, 3> &K = icam.newK;
    const std::array<std::array<float, 3>, 3> &R = icam.invR;
    const std::array<std::array<float, 1>, 3> &T = icam.cam.T;

    for (size_t i = 0; i < num_persons; ++i)
    {
        const std::vector<std::array<float, 4>> &body3D = poses_3d[i];
        std::vector<std::array<float, 3>> body2D(num_joints, std::array<float, 3>{0.0, 0.0, 0.0});
        std::vector<float> dists(num_joints, 0.0);

        for (size_t j = 0; j < num_joints; ++j)
        {
            float x = body3D[j][0];
            float y = body3D[j][1];
            float z = body3D[j][2];
            float score = body3D[j][3];

            // Project from world to camera coordinate system
            float xt = x - T[0][0];
            float yt = y - T[1][0];
            float zt = z - T[2][0];
            float x_cam = xt * R[0][0] + yt * R[0][1] + zt * R[0][2];
            float y_cam = xt * R[1][0] + yt * R[1][1] + zt * R[1][2];
            float z_cam = xt * R[2][0] + yt * R[2][1] + zt * R[2][2];

            // Set points behind the camera to zero
            if (z_cam <= 0.0)
            {
                x_cam = y_cam = z_cam = 0.0;
            }

            // Calculate distance from camera center if required
            if (calc_dists)
                dists[j] = std::sqrt(x_cam * x_cam + y_cam * y_cam + z_cam * z_cam);
            else
                dists[j] = 0.0;

            // Project points to image plane
            // Since images are already undistorted, use the pinhole projection for all camera types
            float u = 0.0, v = 0.0;
            if (z_cam > 0.0)
            {
                u = (K[0][0] * x_cam + K[0][1] * y_cam + K[0][2] * z_cam) / z_cam;
                v = (K[1][0] * x_cam + K[1][1] * y_cam + K[1][2] * z_cam) / z_cam;
            }

            // Filter invalid projections
            if (u < 0.0 || u >= icam.cam.width || v < 0.0 || v >= icam.cam.height)
            {
                u = 0.0;
                v = 0.0;
                score = 0.0;
            }

            body2D[j] = {u, v, score};
        }

        // Store results
        poses_2d[i] = std::move(body2D);
        all_dists[i] = std::move(dists);
    }

    return std::make_tuple(poses_2d, all_dists);
}

// =================================================================================================

float TriangulatorInternal::calc_pose_score(
    const std::vector<std::array<float, 3>> &pose1,
    const std::vector<std::array<float, 3>> &pose2,
    const std::vector<float> &dist1,
    const CameraInternal &icam)
{
    const float min_score = 0.1;
    const size_t num_joints = pose1.size();

    // Create mask for valid points
    std::vector<bool> mask;
    mask.resize(num_joints);
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (pose1[i][2] <= min_score || pose2[i][2] <= min_score)
        {
            mask[i] = false;
        }
        else
        {
            mask[i] = true;
        }
    }

    // Drop if not enough valid points
    int valid_count = 0;
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (mask[i])
        {
            valid_count++;
        }
    }
    if (valid_count < 3)
    {
        return 0.0;
    }

    // Calculate scores
    float iscale = (icam.cam.width + icam.cam.height) / 2;
    std::vector<float> scores = score_projection(pose1, pose2, dist1, mask, iscale);

    // Drop lowest scores
    size_t drop_k = static_cast<size_t>(num_joints * 0.2);
    const size_t min_k = 3;
    std::vector<float> scores_vec;
    scores_vec.reserve(valid_count);
    for (size_t i = 0; i < scores.size(); ++i)
    {
        if (mask[i] > 0)
        {
            scores_vec.push_back(scores[i]);
        }
    }
    size_t scores_size = scores_vec.size();
    if (scores_size >= min_k)
    {
        drop_k = std::min(drop_k, scores_size - min_k);
        std::partial_sort(scores_vec.begin(), scores_vec.begin() + drop_k, scores_vec.end());
        scores_vec.erase(scores_vec.begin(), scores_vec.begin() + drop_k);
    }

    // Calculate final score
    float score = 0.0;
    size_t n_items = scores_vec.size();
    if (n_items > 0)
    {
        float sum_scores = std::accumulate(scores_vec.begin(), scores_vec.end(), 0.0);
        score = sum_scores / static_cast<float>(n_items);
    }

    return score;
}

// =================================================================================================

std::vector<float> TriangulatorInternal::score_projection(
    const std::vector<std::array<float, 3>> &pose,
    const std::vector<std::array<float, 3>> &repro,
    const std::vector<float> &dists,
    const std::vector<bool> &mask,
    float iscale)
{
    const float min_score = 0.1;
    const size_t num_joints = pose.size();

    // Calculate error
    std::vector<float> error(num_joints, 0.0);
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (mask[i])
        {
            float dx = pose[i][0] - repro[i][0];
            float dy = pose[i][1] - repro[i][1];
            float err = std::sqrt(dx * dx + dy * dy);

            // Set errors of invisible reprojections to a high value
            if (repro[i][2] < min_score)
            {
                err = iscale;
            }
            error[i] = err;
        }
    }

    // Scale error by image size
    const float inv_iscale = 1.0 / iscale;
    const float iscale_quarter = iscale / 4.0;
    for (size_t i = 0; i < num_joints; ++i)
    {
        float err = error[i];
        err = std::max(0.0f, std::min(err, iscale_quarter)) * inv_iscale;
        error[i] = err;
    }

    // Scale error by distance to camera
    float mean_dist = 0.0;
    int count = 0;
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (mask[i])
        {
            mean_dist += dists[i];
            ++count;
        }
    }
    if (count > 0)
    {
        mean_dist /= count;
    }
    const float dscale = std::sqrt(mean_dist / 3.5);
    for (size_t i = 0; i < num_joints; ++i)
    {
        error[i] *= dscale;
    }

    // Convert error to score
    std::vector<float> score(num_joints, 0.0);
    for (size_t i = 0; i < num_joints; ++i)
    {
        score[i] = 1.0 / (1.0 + error[i] * 10.0);
    }

    return score;
}

// =================================================================================================

float dot(const std::array<float, 3> &a, const std::array<float, 3> &b)
{
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}
std::array<float, 3> cross(const std::array<float, 3> &a, const std::array<float, 3> &b)
{
    return {a[1] * b[2] - a[2] * b[1],
            a[2] * b[0] - a[0] * b[2],
            a[0] * b[1] - a[1] * b[0]};
}
std::array<float, 3> add(const std::array<float, 3> &a, const std::array<float, 3> &b)
{
    return {a[0] + b[0], a[1] + b[1], a[2] + b[2]};
}
std::array<float, 3> subtract(const std::array<float, 3> &a, const std::array<float, 3> &b)
{
    return {a[0] - b[0], a[1] - b[1], a[2] - b[2]};
}
std::array<float, 3> multiply(const std::array<float, 3> &a, float s)
{
    return {a[0] * s, a[1] * s, a[2] * s};
}
std::array<float, 3> normalize(const std::array<float, 3> &v)
{
    float norm = std::sqrt(dot(v, v));
    if (norm < 1e-8f)
        return v;
    return multiply(v, 1.0f / norm);
}
std::array<float, 3> mat_mul_vec(
    const std::array<std::array<float, 3>, 3> &M, const std::array<float, 3> &v)
{
    std::array<float, 3> res = {M[0][0] * v[0] + M[0][1] * v[1] + M[0][2] * v[2],
                                M[1][0] * v[0] + M[1][1] * v[1] + M[1][2] * v[2],
                                M[2][0] * v[0] + M[2][1] * v[1] + M[2][2] * v[2]};
    return res;
}

std::array<float, 3> calc_ray_dir(const CameraInternal &icam, const std::array<float, 2> &pt)
{
    // Compute normalized ray direction from the point
    std::array<float, 3> uv1 = {pt[0], pt[1], 1.0};
    auto d = mat_mul_vec(icam.cam.R, mat_mul_vec(icam.invK, uv1));
    auto ray_dir = normalize(d);

    return ray_dir;
}

std::array<float, 3> triangulate_midpoint(
    const CameraInternal &icam1,
    const CameraInternal &icam2,
    const std::array<float, 2> &pt1,
    const std::array<float, 2> &pt2)
{
    // Triangulate two points by computing their two rays and the midpoint of their closest approach
    // See: https://en.wikipedia.org/wiki/Skew_lines#Nearest_points

    // Obtain the camera centers and ray directions for both views
    std::array<float, 3> p1 = icam1.center;
    std::array<float, 3> p2 = icam2.center;
    std::array<float, 3> d1 = calc_ray_dir(icam1, pt1);
    std::array<float, 3> d2 = calc_ray_dir(icam2, pt2);

    // Compute the perpendicular plane vectors
    std::array<float, 3> n = cross(d1, d2);
    std::array<float, 3> n1 = cross(d1, n);
    std::array<float, 3> n2 = cross(d2, n);

    // Calculate point on Line 1 nearest to Line 2
    float t1 = dot(subtract(p2, p1), n2) / dot(d1, n2);
    std::array<float, 3> c1 = add(p1, multiply(d1, t1));

    // Calculate point on Line 2 nearest to Line 1
    float t2 = dot(subtract(p1, p2), n1) / dot(d2, n1);
    std::array<float, 3> c2 = add(p2, multiply(d2, t2));

    // Compute midpoint between c1 and c2.
    std::array<float, 3> midpoint = multiply(add(c1, c2), 0.5);
    return midpoint;
}

// =================================================================================================

std::pair<std::vector<std::array<float, 4>>, float> TriangulatorInternal::triangulate_and_score(
    const std::vector<std::array<float, 3>> &pose1,
    const std::vector<std::array<float, 3>> &pose2,
    const CameraInternal &cam1,
    const CameraInternal &cam2,
    const std::array<std::array<float, 3>, 2> &roomparams,
    const std::vector<std::array<size_t, 2>> &core_limbs_idx)
{
    const float min_score = 0.1;
    const size_t num_joints = pose1.size();

    // Create mask for valid points
    std::vector<bool> mask;
    mask.resize(num_joints);
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (pose1[i][2] <= min_score || pose2[i][2] <= min_score)
        {
            mask[i] = false;
        }
        else
        {
            mask[i] = true;
        }
    }

    // If too few joints are visible, return a low score
    int num_visible = 0;
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (mask[i])
        {
            num_visible++;
        }
    }
    if (num_visible < 3)
    {
        std::vector<std::array<float, 4>> empty(num_joints, {0.0, 0.0, 0.0, 0.0});
        float score = 0.0;
        return std::make_pair(empty, score);
    }

    // Use midpoint triangulation instead of cv::triangulatePoints because it is much faster,
    // while having almost the same accuracy.
    std::vector<std::array<float, 4>> pose3d(num_joints, {0.0, 0.0, 0.0, 0.0});
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (mask[i])
        {
            auto &pt1 = pose1[i];
            auto &pt2 = pose2[i];
            std::array<float, 3> pt3d = triangulate_midpoint(
                cam1, cam2, {pt1[0], pt1[1]}, {pt2[0], pt2[1]});
            pose3d[i] = {pt3d[0], pt3d[1], pt3d[2], 1.0};
        }
    }

    // Check if mean of the points is inside the room bounds
    std::array<float, 3> mean = {0.0, 0.0, 0.0};
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (mask[i])
        {
            mean[0] += pose3d[i][0];
            mean[1] += pose3d[i][1];
            mean[2] += pose3d[i][2];
        }
    }
    float inv_num_vis = 1.0 / num_visible;
    for (int j = 0; j < 3; ++j)
    {
        mean[j] *= inv_num_vis;
    }
    const std::array<float, 3> &room_size = roomparams[0];
    const std::array<float, 3> &room_center = roomparams[1];
    for (int j = 0; j < 3; ++j)
    {
        if (mean[j] > room_center[j] + room_size[j] / 2.0 ||
            mean[j] < room_center[j] - room_size[j] / 2.0)
        {
            // Very low score if outside room
            for (size_t i = 0; i < num_joints; ++i)
            {
                if (mask[i])
                {
                    pose3d[i][3] = 0.001;
                }
            }
            return std::make_pair(pose3d, 0.001);
        }
    }

    // Calculate reprojections
    std::vector<std::vector<std::array<float, 4>>> poses_3d = {pose3d};
    auto [repro1s, dists1s] = project_poses(poses_3d, cam1, true);
    auto [repro2s, dists2s] = project_poses(poses_3d, cam2, true);
    auto repro1 = repro1s[0];
    auto dists1 = dists1s[0];
    auto repro2 = repro2s[0];
    auto dists2 = dists2s[0];

    // Calculate scores for each view
    float iscale = (cam1.cam.width + cam1.cam.height) / 2.0;
    std::vector<float> score1 = score_projection(pose1, repro1, dists1, mask, iscale);
    std::vector<float> score2 = score_projection(pose2, repro2, dists2, mask, iscale);

    // Add scores to 3D pose
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (mask[i])
        {
            float scoreT = 0.5 * (score1[i] + score2[i]);
            pose3d[i][3] = scoreT;
        }
    }

    // Set scores outside the room to a low value
    const float wdist = 0.1;
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (mask[i])
        {
            for (int j = 0; j < 3; ++j)
            {
                if (pose3d[i][j] > room_center[j] + room_size[j] / 2.0 + wdist ||
                    pose3d[i][j] < room_center[j] - room_size[j] / 2.0 - wdist)
                {
                    pose3d[i][3] = 0.001;
                    break;
                }
            }
        }
    }

    // Filter clearly wrong limbs
    if (!core_limbs_idx.empty())
    {
        const float max_length_sq = 0.9 * 0.9;
        for (size_t i = 0; i < core_limbs_idx.size(); ++i)
        {
            size_t limb1 = core_limbs_idx[i][0];
            size_t limb2 = core_limbs_idx[i][1];

            if (pose3d[limb1][3] > min_score && pose3d[limb2][3] > min_score)
            {
                float dx = pose3d[limb1][0] - pose3d[limb2][0];
                float dy = pose3d[limb1][1] - pose3d[limb2][1];
                float dz = pose3d[limb1][2] - pose3d[limb2][2];
                float length_sq = dx * dx + dy * dy + dz * dz;

                if (length_sq > max_length_sq)
                {
                    pose3d[limb2][3] = 0.001;
                }
            }
        }
    }

    // Drop lowest scores
    size_t drop_k = static_cast<size_t>(num_joints * 0.2);
    const size_t min_k = 3;
    std::vector<float> valid_scores;
    for (size_t i = 0; i < num_joints; ++i)
    {
        if (pose3d[i][3] > min_score)
        {
            valid_scores.push_back(pose3d[i][3]);
        }
    }
    size_t scores_size = valid_scores.size();
    if (scores_size >= min_k)
    {
        drop_k = std::min(drop_k, scores_size - min_k);
        std::partial_sort(valid_scores.begin(), valid_scores.begin() + drop_k, valid_scores.end());
        valid_scores.erase(valid_scores.begin(), valid_scores.begin() + drop_k);
    }

    // Calculate final score
    float final_score = 0.0;
    if (!valid_scores.empty())
    {
        float sum_scores = std::accumulate(valid_scores.begin(), valid_scores.end(), 0.0);
        final_score = sum_scores / static_cast<float>(valid_scores.size());
    }

    return std::make_pair(pose3d, final_score);
}

// =================================================================================================

std::vector<std::tuple<std::array<float, 3>, std::vector<std::array<float, 4>>, std::vector<int>>>
TriangulatorInternal::calc_grouping(
    const std::vector<std::pair<std::tuple<int, int, int, int>, std::pair<int, int>>> &all_pairs,
    const std::vector<std::pair<std::vector<std::array<float, 4>>, float>> &all_scored_poses,
    float min_score)
{
    float max_center_dist_sq = 0.75 * 0.75;
    float max_joint_avg_dist = 0.25;
    size_t num_pairs = all_pairs.size();

    // Calculate pose centers
    std::vector<std::array<float, 3>> centers;
    centers.resize(num_pairs);
    for (size_t i = 0; i < num_pairs; ++i)
    {
        const auto &pose_3d = all_scored_poses[i].first;
        const size_t num_joints = pose_3d.size();

        std::array<float, 3> center = {0.0, 0.0, 0.0};
        size_t num_valid = 0;
        for (size_t j = 0; j < num_joints; ++j)
        {
            float score = pose_3d[j][3];
            if (score > min_score)
            {
                center[0] += pose_3d[j][0];
                center[1] += pose_3d[j][1];
                center[2] += pose_3d[j][2];
                ++num_valid;
            }
        }
        if (num_valid > 0)
        {
            float inv = 1.0 / num_valid;
            center[0] *= inv;
            center[1] *= inv;
            center[2] *= inv;
        }
        centers[i] = center;
    }

    // Calculate Groups
    // defined as a tuple of center, pose, and all-pairs-indices of members
    std::vector<std::tuple<
        std::array<float, 3>, std::vector<std::array<float, 4>>, std::vector<int>>>
        groups;
    std::vector<std::vector<size_t>> per_group_visible_counts;
    for (size_t i = 0; i < num_pairs; ++i)
    {
        const auto &pose_3d = all_scored_poses[i].first;
        size_t num_joints = pose_3d.size();

        const std::array<float, 3> &center = centers[i];
        float best_dist = std::numeric_limits<float>::infinity();
        int best_group = -1;

        for (size_t j = 0; j < groups.size(); ++j)
        {
            auto &group = groups[j];
            std::array<float, 3> &group_center = std::get<0>(group);

            // Check if the center is close enough
            float dx = group_center[0] - center[0];
            float dy = group_center[1] - center[1];
            float dz = group_center[2] - center[2];
            float center_dist_sq = dx * dx + dy * dy + dz * dz;

            if (center_dist_sq < max_center_dist_sq)
            {
                const auto &group_pose = std::get<1>(group);

                // Calculate average joint distance
                std::vector<float> dists;
                for (size_t row = 0; row < num_joints; ++row)
                {
                    float score1 = pose_3d[row][3];
                    float score2 = group_pose[row][3];
                    if (score1 > min_score && score2 > min_score)
                    {
                        float dx = pose_3d[row][0] - group_pose[row][0];
                        float dy = pose_3d[row][1] - group_pose[row][1];
                        float dz = pose_3d[row][2] - group_pose[row][2];
                        float dist = std::sqrt(dx * dx + dy * dy + dz * dz);
                        dists.push_back(dist);
                    }
                }
                if (dists.size() >= 5)
                {
                    // Drop highest value to reduce influence of outliers
                    auto max_it = std::max_element(dists.begin(), dists.end());
                    dists.erase(max_it);
                }
                if (dists.size() > 0)
                {
                    // Check if the average joint distance is close enough
                    float avg_dist = std::accumulate(dists.begin(), dists.end(), 0.0);
                    avg_dist /= static_cast<float>(dists.size());
                    if (avg_dist < max_joint_avg_dist && avg_dist < best_dist)
                    {
                        best_dist = avg_dist;
                        best_group = static_cast<int>(j);
                    }
                }
            }
        }

        if (best_group == -1)
        {
            // Create a new group
            std::vector<int> new_indices{static_cast<int>(i)};
            std::vector<std::array<float, 4>> group_pose = pose_3d;
            groups.emplace_back(center, group_pose, new_indices);

            // Update per joint counts
            std::vector<size_t> new_valid_joint_counts(num_joints, 0);
            for (size_t row = 0; row < num_joints; ++row)
            {
                if (pose_3d[row][3] > min_score)
                {
                    new_valid_joint_counts[row] = 1;
                }
            }
            per_group_visible_counts.push_back(std::move(new_valid_joint_counts));
        }
        else
        {
            // Update existing group
            auto &group = groups[best_group];
            std::array<float, 3> &group_center = std::get<0>(group);
            std::vector<std::array<float, 4>> &group_pose = std::get<1>(group);
            std::vector<int> &group_indices = std::get<2>(group);

            float n_elems = static_cast<float>(group_indices.size());
            float inv_n = 1.0 / (n_elems + 1.0);

            // Update group center
            group_center[0] = (group_center[0] * n_elems + center[0]) * inv_n;
            group_center[1] = (group_center[1] * n_elems + center[1]) * inv_n;
            group_center[2] = (group_center[2] * n_elems + center[2]) * inv_n;

            // Update group pose
            for (size_t row = 0; row < num_joints; ++row)
            {
                if (pose_3d[row][3] > min_score)
                {
                    float j_elems = static_cast<float>(per_group_visible_counts[best_group][row]);
                    float inv_j = 1.0 / (j_elems + 1.0);

                    group_pose[row][0] = (group_pose[row][0] * j_elems + pose_3d[row][0]) * inv_j;
                    group_pose[row][1] = (group_pose[row][1] * j_elems + pose_3d[row][1]) * inv_j;
                    group_pose[row][2] = (group_pose[row][2] * j_elems + pose_3d[row][2]) * inv_j;
                    group_pose[row][3] = (group_pose[row][3] * j_elems + pose_3d[row][3]) * inv_j;
                    per_group_visible_counts[best_group][row]++;
                }
            }
            group_indices.push_back(static_cast<int>(i));
        }
    }

    // Merge close groups
    // Depending on the inital group creation, one or more groups can be created that in the end
    // share the same persons, even if they had a larger distance at the beginning
    // So merge them similar to the group assignment before
    std::vector<std::tuple<
        std::array<float, 3>, std::vector<std::array<float, 4>>, std::vector<int>>>
        merged_groups;
    for (size_t i = 0; i < groups.size(); ++i)
    {
        size_t num_joints = std::get<1>(groups[i]).size();
        auto &group = groups[i];
        auto &group_visible_counts = per_group_visible_counts[i];

        float best_dist = std::numeric_limits<float>::infinity();
        int best_group = -1;

        for (size_t j = 0; j < merged_groups.size(); ++j)
        {
            const auto &group_pose = std::get<1>(group);
            const auto &merged_pose = std::get<1>(merged_groups[j]);

            // Calculate average joint distance
            std::vector<float> dists;
            for (size_t row = 0; row < num_joints; ++row)
            {
                float score1 = group_pose[row][3];
                float score2 = merged_pose[row][3];
                if (score1 > min_score && score2 > min_score)
                {
                    float dx = group_pose[row][0] - merged_pose[row][0];
                    float dy = group_pose[row][1] - merged_pose[row][1];
                    float dz = group_pose[row][2] - merged_pose[row][2];
                    float dist = std::sqrt(dx * dx + dy * dy + dz * dz);
                    dists.push_back(dist);
                }
            }
            if (dists.size() >= 5)
            {
                // Drop highest value to reduce influence of outliers
                auto max_it = std::max_element(dists.begin(), dists.end());
                dists.erase(max_it);
            }
            if (dists.size() > 0)
            {
                // Check if the average joint distance is close enough
                float avg_dist = std::accumulate(dists.begin(), dists.end(), 0.0);
                avg_dist /= static_cast<float>(dists.size());
                if (avg_dist < max_joint_avg_dist && avg_dist < best_dist)
                {
                    best_dist = avg_dist;
                    best_group = static_cast<int>(j);
                }
            }
        }

        if (best_group == -1)
        {
            // Create a new group
            merged_groups.push_back(group);
        }
        else
        {
            // Update existing group
            auto &merged_group = merged_groups[best_group];
            std::array<float, 3> &merged_center = std::get<0>(merged_group);
            std::array<float, 3> &group_center = std::get<0>(group);
            std::vector<int> &merged_group_indices = std::get<2>(merged_group);

            float n_elems1 = static_cast<float>(merged_group_indices.size());
            float n_elems2 = static_cast<float>(std::get<2>(group).size());
            float inv1 = n_elems1 / (n_elems1 + n_elems2);
            float inv2 = n_elems2 / (n_elems1 + n_elems2);

            // Update group center
            merged_center[0] = merged_center[0] * inv1 + group_center[0] * inv2;
            merged_center[1] = merged_center[1] * inv1 + group_center[1] * inv2;
            merged_center[2] = merged_center[2] * inv1 + group_center[2] * inv2;

            // Update group pose
            for (size_t row = 0; row < num_joints; ++row)
            {
                auto &group_pose = std::get<1>(group);
                auto &merged_pose = std::get<1>(merged_group);

                if (group_pose[row][3] > min_score)
                {
                    float j_elems1 = static_cast<float>(group_visible_counts[row]);
                    float j_elems2 = static_cast<float>(per_group_visible_counts[best_group][row]);
                    float inv1 = j_elems1 / (j_elems1 + j_elems2);
                    float inv2 = j_elems2 / (j_elems1 + j_elems2);

                    merged_pose[row][0] = merged_pose[row][0] * inv1 + group_pose[row][0] * inv2;
                    merged_pose[row][1] = merged_pose[row][1] * inv1 + group_pose[row][1] * inv2;
                    merged_pose[row][2] = merged_pose[row][2] * inv1 + group_pose[row][2] * inv2;
                    merged_pose[row][3] = merged_pose[row][3] * inv1 + group_pose[row][3] * inv2;
                    group_visible_counts[row]++;
                }
            }

            // Merge indices
            merged_group_indices.insert(
                merged_group_indices.end(), std::get<2>(group).begin(), std::get<2>(group).end());
        }
    }

    return merged_groups;
}

// =================================================================================================

std::vector<std::array<float, 4>> TriangulatorInternal::merge_group(
    const std::vector<std::vector<std::array<float, 4>>> &poses_3d,
    float min_score)
{
    size_t num_poses = poses_3d.size();
    size_t num_joints = poses_3d[0].size();

    // Merge poses to create initial pose
    // Use only those triangulations with a high score
    std::vector<std::array<float, 4>> sum_poses(num_joints, {0.0, 0.0, 0.0, 0.0});
    std::vector<float> sum_mask1(num_joints, 0);
    for (size_t i = 0; i < num_poses; ++i)
    {
        const auto &pose = poses_3d[i];

        for (size_t j = 0; j < num_joints; ++j)
        {
            float score = pose[j][3];
            if (score > min_score)
            {
                float weight = std::pow(score, 2);
                sum_poses[j][0] += pose[j][0] * weight;
                sum_poses[j][1] += pose[j][1] * weight;
                sum_poses[j][2] += pose[j][2] * weight;
                sum_poses[j][3] += score * weight;
                sum_mask1[j] += weight;
            }
        }
    }
    std::vector<std::array<float, 4>> initial_pose_3d(num_joints, {0.0, 0.0, 0.0, 0.0});
    for (size_t j = 0; j < num_joints; ++j)
    {
        if (sum_mask1[j] > 0)
        {
            float inv = 1.0 / sum_mask1[j];
            initial_pose_3d[j][0] = sum_poses[j][0] * inv;
            initial_pose_3d[j][1] = sum_poses[j][1] * inv;
            initial_pose_3d[j][2] = sum_poses[j][2] * inv;
            initial_pose_3d[j][3] = sum_poses[j][3] * inv;
        }
    }

    // Use center as default if the initial pose is empty
    std::vector<bool> jmask(num_joints, false);
    std::array<float, 3> center = {0.0, 0.0, 0.0};
    int valid_joints = 0;
    for (size_t j = 0; j < num_joints; ++j)
    {
        float score = initial_pose_3d[j][3];
        if (score > min_score)
        {
            jmask[j] = true;
            center[0] += initial_pose_3d[j][0];
            center[1] += initial_pose_3d[j][1];
            center[2] += initial_pose_3d[j][2];
            valid_joints++;
        }
    }
    if (valid_joints > 0)
    {
        float inv_valid = 1.0 / valid_joints;
        center[0] *= inv_valid;
        center[1] *= inv_valid;
        center[2] *= inv_valid;
    }
    for (size_t j = 0; j < num_joints; ++j)
    {
        if (!jmask[j])
        {
            initial_pose_3d[j][0] = center[0];
            initial_pose_3d[j][1] = center[1];
            initial_pose_3d[j][2] = center[2];
            initial_pose_3d[j][3] = 0.0;
        }
    }

    // Drop joints with low scores and filter outlying joints using distance threshold
    float offset = 0.1;
    float max_dist_sq = 1.2 * 1.2;
    std::vector<std::vector<bool>> mask(num_poses, std::vector<bool>(num_joints, false));
    std::vector<std::vector<float>> distances(num_poses, std::vector<float>(num_joints, 0.0));
    for (size_t i = 0; i < num_poses; ++i)
    {
        const auto &pose = poses_3d[i];
        for (size_t j = 0; j < num_joints; ++j)
        {
            float dx = pose[j][0] - initial_pose_3d[j][0];
            float dy = pose[j][1] - initial_pose_3d[j][1];
            float dz = pose[j][2] - initial_pose_3d[j][2];
            float dist_sq = dx * dx + dy * dy + dz * dz;
            distances[i][j] = dist_sq;

            float score = pose[j][3];
            if (dist_sq <= max_dist_sq && score > (min_score - offset))
            {
                mask[i][j] = true;
            }
        }
    }

    // Select the best-k proposals for each joint that are closest to the initial pose
    int keep_best = std::max(3, (int)num_poses / 3);
    std::vector<std::vector<bool>> best_k_mask(num_poses, std::vector<bool>(num_joints, false));
    for (size_t j = 0; j < num_joints; ++j)
    {
        std::vector<std::pair<float, int>> valid_indices;
        valid_indices.reserve(num_poses);
        for (size_t i = 0; i < num_poses; ++i)
        {
            if (mask[i][j])
            {
                valid_indices.emplace_back(distances[i][j], i);
            }
        }
        int num_valid = std::min(keep_best, static_cast<int>(valid_indices.size()));
        if (num_valid > 0)
        {
            std::partial_sort(valid_indices.begin(), valid_indices.begin() + num_valid,
                              valid_indices.end());

            for (int k = 0; k < num_valid; ++k)
            {
                int idx = valid_indices[k].second;
                best_k_mask[idx][j] = true;
            }
        }
    }

    // Combine masks
    for (size_t i = 0; i < num_poses; ++i)
    {
        for (size_t j = 0; j < num_joints; ++j)
        {
            mask[i][j] = mask[i][j] && best_k_mask[i][j];
        }
    }

    // Compute the final pose
    std::vector<std::array<float, 4>> sum_poses2(num_joints, {0.0, 0.0, 0.0, 0.0});
    std::vector<float> sum_mask2(num_joints, 0);
    for (size_t i = 0; i < num_poses; ++i)
    {
        const auto &pose = poses_3d[i];

        for (size_t j = 0; j < num_joints; ++j)
        {
            if (mask[i][j])
            {
                // Use an exponential weighting to give more importance to higher scores
                float weight = std::pow(pose[j][3], 4);
                sum_poses2[j][0] += pose[j][0] * weight;
                sum_poses2[j][1] += pose[j][1] * weight;
                sum_poses2[j][2] += pose[j][2] * weight;
                sum_poses2[j][3] += pose[j][3] * weight;
                sum_mask2[j] += weight;
            }
        }
    }
    std::vector<std::array<float, 4>> final_pose_3d(num_joints, {0.0, 0.0, 0.0, 0.0});
    for (size_t j = 0; j < num_joints; ++j)
    {
        if (sum_mask2[j] > 0)
        {
            float inv = 1.0 / sum_mask2[j];
            final_pose_3d[j][0] = sum_poses2[j][0] * inv;
            final_pose_3d[j][1] = sum_poses2[j][1] * inv;
            final_pose_3d[j][2] = sum_poses2[j][2] * inv;
            final_pose_3d[j][3] = sum_poses2[j][3] * inv;
        }
    }

    return final_pose_3d;
}

// =================================================================================================

void TriangulatorInternal::add_extra_joints(
    std::vector<std::vector<std::array<float, 4>>> &poses,
    const std::vector<std::string> &joint_names)
{
    // Find indices of "head", "ear_left", and "ear_right"
    auto it_head = std::find(joint_names.begin(), joint_names.end(), "head");
    auto it_ear_left = std::find(joint_names.begin(), joint_names.end(), "ear_left");
    auto it_ear_right = std::find(joint_names.begin(), joint_names.end(), "ear_right");
    int idx_h = std::distance(joint_names.begin(), it_head);
    int idx_el = std::distance(joint_names.begin(), it_ear_left);
    int idx_er = std::distance(joint_names.begin(), it_ear_right);

    // If the confidence score of "head" is zero compute it as the average of the ears
    for (size_t i = 0; i < poses.size(); ++i)
    {
        auto &pose = poses[i];
        if (pose[idx_h][3] == 0.0)
        {
            if (pose[idx_el][3] > 0.0 && pose[idx_er][3] > 0.0)
            {
                pose[idx_h][0] = (pose[idx_el][0] + pose[idx_er][0]) * 0.5;
                pose[idx_h][1] = (pose[idx_el][1] + pose[idx_er][1]) * 0.5;
                pose[idx_h][2] = (pose[idx_el][2] + pose[idx_er][2]) * 0.5;
                pose[idx_h][3] = std::min(pose[idx_el][3], pose[idx_er][3]);
            }
        }
    }
}

// =================================================================================================

void TriangulatorInternal::filter_poses(
    std::vector<std::vector<std::array<float, 4>>> &poses,
    const std::array<std::array<float, 3>, 2> &roomparams,
    const std::vector<size_t> &core_joint_idx,
    const std::vector<std::array<size_t, 2>> &core_limbs_idx)
{
    const float min_score = 0.1;
    std::vector<int> drop_indices;

    for (size_t i = 0; i < poses.size(); ++i)
    {
        auto &pose = poses[i];
        size_t num_core_joints = core_joint_idx.size();
        size_t num_full_joints = pose.size();

        // Collect valid joint indices
        std::vector<size_t> valid_joint_idx;
        for (size_t j = 0; j < num_core_joints; ++j)
        {
            size_t idx = core_joint_idx[j];
            if (pose[idx][3] > min_score)
            {
                valid_joint_idx.push_back(idx);
            }
        }

        // Drop poses with too few joints
        if (valid_joint_idx.size() < 5)
        {
            drop_indices.push_back(i);
            continue;
        }

        // Compute min, max and mean coordinates
        std::array<float, 3> mean = {0.0, 0.0, 0.0};
        std::array<float, 3> mins = {
            std::numeric_limits<float>::max(),
            std::numeric_limits<float>::max(),
            std::numeric_limits<float>::max()};
        std::array<float, 3> maxs = {
            std::numeric_limits<float>::lowest(),
            std::numeric_limits<float>::lowest(),
            std::numeric_limits<float>::lowest()};
        for (size_t j = 0; j < valid_joint_idx.size(); ++j)
        {
            size_t idx = valid_joint_idx[j];
            for (size_t k = 0; k < 3; ++k)
            {
                mins[k] = std::min(mins[k], pose[idx][k]);
                maxs[k] = std::max(maxs[k], pose[idx][k]);
                mean[k] += pose[idx][k];
            }
        }
        for (size_t k = 0; k < 3; ++k)
        {
            mean[k] /= static_cast<float>(valid_joint_idx.size());
        }

        // Drop poses that are too large or too small or too flat
        float min_flatness = 0.2;
        float max_size = 2.3;
        float min_size = 0.3;
        std::array<float, 3> diff;
        for (int j = 0; j < 3; ++j)
        {
            diff[j] = maxs[j] - mins[j];
        }
        if (diff[0] > max_size || diff[1] > max_size || diff[2] > max_size)
        {
            drop_indices.push_back(i);
            continue;
        }
        if (diff[0] < min_size && diff[1] < min_size && diff[2] < min_size)
        {
            drop_indices.push_back(i);
            continue;
        }
        if ((diff[0] < min_flatness && diff[1] < min_flatness) ||
            (diff[1] < min_flatness && diff[2] < min_flatness) ||
            (diff[2] < min_flatness && diff[0] < min_flatness))
        {
            drop_indices.push_back(i);
            continue;
        }

        // Drop persons outside room
        const float wdist = 0.1;
        const auto &room_size = roomparams[0];
        const auto &room_center = roomparams[1];
        const std::array<float, 3> room_half_size = {
            room_size[0] / 2.0f,
            room_size[1] / 2.0f,
            room_size[2] / 2.0f};
        bool outside = false;
        for (int j = 0; j < 3; ++j)
        {
            // Check if the mean position is outside the room boundaries
            if (mean[j] > room_half_size[j] + room_center[j] ||
                mean[j] < -room_half_size[j] + room_center[j])
            {
                outside = true;
                break;
            }
        }
        for (int j = 0; j < 3; ++j)
        {
            // Check if any limb is too far outside the room
            if (maxs[j] > room_half_size[j] + room_center[j] + wdist ||
                mins[j] < -room_half_size[j] + room_center[j] - wdist)
            {
                outside = true;
                break;
            }
        }
        if (outside)
        {
            drop_indices.push_back(i);
            continue;
        }

        // Set joint scores outside the room to a low value
        for (size_t j = 0; j < num_full_joints; ++j)
        {
            if (pose[j][3] > min_score)
            {
                for (int k = 0; k < 3; ++k)
                {
                    if (pose[j][k] > room_half_size[k] + room_center[k] + wdist ||
                        pose[j][k] < -room_half_size[k] + room_center[k] - wdist)
                    {
                        pose[j][3] = 0.001;
                        break;
                    }
                }
            }
        }

        // Calculate total limb length and average limb length
        const float max_avg_length = 0.5;
        const float min_avg_length = 0.1;
        float total_length = 0.0;
        int total_limbs = 0;
        for (size_t j = 0; j < core_limbs_idx.size(); ++j)
        {
            size_t start_idx = core_joint_idx[core_limbs_idx[j][0]];
            size_t end_idx = core_joint_idx[core_limbs_idx[j][1]];

            if (pose[start_idx][3] < min_score || pose[end_idx][3] < min_score)
            {
                continue;
            }

            float dx = pose[end_idx][0] - pose[start_idx][0];
            float dy = pose[end_idx][1] - pose[start_idx][1];
            float dz = pose[end_idx][2] - pose[start_idx][2];
            float length = std::sqrt(dx * dx + dy * dy + dz * dz);

            total_length += length;
            total_limbs++;
        }
        if (total_limbs == 0)
        {
            drop_indices.push_back(i);
            continue;
        }
        float average_length = total_length / static_cast<float>(total_limbs);
        if (average_length < min_avg_length)
        {
            drop_indices.push_back(i);
            continue;
        }
        if (total_limbs >= 5 && average_length > max_avg_length)
        {
            drop_indices.push_back(i);
            continue;
        }
    }

    // Set confidences of invalid poses to a low value
    for (size_t i = 0; i < drop_indices.size(); ++i)
    {
        auto &pose = poses[drop_indices[i]];
        for (size_t j = 0; j < pose.size(); ++j)
        {
            pose[j][3] = 0.001;
        }
    }
}

// =================================================================================================

void TriangulatorInternal::add_missing_joints(
    std::vector<std::vector<std::array<float, 4>>> &poses,
    const std::vector<std::string> &joint_names)
{
    // Map joint names to their indices for quick lookup
    std::unordered_map<std::string, size_t> joint_name_to_idx;
    for (size_t idx = 0; idx < joint_names.size(); ++idx)
    {
        joint_name_to_idx[joint_names[idx]] = idx;
    }

    // Adjacent mapping
    std::unordered_map<std::string, std::vector<std::string>> adjacents = {
        {"hip_right", {"hip_middle", "hip_left"}},
        {"hip_left", {"hip_middle", "hip_right"}},
        {"knee_right", {"hip_right", "knee_left"}},
        {"knee_left", {"hip_left", "knee_right"}},
        {"ankle_right", {"knee_right", "ankle_left"}},
        {"ankle_left", {"knee_left", "ankle_right"}},
        {"shoulder_right", {"shoulder_middle", "shoulder_left"}},
        {"shoulder_left", {"shoulder_middle", "shoulder_right"}},
        {"elbow_right", {"shoulder_right", "hip_right"}},
        {"elbow_left", {"shoulder_left", "hip_left"}},
        {"wrist_right", {"elbow_right"}},
        {"wrist_left", {"elbow_left"}},
        {"nose", {"shoulder_middle", "shoulder_right", "shoulder_left"}},
        {"head", {"shoulder_middle", "shoulder_right", "shoulder_left"}},
        {"foot_*_left", {"ankle_left"}},
        {"foot_*_right", {"ankle_right"}},
        {"face_*", {"nose"}},
        {"hand_*_left_*", {"wrist_left"}},
        {"hand_*_right_*", {"wrist_right"}}};

    // Collect adjacent joint mappings
    std::vector<std::vector<size_t>> adjacent_indices;
    adjacent_indices.resize(joint_names.size());
    for (size_t j = 0; j < joint_names.size(); ++j)
    {
        std::string adname = "";
        const std::string &jname = joint_names[j];
        size_t nlen = jname.size();

        // Determine adjacent joint based on name patterns
        if (nlen >= 15 && jname.compare(0, 5, "hand_") == 0)
        {
            if (jname.find("_left") != std::string::npos)
            {
                adname = "hand_*_left_*";
            }
            else if (jname.find("_right") != std::string::npos)
            {
                adname = "hand_*_right_*";
            }
        }
        else if (nlen >= 14 && jname.compare(0, 5, "foot_") == 0)
        {
            if (jname.find("_left") != std::string::npos)
            {
                adname = "foot_*_left";
            }
            else if (jname.find("_right") != std::string::npos)
            {
                adname = "foot_*_right";
            }
        }
        else if (nlen >= 11 && jname.compare(0, 5, "face_") == 0)
        {
            adname = "face_*";
        }
        else if (adjacents.find(jname) != adjacents.end())
        {
            adname = jname;
        }

        if (!adname.empty())
        {
            auto it = adjacents.find(adname);
            if (it != adjacents.end())
            {
                for (const std::string &adj_name : it->second)
                {
                    auto jt = joint_name_to_idx.find(adj_name);
                    if (jt != joint_name_to_idx.end())
                    {
                        adjacent_indices[j].push_back(jt->second);
                    }
                }
            }
        }
    }

    for (size_t i = 0; i < poses.size(); ++i)
    {
        auto &pose = poses[i];
        size_t num_joints = pose.size();

        // Compute body center as the mean of valid joints
        std::array<float, 3> body_center = {0.0, 0.0, 0.0};
        size_t valid_count = 0;
        for (size_t j = 0; j < num_joints; ++j)
        {
            if (pose[j][3] > min_match_score)
            {
                body_center[0] += pose[j][0];
                body_center[1] += pose[j][1];
                body_center[2] += pose[j][2];
                valid_count++;
            }
        }
        if (valid_count == 0)
        {
            continue;
        }
        float inv_c = 1.0 / static_cast<float>(valid_count);
        for (int j = 0; j < 3; ++j)
        {
            body_center[j] *= inv_c;
        }

        // Iterate over each joint
        for (size_t j = 0; j < joint_names.size(); ++j)
        {
            if (pose[j][3] == 0.0)
            {
                // Joint is missing; attempt to estimate its position based on adjacent joints
                const std::vector<size_t> &adj_indices = adjacent_indices[j];
                std::array<float, 3> adjacent_position = {0.0, 0.0, 0.0};
                size_t adjacent_count = 0;

                for (size_t adj_idx : adj_indices)
                {
                    if (adj_idx < pose.size() && pose[adj_idx][3] > 0.1)
                    {
                        adjacent_position[0] += pose[adj_idx][0];
                        adjacent_position[1] += pose[adj_idx][1];
                        adjacent_position[2] += pose[adj_idx][2];
                        ++adjacent_count;
                    }
                }

                if (adjacent_count > 0)
                {
                    float inv_c = 1.0 / static_cast<float>(adjacent_count);
                    for (size_t k = 0; k < 3; ++k)
                    {
                        adjacent_position[k] *= inv_c;
                    }

                    // Update the missing joint's position
                    pose[j][0] = adjacent_position[0];
                    pose[j][1] = adjacent_position[1];
                    pose[j][2] = adjacent_position[2];
                }
                else
                {
                    // No valid adjacent joints, use body center
                    pose[j][0] = body_center[0];
                    pose[j][1] = body_center[1];
                    pose[j][2] = body_center[2];
                }

                // Set a low confidence score
                pose[j][3] = 0.1;
            }
        }
    }
}