# RapidPoseTriangulation/src/rpt/_helpers.py

from __future__ import annotations

from collections.abc import Mapping, Sequence
from typing import Any, TypeAlias

import numpy as np
import numpy.typing as npt

from ._core import Camera

# A camera supplied either as an already-bound ``Camera`` object or as a
# mapping of camera fields. ``convert_cameras`` reads the keys "name", "K",
# "DC", "R", "T", "width", "height" and the optional "type" from the mapping form.
CameraLike: TypeAlias = Camera | Mapping[str, Any]

# Pose detections of a single view as accepted by ``pack_poses_2d``: either an
# array or nested sequences laid out as [persons, joints, 3] or, for a single
# person, as [joints, 3].
PoseViewLike: TypeAlias = (
    npt.NDArray[np.generic]
    | Sequence[Sequence[Sequence[float]]]
    | Sequence[Sequence[float]]
)


def convert_cameras(cameras: Sequence[CameraLike]) -> list[Camera]:
    """Return a list of ``Camera`` objects built from mixed camera descriptions.

    Entries that already are ``Camera`` instances are passed through as the
    same object. Every other entry is treated as a mapping and copied field by
    field into a freshly constructed ``Camera``.

    Args:
        cameras: Camera objects and/or mappings. A mapping must provide the
            keys ``"name"``, ``"K"``, ``"DC"``, ``"R"``, ``"T"``, ``"width"``
            and ``"height"``; ``"type"`` is optional and defaults to
            ``"pinhole"``.

    Returns:
        A new list with one ``Camera`` per input entry, in input order.

    Raises:
        Required keys are read with ``[]``, so a missing key propagates the
        mapping's own lookup error (``KeyError`` for a ``dict``).
    """

    result: list[Camera] = []
    for entry in cameras:
        if not isinstance(entry, Camera):
            built = Camera()
            # name/width/height/type are coerced to plain Python types; the
            # calibration fields are handed to the binding untouched.
            built.name = str(entry["name"])
            for field in ("K", "DC", "R", "T"):
                setattr(built, field, entry[field])
            built.width = int(entry["width"])
            built.height = int(entry["height"])
            built.type = str(entry.get("type", "pinhole"))
            result.append(built)
        else:
            result.append(entry)
    return result


def pack_poses_2d(
    views: Sequence[PoseViewLike], *, joint_count: int | None = None
) -> tuple[npt.NDArray[np.float32], npt.NDArray[np.uint32]]:
    """Pack ragged per-view pose detections into the padded tensor expected by the core API.

    Each view is converted to a ``float32`` array. A two-dimensional view is
    interpreted as a single person (``[joints, 3]``) and gets a leading person
    axis. A view whose array contains no elements is treated as having zero
    persons, whatever its nominal shape.

    Args:
        views: One entry per view, each shaped ``[persons, joints, 3]`` or
            ``[joints, 3]``, or empty.
        joint_count: Number of joints per person. When omitted it is taken
            from the first non-empty view; it must be given explicitly when
            every view is empty (or when there are no views).

    Returns:
        ``(packed, counts)`` where ``packed`` is a zero-initialised ``float32``
        array of shape ``[views, max_persons, joints, 3]`` holding each view's
        persons in its leading rows, and ``counts`` is a ``uint32`` array of
        shape ``[views]`` with the number of persons actually present per view.

    Raises:
        ValueError: If a non-empty view has an unsupported shape, if views
            disagree on the joint count (or with ``joint_count``), or if the
            joint count can be neither inferred nor read from ``joint_count``.
    """

    # ``None`` stands for a view without detections; nothing has to be copied
    # for it because ``packed`` and ``counts`` start out zeroed.
    per_view: list[npt.NDArray[np.float32] | None] = []
    expected_joints = joint_count
    max_persons = 0

    for view in views:
        array = np.asarray(view, dtype=np.float32)

        if array.size == 0:
            per_view.append(None)
            continue

        if array.ndim == 2:
            if array.shape[-1] != 3:
                raise ValueError("Single-person pose inputs must have shape [joints, 3].")
            # Promote [joints, 3] to [1, joints, 3].
            array = array[np.newaxis, :, :]
        elif array.ndim != 3 or array.shape[-1] != 3:
            raise ValueError("Each view must have shape [persons, joints, 3] or [joints, 3].")

        if expected_joints is None:
            expected_joints = int(array.shape[1])
        elif array.shape[1] != expected_joints:
            raise ValueError("All views must use the same joint count.")

        max_persons = max(max_persons, int(array.shape[0]))
        # No contiguous copy is needed here: the data is copied exactly once,
        # into ``packed`` below.
        per_view.append(array)

    if expected_joints is None:
        raise ValueError("joint_count is required when all views are empty.")

    packed = np.zeros((len(per_view), max_persons, expected_joints, 3), dtype=np.float32)
    counts = np.zeros((len(per_view),), dtype=np.uint32)

    for view_idx, array in enumerate(per_view):
        if array is None:
            continue
        person_count = int(array.shape[0])
        counts[view_idx] = person_count
        packed[view_idx, :person_count] = array

    return packed, counts