zed-playground/py_workspace/aruco/alignment.py

import numpy as np
from loguru import logger
from jaxtyping import Float
from typing import TYPE_CHECKING

# Type aliases for shape-aware annotations
if TYPE_CHECKING:
    Vec3 = Float[np.ndarray, "3"]
    Mat33 = Float[np.ndarray, "3 3"]
    Mat44 = Float[np.ndarray, "4 4"]
    CornersNC = Float[np.ndarray, "N 3"]
else:
    Vec3 = np.ndarray
    Mat33 = np.ndarray
    Mat44 = np.ndarray
    CornersNC = np.ndarray


def compute_face_normal(corners: CornersNC) -> Vec3:
    """
    Compute the normal vector of a face defined by its corners.
    Assumes corners are in order (e.g., clockwise or counter-clockwise).

    Args:
        corners: (N, 3) array of corner coordinates.

    Returns:
        (3,) normalized normal vector.
    """
    if corners.ndim != 2 or corners.shape[1] != 3:
        raise ValueError(f"Expected (N, 3) array, got {corners.shape}")
    if corners.shape[0] < 3:
        raise ValueError("At least 3 corners are required to compute a normal.")

    # Use the cross product of two edges
    # Stable edge definition for quad faces consistent with plan/repo convention:
    # normal from (corners[1]-corners[0]) x (corners[3]-corners[0]) when N>=4,
    # fallback to index 2 if N==3.
    v1 = corners[1] - corners[0]
    if corners.shape[0] >= 4:
        v2 = corners[3] - corners[0]
    else:
        v2 = corners[2] - corners[0]

    normal = np.cross(v1, v2)
    norm = np.linalg.norm(normal)

    if norm < 1e-10:
        raise ValueError("Corners are collinear or degenerate; cannot compute normal.")

    return (normal / norm).astype(np.float64)


def rotation_align_vectors(from_vec: Vec3, to_vec: Vec3) -> Mat33:
    """
    Compute the 3x3 rotation matrix that aligns from_vec to to_vec.

    Args:
        from_vec: (3,) source vector.
        to_vec: (3,) target vector.

    Returns:
        (3, 3) rotation matrix.
    """
    if from_vec.shape != (3,) or to_vec.shape != (3,):
        raise ValueError(
            f"Expected (3,) vectors, got {from_vec.shape} and {to_vec.shape}"
        )

    norm_from = np.linalg.norm(from_vec)
    norm_to = np.linalg.norm(to_vec)

    if norm_from < 1e-10 or norm_to < 1e-10:
        raise ValueError("Cannot align zero-norm vectors.")

    # Normalize inputs
    a = from_vec / norm_from
    b = to_vec / norm_to

    v = np.cross(a, b)
    c = np.dot(a, b)
    s = np.linalg.norm(v)

    # Handle parallel case
    if s < 1e-10:
        if c > 0:
            return np.eye(3, dtype=np.float64)
        else:
            # Anti-parallel case: 180 degree rotation around an orthogonal axis
            # Find an orthogonal axis
            if abs(a[0]) < 0.9:
                ortho = np.array([1.0, 0.0, 0.0])
            else:
                ortho = np.array([0.0, 1.0, 0.0])

            axis = np.cross(a, ortho)
            axis /= np.linalg.norm(axis)

            # Rodrigues formula for 180 degrees
            K = np.array(
                [
                    [0.0, -axis[2], axis[1]],
                    [axis[2], 0.0, -axis[0]],
                    [-axis[1], axis[0], 0.0],
                ]
            )
            return (np.eye(3) + 2 * (K @ K)).astype(np.float64)

    # General case using Rodrigues' rotation formula
    # R = I + [v]_x + [v]_x^2 * (1-c)/s^2
    vx = np.array([[0.0, -v[2], v[1]], [v[2], 0.0, -v[0]], [-v[1], v[0], 0.0]])

    R = np.eye(3) + vx + (vx @ vx) * ((1 - c) / (s**2))
    return R.astype(np.float64)


def apply_alignment_to_pose(T: Mat44, R_align: Mat33) -> Mat44:
    """
    Apply an alignment rotation to a 4x4 pose matrix.
    The alignment is applied in the global frame (pre-multiplication of rotation).

    Args:
        T: (4, 4) homogeneous transformation matrix.
        R_align: (3, 3) alignment rotation matrix.

    Returns:
        (4, 4) aligned transformation matrix.
    """
    if T.shape != (4, 4):
        raise ValueError(f"Expected 4x4 matrix, got {T.shape}")
    if R_align.shape != (3, 3):
        raise ValueError(f"Expected 3x3 matrix, got {R_align.shape}")

    T_align = np.eye(4, dtype=np.float64)
    T_align[:3, :3] = R_align

    return (T_align @ T).astype(np.float64)


def estimate_up_vector_from_cameras(camera_poses: list[Mat44]) -> Vec3:
    """
    Estimate the 'up' vector of the scene based on camera positions.
    Assumes cameras are arranged roughly in a horizontal ring (coplanar).
    The normal of the plane fitting the camera centers is used as the up vector.
    The sign is disambiguated using the average camera 'up' vector (-Y in OpenCV).

    Args:
        camera_poses: List of (4, 4) camera-to-world transformation matrices.

    Returns:
        (3,) normalized up vector.
    """
    if not camera_poses:
        raise ValueError("No camera poses provided.")

    # Extract camera centers (translations)
    centers = np.array([T[:3, 3] for T in camera_poses])

    # Calculate average camera 'up' vector (assuming OpenCV convention: Y is down, so up is -Y)
    # T[:3, 1] is the Y axis direction in world frame
    # We want the vector pointing UP in world coordinates.
    # In OpenCV camera frame, Y is down. So -Y is up.
    # The world-frame representation of the camera's -Y axis is -R[:, 1]
    # T[:3, 1] is the second column of the rotation matrix (Y axis).
    avg_cam_up = np.mean([-T[:3, 1] for T in camera_poses], axis=0)
    norm = np.linalg.norm(avg_cam_up)
    if norm > 1e-6:
        avg_cam_up /= norm
    else:
        avg_cam_up = np.array([0.0, 1.0, 0.0])  # Fallback

    # If fewer than 3 cameras, we can't reliably fit a plane.
    # Fallback to average camera up vector.
    if len(camera_poses) < 3:
        logger.debug("Fewer than 3 cameras; using average camera -Y as up vector.")
        return avg_cam_up

    # Fit plane to camera centers using SVD
    centroid = np.mean(centers, axis=0)
    centered = centers - centroid

    # Check if points are collinear or coincident (rank check)
    # If they are collinear, plane is undefined.
    if np.linalg.matrix_rank(centered) < 2:
        logger.debug(
            "Camera centers are collinear; using average camera -Y as up vector."
        )
        return avg_cam_up

    try:
        u, s, vh = np.linalg.svd(centered)
        # The normal is the singular vector corresponding to the smallest singular value
        normal = vh[2, :]
    except np.linalg.LinAlgError:
        logger.warning("SVD failed; using average camera -Y as up vector.")
        return avg_cam_up

    # Disambiguate sign: choose the normal that aligns best with average camera up
    if np.dot(normal, avg_cam_up) < 0:
        normal = -normal

    return normal


def get_face_normal_from_geometry(
    face_name: str,
    marker_geometry: dict[int, np.ndarray],
    face_marker_map: dict[str, list[int]] | None = None,
) -> Vec3 | None:
    """
    Compute the average normal vector for a face based on available marker geometry.

    Args:
        face_name: Name of the face (key in face_marker_map).
        marker_geometry: Dictionary mapping marker IDs to their (4, 3) corner coordinates.
        face_marker_map: Dictionary mapping face names to marker IDs.

    Returns:
        (3,) normalized average normal vector, or None if no markers are available.
    """
    if face_marker_map is None:
        return None

    if face_name not in face_marker_map:
        return None

    marker_ids = face_marker_map[face_name]
    normals = []

    for mid in marker_ids:
        if mid in marker_geometry:
            try:
                n = compute_face_normal(marker_geometry[mid])
                normals.append(n)
            except ValueError:
                continue

    if not normals:
        return None

    avg_normal = np.mean(normals, axis=0)
    norm = np.linalg.norm(avg_normal)

    if norm < 1e-10:
        return None

    return (avg_normal / norm).astype(np.float64)


def detect_ground_face(
    visible_marker_ids: set[int],
    marker_geometry: dict[int, np.ndarray],
    camera_up_vector: Vec3 = np.array([0, -1, 0]),
    face_marker_map: dict[str, list[int]] | None = None,
) -> tuple[str, Vec3] | None:
    """
    Detect which face of the object is most likely the ground face.
    The ground face is the one whose normal is most aligned with the camera's up vector.

    Args:
        visible_marker_ids: Set of marker IDs currently visible.
        marker_geometry: Dictionary mapping marker IDs to their (4, 3) corner coordinates.
        camera_up_vector: (3,) vector representing the 'up' direction in camera frame.
        face_marker_map: Dictionary mapping face names to marker IDs.

    Returns:
        Tuple of (best_face_name, best_face_normal) or None if no face is detected.
    """
    if not visible_marker_ids:
        return None

    if face_marker_map is None:
        return None

    if camera_up_vector.shape != (3,):
        raise ValueError(
            f"Expected (3,) camera_up_vector, got {camera_up_vector.shape}"
        )

    up_norm = np.linalg.norm(camera_up_vector)
    if up_norm < 1e-10:
        raise ValueError("camera_up_vector cannot be zero-norm.")

    up_vec = camera_up_vector / up_norm
    best_face = None
    best_normal = None
    best_score = -np.inf

    # Iterate faces in mapping
    for face_name, face_marker_ids in face_marker_map.items():
        # We check ALL faces for which we have geometry, regardless of visibility.
        # This allows detecting the ground face even if it's occluded,
        # provided we have geometry for it (e.g. from a loaded model or previous detections).
        # However, get_face_normal_from_geometry requires marker_geometry to contain the markers.
        # If marker_geometry only contains *visible* markers (which is typical if passed from detection),
        # then we are limited to visible faces.
        # But if marker_geometry is the full loaded geometry, we can check all faces.

        normal = get_face_normal_from_geometry(
            face_name, marker_geometry, face_marker_map=face_marker_map
        )
        if normal is None:
            continue

        # Score by dot(normal, normalized camera_up_vector)
        score = np.dot(normal, up_vec)
        logger.debug(f"Face '{face_name}' considered. Score: {score:.4f}")

        if score > best_score:
            best_score = score
            best_face = face_name
            best_normal = normal

    if best_face is not None and best_normal is not None:
        logger.debug(
            f"Best ground face detected: '{best_face}' with score {best_score:.4f}"
        )
        return best_face, best_normal

    return None