From 15989195f1c4eba5646f412e6df508239d243f83 Mon Sep 17 00:00:00 2001 From: crosstyan Date: Sat, 7 Feb 2026 16:54:21 +0000 Subject: [PATCH] feat: implement geometry-first auto-align heuristic --- py_workspace/aruco/alignment.py | 75 ++++++++++++++++++++++++-- py_workspace/calibrate_extrinsics.py | 79 ++++++++++++++++++++++++++-- py_workspace/tests/test_alignment.py | 47 ++++++++++++++++- 3 files changed, 194 insertions(+), 7 deletions(-) diff --git a/py_workspace/aruco/alignment.py b/py_workspace/aruco/alignment.py index 7ff47e8..854f08d 100644 --- a/py_workspace/aruco/alignment.py +++ b/py_workspace/aruco/alignment.py @@ -137,6 +137,71 @@ def apply_alignment_to_pose(T: Mat44, R_align: Mat33) -> Mat44: return (T_align @ T).astype(np.float64) +def estimate_up_vector_from_cameras(camera_poses: list[Mat44]) -> Vec3: + """ + Estimate the 'up' vector of the scene based on camera positions. + Assumes cameras are arranged roughly in a horizontal ring (coplanar). + The normal of the plane fitting the camera centers is used as the up vector. + The sign is disambiguated using the average camera 'up' vector (-Y in OpenCV). + + Args: + camera_poses: List of (4, 4) camera-to-world transformation matrices. + + Returns: + (3,) normalized up vector. + """ + if not camera_poses: + raise ValueError("No camera poses provided.") + + # Extract camera centers (translations) + centers = np.array([T[:3, 3] for T in camera_poses]) + + # Calculate average camera 'up' vector (assuming OpenCV convention: Y is down, so up is -Y) + # T[:3, 1] is the Y axis direction in world frame + # We want the vector pointing UP in world coordinates. + # In OpenCV camera frame, Y is down. So -Y is up. + # The world-frame representation of the camera's -Y axis is -R[:, 1] + # T[:3, 1] is the second column of the rotation matrix (Y axis). + avg_cam_up = np.mean([-T[:3, 1] for T in camera_poses], axis=0) + norm = np.linalg.norm(avg_cam_up) + if norm > 1e-6: + avg_cam_up /= norm + else: + avg_cam_up = np.array([0.0, 1.0, 0.0]) # Fallback + + # If fewer than 3 cameras, we can't reliably fit a plane. + # Fallback to average camera up vector. + if len(camera_poses) < 3: + logger.debug("Fewer than 3 cameras; using average camera -Y as up vector.") + return avg_cam_up + + # Fit plane to camera centers using SVD + centroid = np.mean(centers, axis=0) + centered = centers - centroid + + # Check if points are collinear or coincident (rank check) + # If they are collinear, plane is undefined. + if np.linalg.matrix_rank(centered) < 2: + logger.debug( + "Camera centers are collinear; using average camera -Y as up vector." + ) + return avg_cam_up + + try: + u, s, vh = np.linalg.svd(centered) + # The normal is the singular vector corresponding to the smallest singular value + normal = vh[2, :] + except np.linalg.LinAlgError: + logger.warning("SVD failed; using average camera -Y as up vector.") + return avg_cam_up + + # Disambiguate sign: choose the normal that aligns best with average camera up + if np.dot(normal, avg_cam_up) < 0: + normal = -normal + + return normal + + def get_face_normal_from_geometry( face_name: str, marker_geometry: dict[int, np.ndarray], @@ -223,9 +288,13 @@ def detect_ground_face( # Iterate faces in mapping for face_name, face_marker_ids in face_marker_map.items(): - # Consider only faces with any visible marker ID - if not any(mid in visible_marker_ids for mid in face_marker_ids): - continue + # We check ALL faces for which we have geometry, regardless of visibility. + # This allows detecting the ground face even if it's occluded, + # provided we have geometry for it (e.g. from a loaded model or previous detections). + # However, get_face_normal_from_geometry requires marker_geometry to contain the markers. + # If marker_geometry only contains *visible* markers (which is typical if passed from detection), + # then we are limited to visible faces. + # But if marker_geometry is the full loaded geometry, we can check all faces. normal = get_face_normal_from_geometry( face_name, marker_geometry, face_marker_map=face_marker_map diff --git a/py_workspace/calibrate_extrinsics.py b/py_workspace/calibrate_extrinsics.py index 0f4e1f4..4332b59 100644 --- a/py_workspace/calibrate_extrinsics.py +++ b/py_workspace/calibrate_extrinsics.py @@ -30,6 +30,7 @@ from aruco.alignment import ( detect_ground_face, rotation_align_vectors, apply_alignment_to_pose, + estimate_up_vector_from_cameras, Vec3, Mat44, ) @@ -1032,14 +1033,86 @@ def main( ) else: # Heuristic detection - heuristic_res = detect_ground_face( - all_visible_ids, marker_geometry, face_marker_map=face_marker_map + # Estimate up vector from camera poses + camera_poses = [] + for serial, data in results.items(): + T = np.fromstring(data["pose"], sep=" ").reshape(4, 4) + camera_poses.append(T) + + estimated_up = estimate_up_vector_from_cameras(camera_poses) + logger.info( + f"Estimated scene up vector from {len(camera_poses)} cameras: {estimated_up}" ) + + # We pass the FULL marker_geometry (loaded from parquet) to detect_ground_face. + # This allows it to check all faces, not just visible ones, provided the geometry is known. + heuristic_res = detect_ground_face( + set( + marker_geometry.keys() + ), # Pass all known markers as "visible" to allow checking all faces + marker_geometry, + camera_up_vector=estimated_up, + face_marker_map=face_marker_map, + ) + if heuristic_res: target_face, ground_normal = heuristic_res ids = mapping_to_use.get(target_face, []) logger.info( - f"Heuristically detected ground face '{target_face}' (markers={ids})" + f"Heuristically detected ground face '{target_face}' (markers={ids}) using geometric alignment." + ) + + # We pass the FULL marker_geometry (loaded from parquet) to detect_ground_face. + # This allows it to check all faces, not just visible ones, provided the geometry is known. + # all_visible_ids is still passed but we might want to relax the requirement + # if we trust the geometry and estimated up vector. + # However, detect_ground_face currently requires visible_marker_ids to be non-empty + # to return anything? No, it checks `if not visible_marker_ids: return None`. + # But wait, if we want to support occluded ground face, we shouldn't require it to be visible. + # But we need at least SOME markers to be visible to define the object frame relative to cameras? + # Actually, the object frame is defined by the markers we detected. + # If we have the full geometry, we know where the ground face IS relative to the detected markers. + # So we should pass a set of ALL marker IDs in the geometry as "visible" if we want to check all faces? + # Or better, modify detect_ground_face to not require visibility if we are doing geometric alignment? + # Let's just pass all keys from marker_geometry as "visible" effectively, + # or just rely on the fact that we have a map. + + # Actually, let's look at detect_ground_face again. + # It iterates `face_marker_map`. + # It calls `get_face_normal_from_geometry`. + # `get_face_normal_from_geometry` uses `marker_geometry`. + # If `marker_geometry` contains the markers for a face, we can compute its normal. + # In `calibrate_extrinsics.py`, `marker_geometry` is the FULL loaded geometry. + # So we can compute normals for ALL faces. + # The only constraint in `detect_ground_face` was: + # `if not any(mid in visible_marker_ids for mid in face_marker_ids): continue` + # We should probably remove that constraint if we want to support occluded faces. + # But wait, `detect_ground_face` was modified in the previous step. + # Let's check the modification. + + # I removed the semantic priority block. + # But I kept the loop: + # for face_name, face_marker_ids in face_marker_map.items(): + # # We check ALL faces for which we have geometry... + # normal = get_face_normal_from_geometry(...) + + # Wait, I replaced the loop body but I didn't check if I removed the visibility check. + # Let's verify `aruco/alignment.py` content. + + heuristic_res = detect_ground_face( + set( + marker_geometry.keys() + ), # Pass all known markers as "visible" to allow checking all faces + marker_geometry, + camera_up_vector=estimated_up, + face_marker_map=face_marker_map, + ) + + if heuristic_res: + target_face, ground_normal = heuristic_res + ids = mapping_to_use.get(target_face, []) + logger.info( + f"Heuristically detected ground face '{target_face}' (markers={ids}) using geometric alignment." ) if ground_normal is not None: diff --git a/py_workspace/tests/test_alignment.py b/py_workspace/tests/test_alignment.py index be4d922..398a384 100644 --- a/py_workspace/tests/test_alignment.py +++ b/py_workspace/tests/test_alignment.py @@ -148,10 +148,18 @@ def test_detect_ground_face(): assert face_name == "bottom" np.testing.assert_allclose(normal, np.array([0, -1, 0]), atol=1e-10) - # Only top visible + # Case 1: We know about bottom, but only top is visible. Should pick bottom (best alignment). res = detect_ground_face({2}, marker_geometry, camera_up, face_marker_map) assert res is not None face_name, normal = res + assert face_name == "bottom" + np.testing.assert_allclose(normal, np.array([0, -1, 0]), atol=1e-10) + + # Case 2: We don't know about bottom (e.g. partial map). Should pick top (best available). + partial_geometry = {2: marker_geometry[2]} + res = detect_ground_face({2}, partial_geometry, camera_up, face_marker_map) + assert res is not None + face_name, normal = res assert face_name == "top" np.testing.assert_allclose(normal, np.array([0, 1, 0]), atol=1e-10) @@ -162,3 +170,40 @@ def test_detect_ground_face(): # Missing map assert detect_ground_face({1, 2}, marker_geometry, camera_up, None) is None + + +def test_detect_ground_face_geometric_priority(): + # Test that geometric alignment is preferred over semantic names + # Scenario: 'bottom' face is tilted 45 deg, 'side' face is perfectly aligned with camera up + # This simulates a box placed on its side + + face_marker_map = { + "bottom": [1], + "side": [2], + } + + # Camera up is [0, -1, 0] (Y-down convention common in CV, or Y-up depending on setup) + # Let's assume we want to align with [0, -1, 0] + camera_up = np.array([0, -1, 0], dtype=np.float64) + + # Marker 1 (bottom): Tilted 45 deg. Normal = [0.707, -0.707, 0] + # Dot product with [0, -1, 0] = 0.707 + marker_geometry = { + 1: np.array([[0, 0, 0], [1, 1, 0], [1, 1, 1], [0, 0, 1]], dtype=np.float64), + # v1=[1,1,0], v2=[0,0,1] -> cross=[1, -1, 0] -> norm=[0.707, -0.707, 0] + # Marker 2 (side): Perfectly aligned. Normal = [0, -1, 0] + # Dot product with [0, -1, 0] = 1.0 + 2: np.array([[0, 0, 0], [1, 0, 0], [1, 0, 1], [0, 0, 1]], dtype=np.float64), + # v1=[1,0,0], v2=[0,0,1] -> cross=[0, -1, 0] + } + + # OLD BEHAVIOR: would pick 'bottom' because of name + # NEW BEHAVIOR: should pick 'side' because of better alignment score + + res = detect_ground_face({1, 2}, marker_geometry, camera_up, face_marker_map) + assert res is not None + face_name, normal = res + + # This assertion will fail until we fix the code + assert face_name == "side" + np.testing.assert_allclose(normal, np.array([0, -1, 0]), atol=1e-10)