feat(calibrate): integrate multi-frame depth pooling with --depth-pool-size flag

This commit is contained in:
2026-02-07 08:10:01 +00:00
parent dad1f2a69f
commit 4fc8de4bdc
6 changed files with 774 additions and 82 deletions
+205 -45
View File
@@ -24,6 +24,7 @@ from aruco.pose_averaging import PoseAccumulator
from aruco.preview import draw_detected_markers, draw_pose_axes, show_preview
from aruco.depth_verify import verify_extrinsics_with_depth
from aruco.depth_refine import refine_extrinsics_with_depth
from aruco.depth_pool import pool_depth_maps
from aruco.alignment import (
get_face_normal_from_geometry,
detect_ground_face,
@@ -117,13 +118,14 @@ def score_frame(
def apply_depth_verify_refine_postprocess(
results: Dict[str, Any],
verification_frames: Dict[str, Any],
verification_frames: Dict[int, List[Dict[str, Any]]],
marker_geometry: Dict[int, Any],
camera_matrices: Dict[str, Any],
camera_matrices: Dict[int, Any],
verify_depth: bool,
refine_depth: bool,
use_confidence_weights: bool,
depth_confidence_threshold: int,
depth_pool_size: int = 1,
report_csv_path: Optional[str] = None,
) -> Tuple[Dict[str, Any], List[List[Any]]]:
"""
@@ -137,12 +139,117 @@ def apply_depth_verify_refine_postprocess(
click.echo("\nRunning depth verification/refinement on computed extrinsics...")
for serial, vf in verification_frames.items():
for serial, vfs in verification_frames.items():
if str(serial) not in results:
continue
frame = vf["frame"]
ids = vf["ids"]
# Extract depth maps and confidence maps from the top-N frames
# vfs is already sorted by score descending and truncated to depth_pool_size
depth_maps = []
confidence_maps = []
# We need at least one frame with depth
valid_frames = []
for vf in vfs:
frame = vf["frame"]
if frame.depth_map is not None:
depth_maps.append(frame.depth_map)
confidence_maps.append(frame.confidence_map)
valid_frames.append(vf)
if not valid_frames:
click.echo(
f"Camera {serial}: No frames with depth map available for verification."
)
continue
# Use the best frame (first in the list) for marker IDs and corners
# This ensures we use the highest quality detection for geometry
best_vf = valid_frames[0]
ids = best_vf["ids"]
# Determine if we should pool or use single frame
use_pooling = depth_pool_size > 1 and len(depth_maps) > 1
if use_pooling:
try:
pooled_depth, pooled_conf = pool_depth_maps(
depth_maps,
confidence_maps,
confidence_thresh=depth_confidence_threshold,
)
# Check if pooling resulted in a valid map (enough valid pixels)
# We'll do a quick check against the best single frame
                # If the pooled map has significantly fewer valid pixels, fall back
best_depth = depth_maps[0]
best_conf = confidence_maps[0]
# Simple validity check (finite and > 0)
# We don't need to be perfect here, just catch catastrophic pooling failure
n_valid_pooled = np.count_nonzero(
np.isfinite(pooled_depth) & (pooled_depth > 0)
)
# For best frame, we also respect confidence threshold if provided
mask_best = np.isfinite(best_depth) & (best_depth > 0)
if best_conf is not None:
mask_best &= best_conf <= depth_confidence_threshold
n_valid_best = np.count_nonzero(mask_best)
                # If the pooled result is much worse (e.g. < 50% of the valid points of the single frame), fall back
# This can happen if frames are misaligned or pooling logic fails
if n_valid_pooled < (n_valid_best * 0.5):
click.echo(
f"Camera {serial}: Pooled depth has too few valid points ({n_valid_pooled} vs {n_valid_best}). "
"Falling back to best single frame."
)
final_depth = best_depth
final_conf = best_conf
pool_metadata = {
"pool_size_requested": depth_pool_size,
"pool_size_actual": len(depth_maps),
"pooled": False,
"fallback_reason": "insufficient_valid_points",
}
else:
final_depth = pooled_depth
final_conf = pooled_conf
pool_metadata = {
"pool_size_requested": depth_pool_size,
"pool_size_actual": len(depth_maps),
"pooled": True,
}
click.echo(
f"Camera {serial}: Using pooled depth from {len(depth_maps)} frames."
)
except Exception as e:
click.echo(
f"Camera {serial}: Pooling failed with error: {e}. Falling back to single frame.",
err=True,
)
final_depth = depth_maps[0]
final_conf = confidence_maps[0]
pool_metadata = {
"pool_size_requested": depth_pool_size,
"pool_size_actual": len(depth_maps),
"pooled": False,
"fallback_reason": f"exception: {str(e)}",
}
else:
# Single frame case (N=1 or only 1 available)
final_depth = depth_maps[0]
final_conf = confidence_maps[0]
# Only add metadata if pooling was requested but not possible due to lack of frames
if depth_pool_size > 1:
pool_metadata = {
"pool_size_requested": depth_pool_size,
"pool_size_actual": len(depth_maps),
"pooled": False,
"fallback_reason": "insufficient_frames",
}
else:
pool_metadata = None
# Use the FINAL COMPUTED POSE for verification
pose_str = results[str(serial)]["pose"]
@@ -155,13 +262,13 @@ def apply_depth_verify_refine_postprocess(
if int(mid) in marker_geometry
}
if marker_corners_world and frame.depth_map is not None:
if marker_corners_world and final_depth is not None:
verify_res = verify_extrinsics_with_depth(
T_mean,
marker_corners_world,
frame.depth_map,
final_depth,
cam_matrix,
confidence_map=frame.confidence_map,
confidence_map=final_conf,
confidence_thresh=depth_confidence_threshold,
)
@@ -174,6 +281,9 @@ def apply_depth_verify_refine_postprocess(
"n_total": verify_res.n_total,
}
if pool_metadata:
results[str(serial)]["depth_pool"] = pool_metadata
click.echo(
f"Camera {serial} verification: RMSE={verify_res.rmse:.3f}m, "
f"Valid={verify_res.n_valid}/{verify_res.n_total}"
@@ -189,20 +299,18 @@ def apply_depth_verify_refine_postprocess(
T_refined, refine_stats = refine_extrinsics_with_depth(
T_mean,
marker_corners_world,
frame.depth_map,
final_depth,
cam_matrix,
confidence_map=frame.confidence_map
if use_confidence_weights
else None,
confidence_map=(final_conf if use_confidence_weights else None),
confidence_thresh=depth_confidence_threshold,
)
verify_res_post = verify_extrinsics_with_depth(
T_refined,
marker_corners_world,
frame.depth_map,
final_depth,
cam_matrix,
confidence_map=frame.confidence_map,
confidence_map=final_conf,
confidence_thresh=depth_confidence_threshold,
)
@@ -218,6 +326,9 @@ def apply_depth_verify_refine_postprocess(
"n_total": verify_res_post.n_total,
}
if pool_metadata:
results[str(serial)]["depth_pool"] = pool_metadata
improvement = verify_res.rmse - verify_res_post.rmse
results[str(serial)]["refine_depth"]["improvement_rmse"] = (
improvement
@@ -260,10 +371,10 @@ def apply_depth_verify_refine_postprocess(
def run_benchmark_matrix(
results: Dict[str, Any],
verification_frames: Dict[Any, Any],
first_frames: Dict[Any, Any],
verification_frames: Dict[int, List[Dict[str, Any]]],
first_frames: Dict[int, Dict[str, Any]],
marker_geometry: Dict[int, Any],
camera_matrices: Dict[Any, Any],
camera_matrices: Dict[int, Any],
depth_confidence_threshold: int,
) -> Dict[str, Any]:
"""
@@ -318,11 +429,10 @@ def run_benchmark_matrix(
for config in configs:
name = config["name"]
use_best = config["use_best_frame"]
vf = (
verification_frames[serial_int]
if use_best
else first_frames[serial_int]
)
if use_best:
vf = verification_frames[serial_int][0]
else:
vf = first_frames[serial_int]
frame = vf["frame"]
ids = vf["ids"]
@@ -351,9 +461,9 @@ def run_benchmark_matrix(
marker_corners_world,
frame.depth_map,
cam_matrix,
confidence_map=frame.confidence_map
if config["use_confidence"]
else None,
confidence_map=(
frame.confidence_map if config["use_confidence"] else None
),
confidence_thresh=depth_confidence_threshold,
loss=str(config["loss"]),
f_scale=0.1,
@@ -430,9 +540,9 @@ def run_benchmark_matrix(
)
@click.option(
"--depth-mode",
default="NEURAL",
type=click.Choice(["NEURAL", "ULTRA", "PERFORMANCE", "NONE"]),
help="Depth computation mode.",
default=None,
type=click.Choice(["NEURAL", "NEURAL_PLUS", "NEURAL_LIGHT", "NONE"]),
help="Depth computation mode. Defaults to NEURAL_PLUS if depth verification/refinement is enabled, otherwise NONE.",
)
@click.option(
"--depth-confidence-threshold",
@@ -440,6 +550,12 @@ def run_benchmark_matrix(
type=int,
help="Confidence threshold for depth filtering (lower = more confident).",
)
@click.option(
"--depth-pool-size",
default=1,
type=click.IntRange(min=1, max=10),
help="Number of best frames to pool for depth verification/refinement (1=single best frame).",
)
@click.option(
"--report-csv", type=click.Path(), help="Optional path for per-frame CSV report."
)
@@ -494,8 +610,9 @@ def main(
verify_depth: bool,
refine_depth: bool,
use_confidence_weights: bool,
depth_mode: str,
depth_mode: str | None,
depth_confidence_threshold: int,
depth_pool_size: int,
report_csv: str | None,
auto_align: bool,
ground_face: str | None,
@@ -519,14 +636,18 @@ def main(
depth_mode_map = {
"NEURAL": sl.DEPTH_MODE.NEURAL,
"ULTRA": sl.DEPTH_MODE.ULTRA,
"PERFORMANCE": sl.DEPTH_MODE.PERFORMANCE,
"NEURAL_PLUS": sl.DEPTH_MODE.NEURAL_PLUS,
"NEURAL_LIGHT": sl.DEPTH_MODE.NEURAL_LIGHT,
"NONE": sl.DEPTH_MODE.NONE,
}
sl_depth_mode = depth_mode_map.get(depth_mode, sl.DEPTH_MODE.NONE)
if not (verify_depth or refine_depth or benchmark_matrix):
sl_depth_mode = sl.DEPTH_MODE.NONE
if depth_mode is None:
if verify_depth or refine_depth or benchmark_matrix:
sl_depth_mode = sl.DEPTH_MODE.NEURAL_PLUS
else:
sl_depth_mode = sl.DEPTH_MODE.NONE
else:
sl_depth_mode = depth_mode_map.get(depth_mode, sl.DEPTH_MODE.NONE)
# Expand SVO paths (files or directories)
expanded_svo = []
@@ -617,9 +738,9 @@ def main(
}
# Store verification frames for post-process check
verification_frames = {}
verification_frames: Dict[int, List[Dict[str, Any]]] = {}
# Store first valid frame for benchmarking
first_frames = {}
first_frames: Dict[int, Dict[str, Any]] = {}
# Track all visible marker IDs for heuristic ground detection
all_visible_ids = set()
@@ -696,21 +817,29 @@ def main(
"frame_index": frame_count,
}
best_so_far = verification_frames.get(serial)
if (
best_so_far is None
or current_score > best_so_far["score"]
):
verification_frames[serial] = {
if serial not in verification_frames:
verification_frames[serial] = []
verification_frames[serial].append(
{
"frame": frame,
"ids": ids,
"corners": corners,
"score": current_score,
"frame_index": frame_count,
}
logger.debug(
f"Cam {serial}: New best frame {frame_count} with score {current_score:.2f}"
)
)
# Sort by score descending and truncate to pool size
verification_frames[serial].sort(
key=lambda x: x["score"], reverse=True
)
verification_frames[serial] = verification_frames[
serial
][:depth_pool_size]
logger.debug(
f"Cam {serial}: Updated verification pool (size {len(verification_frames[serial])}), top score {verification_frames[serial][0]['score']:.2f}"
)
accumulators[serial].add_pose(
T_world_cam, reproj_err, frame_count
@@ -794,6 +923,7 @@ def main(
refine_depth,
use_confidence_weights,
depth_confidence_threshold,
depth_pool_size,
report_csv,
)
@@ -890,6 +1020,36 @@ def main(
)
raise SystemExit(1)
# Verify depth-quality outliers if depth verification ran
depth_rmse_by_cam = {}
for serial, data in results.items():
depth_metrics = data.get("depth_verify_post") or data.get("depth_verify")
if depth_metrics and "rmse" in depth_metrics:
depth_rmse_by_cam[serial] = float(depth_metrics["rmse"])
if len(depth_rmse_by_cam) >= 2:
rmse_values = sorted(depth_rmse_by_cam.values())
median_rmse = float(np.median(np.array(rmse_values)))
outlier_factor = 2.5
min_outlier_rmse_m = 0.08
failed_depth_cams = []
for serial, rmse in depth_rmse_by_cam.items():
if rmse > max(min_outlier_rmse_m, outlier_factor * median_rmse):
failed_depth_cams.append((serial, rmse))
if failed_depth_cams:
failed_str = ", ".join(
f"{serial}:{rmse:.3f}m"
for serial, rmse in sorted(failed_depth_cams)
)
click.echo(
"Error: Calibration failed depth outlier self-check "
f"(median RMSE={median_rmse:.3f}m, outliers={failed_str}).",
err=True,
)
raise SystemExit(1)
# Simple check: verify distance between cameras if multiple
if len(results) >= 2:
serials_list = sorted(results.keys())