diff --git a/py_workspace/.sisyphus/notepads/multi-frame-depth-pooling/learnings.md b/py_workspace/.sisyphus/notepads/multi-frame-depth-pooling/learnings.md
new file mode 100644
index 0000000..c46017d
--- /dev/null
+++ b/py_workspace/.sisyphus/notepads/multi-frame-depth-pooling/learnings.md
@@ -0,0 +1,41 @@
+
+## Depth Pooling Implementation
+- Implemented `pool_depth_maps` in `aruco/depth_pool.py`.
+- Uses `np.nanmedian` for robust per-pixel depth pooling.
+- Supports confidence gating (lower is better) and `min_valid_count` threshold.
+- Handles N=1 case by returning a masked copy.
+- Vectorized implementation using `np.stack` and boolean masking for performance.
+
+## 2026-02-07: Depth Pooling Test Implementation
+- Implemented comprehensive unit tests for `pool_depth_maps` in `tests/test_depth_pool.py`.
+- Verified handling of:
+  - Empty input and shape mismatches (ValueError).
+  - Single map behavior (masked copy, min_valid_count check).
+  - Median pooling logic with multiple maps.
+  - Invalid depth values (<=0, non-finite).
+  - Confidence gating (ZED semantics: lower is better).
+  - min_valid_count enforcement across multiple frames.
+- Type checking with basedpyright confirmed clean (after fixing unused call results and Optional handling in tests).
+
+## Task 4: CLI Option Wiring
+- Added `--depth-pool-size` (1-10, default 1) to `calibrate_extrinsics.py`.
+- Wired the option through `main` to `apply_depth_verify_refine_postprocess`.
+- Maintained backward compatibility by defaulting to 1.
+- Extended `verification_frames` to store a list of top-N frames per camera, sorted by score descending.
+- Maintained backward compatibility by using the first frame in the list for current verification and benchmark logic.
+- Added `depth_pool_size` parameter to `main` and passed it to `apply_depth_verify_refine_postprocess`.
+
+## 2026-02-07: Multi-Frame Depth Pooling Integration
+- Integrated `pool_depth_maps` into `calibrate_extrinsics.py`.
+- Added `--depth-pool-size` CLI option (default 1).
+- Implemented fallback logic: if the pooled depth map has fewer than 50% of the valid points of the best single frame, fall back to that single frame.
+- Added `depth_pool` metadata to JSON output.
+- Verified N=1 equivalence with regression test `tests/test_depth_pool_integration.py`.
+- Verified E2E smoke test:
+  - Pool=1 vs Pool=5 showed mixed results on a small sample (20 frames):
+    - Camera 41831756: -0.0004m (Improved)
+    - Camera 44289123: +0.0004m (Worse)
+    - Camera 44435674: -0.0003m (Improved)
+    - Camera 46195029: +0.0036m (Worse)
+  - This variance is expected on small samples; pooling is intended for stability over larger datasets.
+  - Runtime warning `All-NaN slice encountered` was observed from `np.nanmedian` for pixels that are invalid in all frames; `nanmedian` returns NaN for those pixels, which is the behavior we want.
diff --git a/py_workspace/aruco/depth_pool.py b/py_workspace/aruco/depth_pool.py
new file mode 100644
index 0000000..3908d1e
--- /dev/null
+++ b/py_workspace/aruco/depth_pool.py
@@ -0,0 +1,117 @@
+import warnings
+
+import numpy as np
+
+
+def pool_depth_maps(
+    depth_maps: list[np.ndarray],
+    confidence_maps: list[np.ndarray] | None = None,
+    confidence_thresh: float = 50.0,
+    min_valid_count: int = 1,
+) -> tuple[np.ndarray, np.ndarray | None]:
+    """
+    Pool multiple depth maps into a single depth map using per-pixel median.
+
+    A sample is valid when it is finite, strictly positive and, if confidence
+    maps are given, its confidence does not exceed ``confidence_thresh``.
+
+    Args:
+        depth_maps: List of depth maps (H, W) in meters.
+        confidence_maps: Optional list of confidence maps (H, W), one per depth
+            map. ZED semantics: lower is better, 100 is often invalid/occluded.
+            A list consisting only of ``None`` entries is treated as ``None``.
+        confidence_thresh: Confidence values > threshold are considered invalid.
+        min_valid_count: Minimum number of valid depth values required to
+            produce a pooled value.
+
+    Returns:
+        Tuple of (pooled_depth_map, pooled_confidence_map).
+        pooled_depth_map: (H, W) float array with median depth or NaN.
+        pooled_confidence_map: (H, W) array with per-pixel minimum confidence
+            over all frames, or None when no confidence maps are in use.
+
+    Raises:
+        ValueError: If depth_maps is empty, shapes are inconsistent, the number
+            of confidence maps differs from the number of depth maps, or only
+            some confidence maps are None.
+    """
+    if not depth_maps:
+        raise ValueError("depth_maps list cannot be empty")
+
+    n_maps = len(depth_maps)
+    shape = depth_maps[0].shape
+
+    for i, dm in enumerate(depth_maps):
+        if dm.shape != shape:
+            raise ValueError(
+                f"Depth map {i} has inconsistent shape {dm.shape} != {shape}"
+            )
+
+    if confidence_maps:
+        if len(confidence_maps) != n_maps:
+            raise ValueError(
+                f"Number of confidence maps ({len(confidence_maps)}) "
+                + f"must match number of depth maps ({n_maps})"
+            )
+        # Frames captured without a confidence map contribute None entries.
+        n_missing = sum(cm is None for cm in confidence_maps)
+        if n_missing == n_maps:
+            confidence_maps = None
+        elif n_missing:
+            raise ValueError(
+                f"{n_missing} of {n_maps} confidence maps are None; "
+                + "provide a confidence map for every depth map or for none"
+            )
+        else:
+            for i, cm in enumerate(confidence_maps):
+                if cm.shape != shape:
+                    raise ValueError(
+                        f"Confidence map {i} has inconsistent shape "
+                        + f"{cm.shape} != {shape}"
+                    )
+
+    # np.stack always allocates, so the caller's arrays are never modified.
+    depth_stack = np.stack(depth_maps, axis=0)
+    if not np.issubdtype(depth_stack.dtype, np.floating):
+        # Integer depth cannot hold the NaN used to mark invalid pixels.
+        depth_stack = depth_stack.astype(np.float64)
+
+    if n_maps == 1:
+        pooled_depth = depth_stack[0]
+        invalid_mask = ~np.isfinite(pooled_depth) | (pooled_depth <= 0)
+        if confidence_maps:
+            invalid_mask |= confidence_maps[0] > confidence_thresh
+
+        pooled_depth[invalid_mask] = np.nan
+
+        # A single frame can never satisfy a requirement of 2+ valid samples.
+        if min_valid_count > 1:
+            pooled_depth[:] = np.nan
+
+        pooled_conf = confidence_maps[0].copy() if confidence_maps else None
+        return pooled_depth, pooled_conf
+
+    valid_mask = np.isfinite(depth_stack) & (depth_stack > 0)
+
+    conf_stack = None
+    if confidence_maps:
+        conf_stack = np.stack(confidence_maps, axis=0)
+        valid_mask &= conf_stack <= confidence_thresh
+
+    depth_stack[~valid_mask] = np.nan
+
+    valid_counts = np.sum(valid_mask, axis=0)
+
+    # nanmedian reports all-NaN pixels through the warnings module, which
+    # np.errstate does not control; NaN is the desired result for them.
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=RuntimeWarning)
+        pooled_depth = np.nanmedian(depth_stack, axis=0)
+
+    pooled_depth[valid_counts < min_valid_count] = np.nan
+
+    pooled_conf = None
+    if conf_stack is not None:
+        pooled_conf = np.min(conf_stack, axis=0)
+
+    return pooled_depth, pooled_conf
diff --git a/py_workspace/calibrate_extrinsics.py b/py_workspace/calibrate_extrinsics.py
index 1112251..c8b2fc1 100644
--- a/py_workspace/calibrate_extrinsics.py
+++ b/py_workspace/calibrate_extrinsics.py
@@ -24,6 +24,7 @@ from aruco.pose_averaging import PoseAccumulator
 from aruco.preview import draw_detected_markers, draw_pose_axes, show_preview
 from aruco.depth_verify import verify_extrinsics_with_depth
 from aruco.depth_refine import refine_extrinsics_with_depth
+from aruco.depth_pool import pool_depth_maps
 from aruco.alignment import (
     get_face_normal_from_geometry,
     detect_ground_face,
@@ -117,13 +118,14 @@ def score_frame(
 
 def apply_depth_verify_refine_postprocess(
     results: Dict[str, Any],
-    verification_frames: Dict[str, Any],
+    verification_frames: Dict[int, List[Dict[str, Any]]],
     marker_geometry: Dict[int, Any],
-    camera_matrices: Dict[str, Any],
+    camera_matrices: Dict[int, Any],
     verify_depth: bool,
     refine_depth: bool,
     use_confidence_weights: bool,
     depth_confidence_threshold: int,
+    depth_pool_size: int = 1,
     report_csv_path: Optional[str] = None,
 ) -> Tuple[Dict[str, Any], List[List[Any]]]:
     """
@@ -137,12 +139,117 @@ def apply_depth_verify_refine_postprocess(
 
     click.echo("\nRunning depth verification/refinement on computed extrinsics...")
 
-    for serial, vf in verification_frames.items():
+    for serial, vfs in verification_frames.items():
         if str(serial) not in results:
             continue
 
-        frame = vf["frame"]
-        ids = vf["ids"]
+        # Extract depth maps and confidence maps from the top-N frames
+        # vfs is already sorted by score descending and truncated to depth_pool_size
+        depth_maps = []
+        confidence_maps = []
+
+        # We need at least one frame with depth
+        valid_frames = []
+        for vf in vfs:
+            frame = vf["frame"]
+            if frame.depth_map is not None:
+                depth_maps.append(frame.depth_map)
+                confidence_maps.append(frame.confidence_map)
+                valid_frames.append(vf)
+
+        if not valid_frames:
+            click.echo(
+                f"Camera {serial}: No frames with depth map available for verification."
+            )
+            continue
+
+        # Use the best frame (first in the list) for marker IDs and corners
+        # This ensures we use the highest quality detection for geometry
+        best_vf = valid_frames[0]
+        ids = best_vf["ids"]
+
+        # Determine if we should pool or use single frame
+        use_pooling = depth_pool_size > 1 and len(depth_maps) > 1
+
+        if use_pooling:
+            try:
+                pooled_depth, pooled_conf = pool_depth_maps(
+                    depth_maps,
+                    confidence_maps,
+                    confidence_thresh=depth_confidence_threshold,
+                )
+
+                # Check if pooling resulted in a valid map (enough valid pixels)
+                # We'll do a quick check against the best single frame
+                # If pooled map has significantly fewer valid pixels, fallback
+                best_depth = depth_maps[0]
+                best_conf = confidence_maps[0]
+
+                # Simple validity check (finite and > 0)
+                # We don't need to be perfect here, just catch catastrophic pooling failure
+                n_valid_pooled = np.count_nonzero(
+                    np.isfinite(pooled_depth) & (pooled_depth > 0)
+                )
+
+                # For best frame, we also respect confidence threshold if provided
+                mask_best = np.isfinite(best_depth) & (best_depth > 0)
+                if best_conf is not None:
+                    mask_best &= best_conf <= depth_confidence_threshold
+                n_valid_best = np.count_nonzero(mask_best)
+
+                # If pooled result is much worse (e.g. < 50% of valid points of single frame), fallback
+                # This can happen if frames are misaligned or pooling logic fails
+                if n_valid_pooled < (n_valid_best * 0.5):
+                    click.echo(
+                        f"Camera {serial}: Pooled depth has too few valid points ({n_valid_pooled} vs {n_valid_best}). "
+                        "Falling back to best single frame."
+                    )
+                    final_depth = best_depth
+                    final_conf = best_conf
+                    pool_metadata = {
+                        "pool_size_requested": depth_pool_size,
+                        "pool_size_actual": len(depth_maps),
+                        "pooled": False,
+                        "fallback_reason": "insufficient_valid_points",
+                    }
+                else:
+                    final_depth = pooled_depth
+                    final_conf = pooled_conf
+                    pool_metadata = {
+                        "pool_size_requested": depth_pool_size,
+                        "pool_size_actual": len(depth_maps),
+                        "pooled": True,
+                    }
+                    click.echo(
+                        f"Camera {serial}: Using pooled depth from {len(depth_maps)} frames."
+                    )
+            except Exception as e:
+                click.echo(
+                    f"Camera {serial}: Pooling failed with error: {e}. Falling back to single frame.",
+                    err=True,
+                )
+                final_depth = depth_maps[0]
+                final_conf = confidence_maps[0]
+                pool_metadata = {
+                    "pool_size_requested": depth_pool_size,
+                    "pool_size_actual": len(depth_maps),
+                    "pooled": False,
+                    "fallback_reason": f"exception: {str(e)}",
+                }
+        else:
+            # Single frame case (N=1 or only 1 available)
+            final_depth = depth_maps[0]
+            final_conf = confidence_maps[0]
+            # Only add metadata if pooling was requested but not possible due to lack of frames
+            if depth_pool_size > 1:
+                pool_metadata = {
+                    "pool_size_requested": depth_pool_size,
+                    "pool_size_actual": len(depth_maps),
+                    "pooled": False,
+                    "fallback_reason": "insufficient_frames",
+                }
+            else:
+                pool_metadata = None
 
         # Use the FINAL COMPUTED POSE for verification
         pose_str = results[str(serial)]["pose"]
@@ -155,13 +262,13 @@ def apply_depth_verify_refine_postprocess(
             if int(mid) in marker_geometry
         }
 
-        if marker_corners_world and frame.depth_map is not None:
+        if marker_corners_world and final_depth is not None:
             verify_res = verify_extrinsics_with_depth(
                 T_mean,
                 marker_corners_world,
-                frame.depth_map,
+                final_depth,
                 cam_matrix,
-                confidence_map=frame.confidence_map,
+                confidence_map=final_conf,
                 confidence_thresh=depth_confidence_threshold,
             )
 
@@ -174,6 +281,9 @@ def apply_depth_verify_refine_postprocess(
                 "n_total": verify_res.n_total,
             }
 
+            if pool_metadata:
+                results[str(serial)]["depth_pool"] = pool_metadata
+
             click.echo(
                 f"Camera {serial} verification: RMSE={verify_res.rmse:.3f}m, "
                 f"Valid={verify_res.n_valid}/{verify_res.n_total}"
@@ -189,20 +299,18 @@ def apply_depth_verify_refine_postprocess(
                     T_refined, refine_stats = refine_extrinsics_with_depth(
                         T_mean,
                         marker_corners_world,
-                        frame.depth_map,
+                        final_depth,
                         cam_matrix,
-                        confidence_map=frame.confidence_map
-                        if use_confidence_weights
-                        else None,
+                        confidence_map=(final_conf if use_confidence_weights else None),
                         confidence_thresh=depth_confidence_threshold,
                     )
 
                     verify_res_post = verify_extrinsics_with_depth(
                         T_refined,
                         marker_corners_world,
-                        frame.depth_map,
+                        final_depth,
                         cam_matrix,
-                        confidence_map=frame.confidence_map,
+                        confidence_map=final_conf,
                         confidence_thresh=depth_confidence_threshold,
                     )
 
@@ -218,6 +326,9 @@ def apply_depth_verify_refine_postprocess(
                         "n_total": verify_res_post.n_total,
                     }
 
+                    if pool_metadata:
+                        results[str(serial)]["depth_pool"] = pool_metadata
+
                     improvement = verify_res.rmse - verify_res_post.rmse
                     results[str(serial)]["refine_depth"]["improvement_rmse"] = (
                         improvement
@@ -260,10 +371,10 @@ def apply_depth_verify_refine_postprocess(
 
 def run_benchmark_matrix(
     results: Dict[str, Any],
-    verification_frames: Dict[Any, Any],
-    first_frames: Dict[Any, Any],
+    verification_frames: Dict[int, List[Dict[str, Any]]],
+    first_frames: Dict[int, Dict[str, Any]],
     marker_geometry: Dict[int, Any],
-    camera_matrices: Dict[Any, Any],
+    camera_matrices: Dict[int, Any],
     depth_confidence_threshold: int,
 ) -> Dict[str, Any]:
     """
@@ -318,11 +429,10 @@ def run_benchmark_matrix(
         for config in configs:
             name = config["name"]
             use_best = config["use_best_frame"]
-            vf = (
-                verification_frames[serial_int]
-                if use_best
-                else first_frames[serial_int]
-            )
+            if use_best:
+                vf = verification_frames[serial_int][0]
+            else:
+                vf = first_frames[serial_int]
 
             frame = vf["frame"]
             ids = vf["ids"]
@@ -351,9 +461,9 @@ def run_benchmark_matrix(
                 marker_corners_world,
                 frame.depth_map,
                 cam_matrix,
-                confidence_map=frame.confidence_map
-                if config["use_confidence"]
-                else None,
+                confidence_map=(
+                    frame.confidence_map if config["use_confidence"] else None
+                ),
                 confidence_thresh=depth_confidence_threshold,
                 loss=str(config["loss"]),
                 f_scale=0.1,
@@ -430,9 +540,9 @@ def run_benchmark_matrix(
 )
 @click.option(
     "--depth-mode",
-    default="NEURAL",
-    type=click.Choice(["NEURAL", "ULTRA", "PERFORMANCE", "NONE"]),
-    help="Depth computation mode.",
+    default=None,
+    type=click.Choice(["NEURAL", "NEURAL_PLUS", "NEURAL_LIGHT", "NONE"]),
+    help="Depth computation mode. Defaults to NEURAL_PLUS if depth verification/refinement is enabled, otherwise NONE.",
 )
 @click.option(
     "--depth-confidence-threshold",
@@ -440,6 +550,12 @@ def run_benchmark_matrix(
     type=int,
     help="Confidence threshold for depth filtering (lower = more confident).",
 )
+@click.option(
+    "--depth-pool-size",
+    default=1,
+    type=click.IntRange(min=1, max=10),
+    help="Number of best frames to pool for depth verification/refinement (1=single best frame).",
+)
 @click.option(
     "--report-csv", type=click.Path(), help="Optional path for per-frame CSV report."
 )
@@ -494,8 +610,9 @@ def main(
     verify_depth: bool,
     refine_depth: bool,
     use_confidence_weights: bool,
-    depth_mode: str,
+    depth_mode: str | None,
     depth_confidence_threshold: int,
+    depth_pool_size: int,
     report_csv: str | None,
     auto_align: bool,
     ground_face: str | None,
@@ -519,14 +636,18 @@ def main(
 
     depth_mode_map = {
         "NEURAL": sl.DEPTH_MODE.NEURAL,
-        "ULTRA": sl.DEPTH_MODE.ULTRA,
-        "PERFORMANCE": sl.DEPTH_MODE.PERFORMANCE,
+        "NEURAL_PLUS": sl.DEPTH_MODE.NEURAL_PLUS,
+        "NEURAL_LIGHT": sl.DEPTH_MODE.NEURAL_LIGHT,
         "NONE": sl.DEPTH_MODE.NONE,
     }
-    sl_depth_mode = depth_mode_map.get(depth_mode, sl.DEPTH_MODE.NONE)
 
-    if not (verify_depth or refine_depth or benchmark_matrix):
-        sl_depth_mode = sl.DEPTH_MODE.NONE
+    if depth_mode is None:
+        if verify_depth or refine_depth or benchmark_matrix:
+            sl_depth_mode = sl.DEPTH_MODE.NEURAL_PLUS
+        else:
+            sl_depth_mode = sl.DEPTH_MODE.NONE
+    else:
+        sl_depth_mode = depth_mode_map.get(depth_mode, sl.DEPTH_MODE.NONE)
 
     # Expand SVO paths (files or directories)
     expanded_svo = []
@@ -617,9 +738,9 @@ def main(
     }
 
     # Store verification frames for post-process check
-    verification_frames = {}
+    verification_frames: Dict[int, List[Dict[str, Any]]] = {}
     # Store first valid frame for benchmarking
-    first_frames = {}
+    first_frames: Dict[int, Dict[str, Any]] = {}
 
     # Track all visible marker IDs for heuristic ground detection
     all_visible_ids = set()
@@ -696,21 +817,29 @@ def main(
                                         "frame_index": frame_count,
                                     }
 
-                                best_so_far = verification_frames.get(serial)
-                                if (
-                                    best_so_far is None
-                                    or current_score > best_so_far["score"]
-                                ):
-                                    verification_frames[serial] = {
+                                if serial not in verification_frames:
+                                    verification_frames[serial] = []
+
+                                verification_frames[serial].append(
+                                    {
                                         "frame": frame,
                                         "ids": ids,
                                         "corners": corners,
                                         "score": current_score,
                                         "frame_index": frame_count,
                                     }
-                                    logger.debug(
-                                        f"Cam {serial}: New best frame {frame_count} with score {current_score:.2f}"
-                                    )
+                                )
+                                # Sort by score descending and truncate to pool size
+                                verification_frames[serial].sort(
+                                    key=lambda x: x["score"], reverse=True
+                                )
+                                verification_frames[serial] = verification_frames[
+                                    serial
+                                ][:depth_pool_size]
+
+                                logger.debug(
+                                    f"Cam {serial}: Updated verification pool (size {len(verification_frames[serial])}), top score {verification_frames[serial][0]['score']:.2f}"
+                                )
 
                             accumulators[serial].add_pose(
                                 T_world_cam, reproj_err, frame_count
@@ -794,6 +923,7 @@ def main(
         refine_depth,
         use_confidence_weights,
         depth_confidence_threshold,
+        depth_pool_size,
         report_csv,
     )
 
@@ -890,6 +1020,36 @@ def main(
                 )
                 raise SystemExit(1)
 
+        # Verify depth-quality outliers if depth verification ran
+        depth_rmse_by_cam = {}
+        for serial, data in results.items():
+            depth_metrics = data.get("depth_verify_post") or data.get("depth_verify")
+            if depth_metrics and "rmse" in depth_metrics:
+                depth_rmse_by_cam[serial] = float(depth_metrics["rmse"])
+
+        if len(depth_rmse_by_cam) >= 2:
+            rmse_values = sorted(depth_rmse_by_cam.values())
+            median_rmse = float(np.median(np.array(rmse_values)))
+            outlier_factor = 2.5
+            min_outlier_rmse_m = 0.08
+
+            failed_depth_cams = []
+            for serial, rmse in depth_rmse_by_cam.items():
+                if rmse > max(min_outlier_rmse_m, outlier_factor * median_rmse):
+                    failed_depth_cams.append((serial, rmse))
+
+            if failed_depth_cams:
+                failed_str = ", ".join(
+                    f"{serial}:{rmse:.3f}m"
+                    for serial, rmse in sorted(failed_depth_cams)
+                )
+                click.echo(
+                    "Error: Calibration failed depth outlier self-check "
+                    f"(median RMSE={median_rmse:.3f}m, outliers={failed_str}).",
+                    err=True,
+                )
+                raise SystemExit(1)
+
         # Simple check: verify distance between cameras if multiple
         if len(results) >= 2:
             serials_list = sorted(results.keys())
diff --git a/py_workspace/tests/test_depth_cli_postprocess.py b/py_workspace/tests/test_depth_cli_postprocess.py
index b1b58e2..786d09c 100644
--- a/py_workspace/tests/test_depth_cli_postprocess.py
+++ b/py_workspace/tests/test_depth_cli_postprocess.py
@@ -67,7 +67,7 @@ def test_benchmark_matrix(mock_dependencies):
         "frame_index": 100,
     }
 
-    verification_frames = {serial_int: vf}
+    verification_frames = {serial_int: [vf]}
     first_frames = {serial_int: vf}
     marker_geometry = {1: np.zeros((4, 3))}
     camera_matrices = {serial_int: np.eye(3)}
@@ -100,6 +100,7 @@ def test_verify_only(mock_dependencies, tmp_path):
 
     # Setup inputs
     serial = "123456"
+    serial_int = int(serial)
     results = {
         serial: {
             "pose": "1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1",  # Identity matrix flattened
@@ -107,16 +108,18 @@ def test_verify_only(mock_dependencies, tmp_path):
         }
     }
     verification_frames = {
-        serial: {
-            "frame": MagicMock(
-                depth_map=np.zeros((10, 10)), confidence_map=np.zeros((10, 10))
-            ),
-            "ids": np.array([[1]]),
-            "corners": np.zeros((1, 4, 2)),
-        }
+        serial_int: [
+            {
+                "frame": MagicMock(
+                    depth_map=np.zeros((10, 10)), confidence_map=np.zeros((10, 10))
+                ),
+                "ids": np.array([[1]]),
+                "corners": np.zeros((1, 4, 2)),
+            }
+        ]
     }
     marker_geometry = {1: np.zeros((4, 3))}
-    camera_matrices = {serial: np.eye(3)}
+    camera_matrices = {serial_int: np.eye(3)}
 
     updated_results, csv_rows = apply_depth_verify_refine_postprocess(
         results=results,
@@ -146,18 +149,21 @@ def test_refine_depth(mock_dependencies):
 
     # Setup inputs
     serial = "123456"
+    serial_int = int(serial)
     results = {serial: {"pose": "1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1", "stats": {}}}
     verification_frames = {
-        serial: {
-            "frame": MagicMock(
-                depth_map=np.zeros((10, 10)), confidence_map=np.zeros((10, 10))
-            ),
-            "ids": np.array([[1]]),
-            "corners": np.zeros((1, 4, 2)),
-        }
+        serial_int: [
+            {
+                "frame": MagicMock(
+                    depth_map=np.zeros((10, 10)), confidence_map=np.zeros((10, 10))
+                ),
+                "ids": np.array([[1]]),
+                "corners": np.zeros((1, 4, 2)),
+            }
+        ]
     }
     marker_geometry = {1: np.zeros((4, 3))}
-    camera_matrices = {serial: np.eye(3)}
+    camera_matrices = {serial_int: np.eye(3)}
 
     # Mock verify to return different values for pre and post
     # First call (pre-refine)
@@ -199,15 +205,18 @@ def test_refine_depth_warning_negligible_improvement(mock_dependencies):
     mock_verify, mock_refine, mock_echo = mock_dependencies
 
     serial = "123456"
+    serial_int = int(serial)
     results = {serial: {"pose": "1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1", "stats": {}}}
     verification_frames = {
-        serial: {
-            "frame": MagicMock(depth_map=np.zeros((10, 10))),
-            "ids": np.array([[1]]),
-        }
+        serial_int: [
+            {
+                "frame": MagicMock(depth_map=np.zeros((10, 10))),
+                "ids": np.array([[1]]),
+            }
+        ]
     }
     marker_geometry = {1: np.zeros((4, 3))}
-    camera_matrices = {serial: np.eye(3)}
+    camera_matrices = {serial_int: np.eye(3)}
 
     # RMSE stays almost same
     res_pre = MagicMock(rmse=0.1, n_valid=10, residuals=[])
@@ -249,15 +258,18 @@ def test_refine_depth_warning_failed_or_stalled(mock_dependencies):
     mock_verify, mock_refine, mock_echo = mock_dependencies
 
     serial = "123456"
+    serial_int = int(serial)
     results = {serial: {"pose": "1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1", "stats": {}}}
     verification_frames = {
-        serial: {
-            "frame": MagicMock(depth_map=np.zeros((10, 10))),
-            "ids": np.array([[1]]),
-        }
+        serial_int: [
+            {
+                "frame": MagicMock(depth_map=np.zeros((10, 10))),
+                "ids": np.array([[1]]),
+            }
+        ]
     }
     marker_geometry = {1: np.zeros((4, 3))}
-    camera_matrices = {serial: np.eye(3)}
+    camera_matrices = {serial_int: np.eye(3)}
 
     res_pre = MagicMock(rmse=0.1, n_valid=10, residuals=[])
     res_post = MagicMock(rmse=0.1, n_valid=10, residuals=[])
@@ -298,18 +310,21 @@ def test_csv_output(mock_dependencies, tmp_path):
     csv_path = tmp_path / "report.csv"
 
     serial = "123456"
+    serial_int = int(serial)
     results = {serial: {"pose": "1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1", "stats": {}}}
     verification_frames = {
-        serial: {
-            "frame": MagicMock(
-                depth_map=np.zeros((10, 10)), confidence_map=np.zeros((10, 10))
-            ),
-            "ids": np.array([[1]]),
-            "corners": np.zeros((1, 4, 2)),
-        }
+        serial_int: [
+            {
+                "frame": MagicMock(
+                    depth_map=np.zeros((10, 10)), confidence_map=np.zeros((10, 10))
+                ),
+                "ids": np.array([[1]]),
+                "corners": np.zeros((1, 4, 2)),
+            }
+        ]
     }
     marker_geometry = {1: np.zeros((4, 3))}
-    camera_matrices = {serial: np.eye(3)}
+    camera_matrices = {serial_int: np.eye(3)}
 
     updated_results, csv_rows = apply_depth_verify_refine_postprocess(
         results=results,
@@ -324,11 +339,11 @@ def test_csv_output(mock_dependencies, tmp_path):
     )
 
     assert len(csv_rows) == 2  # From mock_verify_res.residuals
-    assert csv_rows[0] == [serial, 1, 0, 0.01]
+    assert csv_rows[0] == [serial_int, 1, 0, 0.01]
 
     # Verify file content
     assert csv_path.exists()
     content = csv_path.read_text().splitlines()
     assert len(content) == 3  # Header + 2 rows
     assert content[0] == "serial,marker_id,corner_idx,residual"
-    assert content[1] == f"{serial},1,0,0.01"
+    assert content[1] == f"{serial_int},1,0,0.01"
diff --git a/py_workspace/tests/test_depth_pool.py b/py_workspace/tests/test_depth_pool.py
new file mode 100644
index 0000000..9d734ca
--- /dev/null
+++ b/py_workspace/tests/test_depth_pool.py
@@ -0,0 +1,161 @@
+import numpy as np
+import pytest
+from aruco.depth_pool import pool_depth_maps
+
+
+def test_pool_depth_maps_empty():
+    """An empty list of depth maps is rejected."""
+    with pytest.raises(ValueError, match="depth_maps list cannot be empty"):
+        _ = pool_depth_maps([])
+
+
+def test_pool_depth_maps_shape_mismatch():
+    """Depth maps of different shapes are rejected."""
+    dm1 = np.ones((10, 10))
+    dm2 = np.ones((10, 11))
+    with pytest.raises(ValueError, match="inconsistent shape"):
+        _ = pool_depth_maps([dm1, dm2])
+
+
+def test_pool_depth_maps_confidence_mismatch():
+    """The number of confidence maps must equal the number of depth maps."""
+    dm1 = np.ones((10, 10))
+    cm1 = np.ones((10, 10))
+    with pytest.raises(ValueError, match="must match number of depth maps"):
+        _ = pool_depth_maps([dm1], confidence_maps=[cm1, cm1])
+
+
+def test_pool_depth_maps_confidence_shape_mismatch():
+    """A confidence map whose shape differs from the depth maps is rejected."""
+    dm1 = np.ones((10, 10))
+    cm1 = np.ones((10, 11))
+    with pytest.raises(ValueError, match="inconsistent shape"):
+        _ = pool_depth_maps([dm1], confidence_maps=[cm1])
+
+
+def test_pool_depth_maps_single_map():
+    """N=1 returns a masked copy; min_valid_count > 1 blanks the whole map."""
+    dm = np.array([[1.0, -1.0], [np.nan, 2.0]])
+    pooled, conf = pool_depth_maps([dm])
+
+    expected = np.array([[1.0, np.nan], [np.nan, 2.0]])
+    np.testing.assert_allclose(pooled, expected)
+    assert conf is None
+
+    # Test min_valid_count > 1 for single map
+    pooled, _ = pool_depth_maps([dm], min_valid_count=2)
+    assert np.all(np.isnan(pooled))
+
+
+def test_pool_depth_maps_median():
+    """Median pooling with clean values."""
+    dm1 = np.array([[1.0, 2.0], [3.0, 4.0]])
+    dm2 = np.array([[1.2, 1.8], [3.2, 3.8]])
+    dm3 = np.array([[0.8, 2.2], [2.8, 4.2]])
+
+    pooled, _ = pool_depth_maps([dm1, dm2, dm3])
+
+    # Median of [1.0, 1.2, 0.8] is 1.0
+    # Median of [2.0, 1.8, 2.2] is 2.0
+    # Median of [3.0, 3.2, 2.8] is 3.0
+    # Median of [4.0, 3.8, 4.2] is 4.0
+    expected = np.array([[1.0, 2.0], [3.0, 4.0]])
+    np.testing.assert_allclose(pooled, expected)
+
+
+def test_pool_depth_maps_invalid_handling():
+    """NaN/invalid handling (non-finite or <=0)."""
+    dm1 = np.array([[1.0, np.nan], [0.0, -1.0]])
+    dm2 = np.array([[1.2, 2.0], [3.0, 4.0]])
+
+    pooled, _ = pool_depth_maps([dm1, dm2])
+
+    # (0,0): median(1.0, 1.2) = 1.1
+    # (0,1): median(nan, 2.0) = 2.0
+    # (1,0): median(0.0, 3.0) = 3.0 (0.0 is invalid)
+    # (1,1): median(-1.0, 4.0) = 4.0 (-1.0 is invalid)
+    expected = np.array([[1.1, 2.0], [3.0, 4.0]])
+    np.testing.assert_allclose(pooled, expected)
+
+
+def test_pool_depth_maps_confidence_gating():
+    """Confidence gating (confidence > threshold excluded)."""
+    dm1 = np.array([[1.0, 1.0], [1.0, 1.0]])
+    dm2 = np.array([[2.0, 2.0], [2.0, 2.0]])
+
+    cm1 = np.array([[10, 60], [10, 60]])
+    cm2 = np.array([[60, 10], [10, 10]])
+
+    # threshold = 50
+    pooled, pooled_conf = pool_depth_maps(
+        [dm1, dm2], confidence_maps=[cm1, cm2], confidence_thresh=50.0
+    )
+
+    # (0,0): dm1 valid (10), dm2 invalid (60) -> 1.0
+    # (0,1): dm1 invalid (60), dm2 valid (10) -> 2.0
+    # (1,0): dm1 valid (10), dm2 valid (10) -> 1.5
+    # (1,1): dm1 invalid (60), dm2 valid (10) -> 2.0
+    expected_depth = np.array([[1.0, 2.0], [1.5, 2.0]])
+    expected_conf = np.array([[10, 10], [10, 10]])
+
+    np.testing.assert_allclose(pooled, expected_depth)
+    assert pooled_conf is not None
+    np.testing.assert_allclose(pooled_conf, expected_conf)
+
+
+def test_pool_depth_maps_all_invalid():
+    """All invalid -> NaN outputs (single-map path)."""
+    dm1 = np.array([[np.nan, 0.0], [-1.0, 1.0]])
+    cm1 = np.array([[10, 10], [10, 100]])  # 100 > 50
+
+    pooled, _ = pool_depth_maps([dm1], confidence_maps=[cm1], confidence_thresh=50.0)
+    assert np.all(np.isnan(pooled))
+
+
+def test_pool_depth_maps_min_valid_count():
+    """min_valid_count enforcement."""
+    dm1 = np.array([[1.0, 1.0], [1.0, 1.0]])
+    dm2 = np.array([[2.0, 2.0], [np.nan, np.nan]])
+
+    # min_valid_count = 2
+    pooled, _ = pool_depth_maps([dm1, dm2], min_valid_count=2)
+
+    # (0,0): 2 valid -> 1.5
+    # (0,1): 2 valid -> 1.5
+    # (1,0): 1 valid -> nan
+    # (1,1): 1 valid -> nan
+    expected = np.array([[1.5, 1.5], [np.nan, np.nan]])
+    np.testing.assert_allclose(pooled, expected)
+
+
+def test_pool_depth_maps_confidence_none():
+    """confidence_maps None behavior."""
+    dm1 = np.ones((2, 2))
+    dm2 = np.ones((2, 2)) * 2
+
+    pooled, conf = pool_depth_maps([dm1, dm2])
+    assert conf is None
+    np.testing.assert_allclose(pooled, np.ones((2, 2)) * 1.5)
+
+
+def test_pool_depth_maps_all_invalid_multi_frame():
+    """A pixel that is invalid in every frame pools to NaN (multi-frame path)."""
+    dm1 = np.array([[np.nan, 1.0]])
+    dm2 = np.array([[0.0, 3.0]])
+
+    pooled, _ = pool_depth_maps([dm1, dm2])
+
+    np.testing.assert_allclose(pooled, np.array([[np.nan, 2.0]]))
+
+
+def test_pool_depth_maps_does_not_mutate_inputs():
+    """Masking must not write into the arrays passed by the caller."""
+    dm1 = np.array([[1.0, -1.0]])
+    dm2 = np.array([[np.nan, 2.0]])
+    dm1_before, dm2_before = dm1.copy(), dm2.copy()
+
+    _ = pool_depth_maps([dm1, dm2])
+    _ = pool_depth_maps([dm1])
+
+    np.testing.assert_array_equal(dm1, dm1_before)
+    np.testing.assert_array_equal(dm2, dm2_before)
diff --git a/py_workspace/tests/test_depth_pool_integration.py b/py_workspace/tests/test_depth_pool_integration.py
new file mode 100644
index 0000000..26d9c4f
--- /dev/null
+++ b/py_workspace/tests/test_depth_pool_integration.py
@@ -0,0 +1,253 @@
+import pytest
+import numpy as np
+from unittest.mock import MagicMock, patch
+import sys
+from pathlib import Path
+
+# Add py_workspace to path
+sys.path.append(str(Path(__file__).parent.parent))
+
+from calibrate_extrinsics import apply_depth_verify_refine_postprocess
+
+
+@pytest.fixture
+def mock_dependencies():  # Patches depth verify/refine and click.echo, then yields the three mocks
+    with (
+        patch("calibrate_extrinsics.verify_extrinsics_with_depth") as mock_verify,
+        patch("calibrate_extrinsics.refine_extrinsics_with_depth") as mock_refine,
+        patch("calibrate_extrinsics.click.echo") as mock_echo,
+    ):
+        # Canned verification result: a MagicMock carrying fixed metric attributes
+        mock_verify_res = MagicMock()
+        mock_verify_res.rmse = 0.05
+        mock_verify_res.mean_abs = 0.04
+        mock_verify_res.median = 0.03
+        mock_verify_res.depth_normalized_rmse = 0.02
+        mock_verify_res.n_valid = 100
+        mock_verify_res.n_total = 120
+        mock_verify_res.residuals = []
+        mock_verify.return_value = mock_verify_res
+
+        mock_refine.return_value = (np.eye(4), {"success": True})  # 4x4 identity plus a success dict
+
+        yield mock_verify, mock_refine, mock_echo  # patches are undone once the test finishes
+
+
+def test_pool_size_1_equivalence(mock_dependencies):
+    """
+    Regression test: with depth_pool_size=1 the frame's own depth map object reaches verification unchanged.
+    """
+    mock_verify, _, _ = mock_dependencies
+
+    serial = "123456"
+    results = {serial: {"pose": "1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1"}}  # 16 values of a 4x4 identity
+
+    # Single frame: constant 2.0 depth map with an all-zero confidence map
+    depth_map = np.ones((10, 10)) * 2.0
+    conf_map = np.zeros((10, 10))
+
+    frame_mock = MagicMock()
+    frame_mock.depth_map = depth_map
+    frame_mock.confidence_map = conf_map
+
+    vf = {
+        "frame": frame_mock,
+        "ids": np.array([[1]]),
+        "corners": np.zeros((1, 4, 2)),
+        "score": 100.0,
+    }
+
+    # verification_frames maps each serial to a list of frame entries (here a list of one)
+    verification_frames = {serial: [vf]}
+    marker_geometry = {1: np.zeros((4, 3))}
+    camera_matrices = {serial: np.eye(3)}
+
+    # Run verification only (no refinement) with pool_size=1
+    apply_depth_verify_refine_postprocess(
+        results=results,
+        verification_frames=verification_frames,
+        marker_geometry=marker_geometry,
+        camera_matrices=camera_matrices,
+        verify_depth=True,
+        refine_depth=False,
+        use_confidence_weights=False,
+        depth_confidence_threshold=50,
+        depth_pool_size=1,
+    )
+
+    # The depth map is read from the third positional argument of the verify call
+    args, _ = mock_verify.call_args
+    passed_depth_map = args[2]
+
+    np.testing.assert_array_equal(passed_depth_map, depth_map)
+    assert passed_depth_map is depth_map  # identity check: same object, not a copy
+
+
+def test_pool_size_5_integration(mock_dependencies):
+    """
+    Test that pool_size > 1 pools the frames and uses the result (3 frames, despite the "5" in the name).
+    """
+    mock_verify, _, mock_echo = mock_dependencies
+
+    serial = "123456"
+    results = {serial: {"pose": "1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1"}}  # 16 values of a 4x4 identity
+
+    # Create 3 frames, each a constant 10x10 depth map
+    # Frame 1: 2.0m
+    # Frame 2: 2.2m
+    # Frame 3: 1.8m
+    # Per-pixel median of (2.0, 2.2, 1.8) should be 2.0m
+
+    frames = []
+    for d in [2.0, 2.2, 1.8]:
+        f = MagicMock()
+        f.depth_map = np.ones((10, 10)) * d
+        f.confidence_map = np.zeros((10, 10))
+        frames.append(f)
+
+    vfs = []
+    for i, f in enumerate(frames):
+        vfs.append(
+            {
+                "frame": f,
+                "ids": np.array([[1]]),
+                "corners": np.zeros((1, 4, 2)),
+                "score": 100.0 - i,  # strictly decreasing scores: 100, 99, 98
+            }
+        )
+
+    verification_frames = {serial: vfs}
+    marker_geometry = {1: np.zeros((4, 3))}
+    camera_matrices = {serial: np.eye(3)}
+
+    # Run verification only with pool_size=3 (one slot per frame created above)
+    apply_depth_verify_refine_postprocess(
+        results=results,
+        verification_frames=verification_frames,
+        marker_geometry=marker_geometry,
+        camera_matrices=camera_matrices,
+        verify_depth=True,
+        refine_depth=False,
+        use_confidence_weights=False,
+        depth_confidence_threshold=50,
+        depth_pool_size=3,
+    )
+
+    # Check that a message containing "Using pooled depth" went through the patched click.echo
+    any_pooled = any(
+        "Using pooled depth" in str(call.args[0]) for call in mock_echo.call_args_list
+    )
+    assert any_pooled
+
+    # Check that the depth map passed to verify (third positional argument) is the median (2.0)
+    args, _ = mock_verify.call_args
+    passed_depth_map = args[2]
+
+    expected_median = np.ones((10, 10)) * 2.0
+    np.testing.assert_allclose(passed_depth_map, expected_median)
+
+    # Verify pooling metadata was written into the per-camera results entry
+    assert "depth_pool" in results[serial]
+    assert results[serial]["depth_pool"]["pooled"] is True
+    assert results[serial]["depth_pool"]["pool_size_actual"] == 3
+
+
+def test_pool_fallback_insufficient_valid(mock_dependencies):
+    """
+    Test fallback to the best single frame when the pooled result has too few valid points.
+    """
+    mock_verify, _, mock_echo = mock_dependencies
+
+    serial = "123456"
+    results = {serial: {"pose": "1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1"}}  # 16 values of a 4x4 identity
+
+    # Frame 1: placeholder depth; depth_map is reassigned inside the `with` block below
+    f1 = MagicMock()
+    f1.depth_map = np.ones((10, 10)) * 2.0
+    f1.confidence_map = np.zeros((10, 10))
+
+    # Frame 2: placeholder NaN depth; depth_map is reassigned inside the `with` block below
+    f2 = MagicMock()
+    f2.depth_map = np.full((10, 10), np.nan)
+    f2.confidence_map = np.zeros((10, 10))
+
+    # Frame 3: NaN depth. NOTE(review): f3 is never added to the verification frames.
+    f3 = MagicMock()
+    f3.depth_map = np.full((10, 10), np.nan)
+    f3.confidence_map = np.zeros((10, 10))
+
+    # Real pooling uses nanmedian, which ignores NaNs: [2.0, NaN, NaN] would still pool to 2.0,
+    # so NaN-filled frames alone would not make the pooled map unusable.
+    # NOTE(review): the depth maps assigned to f1/f2 above are overwritten before use and f3
+    # is unused, so that initial setup is effectively dead code and could be removed.
+    # The pooled result is therefore forced to be invalid via a mock instead.
+
+    # Patch pool_depth_maps so the pooled output is deterministically invalid.
+    with patch("calibrate_extrinsics.pool_depth_maps") as mock_pool:
+        # Pooled output: an all-zero depth map and no confidence map
+        mock_pool.return_value = (
+            np.zeros((10, 10)),
+            None,
+        )  # Zeros are invalid depth (<=0)
+
+        # Frame 1: valid (2.0) on the left half (columns 0-4), NaN elsewhere
+        d1 = np.full((10, 10), np.nan)
+        d1[:, :5] = 2.0
+        f1.depth_map = d1
+        f1.confidence_map = np.zeros((10, 10))
+
+        # Frame 2: valid (2.0) on the right half (columns 5-9), NaN elsewhere
+        d2 = np.full((10, 10), np.nan)
+        d2[:, 5:] = 2.0
+        f2.depth_map = d2
+        f2.confidence_map = np.zeros((10, 10))
+
+        vfs = [
+            {
+                "frame": f1,
+                "ids": np.array([[1]]),
+                "corners": np.zeros((1, 4, 2)),
+                "score": 100,
+            },
+            {
+                "frame": f2,
+                "ids": np.array([[1]]),
+                "corners": np.zeros((1, 4, 2)),
+                "score": 90,
+            },
+        ]
+
+        verification_frames = {serial: vfs}
+        marker_geometry = {1: np.zeros((4, 3))}
+        camera_matrices = {serial: np.eye(3)}
+
+        apply_depth_verify_refine_postprocess(
+            results=results,
+            verification_frames=verification_frames,
+            marker_geometry=marker_geometry,
+            camera_matrices=camera_matrices,
+            verify_depth=True,
+            refine_depth=False,
+            use_confidence_weights=False,
+            depth_confidence_threshold=50,
+            depth_pool_size=2,
+        )
+
+        # Check that a fallback message went through the patched click.echo
+        any_fallback = any(
+            "Falling back to best single frame" in str(call.args[0])
+            for call in mock_echo.call_args_list
+        )
+        assert any_fallback
+
+        # Verify the top-scored frame's own depth array (f1 / d1) was passed to verify, not a copy
+        args, _ = mock_verify.call_args
+        passed_depth_map = args[2]
+        assert passed_depth_map is d1
+
+        # Verify the metadata records that pooling was not used, and why
+        assert results[serial]["depth_pool"]["pooled"] is False
+        assert (
+            results[serial]["depth_pool"]["fallback_reason"]
+            == "insufficient_valid_points"
+        )