feat(demo): add export and silhouette visualization outputs

Add a preprocess-only silhouette export and configurable result exporters so that demo runs can be persisted for offline analysis and reproducible evaluation. Include optional Parquet support and CLI visualization dumps, and update the tests and tracking notes for the verified pipeline/debug workflow.
2026-02-27 17:16:20 +08:00
parent 3496a1beb7
commit f501119d43
10 changed files with 1101 additions and 217 deletions
@@ -5,7 +5,7 @@ with track ID tracking and gap detection.
 """

 from collections import deque
-from typing import TYPE_CHECKING, Protocol, final
+from typing import TYPE_CHECKING, Protocol, cast, final

 import numpy as np
 import torch
@@ -20,6 +20,9 @@ if TYPE_CHECKING:
 SIL_HEIGHT: int = 64
 SIL_WIDTH: int = 44

+# Type alias for array-like inputs
+type _ArrayLike = torch.Tensor | ndarray
+

 class _Boxes(Protocol):
    """Protocol for boxes with xyxy and id attributes."""
@@ -207,6 +210,33 @@ class SilhouetteWindow:
        return len(self._buffer) / self.window_size


+def _to_numpy(obj: _ArrayLike) -> ndarray:
+    """Safely convert array-like object to numpy array.
+
+    Objects exposing a callable ``detach`` attribute are treated as
+    tensor-like: they are detached, moved to CPU when a callable ``cpu``
+    attribute exists, and converted through their ``numpy`` method. This is
+    intended for torch tensors (CPU or CUDA). Everything else, including a
+    tensor-like object that lacks a callable ``numpy``, is passed to
+    ``np.asarray``.
+
+    Args:
+        obj: Array-like object (numpy array, torch tensor, or similar).
+
+    Returns:
+        Numpy array representation of the input.
+    """
+    # Duck-typed tensor check: the type is not inspected, only the presence
+    # of a callable ``detach`` attribute.
+    detach_fn = getattr(obj, "detach", None)
+    if detach_fn is not None and callable(detach_fn):
+        # Tensor-like path: detach first, then hop to CPU if supported
+        tensor = detach_fn()
+        cpu_fn = getattr(tensor, "cpu", None)
+        if cpu_fn is not None and callable(cpu_fn):
+            tensor = cpu_fn()
+        numpy_fn = getattr(tensor, "numpy", None)
+        if numpy_fn is not None and callable(numpy_fn):
+            return cast(ndarray, numpy_fn())
+    # Fall back to np.asarray for other array-like objects; this is also
+    # reached (with the original obj) when no callable ``numpy`` was found
+    return cast(ndarray, np.asarray(obj))
+
+
 def select_person(
    results: _DetectionResults,
 ) -> tuple[ndarray, tuple[int, int, int, int], int] | None:
@@ -232,7 +262,7 @@ def select_person(
    if track_ids_obj is None:
        return None

-    track_ids: ndarray = np.asarray(track_ids_obj)
+    track_ids: ndarray = _to_numpy(cast(ndarray, track_ids_obj))
    if track_ids.size == 0:
        return None

@@ -241,7 +271,7 @@ def select_person(
    if xyxy_obj is None:
        return None

-    bboxes: ndarray = np.asarray(xyxy_obj)
+    bboxes: ndarray = _to_numpy(cast(ndarray, xyxy_obj))
    if bboxes.ndim == 1:
        bboxes = bboxes.reshape(1, -1)

@@ -257,7 +287,7 @@ def select_person(
    if masks_data is None:
        return None

-    masks: ndarray = np.asarray(masks_data)
+    masks: ndarray = _to_numpy(cast(ndarray, masks_data))
    if masks.ndim == 2:
        masks = masks[np.newaxis, ...]

@@ -284,12 +314,35 @@ def select_person(

    # Extract mask and bbox
    mask: "NDArray[np.float32]" = masks[best_idx]
-    bbox = (
-        int(float(bboxes[best_idx][0])),
-        int(float(bboxes[best_idx][1])),
-        int(float(bboxes[best_idx][2])),
-        int(float(bboxes[best_idx][3])),
-    )
+    mask_shape = mask.shape
+    mask_h, mask_w = int(mask_shape[0]), int(mask_shape[1])
+
+    # Get original image dimensions from results (YOLO provides this)
+    orig_shape = getattr(results, "orig_shape", None)
+    # Use orig_shape only when it is a tuple or list with at least 2 entries
+    if (
+        orig_shape is not None
+        and isinstance(orig_shape, (tuple, list))
+        and len(orig_shape) >= 2
+    ):
+        frame_h, frame_w = int(orig_shape[0]), int(orig_shape[1])
+        # Scale bbox from frame space to mask space
+        scale_x = mask_w / frame_w if frame_w > 0 else 1.0
+        scale_y = mask_h / frame_h if frame_h > 0 else 1.0
+        bbox = (
+            int(float(bboxes[best_idx][0]) * scale_x),
+            int(float(bboxes[best_idx][1]) * scale_y),
+            int(float(bboxes[best_idx][2]) * scale_x),
+            int(float(bboxes[best_idx][3]) * scale_y),
+        )
+    else:
+        # Fallback: use bbox as-is (assume same coordinate space)
+        bbox = (
+            int(float(bboxes[best_idx][0])),
+            int(float(bboxes[best_idx][1])),
+            int(float(bboxes[best_idx][2])),
+            int(float(bboxes[best_idx][3])),
+        )
    track_id = int(track_ids[best_idx]) if best_idx < len(track_ids) else best_idx

    return mask, bbox, track_id