fix(demo): stabilize visualizer bbox and mask rendering

Align bbox coordinate handling across primary and fallback paths, normalize Both-mode raw mask rendering, and tighten demo result typing to reduce runtime/display inconsistencies.
2026-02-28 18:05:33 +08:00
parent 06a6cd1ccf
commit 7f073179d7
7 changed files with 416 additions and 73 deletions
@@ -12,10 +12,11 @@ import torch
 from jaxtyping import Float
 from numpy import ndarray

+from .preprocess import BBoxXYXY
+
 if TYPE_CHECKING:
    from numpy.typing import NDArray

-
 # Silhouette dimensions from preprocess.py
 SIL_HEIGHT: int = 64
 SIL_WIDTH: int = 44
@@ -239,19 +240,23 @@ def _to_numpy(obj: _ArrayLike) -> ndarray:

 def select_person(
    results: _DetectionResults,
-) -> tuple[ndarray, tuple[int, int, int, int], int] | None:
+) -> tuple[ndarray, BBoxXYXY, BBoxXYXY, int] | None:
    """Select the person with largest bounding box from detection results.

    Args:
        results: Detection results object with boxes and masks attributes.
            Expected to have:
-            - boxes.xyxy: array of bounding boxes [N, 4]
-            - masks.data: array of masks [N, H, W]
+            - boxes.xyxy: array of bounding boxes [N, 4] in frame coordinates (XYXY format)
+            - masks.data: array of masks [N, H, W] in mask coordinates
            - boxes.id: optional track IDs [N]

    Returns:
-        Tuple of (mask, bbox, track_id) for the largest person,
+        Tuple of (mask, bbox_mask, bbox_frame, track_id) for the largest person,
        or None if no valid detections or track IDs unavailable.
+        - mask: the person's segmentation mask
+        - bbox_mask: bounding box in mask coordinate space (XYXY format: x1, y1, x2, y2)
+        - bbox_frame: bounding box in frame coordinate space (XYXY format: x1, y1, x2, y2)
+        - track_id: the person's track ID
    """
    # Check for track IDs
    boxes_obj: _Boxes | object = getattr(results, "boxes", None)
@@ -329,20 +334,27 @@ def select_person(
        # Scale bbox from frame space to mask space
        scale_x = mask_w / frame_w if frame_w > 0 else 1.0
        scale_y = mask_h / frame_h if frame_h > 0 else 1.0
-        bbox = (
+        bbox_mask = (
            int(float(bboxes[best_idx][0]) * scale_x),
            int(float(bboxes[best_idx][1]) * scale_y),
            int(float(bboxes[best_idx][2]) * scale_x),
            int(float(bboxes[best_idx][3]) * scale_y),
        )
-    else:
-        # Fallback: use bbox as-is (assume same coordinate space)
-        bbox = (
+        bbox_frame = (
            int(float(bboxes[best_idx][0])),
            int(float(bboxes[best_idx][1])),
            int(float(bboxes[best_idx][2])),
            int(float(bboxes[best_idx][3])),
        )
+    else:
+        # Fallback: use the unscaled frame-space bbox for both bbox_mask and bbox_frame (assumes mask and frame share one coordinate space)
+        bbox_mask = (
+            int(float(bboxes[best_idx][0])),
+            int(float(bboxes[best_idx][1])),
+            int(float(bboxes[best_idx][2])),
+            int(float(bboxes[best_idx][3])),
+        )
+        bbox_frame = bbox_mask
    track_id = int(track_ids[best_idx]) if best_idx < len(track_ids) else best_idx

-    return mask, bbox, track_id
+    return mask, bbox_mask, bbox_frame, track_id