fix(demo): stabilize visualizer bbox and mask rendering

Align bbox coordinate handling across the primary and fallback paths, normalize raw-mask rendering in Both mode, and tighten the typing of demo results to reduce runtime and display inconsistencies.
This commit is contained in:
2026-02-28 18:05:33 +08:00
parent 06a6cd1ccf
commit 7f073179d7
7 changed files with 416 additions and 73 deletions
+22 -10
View File
@@ -12,10 +12,11 @@ import torch
from jaxtyping import Float
from numpy import ndarray
from .preprocess import BBoxXYXY
if TYPE_CHECKING:
from numpy.typing import NDArray
# Silhouette dimensions from preprocess.py
SIL_HEIGHT: int = 64
SIL_WIDTH: int = 44
@@ -239,19 +240,23 @@ def _to_numpy(obj: _ArrayLike) -> ndarray:
def select_person(
results: _DetectionResults,
) -> tuple[ndarray, tuple[int, int, int, int], int] | None:
) -> tuple[ndarray, BBoxXYXY, BBoxXYXY, int] | None:
"""Select the person with largest bounding box from detection results.
Args:
results: Detection results object with boxes and masks attributes.
Expected to have:
- boxes.xyxy: array of bounding boxes [N, 4]
- masks.data: array of masks [N, H, W]
- boxes.xyxy: array of bounding boxes [N, 4] in frame coordinates (XYXY format)
- masks.data: array of masks [N, H, W] in mask coordinates
- boxes.id: optional track IDs [N]
Returns:
Tuple of (mask, bbox, track_id) for the largest person,
Tuple of (mask, bbox_mask, bbox_frame, track_id) for the largest person,
or None if no valid detections or track IDs unavailable.
- mask: the person's segmentation mask
- bbox_mask: bounding box in mask coordinate space (XYXY format: x1, y1, x2, y2)
- bbox_frame: bounding box in frame coordinate space (XYXY format: x1, y1, x2, y2)
- track_id: the person's track ID
"""
# Check for track IDs
boxes_obj: _Boxes | object = getattr(results, "boxes", None)
@@ -329,20 +334,27 @@ def select_person(
# Scale bbox from frame space to mask space
scale_x = mask_w / frame_w if frame_w > 0 else 1.0
scale_y = mask_h / frame_h if frame_h > 0 else 1.0
bbox = (
bbox_mask = (
int(float(bboxes[best_idx][0]) * scale_x),
int(float(bboxes[best_idx][1]) * scale_y),
int(float(bboxes[best_idx][2]) * scale_x),
int(float(bboxes[best_idx][3]) * scale_y),
)
else:
# Fallback: use bbox as-is (assume same coordinate space)
bbox = (
bbox_frame = (
int(float(bboxes[best_idx][0])),
int(float(bboxes[best_idx][1])),
int(float(bboxes[best_idx][2])),
int(float(bboxes[best_idx][3])),
)
else:
# Fallback: use bbox as-is for both (assume same coordinate space)
bbox_mask = (
int(float(bboxes[best_idx][0])),
int(float(bboxes[best_idx][1])),
int(float(bboxes[best_idx][2])),
int(float(bboxes[best_idx][3])),
)
bbox_frame = bbox_mask
track_id = int(track_ids[best_idx]) if best_idx < len(track_ids) else best_idx
return mask, bbox, track_id
return mask, bbox_mask, bbox_frame, track_id