fix(demo): stabilize visualizer bbox and mask rendering

Align bbox coordinate handling across primary and fallback paths, normalize Both-mode raw mask rendering, and tighten demo result typing to reduce runtime/display inconsistencies.
2026-02-28 18:05:33 +08:00
parent 06a6cd1ccf
commit 7f073179d7
7 changed files with 416 additions and 73 deletions
@@ -17,8 +17,8 @@ from numpy.typing import NDArray
 from ultralytics.models.yolo.model import YOLO

 from .input import FrameStream, create_source
-from .output import ResultPublisher, create_publisher, create_result
-from .preprocess import frame_to_person_mask, mask_to_silhouette
+from .output import DemoResult, ResultPublisher, create_publisher, create_result
+from .preprocess import BBoxXYXY, frame_to_person_mask, mask_to_silhouette
 from .sconet_demo import ScoNetDemo
 from .window import SilhouetteWindow, select_person

@@ -53,6 +53,7 @@ class _DetectionResultsLike(Protocol):
    def masks(self) -> _MasksLike: ...


+
 class _TrackCallable(Protocol):
    def __call__(
        self,
@@ -80,8 +81,9 @@ class ScoliosisPipeline:
    _silhouette_visualize_dir: Path | None
    _result_export_path: Path | None
    _result_export_format: str
-    _result_buffer: list[dict[str, object]]
+    _result_buffer: list[DemoResult]
    _visualizer: OpenCVVisualizer | None
+    _last_viz_payload: dict[str, object] | None

    def __init__(
        self,
@@ -135,6 +137,7 @@ class ScoliosisPipeline:
            self._visualizer = OpenCVVisualizer()
        else:
            self._visualizer = None
+        self._last_viz_payload = None

    @staticmethod
    def _extract_int(meta: dict[str, object], key: str, fallback: int) -> int:
@@ -171,37 +174,59 @@ class ScoliosisPipeline:
        tuple[
            Float[ndarray, "64 44"],
            UInt8[ndarray, "h w"],
-            tuple[int, int, int, int],
+            BBoxXYXY,
            int,
        ]
        | None
    ):
        selected = select_person(result)
        if selected is not None:
-            mask_raw, bbox, track_id = selected
+            mask_raw, bbox_mask, bbox_frame, track_id = selected
            silhouette = cast(
                Float[ndarray, "64 44"] | None,
-                mask_to_silhouette(self._to_mask_u8(mask_raw), bbox),
+                mask_to_silhouette(self._to_mask_u8(mask_raw), bbox_mask),
            )
            if silhouette is not None:
-                return silhouette, mask_raw, bbox, int(track_id)
+                return silhouette, mask_raw, bbox_frame, int(track_id)

        fallback = cast(
-            tuple[UInt8[ndarray, "h w"], tuple[int, int, int, int]] | None,
+            tuple[UInt8[ndarray, "h w"], BBoxXYXY] | None,
            frame_to_person_mask(result),
        )
        if fallback is None:
            return None

-        mask_u8, bbox = fallback
+        mask_u8, bbox_mask = fallback
        silhouette = cast(
            Float[ndarray, "64 44"] | None,
-            mask_to_silhouette(mask_u8, bbox),
+            mask_to_silhouette(mask_u8, bbox_mask),
        )
        if silhouette is None:
            return None
+        # Convert mask-space bbox to frame-space for visualization
+        # Use result.orig_shape to get frame dimensions safely
+        orig_shape = getattr(result, "orig_shape", None)
+        if orig_shape is not None and isinstance(orig_shape, (tuple, list)) and len(orig_shape) >= 2:
+            frame_h, frame_w = int(orig_shape[0]), int(orig_shape[1])
+            mask_h, mask_w = mask_u8.shape[0], mask_u8.shape[1]
+            if mask_w > 0 and mask_h > 0 and frame_w > 0 and frame_h > 0:
+                scale_x = frame_w / mask_w
+                scale_y = frame_h / mask_h
+                bbox_frame = (
+                    int(bbox_mask[0] * scale_x),
+                    int(bbox_mask[1] * scale_y),
+                    int(bbox_mask[2] * scale_x),
+                    int(bbox_mask[3] * scale_y),
+                )
+            else:
+                # Fallback: use mask-space bbox if dimensions invalid
+                bbox_frame = bbox_mask
+        else:
+            # Fallback: use mask-space bbox if orig_shape unavailable
+            bbox_frame = bbox_mask
        # For fallback case, mask_raw is the same as mask_u8
-        return silhouette, mask_u8, bbox, 0
+        return silhouette, mask_u8, bbox_frame, 0
+

    @jaxtyped(typechecker=beartype)
    def process_frame(
@@ -342,23 +367,48 @@ class ScoliosisPipeline:
                    )

                # Update visualizer if enabled
-                if self._visualizer is not None and viz_payload is not None:
-                    # Cast viz_payload to dict for type checking
-                    viz_dict = cast(dict[str, object], viz_payload)
-                    mask_raw_obj = viz_dict.get("mask_raw")
-                    bbox_obj = viz_dict.get("bbox")
-                    silhouette_obj = viz_dict.get("silhouette")
-                    track_id_val = viz_dict.get("track_id", 0)
-                    track_id = track_id_val if isinstance(track_id_val, int) else 0
-                    label_obj = viz_dict.get("label")
-                    confidence_obj = viz_dict.get("confidence")
+                if self._visualizer is not None:
+                    # Cache valid payload for no-detection frames
+                    if viz_payload is not None:
+                        # Cache a copy to prevent mutation of original data
+                        viz_payload_dict = cast(dict[str, object], viz_payload)
+                        cached: dict[str, object] = {}
+                        for k, v in viz_payload_dict.items():
+                            copy_method = cast(Callable[[], object] | None, getattr(v, "copy", None))
+                            if copy_method is not None:
+                                cached[k] = copy_method()
+                            else:
+                                cached[k] = v
+                        self._last_viz_payload = cached
+                    
+                    # Use cached payload if current is None
+                    viz_data = viz_payload if viz_payload is not None else self._last_viz_payload
+                    
+                    if viz_data is not None:
+                        # Cast the selected payload (current or cached) to dict for type checking
+                        viz_dict = cast(dict[str, object], viz_data)
+                        mask_raw_obj = viz_dict.get("mask_raw")
+                        bbox_obj = viz_dict.get("bbox")
+                        silhouette_obj = viz_dict.get("silhouette")
+                        track_id_val = viz_dict.get("track_id", 0)
+                        track_id = track_id_val if isinstance(track_id_val, int) else 0
+                        label_obj = viz_dict.get("label")
+                        confidence_obj = viz_dict.get("confidence")

-                    # Cast extracted values to expected types
-                    mask_raw = cast(NDArray[np.uint8] | None, mask_raw_obj)
-                    bbox = cast(tuple[int, int, int, int] | None, bbox_obj)
-                    silhouette = cast(NDArray[np.float32] | None, silhouette_obj)
-                    label = cast(str | None, label_obj)
-                    confidence = cast(float | None, confidence_obj)
+                        # Cast extracted values to expected types
+                        mask_raw = cast(NDArray[np.uint8] | None, mask_raw_obj)
+                        bbox = cast(BBoxXYXY | None, bbox_obj)
+                        silhouette = cast(NDArray[np.float32] | None, silhouette_obj)
+                        label = cast(str | None, label_obj)
+                        confidence = cast(float | None, confidence_obj)
+                    else:
+                        # No detection and no cache - use default values
+                        mask_raw = None
+                        bbox = None
+                        track_id = 0
+                        silhouette = None
+                        label = None
+                        confidence = None

                    keep_running = self._visualizer.update(
                        frame_u8,