feat(demo): add realtime visualization pipeline flow

Integrate an opt-in OpenCV visualizer into the demo runtime so operators can monitor tracking, segmentation, and inference confidence in real time without changing the default non-visual execution path.
2026-02-27 20:14:24 +08:00
parent 846549498c
commit 4cc2ef7c63
3 changed files with 670 additions and 11 deletions
@@ -1,7 +1,122 @@
 from __future__ import annotations
-from .pipeline import main
+import argparse
 import inspect
 import logging
 import sys
 from .pipeline import ScoliosisPipeline
 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser(description="Scoliosis Detection Pipeline")
    parser.add_argument(
        "--source", type=str, required=True, help="Video source path or camera ID"
    )
    parser.add_argument(
        "--checkpoint", type=str, required=True, help="Model checkpoint path"
    )
    parser.add_argument(
        "--config",
        type=str,
        default="configs/sconet/sconet_scoliosis1k.yaml",
        help="Config file path",
    )
    parser.add_argument("--device", type=str, default="cuda:0", help="Device to run on")
    parser.add_argument(
        "--yolo-model", type=str, default="ckpt/yolo11n-seg.pt", help="YOLO model name"
    )
    parser.add_argument(
        "--window", type=int, default=30, help="Window size for classification"
    )
    parser.add_argument("--stride", type=int, default=30, help="Stride for window")
    parser.add_argument(
        "--nats-url", type=str, default=None, help="NATS URL for result publishing"
    )
    parser.add_argument(
        "--nats-subject", type=str, default="scoliosis.result", help="NATS subject"
    )
    parser.add_argument(
        "--max-frames", type=int, default=None, help="Maximum frames to process"
    )
    parser.add_argument(
        "--preprocess-only", action="store_true", help="Only preprocess silhouettes"
    )
    parser.add_argument(
        "--silhouette-export-path",
        type=str,
        default=None,
        help="Path to export silhouettes",
    )
    parser.add_argument(
        "--silhouette-export-format", type=str, default="pickle", help="Export format"
    )
    parser.add_argument(
        "--silhouette-visualize-dir",
        type=str,
        default=None,
        help="Directory for silhouette visualizations",
    )
    parser.add_argument(
        "--result-export-path", type=str, default=None, help="Path to export results"
    )
    parser.add_argument(
        "--result-export-format", type=str, default="json", help="Result export format"
    )
    parser.add_argument(
        "--visualize", action="store_true", help="Enable real-time visualization"
    )
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    # Validate preprocess-only mode requires silhouette export path
    if args.preprocess_only and not args.silhouette_export_path:
        print(
            "Error: --silhouette-export-path is required when using --preprocess-only",
            file=sys.stderr,
        )
        raise SystemExit(2)
    try:
        # Import here to avoid circular imports
        from .pipeline import validate_runtime_inputs
        validate_runtime_inputs(
            source=args.source, checkpoint=args.checkpoint, config=args.config
        )
        # Build kwargs based on what ScoliosisPipeline accepts
        sig = inspect.signature(ScoliosisPipeline.__init__)
        pipeline_kwargs = {
            "source": args.source,
            "checkpoint": args.checkpoint,
            "config": args.config,
            "device": args.device,
            "yolo_model": args.yolo_model,
            "window": args.window,
            "stride": args.stride,
            "nats_url": args.nats_url,
            "nats_subject": args.nats_subject,
            "max_frames": args.max_frames,
            "preprocess_only": args.preprocess_only,
            "silhouette_export_path": args.silhouette_export_path,
            "silhouette_export_format": args.silhouette_export_format,
            "silhouette_visualize_dir": args.silhouette_visualize_dir,
            "result_export_path": args.result_export_path,
            "result_export_format": args.result_export_format,
        }
        if "visualize" in sig.parameters:
            pipeline_kwargs["visualize"] = args.visualize
        pipeline = ScoliosisPipeline(**pipeline_kwargs)
        raise SystemExit(pipeline.run())
    except ValueError as err:
        print(f"Error: {err}", file=sys.stderr)
        raise SystemExit(2) from err
    except RuntimeError as err:
        print(f"Runtime error: {err}", file=sys.stderr)
        raise SystemExit(1) from err
@@ -78,6 +78,7 @@ class ScoliosisPipeline:
    _result_export_path: Path | None
    _result_export_format: str
    _result_buffer: list[dict[str, object]]
    _visualizer: object | None
    def __init__(
        self,
@@ -98,6 +99,7 @@ class ScoliosisPipeline:
        silhouette_visualize_dir: str | None = None,
        result_export_path: str | None = None,
        result_export_format: str = "json",
        visualize: bool = False,
    ) -> None:
        self._detector = YOLO(yolo_model)
        self._source = create_source(source, max_frames=max_frames)
@@ -124,6 +126,12 @@ class ScoliosisPipeline:
        )
        self._result_export_format = result_export_format
        self._result_buffer = []
        if visualize:
            from .visualizer import OpenCVVisualizer
            self._visualizer = OpenCVVisualizer()
        else:
            self._visualizer = None
    @staticmethod
    def _extract_int(meta: dict[str, object], key: str, fallback: int) -> int:
@@ -156,7 +164,15 @@ class ScoliosisPipeline:
    def _select_silhouette(
        self,
        result: _DetectionResultsLike,
-    ) -> tuple[Float[ndarray, "64 44"], int] | None:
+    ) -> (
        tuple[
            Float[ndarray, "64 44"],
            UInt8[ndarray, "h w"],
            tuple[int, int, int, int],
            int,
        ]
        | None
    ):
        selected = select_person(result)
        if selected is not None:
            mask_raw, bbox, track_id = selected
@@ -165,7 +181,7 @@ class ScoliosisPipeline:
                mask_to_silhouette(self._to_mask_u8(mask_raw), bbox),
            )
            if silhouette is not None:
-                return silhouette, int(track_id)
+                return silhouette, mask_raw, bbox, int(track_id)
        fallback = cast(
            tuple[UInt8[ndarray, "h w"], tuple[int, int, int, int]] | None,
@@ -181,7 +197,8 @@ class ScoliosisPipeline:
        )
        if silhouette is None:
            return None
-        return silhouette, 0
+        # For fallback case, mask_raw is the same as mask_u8
        return silhouette, mask_u8, bbox, 0
    @jaxtyped(typechecker=beartype)
    def process_frame(
@@ -212,7 +229,7 @@ class ScoliosisPipeline:
        if selected is None:
            return None
-        silhouette, track_id = selected
+        silhouette, mask_raw, bbox, track_id = selected
        # Store silhouette for export if in preprocess-only mode or if export requested
        if self._silhouette_export_path is not None or self._preprocess_only:
@@ -230,12 +247,28 @@ class ScoliosisPipeline:
            self._visualize_silhouette(silhouette, frame_idx, track_id)
        if self._preprocess_only:
-            return None
+            # Return visualization payload for display even in preprocess-only mode
            return {
                "mask_raw": mask_raw,
                "bbox": bbox,
                "silhouette": silhouette,
                "track_id": track_id,
                "label": None,
                "confidence": None,
            }
        self._window.push(silhouette, frame_idx=frame_idx, track_id=track_id)
        if not self._window.should_classify():
-            return None
+            # Return visualization payload even when not classifying yet
            return {
                "mask_raw": mask_raw,
                "bbox": bbox,
                "silhouette": silhouette,
                "track_id": track_id,
                "label": None,
                "confidence": None,
            }
        window_tensor = self._window.get_tensor(device=self._device)
        label, confidence = cast(
@@ -259,25 +292,82 @@ class ScoliosisPipeline:
            self._result_buffer.append(result)
        self._publisher.publish(result)
-        return result
+        # Return result with visualization payload
        return {
            "result": result,
            "mask_raw": mask_raw,
            "bbox": bbox,
            "silhouette": silhouette,
            "track_id": track_id,
            "label": label,
            "confidence": confidence,
        }
    def run(self) -> int:
        frame_count = 0
        start_time = time.perf_counter()
        # EMA FPS state (alpha=0.1 for smoothing)
        ema_fps = 0.0
        alpha = 0.1
        prev_time = start_time
        try:
            for item in self._source:
                frame, metadata = item
                frame_u8 = np.asarray(frame, dtype=np.uint8)
                frame_idx = self._extract_int(metadata, "frame_count", fallback=0)
                frame_count += 1
                # Compute per-frame EMA FPS
                curr_time = time.perf_counter()
                delta = curr_time - prev_time
                prev_time = curr_time
                if delta > 0:
                    instant_fps = 1.0 / delta
                    if ema_fps == 0.0:
                        ema_fps = instant_fps
                    else:
                        ema_fps = alpha * instant_fps + (1 - alpha) * ema_fps
                viz_payload = None
                try:
-                    _ = self.process_frame(frame_u8, metadata)
+                    viz_payload = self.process_frame(frame_u8, metadata)
                except Exception as frame_error:
                    logger.warning(
                        "Skipping frame %d due to processing error: %s",
                        frame_idx,
                        frame_error,
                    )
                # Update visualizer if enabled
                if self._visualizer is not None and viz_payload is not None:
                    # Cast viz_payload to dict for type checking
                    viz_dict = cast(dict[str, object], viz_payload)
                    mask_raw = viz_dict.get("mask_raw")
                    bbox = viz_dict.get("bbox")
                    silhouette = viz_dict.get("silhouette")
                    track_id_val = viz_dict.get("track_id", 0)
                    track_id = track_id_val if isinstance(track_id_val, int) else 0
                    label = viz_dict.get("label")
                    confidence = viz_dict.get("confidence")
                    # Cast _visualizer to object with update method
                    visualizer = cast(object, self._visualizer)
                    update_fn = getattr(visualizer, "update", None)
                    if callable(update_fn):
                        keep_running = update_fn(
                            frame_u8,
                            bbox,
                            track_id,
                            mask_raw,
                            silhouette,
                            label,
                            confidence,
                            ema_fps,
                        )
                        if not keep_running:
                            logger.info("Visualization closed by user.")
                            break
                if frame_count % 100 == 0:
                    elapsed = time.perf_counter() - start_time
                    fps = frame_count / elapsed if elapsed > 0 else 0.0
@@ -293,6 +383,14 @@ class ScoliosisPipeline:
        if self._closed:
            return
        # Close visualizer if enabled
        if self._visualizer is not None:
            visualizer = cast(object, self._visualizer)
            close_viz = getattr(visualizer, "close", None)
            if callable(close_viz):
                with suppress(Exception):
                    _ = close_viz()
        # Export silhouettes if requested
        if self._silhouette_export_path is not None and self._silhouette_buffer:
            self._export_silhouettes()
@@ -504,7 +602,7 @@ def validate_runtime_inputs(source: str, checkpoint: str, config: str) -> None:
    show_default=True,
 )
@click.option("--device", type=str, default="cuda:0", show_default=True)
-@click.option("--yolo-model", type=str, default="yolo11n-seg.pt", show_default=True)
+@click.option("--yolo-model", type=str, default="ckpt/yolo11n-seg.pt", show_default=True)
@click.option("--window", type=click.IntRange(min=1), default=30, show_default=True)
@click.option("--stride", type=click.IntRange(min=1), default=30, show_default=True)
@click.option("--nats-url", type=str, default=None)
@@ -0,0 +1,446 @@
 """OpenCV-based visualizer for demo pipeline.
 Provides real-time visualization of detection, segmentation, and classification results
 with interactive mode switching for mask display.
 """
 from __future__ import annotations
 import logging
 from typing import cast
 import cv2
 import numpy as np
 from numpy.typing import NDArray
 logger = logging.getLogger(__name__)
 # Window names
 MAIN_WINDOW = "Scoliosis Detection"
 SEG_WINDOW = "Segmentation"
 # Silhouette dimensions (from preprocess.py)
 SIL_HEIGHT = 64
 SIL_WIDTH = 44
 # Display dimensions for upscaled silhouette
 DISPLAY_HEIGHT = 256
 DISPLAY_WIDTH = 176
 # Colors (BGR)
 COLOR_GREEN = (0, 255, 0)
 COLOR_WHITE = (255, 255, 255)
 COLOR_BLACK = (0, 0, 0)
 COLOR_RED = (0, 0, 255)
 COLOR_YELLOW = (0, 255, 255)
 # Mode labels
 MODE_LABELS = ["Both", "Raw Mask", "Normalized"]
 # Type alias for image arrays (NDArray or cv2.Mat)
 ImageArray = NDArray[np.uint8]
 class OpenCVVisualizer:
    """Real-time visualizer for gait analysis demo.
    Displays two windows:
    - Main stream: Original frame with bounding box and metadata overlay
    - Segmentation: Raw mask, normalized silhouette, or side-by-side view
    Supports interactive mode switching via keyboard.
    """
    def __init__(self) -> None:
        """Initialize visualizer with default mask mode."""
        self.mask_mode: int = 0  # 0: Both, 1: Raw, 2: Normalized
        self._windows_created: bool = False
    def _ensure_windows(self) -> None:
        """Create OpenCV windows if not already created."""
        if not self._windows_created:
            cv2.namedWindow(MAIN_WINDOW, cv2.WINDOW_NORMAL)
            cv2.namedWindow(SEG_WINDOW, cv2.WINDOW_NORMAL)
            self._windows_created = True
    def _draw_bbox(
        self,
        frame: ImageArray,
        bbox: tuple[int, int, int, int] | None,
    ) -> None:
        """Draw bounding box on frame if present.
        Args:
            frame: Input frame (H, W, 3) uint8 - modified in place
            bbox: Bounding box as (x1, y1, x2, y2) or None
        """
        if bbox is None:
            return
        x1, y1, x2, y2 = bbox
        # Draw rectangle with green color, thickness 2
        _ = cv2.rectangle(frame, (x1, y1), (x2, y2), COLOR_GREEN, 2)
    def _draw_text_overlay(
        self,
        frame: ImageArray,
        track_id: int,
        fps: float,
        label: str | None,
        confidence: float | None,
    ) -> None:
        """Draw text overlay with track info, FPS, label, and confidence.
        Args:
            frame: Input frame (H, W, 3) uint8 - modified in place
            track_id: Tracking ID
            fps: Current FPS
            label: Classification label or None
            confidence: Classification confidence or None
        """
        # Prepare text lines
        lines: list[str] = []
        lines.append(f"ID: {track_id}")
        lines.append(f"FPS: {fps:.1f}")
        if label is not None:
            if confidence is not None:
                lines.append(f"{label}: {confidence:.2%}")
            else:
                lines.append(label)
        # Draw text with background for readability
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.6
        thickness = 1
        line_height = 25
        margin = 10
        for i, text in enumerate(lines):
            y_pos = margin + (i + 1) * line_height
            # Draw background rectangle
            (text_width, text_height), _ = cv2.getTextSize(
                text, font, font_scale, thickness
            )
            _ = cv2.rectangle(
                frame,
                (margin, y_pos - text_height - 5),
                (margin + text_width + 10, y_pos + 5),
                COLOR_BLACK,
                -1,
            )
            # Draw text
            _ = cv2.putText(
                frame,
                text,
                (margin + 5, y_pos),
                font,
                font_scale,
                COLOR_WHITE,
                thickness,
            )
    def _prepare_main_frame(
        self,
        frame: ImageArray,
        bbox: tuple[int, int, int, int] | None,
        track_id: int,
        fps: float,
        label: str | None,
        confidence: float | None,
    ) -> ImageArray:
        """Prepare main display frame with bbox and text overlay.
        Args:
            frame: Input frame (H, W, C) uint8
            bbox: Bounding box or None
            track_id: Tracking ID
            fps: Current FPS
            label: Classification label or None
            confidence: Classification confidence or None
        Returns:
            Processed frame ready for display
        """
        # Ensure BGR format (convert grayscale if needed)
        if len(frame.shape) == 2:
            display_frame = cast(ImageArray, cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR))
        elif frame.shape[2] == 1:
            display_frame = cast(ImageArray, cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR))
        elif frame.shape[2] == 3:
            display_frame = frame.copy()
        elif frame.shape[2] == 4:
            display_frame = cast(ImageArray, cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR))
        else:
            display_frame = frame.copy()
        # Draw bbox and text (modifies in place)
        self._draw_bbox(display_frame, bbox)
        self._draw_text_overlay(display_frame, track_id, fps, label, confidence)
        return display_frame
    def _upscale_silhouette(
        self,
        silhouette: NDArray[np.float32] | NDArray[np.uint8],
    ) -> ImageArray:
        """Upscale silhouette to display size.
        Args:
            silhouette: Input silhouette (64, 44) float32 [0,1] or uint8 [0,255]
        Returns:
            Upscaled silhouette (256, 176) uint8
        """
        # Normalize to uint8 if needed
        if silhouette.dtype == np.float32 or silhouette.dtype == np.float64:
            sil_u8 = (silhouette * 255).astype(np.uint8)
        else:
            sil_u8 = silhouette.astype(np.uint8)
        # Upscale using nearest neighbor to preserve pixelation
        upscaled = cast(
            ImageArray,
            cv2.resize(
                sil_u8,
                (DISPLAY_WIDTH, DISPLAY_HEIGHT),
                interpolation=cv2.INTER_NEAREST,
            ),
        )
        return upscaled
    def _prepare_segmentation_view(
        self,
        mask_raw: ImageArray | None,
        silhouette: NDArray[np.float32] | None,
    ) -> ImageArray:
        """Prepare segmentation window content based on current mode.
        Args:
            mask_raw: Raw binary mask (H, W) uint8 or None
            silhouette: Normalized silhouette (64, 44) float32 or None
        Returns:
            Displayable image (H, W, 3) uint8
        """
        if self.mask_mode == 0:
            # Mode 0: Both (side by side)
            return self._prepare_both_view(mask_raw, silhouette)
        elif self.mask_mode == 1:
            # Mode 1: Raw mask only
            return self._prepare_raw_view(mask_raw)
        else:
            # Mode 2: Normalized silhouette only
            return self._prepare_normalized_view(silhouette)
    def _prepare_raw_view(
        self,
        mask_raw: ImageArray | None,
    ) -> ImageArray:
        """Prepare raw mask view.
        Args:
            mask_raw: Raw binary mask or None
        Returns:
            Displayable image with mode indicator
        """
        if mask_raw is None:
            # Create placeholder
            placeholder = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH, 3), dtype=np.uint8)
            self._draw_mode_indicator(placeholder, "Raw Mask (No Data)")
            return placeholder
        # Ensure single channel
        if len(mask_raw.shape) == 3:
            mask_gray = cast(ImageArray, cv2.cvtColor(mask_raw, cv2.COLOR_BGR2GRAY))
        else:
            mask_gray = mask_raw
        # Resize to display size
        mask_resized = cast(
            ImageArray,
            cv2.resize(
                mask_gray,
                (DISPLAY_WIDTH, DISPLAY_HEIGHT),
                interpolation=cv2.INTER_NEAREST,
            ),
        )
        # Convert to BGR for display
        mask_bgr = cast(ImageArray, cv2.cvtColor(mask_resized, cv2.COLOR_GRAY2BGR))
        self._draw_mode_indicator(mask_bgr, "Raw Mask")
        return mask_bgr
    def _prepare_normalized_view(
        self,
        silhouette: NDArray[np.float32] | None,
    ) -> ImageArray:
        """Prepare normalized silhouette view.
        Args:
            silhouette: Normalized silhouette (64, 44) or None
        Returns:
            Displayable image with mode indicator
        """
        if silhouette is None:
            # Create placeholder
            placeholder = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH, 3), dtype=np.uint8)
            self._draw_mode_indicator(placeholder, "Normalized (No Data)")
            return placeholder
        # Upscale and convert
        upscaled = self._upscale_silhouette(silhouette)
        sil_bgr = cast(ImageArray, cv2.cvtColor(upscaled, cv2.COLOR_GRAY2BGR))
        self._draw_mode_indicator(sil_bgr, "Normalized")
        return sil_bgr
    def _prepare_both_view(
        self,
        mask_raw: ImageArray | None,
        silhouette: NDArray[np.float32] | None,
    ) -> ImageArray:
        """Prepare side-by-side view of both masks.
        Args:
            mask_raw: Raw binary mask or None
            silhouette: Normalized silhouette or None
        Returns:
            Displayable side-by-side image
        """
        # Prepare individual views
        raw_view = self._prepare_raw_view(mask_raw)
        norm_view = self._prepare_normalized_view(silhouette)
        # Convert to grayscale for side-by-side composition
        if len(raw_view.shape) == 3:
            raw_gray = cast(ImageArray, cv2.cvtColor(raw_view, cv2.COLOR_BGR2GRAY))
        else:
            raw_gray = raw_view
        if len(norm_view.shape) == 3:
            norm_gray = cast(ImageArray, cv2.cvtColor(norm_view, cv2.COLOR_BGR2GRAY))
        else:
            norm_gray = norm_view
        # Stack horizontally
        combined = np.hstack([raw_gray, norm_gray])
        # Convert back to BGR
        combined_bgr = cast(ImageArray, cv2.cvtColor(combined, cv2.COLOR_GRAY2BGR))
        # Add mode indicator
        self._draw_mode_indicator(combined_bgr, "Both: Raw | Normalized")
        return combined_bgr
    def _draw_mode_indicator(
        self,
        image: ImageArray,
        label: str,
    ) -> None:
        """Draw mode indicator text on image.
        Args:
            image: Image to draw on (modified in place)
            label: Mode label text
        """
        h, w = image.shape[:2]
        # Mode text at bottom
        mode_text = f"Mode: {MODE_LABELS[self.mask_mode]} ({self.mask_mode}) - {label}"
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.5
        thickness = 1
        # Get text size for background
        (text_width, text_height), _ = cv2.getTextSize(
            mode_text, font, font_scale, thickness
        )
        # Draw background at bottom center
        x_pos = (w - text_width) // 2
        y_pos = h - 10
        _ = cv2.rectangle(
            image,
            (x_pos - 5, y_pos - text_height - 5),
            (x_pos + text_width + 5, y_pos + 5),
            COLOR_BLACK,
            -1,
        )
        # Draw text
        _ = cv2.putText(
            image,
            mode_text,
            (x_pos, y_pos),
            font,
            font_scale,
            COLOR_YELLOW,
            thickness,
        )
    def update(
        self,
        frame: ImageArray,
        bbox: tuple[int, int, int, int] | None,
        track_id: int,
        mask_raw: ImageArray | None,
        silhouette: NDArray[np.float32] | None,
        label: str | None,
        confidence: float | None,
        fps: float,
    ) -> bool:
        """Update visualization with new frame data.
        Args:
            frame: Input frame (H, W, C) uint8
            bbox: Bounding box as (x1, y1, x2, y2) or None
            track_id: Tracking ID
            mask_raw: Raw binary mask (H, W) uint8 or None
            silhouette: Normalized silhouette (64, 44) float32 [0,1] or None
            label: Classification label or None
            confidence: Classification confidence [0,1] or None
            fps: Current FPS
        Returns:
            False if user requested quit (pressed 'q'), True otherwise
        """
        self._ensure_windows()
        # Prepare and show main window
        main_display = self._prepare_main_frame(
            frame, bbox, track_id, fps, label, confidence
        )
        cv2.imshow(MAIN_WINDOW, main_display)
        # Prepare and show segmentation window
        seg_display = self._prepare_segmentation_view(mask_raw, silhouette)
        cv2.imshow(SEG_WINDOW, seg_display)
        # Handle keyboard input
        key = cv2.waitKey(1) & 0xFF
        if key == ord("q"):
            return False
        elif key == ord("m"):
            # Cycle through modes: 0 -> 1 -> 2 -> 0
            self.mask_mode = (self.mask_mode + 1) % 3
            logger.debug("Switched to mask mode: %s", MODE_LABELS[self.mask_mode])
        return True
    def close(self) -> None:
        """Close all OpenCV windows and cleanup."""
        if self._windows_created:
            cv2.destroyAllWindows()
            self._windows_created = False