"""OpenCV-based visualizer for demo pipeline. Provides real-time visualization of detection, segmentation, and classification results with interactive mode switching for mask display. """ from __future__ import annotations import logging from typing import cast import cv2 import numpy as np from numpy.typing import NDArray from .preprocess import BBoxXYXY logger = logging.getLogger(__name__) # Window names MAIN_WINDOW = "Scoliosis Detection" SEG_WINDOW = "Normalized Silhouette" RAW_WINDOW = "Raw Mask" WINDOW_SEG_INPUT = "Segmentation Input" # Silhouette dimensions (from preprocess.py) SIL_HEIGHT = 64 SIL_WIDTH = 44 # Display dimensions for upscaled silhouette DISPLAY_HEIGHT = 256 DISPLAY_WIDTH = 176 RAW_STATS_PAD = 54 MODE_LABEL_PAD = 26 # Colors (BGR) COLOR_GREEN = (0, 255, 0) COLOR_WHITE = (255, 255, 255) COLOR_BLACK = (0, 0, 0) COLOR_DARK_GRAY = (56, 56, 56) COLOR_RED = (0, 0, 255) COLOR_YELLOW = (0, 255, 255) # Type alias for image arrays (NDArray or cv2.Mat) COLOR_CYAN = (255, 255, 0) COLOR_ORANGE = (0, 165, 255) COLOR_MAGENTA = (255, 0, 255) ImageArray = NDArray[np.uint8] # COCO-format skeleton connections (17 keypoints) # Connections are pairs of keypoint indices SKELETON_CONNECTIONS: list[tuple[int, int]] = [ (0, 1), # nose -> left_eye (0, 2), # nose -> right_eye (1, 3), # left_eye -> left_ear (2, 4), # right_eye -> right_ear (5, 6), # left_shoulder -> right_shoulder (5, 7), # left_shoulder -> left_elbow (7, 9), # left_elbow -> left_wrist (6, 8), # right_shoulder -> right_elbow (8, 10), # right_elbow -> right_wrist (11, 12), # left_hip -> right_hip (5, 11), # left_shoulder -> left_hip (6, 12), # right_shoulder -> right_hip (11, 13), # left_hip -> left_knee (13, 15), # left_knee -> left_ankle (12, 14), # right_hip -> right_knee (14, 16), # right_knee -> right_ankle ] class OpenCVVisualizer: def __init__(self) -> None: self.show_raw_window: bool = False self.show_raw_debug: bool = False self._windows_created: bool = False self._raw_window_created: bool = False def _ensure_windows(self) -> None: if not self._windows_created: cv2.namedWindow(MAIN_WINDOW, cv2.WINDOW_NORMAL) cv2.namedWindow(SEG_WINDOW, cv2.WINDOW_NORMAL) cv2.namedWindow(WINDOW_SEG_INPUT, cv2.WINDOW_NORMAL) self._windows_created = True def _ensure_raw_window(self) -> None: if not self._raw_window_created: cv2.namedWindow(RAW_WINDOW, cv2.WINDOW_NORMAL) self._raw_window_created = True def _hide_raw_window(self) -> None: if self._raw_window_created: cv2.destroyWindow(RAW_WINDOW) self._raw_window_created = False def _draw_bbox( self, frame: ImageArray, bbox: BBoxXYXY | None, ) -> None: """Draw bounding box on frame if present. Args: frame: Input frame (H, W, 3) uint8 - modified in place bbox: Bounding box in XYXY format as (x1, y1, x2, y2) or None """ if bbox is None: return x1, y1, x2, y2 = bbox # Draw rectangle with green color, thickness 2 _ = cv2.rectangle(frame, (x1, y1), (x2, y2), COLOR_GREEN, 2) def _draw_text_overlay( self, frame: ImageArray, track_id: int, fps: float, label: str | None, confidence: float | None, ) -> None: """Draw text overlay with track info, FPS, label, and confidence. Args: frame: Input frame (H, W, 3) uint8 - modified in place track_id: Tracking ID fps: Current FPS label: Classification label or None confidence: Classification confidence or None """ # Prepare text lines lines: list[str] = [] lines.append(f"ID: {track_id}") lines.append(f"FPS: {fps:.1f}") if label is not None: if confidence is not None: lines.append(f"{label}: {confidence:.2%}") else: lines.append(label) # Draw text with background for readability font = cv2.FONT_HERSHEY_SIMPLEX font_scale = 0.6 thickness = 1 line_height = 25 margin = 10 for i, text in enumerate(lines): y_pos = margin + (i + 1) * line_height # Draw background rectangle (text_width, text_height), _ = cv2.getTextSize( text, font, font_scale, thickness ) _ = cv2.rectangle( frame, (margin, y_pos - text_height - 5), (margin + text_width + 10, y_pos + 5), COLOR_BLACK, -1, ) # Draw text _ = cv2.putText( frame, text, (margin + 5, y_pos), font, font_scale, COLOR_WHITE, thickness, ) def _draw_pose_skeleton( self, frame: ImageArray, pose_data: dict[str, object] | None, ) -> None: """Draw pose skeleton on frame. Args: frame: Input frame (H, W, 3) uint8 - modified in place pose_data: Pose data dictionary from Sports2D or similar Expected format: {'keypoints': [[x1, y1], [x2, y2], ...], 'confidence': [c1, c2, ...], 'angles': {'joint_name': angle, ...}} """ if pose_data is None: return keypoints_obj = pose_data.get("keypoints") if keypoints_obj is None: return # Convert keypoints to numpy array keypoints = np.asarray(keypoints_obj, dtype=np.float32) if keypoints.size == 0: return h, w = frame.shape[:2] # Get confidence scores if available confidence_obj = pose_data.get("confidence") confidences = ( np.asarray(confidence_obj, dtype=np.float32) if confidence_obj is not None else np.ones(len(keypoints), dtype=np.float32) ) # Draw skeleton connections for connection in SKELETON_CONNECTIONS: idx1, idx2 = connection if idx1 < len(keypoints) and idx2 < len(keypoints): # Check confidence threshold (0.3) if confidences[idx1] > 0.3 and confidences[idx2] > 0.3: pt1 = (int(keypoints[idx1][0]), int(keypoints[idx1][1])) pt2 = (int(keypoints[idx2][0]), int(keypoints[idx2][1])) # Clip to frame bounds pt1 = (max(0, min(w - 1, pt1[0])), max(0, min(h - 1, pt1[1]))) pt2 = (max(0, min(w - 1, pt2[0])), max(0, min(h - 1, pt2[1]))) _ = cv2.line(frame, pt1, pt2, COLOR_CYAN, 2) # Draw keypoints for i, (kp, conf) in enumerate(zip(keypoints, confidences)): if conf > 0.3 and i < len(keypoints): x, y = int(kp[0]), int(kp[1]) # Clip to frame bounds x = max(0, min(w - 1, x)) y = max(0, min(h - 1, y)) # Draw keypoint as circle _ = cv2.circle(frame, (x, y), 4, COLOR_MAGENTA, -1) _ = cv2.circle(frame, (x, y), 4, COLOR_WHITE, 1) def _draw_pose_angles( self, frame: ImageArray, pose_data: dict[str, object] | None, ) -> None: """Draw pose angles as text overlay. Args: frame: Input frame (H, W, 3) uint8 - modified in place pose_data: Pose data dictionary with 'angles' key """ if pose_data is None: return angles_obj = pose_data.get("angles") if angles_obj is None: return angles = cast(dict[str, float], angles_obj) if not angles: return # Draw angles in top-right corner font = cv2.FONT_HERSHEY_SIMPLEX font_scale = 0.45 thickness = 1 line_height = 20 margin = 10 h, w = frame.shape[:2] # Filter and format angles angle_texts: list[tuple[str, float]] = [] for name, angle in angles.items(): # Only show angles that are reasonable (0-180 degrees) if 0 <= angle <= 180: angle_texts.append((str(name), float(angle))) # Sort by name for consistent display angle_texts.sort(key=lambda x: x[0]) # Draw from top-right for i, (name, angle) in enumerate(angle_texts[:8]): # Limit to 8 angles text = f"{name}: {angle:.1f}" (text_width, text_height), _ = cv2.getTextSize( text, font, font_scale, thickness ) x_pos = w - margin - text_width - 10 y_pos = margin + (i + 1) * line_height # Draw background rectangle _ = cv2.rectangle( frame, (x_pos - 4, y_pos - text_height - 4), (x_pos + text_width + 4, y_pos + 4), COLOR_BLACK, -1, ) # Draw text in orange _ = cv2.putText( frame, text, (x_pos, y_pos), font, font_scale, COLOR_ORANGE, thickness, ) def _prepare_main_frame( self, frame: ImageArray, bbox: BBoxXYXY | None, track_id: int, fps: float, label: str | None, confidence: float | None, pose_data: dict[str, object] | None = None, ) -> ImageArray: """Prepare main display frame with bbox and text overlay. Args: frame: Input frame (H, W, C) uint8 bbox: Bounding box in XYXY format (x1, y1, x2, y2) or None track_id: Tracking ID fps: Current FPS label: Classification label or None confidence: Classification confidence or None pose_data: Pose data dictionary or None Returns: Processed frame ready for display """ # Ensure BGR format (convert grayscale if needed) if len(frame.shape) == 2: display_frame = cast(ImageArray, cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)) elif frame.shape[2] == 1: display_frame = cast(ImageArray, cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)) elif frame.shape[2] == 3: display_frame = frame.copy() elif frame.shape[2] == 4: display_frame = cast(ImageArray, cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR)) else: display_frame = frame.copy() # Draw bbox and text (modifies in place) self._draw_bbox(display_frame, bbox) self._draw_text_overlay(display_frame, track_id, fps, label, confidence) # Draw pose skeleton and angles if available self._draw_pose_skeleton(display_frame, pose_data) self._draw_pose_angles(display_frame, pose_data) return display_frame def _upscale_silhouette( self, silhouette: NDArray[np.float32] | NDArray[np.uint8], ) -> ImageArray: """Upscale silhouette to display size. Args: silhouette: Input silhouette (64, 44) float32 [0,1] or uint8 [0,255] Returns: Upscaled silhouette (256, 176) uint8 """ # Normalize to uint8 if needed if silhouette.dtype == np.float32 or silhouette.dtype == np.float64: sil_u8 = (silhouette * 255).astype(np.uint8) else: sil_u8 = silhouette.astype(np.uint8) # Upscale using nearest neighbor to preserve pixelation upscaled = cast( ImageArray, cv2.resize( sil_u8, (DISPLAY_WIDTH, DISPLAY_HEIGHT), interpolation=cv2.INTER_NEAREST, ), ) return upscaled def _normalize_mask_for_display(self, mask: NDArray[np.generic]) -> ImageArray: mask_array = np.asarray(mask) if mask_array.dtype == np.bool_: bool_scaled = np.where(mask_array, np.uint8(255), np.uint8(0)).astype( np.uint8 ) return cast(ImageArray, bool_scaled) if mask_array.dtype == np.uint8: mask_array = cast(ImageArray, mask_array) max_u8 = int(np.max(mask_array)) if mask_array.size > 0 else 0 if max_u8 <= 1: scaled_u8 = np.where(mask_array > 0, np.uint8(255), np.uint8(0)).astype( np.uint8 ) return cast(ImageArray, scaled_u8) return cast(ImageArray, mask_array) if np.issubdtype(mask_array.dtype, np.integer): max_int = float(np.max(mask_array)) if mask_array.size > 0 else 0.0 if max_int <= 1.0: return cast( ImageArray, (mask_array.astype(np.float32) * 255.0).astype(np.uint8) ) clipped = np.clip(mask_array, 0, 255).astype(np.uint8) return cast(ImageArray, clipped) mask_float = np.asarray(mask_array, dtype=np.float32) max_val = float(np.max(mask_float)) if mask_float.size > 0 else 0.0 if max_val <= 0.0: return np.zeros(mask_float.shape, dtype=np.uint8) normalized = np.clip((mask_float / max_val) * 255.0, 0.0, 255.0).astype( np.uint8 ) return cast(ImageArray, normalized) def _draw_raw_stats(self, image: ImageArray, mask_raw: ImageArray | None) -> None: if mask_raw is None: return mask = np.asarray(mask_raw) if mask.size == 0: return stats = [ f"raw: {mask.dtype}", f"min/max: {float(mask.min()):.3f}/{float(mask.max()):.3f}", f"nnz: {int(np.count_nonzero(mask))}", ] font = cv2.FONT_HERSHEY_SIMPLEX font_scale = 0.45 thickness = 1 line_h = 18 x0 = 8 y0 = 20 for i, txt in enumerate(stats): y = y0 + i * line_h (tw, th), _ = cv2.getTextSize(txt, font, font_scale, thickness) _ = cv2.rectangle( image, (x0 - 4, y - th - 4), (x0 + tw + 4, y + 4), COLOR_BLACK, -1 ) _ = cv2.putText( image, txt, (x0, y), font, font_scale, COLOR_YELLOW, thickness ) def _prepare_segmentation_view( self, _mask_raw: ImageArray | None, silhouette: NDArray[np.float32] | None, _bbox: BBoxXYXY | None, ) -> ImageArray: return self._prepare_normalized_view(silhouette) def _fit_gray_to_display( self, gray: ImageArray, out_h: int = DISPLAY_HEIGHT, out_w: int = DISPLAY_WIDTH, ) -> ImageArray: src_h, src_w = gray.shape[:2] if src_h <= 0 or src_w <= 0: return np.zeros((out_h, out_w), dtype=np.uint8) scale = min(out_w / src_w, out_h / src_h) new_w = max(1, int(round(src_w * scale))) new_h = max(1, int(round(src_h * scale))) resized = cast( ImageArray, cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_NEAREST), ) canvas = np.zeros((out_h, out_w), dtype=np.uint8) x0 = (out_w - new_w) // 2 y0 = (out_h - new_h) // 2 canvas[y0 : y0 + new_h, x0 : x0 + new_w] = resized return cast(ImageArray, canvas) def _crop_mask_to_bbox( self, mask_gray: ImageArray, bbox: BBoxXYXY | None, ) -> ImageArray: if bbox is None: return mask_gray h, w = mask_gray.shape[:2] x1, y1, x2, y2 = bbox x1c = max(0, min(w, int(x1))) x2c = max(0, min(w, int(x2))) y1c = max(0, min(h, int(y1))) y2c = max(0, min(h, int(y2))) if x2c <= x1c or y2c <= y1c: return mask_gray cropped = mask_gray[y1c:y2c, x1c:x2c] if cropped.size == 0: return mask_gray return cast(ImageArray, cropped) def _prepare_segmentation_input_view( self, silhouettes: NDArray[np.float32] | None, ) -> ImageArray: if silhouettes is None or silhouettes.size == 0: placeholder = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH, 3), dtype=np.uint8) self._draw_mode_indicator(placeholder, "Input Silhouettes (No Data)") return placeholder n_frames = int(silhouettes.shape[0]) tiles_per_row = int(np.ceil(np.sqrt(n_frames))) rows = int(np.ceil(n_frames / tiles_per_row)) tile_h = DISPLAY_HEIGHT tile_w = DISPLAY_WIDTH grid = np.zeros((rows * tile_h, tiles_per_row * tile_w), dtype=np.uint8) for idx in range(n_frames): sil = silhouettes[idx] tile = self._upscale_silhouette(sil) r = idx // tiles_per_row c = idx % tiles_per_row y0, y1 = r * tile_h, (r + 1) * tile_h x0, x1 = c * tile_w, (c + 1) * tile_w grid[y0:y1, x0:x1] = tile grid_bgr = cast(ImageArray, cv2.cvtColor(grid, cv2.COLOR_GRAY2BGR)) for idx in range(n_frames): r = idx // tiles_per_row c = idx % tiles_per_row y0 = r * tile_h x0 = c * tile_w cv2.putText( grid_bgr, str(idx), (x0 + 8, y0 + 22), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 255), 2, cv2.LINE_AA, ) return grid_bgr def _prepare_raw_view( self, mask_raw: ImageArray | None, bbox: BBoxXYXY | None = None, ) -> ImageArray: """Prepare raw mask view. Args: mask_raw: Raw binary mask or None Returns: Displayable image with mode indicator """ if mask_raw is None: # Create placeholder placeholder = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH, 3), dtype=np.uint8) self._draw_mode_indicator(placeholder, "Raw Mask (No Data)") return placeholder # Ensure single channel if len(mask_raw.shape) == 3: mask_gray = cast(ImageArray, cv2.cvtColor(mask_raw, cv2.COLOR_BGR2GRAY)) else: mask_gray = cast(ImageArray, mask_raw) mask_gray = self._normalize_mask_for_display(mask_gray) mask_gray = self._crop_mask_to_bbox(mask_gray, bbox) debug_pad = RAW_STATS_PAD if self.show_raw_debug else 0 content_h = max(1, DISPLAY_HEIGHT - debug_pad - MODE_LABEL_PAD) mask_resized = self._fit_gray_to_display( mask_gray, out_h=content_h, out_w=DISPLAY_WIDTH ) full_mask = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH), dtype=np.uint8) full_mask[debug_pad : debug_pad + content_h, :] = mask_resized # Convert to BGR for display mask_bgr = cast(ImageArray, cv2.cvtColor(full_mask, cv2.COLOR_GRAY2BGR)) if self.show_raw_debug: self._draw_raw_stats(mask_bgr, mask_raw) self._draw_mode_indicator(mask_bgr, "Raw Mask") return mask_bgr def _prepare_normalized_view( self, silhouette: NDArray[np.float32] | None, ) -> ImageArray: """Prepare normalized silhouette view. Args: silhouette: Normalized silhouette (64, 44) or None Returns: Displayable image with mode indicator """ if silhouette is None: # Create placeholder placeholder = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH, 3), dtype=np.uint8) self._draw_mode_indicator(placeholder, "Normalized (No Data)") return placeholder # Upscale and convert upscaled = self._upscale_silhouette(silhouette) content_h = max(1, DISPLAY_HEIGHT - MODE_LABEL_PAD) sil_compact = self._fit_gray_to_display( upscaled, out_h=content_h, out_w=DISPLAY_WIDTH ) sil_canvas = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH), dtype=np.uint8) sil_canvas[:content_h, :] = sil_compact sil_bgr = cast(ImageArray, cv2.cvtColor(sil_canvas, cv2.COLOR_GRAY2BGR)) self._draw_mode_indicator(sil_bgr, "Normalized") return sil_bgr def _draw_mode_indicator(self, image: ImageArray, label: str) -> None: h, w = image.shape[:2] mode_text = label font = cv2.FONT_HERSHEY_SIMPLEX font_scale = 0.5 thickness = 1 # Get text size for background (text_width, text_height), _ = cv2.getTextSize( mode_text, font, font_scale, thickness ) x_pos = 14 y_pos = h - 8 y_top = max(0, h - MODE_LABEL_PAD) _ = cv2.rectangle(image, (0, y_top), (w, h), COLOR_DARK_GRAY, -1) # Draw text _ = cv2.putText( image, mode_text, (x_pos, y_pos), font, font_scale, COLOR_YELLOW, thickness, ) def update( self, frame: ImageArray, bbox: BBoxXYXY | None, bbox_mask: BBoxXYXY | None, track_id: int, mask_raw: ImageArray | None, silhouette: NDArray[np.float32] | None, segmentation_input: NDArray[np.float32] | None, label: str | None, confidence: float | None, fps: float, pose_data: dict[str, object] | None = None, ) -> bool: """Update visualization with new frame data. Args: frame: Input frame (H, W, C) uint8 bbox: Bounding box in XYXY format (x1, y1, x2, y2) or None bbox_mask: Bounding box in mask coordinates (x1, y1, x2, y2) or None track_id: Tracking ID mask_raw: Raw binary mask (H, W) uint8 or None silhouette: Normalized silhouette (64, 44) float32 [0,1] or None segmentation_input: Windowed silhouette stack for model input visualization label: Classification label or None confidence: Classification confidence [0,1] or None fps: Current FPS pose_data: Pose data dictionary or None Returns: False if user requested quit (pressed 'q'), True otherwise """ self._ensure_windows() # Prepare and show main window main_display = self._prepare_main_frame( frame, bbox, track_id, fps, label, confidence, pose_data ) cv2.imshow(MAIN_WINDOW, main_display) # Prepare and show segmentation window seg_display = self._prepare_segmentation_view(mask_raw, silhouette, bbox) cv2.imshow(SEG_WINDOW, seg_display) if self.show_raw_window: self._ensure_raw_window() raw_display = self._prepare_raw_view(mask_raw, bbox_mask) cv2.imshow(RAW_WINDOW, raw_display) seg_input_display = self._prepare_segmentation_input_view(segmentation_input) cv2.imshow(WINDOW_SEG_INPUT, seg_input_display) # Handle keyboard input key = cv2.waitKey(1) & 0xFF if key == ord("q"): return False elif key == ord("r"): self.show_raw_window = not self.show_raw_window if self.show_raw_window: self._ensure_raw_window() logger.debug("Raw mask window enabled") else: self._hide_raw_window() logger.debug("Raw mask window disabled") elif key == ord("d"): self.show_raw_debug = not self.show_raw_debug logger.debug( "Raw mask debug overlay %s", "enabled" if self.show_raw_debug else "disabled", ) return True def close(self) -> None: if self._windows_created: self._hide_raw_window() cv2.destroyAllWindows() self._windows_created = False self._raw_window_created = False