# Source: OpenGait/opengait-studio/opengait_studio/visualizer.py
# Commit: crosstyan 00fcda4fe3 "feat: extract opengait_studio monorepo module"
#   Move demo implementation into opengait_studio, retire Sports2D runtime
#   integration, and align packaging with root-level monorepo dependency
#   management. (2026-03-07 18:14:13 +08:00)
"""OpenCV-based visualizer for demo pipeline.
Provides real-time visualization of detection, segmentation, and classification results
with interactive mode switching for mask display.
"""
from __future__ import annotations
import logging
from typing import cast
import cv2
import numpy as np
from numpy.typing import NDArray
from .preprocess import BBoxXYXY
logger = logging.getLogger(__name__)
# Window names
MAIN_WINDOW = "Scoliosis Detection"
SEG_WINDOW = "Normalized Silhouette"
RAW_WINDOW = "Raw Mask"
WINDOW_SEG_INPUT = "Segmentation Input"
# Silhouette dimensions (from preprocess.py)
SIL_HEIGHT = 64
SIL_WIDTH = 44
# Display dimensions for upscaled silhouette
DISPLAY_HEIGHT = 256
DISPLAY_WIDTH = 176
# Vertical padding (px) reserved at the top of the raw-mask view for the
# debug stats overlay when it is enabled.
RAW_STATS_PAD = 54
# Vertical padding (px) reserved at the bottom of a view for the mode label.
MODE_LABEL_PAD = 26
# Colors (BGR)
COLOR_GREEN = (0, 255, 0)
COLOR_WHITE = (255, 255, 255)
COLOR_BLACK = (0, 0, 0)
COLOR_DARK_GRAY = (56, 56, 56)
COLOR_RED = (0, 0, 255)
COLOR_YELLOW = (0, 255, 255)
COLOR_CYAN = (255, 255, 0)
COLOR_ORANGE = (0, 165, 255)
COLOR_MAGENTA = (255, 0, 255)
# Type alias for image arrays (NDArray or cv2.Mat)
ImageArray = NDArray[np.uint8]
# COCO-format skeleton (17 keypoints): each pair is a (from, to) keypoint
# index that gets a line segment drawn between the two joints.
SKELETON_CONNECTIONS: list[tuple[int, int]] = [
    # Face
    (0, 1),    # nose -> left_eye
    (0, 2),    # nose -> right_eye
    (1, 3),    # left_eye -> left_ear
    (2, 4),    # right_eye -> right_ear
    # Shoulders and arms
    (5, 6),    # left_shoulder -> right_shoulder
    (5, 7),    # left_shoulder -> left_elbow
    (7, 9),    # left_elbow -> left_wrist
    (6, 8),    # right_shoulder -> right_elbow
    (8, 10),   # right_elbow -> right_wrist
    # Torso
    (11, 12),  # left_hip -> right_hip
    (5, 11),   # left_shoulder -> left_hip
    (6, 12),   # right_shoulder -> right_hip
    # Legs
    (11, 13),  # left_hip -> left_knee
    (13, 15),  # left_knee -> left_ankle
    (12, 14),  # right_hip -> right_knee
    (14, 16),  # right_knee -> right_ankle
]
# Human-readable names for the 17 COCO-format keypoints; list position is
# the keypoint index used by SKELETON_CONNECTIONS.
KEYPOINT_NAMES: list[str] = [
    "nose",
    "left_eye",
    "right_eye",
    "left_ear",
    "right_ear",
    "left_shoulder",
    "right_shoulder",
    "left_elbow",
    "right_elbow",
    "left_wrist",
    "right_wrist",
    "left_hip",
    "right_hip",
    "left_knee",
    "right_knee",
    "left_ankle",
    "right_ankle",
]
# Keypoint triples used for joint-angle computation in scoliosis / gait
# analysis. Each triple (a, b, c) names a chain of three joints;
# presumably the angle is measured at the middle keypoint b -- confirm
# against the angle-computation code.
ANGLE_JOINTS: list[tuple[int, int, int]] = [
    (5, 7, 9),     # left_shoulder -> left_elbow -> left_wrist
    (6, 8, 10),    # right_shoulder -> right_elbow -> right_wrist
    (7, 5, 11),    # left_elbow -> left_shoulder -> left_hip
    (8, 6, 12),    # right_elbow -> right_shoulder -> right_hip
    (5, 11, 13),   # left_shoulder -> left_hip -> left_knee
    (6, 12, 14),   # right_shoulder -> right_hip -> right_knee
    (11, 13, 15),  # left_hip -> left_knee -> left_ankle
    (12, 14, 16),  # right_hip -> right_knee -> right_ankle
]
class OpenCVVisualizer:
    """OpenCV window manager for the demo pipeline.

    Renders an annotated main frame, the normalized silhouette, and the
    stacked segmentation-input grid in separate windows, and handles simple
    keyboard interaction in :meth:`update`:

    - ``q``: request quit (``update`` returns ``False``)
    - ``r``: toggle the optional raw-mask window
    - ``d``: toggle the raw-mask debug stats overlay

    Windows are created lazily on the first :meth:`update` call.
    """

    def __init__(self) -> None:
        # Runtime toggles flipped by keypresses handled in update().
        self.show_raw_window: bool = False
        self.show_raw_debug: bool = False
        # Lazy window-creation flags; windows are created on first use.
        self._windows_created: bool = False
        self._raw_window_created: bool = False

    def _ensure_windows(self) -> None:
        """Create the three always-on windows exactly once."""
        if not self._windows_created:
            cv2.namedWindow(MAIN_WINDOW, cv2.WINDOW_NORMAL)
            cv2.namedWindow(SEG_WINDOW, cv2.WINDOW_NORMAL)
            cv2.namedWindow(WINDOW_SEG_INPUT, cv2.WINDOW_NORMAL)
            self._windows_created = True

    def _ensure_raw_window(self) -> None:
        """Create the optional raw-mask window if it is not shown yet."""
        if not self._raw_window_created:
            cv2.namedWindow(RAW_WINDOW, cv2.WINDOW_NORMAL)
            self._raw_window_created = True

    def _hide_raw_window(self) -> None:
        """Destroy the raw-mask window if it is currently shown."""
        if self._raw_window_created:
            cv2.destroyWindow(RAW_WINDOW)
            self._raw_window_created = False

    def _draw_bbox(
        self,
        frame: ImageArray,
        bbox: BBoxXYXY | None,
    ) -> None:
        """Draw bounding box on frame if present.

        Args:
            frame: Input frame (H, W, 3) uint8 - modified in place
            bbox: Bounding box in XYXY format as (x1, y1, x2, y2) or None
        """
        if bbox is None:
            return
        x1, y1, x2, y2 = bbox
        # Draw rectangle with green color, thickness 2
        _ = cv2.rectangle(frame, (x1, y1), (x2, y2), COLOR_GREEN, 2)

    def _draw_text_overlay(
        self,
        frame: ImageArray,
        track_id: int,
        fps: float,
        label: str | None,
        confidence: float | None,
    ) -> None:
        """Draw text overlay with track info, FPS, label, and confidence.

        Lines are stacked top-down in the top-left corner, each on a black
        background rectangle for readability.

        Args:
            frame: Input frame (H, W, 3) uint8 - modified in place
            track_id: Tracking ID
            fps: Current FPS
            label: Classification label or None
            confidence: Classification confidence or None (only shown
                when a label is also present)
        """
        # Prepare text lines
        lines: list[str] = []
        lines.append(f"ID: {track_id}")
        lines.append(f"FPS: {fps:.1f}")
        if label is not None:
            if confidence is not None:
                lines.append(f"{label}: {confidence:.2%}")
            else:
                lines.append(label)
        # Draw text with background for readability
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.6
        thickness = 1
        line_height = 25
        margin = 10
        for i, text in enumerate(lines):
            y_pos = margin + (i + 1) * line_height
            # Draw background rectangle
            (text_width, text_height), _ = cv2.getTextSize(
                text, font, font_scale, thickness
            )
            _ = cv2.rectangle(
                frame,
                (margin, y_pos - text_height - 5),
                (margin + text_width + 10, y_pos + 5),
                COLOR_BLACK,
                -1,
            )
            # Draw text
            _ = cv2.putText(
                frame,
                text,
                (margin + 5, y_pos),
                font,
                font_scale,
                COLOR_WHITE,
                thickness,
            )

    def _draw_pose_skeleton(
        self,
        frame: ImageArray,
        pose_data: dict[str, object] | None,
    ) -> None:
        """Draw pose skeleton on frame.

        Connections and keypoints with confidence <= 0.3 are skipped; all
        drawn points are clipped to the frame bounds. When no confidence
        array is provided, all keypoints are treated as fully confident.

        Args:
            frame: Input frame (H, W, 3) uint8 - modified in place
            pose_data: Pose data dictionary from Sports2D or similar
                Expected format: {'keypoints': [[x1, y1], [x2, y2], ...],
                                  'confidence': [c1, c2, ...],
                                  'angles': {'joint_name': angle, ...}}
        """
        if pose_data is None:
            return
        keypoints_obj = pose_data.get('keypoints')
        if keypoints_obj is None:
            return
        # Convert keypoints to numpy array
        keypoints = np.asarray(keypoints_obj, dtype=np.float32)
        if keypoints.size == 0:
            return
        h, w = frame.shape[:2]
        # Get confidence scores if available
        confidence_obj = pose_data.get('confidence')
        confidences = (
            np.asarray(confidence_obj, dtype=np.float32)
            if confidence_obj is not None
            else np.ones(len(keypoints), dtype=np.float32)
        )
        # Draw skeleton connections
        for connection in SKELETON_CONNECTIONS:
            idx1, idx2 = connection
            if idx1 < len(keypoints) and idx2 < len(keypoints):
                # Check confidence threshold (0.3)
                if confidences[idx1] > 0.3 and confidences[idx2] > 0.3:
                    pt1 = (int(keypoints[idx1][0]), int(keypoints[idx1][1]))
                    pt2 = (int(keypoints[idx2][0]), int(keypoints[idx2][1]))
                    # Clip to frame bounds
                    pt1 = (max(0, min(w - 1, pt1[0])), max(0, min(h - 1, pt1[1])))
                    pt2 = (max(0, min(w - 1, pt2[0])), max(0, min(h - 1, pt2[1])))
                    _ = cv2.line(frame, pt1, pt2, COLOR_CYAN, 2)
        # Draw keypoints
        for i, (kp, conf) in enumerate(zip(keypoints, confidences)):
            # NOTE(review): `i < len(keypoints)` is always true inside this
            # zip and is therefore a redundant guard.
            if conf > 0.3 and i < len(keypoints):
                x, y = int(kp[0]), int(kp[1])
                # Clip to frame bounds
                x = max(0, min(w - 1, x))
                y = max(0, min(h - 1, y))
                # Draw keypoint as filled circle with a white outline
                _ = cv2.circle(frame, (x, y), 4, COLOR_MAGENTA, -1)
                _ = cv2.circle(frame, (x, y), 4, COLOR_WHITE, 1)

    def _draw_pose_angles(
        self,
        frame: ImageArray,
        pose_data: dict[str, object] | None,
    ) -> None:
        """Draw pose angles as text overlay in the top-right corner.

        Only angles within [0, 180] degrees are shown, sorted by name and
        capped at the first 8 entries.

        Args:
            frame: Input frame (H, W, 3) uint8 - modified in place
            pose_data: Pose data dictionary with 'angles' key
        """
        if pose_data is None:
            return
        angles_obj = pose_data.get('angles')
        if angles_obj is None:
            return
        # NOTE(review): cast assumes the caller supplies a str->float dict;
        # no runtime validation is performed here.
        angles = cast(dict[str, float], angles_obj)
        if not angles:
            return
        # Draw angles in top-right corner
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.45
        thickness = 1
        line_height = 20
        margin = 10
        h, w = frame.shape[:2]
        # Filter and format angles
        angle_texts: list[tuple[str, float]] = []
        for name, angle in angles.items():
            # Only show angles that are reasonable (0-180 degrees)
            if 0 <= angle <= 180:
                angle_texts.append((str(name), float(angle)))
        # Sort by name for consistent display
        angle_texts.sort(key=lambda x: x[0])
        # Draw from top-right
        for i, (name, angle) in enumerate(angle_texts[:8]):  # Limit to 8 angles
            text = f"{name}: {angle:.1f}"
            (text_width, text_height), _ = cv2.getTextSize(
                text, font, font_scale, thickness
            )
            x_pos = w - margin - text_width - 10
            y_pos = margin + (i + 1) * line_height
            # Draw background rectangle
            _ = cv2.rectangle(
                frame,
                (x_pos - 4, y_pos - text_height - 4),
                (x_pos + text_width + 4, y_pos + 4),
                COLOR_BLACK,
                -1,
            )
            # Draw text in orange
            _ = cv2.putText(
                frame,
                text,
                (x_pos, y_pos),
                font,
                font_scale,
                COLOR_ORANGE,
                thickness,
            )

    def _prepare_main_frame(
        self,
        frame: ImageArray,
        bbox: BBoxXYXY | None,
        track_id: int,
        fps: float,
        label: str | None,
        confidence: float | None,
        pose_data: dict[str, object] | None = None,
    ) -> ImageArray:
        """Prepare main display frame with bbox and text overlay.

        Args:
            frame: Input frame (H, W, C) uint8; grayscale and BGRA inputs
                are converted to BGR, the original is never modified
            bbox: Bounding box in XYXY format (x1, y1, x2, y2) or None
            track_id: Tracking ID
            fps: Current FPS
            label: Classification label or None
            confidence: Classification confidence or None
            pose_data: Pose data dictionary or None

        Returns:
            Processed frame ready for display
        """
        # Ensure BGR format (convert grayscale if needed)
        if len(frame.shape) == 2:
            display_frame = cast(ImageArray, cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR))
        elif frame.shape[2] == 1:
            display_frame = cast(ImageArray, cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR))
        elif frame.shape[2] == 3:
            display_frame = frame.copy()
        elif frame.shape[2] == 4:
            display_frame = cast(ImageArray, cv2.cvtColor(frame, cv2.COLOR_BGRA2BGR))
        else:
            # Unexpected channel count: fall through with a defensive copy.
            display_frame = frame.copy()
        # Draw bbox and text (modifies in place)
        self._draw_bbox(display_frame, bbox)
        self._draw_text_overlay(display_frame, track_id, fps, label, confidence)
        # Draw pose skeleton and angles if available
        self._draw_pose_skeleton(display_frame, pose_data)
        self._draw_pose_angles(display_frame, pose_data)
        return display_frame

    def _upscale_silhouette(
        self,
        silhouette: NDArray[np.float32] | NDArray[np.uint8],
    ) -> ImageArray:
        """Upscale silhouette to display size.

        Args:
            silhouette: Input silhouette (64, 44) float32 [0,1] or uint8 [0,255]

        Returns:
            Upscaled silhouette (256, 176) uint8
        """
        # Normalize to uint8 if needed (float inputs are assumed in [0, 1])
        if silhouette.dtype == np.float32 or silhouette.dtype == np.float64:
            sil_u8 = (silhouette * 255).astype(np.uint8)
        else:
            sil_u8 = silhouette.astype(np.uint8)
        # Upscale using nearest neighbor to preserve pixelation
        upscaled = cast(
            ImageArray,
            cv2.resize(
                sil_u8,
                (DISPLAY_WIDTH, DISPLAY_HEIGHT),
                interpolation=cv2.INTER_NEAREST,
            ),
        )
        return upscaled

    def _normalize_mask_for_display(self, mask: NDArray[np.generic]) -> ImageArray:
        """Convert a mask of any supported dtype to a displayable uint8 image.

        Handling by dtype:
        - bool: True -> 255, False -> 0
        - uint8: {0, 1}-valued masks are scaled to {0, 255}; otherwise
          returned as-is
        - other integers: {0, 1}-valued masks are scaled by 255; otherwise
          clipped to [0, 255]
        - floats (and anything else): normalized by the array maximum to
          fill the [0, 255] range (all-zero input yields a black image)
        """
        mask_array = np.asarray(mask)
        if mask_array.dtype == np.bool_:
            bool_scaled = np.where(mask_array, np.uint8(255), np.uint8(0)).astype(
                np.uint8
            )
            return cast(ImageArray, bool_scaled)
        if mask_array.dtype == np.uint8:
            mask_array = cast(ImageArray, mask_array)
            max_u8 = int(np.max(mask_array)) if mask_array.size > 0 else 0
            if max_u8 <= 1:
                scaled_u8 = np.where(mask_array > 0, np.uint8(255), np.uint8(0)).astype(
                    np.uint8
                )
                return cast(ImageArray, scaled_u8)
            return cast(ImageArray, mask_array)
        if np.issubdtype(mask_array.dtype, np.integer):
            max_int = float(np.max(mask_array)) if mask_array.size > 0 else 0.0
            if max_int <= 1.0:
                return cast(
                    ImageArray, (mask_array.astype(np.float32) * 255.0).astype(np.uint8)
                )
            clipped = np.clip(mask_array, 0, 255).astype(np.uint8)
            return cast(ImageArray, clipped)
        mask_float = np.asarray(mask_array, dtype=np.float32)
        max_val = float(np.max(mask_float)) if mask_float.size > 0 else 0.0
        if max_val <= 0.0:
            return np.zeros(mask_float.shape, dtype=np.uint8)
        normalized = np.clip((mask_float / max_val) * 255.0, 0.0, 255.0).astype(
            np.uint8
        )
        return cast(ImageArray, normalized)

    def _draw_raw_stats(self, image: ImageArray, mask_raw: ImageArray | None) -> None:
        """Overlay dtype/min/max/nonzero-count stats of the raw mask.

        Draws three yellow text lines (each on a black background) in the
        top-left corner of ``image``; no-op when the mask is None or empty.
        """
        if mask_raw is None:
            return
        mask = np.asarray(mask_raw)
        if mask.size == 0:
            return
        stats = [
            f"raw: {mask.dtype}",
            f"min/max: {float(mask.min()):.3f}/{float(mask.max()):.3f}",
            f"nnz: {int(np.count_nonzero(mask))}",
        ]
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.45
        thickness = 1
        line_h = 18
        x0 = 8
        y0 = 20
        for i, txt in enumerate(stats):
            y = y0 + i * line_h
            (tw, th), _ = cv2.getTextSize(txt, font, font_scale, thickness)
            _ = cv2.rectangle(
                image, (x0 - 4, y - th - 4), (x0 + tw + 4, y + 4), COLOR_BLACK, -1
            )
            _ = cv2.putText(
                image, txt, (x0, y), font, font_scale, COLOR_YELLOW, thickness
            )

    def _prepare_segmentation_view(
        self,
        mask_raw: ImageArray | None,
        silhouette: NDArray[np.float32] | None,
        bbox: BBoxXYXY | None,
    ) -> ImageArray:
        """Prepare the segmentation window content.

        Currently delegates entirely to the normalized-silhouette view;
        ``mask_raw`` and ``bbox`` are accepted for interface compatibility
        but deliberately unused.
        """
        _ = mask_raw
        _ = bbox
        return self._prepare_normalized_view(silhouette)

    def _fit_gray_to_display(
        self,
        gray: ImageArray,
        out_h: int = DISPLAY_HEIGHT,
        out_w: int = DISPLAY_WIDTH,
    ) -> ImageArray:
        """Letterbox a grayscale image into an (out_h, out_w) black canvas.

        The input is resized with nearest-neighbor interpolation, preserving
        aspect ratio, and centered. Degenerate (zero-sized) inputs yield an
        all-black canvas.
        """
        src_h, src_w = gray.shape[:2]
        if src_h <= 0 or src_w <= 0:
            return np.zeros((out_h, out_w), dtype=np.uint8)
        scale = min(out_w / src_w, out_h / src_h)
        new_w = max(1, int(round(src_w * scale)))
        new_h = max(1, int(round(src_h * scale)))
        resized = cast(
            ImageArray,
            cv2.resize(gray, (new_w, new_h), interpolation=cv2.INTER_NEAREST),
        )
        canvas = np.zeros((out_h, out_w), dtype=np.uint8)
        x0 = (out_w - new_w) // 2
        y0 = (out_h - new_h) // 2
        canvas[y0 : y0 + new_h, x0 : x0 + new_w] = resized
        return cast(ImageArray, canvas)

    def _crop_mask_to_bbox(
        self,
        mask_gray: ImageArray,
        bbox: BBoxXYXY | None,
    ) -> ImageArray:
        """Crop a grayscale mask to ``bbox``, clamped to the mask bounds.

        Returns the input unchanged when ``bbox`` is None or the clamped
        region is empty/degenerate.
        """
        if bbox is None:
            return mask_gray
        h, w = mask_gray.shape[:2]
        x1, y1, x2, y2 = bbox
        x1c = max(0, min(w, int(x1)))
        x2c = max(0, min(w, int(x2)))
        y1c = max(0, min(h, int(y1)))
        y2c = max(0, min(h, int(y2)))
        if x2c <= x1c or y2c <= y1c:
            return mask_gray
        cropped = mask_gray[y1c:y2c, x1c:x2c]
        if cropped.size == 0:
            return mask_gray
        return cast(ImageArray, cropped)

    def _prepare_segmentation_input_view(
        self,
        silhouettes: NDArray[np.float32] | None,
    ) -> ImageArray:
        """Tile a stack of silhouettes (N, H, W) into a near-square grid.

        Each silhouette is upscaled to display size and labeled with its
        frame index in yellow. Returns a labeled placeholder when the input
        is None or empty.
        """
        if silhouettes is None or silhouettes.size == 0:
            placeholder = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH, 3), dtype=np.uint8)
            self._draw_mode_indicator(placeholder, "Input Silhouettes (No Data)")
            return placeholder
        n_frames = int(silhouettes.shape[0])
        # Near-square layout: ceil(sqrt(N)) columns.
        tiles_per_row = int(np.ceil(np.sqrt(n_frames)))
        rows = int(np.ceil(n_frames / tiles_per_row))
        tile_h = DISPLAY_HEIGHT
        tile_w = DISPLAY_WIDTH
        grid = np.zeros((rows * tile_h, tiles_per_row * tile_w), dtype=np.uint8)
        for idx in range(n_frames):
            sil = silhouettes[idx]
            tile = self._upscale_silhouette(sil)
            r = idx // tiles_per_row
            c = idx % tiles_per_row
            y0, y1 = r * tile_h, (r + 1) * tile_h
            x0, x1 = c * tile_w, (c + 1) * tile_w
            grid[y0:y1, x0:x1] = tile
        grid_bgr = cast(ImageArray, cv2.cvtColor(grid, cv2.COLOR_GRAY2BGR))
        # Second pass: label each tile with its frame index.
        for idx in range(n_frames):
            r = idx // tiles_per_row
            c = idx % tiles_per_row
            y0 = r * tile_h
            x0 = c * tile_w
            cv2.putText(
                grid_bgr,
                str(idx),
                (x0 + 8, y0 + 22),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.6,
                (0, 255, 255),
                2,
                cv2.LINE_AA,
            )
        return grid_bgr

    def _prepare_raw_view(
        self,
        mask_raw: ImageArray | None,
        bbox: BBoxXYXY | None = None,
    ) -> ImageArray:
        """Prepare raw mask view.

        Args:
            mask_raw: Raw binary mask or None (None yields a placeholder)
            bbox: Optional XYXY box used to crop the mask before display

        Returns:
            Displayable BGR image with mode indicator (and, when
            ``show_raw_debug`` is set, a stats overlay of the raw mask)
        """
        if mask_raw is None:
            # Create placeholder
            placeholder = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH, 3), dtype=np.uint8)
            self._draw_mode_indicator(placeholder, "Raw Mask (No Data)")
            return placeholder
        # Ensure single channel
        if len(mask_raw.shape) == 3:
            mask_gray = cast(ImageArray, cv2.cvtColor(mask_raw, cv2.COLOR_BGR2GRAY))
        else:
            mask_gray = cast(ImageArray, mask_raw)
        mask_gray = self._normalize_mask_for_display(mask_gray)
        mask_gray = self._crop_mask_to_bbox(mask_gray, bbox)
        # Reserve space at the top for the debug stats and at the bottom
        # for the mode-indicator band.
        debug_pad = RAW_STATS_PAD if self.show_raw_debug else 0
        content_h = max(1, DISPLAY_HEIGHT - debug_pad - MODE_LABEL_PAD)
        mask_resized = self._fit_gray_to_display(
            mask_gray, out_h=content_h, out_w=DISPLAY_WIDTH
        )
        full_mask = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH), dtype=np.uint8)
        full_mask[debug_pad : debug_pad + content_h, :] = mask_resized
        # Convert to BGR for display
        mask_bgr = cast(ImageArray, cv2.cvtColor(full_mask, cv2.COLOR_GRAY2BGR))
        if self.show_raw_debug:
            # Stats are computed from the original (uncropped) raw mask.
            self._draw_raw_stats(mask_bgr, mask_raw)
        self._draw_mode_indicator(mask_bgr, "Raw Mask")
        return mask_bgr

    def _prepare_normalized_view(
        self,
        silhouette: NDArray[np.float32] | None,
    ) -> ImageArray:
        """Prepare normalized silhouette view.

        Args:
            silhouette: Normalized silhouette (64, 44) or None (None yields
                a labeled placeholder)

        Returns:
            Displayable BGR image with mode indicator
        """
        if silhouette is None:
            # Create placeholder
            placeholder = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH, 3), dtype=np.uint8)
            self._draw_mode_indicator(placeholder, "Normalized (No Data)")
            return placeholder
        # Upscale and convert; leave room at the bottom for the mode label
        upscaled = self._upscale_silhouette(silhouette)
        content_h = max(1, DISPLAY_HEIGHT - MODE_LABEL_PAD)
        sil_compact = self._fit_gray_to_display(
            upscaled, out_h=content_h, out_w=DISPLAY_WIDTH
        )
        sil_canvas = np.zeros((DISPLAY_HEIGHT, DISPLAY_WIDTH), dtype=np.uint8)
        sil_canvas[:content_h, :] = sil_compact
        sil_bgr = cast(ImageArray, cv2.cvtColor(sil_canvas, cv2.COLOR_GRA2BGR))
        self._draw_mode_indicator(sil_bgr, "Normalized")
        return sil_bgr

    def _draw_mode_indicator(self, image: ImageArray, label: str) -> None:
        """Draw a dark footer band with a yellow ``label`` along the bottom.

        Modifies ``image`` in place. The band height is MODE_LABEL_PAD px.
        """
        h, w = image.shape[:2]
        mode_text = label
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.5
        thickness = 1
        # Get text size for background
        (text_width, text_height), _ = cv2.getTextSize(
            mode_text, font, font_scale, thickness
        )
        x_pos = 14
        y_pos = h - 8
        y_top = max(0, h - MODE_LABEL_PAD)
        # Full-width footer band
        _ = cv2.rectangle(
            image,
            (0, y_top),
            (w, h),
            COLOR_DARK_GRAY,
            -1,
        )
        # Tight background box behind the text itself
        _ = cv2.rectangle(
            image,
            (x_pos - 6, y_pos - text_height - 6),
            (x_pos + text_width + 8, y_pos + 6),
            COLOR_DARK_GRAY,
            -1,
        )
        # Draw text
        _ = cv2.putText(
            image,
            mode_text,
            (x_pos, y_pos),
            font,
            font_scale,
            COLOR_YELLOW,
            thickness,
        )

    def update(
        self,
        frame: ImageArray,
        bbox: BBoxXYXY | None,
        bbox_mask: BBoxXYXY | None,
        track_id: int,
        mask_raw: ImageArray | None,
        silhouette: NDArray[np.float32] | None,
        segmentation_input: NDArray[np.float32] | None,
        label: str | None,
        confidence: float | None,
        fps: float,
        pose_data: dict[str, object] | None = None,
    ) -> bool:
        """Update visualization with new frame data.

        Also polls the keyboard: 'q' requests quit, 'r' toggles the raw-mask
        window, 'd' toggles the raw-mask debug stats overlay.

        Args:
            frame: Input frame (H, W, C) uint8
            bbox: Bounding box in XYXY format (x1, y1, x2, y2) or None
            bbox_mask: XYXY box used to crop the raw mask view, or None
            track_id: Tracking ID
            mask_raw: Raw binary mask (H, W) uint8 or None
            silhouette: Normalized silhouette (64, 44) float32 [0,1] or None
            segmentation_input: Stack of silhouettes (N, H, W) float32 fed
                to the segmentation/recognition model, or None
            label: Classification label or None
            confidence: Classification confidence [0,1] or None
            fps: Current FPS
            pose_data: Pose data dictionary or None

        Returns:
            False if user requested quit (pressed 'q'), True otherwise
        """
        self._ensure_windows()
        # Prepare and show main window
        main_display = self._prepare_main_frame(
            frame, bbox, track_id, fps, label, confidence, pose_data
        )
        cv2.imshow(MAIN_WINDOW, main_display)
        # Prepare and show segmentation window
        seg_display = self._prepare_segmentation_view(mask_raw, silhouette, bbox)
        cv2.imshow(SEG_WINDOW, seg_display)
        if self.show_raw_window:
            self._ensure_raw_window()
            raw_display = self._prepare_raw_view(mask_raw, bbox_mask)
            cv2.imshow(RAW_WINDOW, raw_display)
        seg_input_display = self._prepare_segmentation_input_view(segmentation_input)
        cv2.imshow(WINDOW_SEG_INPUT, seg_input_display)
        # Handle keyboard input (mask to 8 bits for cross-platform waitKey)
        key = cv2.waitKey(1) & 0xFF
        if key == ord("q"):
            return False
        elif key == ord("r"):
            self.show_raw_window = not self.show_raw_window
            if self.show_raw_window:
                self._ensure_raw_window()
                logger.debug("Raw mask window enabled")
            else:
                self._hide_raw_window()
                logger.debug("Raw mask window disabled")
        elif key == ord("d"):
            self.show_raw_debug = not self.show_raw_debug
            logger.debug(
                "Raw mask debug overlay %s",
                "enabled" if self.show_raw_debug else "disabled",
            )
        return True

    def close(self) -> None:
        """Destroy all windows and reset lazy-creation state."""
        if self._windows_created:
            self._hide_raw_window()
            cv2.destroyAllWindows()
            self._windows_created = False
            self._raw_window_created = False