"""Run the multi-camera pose tracker over an ActualTest parquet capture.

Loads camera calibrations and per-camera RTMPose detections from a capture
root, assembles per-frame :class:`FrameBundle`\\ s, runs :class:`PoseTracker`,
and logs a summary (plus optional per-frame lines) to stderr.
"""

from collections import Counter
from dataclasses import dataclass
from pathlib import Path

import click
import cv2
import numpy as np
import pyarrow.parquet as pq
from beartype import beartype
from loguru import logger

from pose_tracking_exp.common.normalization import infer_bbox_from_keypoints, normalize_rtmpose_body20
from pose_tracking_exp.schema import (
    CameraCalibration,
    CameraFrame,
    FrameBundle,
    PoseDetection,
    SceneConfig,
    TrackerConfig,
    TrackerDiagnostics,
    TrackedFrameResult,
)
from pose_tracking_exp.tracking import PoseTracker

# Nominal inter-frame period (~30 fps). Timestamps are synthesized from the
# bundle index because the detection parquet rows carry no wall-clock times.
_NOMINAL_FRAME_PERIOD_NS = 33_333_333

# Default per-joint confidence threshold used when counting "visible" joints
# in a detection; detections with too few visible joints are dropped.
_DEFAULT_MIN_JOINT_CONFIDENCE = 0.15


@dataclass(slots=True)
class ActualTestTrackingSummary:
    """Aggregate statistics over one tracking run on an ActualTest segment."""

    bundle_count: int  # number of frame bundles processed
    active_frames: int  # bundles with at least one active track
    proposal_frames: int  # bundles with at least one proposal
    max_active_tracks: int  # peak simultaneous active tracks
    max_lost_tracks: int  # peak simultaneous lost tracks
    update_action_counts: dict[str, int]  # update-event action -> count
    mean_accepted_views: float  # mean views per accepting update event
    mean_accepted_joints: float  # mean joints per accepting update event
    mean_reprojection_error: float  # mean finite event reprojection error
    diagnostics: TrackerDiagnostics  # tracker-internal counters snapshot


def _finite_mean(values: list[float]) -> float:
    """Return the mean of the finite entries of *values*.

    Returns ``inf`` when no finite entry exists, so callers can test
    availability with ``np.isfinite`` rather than a sentinel ``None``.
    """
    finite = [value for value in values if np.isfinite(value)]
    if not finite:
        return np.inf
    return float(np.mean(np.asarray(finite, dtype=np.float64)))


@beartype
def summarize_tracking_results(
    results: list[TrackedFrameResult],
    diagnostics: TrackerDiagnostics,
) -> ActualTestTrackingSummary:
    """Condense per-frame tracking results into an ActualTestTrackingSummary."""
    update_events = [event for result in results for event in result.update_events]
    action_counts = Counter(event.action for event in update_events)
    # Only events that actually accepted data contribute to the view/joint means;
    # reprojection errors are filtered for finiteness inside _finite_mean.
    accepted_view_samples = [
        float(event.accepted_view_count) for event in update_events if event.accepted_view_count > 0
    ]
    accepted_joint_samples = [
        float(event.accepted_joint_count) for event in update_events if event.accepted_joint_count > 0
    ]
    reprojection_samples = [float(event.mean_reprojection_error) for event in update_events]
    return ActualTestTrackingSummary(
        bundle_count=len(results),
        active_frames=sum(1 for result in results if result.active_tracks),
        proposal_frames=sum(1 for result in results if result.proposals),
        max_active_tracks=max((len(result.active_tracks) for result in results), default=0),
        max_lost_tracks=max((len(result.lost_tracks) for result in results), default=0),
        update_action_counts=dict(action_counts),
        mean_accepted_views=_finite_mean(accepted_view_samples),
        mean_accepted_joints=_finite_mean(accepted_joint_samples),
        mean_reprojection_error=_finite_mean(reprojection_samples),
        diagnostics=diagnostics,
    )


@beartype
def format_frame_summary_lines(results: list[TrackedFrameResult]) -> tuple[str, ...]:
    """Render one human-readable summary line per tracked frame bundle."""
    lines: list[str] = []
    for result in results:
        action_counts = Counter(event.action for event in result.update_events)
        finite_reprojection_errors = [
            float(event.mean_reprojection_error)
            for event in result.update_events
            if np.isfinite(event.mean_reprojection_error)
        ]
        lines.append(
            "bundle={} proposals={} active_ids={} lost_ids={} tentative_ids={} actions={} mean_event_reproj={}".format(
                result.bundle_index,
                len(result.proposals),
                [track.track_id for track in result.active_tracks],
                [track.track_id for track in result.lost_tracks],
                [track.track_id for track in result.tentative_tracks],
                dict(action_counts),
                "{:.2f}".format(float(np.mean(np.asarray(finite_reprojection_errors, dtype=np.float64))))
                if finite_reprojection_errors
                else "nan",
            )
        )
    return tuple(lines)


@beartype
def load_actual_test_scene(root: Path) -> SceneConfig:
    """Build a SceneConfig from the capture's camera_params parquet.

    ActualTest parquet comes from the ChArUco/OpenCV side, so `rvec` / `tvec`
    are world->camera extrinsics. The RPT-facing camera pose is derived later
    from this canonical OpenCV form.
    """
    camera_rows = pq.read_table(root / "camera_params" / "camera_params.parquet").to_pylist()
    cameras: list[CameraCalibration] = []
    for item in camera_rows:
        # Rodrigues converts the 3-vector axis-angle into a 3x3 rotation matrix.
        rotation, _ = cv2.Rodrigues(np.asarray(item["extrinsic"]["rvec"], dtype=np.float64).reshape(3, 1))
        cameras.append(
            CameraCalibration.from_opencv_extrinsics(
                name=str(item["port"]),
                width=int(item["resolution"]["width"]),
                height=int(item["resolution"]["height"]),
                K=np.asarray(item["intrinsic"]["camera_matrix"], dtype=np.float64),
                DC=np.asarray(item["intrinsic"]["distortion_coefficients"], dtype=np.float64).reshape(-1),
                R=np.asarray(rotation, dtype=np.float64),
                T=np.asarray(item["extrinsic"]["tvec"], dtype=np.float64).reshape(3),
                rvec=np.asarray(item["extrinsic"]["rvec"], dtype=np.float64).reshape(3),
            )
        )
    return SceneConfig(
        room_size=np.asarray([20.0, 20.0, 8.0], dtype=np.float64),
        room_center=np.asarray([0.0, 0.0, 2.0], dtype=np.float64),
        # Sort by name so view ordering is deterministic across runs.
        cameras=tuple(sorted(cameras, key=lambda camera: camera.name)),
    )


@beartype
def load_actual_test_segment_bundles(
    root: Path,
    segment_name: str,
    *,
    frame_start: int = 690,
    frame_stop: int | None = None,
    max_frames: int | None = None,
    min_cameras_with_rows: int = 1,
    min_visible_joints: int = 6,
    min_joint_confidence: float = _DEFAULT_MIN_JOINT_CONFIDENCE,
) -> list[FrameBundle]:
    """Load per-camera detections for one segment and group them into bundles.

    Args:
        root: Capture root containing ``camera_params`` and segment folders.
        segment_name: Segment subfolder holding ``*_detected.parquet`` files.
        frame_start: First frame index to keep (inclusive).
        frame_stop: Frame index to stop at (exclusive), or None for no limit.
        max_frames: Cap on the number of candidate frames kept, or None.
        min_cameras_with_rows: Keep only frames seen by at least this many cameras.
        min_visible_joints: Drop detections with fewer joints above threshold.
        min_joint_confidence: Per-joint confidence threshold used to count a
            joint as visible (previously hard-coded to 0.15).

    Returns:
        Bundles ordered by frame index; empty list when no parquet is found.

    Raises:
        ValueError: If a parquet row's boxes/keypoints/scores arrays disagree
            in length.
    """
    segment_root = root / segment_name
    by_camera: dict[str, dict[int, tuple[PoseDetection, ...]]] = {}
    for parquet_path in sorted(segment_root.glob("*_detected.parquet")):
        camera_name = parquet_path.name.removesuffix("_detected.parquet")
        rows = pq.read_table(parquet_path).to_pylist()
        frames: dict[int, tuple[PoseDetection, ...]] = {}
        for row in rows:
            frame_index = int(row["frame_index"])
            if frame_index < frame_start:
                continue
            if frame_stop is not None and frame_index >= frame_stop:
                continue
            detections: list[PoseDetection] = []
            boxes = row["boxes"]
            keypoints_batch = row["kps"]
            confidence_batch = row["kps_scores"]
            if not (len(boxes) == len(keypoints_batch) == len(confidence_batch)):
                raise ValueError(
                    f"Mismatched detection arrays for camera {camera_name} frame {frame_index}: "
                    f"{len(boxes)=}, {len(keypoints_batch)=}, {len(confidence_batch)=}."
                )
            for box, keypoints_xy, confidences in zip(boxes, keypoints_batch, confidence_batch, strict=True):
                keypoints_xy_array = np.asarray(keypoints_xy, dtype=np.float64)
                confidences_array = np.asarray(confidences, dtype=np.float64)
                pose = normalize_rtmpose_body20(keypoints_xy_array, confidences_array)
                # Drop weak detections: too few joints above the confidence bar.
                if np.count_nonzero(pose[:, 2] > min_joint_confidence) < min_visible_joints:
                    continue
                bbox = (
                    np.asarray(box, dtype=np.float64)
                    if len(box) == 4
                    else infer_bbox_from_keypoints(pose)
                )
                visible_confidences = pose[pose[:, 2] > 0.0, 2]
                detections.append(
                    PoseDetection(
                        bbox=bbox,
                        bbox_confidence=float(np.mean(visible_confidences)) if visible_confidences.size else 0.0,
                        keypoints=pose,
                    )
                )
            frames[frame_index] = tuple(detections)
        by_camera[camera_name] = frames
    if not by_camera:
        return []
    candidate_frames = sorted(set().union(*(set(frames) for frames in by_camera.values())))
    if min_cameras_with_rows > 1:
        candidate_frames = [
            frame_index
            for frame_index in candidate_frames
            if sum(frame_index in frames for frames in by_camera.values()) >= min_cameras_with_rows
        ]
    if max_frames is not None:
        candidate_frames = candidate_frames[:max_frames]
    scene = load_actual_test_scene(root)
    camera_by_name = {camera.name: camera for camera in scene.cameras}
    bundles: list[FrameBundle] = []
    ordered_camera_names = [camera.name for camera in scene.cameras]
    for bundle_index, frame_index in enumerate(candidate_frames):
        # NOTE(review): timestamps are synthesized from the *bundle* index, so
        # gaps in frame indices compress the timeline — confirm the tracker's
        # motion model is happy with this before relying on absolute times.
        timestamp_unix_ns = bundle_index * _NOMINAL_FRAME_PERIOD_NS
        views: list[CameraFrame] = []
        for camera_name in ordered_camera_names:
            camera = camera_by_name[camera_name]
            views.append(
                CameraFrame(
                    camera_name=camera_name,
                    frame_index=frame_index,
                    timestamp_unix_ns=timestamp_unix_ns,
                    # Cameras with no rows for this frame get an empty tuple.
                    detections=by_camera.get(camera_name, {}).get(frame_index, ()),
                    source_size=(camera.width, camera.height),
                )
            )
        bundles.append(
            FrameBundle(
                bundle_index=bundle_index,
                timestamp_unix_ns=timestamp_unix_ns,
                views=tuple(views),
            )
        )
    return bundles


@click.command()
@click.argument("root_path", type=click.Path(path_type=Path, exists=True, file_okay=False))
@click.option("--segment", "segment_name", default="Segment_1", show_default=True)
@click.option("--frame-start", default=690, type=int, show_default=True)
@click.option("--frame-stop", type=int)
@click.option("--max-frames", type=click.IntRange(min=1))
@click.option("--min-camera-rows", default=1, type=click.IntRange(min=1), show_default=True)
@click.option("--max-active-tracks", default=1, type=click.IntRange(min=1), show_default=True)
@click.option(
    "--min-joint-confidence",
    default=_DEFAULT_MIN_JOINT_CONFIDENCE,
    type=click.FloatRange(min=0.0, max=1.0),
    show_default=True,
)
@click.option("--verbose-frames/--no-verbose-frames", default=False, show_default=True)
def main(
    root_path: Path,
    segment_name: str,
    frame_start: int,
    frame_stop: int | None,
    max_frames: int | None,
    min_camera_rows: int,
    max_active_tracks: int,
    min_joint_confidence: float,
    verbose_frames: bool,
) -> None:
    """Track poses over one ActualTest segment and log a run summary."""
    logger.remove()
    logger.add(
        click.get_text_stream("stderr"),
        level="INFO",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
    )
    scene = load_actual_test_scene(root_path)
    bundles = load_actual_test_segment_bundles(
        root_path,
        segment_name,
        frame_start=frame_start,
        frame_stop=frame_stop,
        max_frames=max_frames,
        min_cameras_with_rows=min_camera_rows,
        min_joint_confidence=min_joint_confidence,
    )
    tracker = PoseTracker(scene, TrackerConfig(max_active_tracks=max_active_tracks))
    results = tracker.run(bundles)
    summary = summarize_tracking_results(results, tracker.diagnostics_snapshot())
    logger.info(
        "actual_test bundles={} active_frames={} proposal_frames={} max_active_tracks={} max_lost_tracks={} "
        "mean_accepted_views={} mean_accepted_joints={} mean_reprojection_error={}",
        summary.bundle_count,
        summary.active_frames,
        summary.proposal_frames,
        summary.max_active_tracks,
        summary.max_lost_tracks,
        "{:.2f}".format(summary.mean_accepted_views) if np.isfinite(summary.mean_accepted_views) else "nan",
        "{:.2f}".format(summary.mean_accepted_joints) if np.isfinite(summary.mean_accepted_joints) else "nan",
        "{:.2f}".format(summary.mean_reprojection_error)
        if np.isfinite(summary.mean_reprojection_error)
        else "nan",
    )
    logger.info(
        "actual_test actions={} promotions={} reacquisitions={} predict_only_updates={} proposal_reacquisition_attempts={} "
        "proposal_compatible_lost_frames={} nonlinear_refinements={} lm_iterations={}",
        summary.update_action_counts,
        summary.diagnostics.promotions,
        summary.diagnostics.reacquisitions,
        summary.diagnostics.predict_only_updates,
        summary.diagnostics.proposal_reacquisition_attempts,
        summary.diagnostics.proposal_compatible_lost_frames,
        summary.diagnostics.nonlinear_refinements,
        summary.diagnostics.lm_iterations,
    )
    if verbose_frames:
        for line in format_frame_summary_lines(results):
            logger.info("actual_test_frame {}", line)


if __name__ == "__main__":
    main()