feat!: reorganize detection and tracking pipeline

Refactor the package into common, schema, detection, and tracking namespaces and move dataset-specific ActualTest utilities into tests/support.

Add a pluggable detection stack with typed protocols, pydantic-settings config, loguru-based runner logging, cvmmap and headless video sources, NATS and parquet sinks, and a structured coco-wholebody133 payload path.

Teach tracking replay loading to consume parquet detection directories directly, preserve empty frames, and keep the video-to-parquet-to-tracking workflow usable for offline E2E runs.

Vendor the local mmcv and xtcocotools wheels under Git LFS, update uv sources/lock state, and refresh the mmcv build so mmcv.ops loads successfully with the current torch+cu130 environment.
2026-03-26 16:24:27 +08:00
parent f1a2372b3c
commit 2c0d51ab31
56 changed files with 5179 additions and 889 deletions
@@ -0,0 +1 @@
vendor/wheels/*.whl filter=lfs diff=lfs merge=lfs -text
@@ -9,13 +9,13 @@ Offline multiview body tracking experiments built around:
## Install
```bash
uv sync --extra dev
uv sync --group dev
```
## Run
```bash
uv run pose-tracking-exp run data/scene.json data/replay.jsonl
uv run pose-tracking-exp run_tracking data/scene.json data/replay.jsonl
```
`scene.json` may declare camera extrinsics in either format:
@@ -26,13 +26,58 @@ uv run pose-tracking-exp run data/scene.json data/replay.jsonl
The loader normalizes both to OpenCV extrinsics for reprojection and converts to RPT pose only when building the triangulation config.
If you already have an older hand-authored scene file that stored RPT camera pose directly, set `extrinsic_format` explicitly to `rpt_camera_pose`.
## Convert ParaJumping Payload Records
## Convert cvmmap Pose Payload Records
```bash
uv run pose-tracking-exp convert-parajumping input.jsonl output.jsonl
uv run pose-tracking-exp convert-cvmmap-pose input.jsonl output.jsonl
```
## ActualTest Calibration Caveat
The current cvmmap `.pose` wire format is fixed to `COCO-WholeBody-133` keypoints.
That is a transport compatibility constraint, not a tracker limitation: the tracker-side normalizer accepts both `coco17` and `coco_wholebody133`, because the first 17 body joints share the standard COCO ordering.
References:
- https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html
- https://github.com/jin-s13/COCO-WholeBody
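Because the first 17 body joints share the standard COCO ordering, a consumer can treat a 133-joint whole-body array as a superset of `coco17` by slicing. A minimal illustrative sketch (the helper name here is hypothetical, not part of the package):

```python
import numpy as np

def body17_view(keypoints_xy: np.ndarray) -> np.ndarray:
    """Return the body-17 subset of a COCO-style keypoint array.

    Accepts (17, 2) coco17 or (133, 2) coco_wholebody133 input;
    the first 17 rows use the same COCO body ordering in both.
    """
    if keypoints_xy.shape not in {(17, 2), (133, 2)}:
        raise ValueError(f"Unsupported keypoint shape: {keypoints_xy.shape}")
    return keypoints_xy[:17]

wholebody = np.zeros((133, 2))
body17 = body17_view(wholebody)
```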
## Run Detection
```bash
uv sync --group dev --group detection
uv run pose-tracking-exp run_detection --config detection.toml camera0 camera1
uv run pose-tracking-exp run_detection --source video --output-dir data/detections --config detection.toml cam0=/data/cam0.mp4 cam1=/data/cam1.mp4
```
The embedded 2D detection module is organized as a swappable shim:
- `FrameSource`: where images come from
- `PoseShim`: object detection + pose estimation backend
- `PoseSink`: where structured detections are published or stored
The default backend is `yolo_rtmpose`, and the heavy runtime dependencies live in the optional `detection` dependency group.
Checkpoint paths are explicit config fields; the code does not hardcode local checkpoint locations.
The only inferred path is the MMPose config path, which is resolved relative to the installed `mmpose` package when `pose_config_path` is omitted.
For offline video runs, the default sink is parquet and writes one `*_detected.parquet` file per source. `run_tracking` can consume that directory directly as replay input.
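The three roles can be sketched as structural protocols. This is an illustrative model only: the class names come from the list above, but the method signatures here are assumptions, not the package's actual interfaces.

```python
from collections.abc import AsyncIterator, Sequence
from typing import Protocol

import numpy as np

class FrameSource(Protocol):
    """Where images come from (cvmmap stream, video file, ...)."""
    def frames(self) -> AsyncIterator[np.ndarray]: ...

class PoseShim(Protocol):
    """Object detection + pose estimation backend (default: yolo_rtmpose)."""
    def infer(self, images: Sequence[np.ndarray]) -> Sequence[dict]: ...

class PoseSink(Protocol):
    """Where structured detections are published or stored (NATS, parquet)."""
    async def publish(self, detections: dict) -> None: ...

# A trivial conforming backend: one empty result per input image.
class NullShim:
    def infer(self, images: Sequence[np.ndarray]) -> Sequence[dict]:
        return [{} for _ in images]
```

Swapping a backend then means providing another object with the same `infer` shape; no call sites change.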
Example `detection.toml`:
```toml
instances = ["camera0", "camera1"]
device = "cuda"
yolo_checkpoint = "/path/to/yolo_checkpoint.pt"
pose_checkpoint = "/path/to/coco_wholebody_pose_checkpoint.pth"
```
## Actual Test Helper
```bash
uv run --group dev --group detection python -m tests.support.actual_test /mnt/hddl/data/ActualTest_WeiHua --segment Segment_2 --frame-start 1100 --max-frames 120
```
`actual_test` is a test/support helper, not part of the public installed CLI surface.
It keeps the union of per-camera frame indices and fills missing camera rows with empty detections, so later 2-camera stretches are still usable instead of being dropped by a 4-camera intersection.
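The union-with-fill behavior can be modeled as follows (a simplified sketch, not the helper's actual code):

```python
def union_frames(per_camera: dict[str, dict[int, list]]) -> dict[int, dict[str, list]]:
    """Keep the union of per-camera frame indices; cameras with no
    detections at a frame get an empty list instead of dropping the frame."""
    all_frames = sorted(set().union(*(frames.keys() for frames in per_camera.values())))
    return {
        frame: {cam: frames.get(frame, []) for cam, frames in per_camera.items()}
        for frame in all_frames
    }

bundles = union_frames({
    "cam0": {1: ["p"], 2: ["q"]},
    "cam1": {2: ["r"]},  # cam1 missed frame 1
})
```

With an intersection, frame 1 would be discarded; with the union, it survives as a 1-camera bundle.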
## Actual Test Calibration Caveat
`ActualTest_WeiHua/camera_params.parquet` appears to store raw OpenCV extrinsics from the ChArUco pipeline, not camera poses. The tracker now converts those values before calling `RapidPoseTriangulation`, because RPT expects camera centers and camera-to-world rotation.
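The conversion is the standard inverse of the OpenCV extrinsic transform. A sketch, assuming `[R|t]` maps world points into the camera frame (`x_cam = R @ x_world + t`):

```python
import numpy as np

def extrinsics_to_camera_pose(R: np.ndarray, t: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Invert OpenCV extrinsics into a camera pose: the camera center in
    world coordinates (C = -R^T t) and the camera-to-world rotation (R^T)."""
    R_cam_to_world = R.T
    center = -R.T @ t
    return center, R_cam_to_world
```

Sanity check: plugging the returned center back through the extrinsics maps it to the camera origin.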
@@ -7,13 +7,14 @@ name = "pose-tracking-exp"
version = "0.1.0"
description = "Offline multiview pose tracking experiment with RPT-backed proposal births"
readme = "README.md"
requires-python = ">=3.12"
requires-python = ">=3.12,<3.13"
dependencies = [
"anyio>=4.11.0",
"beartype>=0.19.0",
"click>=8.2.1",
"jaxtyping>=0.3.2",
"numpy>=2.1.0",
"opencv-python>=4.12.0.88",
"opencv-python-headless>=4.12.0.88",
"pyarrow>=21.0.0",
"rapid-pose-triangulation",
"scipy>=1.15.0",
@@ -22,8 +23,24 @@ dependencies = [
[dependency-groups]
dev = [
"basedpyright>=1.31.0",
"jupyterlab>=4.5.6",
"pytest>=8.4.0",
]
detection = [
"cvmmap-client",
"loguru>=0.7.3",
"mmcv",
"mmdet>=3.3.0",
"mmengine>=0.10.7",
"mmpose>=1.3.2",
"nats-py>=2.11.0",
"pydantic>=2.11.7",
"pydantic-settings>=2.0.0",
"torch>=2.7.0",
"torchvision>=0.22.0",
"ultralytics>=8.3.166",
"xtcocotools",
]
[project.scripts]
pose-tracking-exp = "pose_tracking_exp.cli:main"
@@ -33,6 +50,9 @@ packages = ["src/pose_tracking_exp"]
[tool.uv.sources]
rapid-pose-triangulation = { path = "../RapidPoseTriangulation", editable = true }
cvmmap-client = { path = "../cvmmap-python-client", editable = true }
mmcv = { path = "vendor/wheels/mmcv-2.2.0-cp312-cp312-linux_x86_64.whl" }
xtcocotools = { path = "vendor/wheels/xtcocotools-1.14.3-cp312-cp312-linux_x86_64.whl" }
[tool.pytest.ini_options]
testpaths = ["tests"]
@@ -1,40 +1,37 @@
from pose_tracking_exp.joints import BODY20_JOINT_NAMES, BODY20_OBSERVATION_COUNT
from pose_tracking_exp.models import (
ActiveTrackState,
from pose_tracking_exp.common.joints import BODY20_JOINT_NAMES, BODY20_OBSERVATION_COUNT
from pose_tracking_exp.detection.cvmmap_payload import CvmmapPosePayloadCodec, decode_pose_payload
from pose_tracking_exp.schema import (
CameraCalibration,
CameraFrame,
FrameBundle,
PoseDetection,
ProposalCluster,
ReplaySequence,
SceneConfig,
TentativeTrackState,
TrackerConfig,
TrackedFrameResult,
)
from pose_tracking_exp.parajumping import decode_pose_payload
from pose_tracking_exp.replay import load_replay_file, load_scene_file
from pose_tracking_exp.sync import synchronize_frames
from pose_tracking_exp.tracker import PoseTracker
from pose_tracking_exp.tracking import (
PoseTracker,
load_parquet_replay_dir,
load_replay_file,
load_scene_file,
synchronize_frames,
)
__all__ = [
"BODY20_JOINT_NAMES",
"BODY20_OBSERVATION_COUNT",
"ActiveTrackState",
"CameraCalibration",
"CameraFrame",
"CvmmapPosePayloadCodec",
"FrameBundle",
"PoseDetection",
"PoseTracker",
"ProposalCluster",
"load_parquet_replay_dir",
"ReplaySequence",
"SceneConfig",
"TentativeTrackState",
"TrackedFrameResult",
"TrackerConfig",
"decode_pose_payload",
"load_replay_file",
"load_scene_file",
"synchronize_frames",
]
@@ -1,15 +1,12 @@
import json
import sys
from pathlib import Path
from typing import Literal, cast
import click
from pose_tracking_exp.actualtest import load_actualtest_scene, load_actualtest_segment_bundles
from pose_tracking_exp.models import TrackerConfig
from pose_tracking_exp.parajumping import convert_payload_jsonl_lines
from pose_tracking_exp.replay import load_replay_file, load_scene_file
from pose_tracking_exp.sync import synchronize_frames
from pose_tracking_exp.tracker import PoseTracker
from pose_tracking_exp.detection.cvmmap_payload import convert_payload_jsonl_lines
from pose_tracking_exp.schema import TrackerConfig
from pose_tracking_exp.tracking import PoseTracker, load_replay_file, load_scene_file, synchronize_frames
@click.group()
@@ -17,19 +14,120 @@ def main() -> None:
"""Offline multiview pose tracking experiment CLI."""
@main.command("convert-parajumping")
@main.command("convert-cvmmap-pose")
@click.argument("input_path", type=click.Path(path_type=Path, exists=True, dir_okay=False))
@click.argument("output_path", type=click.Path(path_type=Path, dir_okay=False))
def convert_parajumping(input_path: Path, output_path: Path) -> None:
def convert_cvmmap_pose(input_path: Path, output_path: Path) -> None:
lines = input_path.read_text(encoding="utf-8").splitlines()
converted = convert_payload_jsonl_lines(lines)
output_path.write_text("\n".join(converted) + ("\n" if converted else ""), encoding="utf-8")
@main.command("run")
@main.command("run_detection")
@click.argument("inputs", nargs=-1, type=str, required=False)
@click.option(
"--config",
"config_path",
type=click.Path(dir_okay=False, path_type=Path),
default=None,
help="Optional TOML detection runner config file.",
)
@click.option(
"--source",
"source_kind",
type=click.Choice(("cvmmap", "video")),
default="cvmmap",
show_default=True,
help="Frame source implementation to use.",
)
@click.option(
"--sink",
"sink_kind",
type=click.Choice(("auto", "nats", "parquet")),
default="auto",
show_default=True,
help="Output sink. `auto` picks nats for cvmmap and parquet for video.",
)
@click.option(
"--output-dir",
type=click.Path(file_okay=False, path_type=Path),
default=None,
help="Required for parquet sink output.",
)
@click.option(
"--log-level",
default="INFO",
show_default=True,
type=click.Choice(("DEBUG", "INFO", "WARNING", "ERROR")),
)
def run_detection(
inputs: tuple[str, ...],
config_path: Path | None,
source_kind: str,
sink_kind: str,
output_dir: Path | None,
log_level: str,
) -> None:
import anyio
from loguru import logger
from pose_tracking_exp.detection import (
CvmmapFrameSource,
NatsPoseSink,
ParquetPoseSink,
VideoFrameSource,
build_pose_shim,
load_detection_runner_config,
parse_video_input_specs,
resolve_instances,
run_detection_runner,
)
logger.remove()
logger.add(
sys.stderr,
level=log_level,
format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {name}:{function}:{line} | {message}",
)
config = load_detection_runner_config(config_path)
config.validate_runtime_paths()
if source_kind == "cvmmap":
resolved_instances = resolve_instances(inputs, config.instances)
config = config.model_copy(update={"instances": resolved_instances})
sources = tuple(CvmmapFrameSource(instance) for instance in resolved_instances)
else:
video_inputs = parse_video_input_specs(inputs)
sources = tuple(
VideoFrameSource(video_path, source_name=source_name)
for source_name, video_path in video_inputs
)
pose_shim = build_pose_shim(config)
resolved_sink_kind = sink_kind
if resolved_sink_kind == "auto":
resolved_sink_kind = "nats" if source_kind == "cvmmap" else "parquet"
if resolved_sink_kind == "nats":
pose_sink = NatsPoseSink(config.nats_host)
else:
if output_dir is None:
raise click.ClickException("--output-dir is required for parquet sink output.")
pose_sink = ParquetPoseSink(output_dir)
anyio.run(
run_detection_runner,
sources,
pose_shim,
pose_sink,
config,
)
@main.command("run_tracking")
@click.argument("scene_path", type=click.Path(path_type=Path, exists=True, dir_okay=False))
@click.argument("replay_path", type=click.Path(path_type=Path, exists=True, dir_okay=False))
def run(scene_path: Path, replay_path: Path) -> None:
@click.argument("replay_path", type=click.Path(path_type=Path, exists=True))
def run_tracking(scene_path: Path, replay_path: Path) -> None:
scene = load_scene_file(scene_path)
replay = load_replay_file(scene_path, replay_path)
config = TrackerConfig()
@@ -52,79 +150,3 @@ def run(scene_path: Path, replay_path: Path) -> None:
for result in results
]
click.echo(json.dumps(payload, indent=2))
@main.command("run-actualtest")
@click.argument("root_path", type=click.Path(path_type=Path, exists=True, file_okay=False))
@click.option("--segment", "segment_name", default="Segment_1", show_default=True)
@click.option("--frame-start", default=690, type=int, show_default=True)
@click.option("--frame-stop", type=int)
@click.option("--max-frames", type=int)
@click.option("--mode", type=click.Choice(["single_person", "general"]), default="single_person", show_default=True)
@click.option("--proposal-min-score", default=0.5, type=float, show_default=True)
@click.option("--tentative-min-age", default=2, type=int, show_default=True)
@click.option("--tentative-hits-required", default=2, type=int, show_default=True)
@click.option("--tentative-promote-score", default=1.2, type=float, show_default=True)
def run_actualtest(
root_path: Path,
segment_name: str,
frame_start: int,
frame_stop: int | None,
max_frames: int | None,
mode: str,
proposal_min_score: float,
tentative_min_age: int,
tentative_hits_required: int,
tentative_promote_score: float,
) -> None:
tracker_mode = cast(Literal["general", "single_person"], mode)
scene = load_actualtest_scene(root_path)
bundles = load_actualtest_segment_bundles(
root_path,
segment_name,
frame_start=frame_start,
frame_stop=frame_stop,
max_frames=max_frames,
)
config = TrackerConfig(
mode=tracker_mode,
proposal_min_score=proposal_min_score,
tentative_min_age=tentative_min_age,
tentative_hits_required=tentative_hits_required,
tentative_promote_score=tentative_promote_score,
)
tracker = PoseTracker(scene, config)
results = tracker.run(bundles)
diagnostics = tracker.diagnostics_snapshot()
payload = {
"segment": segment_name,
"mode": tracker_mode,
"bundle_count": len(results),
"active_track_frames": sum(1 for result in results if result.active_tracks),
"proposal_frames": sum(1 for result in results if result.proposals),
"max_active_tracks": max((len(result.active_tracks) for result in results), default=0),
"diagnostics": {
"match_existing_calls": diagnostics.match_existing_calls,
"match_existing_seconds": diagnostics.match_existing_seconds,
"proposal_build_calls": diagnostics.proposal_build_calls,
"proposal_build_seconds": diagnostics.proposal_build_seconds,
"promotions": diagnostics.promotions,
"reacquisitions": diagnostics.reacquisitions,
"active_updates": diagnostics.active_updates,
"seed_initializations": diagnostics.seed_initializations,
"nonlinear_refinements": diagnostics.nonlinear_refinements,
},
"results": [
{
"bundle_index": result.bundle_index,
"source_frame_index": bundle.views[0].frame_index if bundle.views else -1,
"timestamp_unix_ns": result.timestamp_unix_ns,
"tentative_track_ids": [track.track_id for track in result.tentative_tracks],
"active_track_ids": [track.track_id for track in result.active_tracks],
"lost_track_ids": [track.track_id for track in result.lost_tracks],
"proposal_count": len(result.proposals),
}
for bundle, result in zip(bundles, results, strict=True)
],
}
click.echo(json.dumps(payload, indent=2))
@@ -0,0 +1,33 @@
from pose_tracking_exp.common.camera_math import project_pose
from pose_tracking_exp.common.joints import (
BODY20_INDEX_BY_NAME,
BODY20_JOINT_NAMES,
BODY20_OBSERVATION_COUNT,
COCO_BODY17_INDEX_BY_NAME,
COCO_BODY17_NAMES,
CORE_JOINT_INDICES,
CORE_JOINT_NAMES,
)
from pose_tracking_exp.common.normalization import (
core_reprojection_distance,
infer_bbox_from_keypoints,
normalize_coco_body20,
normalize_openpose25_body20,
normalize_rtmpose_body20,
)
__all__ = [
"BODY20_INDEX_BY_NAME",
"BODY20_JOINT_NAMES",
"BODY20_OBSERVATION_COUNT",
"COCO_BODY17_INDEX_BY_NAME",
"COCO_BODY17_NAMES",
"CORE_JOINT_INDICES",
"CORE_JOINT_NAMES",
"core_reprojection_distance",
"infer_bbox_from_keypoints",
"normalize_coco_body20",
"normalize_openpose25_body20",
"normalize_rtmpose_body20",
"project_pose",
]
@@ -1,8 +1,8 @@
import cv2
import numpy as np
from pose_tracking_exp.models import CameraCalibration
from pose_tracking_exp.tensor_types import Pose3D
from pose_tracking_exp.common.tensor_types import Pose3D
from pose_tracking_exp.schema.camera import CameraCalibration
def project_pose(camera: CameraCalibration, pose3d: Pose3D) -> np.ndarray:
@@ -0,0 +1,43 @@
from pathlib import Path
import pyarrow as pa
from pose_tracking_exp.schema.detection import PoseDetections
DETECTED_PARQUET_SUFFIX = "_detected.parquet"
DETECTION_PARQUET_SCHEMA = pa.schema(
[
pa.field("frame_index", pa.int64()),
pa.field("timestamp_unix_ns", pa.int64()),
pa.field("source_width", pa.int32()),
pa.field("source_height", pa.int32()),
pa.field("boxes", pa.list_(pa.list_(pa.float32()))),
pa.field("box_scores", pa.list_(pa.float32())),
pa.field("kps", pa.list_(pa.list_(pa.list_(pa.float32())))),
pa.field("kps_scores", pa.list_(pa.list_(pa.float32()))),
pa.field("keypoint_schema", pa.string()),
]
)
def detection_parquet_path(output_dir: Path, source_name: str) -> Path:
return output_dir / f"{source_name}{DETECTED_PARQUET_SUFFIX}"
def pose_detections_to_row(detections: PoseDetections) -> dict[str, object]:
if detections.box_scores is None:
raise ValueError("Parquet sink requires box_scores to be present.")
if detections.keypoint_scores is None:
raise ValueError("Parquet sink requires keypoint_scores to be present.")
return {
"frame_index": int(detections.frame_index),
"timestamp_unix_ns": int(detections.timestamp_unix_ns),
"source_width": int(detections.source_size[0]),
"source_height": int(detections.source_size[1]),
"boxes": detections.boxes_xyxy.astype("float32", copy=False).tolist(),
"box_scores": detections.box_scores.astype("float32", copy=False).tolist(),
"kps": detections.keypoints_xy.astype("float32", copy=False).tolist(),
"kps_scores": detections.keypoint_scores.astype("float32", copy=False).tolist(),
"keypoint_schema": detections.keypoint_schema,
}
@@ -39,7 +39,7 @@ CORE_JOINT_NAMES: tuple[str, ...] = (
CORE_JOINT_INDICES: tuple[int, ...] = tuple(BODY20_INDEX_BY_NAME[name] for name in CORE_JOINT_NAMES)
RTMPOSE_BODY17_INDEX_BY_NAME = {
COCO_BODY17_INDEX_BY_NAME = {
"nose": 0,
"eye_left": 1,
"eye_right": 2,
@@ -59,5 +59,8 @@ RTMPOSE_BODY17_INDEX_BY_NAME = {
"ankle_right": 16,
}
RTMPOSE_BODY17_NAMES = tuple(RTMPOSE_BODY17_INDEX_BY_NAME.keys())
COCO_BODY17_NAMES = tuple(COCO_BODY17_INDEX_BY_NAME.keys())
# RTMPose whole-body uses the standard COCO body-17 ordering for the first 17 joints.
RTMPOSE_BODY17_INDEX_BY_NAME = COCO_BODY17_INDEX_BY_NAME
RTMPOSE_BODY17_NAMES = COCO_BODY17_NAMES
@@ -1,12 +1,51 @@
import math
from collections.abc import Mapping
from typing import Literal
import numpy as np
from beartype import beartype
from jaxtyping import jaxtyped
from pose_tracking_exp.joints import BODY20_INDEX_BY_NAME, BODY20_OBSERVATION_COUNT, RTMPOSE_BODY17_INDEX_BY_NAME
from pose_tracking_exp.tensor_types import FloatArray, JointXY, Pose2D
from pose_tracking_exp.common.joints import (
BODY20_INDEX_BY_NAME,
BODY20_OBSERVATION_COUNT,
COCO_BODY17_INDEX_BY_NAME,
)
from pose_tracking_exp.common.tensor_types import FloatArray, JointXY, Pose2D
def _validate_coco_shape(
keypoints_xy: FloatArray,
confidences: FloatArray,
*,
keypoint_schema: Literal["coco17", "coco_wholebody133"] | None,
) -> Literal["coco17", "coco_wholebody133"]:
if keypoints_xy.ndim != 2 or keypoints_xy.shape[1] != 2:
raise ValueError(
f"Expected keypoints with shape (N, 2), got {keypoints_xy.shape}."
)
if confidences.ndim != 1 or confidences.shape[0] != keypoints_xy.shape[0]:
raise ValueError(
"Expected confidences with shape matching keypoint count. "
f"Got {confidences.shape} for {keypoints_xy.shape}."
)
detected_schema: Literal["coco17", "coco_wholebody133"]
if keypoints_xy.shape[0] == 17:
detected_schema = "coco17"
elif keypoints_xy.shape[0] == 133:
detected_schema = "coco_wholebody133"
else:
raise ValueError(
"Expected COCO-compatible keypoints with 17 or 133 joints, "
f"got {keypoints_xy.shape[0]}."
)
if keypoint_schema is not None and keypoint_schema != detected_schema:
raise ValueError(
f"Expected {keypoint_schema} keypoints, got shape {keypoints_xy.shape}."
)
return detected_schema
def _visible_mean(points: list[tuple[np.ndarray, float]], fallback_xy: np.ndarray) -> tuple[np.ndarray, float]:
@@ -68,18 +107,20 @@ def _normalize_named_keypoints(
@jaxtyped(typechecker=beartype)
def normalize_rtmpose_body20(
def normalize_coco_body20(
keypoints_xy: FloatArray,
confidences: FloatArray,
*,
keypoint_schema: Literal["coco17", "coco_wholebody133"] | None = None,
) -> Pose2D:
if keypoints_xy.shape != (133, 2):
raise ValueError(f"Expected RTMPose keypoints with shape (133, 2), got {keypoints_xy.shape}.")
if confidences.shape != (133,):
raise ValueError(f"Expected RTMPose confidences with shape (133,), got {confidences.shape}.")
_validate_coco_shape(
keypoints_xy,
confidences,
keypoint_schema=keypoint_schema,
)
keypoint_map = {
name: (keypoints_xy[source_index], float(confidences[source_index]))
for name, source_index in RTMPOSE_BODY17_INDEX_BY_NAME.items()
for name, source_index in COCO_BODY17_INDEX_BY_NAME.items()
}
return _normalize_named_keypoints(
keypoint_map,
@@ -89,6 +130,18 @@ def normalize_rtmpose_body20(
)
@jaxtyped(typechecker=beartype)
def normalize_rtmpose_body20(
keypoints_xy: FloatArray,
confidences: FloatArray,
) -> Pose2D:
return normalize_coco_body20(
keypoints_xy,
confidences,
keypoint_schema="coco_wholebody133",
)
@jaxtyped(typechecker=beartype)
def normalize_openpose25_body20(keypoints: FloatArray) -> Pose2D:
if keypoints.shape != (25, 3):
@@ -0,0 +1,58 @@
from pose_tracking_exp.detection.config import (
DEFAULT_BACKEND,
DetectionRunnerConfig,
load_detection_runner_config,
resolve_default_pose_config,
resolve_instances,
)
from pose_tracking_exp.detection.factory import build_pose_shim
from pose_tracking_exp.detection.runner import (
SimpleMovingAverage,
SourceSlot,
run_detection_runner,
store_latest_frame,
take_pending_batch,
)
from pose_tracking_exp.detection.sinks import NatsPoseSink, ParquetPoseSink
from pose_tracking_exp.detection.sources import (
CvmmapFrameSource,
IteratorFrameSource,
VideoFrameSource,
parse_video_input_specs,
)
from pose_tracking_exp.schema.detection import BoxDetections, CocoKeypointSchema, PoseBatchRequest, PoseDetections, SourceFrame
from pose_tracking_exp.detection.yolo_rtmpose import (
WholeBodyPoseEstimator,
YoloRtmposeShim,
build_yolo_rtmpose_shim,
legacy_torch_checkpoint_loading,
)
__all__ = [
"BoxDetections",
"CocoKeypointSchema",
"CvmmapFrameSource",
"DEFAULT_BACKEND",
"DetectionRunnerConfig",
"IteratorFrameSource",
"NatsPoseSink",
"ParquetPoseSink",
"PoseBatchRequest",
"PoseDetections",
"SimpleMovingAverage",
"SourceFrame",
"SourceSlot",
"VideoFrameSource",
"WholeBodyPoseEstimator",
"YoloRtmposeShim",
"build_pose_shim",
"build_yolo_rtmpose_shim",
"legacy_torch_checkpoint_loading",
"load_detection_runner_config",
"parse_video_input_specs",
"resolve_default_pose_config",
"resolve_instances",
"run_detection_runner",
"store_latest_frame",
"take_pending_batch",
]
@@ -0,0 +1,147 @@
import tomllib
from pathlib import Path
from typing import Any, Literal, cast
import click
from pydantic import (
PositiveFloat,
PositiveInt,
ValidationError,
field_validator,
model_validator,
)
from pydantic_settings import (
BaseSettings,
PydanticBaseSettingsSource,
SettingsConfigDict,
)
DEFAULT_BACKEND = "yolo_rtmpose"
ENV_PREFIX = "POSE_TRACKING_EXP_DETECTION_"
POSE_CONFIG_RELATIVE_PATH = Path(
"wholebody_2d_keypoint/rtmpose/ubody/rtmpose-l_8xb64-270e_coco-ubody-wholebody-256x192.py"
)
def resolve_default_pose_config() -> Path:
import mmpose
module_file = getattr(mmpose, "__file__", None)
if module_file is None:
raise FileNotFoundError("Could not locate the installed mmpose package.")
config_path = (
Path(module_file).resolve().parent
/ ".mim"
/ "configs"
/ POSE_CONFIG_RELATIVE_PATH
)
if not config_path.exists():
raise FileNotFoundError(f"Default pose config is missing: {config_path}")
return config_path
class DetectionRunnerConfig(BaseSettings):
model_config = SettingsConfigDict(
env_prefix=ENV_PREFIX,
extra="forbid",
)
instances: tuple[str, ...] = ()
backend: Literal["yolo_rtmpose"] = DEFAULT_BACKEND
device: str = "cuda"
nats_host: str = "nats://localhost:4222"
yolo_checkpoint: Path
yolo_conf_threshold: float = 0.6
pose_checkpoint: Path
pose_config_path: Path | None = None
bbox_area_threshold: PositiveInt = 50 * 50
max_batch_frames: PositiveInt = 8
max_batch_wait_ms: int = 4
slow_frame_budget_seconds: PositiveFloat = 1 / 22
@classmethod
def settings_customise_sources(
cls,
settings_cls: type[BaseSettings],
init_settings: PydanticBaseSettingsSource,
env_settings: PydanticBaseSettingsSource,
dotenv_settings: PydanticBaseSettingsSource,
file_secret_settings: PydanticBaseSettingsSource,
) -> tuple[PydanticBaseSettingsSource, ...]:
return (
env_settings,
init_settings,
dotenv_settings,
file_secret_settings,
)
@field_validator("instances", mode="before")
@classmethod
def _parse_instances(cls, value: object) -> object:
if isinstance(value, str):
return tuple(item.strip() for item in value.split(",") if item.strip())
return value
@field_validator("max_batch_wait_ms")
@classmethod
def _validate_wait_ms(cls, value: int) -> int:
if value < 0:
raise ValueError("max_batch_wait_ms must be non-negative.")
return value
@model_validator(mode="after")
def _resolve_pose_config(self) -> "DetectionRunnerConfig":
if self.pose_config_path is None:
self.pose_config_path = resolve_default_pose_config()
return self
def validate_runtime_paths(self) -> None:
missing: list[Path] = []
for candidate in (
self.yolo_checkpoint,
self.pose_checkpoint,
self.pose_config_path,
):
if candidate is None:
raise FileNotFoundError("pose_config_path was not resolved before runtime validation.")
if not candidate.exists():
missing.append(candidate)
if missing:
formatted = ", ".join(str(path) for path in missing)
raise click.ClickException(f"Missing runtime assets: {formatted}")
def load_detection_runner_config(config_path: Path | None) -> DetectionRunnerConfig:
config_data: dict[str, object] = {}
if config_path is not None:
with config_path.open("rb") as handle:
parsed = tomllib.load(handle)
if not isinstance(parsed, dict):
raise click.ClickException("Detection runner config must be a TOML table.")
config_data = parsed
try:
# TOML/env values are validated by Pydantic at construction.
return DetectionRunnerConfig(**cast(dict[str, Any], config_data))
except (ValidationError, ValueError, FileNotFoundError) as exc:
raise click.ClickException(str(exc)) from exc
def resolve_instances(
cli_instances: tuple[str, ...],
configured_instances: tuple[str, ...],
) -> tuple[str, ...]:
selected = cli_instances or configured_instances
if not selected:
raise click.ClickException(
"Provide at least one instance on the command line or via config `instances = [...]`."
)
unique_instances: list[str] = []
seen: set[str] = set()
for instance in selected:
if instance in seen:
raise click.ClickException(f"Duplicate instance requested: {instance}")
unique_instances.append(instance)
seen.add(instance)
return tuple(unique_instances)
@@ -0,0 +1,237 @@
"""cvmmap pose payload helpers.
The current `.pose` wire format is fixed-width for COCO-WholeBody-133 keypoints.
That is a protocol compatibility choice, not a tracker limitation: the tracker
normalizer accepts either `coco17` or `coco_wholebody133` because the first
17 body joints share the standard COCO ordering.
References:
- https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html
- https://github.com/jin-s13/COCO-WholeBody
"""
import base64
import json
from dataclasses import dataclass
import numpy as np
from beartype import beartype
from pose_tracking_exp.common.normalization import normalize_coco_body20
from pose_tracking_exp.schema import CameraFrame, PoseDetection
from pose_tracking_exp.schema.detection import PoseDetections
PROTOCOL_HEADER = bytes([0x80]) + b"POSE"
COCO_WHOLEBODY_KEYPOINT_COUNT = 133
@dataclass(slots=True)
class DecodedPosePayload:
frame_index: int
reference_size: tuple[int, int]
timestamp_unix_ns: int
detections: tuple[PoseDetection, ...]
class CvmmapPosePayloadCodec:
def encode(self, detections: PoseDetections) -> bytes:
return encode_pose_payload(detections)
def _read_u8(payload: memoryview, offset: int) -> tuple[int, int]:
return int(payload[offset]), offset + 1
def _read_u16_array(payload: memoryview, offset: int, count: int) -> tuple[np.ndarray, int]:
size = count * 2
array = np.frombuffer(payload[offset : offset + size], dtype="<u2", count=count).astype(np.float64)
return array, offset + size
@beartype
def decode_pose_payload(payload: bytes) -> DecodedPosePayload:
if not payload.startswith(PROTOCOL_HEADER):
raise ValueError("Invalid cvmmap pose payload header.")
view = memoryview(payload)
offset = len(PROTOCOL_HEADER)
frame_index = int.from_bytes(view[offset : offset + 4], "little")
offset += 4
reference_size = tuple(int(x) for x in np.frombuffer(view[offset : offset + 4], dtype="<u2", count=2))
offset += 4
num_bbox = int(view[offset])
offset += 1
bbox_raw, offset = _read_u16_array(view, offset, num_bbox * 4)
bboxes = bbox_raw.reshape(num_bbox, 4) if num_bbox > 0 else np.zeros((0, 4), dtype=np.float64)
num_bbox_conf = int(view[offset])
offset += 1
bbox_confidence = np.frombuffer(view[offset : offset + num_bbox_conf], dtype=np.uint8, count=num_bbox_conf)
offset += num_bbox_conf
num_keypoints = int(view[offset])
offset += 1
keypoints_raw, offset = _read_u16_array(
view,
offset,
num_keypoints * COCO_WHOLEBODY_KEYPOINT_COUNT * 2,
)
keypoints_xy = (
keypoints_raw.reshape(num_keypoints, COCO_WHOLEBODY_KEYPOINT_COUNT, 2)
if num_keypoints > 0
else np.zeros((0, COCO_WHOLEBODY_KEYPOINT_COUNT, 2), dtype=np.float64)
)
num_keypoint_conf = int(view[offset])
offset += 1
keypoint_confidence_count = num_keypoint_conf * COCO_WHOLEBODY_KEYPOINT_COUNT
keypoint_confidence = (
np.frombuffer(
view[offset : offset + keypoint_confidence_count],
dtype=np.uint8,
count=keypoint_confidence_count,
).astype(np.float64)
/ 255.0
)
offset += keypoint_confidence_count
timestamp_unix_ns = int.from_bytes(view[offset : offset + 8], "little")
if num_keypoint_conf > 0 and num_keypoint_conf != num_keypoints:
raise ValueError("Unexpected keypoint confidence set count.")
detection_items: list[PoseDetection] = []
confidences = (
keypoint_confidence.reshape(num_keypoints, COCO_WHOLEBODY_KEYPOINT_COUNT)
if num_keypoints > 0
else np.zeros((0, COCO_WHOLEBODY_KEYPOINT_COUNT), dtype=np.float64)
)
for index in range(num_keypoints):
normalized = normalize_coco_body20(
keypoints_xy[index],
confidences[index],
keypoint_schema="coco_wholebody133",
)
bbox_score = float(bbox_confidence[index] / 255.0) if index < bbox_confidence.shape[0] else 0.0
bbox = bboxes[index] if index < bboxes.shape[0] else np.zeros(4, dtype=np.float64)
detection_items.append(
PoseDetection(
bbox=np.asarray(bbox, dtype=np.float64),
bbox_confidence=bbox_score,
keypoints=np.asarray(normalized, dtype=np.float64),
)
)
return DecodedPosePayload(
frame_index=frame_index,
reference_size=(reference_size[0], reference_size[1]),
timestamp_unix_ns=timestamp_unix_ns,
detections=tuple(detection_items),
)
@beartype
def encode_pose_payload(detections: PoseDetections) -> bytes:
detections.validate()
if detections.keypoint_schema != "coco_wholebody133":
raise ValueError(
"The cvmmap `.pose` payload currently requires `coco_wholebody133` keypoints."
)
frame_index_bytes = int(detections.frame_index).to_bytes(4, "little")
reference_size_bytes = np.asarray(detections.source_size, dtype=np.dtype("<u2")).tobytes()
num_bbox = int(detections.boxes_xyxy.shape[0])
num_bbox_bytes = num_bbox.to_bytes(1, "little")
bbox_bytes = np.ascontiguousarray(
detections.boxes_xyxy.astype(np.uint16),
dtype=np.dtype("<u2"),
).tobytes()
num_bbox_confidence_bytes = bytes([0])
bbox_confidence_bytes = bytes()
if detections.box_scores is not None:
num_bbox_confidence_bytes = int(detections.box_scores.shape[0]).to_bytes(1, "little")
bbox_confidence_bytes = np.ascontiguousarray(
np.clip(detections.box_scores * np.iinfo(np.uint8).max, 0, 255).astype(np.uint8),
dtype=np.dtype("<u1"),
).tobytes()
num_keypoints = int(detections.keypoints_xy.shape[0])
num_keypoints_bytes = num_keypoints.to_bytes(1, "little")
keypoints_bytes = np.ascontiguousarray(
detections.keypoints_xy.astype(np.uint16),
dtype=np.dtype("<u2"),
).tobytes()
num_keypoint_confidence_bytes = bytes([0])
keypoint_confidence_bytes = bytes()
if detections.keypoint_scores is not None:
num_keypoint_confidence_bytes = int(detections.keypoint_scores.shape[0]).to_bytes(1, "little")
keypoint_confidence_bytes = np.ascontiguousarray(
np.clip(detections.keypoint_scores * np.iinfo(np.uint8).max, 0, 255).astype(np.uint8),
dtype=np.dtype("<u1"),
).tobytes()
timestamp_unix_ns_bytes = int(detections.timestamp_unix_ns).to_bytes(8, "little")
return (
PROTOCOL_HEADER
+ frame_index_bytes
+ reference_size_bytes
+ num_bbox_bytes
+ bbox_bytes
+ num_bbox_confidence_bytes
+ bbox_confidence_bytes
+ num_keypoints_bytes
+ keypoints_bytes
+ num_keypoint_confidence_bytes
+ keypoint_confidence_bytes
+ timestamp_unix_ns_bytes
)
@beartype
def frame_from_payload(camera_name: str, payload: bytes) -> CameraFrame:
decoded = decode_pose_payload(payload)
return CameraFrame(
camera_name=camera_name,
frame_index=decoded.frame_index,
timestamp_unix_ns=decoded.timestamp_unix_ns,
detections=decoded.detections,
source_size=decoded.reference_size,
)
@beartype
def convert_payload_record(record: dict[str, object]) -> dict[str, object]:
camera_name = str(record["camera"])
payload_b64 = str(record["payload_b64"])
frame = frame_from_payload(camera_name, base64.b64decode(payload_b64))
return {
"camera": frame.camera_name,
"frame_index": frame.frame_index,
"timestamp_unix_ns": frame.timestamp_unix_ns,
"source_size": list(frame.source_size),
"detections": [
{
"bbox": detection.bbox.tolist(),
"bbox_confidence": detection.bbox_confidence,
"keypoints": detection.keypoints.tolist(),
}
for detection in frame.detections
],
}
@beartype
def convert_payload_jsonl_lines(lines: list[str]) -> list[str]:
output_lines: list[str] = []
for line in lines:
if not line.strip():
continue
record = json.loads(line)
converted = convert_payload_record(record)
output_lines.append(json.dumps(converted))
return output_lines
@@ -0,0 +1,3 @@
from pose_tracking_exp.detection.sources.cvmmap import CvmmapFrameSource
__all__ = ["CvmmapFrameSource"]
@@ -0,0 +1,21 @@
import click
from pose_tracking_exp.detection.config import DEFAULT_BACKEND, DetectionRunnerConfig
from pose_tracking_exp.detection.protocols import PoseShim
from pose_tracking_exp.detection.yolo_rtmpose import build_yolo_rtmpose_shim
def build_pose_shim(config: DetectionRunnerConfig) -> PoseShim:
if config.backend == DEFAULT_BACKEND:
if config.pose_config_path is None:
raise click.ClickException("pose_config_path must be resolved before building the backend.")
return build_yolo_rtmpose_shim(
yolo_checkpoint=config.yolo_checkpoint,
yolo_conf_threshold=config.yolo_conf_threshold,
pose_checkpoint=config.pose_checkpoint,
pose_config_path=config.pose_config_path,
device=config.device,
max_batch_frames=config.max_batch_frames,
bbox_area_threshold=config.bbox_area_threshold,
)
raise click.ClickException(f"Unsupported detection backend: {config.backend}")
@@ -0,0 +1,3 @@
from pose_tracking_exp.detection.sinks.nats import NatsPoseSink
__all__ = ["NatsPoseSink"]
@@ -0,0 +1,49 @@
from collections.abc import AsyncIterator, Sequence
from typing import Protocol
import numpy as np
from pose_tracking_exp.schema.detection import BoxDetections, PoseBatchRequest, PoseDetections, SourceFrame
class FrameSource(Protocol):
source_name: str
def frames(self) -> AsyncIterator[SourceFrame]:
...
class ObjectDetector(Protocol):
def detect_many(
self,
frames_rgb: Sequence[np.ndarray],
*,
classes: Sequence[int] | None = None,
) -> list[BoxDetections]:
...
class PoseEstimator(Protocol):
def estimate_batch(
self,
requests: Sequence[PoseBatchRequest],
) -> list[tuple[np.ndarray, np.ndarray]]:
...
class PoseShim(Protocol):
def process_many(self, frames: Sequence[SourceFrame]) -> list[PoseDetections]:
...
class PosePayloadCodec(Protocol):
def encode(self, detections: PoseDetections) -> bytes:
...
class PoseSink(Protocol):
async def publish_pose(self, detections: PoseDetections) -> None:
...
async def aclose(self) -> None:
...
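These `Protocol` classes are structural: any object with matching attributes and method signatures satisfies `FrameSource` without inheriting from it. A self-contained sketch with a stand-in `Frame` type (the real code uses `SourceFrame`); `ListSource` and `collect` are illustrative names, not part of the package.

```python
import asyncio
from collections.abc import AsyncIterator
from dataclasses import dataclass
from typing import Protocol

@dataclass
class Frame:  # stand-in for SourceFrame
    frame_index: int

class Source(Protocol):
    source_name: str
    def frames(self) -> AsyncIterator[Frame]: ...

class ListSource:
    # Satisfies Source purely by shape: no inheritance required.
    def __init__(self, source_name: str, count: int) -> None:
        self.source_name = source_name
        self._count = count
    async def frames(self) -> AsyncIterator[Frame]:
        for index in range(self._count):
            yield Frame(frame_index=index)

async def collect(source: Source) -> list[int]:
    return [frame.frame_index async for frame in source.frames()]

print(asyncio.run(collect(ListSource("demo", 3))))  # [0, 1, 2]
```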
@@ -0,0 +1,238 @@
from dataclasses import dataclass
from time import perf_counter
import anyio
from anyio.to_thread import run_sync as to_thread_run_sync
from loguru import logger
from pose_tracking_exp.detection.config import DetectionRunnerConfig
from pose_tracking_exp.detection.protocols import FrameSource, PoseShim, PoseSink
from pose_tracking_exp.schema.detection import SourceFrame
PERFORMANCE_WINDOW = 60
@dataclass(slots=True)
class PendingFrame:
source_name: str
frame: SourceFrame
@dataclass(slots=True)
class SourceSlot:
source_name: str
pending_frame: PendingFrame | None = None
last_seen_frame_index: int | None = None
received_frames: int = 0
dropped_frames: int = 0
processed_frames: int = 0
published_frames: int = 0
closed: bool = False
def store_latest_frame(slot: SourceSlot, frame: SourceFrame) -> None:
slot.received_frames += 1
if slot.pending_frame is not None:
slot.dropped_frames += 1
slot.pending_frame = PendingFrame(source_name=slot.source_name, frame=frame)
def pending_source_count(slots: dict[str, SourceSlot]) -> int:
return sum(slot.pending_frame is not None for slot in slots.values())
def take_pending_batch(
slots: dict[str, SourceSlot],
max_batch_frames: int,
) -> list[PendingFrame]:
batch: list[PendingFrame] = []
for slot in slots.values():
if slot.pending_frame is None:
continue
batch.append(slot.pending_frame)
slot.pending_frame = None
if len(batch) >= max_batch_frames:
break
return batch
def all_sources_closed_and_idle(slots: dict[str, SourceSlot]) -> bool:
return all(slot.closed and slot.pending_frame is None for slot in slots.values())
class SimpleMovingAverage:
def __init__(self, window: int) -> None:
self._window = window
self._sum = 0.0
self._size = 0
self._value: float | None = None
def next(self, value: float) -> float:
if self._size < self._window:
self._sum += value
self._size += 1
self._value = self._sum / self._size
else:
self._sum -= self._sum / self._window
self._sum += value
self._value = self._sum / self._window
return float(self._value)
def get(self) -> float | None:
return self._value
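Note that after warm-up this is not a true windowed mean: it uses the running-sum approximation (subtract the current average, add the new value), which behaves like an exponential smoother with a window-sized time constant. A standalone sketch of the same recurrence:

```python
class RunningAverage:
    """Same recurrence as SimpleMovingAverage above: exact mean during
    warm-up, then sum <- sum - sum/window + value."""
    def __init__(self, window: int) -> None:
        self._window = window
        self._sum = 0.0
        self._size = 0
    def next(self, value: float) -> float:
        if self._size < self._window:
            self._sum += value
            self._size += 1
            return self._sum / self._size
        self._sum += value - self._sum / self._window
        return self._sum / self._window

sma = RunningAverage(window=3)
for value in (3.0, 6.0, 9.0):
    sma.next(value)  # warm-up: exact means 3.0, 4.5, 6.0
print(sma.next(12.0))  # 8.0: sum goes 18 -> 18 - 6 + 12 = 24, 24 / 3 = 8
```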
async def run_detection_runner(
sources: tuple[FrameSource, ...],
pose_shim: PoseShim,
pose_sink: PoseSink,
config: DetectionRunnerConfig,
) -> None:
performance_sma = SimpleMovingAverage(PERFORMANCE_WINDOW)
batch_size_sma = SimpleMovingAverage(PERFORMANCE_WINDOW)
scheduler_condition = anyio.Condition()
slots = {
source.source_name: SourceSlot(source_name=source.source_name) for source in sources
}
inference_limiter = anyio.CapacityLimiter(1)
async def ingest_loop(source: FrameSource) -> None:
logger.info(
"[{}] source initialized; waiting for first frame metadata",
source.source_name,
)
try:
async for frame in source.frames():
should_log_init = False
previous_frame_index: int | None = None
async with scheduler_condition:
slot = slots[source.source_name]
previous_frame_index = slot.last_seen_frame_index
should_log_init = previous_frame_index is None
slot.last_seen_frame_index = frame.frame_index
store_latest_frame(slot, frame)
scheduler_condition.notify_all()
if should_log_init:
logger.info(
"[{}] initialized with frame shape={}x{} frame_index={}",
source.source_name,
frame.image_bgr.shape[1],
frame.image_bgr.shape[0],
frame.frame_index,
)
elif previous_frame_index is not None and frame.frame_index != previous_frame_index + 1:
logger.warning(
"[{}] skip frame detected: {} -> {}",
source.source_name,
previous_frame_index,
frame.frame_index,
)
finally:
async with scheduler_condition:
slots[source.source_name].closed = True
scheduler_condition.notify_all()
logger.info("[{}] source closed", source.source_name)
async def scheduler_loop() -> None:
while True:
async with scheduler_condition:
while pending_source_count(slots) == 0:
if all_sources_closed_and_idle(slots):
return
await scheduler_condition.wait()
if (
pending_source_count(slots) < config.max_batch_frames
and config.max_batch_wait_ms > 0
and not all_sources_closed_and_idle(slots)
):
with anyio.move_on_after(config.max_batch_wait_ms / 1000):
while (
pending_source_count(slots) < config.max_batch_frames
and not all_sources_closed_and_idle(slots)
):
await scheduler_condition.wait()
batch = take_pending_batch(slots, config.max_batch_frames)
start = perf_counter()
pose_infos = await to_thread_run_sync(
pose_shim.process_many,
[item.frame for item in batch],
limiter=inference_limiter,
)
elapsed = perf_counter() - start
average_elapsed = elapsed / len(batch)
performance_sma.next(average_elapsed)
batch_size_sma.next(float(len(batch)))
if average_elapsed > config.slow_frame_budget_seconds:
logger.warning(
"slow batch: size={} total={:.2f}ms avg={:.2f}ms",
len(batch),
elapsed * 1000,
average_elapsed * 1000,
)
for pending_frame, pose_info in zip(batch, pose_infos, strict=True):
slot = slots[pending_frame.source_name]
slot.processed_frames += 1
await pose_sink.publish_pose(pose_info)
slot.published_frames += 1
if pose_info.boxes_xyxy.shape[0] == 0:
logger.debug(
"[{}:{}] no detections",
pending_frame.source_name,
pending_frame.frame.frame_index,
)
async def log_performance() -> None:
while True:
await anyio.sleep(5)
async with scheduler_condition:
if all_sources_closed_and_idle(slots):
return
slot_snapshot = {
source_name: (
slot.received_frames,
slot.dropped_frames,
slot.processed_frames,
slot.published_frames,
)
for source_name, slot in slots.items()
}
per_source = " ".join(
(
f"[{source_name}]"
f" recv={received}"
f" drop={dropped}"
f" proc={processed}"
f" pub={published}"
)
for source_name, (received, dropped, processed, published) in slot_snapshot.items()
)
if (value := performance_sma.get()) is not None:
batch_size = batch_size_sma.get() or 1.0
logger.info(
"{:.2f}it/s ({:.2f}ms/frame) batch={:.2f} {}",
1 / value,
value * 1000,
batch_size,
per_source,
)
else:
logger.info("warming up {}", per_source)
try:
async with anyio.create_task_group() as task_group:
for source in sources:
task_group.start_soon(ingest_loop, source)
task_group.start_soon(log_performance)
await scheduler_loop()
task_group.cancel_scope.cancel()
finally:
await pose_sink.aclose()
@@ -0,0 +1,7 @@
from pose_tracking_exp.detection.sinks.nats import NatsPoseSink
from pose_tracking_exp.detection.sinks.parquet import ParquetPoseSink
__all__ = [
"NatsPoseSink",
"ParquetPoseSink",
]
@@ -0,0 +1,29 @@
from pose_tracking_exp.detection.cvmmap_payload import CvmmapPosePayloadCodec
from pose_tracking_exp.schema.detection import PoseDetections
class NatsPoseSink:
def __init__(self, nats_host: str) -> None:
self._nats_host = nats_host
self._client = None
self._codec = CvmmapPosePayloadCodec()
async def _client_or_connect(self):
if self._client is None:
from nats.aio.client import Client as NatsClient
client = NatsClient()
await client.connect(servers=[self._nats_host])
self._client = client
return self._client
async def publish_pose(self, detections: PoseDetections) -> None:
client = await self._client_or_connect()
payload = self._codec.encode(detections)
await client.publish(f"{detections.source_name}.pose", payload)
async def aclose(self) -> None:
if self._client is None:
return
await self._client.drain()
self._client = None
@@ -0,0 +1,51 @@
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
from pose_tracking_exp.common.detection_parquet import (
DETECTION_PARQUET_SCHEMA,
detection_parquet_path,
pose_detections_to_row,
)
from pose_tracking_exp.schema.detection import PoseDetections
class ParquetPoseSink:
def __init__(self, output_dir: Path, *, flush_rows: int = 64) -> None:
self._output_dir = output_dir
self._flush_rows = flush_rows
self._buffers: dict[str, list[dict[str, object]]] = {}
self._writers: dict[str, pq.ParquetWriter] = {}
self._output_dir.mkdir(parents=True, exist_ok=True)
def _writer_for(self, source_name: str) -> pq.ParquetWriter:
writer = self._writers.get(source_name)
if writer is not None:
return writer
path = detection_parquet_path(self._output_dir, source_name)
writer = pq.ParquetWriter(path, DETECTION_PARQUET_SCHEMA)
self._writers[source_name] = writer
return writer
def _flush_source(self, source_name: str) -> None:
rows = self._buffers.get(source_name)
if not rows:
return
table = pa.Table.from_pylist(rows, schema=DETECTION_PARQUET_SCHEMA)
self._writer_for(source_name).write_table(table)
rows.clear()
async def publish_pose(self, detections: PoseDetections) -> None:
rows = self._buffers.setdefault(detections.source_name, [])
rows.append(pose_detections_to_row(detections))
if len(rows) >= self._flush_rows:
self._flush_source(detections.source_name)
async def aclose(self) -> None:
for source_name in tuple(self._buffers):
self._flush_source(source_name)
for writer in self._writers.values():
writer.close()
self._writers.clear()
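The sink's write discipline is independent of parquet itself: rows accumulate per source, a full buffer triggers a batch write, and close flushes remainders. A pure-Python sketch of that pattern, with `BufferedWriter` as a hypothetical stand-in that records batches in memory instead of calling `pq.ParquetWriter`:

```python
class BufferedWriter:
    # Same flush discipline as ParquetPoseSink: batch per key, flush
    # when flush_rows is reached, flush remainders on close.
    def __init__(self, flush_rows: int = 64) -> None:
        self.flush_rows = flush_rows
        self.buffers: dict[str, list[int]] = {}
        self.written: dict[str, list[list[int]]] = {}
    def publish(self, key: str, row: int) -> None:
        rows = self.buffers.setdefault(key, [])
        rows.append(row)
        if len(rows) >= self.flush_rows:
            self._flush(key)
    def _flush(self, key: str) -> None:
        rows = self.buffers.get(key)
        if not rows:
            return
        self.written.setdefault(key, []).append(list(rows))
        rows.clear()
    def close(self) -> None:
        for key in tuple(self.buffers):
            self._flush(key)

writer = BufferedWriter(flush_rows=2)
for row in (1, 2, 3):
    writer.publish("cam0", row)
writer.close()
print(writer.written["cam0"])  # [[1, 2], [3]]
```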
@@ -0,0 +1,10 @@
from pose_tracking_exp.detection.sources.adapters import IteratorFrameSource
from pose_tracking_exp.detection.sources.cvmmap import CvmmapFrameSource
from pose_tracking_exp.detection.sources.video import VideoFrameSource, parse_video_input_specs
__all__ = [
"CvmmapFrameSource",
"IteratorFrameSource",
"VideoFrameSource",
"parse_video_input_specs",
]
@@ -0,0 +1,47 @@
from collections.abc import AsyncIterator, Callable, Iterator
from typing import Protocol
from anyio.to_thread import run_sync as to_thread_run_sync
from pose_tracking_exp.schema.detection import SourceFrame
class BlockingFrameProducer(Protocol):
source_name: str
def iter_frames(self) -> Iterator[SourceFrame]:
...
def _next_or_none(iterator: Iterator[SourceFrame]) -> SourceFrame | None:
return next(iterator, None)
class IteratorFrameSource:
def __init__(
self,
source_name: str,
iterator_factory: Callable[[], Iterator[SourceFrame]],
) -> None:
self.source_name = source_name
self._iterator_factory = iterator_factory
async def frames(self) -> AsyncIterator[SourceFrame]:
iterator = self._iterator_factory()
try:
while True:
frame = await to_thread_run_sync(_next_or_none, iterator)
if frame is None:
return
yield frame
finally:
close = getattr(iterator, "close", None)
if callable(close):
await to_thread_run_sync(close)
def wrap_blocking_source(producer: BlockingFrameProducer) -> IteratorFrameSource:
return IteratorFrameSource(
source_name=producer.source_name,
iterator_factory=producer.iter_frames,
)
@@ -0,0 +1,22 @@
from collections.abc import AsyncIterator
import numpy as np
from pose_tracking_exp.schema.detection import SourceFrame
class CvmmapFrameSource:
def __init__(self, source_name: str) -> None:
self.source_name = source_name
async def frames(self) -> AsyncIterator[SourceFrame]:
from cvmmap import CvMmapClient
client = CvMmapClient(self.source_name)
async for frame, meta in client:
yield SourceFrame(
source_name=self.source_name,
image_bgr=np.array(frame, copy=True),
frame_index=meta.frame_count,
timestamp_unix_ns=meta.timestamp_ns,
)
@@ -0,0 +1,83 @@
from collections.abc import AsyncIterator, Iterator, Sequence
from pathlib import Path
import click
import cv2
import numpy as np
from pose_tracking_exp.detection.sources.adapters import wrap_blocking_source
from pose_tracking_exp.schema.detection import SourceFrame
_DEFAULT_VIDEO_FPS = 30.0
def parse_video_input_specs(specs: Sequence[str]) -> tuple[tuple[str, Path], ...]:
inputs: list[tuple[str, Path]] = []
seen: set[str] = set()
for spec in specs:
source_name, separator, raw_path = spec.partition("=")
if separator == "" or not source_name or not raw_path:
raise click.ClickException(
f"Video input must be in source=path form, got: {spec!r}"
)
if source_name in seen:
raise click.ClickException(f"Duplicate video source requested: {source_name}")
path = Path(raw_path).expanduser().resolve()
if not path.exists():
raise click.ClickException(f"Missing video input: {path}")
inputs.append((source_name, path))
seen.add(source_name)
if not inputs:
raise click.ClickException("Provide at least one --input source=path entry.")
return tuple(inputs)
class VideoFrameSource:
def __init__(
self,
video_path: Path,
*,
source_name: str | None = None,
default_fps: float = _DEFAULT_VIDEO_FPS,
) -> None:
self.video_path = video_path
self.source_name = source_name or video_path.stem
self._default_fps = default_fps
self._adapter = wrap_blocking_source(self)
async def frames(self) -> AsyncIterator[SourceFrame]:
async for frame in self._adapter.frames():
yield frame
def iter_frames(self) -> Iterator[SourceFrame]:
capture = cv2.VideoCapture(str(self.video_path))
if not capture.isOpened():
capture.release()
raise click.ClickException(f"Could not open video input: {self.video_path}")
fps = float(capture.get(cv2.CAP_PROP_FPS))
if not np.isfinite(fps) or fps <= 0:
fps = self._default_fps
frame_index = 0
try:
while True:
success, frame = capture.read()
if not success or frame is None:
return
pos_msec = float(capture.get(cv2.CAP_PROP_POS_MSEC))
if np.isfinite(pos_msec) and (pos_msec > 0.0 or frame_index == 0):
timestamp_unix_ns = int(round(pos_msec * 1_000_000.0))
else:
timestamp_unix_ns = int(round((frame_index / fps) * 1_000_000_000.0))
yield SourceFrame(
source_name=self.source_name,
image_bgr=np.ascontiguousarray(frame),
frame_index=frame_index,
timestamp_unix_ns=timestamp_unix_ns,
)
frame_index += 1
finally:
capture.release()
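The timestamp logic above trusts `CAP_PROP_POS_MSEC` only when it is finite and advancing (or on the first frame), and otherwise synthesizes a timestamp from the frame index and the (possibly default) fps. A standalone sketch of that fallback, with `frame_timestamp_ns` as an illustrative helper name:

```python
import math

def frame_timestamp_ns(pos_msec: float, frame_index: int, fps: float) -> int:
    # Mirrors the fallback in VideoFrameSource.iter_frames.
    if math.isfinite(pos_msec) and (pos_msec > 0.0 or frame_index == 0):
        return int(round(pos_msec * 1_000_000.0))  # container time, ms -> ns
    return int(round((frame_index / fps) * 1_000_000_000.0))  # synthesized

print(frame_timestamp_ns(33.3667, 1, 30.0))       # 33366700 (from container)
print(frame_timestamp_ns(float("nan"), 2, 30.0))  # 66666667 (synthesized)
```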
@@ -0,0 +1,263 @@
from contextlib import contextmanager
from collections.abc import Sequence
from pathlib import Path
from typing import Any, cast
import cv2
import numpy as np
from pose_tracking_exp.detection.protocols import ObjectDetector, PoseEstimator
from pose_tracking_exp.schema.detection import BoxDetections, PoseBatchRequest, PoseDetections, SourceFrame
COCO_PERSON_CLASS_ID = 0
class YoloObjectDetector:
def __init__(
self,
checkpoint: Path,
*,
device: str,
conf_threshold: float,
max_batch_frames: int,
) -> None:
import ultralytics
yolo_ctor = getattr(ultralytics, "YOLO")
self._model: Any = yolo_ctor(str(checkpoint))
self._device = device
self._conf_threshold = conf_threshold
self._max_batch_frames = max_batch_frames
def detect_many(
self,
frames_rgb: Sequence[np.ndarray],
*,
classes: Sequence[int] | None = None,
) -> list[BoxDetections]:
if not frames_rgb:
return []
frames_list = list(frames_rgb)
results = self._model(
frames_list,
conf=self._conf_threshold,
device=self._device,
classes=classes,
batch=min(self._max_batch_frames, len(frames_list)),
verbose=False,
)
detections: list[BoxDetections] = []
for frame_rgb, result in zip(frames_list, results, strict=True):
boxes = result.boxes
if boxes is None:
detections.append(
BoxDetections(
boxes_xyxy=np.empty((0, 4), dtype=np.float32),
scores=np.empty((0,), dtype=np.float32),
reference_frame_shape=(frame_rgb.shape[0], frame_rgb.shape[1]),
)
)
continue
detections.append(
BoxDetections(
boxes_xyxy=boxes.xyxy.cpu().numpy(),
scores=boxes.conf.cpu().numpy(),
reference_frame_shape=(frame_rgb.shape[0], frame_rgb.shape[1]),
)
)
return detections
@contextmanager
def legacy_torch_checkpoint_loading():
import torch
original_torch_load = torch.load
def patched_torch_load(*args, **kwargs):
kwargs.setdefault("weights_only", False)
return original_torch_load(*args, **kwargs)
torch.load = patched_torch_load
try:
yield
finally:
torch.load = original_torch_load
class WholeBodyPoseEstimator:
def __init__(self, config_path: Path, checkpoint_path: Path, *, device: str) -> None:
from mmengine.dataset import Compose, pseudo_collate
from mmengine.registry import init_default_scope
from mmpose.apis import init_model
self._compose = Compose
self._pseudo_collate = pseudo_collate
self._init_default_scope = init_default_scope
with legacy_torch_checkpoint_loading():
self._model: Any = init_model(str(config_path), str(checkpoint_path), device=device)
model_cfg = cast(Any, self._model.cfg)
self._scope = cast(str | None, model_cfg.get("default_scope", "mmpose"))
self._pipeline = self._compose(cast(Any, model_cfg.test_dataloader.dataset.pipeline))
def estimate_batch(
self,
requests: Sequence[PoseBatchRequest],
) -> list[tuple[np.ndarray, np.ndarray]]:
import torch
if not requests:
return []
if self._scope is not None:
self._init_default_scope(self._scope)
torch_module = cast(Any, torch)
data_list = []
detection_counts: list[int] = []
for request in requests:
boxes = np.asarray(request.boxes_xyxy, dtype=np.float32)
detections = int(boxes.shape[0])
detection_counts.append(detections)
for bbox in boxes:
data_info = {
"img": request.image_rgb,
"bbox": bbox[None],
"bbox_score": np.ones(1, dtype=np.float32),
}
data_info.update(cast(Any, self._model.dataset_meta))
data_list.append(self._pipeline(data_info))
samples = []
if data_list:
batch = self._pseudo_collate(data_list)
with torch_module.no_grad():
samples = self._model.test_step(batch)
outputs: list[tuple[np.ndarray, np.ndarray]] = []
offset = 0
for detections in detection_counts:
keypoints = np.zeros((detections, 133, 2), dtype=np.float32)
scores = np.zeros((detections, 133), dtype=np.float32)
for index in range(detections):
pred_instances = samples[offset + index].pred_instances
try:
keypoints[index] = np.asarray(pred_instances.keypoints[0], dtype=np.float32)
scores[index] = np.asarray(
pred_instances.keypoint_scores[0],
dtype=np.float32,
)
except IndexError:
continue
outputs.append((keypoints, scores))
offset += detections
return outputs
class YoloRtmposeShim:
def __init__(
self,
object_detector: ObjectDetector,
pose_estimator: PoseEstimator,
*,
bbox_area_threshold: int,
) -> None:
self._object_detector = object_detector
self._pose_estimator = pose_estimator
self._bbox_area_threshold = bbox_area_threshold
def process_many(self, frames: Sequence[SourceFrame]) -> list[PoseDetections]:
if not frames:
return []
frames_rgb = [
cv2.cvtColor(frame.image_bgr, cv2.COLOR_BGR2RGB)
for frame in frames
]
detections = self._object_detector.detect_many(
frames_rgb,
classes=[COCO_PERSON_CLASS_ID],
)
results = [
PoseDetections(
source_name=frame.source_name,
frame_index=frame.frame_index,
source_size=(frame.image_bgr.shape[1], frame.image_bgr.shape[0]),
boxes_xyxy=np.empty((0, 4), dtype=np.float32),
box_scores=np.empty((0,), dtype=np.float32),
keypoints_xy=np.empty((0, 133, 2), dtype=np.float32),
keypoint_scores=np.empty((0, 133), dtype=np.float32),
timestamp_unix_ns=frame.timestamp_unix_ns,
keypoint_schema="coco_wholebody133",
)
for frame in frames
]
pose_requests: list[PoseBatchRequest] = []
detection_mapping: list[tuple[int, BoxDetections]] = []
for index, (frame, frame_rgb, detection_result) in enumerate(
zip(frames, frames_rgb, detections, strict=True)
):
filtered_result = detection_result.filter_by_area(self._bbox_area_threshold)
if filtered_result.boxes_num == 0:
continue
pose_requests.append(
PoseBatchRequest(
image_rgb=frame_rgb,
boxes_xyxy=filtered_result.boxes_xyxy,
)
)
detection_mapping.append((index, filtered_result))
pose_outputs = self._pose_estimator.estimate_batch(pose_requests)
for (frame_index, detection_result), (keypoints, keypoint_scores) in zip(
detection_mapping,
pose_outputs,
strict=True,
):
source_frame = frames[frame_index]
results[frame_index] = PoseDetections(
source_name=source_frame.source_name,
frame_index=source_frame.frame_index,
source_size=(source_frame.image_bgr.shape[1], source_frame.image_bgr.shape[0]),
boxes_xyxy=detection_result.boxes_xyxy,
box_scores=detection_result.scores,
keypoints_xy=keypoints,
keypoint_scores=keypoint_scores,
timestamp_unix_ns=source_frame.timestamp_unix_ns,
keypoint_schema="coco_wholebody133",
)
return results
def build_yolo_rtmpose_shim(
*,
yolo_checkpoint: Path,
yolo_conf_threshold: float,
pose_checkpoint: Path,
pose_config_path: Path,
device: str,
max_batch_frames: int,
bbox_area_threshold: int,
) -> YoloRtmposeShim:
object_detector = YoloObjectDetector(
yolo_checkpoint,
device=device,
conf_threshold=yolo_conf_threshold,
max_batch_frames=max_batch_frames,
)
pose_estimator = WholeBodyPoseEstimator(
pose_config_path,
pose_checkpoint,
device=device,
)
return YoloRtmposeShim(
object_detector,
pose_estimator,
bbox_area_threshold=bbox_area_threshold,
)
@@ -1,224 +0,0 @@
from dataclasses import dataclass, field
from pathlib import Path
from typing import Literal
import cv2
import numpy as np
from pose_tracking_exp.tensor_types import Matrix3, Pose2D, Pose3D, Vector3
@dataclass(slots=True)
class CameraCalibration:
name: str
width: int
height: int
K: Matrix3
DC: np.ndarray
# Canonical in-repo convention: OpenCV world->camera extrinsics.
R: Matrix3
T: Vector3
model: str = "pinhole"
rvec: np.ndarray | None = None
pose_R: Matrix3 = field(init=False)
pose_T: Vector3 = field(init=False)
def __post_init__(self) -> None:
self.K = np.asarray(self.K, dtype=np.float64).reshape(3, 3)
self.DC = np.asarray(self.DC, dtype=np.float64).reshape(-1)
self.R = np.asarray(self.R, dtype=np.float64).reshape(3, 3)
self.T = np.asarray(self.T, dtype=np.float64).reshape(3)
if self.rvec is None:
rvec, _ = cv2.Rodrigues(self.R)
self.rvec = np.asarray(rvec, dtype=np.float64).reshape(3)
else:
self.rvec = np.asarray(self.rvec, dtype=np.float64).reshape(3)
self.pose_R = self.R.T
self.pose_T = -(self.pose_R @ self.T)
@classmethod
def from_opencv_extrinsics(
cls,
*,
name: str,
width: int,
height: int,
K: Matrix3,
DC: np.ndarray,
R: Matrix3,
T: Vector3,
model: str = "pinhole",
rvec: np.ndarray | None = None,
) -> "CameraCalibration":
return cls(
name=name,
width=width,
height=height,
K=K,
DC=DC,
R=R,
T=T,
model=model,
rvec=rvec,
)
@classmethod
def from_rpt_pose(
cls,
*,
name: str,
width: int,
height: int,
K: Matrix3,
DC: np.ndarray,
R: Matrix3,
T: Vector3,
model: str = "pinhole",
) -> "CameraCalibration":
pose_R = np.asarray(R, dtype=np.float64).reshape(3, 3)
pose_T = np.asarray(T, dtype=np.float64).reshape(3)
rotation = pose_R.T
translation = -(rotation @ pose_T)
rvec, _ = cv2.Rodrigues(rotation)
return cls(
name=name,
width=width,
height=height,
K=K,
DC=DC,
R=rotation,
T=translation,
model=model,
rvec=np.asarray(rvec, dtype=np.float64).reshape(3),
)
@dataclass(slots=True)
class SceneConfig:
room_size: Vector3
room_center: Vector3
cameras: tuple[CameraCalibration, ...]
@dataclass(slots=True)
class PoseDetection:
bbox: np.ndarray
bbox_confidence: float
keypoints: Pose2D
@dataclass(slots=True)
class CameraFrame:
camera_name: str
frame_index: int
timestamp_unix_ns: int
detections: tuple[PoseDetection, ...]
source_size: tuple[int, int]
@dataclass(slots=True)
class FrameBundle:
bundle_index: int
timestamp_unix_ns: int
views: tuple[CameraFrame, ...]
@dataclass(slots=True)
class ReplaySequence:
scene_path: Path
replay_path: Path
frames_by_camera: dict[str, list[CameraFrame]]
@dataclass(slots=True)
class ProposalCluster:
pose3d: Pose3D
root: Vector3
source_views: frozenset[str]
support_size: int
mean_score: float
@dataclass(slots=True)
class SkeletonState:
parameters: np.ndarray
beta: np.ndarray
pose3d: Pose3D
@dataclass(slots=True)
class TentativeTrackState:
track_id: int
state: Literal["tentative"] = "tentative"
age: int = 0
misses: int = 0
score: float = 0.0
last_bundle_index: int = -1
root: Vector3 = field(default_factory=lambda: np.zeros(3, dtype=np.float64))
pose3d: Pose3D = field(default_factory=lambda: np.zeros((20, 4), dtype=np.float64))
evidence_buffer: list[Pose3D] = field(default_factory=list)
@dataclass(slots=True)
class ActiveTrackState:
track_id: int
status: Literal["active", "lost"] = "active"
misses: int = 0
lost_age: int = 0
score: float = 0.0
last_bundle_index: int = -1
skeleton: SkeletonState = field(
default_factory=lambda: SkeletonState(
parameters=np.zeros(31, dtype=np.float64),
beta=np.ones(8, dtype=np.float64),
pose3d=np.zeros((20, 4), dtype=np.float64),
)
)
noise_scale: np.ndarray = field(default_factory=lambda: np.full((20,), 9.0, dtype=np.float64))
TrackState = TentativeTrackState | ActiveTrackState
@dataclass(slots=True)
class TrackedFrameResult:
bundle_index: int
timestamp_unix_ns: int
tentative_tracks: tuple[TentativeTrackState, ...]
active_tracks: tuple[ActiveTrackState, ...]
lost_tracks: tuple[ActiveTrackState, ...]
proposals: tuple[ProposalCluster, ...]
@dataclass(slots=True)
class TrackerDiagnostics:
match_existing_calls: int = 0
match_existing_seconds: float = 0.0
proposal_build_calls: int = 0
proposal_build_seconds: float = 0.0
promotions: int = 0
reacquisitions: int = 0
active_updates: int = 0
seed_initializations: int = 0
nonlinear_refinements: int = 0
@dataclass(slots=True)
class TrackerConfig:
mode: Literal["general", "single_person"] = "general"
min_bundle_views: int = 2
max_sync_skew_ns: int = 12_000_000
tentative_buffer_size: int = 5
tentative_min_age: int = 3
tentative_hits_required: int = 3
tentative_promote_score: float = 3.0
tentative_max_misses: int = 2
active_min_views: int = 2
active_core_gate_px: float = 80.0
active_joint_gate_px: float = 120.0
active_miss_to_lost: int = 3
lost_delete_age: int = 15
proposal_match_distance_m: float = 0.45
noise_ema: float = 0.85
proposal_min_score: float = 0.9
proposal_min_group_size: int = 1
@@ -1,147 +0,0 @@
import base64
import json
from dataclasses import dataclass
import numpy as np
from beartype import beartype
from pose_tracking_exp.models import CameraFrame, PoseDetection
from pose_tracking_exp.normalization import normalize_rtmpose_body20
PROTOCOL_HEADER = bytes([0x80]) + b"POSE"
POSE_JOINT_COUNT = 133
@dataclass(slots=True)
class DecodedPosePayload:
frame_index: int
reference_size: tuple[int, int]
timestamp_unix_ns: int
detections: tuple[PoseDetection, ...]
def _read_u8(payload: memoryview, offset: int) -> tuple[int, int]:
return int(payload[offset]), offset + 1
def _read_u16_array(payload: memoryview, offset: int, count: int) -> tuple[np.ndarray, int]:
size = count * 2
array = np.frombuffer(payload[offset : offset + size], dtype="<u2", count=count).astype(np.float64)
return array, offset + size
@beartype
def decode_pose_payload(payload: bytes) -> DecodedPosePayload:
if not payload.startswith(PROTOCOL_HEADER):
raise ValueError("Invalid ParaJumping pose payload header.")
view = memoryview(payload)
offset = len(PROTOCOL_HEADER)
frame_index = int.from_bytes(view[offset : offset + 4], "little")
offset += 4
reference_size = tuple(int(x) for x in np.frombuffer(view[offset : offset + 4], dtype="<u2", count=2))
offset += 4
num_bbox = int(view[offset])
offset += 1
bbox_raw, offset = _read_u16_array(view, offset, num_bbox * 4)
bboxes = bbox_raw.reshape(num_bbox, 4) if num_bbox > 0 else np.zeros((0, 4), dtype=np.float64)
num_bbox_conf = int(view[offset])
offset += 1
bbox_confidence = np.frombuffer(view[offset : offset + num_bbox_conf], dtype=np.uint8, count=num_bbox_conf)
offset += num_bbox_conf
num_keypoints = int(view[offset])
offset += 1
keypoints_raw, offset = _read_u16_array(view, offset, num_keypoints * POSE_JOINT_COUNT * 2)
keypoints_xy = (
keypoints_raw.reshape(num_keypoints, POSE_JOINT_COUNT, 2)
if num_keypoints > 0
else np.zeros((0, POSE_JOINT_COUNT, 2), dtype=np.float64)
)
num_keypoint_conf = int(view[offset])
offset += 1
keypoint_confidence = (
np.frombuffer(view[offset : offset + num_keypoint_conf], dtype=np.uint8, count=num_keypoint_conf).astype(np.float64)
/ 255.0
)
offset += num_keypoint_conf
timestamp_unix_ns = int.from_bytes(view[offset : offset + 8], "little")
if num_keypoint_conf > 0 and num_keypoint_conf != num_keypoints * POSE_JOINT_COUNT:
raise ValueError("Unexpected keypoint confidence payload length.")
detection_items: list[PoseDetection] = []
confidences = (
keypoint_confidence.reshape(num_keypoints, POSE_JOINT_COUNT)
if num_keypoints > 0
else np.zeros((0, POSE_JOINT_COUNT), dtype=np.float64)
)
for index in range(num_keypoints):
normalized = normalize_rtmpose_body20(keypoints_xy[index], confidences[index])
bbox_score = float(bbox_confidence[index] / 255.0) if index < bbox_confidence.shape[0] else 0.0
bbox = bboxes[index] if index < bboxes.shape[0] else np.zeros(4, dtype=np.float64)
detection_items.append(
PoseDetection(
bbox=np.asarray(bbox, dtype=np.float64),
bbox_confidence=bbox_score,
keypoints=np.asarray(normalized, dtype=np.float64),
)
)
return DecodedPosePayload(
frame_index=frame_index,
reference_size=(reference_size[0], reference_size[1]),
timestamp_unix_ns=timestamp_unix_ns,
detections=tuple(detection_items),
)
@beartype
def frame_from_payload(camera_name: str, payload: bytes) -> CameraFrame:
decoded = decode_pose_payload(payload)
return CameraFrame(
camera_name=camera_name,
frame_index=decoded.frame_index,
timestamp_unix_ns=decoded.timestamp_unix_ns,
detections=decoded.detections,
source_size=decoded.reference_size,
)
@beartype
def convert_payload_record(record: dict[str, object]) -> dict[str, object]:
camera_name = str(record["camera"])
payload_b64 = str(record["payload_b64"])
frame = frame_from_payload(camera_name, base64.b64decode(payload_b64))
return {
"camera": frame.camera_name,
"frame_index": frame.frame_index,
"timestamp_unix_ns": frame.timestamp_unix_ns,
"source_size": list(frame.source_size),
"detections": [
{
"bbox": detection.bbox.tolist(),
"bbox_confidence": detection.bbox_confidence,
"keypoints": detection.keypoints.tolist(),
}
for detection in frame.detections
],
}
@beartype
def convert_payload_jsonl_lines(lines: list[str]) -> list[str]:
output_lines: list[str] = []
for line in lines:
if not line.strip():
continue
record = json.loads(line)
converted = convert_payload_record(record)
output_lines.append(json.dumps(converted))
return output_lines
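The byte-level conventions the decoder relies on (confidences stored as `u8` and rescaled by 255, timestamps as little-endian unsigned 64-bit integers) can be checked in isolation. This is an illustrative sketch, independent of the payload framing above:

```python
import numpy as np

# u8 confidence bytes rescale to [0.0, 1.0] by dividing by 255, as in the decoder.
raw_conf = np.frombuffer(bytes([0, 128, 255]), dtype=np.uint8)
scaled = raw_conf.astype(np.float64) / 255.0
assert scaled[0] == 0.0 and scaled[2] == 1.0

# Timestamps are little-endian u64; to_bytes/from_bytes round-trips losslessly.
timestamp_unix_ns = 1_700_000_000_123_456_789
encoded = timestamp_unix_ns.to_bytes(8, "little")
assert int.from_bytes(encoded, "little") == timestamp_unix_ns
```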
-108
View File
@@ -1,108 +0,0 @@
import json
from pathlib import Path
import numpy as np
from beartype import beartype
from pose_tracking_exp.models import CameraCalibration, CameraFrame, PoseDetection, ReplaySequence, SceneConfig
_OPENCV_EXTRINSICS = "opencv_world_to_camera"
_RPT_POSE = "rpt_camera_pose"
def _as_float_array(values: object, shape: tuple[int, ...]) -> np.ndarray:
array = np.asarray(values, dtype=np.float64)
if array.shape != shape:
raise ValueError(f"Expected shape {shape}, got {array.shape}.")
return array
@beartype
def load_scene_file(path: Path) -> SceneConfig:
payload = json.loads(path.read_text(encoding="utf-8"))
default_extrinsic_format = str(payload.get("extrinsic_format", _OPENCV_EXTRINSICS))
cameras: list[CameraCalibration] = []
for camera_payload in payload["cameras"]:
extrinsic_format = str(camera_payload.get("extrinsic_format", default_extrinsic_format))
name = str(camera_payload["name"])
width = int(camera_payload["width"])
height = int(camera_payload["height"])
K = _as_float_array(camera_payload["K"], (3, 3))
DC = np.asarray(camera_payload.get("DC", [0.0, 0.0, 0.0, 0.0, 0.0]), dtype=np.float64)
R = _as_float_array(camera_payload["R"], (3, 3))
T = _as_float_array(camera_payload["T"], (3, 1)).reshape(3)
model = str(camera_payload.get("model", "pinhole"))
if extrinsic_format == _OPENCV_EXTRINSICS:
cameras.append(
CameraCalibration.from_opencv_extrinsics(
name=name,
width=width,
height=height,
K=K,
DC=DC,
R=R,
T=T,
model=model,
rvec=np.asarray(camera_payload["rvec"], dtype=np.float64).reshape(3)
if "rvec" in camera_payload
else None,
)
)
elif extrinsic_format == _RPT_POSE:
cameras.append(
CameraCalibration.from_rpt_pose(
name=name,
width=width,
height=height,
K=K,
DC=DC,
R=R,
T=T,
model=model,
)
)
else:
raise ValueError(
f"Unsupported extrinsic format {extrinsic_format!r}. "
f"Expected {_OPENCV_EXTRINSICS!r} or {_RPT_POSE!r}."
)
return SceneConfig(
room_size=_as_float_array(payload["room_size"], (3,)),
room_center=_as_float_array(payload["room_center"], (3,)),
cameras=tuple(cameras),
)
@beartype
def load_replay_file(scene_path: Path, replay_path: Path) -> ReplaySequence:
frames_by_camera: dict[str, list[CameraFrame]] = {}
for raw_line in replay_path.read_text(encoding="utf-8").splitlines():
if not raw_line.strip():
continue
payload = json.loads(raw_line)
camera_name = str(payload["camera"])
detections: list[PoseDetection] = []
for detection_payload in payload["detections"]:
detections.append(
PoseDetection(
bbox=np.asarray(detection_payload["bbox"], dtype=np.float64),
bbox_confidence=float(detection_payload["bbox_confidence"]),
keypoints=np.asarray(detection_payload["keypoints"], dtype=np.float64),
)
)
frames_by_camera.setdefault(camera_name, []).append(
CameraFrame(
camera_name=camera_name,
frame_index=int(payload["frame_index"]),
timestamp_unix_ns=int(payload["timestamp_unix_ns"]),
detections=tuple(detections),
source_size=(
int(payload["source_size"][0]),
int(payload["source_size"][1]),
),
)
)
for frames in frames_by_camera.values():
frames.sort(key=lambda item: (item.timestamp_unix_ns, item.frame_index))
return ReplaySequence(scene_path=scene_path, replay_path=replay_path, frames_by_camera=frames_by_camera)
+50
View File
@@ -0,0 +1,50 @@
from pose_tracking_exp.schema.camera import (
CameraCalibration,
CameraModel,
PINHOLE_CAMERA_MODEL,
SceneConfig,
parse_camera_model,
)
from pose_tracking_exp.schema.detection import (
BoxDetections,
CocoKeypointSchema,
PoseBatchRequest,
PoseDetections,
SourceFrame,
)
from pose_tracking_exp.schema.observation import CameraFrame, FrameBundle, PoseDetection, ReplaySequence
from pose_tracking_exp.schema.tracking import (
ActiveTrackState,
ProposalCluster,
SkeletonState,
TentativeTrackState,
TrackState,
TrackedFrameResult,
TrackerConfig,
TrackerDiagnostics,
)
__all__ = [
"ActiveTrackState",
"BoxDetections",
"CameraCalibration",
"CameraFrame",
"CameraModel",
"CocoKeypointSchema",
"FrameBundle",
"PINHOLE_CAMERA_MODEL",
"PoseBatchRequest",
"PoseDetection",
"PoseDetections",
"ProposalCluster",
"ReplaySequence",
"SceneConfig",
"SkeletonState",
"SourceFrame",
"TentativeTrackState",
"TrackState",
"TrackedFrameResult",
"TrackerConfig",
"TrackerDiagnostics",
"parse_camera_model",
]
+106
View File
@@ -0,0 +1,106 @@
from dataclasses import dataclass, field
from typing import Literal
import cv2
import numpy as np
from pose_tracking_exp.common.tensor_types import Matrix3, Vector3
CameraModel = Literal["pinhole"]
PINHOLE_CAMERA_MODEL: CameraModel = "pinhole"
def parse_camera_model(model: str) -> CameraModel:
if model != PINHOLE_CAMERA_MODEL:
raise ValueError(
f"Unsupported camera model {model!r}. Expected {PINHOLE_CAMERA_MODEL!r}."
)
return PINHOLE_CAMERA_MODEL
@dataclass(slots=True)
class CameraCalibration:
name: str
width: int
height: int
K: Matrix3
DC: np.ndarray
R: Matrix3
T: Vector3
model: CameraModel = PINHOLE_CAMERA_MODEL
rvec: np.ndarray | None = None
pose_R: Matrix3 = field(init=False)
pose_T: Vector3 = field(init=False)
def __post_init__(self) -> None:
self.K = np.asarray(self.K, dtype=np.float64).reshape(3, 3)
self.DC = np.asarray(self.DC, dtype=np.float64).reshape(-1)
self.R = np.asarray(self.R, dtype=np.float64).reshape(3, 3)
self.T = np.asarray(self.T, dtype=np.float64).reshape(3)
self.model = parse_camera_model(self.model)
if self.rvec is None:
rvec, _ = cv2.Rodrigues(self.R)
self.rvec = np.asarray(rvec, dtype=np.float64).reshape(3)
else:
self.rvec = np.asarray(self.rvec, dtype=np.float64).reshape(3)
self.pose_R = self.R.T
self.pose_T = -(self.pose_R @ self.T)
@staticmethod
def from_opencv_extrinsics(
name: str,
width: int,
height: int,
K: Matrix3,
DC: np.ndarray,
R: Matrix3,
T: Vector3,
model: CameraModel = PINHOLE_CAMERA_MODEL,
rvec: np.ndarray | None = None,
) -> "CameraCalibration":
return CameraCalibration(
name=name,
width=width,
height=height,
K=K,
DC=DC,
R=R,
T=T,
model=model,
rvec=rvec,
)
@staticmethod
def from_rpt_pose(
name: str,
width: int,
height: int,
K: Matrix3,
DC: np.ndarray,
R: Matrix3,
T: Vector3,
model: CameraModel = PINHOLE_CAMERA_MODEL,
) -> "CameraCalibration":
pose_R = np.asarray(R, dtype=np.float64).reshape(3, 3)
pose_T = np.asarray(T, dtype=np.float64).reshape(3)
rotation = pose_R.T
translation = -(rotation @ pose_T)
rvec, _ = cv2.Rodrigues(rotation)
return CameraCalibration(
name=name,
width=width,
height=height,
K=K,
DC=DC,
R=rotation,
T=translation,
model=model,
rvec=np.asarray(rvec, dtype=np.float64).reshape(3),
)
@dataclass(slots=True)
class SceneConfig:
room_size: Vector3
room_center: Vector3
cameras: tuple[CameraCalibration, ...]
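The two extrinsic conventions in `CameraCalibration` differ only by an inversion: given an OpenCV world-to-camera pair `(R, T)`, the camera pose is `(R.T, -R.T @ T)`, and applying the same inversion again recovers the original pair. A minimal numpy check of that round trip, using a randomly generated rotation rather than real calibration data:

```python
import numpy as np

rng = np.random.default_rng(0)
# Build a valid rotation via QR decomposition and pick an arbitrary translation.
Q, _ = np.linalg.qr(rng.normal(size=(3, 3)))
R = Q * np.sign(np.linalg.det(Q))  # force det(R) == +1
T = rng.normal(size=3)

# world->camera (R, T) -> camera pose (pose_R, pose_T), as in __post_init__.
pose_R = R.T
pose_T = -(pose_R @ T)

# Inverting the pose again, as in from_rpt_pose, recovers the original pair.
assert np.allclose(pose_R.T, R)
assert np.allclose(-(pose_R.T @ pose_T), T)
```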
+116
View File
@@ -0,0 +1,116 @@
"""Shared 2D detection schema.
`coco_wholebody133` matches the COCO-WholeBody dataset terminology used by
MMPose and the official dataset repo. The first 17 joints follow the standard
COCO body ordering, so it is body-compatible with `coco17`.
References:
- https://mmpose.readthedocs.io/en/latest/dataset_zoo/2d_wholebody_keypoint.html
- https://github.com/jin-s13/COCO-WholeBody
"""
from dataclasses import dataclass
from typing import Literal
import numpy as np
CocoKeypointSchema = Literal["coco17", "coco_wholebody133"]
def expected_keypoint_count(schema: CocoKeypointSchema) -> int:
if schema == "coco17":
return 17
return 133
@dataclass(slots=True)
class SourceFrame:
source_name: str
image_bgr: np.ndarray
frame_index: int
timestamp_unix_ns: int
@dataclass(slots=True)
class BoxDetections:
boxes_xyxy: np.ndarray
scores: np.ndarray
reference_frame_shape: tuple[int, int]
@property
def reference_size(self) -> tuple[int, int]:
return (self.reference_frame_shape[1], self.reference_frame_shape[0])
@property
def boxes_num(self) -> int:
return int(self.boxes_xyxy.shape[0])
def filter_by_area(self, area_threshold: int) -> "BoxDetections":
if area_threshold <= 0:
raise ValueError("Area threshold must be positive.")
areas = np.abs(
(self.boxes_xyxy[:, 2] - self.boxes_xyxy[:, 0])
* (self.boxes_xyxy[:, 3] - self.boxes_xyxy[:, 1])
)
mask = areas >= area_threshold
return BoxDetections(
boxes_xyxy=self.boxes_xyxy[mask],
scores=self.scores[mask],
reference_frame_shape=self.reference_frame_shape,
)
@dataclass(slots=True)
class PoseBatchRequest:
image_rgb: np.ndarray
boxes_xyxy: np.ndarray
@dataclass(slots=True)
class PoseDetections:
source_name: str
frame_index: int
source_size: tuple[int, int]
boxes_xyxy: np.ndarray
box_scores: np.ndarray | None
keypoints_xy: np.ndarray
keypoint_scores: np.ndarray | None
timestamp_unix_ns: int
keypoint_schema: CocoKeypointSchema = "coco_wholebody133"
def validate(self) -> None:
if self.boxes_xyxy.ndim != 2 or self.boxes_xyxy.shape[1] != 4:
raise ValueError(
f"Expected boxes with shape (N, 4), got {self.boxes_xyxy.shape}."
)
if self.keypoints_xy.ndim != 3 or self.keypoints_xy.shape[2] != 2:
raise ValueError(
"Expected keypoints with shape (N, K, 2), "
f"got {self.keypoints_xy.shape}."
)
expected_count = expected_keypoint_count(self.keypoint_schema)
if self.keypoints_xy.shape[1] != expected_count:
raise ValueError(
f"Expected {self.keypoint_schema} keypoints with {expected_count} joints, "
f"got {self.keypoints_xy.shape[1]}."
)
detection_count = int(self.keypoints_xy.shape[0])
if self.boxes_xyxy.shape[0] != detection_count:
raise ValueError(
"Expected box and keypoint detection counts to match, "
f"got {self.boxes_xyxy.shape[0]} and {detection_count}."
)
if self.box_scores is not None and self.box_scores.shape != (detection_count,):
raise ValueError(
f"Expected box scores with shape ({detection_count},), got {self.box_scores.shape}."
)
if self.keypoint_scores is not None and self.keypoint_scores.shape != (
detection_count,
expected_count,
):
raise ValueError(
"Expected keypoint scores with shape "
f"({detection_count}, {expected_count}), got {self.keypoint_scores.shape}."
)
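`filter_by_area` reduces to a boolean mask over `|x2 - x1| * |y2 - y1|`. A standalone sketch of the same computation, with hypothetical boxes rather than detector output:

```python
import numpy as np

boxes_xyxy = np.array(
    [[0.0, 0.0, 10.0, 10.0],   # area 100
     [5.0, 5.0, 7.0, 8.0],     # area 6
     [0.0, 0.0, 20.0, 5.0]],   # area 100
)
scores = np.array([0.9, 0.4, 0.7])

# Same area formula as BoxDetections.filter_by_area.
areas = np.abs(
    (boxes_xyxy[:, 2] - boxes_xyxy[:, 0]) * (boxes_xyxy[:, 3] - boxes_xyxy[:, 1])
)
mask = areas >= 50
kept_boxes, kept_scores = boxes_xyxy[mask], scores[mask]
assert kept_boxes.shape == (2, 4)
```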
@@ -0,0 +1,36 @@
from dataclasses import dataclass
from pathlib import Path
import numpy as np
from pose_tracking_exp.common.tensor_types import Pose2D
@dataclass(slots=True)
class PoseDetection:
bbox: np.ndarray
bbox_confidence: float
keypoints: Pose2D
@dataclass(slots=True)
class CameraFrame:
camera_name: str
frame_index: int
timestamp_unix_ns: int
detections: tuple[PoseDetection, ...]
source_size: tuple[int, int]
@dataclass(slots=True)
class FrameBundle:
bundle_index: int
timestamp_unix_ns: int
views: tuple[CameraFrame, ...]
@dataclass(slots=True)
class ReplaySequence:
scene_path: Path
replay_path: Path
frames_by_camera: dict[str, list[CameraFrame]]
+102
View File
@@ -0,0 +1,102 @@
from dataclasses import dataclass, field
from typing import Literal
import numpy as np
from pose_tracking_exp.common.tensor_types import Pose3D, Vector3
@dataclass(slots=True)
class ProposalCluster:
pose3d: Pose3D
root: Vector3
source_views: frozenset[str]
support_size: int
mean_score: float
@dataclass(slots=True)
class SkeletonState:
parameters: np.ndarray
beta: np.ndarray
pose3d: Pose3D
@dataclass(slots=True)
class TentativeTrackState:
track_id: int
state: Literal["tentative"] = "tentative"
age: int = 0
misses: int = 0
score: float = 0.0
last_bundle_index: int = -1
root: Vector3 = field(default_factory=lambda: np.zeros(3, dtype=np.float64))
pose3d: Pose3D = field(default_factory=lambda: np.zeros((20, 4), dtype=np.float64))
evidence_buffer: list[Pose3D] = field(default_factory=list)
@dataclass(slots=True)
class ActiveTrackState:
track_id: int
status: Literal["active", "lost"] = "active"
misses: int = 0
lost_age: int = 0
score: float = 0.0
last_bundle_index: int = -1
skeleton: SkeletonState = field(
default_factory=lambda: SkeletonState(
parameters=np.zeros(31, dtype=np.float64),
beta=np.ones(8, dtype=np.float64),
pose3d=np.zeros((20, 4), dtype=np.float64),
)
)
noise_scale: np.ndarray = field(
default_factory=lambda: np.full((20,), 9.0, dtype=np.float64)
)
TrackState = TentativeTrackState | ActiveTrackState
@dataclass(slots=True)
class TrackedFrameResult:
bundle_index: int
timestamp_unix_ns: int
tentative_tracks: tuple[TentativeTrackState, ...]
active_tracks: tuple[ActiveTrackState, ...]
lost_tracks: tuple[ActiveTrackState, ...]
proposals: tuple[ProposalCluster, ...]
@dataclass(slots=True)
class TrackerDiagnostics:
match_existing_calls: int = 0
match_existing_seconds: float = 0.0
proposal_build_calls: int = 0
proposal_build_seconds: float = 0.0
promotions: int = 0
reacquisitions: int = 0
active_updates: int = 0
seed_initializations: int = 0
nonlinear_refinements: int = 0
@dataclass(slots=True)
class TrackerConfig:
max_active_tracks: int | None = None
min_bundle_views: int = 2
max_sync_skew_ns: int = 12_000_000
tentative_buffer_size: int = 5
tentative_min_age: int = 3
tentative_hits_required: int = 3
tentative_promote_score: float = 3.0
tentative_max_misses: int = 2
active_min_views: int = 2
active_core_gate_px: float = 80.0
active_joint_gate_px: float = 120.0
active_miss_to_lost: int = 3
lost_delete_age: int = 15
proposal_match_distance_m: float = 0.45
noise_ema: float = 0.85
proposal_min_score: float = 0.9
proposal_min_group_size: int = 1
@@ -0,0 +1,15 @@
from pose_tracking_exp.tracking.kinematics import seed_state_from_pose3d, update_noise_scale, update_state_from_multiview
from pose_tracking_exp.tracking.replay_io import load_parquet_replay_dir, load_replay_file, load_scene_file
from pose_tracking_exp.tracking.sync import synchronize_frames
from pose_tracking_exp.tracking.tracker import PoseTracker
__all__ = [
"PoseTracker",
"load_parquet_replay_dir",
"load_replay_file",
"load_scene_file",
"seed_state_from_pose3d",
"synchronize_frames",
"update_noise_scale",
"update_state_from_multiview",
]
@@ -4,10 +4,10 @@ import numpy as np
from beartype import beartype
from scipy.optimize import least_squares
from pose_tracking_exp.camera_math import project_pose
from pose_tracking_exp.joints import BODY20_INDEX_BY_NAME
from pose_tracking_exp.models import CameraCalibration, PoseDetection, SkeletonState
from pose_tracking_exp.tensor_types import Pose3D
from pose_tracking_exp.common.camera_math import project_pose
from pose_tracking_exp.common.joints import BODY20_INDEX_BY_NAME
from pose_tracking_exp.common.tensor_types import Pose3D
from pose_tracking_exp.schema import CameraCalibration, PoseDetection, SkeletonState
PARAMETER_DIMENSION = 31
SHAPE_DIMENSION = 8
+221
View File
@@ -0,0 +1,221 @@
import json
from pathlib import Path
from typing import cast
import numpy as np
import pyarrow.parquet as pq
from beartype import beartype
from pose_tracking_exp.common.detection_parquet import DETECTED_PARQUET_SUFFIX
from pose_tracking_exp.common.normalization import infer_bbox_from_keypoints, normalize_coco_body20
from pose_tracking_exp.schema import (
CameraCalibration,
CameraFrame,
CocoKeypointSchema,
PoseDetection,
ReplaySequence,
SceneConfig,
parse_camera_model,
)
_OPENCV_EXTRINSICS = "opencv_world_to_camera"
_RPT_POSE = "rpt_camera_pose"
def _as_float_array(values: object, shape: tuple[int, ...]) -> np.ndarray:
array = np.asarray(values, dtype=np.float64)
if array.shape != shape:
raise ValueError(f"Expected shape {shape}, got {array.shape}.")
return array
@beartype
def load_scene_file(path: Path) -> SceneConfig:
payload = json.loads(path.read_text(encoding="utf-8"))
default_extrinsic_format = str(payload.get("extrinsic_format", _OPENCV_EXTRINSICS))
cameras: list[CameraCalibration] = []
for camera_payload in payload["cameras"]:
extrinsic_format = str(
camera_payload.get("extrinsic_format", default_extrinsic_format)
)
name = str(camera_payload["name"])
width = int(camera_payload["width"])
height = int(camera_payload["height"])
K = _as_float_array(camera_payload["K"], (3, 3))
DC = np.asarray(
camera_payload.get("DC", [0.0, 0.0, 0.0, 0.0, 0.0]), dtype=np.float64
)
R = _as_float_array(camera_payload["R"], (3, 3))
T = _as_float_array(camera_payload["T"], (3, 1)).reshape(3)
model = parse_camera_model(camera_payload.get("model", "pinhole"))
if extrinsic_format == _OPENCV_EXTRINSICS:
cameras.append(
CameraCalibration.from_opencv_extrinsics(
name=name,
width=width,
height=height,
K=K,
DC=DC,
R=R,
T=T,
model=model,
rvec=np.asarray(camera_payload["rvec"], dtype=np.float64).reshape(3)
if "rvec" in camera_payload
else None,
)
)
elif extrinsic_format == _RPT_POSE:
cameras.append(
CameraCalibration.from_rpt_pose(
name=name,
width=width,
height=height,
K=K,
DC=DC,
R=R,
T=T,
model=model,
)
)
else:
raise ValueError(
f"Unsupported extrinsic format {extrinsic_format!r}. "
f"Expected {_OPENCV_EXTRINSICS!r} or {_RPT_POSE!r}."
)
return SceneConfig(
room_size=_as_float_array(payload["room_size"], (3,)),
room_center=_as_float_array(payload["room_center"], (3,)),
cameras=tuple(cameras),
)
@beartype
def load_replay_file(scene_path: Path, replay_path: Path) -> ReplaySequence:
if replay_path.is_dir():
return load_parquet_replay_dir(scene_path, replay_path)
frames_by_camera: dict[str, list[CameraFrame]] = {}
for raw_line in replay_path.read_text(encoding="utf-8").splitlines():
if not raw_line.strip():
continue
payload = json.loads(raw_line)
camera_name = str(payload["camera"])
detections: list[PoseDetection] = []
for detection_payload in payload["detections"]:
detections.append(
PoseDetection(
bbox=np.asarray(detection_payload["bbox"], dtype=np.float64),
bbox_confidence=float(detection_payload["bbox_confidence"]),
keypoints=np.asarray(
detection_payload["keypoints"], dtype=np.float64
),
)
)
frames_by_camera.setdefault(camera_name, []).append(
CameraFrame(
camera_name=camera_name,
frame_index=int(payload["frame_index"]),
timestamp_unix_ns=int(payload["timestamp_unix_ns"]),
detections=tuple(detections),
source_size=(
int(payload["source_size"][0]),
int(payload["source_size"][1]),
),
)
)
for frames in frames_by_camera.values():
frames.sort(key=lambda item: (item.timestamp_unix_ns, item.frame_index))
return ReplaySequence(
scene_path=scene_path,
replay_path=replay_path,
frames_by_camera=frames_by_camera,
)
def _pose_detections_from_parquet_row(row: dict[str, object]) -> tuple[PoseDetection, ...]:
boxes = np.asarray(row.get("boxes", []), dtype=np.float64)
if boxes.size == 0:
boxes = np.empty((0, 4), dtype=np.float64)
box_scores = np.asarray(row.get("box_scores", []), dtype=np.float64)
keypoints_xy = np.asarray(row.get("kps", []), dtype=np.float64)
if keypoints_xy.size == 0:
keypoints_xy = np.empty((0, 133, 2), dtype=np.float64)
keypoint_scores = np.asarray(row.get("kps_scores", []), dtype=np.float64)
if keypoint_scores.size == 0:
keypoint_scores = np.empty((0, 133), dtype=np.float64)
raw_keypoint_schema = row.get("keypoint_schema", "coco_wholebody133")
if raw_keypoint_schema not in {"coco17", "coco_wholebody133"}:
raise ValueError(f"Unsupported keypoint schema in parquet replay: {raw_keypoint_schema!r}")
keypoint_schema = cast(CocoKeypointSchema, raw_keypoint_schema)
if keypoints_xy.shape[0] != keypoint_scores.shape[0]:
raise ValueError(
"Expected matching keypoint coordinate and score counts in parquet replay row."
)
detections: list[PoseDetection] = []
for detection_index in range(int(keypoints_xy.shape[0])):
normalized = normalize_coco_body20(
keypoints_xy[detection_index],
keypoint_scores[detection_index],
keypoint_schema=keypoint_schema,
)
bbox = (
boxes[detection_index]
if detection_index < boxes.shape[0]
else infer_bbox_from_keypoints(normalized)
)
visible = normalized[:, 2] > 0.0
bbox_confidence = (
float(box_scores[detection_index])
if detection_index < box_scores.shape[0]
else (float(np.mean(normalized[visible, 2])) if np.any(visible) else 0.0)
)
detections.append(
PoseDetection(
bbox=np.asarray(bbox, dtype=np.float64),
bbox_confidence=bbox_confidence,
keypoints=np.asarray(normalized, dtype=np.float64),
)
)
return tuple(detections)
@beartype
def load_parquet_replay_dir(scene_path: Path, replay_root: Path) -> ReplaySequence:
parquet_paths = sorted(replay_root.glob(f"*{DETECTED_PARQUET_SUFFIX}"))
if not parquet_paths:
raise FileNotFoundError(
f"No detection parquet files matching *{DETECTED_PARQUET_SUFFIX} under {replay_root}."
)
frames_by_camera: dict[str, list[CameraFrame]] = {}
for parquet_path in parquet_paths:
camera_name = parquet_path.name.removesuffix(DETECTED_PARQUET_SUFFIX)
frames: list[CameraFrame] = []
for row in pq.read_table(parquet_path).to_pylist():
frames.append(
CameraFrame(
camera_name=camera_name,
frame_index=int(row["frame_index"]),
timestamp_unix_ns=int(row["timestamp_unix_ns"]),
detections=_pose_detections_from_parquet_row(row),
source_size=(
int(row.get("source_width", 0)),
int(row.get("source_height", 0)),
),
)
)
frames.sort(key=lambda item: (item.timestamp_unix_ns, item.frame_index))
frames_by_camera[camera_name] = frames
return ReplaySequence(
scene_path=scene_path,
replay_path=replay_root,
frames_by_camera=frames_by_camera,
)
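When a parquet row lacks a box score, the loader above falls back to the mean score of visible joints (score strictly greater than zero), or 0.0 when nothing is visible. That fallback in isolation, with made-up joint scores:

```python
import numpy as np

def fallback_confidence(joint_scores: np.ndarray) -> float:
    """Mean of strictly-positive joint scores, else 0.0."""
    visible = joint_scores > 0.0
    return float(np.mean(joint_scores[visible])) if np.any(visible) else 0.0

assert fallback_confidence(np.array([0.0, 0.5, 1.0])) == 0.75
assert fallback_confidence(np.zeros(20)) == 0.0
```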
@@ -1,16 +1,19 @@
from typing import Any
import numpy as np
import rpt
from beartype import beartype
from rpt._core import TriangulationConfig, TriangulationTrace # type: ignore[reportMissingModuleSource]
from pose_tracking_exp.joints import BODY20_JOINT_NAMES, BODY20_OBSERVATION_COUNT, BODY20_INDEX_BY_NAME
from pose_tracking_exp.models import CameraFrame, ProposalCluster, SceneConfig
from pose_tracking_exp.tensor_types import Pose2D
from pose_tracking_exp.common.joints import BODY20_INDEX_BY_NAME, BODY20_JOINT_NAMES, BODY20_OBSERVATION_COUNT
from pose_tracking_exp.common.tensor_types import Pose2D
from pose_tracking_exp.schema import CameraFrame, ProposalCluster, SceneConfig
@beartype
def build_rpt_config(scene: SceneConfig, *, min_match_score: float, min_group_size: int) -> Any:
def build_rpt_config(
scene: SceneConfig,
*,
min_match_score: float,
min_group_size: int,
) -> TriangulationConfig:
cameras = [
{
"name": camera.name,
@@ -50,7 +53,7 @@ def pack_view_detections(frames: tuple[CameraFrame, ...], unmatched_indices: dic
@beartype
def extract_clusters(
trace: Any,
trace: TriangulationTrace,
camera_names: tuple[str, ...],
) -> tuple[ProposalCluster, ...]:
clusters: list[ProposalCluster] = []
@@ -2,7 +2,7 @@ from collections.abc import Iterable
from beartype import beartype
from pose_tracking_exp.models import CameraFrame, FrameBundle, ReplaySequence
from pose_tracking_exp.schema import CameraFrame, FrameBundle, ReplaySequence
@beartype
@@ -50,4 +50,3 @@ def synchronize_frames(
)
)
return bundles
@@ -5,10 +5,10 @@ import numpy as np
from beartype import beartype
from scipy.optimize import linear_sum_assignment
from pose_tracking_exp.camera_math import project_pose
from pose_tracking_exp.joints import BODY20_INDEX_BY_NAME, CORE_JOINT_INDICES
from pose_tracking_exp.kinematics import seed_state_from_pose3d, update_noise_scale, update_state_from_multiview
from pose_tracking_exp.models import (
from pose_tracking_exp.common.camera_math import project_pose
from pose_tracking_exp.common.joints import BODY20_INDEX_BY_NAME, CORE_JOINT_INDICES
from pose_tracking_exp.common.normalization import core_reprojection_distance
from pose_tracking_exp.schema import (
ActiveTrackState,
FrameBundle,
PoseDetection,
@@ -20,8 +20,8 @@ from pose_tracking_exp.models import (
TrackerConfig,
TrackerDiagnostics,
)
from pose_tracking_exp.normalization import core_reprojection_distance
from pose_tracking_exp.rpt_adapter import build_rpt_config, extract_clusters, pack_view_detections
from pose_tracking_exp.tracking.kinematics import seed_state_from_pose3d, update_noise_scale, update_state_from_multiview
from pose_tracking_exp.tracking.rpt_adapter import build_rpt_config, extract_clusters, pack_view_detections
CORE_JOINT_MASK = np.zeros((20,), dtype=bool)
CORE_JOINT_MASK[list(CORE_JOINT_INDICES)] = True
@@ -78,20 +78,24 @@ class PoseTracker:
return replace(self._diagnostics)
def run(self, bundles: list[FrameBundle]) -> list[TrackedFrameResult]:
self._tentative.clear()
self._active.clear()
self._lost.clear()
self._next_track_id = 1
self._diagnostics = TrackerDiagnostics()
return [self.step(bundle) for bundle in bundles]
def step(self, bundle: FrameBundle) -> TrackedFrameResult:
self._enforce_single_person_constraints()
self._enforce_track_limits()
matches, unmatched = self._match_existing_tracks(bundle)
self._update_active_tracks(bundle, matches)
self._update_lost_tracks(bundle, matches)
proposals = self._refresh_single_person_track_from_proposals(bundle, self._build_proposals(bundle, unmatched))
proposals = self._refresh_capped_single_track_from_proposals(bundle, self._build_proposals(bundle, unmatched))
self._update_tentative_tracks(bundle, self._birth_candidate_proposals(proposals))
self._promote_tentative_tracks(bundle)
self._reacquire_lost_tracks(bundle, proposals)
self._delete_expired_tracks()
self._enforce_single_person_constraints()
self._enforce_track_limits()
return TrackedFrameResult(
bundle_index=bundle.bundle_index,
timestamp_unix_ns=bundle.timestamp_unix_ns,
@@ -101,46 +105,58 @@ class PoseTracker:
proposals=proposals,
)
def _single_person_mode(self) -> bool:
return self._config.mode == "single_person"
def _track_limit(self) -> int | None:
return self._config.max_active_tracks
def _keep_best_active_track(self) -> None:
if len(self._active) <= 1:
def _single_track_cap_enabled(self) -> bool:
return self._config.max_active_tracks == 1
def _keep_best_active_tracks(self, limit: int) -> None:
if len(self._active) <= limit:
return
best_id = max(self._active, key=lambda track_id: _active_track_rank(self._active[track_id]))
ranked_ids = sorted(self._active, key=lambda track_id: _active_track_rank(self._active[track_id]), reverse=True)
keep_ids = set(ranked_ids[:limit])
for track_id in list(self._active):
if track_id != best_id:
if track_id not in keep_ids:
self._active.pop(track_id, None)
def _keep_best_lost_track(self) -> None:
if len(self._lost) <= 1:
def _keep_best_lost_tracks(self, limit: int) -> None:
if len(self._lost) <= limit:
return
best_id = max(self._lost, key=lambda track_id: _lost_track_rank(self._lost[track_id]))
ranked_ids = sorted(self._lost, key=lambda track_id: _lost_track_rank(self._lost[track_id]), reverse=True)
keep_ids = set(ranked_ids[:limit])
for track_id in list(self._lost):
if track_id != best_id:
if track_id not in keep_ids:
self._lost.pop(track_id, None)
def _keep_best_tentative_track(self) -> None:
if len(self._tentative) <= 1:
def _keep_best_tentative_tracks(self, limit: int) -> None:
if len(self._tentative) <= limit:
return
best_id = max(self._tentative, key=lambda track_id: _tentative_track_rank(self._tentative[track_id]))
ranked_ids = sorted(
self._tentative,
key=lambda track_id: _tentative_track_rank(self._tentative[track_id]),
reverse=True,
)
keep_ids = set(ranked_ids[:limit])
for track_id in list(self._tentative):
if track_id != best_id:
if track_id not in keep_ids:
self._tentative.pop(track_id, None)
def _enforce_single_person_constraints(self) -> None:
if not self._single_person_mode():
def _enforce_track_limits(self) -> None:
limit = self._track_limit()
if limit is None:
return
self._keep_best_active_tracks(limit)
self._keep_best_lost_tracks(limit)
self._keep_best_tentative_tracks(limit)
if not self._single_track_cap_enabled():
return
self._keep_best_active_track()
if self._active:
self._lost.clear()
self._tentative.clear()
return
self._keep_best_lost_track()
if self._lost:
self._tentative.clear()
return
self._keep_best_tentative_track()
def _predicted_pose_by_track(self) -> dict[int, np.ndarray]:
result: dict[int, np.ndarray] = {}
@@ -278,7 +294,7 @@ class PoseTracker:
self._diagnostics.proposal_build_seconds += perf_counter() - started_at
def _birth_candidate_proposals(self, proposals: tuple[ProposalCluster, ...]) -> tuple[ProposalCluster, ...]:
if not self._single_person_mode():
if not self._single_track_cap_enabled():
return proposals
if self._active or self._lost:
return ()
@@ -286,12 +302,12 @@ class PoseTracker:
return ()
return (max(proposals, key=_proposal_rank),)
def _refresh_single_person_track_from_proposals(
def _refresh_capped_single_track_from_proposals(
self,
bundle: FrameBundle,
proposals: tuple[ProposalCluster, ...],
) -> tuple[ProposalCluster, ...]:
if not self._single_person_mode() or not proposals:
if not self._single_track_cap_enabled() or not proposals:
return proposals
remaining = list(proposals)
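The generalized `_keep_best_*_tracks(limit)` helpers in this hunk all follow one pattern: rank descending, keep the top `limit` ids, drop the rest in place. A schematic version with a plain dict standing in for track state, where the rank is the stored float itself (the tracker uses per-state rank functions instead):

```python
def keep_best(tracks: dict[int, float], limit: int) -> None:
    """Prune `tracks` in place to the `limit` highest-ranked entries."""
    if len(tracks) <= limit:
        return
    ranked_ids = sorted(tracks, key=lambda track_id: tracks[track_id], reverse=True)
    keep_ids = set(ranked_ids[:limit])
    for track_id in list(tracks):
        if track_id not in keep_ids:
            tracks.pop(track_id, None)

tracks = {1: 0.2, 2: 0.9, 3: 0.5}
keep_best(tracks, limit=2)
assert set(tracks) == {2, 3}
```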
+1
View File
@@ -0,0 +1 @@
"""Test package for support helpers and test-local utilities."""
+1
View File
@@ -0,0 +1 @@
"""Test-only support helpers."""
@@ -1,18 +1,21 @@
from pathlib import Path
import click
import cv2
import numpy as np
import pyarrow.parquet as pq
from beartype import beartype
from loguru import logger
from pose_tracking_exp.models import CameraCalibration, CameraFrame, FrameBundle, PoseDetection, SceneConfig
from pose_tracking_exp.normalization import infer_bbox_from_keypoints, normalize_rtmpose_body20
from pose_tracking_exp.common.normalization import infer_bbox_from_keypoints, normalize_rtmpose_body20
from pose_tracking_exp.schema import CameraCalibration, CameraFrame, FrameBundle, PoseDetection, SceneConfig, TrackerConfig
from pose_tracking_exp.tracking import PoseTracker
_NOMINAL_FRAME_PERIOD_NS = 33_333_333
@beartype
def load_actualtest_scene(root: Path) -> SceneConfig:
def load_actual_test_scene(root: Path) -> SceneConfig:
# ActualTest parquet comes from the ChArUco/OpenCV side, so `rvec` / `tvec`
# are world->camera extrinsics. The RPT-facing camera pose is derived later
# from this canonical OpenCV form.
@@ -40,13 +43,14 @@ def load_actualtest_scene(root: Path) -> SceneConfig:
@beartype
def load_actualtest_segment_bundles(
def load_actual_test_segment_bundles(
root: Path,
segment_name: str,
*,
frame_start: int = 690,
frame_stop: int | None = None,
max_frames: int | None = None,
min_cameras_with_rows: int = 1,
min_visible_joints: int = 6,
) -> list[FrameBundle]:
segment_root = root / segment_name
@@ -98,24 +102,31 @@ def load_actualtest_segment_bundles(
if not by_camera:
return []
common_frames = sorted(set.intersection(*(set(frames) for frames in by_camera.values())))
candidate_frames = sorted(set().union(*(set(frames) for frames in by_camera.values())))
if min_cameras_with_rows > 1:
candidate_frames = [
frame_index
for frame_index in candidate_frames
if sum(frame_index in frames for frames in by_camera.values()) >= min_cameras_with_rows
]
if max_frames is not None:
common_frames = common_frames[:max_frames]
candidate_frames = candidate_frames[:max_frames]
scene = load_actualtest_scene(root)
scene = load_actual_test_scene(root)
camera_by_name = {camera.name: camera for camera in scene.cameras}
bundles: list[FrameBundle] = []
for bundle_index, frame_index in enumerate(common_frames):
ordered_camera_names = [camera.name for camera in scene.cameras]
for bundle_index, frame_index in enumerate(candidate_frames):
timestamp_unix_ns = bundle_index * _NOMINAL_FRAME_PERIOD_NS
views: list[CameraFrame] = []
for camera_name in sorted(by_camera):
for camera_name in ordered_camera_names:
camera = camera_by_name[camera_name]
views.append(
CameraFrame(
camera_name=camera_name,
frame_index=frame_index,
timestamp_unix_ns=timestamp_unix_ns,
detections=by_camera[camera_name][frame_index],
detections=by_camera.get(camera_name, {}).get(frame_index, ()),
source_size=(camera.width, camera.height),
)
)
@@ -127,3 +138,49 @@ def load_actualtest_segment_bundles(
)
)
return bundles
@click.command()
@click.argument("root_path", type=click.Path(path_type=Path, exists=True, file_okay=False))
@click.option("--segment", "segment_name", default="Segment_1", show_default=True)
@click.option("--frame-start", default=690, type=int, show_default=True)
@click.option("--frame-stop", type=int)
@click.option("--max-frames", type=click.IntRange(min=1))
@click.option("--min-camera-rows", default=1, type=click.IntRange(min=1), show_default=True)
@click.option("--max-active-tracks", default=1, type=click.IntRange(min=1), show_default=True)
def main(
root_path: Path,
segment_name: str,
frame_start: int,
frame_stop: int | None,
max_frames: int | None,
min_camera_rows: int,
max_active_tracks: int,
) -> None:
logger.remove()
logger.add(
click.get_text_stream("stderr"),
level="INFO",
format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)
scene = load_actual_test_scene(root_path)
bundles = load_actual_test_segment_bundles(
root_path,
segment_name,
frame_start=frame_start,
frame_stop=frame_stop,
max_frames=max_frames,
min_cameras_with_rows=min_camera_rows,
)
tracker = PoseTracker(scene, TrackerConfig(max_active_tracks=max_active_tracks))
results = tracker.run(bundles)
logger.info(
"actual_test bundles={} active_frames={} proposal_frames={}",
len(results),
sum(1 for result in results if result.active_tracks),
sum(1 for result in results if result.proposals),
)
if __name__ == "__main__":
main()
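The hunk above replaces the old intersection-based frame selection (`set.intersection` across all cameras) with a union plus an optional per-frame camera quorum, which is what lets partially covered frames survive. A standalone sketch of the new semantics (function name and dict shape are illustrative, not the package's actual API):

```python
from collections.abc import Iterable


def candidate_frames(
    frames_by_camera: dict[str, Iterable[int]],
    *,
    min_cameras_with_rows: int = 1,
) -> list[int]:
    """Union-based selection: keep any frame seen by at least
    min_cameras_with_rows cameras, instead of requiring every camera."""
    frame_sets = [set(frames) for frames in frames_by_camera.values()]
    if not frame_sets:
        return []
    candidates = sorted(set().union(*frame_sets))
    if min_cameras_with_rows > 1:
        candidates = [
            frame
            for frame in candidates
            if sum(frame in frames for frames in frame_sets) >= min_cameras_with_rows
        ]
    return candidates
```

With the quorum at its default of 1, a frame recorded by a single camera still yields a bundle; raising the quorum recovers behavior close to the old intersection.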
@@ -4,8 +4,8 @@ import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from pose_tracking_exp.actualtest import load_actualtest_scene, load_actualtest_segment_bundles
from pose_tracking_exp.joints import BODY20_INDEX_BY_NAME
from pose_tracking_exp.common.joints import BODY20_INDEX_BY_NAME
from tests.support.actual_test import load_actual_test_scene, load_actual_test_segment_bundles
def _write_parquet(path: Path, rows: list[dict[str, object]]) -> None:
@@ -25,7 +25,7 @@ def _sample_rtmpose_detection() -> tuple[list[float], list[list[float]], list[fl
return [8.0, 4.0, 32.0, 64.0], keypoints_xy.tolist(), scores.tolist()
def test_load_actualtest_parquet_scene_and_segment(tmp_path: Path) -> None:
def test_load_actual_test_parquet_scene_and_segment(tmp_path: Path) -> None:
root = tmp_path / "ActualTest_WeiHua"
_write_parquet(
root / "camera_params" / "camera_params.parquet",
@@ -62,8 +62,8 @@ def test_load_actualtest_parquet_scene_and_segment(tmp_path: Path) -> None:
],
)
scene = load_actualtest_scene(root)
bundles = load_actualtest_segment_bundles(root, "Segment_1", frame_start=690, max_frames=1)
scene = load_actual_test_scene(root)
bundles = load_actual_test_segment_bundles(root, "Segment_1", frame_start=690, max_frames=1)
assert [camera.name for camera in scene.cameras] == ["5602", "5603"]
np.testing.assert_allclose(scene.cameras[0].pose_T, [0.0, 0.0, 0.0])
@@ -75,3 +75,53 @@ def test_load_actualtest_parquet_scene_and_segment(tmp_path: Path) -> None:
bundles[0].views[0].detections[0].keypoints[BODY20_INDEX_BY_NAME["hip_middle"], :2],
[20.0, 60.0],
)
def test_load_actual_test_keeps_partial_camera_frames(tmp_path: Path) -> None:
root = tmp_path / "ActualTest_WeiHua"
_write_parquet(
root / "camera_params" / "camera_params.parquet",
[
{
"name": "AF_02",
"port": 5602,
"intrinsic": {
"camera_matrix": [[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]],
"distortion_coefficients": [0.0, 0.0, 0.0, 0.0, 0.0],
},
"extrinsic": {"rvec": [0.0, 0.0, 0.0], "tvec": [0.0, 0.0, 0.0]},
"resolution": {"width": 640, "height": 480},
},
{
"name": "AF_03",
"port": 5603,
"intrinsic": {
"camera_matrix": [[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]],
"distortion_coefficients": [0.0, 0.0, 0.0, 0.0, 0.0],
},
"extrinsic": {"rvec": [0.0, 0.0, 0.0], "tvec": [1.0, 0.0, 0.0]},
"resolution": {"width": 640, "height": 480},
},
],
)
box, keypoints_xy, scores = _sample_rtmpose_detection()
_write_parquet(
root / "Segment_1" / "5602_detected.parquet",
[
{"frame_index": 690, "boxes": [box], "kps": [keypoints_xy], "kps_scores": [scores]},
{"frame_index": 691, "boxes": [box], "kps": [keypoints_xy], "kps_scores": [scores]},
],
)
_write_parquet(
root / "Segment_1" / "5603_detected.parquet",
[
{"frame_index": 690, "boxes": [box], "kps": [keypoints_xy], "kps_scores": [scores]},
],
)
bundles = load_actual_test_segment_bundles(root, "Segment_1", frame_start=690)
assert [bundle.views[0].frame_index for bundle in bundles] == [690, 691]
assert [view.camera_name for view in bundles[1].views] == ["5602", "5603"]
assert len(bundles[1].views[0].detections) == 1
assert bundles[1].views[1].detections == ()
+6 -6
View File
@@ -8,9 +8,9 @@ import pytest
pytest.importorskip("rpt")
from pose_tracking_exp.models import CameraCalibration, SceneConfig
from pose_tracking_exp.replay import load_scene_file
from pose_tracking_exp.rpt_adapter import build_rpt_config
from pose_tracking_exp.schema import CameraCalibration, CameraModel, SceneConfig, parse_camera_model
from pose_tracking_exp.tracking.replay_io import load_scene_file
from pose_tracking_exp.tracking.rpt_adapter import build_rpt_config
class _CameraArgs(NamedTuple):
@@ -19,7 +19,7 @@ class _CameraArgs(NamedTuple):
height: int
K: np.ndarray
DC: np.ndarray
model: str
model: CameraModel
def _camera_args() -> _CameraArgs:
@@ -29,7 +29,7 @@ def _camera_args() -> _CameraArgs:
height=480,
K=np.asarray([[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]], dtype=np.float64),
DC=np.zeros(5, dtype=np.float64),
model="pinhole",
model=parse_camera_model("pinhole"),
)
@@ -139,7 +139,7 @@ def test_build_rpt_config_uses_pose_convention(monkeypatch: pytest.MonkeyPatch)
captured["min_group_size"] = min_group_size
return captured
monkeypatch.setattr("pose_tracking_exp.rpt_adapter.rpt.make_triangulation_config", fake_make_triangulation_config)
monkeypatch.setattr("pose_tracking_exp.tracking.rpt_adapter.rpt.make_triangulation_config", fake_make_triangulation_config)
build_rpt_config(scene, min_match_score=0.5, min_group_size=2)
+223
View File
@@ -0,0 +1,223 @@
from collections.abc import AsyncIterator, Sequence
from pathlib import Path
import anyio
import numpy as np
import pytest
from pose_tracking_exp.detection.config import (
DetectionRunnerConfig,
load_detection_runner_config,
resolve_instances,
)
from pose_tracking_exp.detection.runner import (
PendingFrame,
SourceSlot,
run_detection_runner,
store_latest_frame,
take_pending_batch,
)
from pose_tracking_exp.schema.detection import PoseDetections, SourceFrame
def test_load_detection_runner_config_from_toml_and_env(
monkeypatch: pytest.MonkeyPatch,
tmp_path: Path,
) -> None:
config_path = tmp_path / "runner.toml"
config_path.write_text(
"\n".join(
[
'instances = ["front_left", "front_right"]',
'device = "cuda:1"',
'nats_host = "nats://localhost:4222"',
'yolo_checkpoint = "checkpoint/yolo/yolo11_mix_epoch10.pt"',
'pose_checkpoint = "checkpoint/dwpose/best_coco-wholebody_AP_epoch_50.pth"',
"bbox_area_threshold = 2500",
"max_batch_frames = 6",
"max_batch_wait_ms = 3",
]
),
encoding="utf-8",
)
monkeypatch.setenv("POSE_TRACKING_EXP_DETECTION_DEVICE", "cpu")
config = load_detection_runner_config(config_path)
assert config.instances == ("front_left", "front_right")
assert config.device == "cpu"
assert config.nats_host == "nats://localhost:4222"
assert config.bbox_area_threshold == 2500
assert config.max_batch_frames == 6
assert config.max_batch_wait_ms == 3
def test_resolve_instances_prefers_cli_values() -> None:
assert resolve_instances(("cli_a", "cli_b"), ("cfg_a",)) == ("cli_a", "cli_b")
def test_resolve_instances_falls_back_to_config_values() -> None:
assert resolve_instances((), ("cfg_a", "cfg_b")) == ("cfg_a", "cfg_b")
def test_store_latest_frame_overwrites_pending_frame() -> None:
slot = SourceSlot(source_name="front_left")
first = SourceFrame(
source_name="front_left",
image_bgr=np.zeros((1, 1, 3), dtype=np.uint8),
frame_index=1,
timestamp_unix_ns=100,
)
second = SourceFrame(
source_name="front_left",
image_bgr=np.ones((1, 1, 3), dtype=np.uint8),
frame_index=2,
timestamp_unix_ns=200,
)
store_latest_frame(slot, first)
store_latest_frame(slot, second)
assert slot.received_frames == 2
assert slot.dropped_frames == 1
assert slot.pending_frame is not None
assert slot.pending_frame.frame is second
def test_take_pending_batch_collects_at_most_one_frame_per_source() -> None:
slots = {
"front_left": SourceSlot(
source_name="front_left",
pending_frame=PendingFrame(
source_name="front_left",
frame=SourceFrame(
source_name="front_left",
image_bgr=np.zeros((1, 1, 3), dtype=np.uint8),
frame_index=11,
timestamp_unix_ns=110,
),
),
),
"front_right": SourceSlot(
source_name="front_right",
pending_frame=PendingFrame(
source_name="front_right",
frame=SourceFrame(
source_name="front_right",
image_bgr=np.zeros((1, 1, 3), dtype=np.uint8),
frame_index=22,
timestamp_unix_ns=220,
),
),
),
"rear": SourceSlot(
source_name="rear",
pending_frame=PendingFrame(
source_name="rear",
frame=SourceFrame(
source_name="rear",
image_bgr=np.zeros((1, 1, 3), dtype=np.uint8),
frame_index=33,
timestamp_unix_ns=330,
),
),
),
}
batch = take_pending_batch(slots, max_batch_frames=2)
assert [frame.source_name for frame in batch] == ["front_left", "front_right"]
assert slots["front_left"].pending_frame is None
assert slots["front_right"].pending_frame is None
assert slots["rear"].pending_frame is not None
class StubSource:
def __init__(self, source_name: str, frames: tuple[SourceFrame, ...]) -> None:
self.source_name = source_name
self._frames = frames
async def frames(self) -> AsyncIterator[SourceFrame]:
for frame in self._frames:
yield frame
class StubPoseShim:
def process_many(self, frames: Sequence[SourceFrame]) -> list[PoseDetections]:
detections: list[PoseDetections] = []
for frame in frames:
detections.append(
PoseDetections(
source_name=frame.source_name,
frame_index=frame.frame_index,
source_size=(frame.image_bgr.shape[1], frame.image_bgr.shape[0]),
boxes_xyxy=np.asarray([[0.0, 0.0, 10.0, 10.0]], dtype=np.float32),
box_scores=np.asarray([1.0], dtype=np.float32),
keypoints_xy=np.zeros((1, 133, 2), dtype=np.float32),
keypoint_scores=np.ones((1, 133), dtype=np.float32),
timestamp_unix_ns=frame.timestamp_unix_ns,
keypoint_schema="coco_wholebody133",
)
)
return detections
class StubSink:
def __init__(self) -> None:
self.messages: list[PoseDetections] = []
self.closed = False
async def publish_pose(self, detections: PoseDetections) -> None:
self.messages.append(detections)
async def aclose(self) -> None:
self.closed = True
def test_run_detection_runner_publishes_payloads() -> None:
sink = StubSink()
sources = (
StubSource(
"cam0",
(
SourceFrame(
source_name="cam0",
image_bgr=np.zeros((2, 3, 3), dtype=np.uint8),
frame_index=1,
timestamp_unix_ns=100,
),
),
),
StubSource(
"cam1",
(
SourceFrame(
source_name="cam1",
image_bgr=np.zeros((2, 3, 3), dtype=np.uint8),
frame_index=2,
timestamp_unix_ns=200,
),
),
),
)
config = DetectionRunnerConfig(
instances=("cam0", "cam1"),
pose_config_path=Path(__file__),
yolo_checkpoint=Path(__file__),
pose_checkpoint=Path(__file__),
max_batch_frames=2,
)
anyio.run(
run_detection_runner,
sources,
StubPoseShim(),
sink,
config,
)
assert sink.closed is True
assert [(item.source_name, item.frame_index, item.timestamp_unix_ns) for item in sink.messages] == [
("cam0", 1, 100),
("cam1", 2, 200),
]
+137
View File
@@ -0,0 +1,137 @@
import json
from pathlib import Path
import anyio
import cv2
import numpy as np
import pyarrow.parquet as pq
from pose_tracking_exp.common.joints import BODY20_INDEX_BY_NAME
from pose_tracking_exp.detection.sinks import ParquetPoseSink
from pose_tracking_exp.detection.sources import VideoFrameSource
from pose_tracking_exp.schema.detection import PoseDetections
from pose_tracking_exp.tracking import load_replay_file
def _write_synthetic_video(path: Path) -> None:
writer = cv2.VideoWriter(
str(path),
cv2.VideoWriter.fourcc(*"MJPG"),
10.0,
(8, 6),
)
if not writer.isOpened():
raise RuntimeError("Could not open synthetic video writer.")
try:
for frame_index in range(3):
frame = np.full((6, 8, 3), frame_index * 32, dtype=np.uint8)
writer.write(frame)
finally:
writer.release()
def _sample_wholebody_detection(*, source_name: str, frame_index: int) -> PoseDetections:
keypoints_xy = np.zeros((1, 133, 2), dtype=np.float32)
keypoint_scores = np.zeros((1, 133), dtype=np.float32)
keypoints_xy[0, 5] = [10.0, 20.0]
keypoints_xy[0, 6] = [30.0, 20.0]
keypoints_xy[0, 11] = [12.0, 60.0]
keypoints_xy[0, 12] = [28.0, 60.0]
keypoints_xy[0, 0] = [20.0, 8.0]
keypoint_scores[0, [0, 5, 6, 11, 12]] = 1.0
return PoseDetections(
source_name=source_name,
frame_index=frame_index,
source_size=(640, 480),
boxes_xyxy=np.asarray([[8.0, 4.0, 32.0, 64.0]], dtype=np.float32),
box_scores=np.asarray([0.9], dtype=np.float32),
keypoints_xy=keypoints_xy,
keypoint_scores=keypoint_scores,
timestamp_unix_ns=frame_index * 100_000_000,
keypoint_schema="coco_wholebody133",
)
def test_video_frame_source_reads_frames(tmp_path: Path) -> None:
video_path = tmp_path / "cam0.avi"
_write_synthetic_video(video_path)
source = VideoFrameSource(video_path, source_name="cam0")
async def collect() -> list[tuple[str, int, int, tuple[int, int, int]]]:
frames: list[tuple[str, int, int, tuple[int, int, int]]] = []
async for frame in source.frames():
frames.append(
(
frame.source_name,
frame.frame_index,
frame.timestamp_unix_ns,
frame.image_bgr.shape,
)
)
return frames
frames = anyio.run(collect)
assert [item[0] for item in frames] == ["cam0", "cam0", "cam0"]
assert [item[1] for item in frames] == [0, 1, 2]
assert [item[3] for item in frames] == [(6, 8, 3), (6, 8, 3), (6, 8, 3)]
assert frames[0][2] <= frames[1][2] <= frames[2][2]
def test_parquet_sink_round_trips_into_tracking_replay(tmp_path: Path) -> None:
output_dir = tmp_path / "detections"
sink = ParquetPoseSink(output_dir, flush_rows=1)
async def write_rows() -> None:
await sink.publish_pose(_sample_wholebody_detection(source_name="cam0", frame_index=0))
await sink.publish_pose(
PoseDetections(
source_name="cam0",
frame_index=1,
source_size=(640, 480),
boxes_xyxy=np.empty((0, 4), dtype=np.float32),
box_scores=np.empty((0,), dtype=np.float32),
keypoints_xy=np.empty((0, 133, 2), dtype=np.float32),
keypoint_scores=np.empty((0, 133), dtype=np.float32),
timestamp_unix_ns=100_000_000,
keypoint_schema="coco_wholebody133",
)
)
await sink.aclose()
anyio.run(write_rows)
parquet_path = output_dir / "cam0_detected.parquet"
assert parquet_path.exists()
assert pq.read_table(parquet_path).num_rows == 2
scene_path = tmp_path / "scene.json"
scene_path.write_text(
json.dumps(
{
"room_size": [6.0, 4.0, 3.0],
"room_center": [0.0, 0.0, 1.0],
"cameras": [
{
"name": "cam0",
"width": 640,
"height": 480,
"K": [[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]],
"DC": [0.0, 0.0, 0.0, 0.0, 0.0],
"R": [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]],
"T": [[0.0], [0.0], [0.0]],
}
],
}
),
encoding="utf-8",
)
replay = load_replay_file(scene_path, output_dir)
frames = replay.frames_by_camera["cam0"]
assert [frame.frame_index for frame in frames] == [0, 1]
assert frames[1].detections == ()
np.testing.assert_allclose(
frames[0].detections[0].keypoints[BODY20_INDEX_BY_NAME["hip_middle"], :2],
[20.0, 60.0],
)
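The round-trip test above relies on empty frames surviving the parquet path: a row whose list columns are empty must still become a frame with an empty detection tuple, rather than a gap. A sketch of that grouping step, assuming one row per frame with `boxes` / `kps` / `kps_scores` list columns (a simplification of the replay loader, shown with plain dicts instead of the real row types):

```python
def frames_from_rows(
    rows: list[dict],
) -> list[tuple[int, tuple[dict, ...]]]:
    """Turn per-frame parquet rows into (frame_index, detections) pairs.

    A row with empty list columns still yields an entry with an empty
    tuple, so downstream tracking sees the frame and can coast/decay its
    tracks instead of silently skipping time."""
    frames: list[tuple[int, tuple[dict, ...]]] = []
    for row in rows:
        detections = tuple(
            {"box": box, "keypoints": kps, "scores": scores}
            for box, kps, scores in zip(
                row["boxes"], row["kps"], row["kps_scores"]
            )
        )
        frames.append((row["frame_index"], detections))
    return frames
```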
+3 -3
View File
@@ -1,7 +1,7 @@
import numpy as np
from pose_tracking_exp.joints import BODY20_INDEX_BY_NAME
from pose_tracking_exp.kinematics import seed_state_from_pose3d
from pose_tracking_exp.common.joints import BODY20_INDEX_BY_NAME
from pose_tracking_exp.tracking.kinematics import seed_state_from_pose3d
def _sample_pose3d() -> np.ndarray:
@@ -38,7 +38,7 @@ def test_seed_state_from_pose3d_does_not_call_least_squares(monkeypatch) -> None
def fail_least_squares(*args: object, **kwargs: object) -> object:
raise AssertionError("seed_state_from_pose3d should not call scipy.optimize.least_squares")
monkeypatch.setattr("pose_tracking_exp.kinematics.least_squares", fail_least_squares)
monkeypatch.setattr("pose_tracking_exp.tracking.kinematics.least_squares", fail_least_squares)
state = seed_state_from_pose3d(_sample_pose3d())
assert state.parameters.shape == (31,)
+49 -7
View File
@@ -4,11 +4,17 @@ from pathlib import Path
import numpy as np
from pose_tracking_exp.joints import BODY20_INDEX_BY_NAME
from pose_tracking_exp.normalization import normalize_rtmpose_body20
from pose_tracking_exp.parajumping import PROTOCOL_HEADER, convert_payload_record, decode_pose_payload
from pose_tracking_exp.replay import load_replay_file, load_scene_file
from pose_tracking_exp.sync import synchronize_frames
from pose_tracking_exp.common.joints import BODY20_INDEX_BY_NAME
from pose_tracking_exp.common.normalization import normalize_coco_body20, normalize_rtmpose_body20
from pose_tracking_exp.detection.cvmmap_payload import (
COCO_WHOLEBODY_KEYPOINT_COUNT,
PROTOCOL_HEADER,
CvmmapPosePayloadCodec,
convert_payload_record,
decode_pose_payload,
)
from pose_tracking_exp.schema.detection import PoseDetections
from pose_tracking_exp.tracking import load_replay_file, load_scene_file, synchronize_frames
def _encode_payload(
@@ -31,7 +37,7 @@ def _encode_payload(
+ np.asarray(box_scores, dtype=np.uint8).tobytes()
+ int(keypoints_xy.shape[0]).to_bytes(1, "little")
+ np.asarray(keypoints_xy, dtype="<u2").tobytes()
+ int(keypoint_scores.size).to_bytes(1, "little")
+ int(keypoint_scores.shape[0]).to_bytes(1, "little")
+ np.asarray(keypoint_scores, dtype=np.uint8).reshape(-1).tobytes()
+ int(timestamp_unix_ns).to_bytes(8, "little")
)
@@ -54,6 +60,23 @@ def test_normalize_rtmpose_body20_derives_midpoints_and_head():
np.testing.assert_allclose(normalized[BODY20_INDEX_BY_NAME["head"], :2], [20.0, 8.0])
def test_normalize_coco17_body20_derives_midpoints_and_head():
keypoints = np.zeros((17, 2), dtype=np.float64)
scores = np.zeros((17,), dtype=np.float64)
keypoints[5] = [10.0, 20.0]
keypoints[6] = [30.0, 20.0]
keypoints[11] = [12.0, 60.0]
keypoints[12] = [28.0, 60.0]
keypoints[0] = [20.0, 8.0]
scores[[0, 5, 6, 11, 12]] = 1.0
normalized = normalize_coco_body20(keypoints, scores, keypoint_schema="coco17")
np.testing.assert_allclose(normalized[BODY20_INDEX_BY_NAME["hip_middle"], :2], [20.0, 60.0])
np.testing.assert_allclose(normalized[BODY20_INDEX_BY_NAME["shoulder_middle"], :2], [20.0, 20.0])
np.testing.assert_allclose(normalized[BODY20_INDEX_BY_NAME["head"], :2], [20.0, 8.0])
def test_decode_payload_and_convert_record():
keypoints_xy = np.zeros((1, 133, 2), dtype=np.uint16)
keypoint_scores = np.zeros((1, 133), dtype=np.uint8)
@@ -87,6 +110,26 @@ def test_decode_payload_and_convert_record():
assert converted["frame_index"] == 7
def test_encode_pose_payload_requires_coco_wholebody133():
codec = CvmmapPosePayloadCodec()
detections = PoseDetections(
source_name="cam0",
frame_index=1,
source_size=(640, 480),
boxes_xyxy=np.zeros((1, 4), dtype=np.float32),
box_scores=np.ones((1,), dtype=np.float32),
keypoints_xy=np.zeros((1, COCO_WHOLEBODY_KEYPOINT_COUNT, 2), dtype=np.float32),
keypoint_scores=np.ones((1, COCO_WHOLEBODY_KEYPOINT_COUNT), dtype=np.float32),
timestamp_unix_ns=123,
keypoint_schema="coco_wholebody133",
)
payload = codec.encode(detections)
decoded = decode_pose_payload(payload)
assert decoded.frame_index == 1
assert decoded.reference_size == (640, 480)
def test_load_replay_and_synchronize(tmp_path: Path):
scene_path = tmp_path / "scene.json"
replay_path = tmp_path / "replay.jsonl"
@@ -153,4 +196,3 @@ def test_load_replay_and_synchronize(tmp_path: Path):
bundles = synchronize_frames(replay, max_skew_ns=20, min_views=2)
assert len(bundles) == 1
assert {frame.camera_name for frame in bundles[0].views} == {"cam0", "cam1"}
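The `_encode_payload` fixture above builds a compact little-endian layout: one-byte counts, `<u2` pixel coordinates, and uint8-quantized scores. A toy round-trip of just that quantization idea, using `struct` (field order and framing here are illustrative only; the real cvmmap payload carries more fields and per-person arrays):

```python
import struct


def encode_keypoints(kps_xy: list[tuple[int, int]], scores: list[float]) -> bytes:
    """Pack keypoints as little-endian uint16 coords and uint8 scores,
    each block preceded by a one-byte element count."""
    out = bytearray()
    out += struct.pack("<B", len(kps_xy))
    for x, y in kps_xy:
        out += struct.pack("<HH", x, y)
    out += struct.pack("<B", len(scores))
    for score in scores:
        out += struct.pack("<B", round(score * 255))  # 0.0..1.0 -> 0..255
    return bytes(out)


def decode_keypoints(payload: bytes) -> tuple[list[tuple[int, int]], list[float]]:
    offset = 0
    (n_kps,) = struct.unpack_from("<B", payload, offset)
    offset += 1
    kps: list[tuple[int, int]] = []
    for _ in range(n_kps):
        x, y = struct.unpack_from("<HH", payload, offset)
        offset += 4
        kps.append((x, y))
    (n_scores,) = struct.unpack_from("<B", payload, offset)
    offset += 1
    scores = [payload[offset + i] / 255 for i in range(n_scores)]
    return kps, scores
```

The `shape[0]` fix in the hunk above matters for exactly this reason: the count byte must describe elements of the leading axis (persons), not the flattened array size, or the decoder's offsets drift.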
+5 -5
View File
@@ -5,9 +5,9 @@ import pytest
pytest.importorskip("rpt")
from pose_tracking_exp.joints import BODY20_INDEX_BY_NAME
from pose_tracking_exp.models import CameraCalibration, CameraFrame, FrameBundle, ProposalCluster, SceneConfig, TrackerConfig
from pose_tracking_exp.tracker import PoseTracker
from pose_tracking_exp.common.joints import BODY20_INDEX_BY_NAME
from pose_tracking_exp.schema import CameraCalibration, CameraFrame, FrameBundle, ProposalCluster, SceneConfig, TrackerConfig
from pose_tracking_exp.tracking import PoseTracker
def _make_scene() -> SceneConfig:
@@ -96,7 +96,7 @@ def test_single_person_mode_caps_active_tracks(monkeypatch) -> None:
tracker = PoseTracker(
_make_scene(),
TrackerConfig(
mode="single_person",
max_active_tracks=1,
tentative_min_age=1,
tentative_hits_required=1,
tentative_promote_score=0.0,
@@ -127,7 +127,7 @@ def test_single_person_mode_reuses_lost_track_id(monkeypatch) -> None:
tracker = PoseTracker(
_make_scene(),
TrackerConfig(
mode="single_person",
max_active_tracks=1,
tentative_min_age=1,
tentative_hits_required=1,
tentative_promote_score=0.0,
+3 -3
View File
@@ -6,9 +6,9 @@ import pytest
pytest.importorskip("rpt")
from pose_tracking_exp.models import CameraFrame, FrameBundle, PoseDetection, TrackerConfig
from pose_tracking_exp.replay import load_scene_file
from pose_tracking_exp.tracker import PoseTracker
from pose_tracking_exp.schema import CameraFrame, FrameBundle, PoseDetection, TrackerConfig
from pose_tracking_exp.tracking import PoseTracker
from pose_tracking_exp.tracking.replay_io import load_scene_file
RPT_ROOT = Path("/home/crosstyan/Code/RapidPoseTriangulation")
Generated
+2304 -199
View File
File diff suppressed because it is too large
+1 -3
View File
@@ -1,3 +1 @@
[[index]]
url = "https://pypi.org/simple"
default = true
no-build-isolation-package = ["chumpy", "xtcocotools"]
Binary file not shown.
Binary file not shown.