feat!: reorganize detection and tracking pipeline

Refactor the package into common, schema, detection, and tracking namespaces and move dataset-specific ActualTest utilities into tests/support. Add a pluggable detection stack with typed protocols, pydantic-settings config, loguru-based runner logging, cvmmap and headless video sources, NATS and parquet sinks, and a structured coco_wholebody133 payload path. Teach tracking replay loading to consume parquet detection directories directly, preserve empty frames, and keep the video-to-parquet-to-tracking workflow usable for offline E2E runs. Vendor the local mmcv and xtcocotools wheels under Git LFS, update uv sources/lock state, and refresh the mmcv build so mmcv.ops loads successfully with the current torch+cu130 environment.
2026-03-26 16:24:27 +08:00
parent f1a2372b3c
commit 2c0d51ab31
56 changed files with 5179 additions and 889 deletions
@@ -4,11 +4,17 @@ from pathlib import Path

 import numpy as np

-from pose_tracking_exp.joints import BODY20_INDEX_BY_NAME
-from pose_tracking_exp.normalization import normalize_rtmpose_body20
-from pose_tracking_exp.parajumping import PROTOCOL_HEADER, convert_payload_record, decode_pose_payload
-from pose_tracking_exp.replay import load_replay_file, load_scene_file
-from pose_tracking_exp.sync import synchronize_frames
+from pose_tracking_exp.common.joints import BODY20_INDEX_BY_NAME
+from pose_tracking_exp.common.normalization import normalize_coco_body20, normalize_rtmpose_body20
+from pose_tracking_exp.detection.cvmmap_payload import (
+    COCO_WHOLEBODY_KEYPOINT_COUNT,
+    PROTOCOL_HEADER,
+    CvmmapPosePayloadCodec,
+    convert_payload_record,
+    decode_pose_payload,
+)
+from pose_tracking_exp.schema.detection import PoseDetections
+from pose_tracking_exp.tracking import load_replay_file, load_scene_file, synchronize_frames


 def _encode_payload(
@@ -31,7 +37,7 @@ def _encode_payload(
        + np.asarray(box_scores, dtype=np.uint8).tobytes()
        + int(keypoints_xy.shape[0]).to_bytes(1, "little")
        + np.asarray(keypoints_xy, dtype="<u2").tobytes()
-        + int(keypoint_scores.size).to_bytes(1, "little")
+        + int(keypoint_scores.shape[0]).to_bytes(1, "little")
        + np.asarray(keypoint_scores, dtype=np.uint8).reshape(-1).tobytes()
        + int(timestamp_unix_ns).to_bytes(8, "little")
    )
@@ -54,6 +60,23 @@ def test_normalize_rtmpose_body20_derives_midpoints_and_head():
    np.testing.assert_allclose(normalized[BODY20_INDEX_BY_NAME["head"], :2], [20.0, 8.0])


+def test_normalize_coco17_body20_derives_midpoints_and_head():
+    keypoints = np.zeros((17, 2), dtype=np.float64)
+    scores = np.zeros((17,), dtype=np.float64)
+    keypoints[5] = [10.0, 20.0]
+    keypoints[6] = [30.0, 20.0]
+    keypoints[11] = [12.0, 60.0]
+    keypoints[12] = [28.0, 60.0]
+    keypoints[0] = [20.0, 8.0]
+    scores[[0, 5, 6, 11, 12]] = 1.0
+
+    normalized = normalize_coco_body20(keypoints, scores, keypoint_schema="coco17")
+
+    np.testing.assert_allclose(normalized[BODY20_INDEX_BY_NAME["hip_middle"], :2], [20.0, 60.0])
+    np.testing.assert_allclose(normalized[BODY20_INDEX_BY_NAME["shoulder_middle"], :2], [20.0, 20.0])
+    np.testing.assert_allclose(normalized[BODY20_INDEX_BY_NAME["head"], :2], [20.0, 8.0])
+
+
 def test_decode_payload_and_convert_record():
    keypoints_xy = np.zeros((1, 133, 2), dtype=np.uint16)
    keypoint_scores = np.zeros((1, 133), dtype=np.uint8)
@@ -87,6 +110,26 @@ def test_decode_payload_and_convert_record():
    assert converted["frame_index"] == 7


+def test_encode_pose_payload_requires_coco_wholebody133():
+    codec = CvmmapPosePayloadCodec()
+    detections = PoseDetections(
+        source_name="cam0",
+        frame_index=1,
+        source_size=(640, 480),
+        boxes_xyxy=np.zeros((1, 4), dtype=np.float32),
+        box_scores=np.ones((1,), dtype=np.float32),
+        keypoints_xy=np.zeros((1, COCO_WHOLEBODY_KEYPOINT_COUNT, 2), dtype=np.float32),
+        keypoint_scores=np.ones((1, COCO_WHOLEBODY_KEYPOINT_COUNT), dtype=np.float32),
+        timestamp_unix_ns=123,
+        keypoint_schema="coco_wholebody133",
+    )
+
+    payload = codec.encode(detections)
+    decoded = decode_pose_payload(payload)
+    assert decoded.frame_index == 1
+    assert decoded.reference_size == (640, 480)
+
+
 def test_load_replay_and_synchronize(tmp_path: Path):
    scene_path = tmp_path / "scene.json"
    replay_path = tmp_path / "replay.jsonl"
@@ -153,4 +196,3 @@ def test_load_replay_and_synchronize(tmp_path: Path):
    bundles = synchronize_frames(replay, max_skew_ns=20, min_views=2)
    assert len(bundles) == 1
    assert {frame.camera_name for frame in bundles[0].views} == {"cam0", "cam1"}
-