feat: Add camera affinity calculations and iterative processing in playground.py

- Introduced `calculate_camera_affinity_matrix` to compute the affinity between trackings and the detections of a single camera, improving modularity.
- Added `process_detections_iteratively` to match detections camera by camera, so cost scales linearly with the number of cameras and non-synchronized frames can be handled.
- Added type hints and documentation for the new functions, clarifying parameters and return values.
- Refactored the existing affinity calculation logic around the new functions for better organization and readability.
2025-04-28 16:11:57 +08:00
parent a31f609133
commit ebcd38eb52

playground.py

@@ -12,6 +12,8 @@
 # name: python3
 # ---
+from collections import OrderedDict
+
 # %%
 from copy import copy as shallow_copy
 from copy import deepcopy
@@ -22,10 +24,12 @@ from pathlib import Path
 from typing import (
     Any,
     Generator,
+    Mapping,
     Optional,
     Sequence,
     TypeAlias,
     TypedDict,
+    TypeVar,
     cast,
     overload,
 )
@@ -41,8 +45,8 @@ from IPython.display import display
 from jaxtyping import Array, Float, Num, jaxtyped
 from matplotlib import pyplot as plt
 from numpy.typing import ArrayLike
+from scipy.optimize import linear_sum_assignment
 from scipy.spatial.transform import Rotation as R
-from collections import OrderedDict
 
 from app.camera import (
     Camera,
@@ -787,6 +791,7 @@ def calculate_tracking_detection_affinity(
     return jnp.sum(total_affinity).item()
 
 
+# %%
 @beartype
 def calculate_affinity_matrix(
     trackings: Sequence[Tracking],
@@ -847,7 +852,7 @@ def calculate_affinity_matrix(
     for i, tracking in enumerate(trackings):
         j = 0
-        for c, camera_detections in detection_by_camera.items():
+        for _, camera_detections in detection_by_camera.items():
             for det in camera_detections:
                 affinity_value = calculate_tracking_detection_affinity(
                     tracking,
@@ -864,6 +869,155 @@ def calculate_affinity_matrix(
     return affinity, detection_by_camera
 
 
+@beartype
+def calculate_camera_affinity_matrix(
+    trackings: Sequence[Tracking],
+    camera_detections: Sequence[Detection],
+    w_2d: float,
+    alpha_2d: float,
+    w_3d: float,
+    alpha_3d: float,
+    lambda_a: float,
+) -> Float[Array, "T D"]:
+    """
+    Calculate an affinity matrix between trackings and detections from a single camera.
+
+    This follows the iterative camera-by-camera approach from the paper
+    "Cross-View Tracking for Multi-Human 3D Pose Estimation at over 100 FPS".
+    Instead of creating one large matrix for all cameras, this creates
+    a separate matrix per camera, which can be processed independently.
+
+    Args:
+        trackings: Sequence of tracking objects
+        camera_detections: Sequence of detection objects, all from the same camera
+        w_2d: Weight for 2D affinity
+        alpha_2d: Normalization factor for 2D distance
+        w_3d: Weight for 3D affinity
+        alpha_3d: Normalization factor for 3D distance
+        lambda_a: Decay rate for time difference
+
+    Returns:
+        Affinity matrix of shape (T, D), where T is the number of trackings
+        (rows) and D is the number of detections from this camera (columns).
+
+    Matrix Layout:
+        ```
+                  | Detections from camera c |
+                  |  d1    d2    d3    ...   |
+        ----------+--------------------------+
+        Track 1   |  a11   a12   a13   ...   |
+        Track 2   |  a21   a22   a23   ...   |
+        ...       |  ...   ...   ...   ...   |
+        Track t   |  at1   at2   at3   ...   |
+        ```
+
+        Each cell aij is the affinity between tracking i and detection j,
+        computed from both 2D and 3D geometric correspondences.
+    """
+
+    def verify_all_detection_from_same_camera(detections: Sequence[Detection]) -> bool:
+        if not detections:
+            return True
+        camera_id = next(iter(detections)).camera.id
+        return all(d.camera.id == camera_id for d in detections)
+
+    if not verify_all_detection_from_same_camera(camera_detections):
+        raise ValueError("All detections must be from the same camera")
+
+    affinity = jnp.zeros((len(trackings), len(camera_detections)))
+    for i, tracking in enumerate(trackings):
+        for j, det in enumerate(camera_detections):
+            affinity_value = calculate_tracking_detection_affinity(
+                tracking,
+                det,
+                w_2d=w_2d,
+                alpha_2d=alpha_2d,
+                w_3d=w_3d,
+                alpha_3d=alpha_3d,
+                lambda_a=lambda_a,
+            )
+            affinity = affinity.at[i, j].set(affinity_value)
+    return affinity
+
+
+@beartype
+def process_detections_iteratively(
+    trackings: Sequence[Tracking],
+    detections: Sequence[Detection],
+    w_2d: float = 1.0,
+    alpha_2d: float = 1.0,
+    w_3d: float = 1.0,
+    alpha_3d: float = 1.0,
+    lambda_a: float = 0.1,
+) -> list[tuple[int, Detection]]:
+    """
+    Process detections iteratively camera by camera, matching them to trackings.
+
+    This implements the paper's approach where each camera is processed
+    independently and the affinity matrix is calculated for one camera at
+    a time. This has several advantages:
+
+    1. Computational cost scales linearly with the number of cameras
+    2. Non-synchronized camera frames can be handled
+    3. More efficient for large-scale camera systems
+
+    Args:
+        trackings: Sequence of tracking objects
+        detections: Sequence of detection objects, possibly from several cameras
+        w_2d: Weight for 2D affinity
+        alpha_2d: Normalization factor for 2D distance
+        w_3d: Weight for 3D affinity
+        alpha_3d: Normalization factor for 3D distance
+        lambda_a: Decay rate for time difference
+
+    Returns:
+        List of (tracking_index, detection) pairs representing matches. Since
+        each camera is assigned independently, a tracking index may appear
+        once per camera.
+    """
+    # Group detections by camera
+    detection_by_camera = classify_by_camera(detections)
+
+    # Store matches between trackings and detections
+    matches: list[tuple[int, Detection]] = []
+
+    # Process each camera one by one
+    for _, camera_detections in detection_by_camera.items():
+        # Calculate the affinity matrix for this camera only
+        camera_affinity = calculate_camera_affinity_matrix(
+            trackings,
+            camera_detections,
+            w_2d=w_2d,
+            alpha_2d=alpha_2d,
+            w_3d=w_3d,
+            alpha_3d=alpha_3d,
+            lambda_a=lambda_a,
+        )
+
+        # Apply the Hungarian algorithm to this camera only
+        tracking_indices, detection_indices = linear_sum_assignment(
+            camera_affinity, maximize=True
+        )
+        tracking_indices = cast(Sequence[int], tracking_indices)
+        detection_indices = cast(Sequence[int], detection_indices)
+
+        # Add matches to the result, skipping zero or negative affinity
+        for t_idx, d_idx in zip(tracking_indices, detection_indices):
+            if camera_affinity[t_idx, d_idx] <= 0:
+                continue
+            matches.append((t_idx, camera_detections[d_idx]))
+
+    return matches
+
+
 # %%
 # let's do cross-view association
 W_2D = 1.0
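
A minimal usage sketch of the new per-camera path, not part of the commit: it assumes `trackings` and `detections` are the same sequences the notebook feeds to `calculate_affinity_matrix` below, and relies on the function's default weights.

```
# Hypothetical usage: one call runs grouping, per-camera affinity, and
# per-camera Hungarian assignment (defaults: w_2d=1.0, ..., lambda_a=0.1).
matches = process_detections_iteratively(trackings, detections)

# A tracking index may be matched once per camera, so group the matched
# detections by tracking before any downstream 3D update.
from collections import defaultdict

per_tracking: defaultdict[int, list[Detection]] = defaultdict(list)
for t_idx, det in matches:
    per_tracking[t_idx].append(det)
```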
@@ -885,3 +1039,31 @@ affinity, detection_by_camera = calculate_affinity_matrix(
     lambda_a=LAMBDA_A,
 )
 display(affinity)
+
+# %%
+T = TypeVar("T")
+
+
+def flatten_values(
+    d: Mapping[Any, Sequence[T]],
+) -> list[T]:
+    """
+    Flatten a mapping of sequences into a single list of values.
+    """
+    return [v for vs in d.values() for v in vs]
+
+
+detections_sorted = flatten_values(detection_by_camera)
+display(detections_sorted)
+display(detection_by_camera)
+
+# %%
+# Hungarian assignment on the global affinity matrix (all cameras at once)
+indices_T, indices_D = linear_sum_assignment(affinity, maximize=True)
+indices_T = cast(Sequence[int], indices_T)
+indices_D = cast(Sequence[int], indices_D)
+display(indices_T)
+display(indices_D)
+
+# %%
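
For contrast with the per-camera loop, a short sketch of turning this global assignment back into pairs; again not part of the commit, and assuming only the objects defined above. The columns of `affinity` follow the same camera-then-detection order as `detections_sorted`, because both iterate `detection_by_camera` in insertion order, so `indices_D` indexes directly into it:

```
# Recover (tracking index, Detection) pairs from the global assignment,
# applying the same positive-affinity filter as the per-camera path.
global_matches = [
    (t_idx, detections_sorted[d_idx])
    for t_idx, d_idx in zip(indices_T, indices_D)
    if affinity[t_idx, d_idx] > 0
]
display(global_matches)
```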