From a4cc34f599b7bb6ee71c925d852e92860e47db3c Mon Sep 17 00:00:00 2001
From: crosstyan
Date: Sun, 27 Apr 2025 16:56:49 +0800
Subject: [PATCH] feat: Enhance playground.py with new 3D tracking and
 affinity calculations

- Added functions for calculating perpendicular distances between predicted
  3D tracking points and camera rays, improving 3D tracking accuracy.
- Introduced a new function for calculating 3D affinity scores based on
  perpendicular distances and time differences, integrating 3D tracking into
  the existing cross-view association.
- Updated existing functions to support new data types and improved the
  documentation of parameters and return values.
- Refactored affinity calculation logic to use JAX for vectorized distance
  computations.
---
 app/camera/__init__.py |  15 +++-
 playground.py          | 156 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 154 insertions(+), 17 deletions(-)

diff --git a/app/camera/__init__.py b/app/camera/__init__.py
index 174b682..9aa25f5 100644
--- a/app/camera/__init__.py
+++ b/app/camera/__init__.py
@@ -103,8 +103,10 @@ def unproject_points_onto_plane(
     (i.e. back-project points onto a plane)
 
+    `intersect_image_rays_with_plane`/`compute_ray_plane_intersections`
+
     Args:
-        points_2d: [..., 2] image pixel coordinates
+        points_2d: [..., 2] image pixel coordinates (with camera distortion)
         plane_normal: (3,) normal vector of the plane in world coords
         plane_point: (3,) a known point on the plane in world coords
         K: Camera intrinsic matrix
@@ -118,7 +120,7 @@
     Returns:
         [..., 3] world-space intersection points
     """
-    # Step 1: undistort (no-op here)
+    # Step 1: undistort
     pts = undistort_points(
         np.asarray(points_2d), np.asarray(K), np.asarray(dist_coeffs)
     )
@@ -313,6 +315,13 @@ class CameraParams:
         object.__setattr__(self, "_proj", pm)
         return pm
 
+    @property
+    def location(self) -> Num[Array, "3"]:
+        """
+        The 3D location of the camera in the world coordinate system
+        """
+        return self.pose_matrix[:3, -1].reshape((3,))
+
 
 @jaxtyped(typechecker=beartype)
 @dataclass(frozen=True)
@@ -390,7 +399,7 @@ class Camera:
         Un-project 2D points to 3D points on a plane at z = constant.
 
         Args:
-            points_2d: 2D points in pixel coordinates
+            points_2d: 2D points in pixel coordinates (with camera distortion)
             z: z-coordinate of the plane (default: 0.0, i.e. ground/horizon/floor plane)
 
         Returns:
diff --git a/playground.py b/playground.py
index f1d0a96..779746e 100644
--- a/playground.py
+++ b/playground.py
@@ -568,7 +568,7 @@ def calculate_distance_2d(
     left: Num[Array, "J 2"],
     right: Num[Array, "J 2"],
     image_size: tuple[int, int] = (1, 1),
-):
+) -> Float[Array, "J"]:
     """
     Calculate the *normalized* distance between two sets of keypoints.
 
@@ -576,6 +576,9 @@
         left: The left keypoints
         right: The right keypoints
        image_size: The size of the image
+
+    Returns:
+        Array of normalized Euclidean distances between corresponding keypoints
     """
     w, h = image_size
     if w == 1 and h == 1:
@@ -590,25 +593,41 @@
 @jaxtyped(typechecker=beartype)
 def calculate_affinity_2d(
-    distance_2d: float, w_2d: float, alpha_2d: float, lambda_a: float, delta_t: float
+    distance_2d: Float[Array, "J"],
+    delta_t: timedelta,
+    w_2d: float,
+    alpha_2d: float,
+    lambda_a: float,
 ) -> float:
     """
-    Calculate the affinity between two detections based on the distance between their keypoints.
+    Calculate the affinity between two detections based on the distances between their keypoints.
+
+    The affinity score is the sum of the per-keypoint affinities (delta_t in seconds):
+    A_2D = sum(w_2D * (1 - distance_2D / (alpha_2D * delta_t)) * exp(-lambda_a * delta_t))
 
     Args:
-        distance_2d: The normalized distance between the two keypoints (see `calculate_distance_2d`)
-        w_2d: The weight of the distance (parameter)
-        alpha_2d: The alpha value for the distance calculation (parameter)
-        lambda_a: The lambda value for the distance calculation (parameter)
-        delta_t: The time delta between the two detections, in seconds
+        distance_2d: The normalized distances between keypoints (one value per keypoint; see `calculate_distance_2d`)
+        delta_t: The time delta between the two detections
+        w_2d: The weight for 2D affinity
+        alpha_2d: The normalization factor for distance
+        lambda_a: The decay rate for time difference
+
+    Returns:
+        Sum of affinity scores across all keypoints
     """
-    return w_2d * (1 - distance_2d / (alpha_2d * delta_t)) * np.exp(-lambda_a * delta_t)
+    delta_t_s = delta_t.total_seconds()
+    affinity_per_keypoint = (
+        w_2d
+        * (1 - distance_2d / (alpha_2d * delta_t_s))
+        * jnp.exp(-lambda_a * delta_t_s)
+    )
+    return jnp.sum(affinity_per_keypoint).item()
 
 
 @jaxtyped(typechecker=beartype)
 def perpendicular_distance_point_to_line_two_points(
-    point: Num[Array, "2"], line: tuple[Num[Array, "2"], Num[Array, "2"]]
-):
+    point: Num[Array, "3"], line: tuple[Num[Array, "3"], Num[Array, "3"]]
+) -> Float[Array, ""]:
     """
     Calculate the perpendicular distance between a point and a line.
 
@@ -621,20 +640,106 @@
     return distance
 
 
+@jaxtyped(typechecker=beartype)
+def perpendicular_distance_camera_2d_points_to_tracking_raycasting(
+    detection: Detection,
+    tracking: Tracking,
+    delta_t: timedelta,
+) -> Float[Array, "J"]:
+    """
+    Calculate the perpendicular distances between predicted 3D tracking points
+    and the rays cast from the camera center through the 2D image points.
+
+    Args:
+        detection: The detection object containing 2D keypoints and camera parameters
+        tracking: The tracking object containing 3D keypoints
+        delta_t: Time delta between the tracking's last update and the current observation
+
+    Returns:
+        Array of perpendicular distances, one per keypoint
+    """
+    camera = detection.camera
+    # Convert timedelta to seconds for prediction
+    delta_t_s = delta_t.total_seconds()
+
+    # Predict the 3D pose based on tracking and delta_t
+    predicted_pose = predict_pose_3d(tracking, delta_t_s)
+
+    # Back-project the 2D points onto the z=0 plane; each result lies on its viewing ray
+    back_projected_points = camera.unproject_points_to_z_plane(
+        detection.keypoints, z=0.0
+    )
+
+    # Get the camera center from the camera parameters
+    camera_center = camera.params.location
+
+    # Distance between a predicted point and the ray through its 2D observation
+    def calc_distance(predicted_point, back_projected_point):
+        return perpendicular_distance_point_to_line_two_points(
+            predicted_point, (camera_center, back_projected_point)
+        )
+
+    # Vectorize over all keypoints
+    vmap_calc_distance = jax.vmap(calc_distance)
+
+    # Calculate and return distances for all keypoints
+    return vmap_calc_distance(predicted_pose, back_projected_points)
+
+
+@jaxtyped(typechecker=beartype)
+def calculate_affinity_3d(
+    distances: Float[Array, "J"],
+    delta_t: timedelta,
+    w_3d: float,
+    alpha_3d: float,
+    lambda_a: float,
+) -> float:
+    """
+    Calculate the 3D affinity score between a tracking and a detection.
+
+    The affinity score is the sum of the per-keypoint affinities (delta_t in seconds):
+    A_3D = sum(w_3D * (1 - dl / alpha_3D) * exp(-lambda_a * delta_t))
+
+    Args:
+        distances: Array of perpendicular distances (dl above), one per keypoint
+        delta_t: Time difference between tracking and detection
+        w_3d: Weight for 3D affinity
+        alpha_3d: Normalization factor for distance
+        lambda_a: Decay rate for time difference
+
+    Returns:
+        Sum of affinity scores across all keypoints
+    """
+    delta_t_s = delta_t.total_seconds()
+    affinity_per_keypoint = (
+        w_3d * (1 - distances / alpha_3d) * jnp.exp(-lambda_a * delta_t_s)
+    )
+
+    # Sum affinities across all keypoints
+    return jnp.sum(affinity_per_keypoint).item()
+
+
 def predict_pose_3d(
     tracking: Tracking,
-    delta_t: float,
+    delta_t_s: float,
 ) -> Float[Array, "J 3"]:
     """
     Predict the 3D pose of a tracking based on its velocity.
     """
     if tracking.velocity is None:
         return tracking.keypoints
-    return tracking.keypoints + tracking.velocity * delta_t
+    return tracking.keypoints + tracking.velocity * delta_t_s
 
 
 # %%
 # let's do cross-view association
+W_2D = 1.0
+ALPHA_2D = 1.0
+W_3D = 1.0
+ALPHA_3D = 1.0
+LAMBDA_A = 0.1
+
 trackings = sorted(global_tracking_state.trackings.values(), key=lambda x: x.id)
 unmatched_detections = shallow_copy(next_group)
 # cross-view association matrix with shape (T, D), where T is the number of
@@ -647,12 +752,35 @@ unmatched_detections = shallow_copy(next_group)
 #
 # where T <- [t1..tt]; D <- join(c1..cc), where `cn` is a collection of
 # detections from camera `n`
-affinity = np.zeros((len(trackings), len(unmatched_detections)))
+affinity = jnp.zeros((len(trackings), len(unmatched_detections)))
 detection_by_camera = classify_by_camera(unmatched_detections)
 for i, tracking in enumerate(trackings):
+    j = 0
     for c, detections in detection_by_camera.items():
         camera = next(iter(detections)).camera
         # pixel space, unnormalized
         tracking_2d_projection = camera.project(tracking.keypoints)
         for det in detections:
-            ...
+            delta_t = det.timestamp - tracking.last_active_timestamp
+            distance_2d = calculate_distance_2d(tracking_2d_projection, det.keypoints)
+            affinity_2d = calculate_affinity_2d(
+                distance_2d,
+                delta_t,
+                w_2d=W_2D,
+                alpha_2d=ALPHA_2D,
+                lambda_a=LAMBDA_A,
+            )
+            distances = perpendicular_distance_camera_2d_points_to_tracking_raycasting(
+                det, tracking, delta_t
+            )
+            affinity_3d = calculate_affinity_3d(
+                distances,
+                delta_t,
+                w_3d=W_3D,
+                alpha_3d=ALPHA_3D,
+                lambda_a=LAMBDA_A,
+            )
+            affinity_sum = affinity_2d + affinity_3d
+            affinity = affinity.at[i, j].set(affinity_sum)
+            j += 1
+display(affinity)
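
Not part of the patch: a minimal sketch of how the (T, D) affinity matrix built above could be consumed, assuming SciPy is available. Note that column j follows the camera-grouped iteration order of `detection_by_camera`, not the order of `unmatched_detections`, so the detections are flattened the same way here; `AFFINITY_MIN` is a hypothetical threshold, not from the original code.

import numpy as np
from scipy.optimize import linear_sum_assignment

AFFINITY_MIN = 0.0  # hypothetical: reject matches with non-positive affinity

# Flatten detections in the same order the loop above assigns column j
# (dict iteration preserves insertion order in Python 3.7+)
detections_in_column_order = [
    det for dets in detection_by_camera.values() for det in dets
]

# SciPy expects a NumPy array; maximize=True because higher affinity is better
affinity_np = np.asarray(affinity)
rows, cols = linear_sum_assignment(affinity_np, maximize=True)

matches = [
    (trackings[i], detections_in_column_order[j])
    for i, j in zip(rows, cols)
    if affinity_np[i, j] > AFFINITY_MIN
]

A single global assignment is a simplification: it permits at most one matched detection per tracking across all cameras, whereas cross-view association typically allows one match per camera. The sketch is only meant to make the meaning of the (T, D) matrix concrete.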