From a4cc34f599b7bb6ee71c925d852e92860e47db3c Mon Sep 17 00:00:00 2001
From: crosstyan
Date: Sun, 27 Apr 2025 16:56:49 +0800
Subject: [PATCH] feat: Enhance playground.py with new 3D tracking and
 affinity calculations

- Added functions for calculating perpendicular distances between predicted
  3D tracking points and camera rays, improving 3D tracking accuracy.
- Introduced a new function for calculating 3D affinity scores based on
  perpendicular distances and time differences, integrating 3D tracking into
  the existing cross-view association.
- Updated existing functions to support new data types and improved the
  documentation of parameters and return values.
- Refactored affinity calculation logic to use JAX for vectorized distance
  computations.
---
 app/camera/__init__.py |  15 +++-
 playground.py          | 156 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 154 insertions(+), 17 deletions(-)

diff --git a/app/camera/__init__.py b/app/camera/__init__.py
index 174b682..9aa25f5 100644
--- a/app/camera/__init__.py
+++ b/app/camera/__init__.py
@@ -103,8 +103,10 @@ def unproject_points_onto_plane(
     (i.e. back-project points onto a plane)
 
+    `intersect_image_rays_with_plane`/`compute_ray_plane_intersections`
+
     Args:
-        points_2d: [..., 2] image pixel coordinates
+        points_2d: [..., 2] image pixel coordinates (with camera distortion)
         plane_normal: (3,) normal vector of the plane in world coords
         plane_point: (3,) a known point on the plane in world coords
         K: Camera intrinsic matrix
@@ -118,7 +120,7 @@
     Returns:
         [..., 3] world-space intersection points
     """
-    # Step 1: undistort (no-op here)
+    # Step 1: undistort
     pts = undistort_points(
         np.asarray(points_2d), np.asarray(K), np.asarray(dist_coeffs)
     )
@@ -313,6 +315,13 @@ class CameraParams:
         object.__setattr__(self, "_proj", pm)
         return pm
 
+    @property
+    def location(self) -> Num[Array, "3"]:
+        """
+        The 3D location of the camera in the world coordinate system
+        """
+        return self.pose_matrix[:3, -1].reshape((3,))
+
 
 @jaxtyped(typechecker=beartype)
 @dataclass(frozen=True)
@@ -390,7 +399,7 @@ class Camera:
         Un-project 2D points to 3D points on a plane at z = constant.
 
         Args:
-            points_2d: 2D points in pixel coordinates
+            points_2d: 2D points in pixel coordinates (with camera distortion)
             z: z-coordinate of the plane (default: 0.0, i.e. ground/horizon/floor plane)
 
         Returns:
diff --git a/playground.py b/playground.py
index f1d0a96..779746e 100644
--- a/playground.py
+++ b/playground.py
@@ -568,7 +568,7 @@ def calculate_distance_2d(
     left: Num[Array, "J 2"],
     right: Num[Array, "J 2"],
     image_size: tuple[int, int] = (1, 1),
-):
+) -> Float[Array, "J"]:
     """
     Calculate the *normalized* distance between two sets of keypoints.
 
@@ -576,6 +576,9 @@
         left: The left keypoints
         right: The right keypoints
        image_size: The size of the image
+
+    Returns:
+        Array of normalized Euclidean distances between corresponding keypoints
     """
     w, h = image_size
     if w == 1 and h == 1:
@@ -590,25 +593,41 @@
 @jaxtyped(typechecker=beartype)
 def calculate_affinity_2d(
-    distance_2d: float, w_2d: float, alpha_2d: float, lambda_a: float, delta_t: float
+    distance_2d: Float[Array, "J"],
+    delta_t: timedelta,
+    w_2d: float,
+    alpha_2d: float,
+    lambda_a: float,
 ) -> float:
     """
-    Calculate the affinity between two detections based on the distance between their keypoints.
+    Calculate the affinity between two detections based on the distances between their keypoints.
+
+    The affinity score is the sum of the per-keypoint affinities (delta_t in seconds):
+    A_2D = sum(w_2D * (1 - distance_2D / (alpha_2D * delta_t)) * exp(-lambda_a * delta_t))
 
     Args:
-        distance_2d: The normalized distance between the two keypoints (see `calculate_distance_2d`)
-        w_2d: The weight of the distance (parameter)
-        alpha_2d: The alpha value for the distance calculation (parameter)
-        lambda_a: The lambda value for the distance calculation (parameter)
-        delta_t: The time delta between the two detections, in seconds
+        distance_2d: The normalized distances between keypoints (one value per keypoint; see `calculate_distance_2d`)
+        delta_t: The time delta between the two detections
+        w_2d: The weight for 2D affinity
+        alpha_2d: The normalization factor for distance
+        lambda_a: The decay rate for time difference
+
+    Returns:
+        Sum of affinity scores across all keypoints
     """
-    return w_2d * (1 - distance_2d / (alpha_2d * delta_t)) * np.exp(-lambda_a * delta_t)
+    delta_t_s = delta_t.total_seconds()
+    affinity_per_keypoint = (
+        w_2d
+        * (1 - distance_2d / (alpha_2d * delta_t_s))
+        * jnp.exp(-lambda_a * delta_t_s)
+    )
+    return jnp.sum(affinity_per_keypoint).item()
 
 
 @jaxtyped(typechecker=beartype)
 def perpendicular_distance_point_to_line_two_points(
-    point: Num[Array, "2"], line: tuple[Num[Array, "2"], Num[Array, "2"]]
-):
+    point: Num[Array, "3"], line: tuple[Num[Array, "3"], Num[Array, "3"]]
+) -> Float[Array, ""]:
     """
     Calculate the perpendicular distance between a point and a line.
 
@@ -621,20 +640,106 @@
     return distance
 
 
+@jaxtyped(typechecker=beartype)
+def perpendicular_distance_camera_2d_points_to_tracking_raycasting(
+    detection: Detection,
+    tracking: Tracking,
+    delta_t: timedelta,
+) -> Float[Array, "J"]:
+    """
+    Calculate the perpendicular distances between predicted 3D tracking points
+    and the rays cast from the camera center through the 2D image points.
+
+    Args:
+        detection: The detection object containing 2D keypoints and camera parameters
+        tracking: The tracking object containing 3D keypoints
+        delta_t: Time delta between the tracking's last update and the current observation
+
+    Returns:
+        Array of perpendicular distances, one per keypoint
+    """
+    camera = detection.camera
+    # Convert timedelta to seconds for prediction
+    delta_t_s = delta_t.total_seconds()
+
+    # Predict the 3D pose based on tracking and delta_t
+    predicted_pose = predict_pose_3d(tracking, delta_t_s)
+
+    # Back-project the 2D points onto the z=0 plane; each result lies on its viewing ray
+    back_projected_points = camera.unproject_points_to_z_plane(
+        detection.keypoints, z=0.0
+    )
+
+    # Get the camera center from the camera parameters
+    camera_center = camera.params.location
+
+    # Distance between a predicted point and the ray through its 2D observation
+    def calc_distance(predicted_point, back_projected_point):
+        return perpendicular_distance_point_to_line_two_points(
+            predicted_point, (camera_center, back_projected_point)
+        )
+
+    # Vectorize over all keypoints
+    vmap_calc_distance = jax.vmap(calc_distance)
+
+    # Calculate and return distances for all keypoints
+    return vmap_calc_distance(predicted_pose, back_projected_points)
+
+
+@jaxtyped(typechecker=beartype)
+def calculate_affinity_3d(
+    distances: Float[Array, "J"],
+    delta_t: timedelta,
+    w_3d: float,
+    alpha_3d: float,
+    lambda_a: float,
+) -> float:
+    """
+    Calculate the 3D affinity score between a tracking and a detection.
+
+    The affinity score is the sum of the per-keypoint affinities (delta_t in seconds):
+    A_3D = sum(w_3D * (1 - dl / alpha_3D) * exp(-lambda_a * delta_t))
+
+    Args:
+        distances: Array of perpendicular distances (dl above), one per keypoint
+        delta_t: Time difference between tracking and detection
+        w_3d: Weight for 3D affinity
+        alpha_3d: Normalization factor for distance
+        lambda_a: Decay rate for time difference
+
+    Returns:
+        Sum of affinity scores across all keypoints
+    """
+    delta_t_s = delta_t.total_seconds()
+    affinity_per_keypoint = (
+        w_3d * (1 - distances / alpha_3d) * jnp.exp(-lambda_a * delta_t_s)
+    )
+
+    # Sum affinities across all keypoints
+    return jnp.sum(affinity_per_keypoint).item()
+
+
 def predict_pose_3d(
     tracking: Tracking,
-    delta_t: float,
+    delta_t_s: float,
 ) -> Float[Array, "J 3"]:
     """
     Predict the 3D pose of a tracking based on its velocity.
     """
     if tracking.velocity is None:
         return tracking.keypoints
-    return tracking.keypoints + tracking.velocity * delta_t
+    return tracking.keypoints + tracking.velocity * delta_t_s
 
 
 # %%
 # let's do cross-view association
+W_2D = 1.0
+ALPHA_2D = 1.0
+W_3D = 1.0
+ALPHA_3D = 1.0
+LAMBDA_A = 0.1
+
 trackings = sorted(global_tracking_state.trackings.values(), key=lambda x: x.id)
 unmatched_detections = shallow_copy(next_group)
 # cross-view association matrix with shape (T, D), where T is the number of
@@ -647,12 +752,35 @@ unmatched_detections = shallow_copy(next_group)
 #
 # where T <- [t1..tt]; D <- join(c1..cc), where `cn` is a collection of
 # detections from camera `n`
-affinity = np.zeros((len(trackings), len(unmatched_detections)))
+affinity = jnp.zeros((len(trackings), len(unmatched_detections)))
 detection_by_camera = classify_by_camera(unmatched_detections)
 for i, tracking in enumerate(trackings):
+    j = 0
     for c, detections in detection_by_camera.items():
         camera = next(iter(detections)).camera
         # pixel space, unnormalized
         tracking_2d_projection = camera.project(tracking.keypoints)
         for det in detections:
-            ...
+            delta_t = det.timestamp - tracking.last_active_timestamp
+            distance_2d = calculate_distance_2d(tracking_2d_projection, det.keypoints)
+            affinity_2d = calculate_affinity_2d(
+                distance_2d,
+                delta_t,
+                w_2d=W_2D,
+                alpha_2d=ALPHA_2D,
+                lambda_a=LAMBDA_A,
+            )
+            distances = perpendicular_distance_camera_2d_points_to_tracking_raycasting(
+                det, tracking, delta_t
+            )
+            affinity_3d = calculate_affinity_3d(
+                distances,
+                delta_t,
+                w_3d=W_3D,
+                alpha_3d=ALPHA_3D,
+                lambda_a=LAMBDA_A,
+            )
+            affinity_sum = affinity_2d + affinity_3d
+            affinity = affinity.at[i, j].set(affinity_sum)
+            j += 1
+display(affinity)
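
Not part of the patch: a minimal sketch of how the (T, D) affinity matrix built above could be consumed, assuming SciPy is available. Note that column j follows the camera-grouped iteration order of `detection_by_camera`, not the order of `unmatched_detections`, so the detections are flattened the same way here; `AFFINITY_MIN` is a hypothetical threshold, not from the original code.

import numpy as np
from scipy.optimize import linear_sum_assignment

AFFINITY_MIN = 0.0  # hypothetical: reject matches with non-positive affinity

# Flatten detections in the same order the loop above assigns column j
# (dict iteration preserves insertion order in Python 3.7+)
detections_in_column_order = [
    det for dets in detection_by_camera.values() for det in dets
]

# SciPy expects a NumPy array; maximize=True because higher affinity is better
affinity_np = np.asarray(affinity)
rows, cols = linear_sum_assignment(affinity_np, maximize=True)

matches = [
    (trackings[i], detections_in_column_order[j])
    for i, j in zip(rows, cols)
    if affinity_np[i, j] > AFFINITY_MIN
]

A single global assignment is a simplification: it permits at most one matched detection per tracking across all cameras, whereas cross-view association typically allows one match per camera. The sketch is only meant to make the meaning of the (T, D) matrix concrete.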