Files
CVTH3PE/app/camera/__init__.py
crosstyan 3cc93e5eae feat: Enhance play notebook with new data structures and visualization utilities
- Added new TypedDict classes for camera parameters, including Resolution, Intrinsic, and Extrinsic.
- Updated dataset reading logic to accommodate new camera parameters structure.
- Introduced functions for reading datasets by port and visualizing whole body keypoints.
- Improved the affinity matrix calculation logic in the camera module.
- Updated dependencies in pyproject.toml to include Plotly and SciPy for enhanced functionality.
2025-04-16 18:53:05 +08:00

390 lines
12 KiB
Python

from collections import OrderedDict, defaultdict
from dataclasses import dataclass
from datetime import datetime
from typing import Any, TypeAlias, TypedDict
from beartype import beartype
from jax import Array
from jax import numpy as jnp
from jaxtyping import Num, jaxtyped
from typing_extensions import NotRequired
CameraID: TypeAlias = str # pylint: disable=invalid-name
@jaxtyped(typechecker=beartype)
@dataclass
class CameraParams:
"""
Camera parameters: intrinsic matrix, extrinsic matrix, and distortion coefficients
"""
K: Num[Array, "3 3"]
"""
intrinsic matrix
"""
Rt: Num[Array, "4 4"]
"""
[R|t] extrinsic matrix
R and t are the rotation and translation that describe the change of
coordinates from world to camera coordinate systems (or camera frame)
"""
dist_coeffs: Num[Array, "N"]
"""
An array of distortion coefficients of the form
[k1, k2, [p1, p2, [k3]]], where ki is the ith
radial distortion coefficient and pi is the ith
tangential distortion coeff.
"""
image_size: Num[Array, "2"]
"""
The size of image plane (width, height)
"""
@jaxtyped(typechecker=beartype)
@dataclass
class Camera:
"""
a description of a camera
"""
id: CameraID
"""
Camera ID
"""
params: CameraParams
"""
Camera parameters
"""
@jaxtyped(typechecker=beartype)
@dataclass
class Detection:
"""
One detection from a camera
"""
keypoints: Num[Array, "N 2"]
"""
Keypoints
"""
confidences: Num[Array, "N"]
"""
Confidences
"""
camera: Camera
"""
Camera
"""
timestamp: datetime
"""
Timestamp of the detection
"""
def classify_by_camera(
detections: list[Detection],
) -> OrderedDict[CameraID, list[Detection]]:
"""
Classify detections by camera
"""
# or use setdefault
camera_wise_split: dict[CameraID, list[Detection]] = defaultdict(list)
for entry in detections:
camera_id = entry.camera.id
camera_wise_split[camera_id].append(entry)
return OrderedDict(camera_wise_split)
@jaxtyped(typechecker=beartype)
def to_homogeneous(points: Num[Array, "N 2"] | Num[Array, "N 3"]) -> Num[Array, "N 3"]:
"""
Convert points to homogeneous coordinates
"""
if points.shape[-1] == 2:
return jnp.hstack((points, jnp.ones((points.shape[0], 1))))
elif points.shape[-1] == 3:
return points
else:
raise ValueError(f"Invalid shape for points: {points.shape}")
@jaxtyped(typechecker=beartype)
def point_line_distance(
points: Num[Array, "N 3"] | Num[Array, "N 2"],
line: Num[Array, "N 3"],
eps: float = 1e-9,
):
"""
Calculate the distance from a point to a line
Args:
point: (possibly homogeneous) points :math:`(N, 2 or 3)`.
line: lines coefficients :math:`(a, b, c)` with shape :math:`(N, 3)`, where :math:`ax + by + c = 0`.
eps: Small constant for safe sqrt.
Returns:
the computed distance with shape :math:`(N)`.
See also:
https://en.wikipedia.org/wiki/Distance_from_a_point_to_a_line
"""
numerator = abs(line[:, 0] * points[:, 0] + line[:, 1] * points[:, 1] + line[:, 2])
denominator = jnp.sqrt(line[:, 0] * line[:, 0] + line[:, 1] * line[:, 1])
return numerator / (denominator + eps)
@jaxtyped(typechecker=beartype)
def left_to_right_epipolar_distance(
left: Num[Array, "N 3"],
right: Num[Array, "N 3"],
fundamental_matrix: Num[Array, "3 3"],
):
"""
Return one-sided epipolar distance for correspondences given the fundamental matrix.
Args:
left: points in the left image (homogeneous) :math:`(N, 3)`
right: points in the right image (homogeneous) :math:`(N, 3)`
fundamental_matrix: fundamental matrix :math:`(3, 3)`
Returns:
the computed distance with shape :math:`(N)`.
See also:
https://en.wikipedia.org/wiki/Fundamental_matrix_%28computer_vision%29
$$x^{\\prime T}Fx = 0$$
"""
F_t = fundamental_matrix.transpose()
line1_in_2 = jnp.matmul(left, F_t)
return point_line_distance(right, line1_in_2)
@jaxtyped(typechecker=beartype)
def right_to_left_epipolar_distance(
left: Num[Array, "N 3"],
right: Num[Array, "N 3"],
fundamental_matrix: Num[Array, "3 3"],
):
"""
Return one-sided epipolar distance for correspondences given the fundamental matrix.
Args:
left: points in the left image (homogeneous) :math:`(N, 3)`
right: points in the right image (homogeneous) :math:`(N, 3)`
fundamental_matrix: fundamental matrix :math:`(3, 3)`
Returns:
the computed distance with shape :math:`(N)`.
See also:
https://en.wikipedia.org/wiki/Fundamental_matrix_%28computer_vision%29
$$x^{\\prime T}Fx = 0$$
"""
line2_in_1 = jnp.matmul(right, fundamental_matrix)
return point_line_distance(left, line2_in_1)
def distance_between_epipolar_lines(
x1: Num[Array, "N 2"] | Num[Array, "N 3"],
x2: Num[Array, "N 2"] | Num[Array, "N 3"],
fundamental_matrix: Num[Array, "3 3"],
):
"""
Calculate the total epipolar line distance between x1 and x2.
"""
if x1.shape[0] != x2.shape[0]:
raise ValueError(
f"x1 and x2 must have the same number of points: {x1.shape[0]} != {x2.shape[0]}"
)
if x1.shape[-1] == 2:
points1 = to_homogeneous(x1)
elif x1.shape[-1] == 3:
points1 = x1
else:
raise ValueError(f"Invalid shape for correspondence1: {x1.shape}")
if x2.shape[-1] == 2:
points2 = to_homogeneous(x2)
elif x2.shape[-1] == 3:
points2 = x2
else:
raise ValueError(f"Invalid shape for correspondence2: {x2.shape}")
if fundamental_matrix.shape != (3, 3):
raise ValueError(
f"Invalid shape for fundamental_matrix: {fundamental_matrix.shape}"
)
# points 1 and 2 are unnormalized points
dist_1 = jnp.mean(
right_to_left_epipolar_distance(points1, points2, fundamental_matrix)
)
dist_2 = jnp.mean(
left_to_right_epipolar_distance(points1, points2, fundamental_matrix)
)
distance = dist_1 + dist_2
return distance
@jaxtyped(typechecker=beartype)
def calculate_fundamental_matrix(
camera_left: Camera, camera_right: Camera
) -> Num[Array, "3 3"]:
"""
Calculate the fundamental matrix for the given cameras.
"""
# Intrinsics
K1 = camera_left.params.K
K2 = camera_right.params.K
# Extrinsics (World to Camera transforms)
Rt1 = camera_left.params.Rt
Rt2 = camera_right.params.Rt
# Convert to Camera to World (Inverse)
T2: Array = jnp.linalg.inv(Rt2)
# Relative transform from Left to Right
T_rel = T2 @ Rt1
R = T_rel[:3, :3]
t = T_rel[:3, 3:]
# Skew-symmetric matrix for cross product
def skew(v: Num[Array, "3"]) -> Num[Array, "3 3"]:
return jnp.array(
[
[0, -v[2], v[1]],
[v[2], 0, -v[0]],
[-v[1], v[0], 0],
]
)
t_skew = skew(t.reshape(-1))
# Essential Matrix
E = t_skew @ R
# Fundamental Matrix
F = jnp.linalg.inv(K2).T @ E @ jnp.linalg.inv(K1)
return F
def compute_affinity_epipolar_constraint_with_pairs(
left: Detection, right: Detection, alpha_2d: float
):
"""
Compute the affinity between two groups of detections by epipolar constraint,
where camera parameters are included in the detections.
Note:
Originally, alpha_2d comes from the paper as a scaling factor for epipolar error affinity.
Its role is mainly to normalize error into [0,1] range, but it could lead to negative affinity.
An alternative approach is using normalized epipolar error relative to image size, with soft cutoff,
like exp(-error / threshold), for better interpretability and stability.
"""
fundamental_matrix = calculate_fundamental_matrix(left.camera, right.camera)
d = distance_between_epipolar_lines(
left.keypoints, right.keypoints, fundamental_matrix
)
return 1 - (d / alpha_2d)
def calculate_affinity_matrix_by_epipolar_constraint(
detections: list[Detection] | dict[CameraID, list[Detection]],
alpha_2d: float,
) -> tuple[list[Detection], Num[Array, "N N"]]:
"""
Calculate the affinity matrix by epipolar constraint
This function evaluates the geometric consistency of every pair of detections
across different cameras using the fundamental matrix. It assumes that
detections from the same camera are not comparable and should have zero affinity.
The affinity is computed by:
1. Calculating the fundamental matrix between the two cameras.
2. Measuring the average point-to-epipolar-line distance for all keypoints.
3. Mapping the distance to affinity with the formula: 1 - (distance / alpha_2d).
Args:
detections: Either a flat list of Detection or a dict grouping Detection by CameraID.
alpha_2d: Image resolution-dependent threshold controlling affinity scaling.
Typically relates to expected pixel displacement rate.
Returns:
sorted_detections: Flattened list of detections sorted by camera order.
affinity_matrix: Array of shape (N, N), where N is the number of detections.
Notes:
- Detections from the same camera always have affinity = 0.
- Affinity decays linearly with epipolar error until 0 (or potentially negative).
- Consider switching to exp(-error / scale) style for non-negative affinity.
- alpha_2d should be adjusted based on image resolution or empirical observation.
Affinity Matrix layout:
assuming we have 3 cameras
C0 has 3 detections: D0_C0, D1_C0, D2_C0
C1 has 2 detections: D0_C1, D1_C1
C2 has 2 detections: D0_C2, D1_C2
D0_C0(0), D1_C0(1), D2_C0(2), D0_C1(3), D1_C1(4), D0_C2(5), D1_C2(6)
D0_C0(0) 0 0 0 a_03 a_04 a_05 a_06
D1_C0(1) 0 0 0 a_13 a_14 a_15 a_16
...
D0_C1(3) a_30 a_31 a_32 0 0 a_35 a_36
...
D1_C2(6) a_60 a_61 a_62 a_63 a_64 0 0
"""
if isinstance(detections, dict):
camera_wise_split = detections
else:
camera_wise_split = classify_by_camera(detections)
num_entries = sum(len(entries) for entries in camera_wise_split.values())
affinity_matrix = jnp.ones((num_entries, num_entries), dtype=jnp.float32) * -jnp.inf
affinity_matrix_mask = jnp.zeros((num_entries, num_entries), dtype=jnp.bool_)
acc = 0
total_indices = set(range(num_entries))
camera_id_index_map: dict[CameraID, set[int]] = defaultdict(set)
camera_id_index_map_inverse: dict[CameraID, set[int]] = defaultdict(set)
# sorted by [D0_C0, D1_C0, D2_C0, D0_C1, D1_C1, D0_C2, D1_C2...]
sorted_detections: list[Detection] = []
for camera_id, entries in camera_wise_split.items():
for i, _ in enumerate(entries):
camera_id_index_map[camera_id].add(acc)
sorted_detections.append(entries[i])
acc += 1
camera_id_index_map_inverse[camera_id] = (
total_indices - camera_id_index_map[camera_id]
)
# ignore self-affinity
# ignore same-camera affinity
# assuming commutative
for i, det in enumerate(sorted_detections):
other_indices = camera_id_index_map_inverse[det.camera.id]
for j in other_indices:
if i == j:
continue
if affinity_matrix_mask[i, j] or affinity_matrix_mask[j, i]:
continue
a = compute_affinity_epipolar_constraint_with_pairs(
det, sorted_detections[j], alpha_2d
)
affinity_matrix = affinity_matrix.at[i, j].set(a)
affinity_matrix = affinity_matrix.at[j, i].set(a)
affinity_matrix_mask = affinity_matrix_mask.at[i, j].set(True)
affinity_matrix_mask = affinity_matrix_mask.at[j, i].set(True)
return sorted_detections, affinity_matrix