From a06b3e83998d950465bb73c81197ac331ac436d2 Mon Sep 17 00:00:00 2001 From: crosstyan Date: Tue, 4 Mar 2025 17:34:27 +0800 Subject: [PATCH] feat: Add perpendicular distance calculation utility function Implemented a utility function `calculate_perpendicular_distance` in the `app/utils/__init__.py` module. The function calculates the perpendicular distance between a point and a line using NumPy, with type hints and runtime type checking using jaxtyping and typeguard. --- app/utils/__init__.py | 23 + ..._Estimation_at_Over_100_CVPR_2020_paper.md | 14 +- paper/note.md | 800 ++++++++++++++++++ 3 files changed, 835 insertions(+), 2 deletions(-) create mode 100644 paper/note.md diff --git a/app/utils/__init__.py b/app/utils/__init__.py index e69de29..e016da1 100644 --- a/app/utils/__init__.py +++ b/app/utils/__init__.py @@ -0,0 +1,23 @@ +from typing import Any + +import numpy as np +from jaxtyping import Float, Num, jaxtyped +from typeguard import typechecked + +from app._typing import NDArray + + +@jaxtyped(typechecker=typechecked) +def calculate_perpendicular_distance( + point: Num[NDArray, "2"], + line: Num[NDArray, "2 2"], +) -> np.floating[Any]: + """ + Calculate the perpendicular distance between a point and a line. + """ + line_start, line_end = line + distance = np.linalg.norm( + np.cross(line_end - line_start, line_start - point) + ) / np.linalg.norm(line_end - line_start) + + return distance diff --git a/paper/Chen_Cross-View_Tracking_for_Multi-Human_3D_Pose_Estimation_at_Over_100_CVPR_2020_paper.md b/paper/Chen_Cross-View_Tracking_for_Multi-Human_3D_Pose_Estimation_at_Over_100_CVPR_2020_paper.md index 82fbfb3..b07d7a7 100644 --- a/paper/Chen_Cross-View_Tracking_for_Multi-Human_3D_Pose_Estimation_at_Over_100_CVPR_2020_paper.md +++ b/paper/Chen_Cross-View_Tracking_for_Multi-Human_3D_Pose_Estimation_at_Over_100_CVPR_2020_paper.md @@ -4,13 +4,23 @@ Estimating 3D poses of multiple humans in real-time is a classic but still challenging task in computer vision. Its major difficulty lies in the ambiguity in cross-view association of 2D poses and the huge state space when there are multiple people in multiple views. In this paper, we present a novel solution for multi-human 3D pose estimation from multiple calibrated camera views. It takes 2D poses in different camera coordinates as inputs and aims for the accurate 3D poses in the global coordinate. Unlike previous methods that associate 2D poses among all pairs of views from scratch at every frame, we exploit the temporal consistency in videos to match the 2D inputs with 3D poses directly in 3-space. More specifically, we propose to retain the 3D pose for each person and update them iteratively via the cross-view multi-human tracking. This novel formulation improves both accuracy and efficiency, as we demonstrated on widely-used public datasets. To further verify the scalability of our method, we propose a new large-scale multihuman dataset with 12 to 28 camera views. Without bells and whistles, our solution achieves 154 FPS on 12 cameras and 34 FPS on 28 cameras, indicating its ability to handle large-scale real-world applications. The proposed dataset will be released at [here](https://github.com/longcw/crossview_3d_pose_tracking). -tected 2D body joints from multiple views as inputs with the advance of 2D human pose estimation [9, 11, 35], and address the 3D pose estimation in a two-step formulation [2, 13]. Specifically, the 2D joints of the same person are first matched and associated across views, the 3D location of each joint is subsequently determined by a multi-view reconstruction method. In this formulation, the challenge comes from three parts: 1) the detected 2D joints are noisy and inaccurate since the pose estimation is imperfect; 2) the cross-view association is ambiguous when multiple people interacting with each other in crowded scenes; 3) the computational complexity explodes as the number of people and number of cameras increase. ## 1. Introduction Multi-human 3D pose estimation from videos has a wide range of applications, including action recognition, sports analysis, and human-computer interaction. With the rapid development of deep neural network, most of the recent efforts in this area have been devoted to monocular 3D pose estimation [25, 26] . However, despite much progress, the single-camera setting is still far from being resolved due to the large variations of human poses and partial occlusion in the monocular views. A natural solution for these problems is to recover the 3D poses from multiple camera views. -Recent multi-view approaches generally employ the de- +Recent multi-view approaches generally employ the detected 2D body joints from +multiple views as inputs with the advance of 2D human pose estimation [9, 11, +35], and address the 3D pose estimation in a two-step formulation [2, 13]. +Specifically, the 2D joints of the same person are first matched and associated +across views, the 3D location of each joint is subsequently determined by a +multi-view reconstruction method. In this formulation, the challenge comes from +three parts: + +1. the detected 2D joints are noisy and inaccurate since the pose +estimation is imperfect; +2. the cross-view association is ambiguous when multiple people interacting with each other in crowded scenes; +3. the computational complexity explodes as the number of people and number of cameras increase. To tackle the problem of cross-view association, 3D pictorial structure model (3DPS) is widely used in some previous methods [2, 8], where the 3D poses are recovered from 2D joints in a discretized 3-space. In this formulation, the likelihood of a joint belonging to a spatial bin is given by the geometric consistency [16], along with a pre-defined body structure model. A severe problem of 3DPS is the expensive computational cost due to the huge state space with multiple people in multiple views. As an improvement, Dong et al . [13] propose solving the cross-view association problem at the body level in advance before applying 3DPS. They associate 2D poses of the same person from different views as clusters and estimate 3D poses from the clusters via 3DPS. Nevertheless, matching 2D poses between all pairs of views still makes the computational complexity explode as the number of cameras increases. diff --git a/paper/note.md b/paper/note.md new file mode 100644 index 0000000..a0ee0db --- /dev/null +++ b/paper/note.md @@ -0,0 +1,800 @@ +LLM generated notes for me + + + +Let me provide a more thorough explanation of the graph theory concepts involved in this problem and how they relate to the implementation. I'll break it down step by step: + +1. Graph representation of the problem +2. What cycle consistency means in graph theory terms +3. How binary integer programming (BIP) formulates this problem +4. How the code implements these concepts + + + +# Understanding Graph Partitioning with Cycle Consistency + +I'll explain the graph theory concepts more thoroughly to help you understand how the implementation works. + +## 1. The Basic Problem + +The paper is trying to associate 2D human detections across multiple camera views. This is essentially a **clustering problem**: we want to group detections from different cameras that correspond to the same person. + +## 2. Graph Representation + +We can represent this problem as a graph: +- Each **node** represents a 2D detection from a camera +- An **edge** between two nodes indicates that we believe those detections correspond to the same person +- The **weight** of an edge (the affinity) indicates how confident we are that the detections match + +## 3. What is Cycle Consistency? + +Cycle consistency is a crucial property in multi-view association: + +- If detection A matches with detection B +- And detection B matches with detection C +- Then detection A should match with detection C + +This creates a **transitive relationship**: if A→B and B→C, then A→C must be true. + +Let's visualize this with a simple example: +- Camera 1 sees person X and produces detection A +- Camera 2 sees person X and produces detection B +- Camera 3 sees person X and produces detection C + +If our algorithm correctly matches A→B and B→C, then it must also match A→C, because they all represent the same person X. + +## 4. Binary Variables in the Formulation + +In the mathematical formulation: +- `y_ij = 1` means "detection i and detection j match" +- `y_ij = 0` means "detection i and detection j don't match" + +## 5. The Cycle Consistency Constraint + +The paper formulates this constraint as: +``` +y_ij + y_jk ≤ 1 + y_ik +``` + +Let's break down what this means: + +- If y_ij = 1 and y_jk = 1 (i.e., i matches j and j matches k), then the left side equals 2 +- For the inequality to hold, y_ik must be 1 (i.e., i must match k) +- If either y_ij = 0 or y_jk = 0, then the constraint is always satisfied regardless of y_ik + +The code implements this as: +``` +x_ij + x_ik - x_jk <= 1 +``` + +These are equivalent constraints. To see this: +- Rearrange the paper's constraint: y_ij + y_jk - y_ik ≤ 1 +- The code uses x_ij + x_ik - x_jk <= 1 (with different indices, but the same structure) + +## 6. How the Implementation Works + +Let's walk through the key parts of the implementation: + +### a. Creating the BIP Problem + +The `_create_bip` method does the following: +1. Takes an affinity matrix as input (higher values indicate stronger matches) +2. Identifies pairs of nodes (detections) to potentially match +3. Creates constraints for all triplets of nodes to enforce cycle consistency +4. Encodes these constraints in a format suitable for the solver + +The key section is where it creates the cycle consistency constraints: + +```python +# create matrix whose rows are the indices of the three edges in a +# constraint x_ij + x_ik - x_jk <= 1 +constraints_edges_idx = [] +if n_nodes >= 3: + # ... code to identify triplets of nodes ... + + triplets = np.asarray( + tuple(itertools.combinations(range(n_nodes), 3)), dtype=int + ) + constraints_edges_idx = np.zeros_like(triplets) + # ... code to add constraints for each triplet ... +``` + +### b. Solving the BIP Problem + +The `_solve_bip` method in `GLPKSolver`: +1. Converts the objective and constraints to the format required by the solver +2. Calls the integer linear programming solver +3. Returns the solution as a binary vector + +### c. Extracting Clusters + +The `solution_mat_clusters` method: +1. Takes the binary solution matrix (where a 1 indicates a match) +2. Identifies connected components in the graph +3. Returns these as clusters + +```python +def solution_mat_clusters(solution_mat: NDArray): + n: int = solution_mat.shape[0] + labels = np.arange(1, n + 1) + for i in range(n): + for j in range(i + 1, n): + if solution_mat[i, j] > 0: + labels[j] = labels[i] + # ... code to group by labels ... + return list(clusters.values()) +``` + +This algorithm assigns the same label to all nodes that should be in the same cluster, effectively finding the connected components. + +## 7. Concrete Example + +Let's say we have 5 detections (nodes) and the following initial matches: +- Node 1 matches Node 2 (y_12 = 1) +- Node 2 matches Node 3 (y_23 = 1) +- Node 3 matches Node 4 (y_34 = 1) +- Node 5 is separate + +The solver will enforce cycle consistency, so it will also set: +- y_13 = 1 (because y_12 = 1 and y_23 = 1) +- y_14 = 1 (because y_13 = 1 and y_34 = 1) +- y_24 = 1 (because y_23 = 1 and y_34 = 1) + +The final clusters will be: +- Cluster 1: Nodes 1, 2, 3, 4 (all connected by transitivity) +- Cluster 2: Node 5 (no connections) + +This clustering respects the cycle consistency property, ensuring that all nodes in a cluster are connected directly or indirectly. + +--- + + +I'll provide a more thorough explanation of the graph theory and convex optimization concepts used in this problem, starting from the basics. + + +# Understanding Graph Partitioning and Cycle Consistency: From the Basics + +Let me explain the concepts behind the graph partitioning approach used in this paper, starting with fundamental ideas and building up to the implementation details. + +## 1. The Basic Problem: Cross-View Association + +In multi-camera tracking, we have: +- Multiple cameras observing a scene +- 2D detections of people in each camera view +- The need to associate detections of the same person across different camera views + +This is challenging because: +- We don't know in advance which detections correspond to the same person +- Multiple people can look similar +- Occlusions can hide parts of people in some views + +## 2. Graph Theory Basics + +### What is a Graph? +- A graph consists of nodes (or vertices) and edges connecting them +- In our context: + - Nodes represent 2D detections from different camera views + - Edges represent potential associations between detections + - Edge weights (affinities) represent how likely two detections are of the same person + +### What is Graph Partitioning? +Graph partitioning is dividing the nodes of a graph into clusters where: +- Nodes in the same cluster are strongly connected (high affinity) +- Nodes in different clusters are weakly connected (low affinity) + +In our tracking problem, each resulting cluster should contain all the detections of a single person across different camera views. + +## 3. The Association Problem as a Graph + +Let's visualize this with a simple example: +- Camera 1 has detections A, B +- Camera 2 has detections C, D +- Camera 3 has detections E, F + +We create a graph where: +- We only connect detections from different cameras (A-C, A-D, B-C, B-D, etc.) +- Edge weights represent affinity scores (from epipolar geometry) +- Our goal is to partition this graph so each cluster contains at most one detection per camera + +## 4. The Cycle Consistency Constraint + +Here's where cycle consistency becomes important: + +Imagine three detections: i, j, and k from three different cameras. If: +- i and j are of the same person (matched) +- j and k are of the same person (matched) + +Then logically: +- i and k must also be of the same person (matched) + +This is the "cycle consistency" property. Without it, we could get inconsistent associations where: +- i and j are matched +- j and k are matched +- But i and k are not matched + +This would be illogical since these are supposed to be the same person viewed from different cameras. + +## 5. Binary Integer Programming (BIP) + +To solve this problem mathematically: + +### Variables +- For each pair of detections i and j, we define a binary variable y_ij: + - y_ij = 1 if i and j are matched (same person) + - y_ij = 0 if i and j are not matched (different people) + +### Objective Function +- We want to maximize the sum of affinities for matched pairs: + - Maximize: ∑ a_ij × y_ij + - Where a_ij is the affinity score between detections i and j + +### Constraints +- The cycle consistency constraint: + - For any three detections i, j, and k: + - y_ij + y_jk ≤ 1 + y_ik + - This ensures that if i matches j and j matches k, then i must match k + +This is a binary integer programming (BIP) problem because: +- Variables can only be 0 or 1 (binary) +- We have linear constraints +- We have a linear objective function to optimize + +## 6. Understanding the Constraint Mathematics + +Let's break down the constraint: y_ij + y_jk ≤ 1 + y_ik + +Looking at all possible cases: +1. If y_ij = 0 and y_jk = 0: The constraint becomes 0 ≤ 1 + y_ik, which is always true +2. If y_ij = 1 and y_jk = 0: The constraint becomes 1 ≤ 1 + y_ik, which is always true +3. If y_ij = 0 and y_jk = 1: The constraint becomes 1 ≤ 1 + y_ik, which is always true +4. If y_ij = 1 and y_jk = 1: The constraint becomes 2 ≤ 1 + y_ik, which means y_ik must be 1 + +The last case is the key: If i matches j and j matches k, then i must match k (y_ik = 1). This enforces cycle consistency. + +In the code, this constraint is written as: x_ij + x_ik - x_jk ≤ 1 + +This is algebraically equivalent: +- y_ij + y_jk ≤ 1 + y_ik +- y_ij + y_jk - y_ik ≤ 1 +- Which is equivalent to x_ij + x_ik - x_jk ≤ 1 with appropriate variable substitution + +## 7. How the Code Implements This + +The implementation follows these steps: + +1. **Build the affinity matrix**: + - Calculate affinity scores between all pairs of detections + - Higher affinity means more likely to be the same person + +2. **Set up the BIP**: + - Define binary variables for each potential match + - Set the objective to maximize the sum of affinities + - Add cycle consistency constraints for all triplets of detections + +3. **Solve using GLPK**: + - GLPK (GNU Linear Programming Kit) is used to solve the BIP + - It finds the values of the binary variables that maximize the objective while satisfying all constraints + +4. **Extract clusters**: + - Connected components in the resulting graph form clusters + - Each cluster represents one person's detections across multiple views + +## 8. Specific Implementation Details + +In the `_create_bip` method: + +1. It processes the affinity matrix and identifies "frozen" edges (definite matches or non-matches) +2. It generates triplets of nodes to create cycle consistency constraints +3. It sets up the constraint coefficients (1, 1, -1) corresponding to x_ij + x_ik - x_jk ≤ 1 +4. It handles permutations and special cases + +The `GLPKSolver._solve_bip` method: +1. Sets up the GLPK solver with the objective and constraints +2. Solves the binary integer programming problem +3. Returns the optimal values of the binary variables + +The `solution_mat_clusters` method: +1. Converts the solution back to a matrix format +2. Identifies connected components (clusters) in the resulting graph +3. Returns these clusters as the final association result + +This implementation correctly enforces cycle consistency, ensuring that the resulting associations across camera views are logically consistent. + +--- + +# Analysis of BIPSolver for Cycle-Consistent Graph Partitioning + +This implementation correctly solves the cycle-consistency graph partitioning problem used for multi-view detection association in the paper. Let me explain how it works: + +## 1. Core Concept +The code implements Binary Integer Programming (BIP) to find optimal clusters of 2D detections across camera views while enforcing cycle-consistency. Each variable in the BIP represents a potential match between two detections. + +## 2. How It Enforces Cycle-Consistency + +The key insight is in the `_create_bip` method, which implements cycle-consistency through **triplet constraints**: + +```python +triplets = np.asarray(tuple(itertools.combinations(range(n_nodes), 3)), dtype=int) +``` + +For each triplet of nodes (i,j,k), the code creates three constraints: +1. `x_ij + x_ik - x_jk ≤ 1` +2. `x_ij - x_ik + x_jk ≤ 1` (via rotation) +3. `-x_ij + x_ik + x_jk ≤ 1` (via rotation) + +These constraints together ensure that: +- If `x_ij = 1` and `x_ik = 1`, then `x_jk` must be 1 (nodes form a complete subgraph) +- If `x_ij = 1` and `x_jk = 1`, then `x_ik` must be 1 +- If `x_ik = 1` and `x_jk = 1`, then `x_ij` must be 1 + +This is exactly the mathematical formulation of cycle-consistency mentioned in the paper. + +## 3. Key Components + +### a. Affinity Handling +```python +frozen_pos_mask = affinities >= max_affinity # Definitely connected +frozen_neg_mask = affinities <= min_affinity # Definitely not connected +unfrozen_mask = np.logical_not(frozen_pos_mask | frozen_neg_mask) # To be determined +``` +- Edges with very high affinity are "frozen positive" (automatically included) +- Edges with very low affinity are "frozen negative" (automatically excluded) +- Only "unfrozen" edges are variables in the optimization + +### b. Constraint Generation +```python +constraints_edges_idx = np.vstack(( + constraints_edges_idx, + np.roll(constraints_edges_idx, 1, axis=1), + np.roll(constraints_edges_idx, 2, axis=1), +)) +``` +This creates all three variants of the triplet constraints through rotation, ensuring cycle-consistency from all perspectives. + +### c. Redundancy Removal +```python +# Clean redundant constraints +constraints_edges_idx = constraints_edges_idx[constraints_edges_idx[:, 2] != FROZEN_POS_EDGE] +``` +Improves solver efficiency by removing constraints that are already satisfied by frozen edges. + +### d. Cluster Extraction +```python +def solution_mat_clusters(solution_mat: NDArray): + # ... code to extract connected components +``` +This method uses a label propagation approach to extract connected components from the solution matrix, which represent the final clusters. + +## 4. Mathematical Formulation + +The BIP problem is formulated as: + +- **Variables**: Binary variables `x_ij ∈ {0,1}` for each edge (i,j) +- **Objective**: Maximize `∑ a_ij * x_ij` (total affinity of selected edges) +- **Constraints**: `x_ij + x_ik - x_jk ≤ 1` for all triplets (i,j,k) (and rotations) + +## 5. GLPK Solver Integration + +The `GLPKSolver` class uses the CVXOPT/GLPK interface to solve the BIP: +```python +status, solution = ilp(c, G, h, B=set(range(len(c)))) +``` +- `c`: Negated objective coefficients (for maximization) +- `G`, `h`: Constraint matrix and upper bounds +- `B`: Indicates which variables are binary + +## 6. Verification of Correctness + +This implementation correctly matches the paper's description because: + +1. It maximizes the total affinity across all edges, as described in Equation 3 in the supplementary material. +2. It enforces cycle-consistency constraints exactly as formulated in Equation 5. +3. The cluster extraction method correctly identifies connected components, which represent people detected across multiple cameras. + +## 7. Real-world Example + +Suppose we have 3 detections (A, B, C) with affinities: +- A-B: 0.9 (high) +- B-C: 0.8 (high) +- A-C: 0.1 (low but positive) + +Without cycle-consistency, A-C might be considered a non-match. With this implementation: +- Variables: x_AB, x_BC, x_AC +- Constraint: x_AB + x_BC - x_AC ≤ 1 +- When solver selects x_AB = 1 and x_BC = 1, the constraint forces x_AC = 1 + +This ensures A, B, and C form a single cluster even though A-C has low affinity, maintaining global consistency. + +--- + +The implementation is indeed correct and efficiently solves the graph partitioning problem with cycle-consistency constraints, exactly as needed for the cross-view tracking approach in the paper. + +--- + +Let's break down the concepts step by step, starting from fundamental graph theory and building up to the specific optimization used in the paper. + +--- + +### **1. Graph Theory Basics** +#### **Graph Representation** +- **Nodes**: Represent entities (in this case, 2D detections from different cameras). +- **Edges**: Represent relationships between nodes (pairwise affinity scores based on epipolar constraints). +- **Affinity Matrix**: A square matrix where entry _aij_ quantifies the similarity between node _i_ and node _j_. + +#### **Cycle Consistency** +- **Intuition**: If detection _A_ matches _B_ and _B_ matches _C_, then _A_ must match _C_. +- **Mathematically**: For any triplet of nodes _(i, j, k)_: + \[ + y_{ij} + y_{jk} \leq 1 + y_{ik} + \] + - If \(y_{ij} = 1\) and \(y_{jk} = 1\), this forces \(y_{ik} = 1\). + - Ensures transitivity in matches. + +--- + +### **2. Binary Integer Programming (BIP)** +#### **What is BIP?** +- **Variables**: \(y_{ij} \in \{0, 1\}\), where \(y_{ij} = 1\) indicates nodes _i_ and _j_ are matched. +- **Objective Function**: Maximize total affinity of matched pairs: + \[ + \text{Maximize } \sum_{i,j} a_{ij} y_{ij} + \] +- **Constraints**: Enforce cycle consistency and other rules. + +#### **Why BIP?** +- **NP-Hard**: Finding the optimal solution is computationally intensive, but guarantees correctness. +- **Exact Solution**: Unlike heuristic methods, BIP ensures constraints are strictly satisfied. + +--- + +### **3. Triplet Constraints** +#### **What is a Triplet?** +- A set of three nodes _(i, j, k)_. +- **Purpose**: Encode cycle consistency for all possible 3-node combinations. + +#### **Constraint Generation** +For every triplet _(i, j, k)_, three constraints are created: +1. \(y_{ij} + y_{jk} \leq 1 + y_{ik}\) +2. \(y_{jk} + y_{ki} \leq 1 + y_{ji}\) +3. \(y_{ki} + y_{ij} \leq 1 + y_{kj}\) + +This ensures consistency across all permutations of the triplet. + +--- + +### **4. Code Breakdown** +#### **Key Steps in `BIPSolver`** +1. **Preprocessing**: + - Extract upper triangular affinity matrix to avoid duplicate edges. + - Freeze edges with extreme affinities (too high/low to be optimized). + +2. **Constraint Generation**: + - Use `itertools.combinations` to generate all 3-node triplets. + - Map edges to their indices in the flattened affinity matrix. + +3. **BIP Formulation**: + - **Objective**: Maximize affinity sum (converted to minimization via negation). + - **Constraints**: Sparse matrix encoding of triplet inequalities. + +4. **Solver**: + - Uses GLPK's integer linear programming solver (`ilp`). + - Returns binary solution vector indicating matched edges. + +5. **Cluster Extraction**: + - Convert solution vector into a symmetric adjacency matrix. + - Use label propagation to extract connected components (clusters). + +--- + +### **5. Mathematical Formulation** +#### **Variables** +- Let \(y_{ij} = 1\) if node _i_ and node _j_ are matched, else 0. + +#### **Objective** +\[ +\max \sum_{i < j} a_{ij} y_{ij} +\] + +#### **Constraints** +1. **Cycle Consistency**: + \[ + y_{ij} + y_{jk} \leq 1 + y_{ik} \quad \forall \text{ triplets } (i, j, k) + \] +2. **Binary Variables**: + \[ + y_{ij} \in \{0, 1\} + \] + +--- + +### **6. Example Walkthrough** +#### **Scenario** +- 3 detections: \(D_1\), \(D_2\), \(D_3\). +- Affinities: \(a_{12} = 0.9\), \(a_{23} = 0.8\), \(a_{13} = 0.1\). + +#### **BIP Solution** +- **Without Constraints**: Might match \(D_1\)-\(D_2\) and \(D_2\)-\(D_3\), but exclude \(D_1\)-\(D_3\). +- **With Constraints**: Forces \(y_{13} = 1\) if \(y_{12} = 1\) and \(y_{23} = 1\), forming a single cluster. + +--- + +### **7. Practical Considerations** +#### **Computational Complexity** +- **Triplet Count**: For _N_ nodes, there are \(\binom{N}{3}\) triplets → \(O(N^3)\) constraints. +- **Mitigation**: Use sparse matrices and filter redundant constraints (e.g., frozen edges). + +#### **Cluster Extraction** +- **Label Propagation**: Simple but error-prone for complex graphs. +- **Better Approach**: Union-Find (Disjoint Set Union) for efficient connected component detection. + +--- + +### **8. Summary** +- **Problem**: Associate 2D detections across cameras while ensuring cycle consistency. +- **Solution**: Formulate as BIP with triplet constraints. +- **Code Flow**: + 1. Preprocess affinities. + 2. Generate constraints from triplets. + 3. Solve BIP. + 4. Extract clusters from solution. + +This approach guarantees globally consistent associations but requires careful handling of computational complexity for large-scale problems. + +--- + +When handling multiple detections and tracking different individuals, the graph partitioning and BIP framework cluster detections through a combination of **affinity scores** and **cycle-consistency constraints**. Here's a detailed breakdown of how it works: + +--- + +### **1. Role of Affinity Scores** +- **Affinity Matrix**: Represents pairwise similarity between detections (e.g., geometric consistency from epipolar constraints). + - High affinity: Likely the same person. + - Low affinity: Likely different people. +- **Objective Function**: The solver maximizes the total affinity of selected edges: + \[ + \text{Maximize } \sum_{(i,j)} a_{ij} y_{ij}, + \] + where \(y_{ij} = 1\) if detections \(i\) and \(j\) are matched. + +--- + +### **2. Role of Cycle-Consistency Constraints** +- **Triplet Constraints**: For every triplet \((i, j, k)\), enforce: + \[ + y_{ij} + y_{jk} \leq 1 + y_{ik}. + \] +- **Purpose**: Ensures transitive relationships. If \(i \leftrightarrow j\) and \(j \leftrightarrow k\), then \(i \leftrightarrow k\) must hold. + +--- + +### **3. Partitioning into Clusters (Trackings)** +#### **A. Connected Components** +- **Clusters**: Groups of detections where all pairs are connected via high-affinity edges *and* satisfy cycle-consistency. +- **Example**: + - Detections \(D_1, D_2, D_3\) form a cluster (same person). + - Detections \(D_4, D_5\) form another cluster (different person). + +#### **B. Separation via Frozen Edges** +- **Frozen Edges**: + - \(y_{ij} = 0\) if $a_{ij} \leq \text{min affinity}$ (forced exclusion). + - \(y_{ij} = 1\) if $a_{ij} \geq \text{max affinity}$ (forced inclusion). +- **Impact**: + - Low-affinity edges between different people are frozen to 0, preventing cross-cluster connections. + - High-affinity edges within the same person are prioritized. + +--- + +### **4. Example with Multiple Detections** +#### **Input**: +- 5 detections: \(D_1, D_2, D_3\) (Person A), \(D_4, D_5\) (Person B). +- Affinities: + - High within-cluster (e.g., \(a_{12} = 0.9\), \(a_{23} = 0.8\), \(a_{45} = 0.85\)). + - Low cross-cluster (e.g., \(a_{14} = 0.1\), \(a_{25} = 0.2\)). + +#### **BIP Solution**: +1. **Within-Cluster Edges**: + - \(y_{12} = 1\), \(y_{23} = 1\), \(y_{13} = 1\) (due to cycle-consistency). + - \(y_{45} = 1\). +2. **Cross-Cluster Edges**: + - \(y_{14} = 0\), \(y_{25} = 0\) (frozen or excluded by low affinity). + +#### **Clusters**: +- Cluster 1: \(\{D_1, D_2, D_3\}\). +- Cluster 2: \(\{D_4, D_5\}\). + +--- + +### **5. Handling Noisy Affinities** +#### **Case 1: Incorrect High Affinity** +- Suppose \(a_{14} = 0.9\) (noise). +- **Without Constraints**: Solver might merge \(D_1\) and \(D_4\). +- **With Cycle-Consistency**: + - If \(D_1 \leftrightarrow D_4\) and \(D_4 \leftrightarrow D_5\), then \(D_1 \leftrightarrow D_5\) must hold. + - Since \(a_{15}\) is low, the constraint forces \(y_{14} = 0\) or \(y_{45} = 0\), preserving separate clusters. + +#### **Case 2: Incorrect Low Affinity** +- Suppose \(a_{12} = 0.3\) (missed true match). +- **Result**: Cluster for Person A may fragment unless other edges compensate. + +--- + +### **6. Code Implementation Insights** +#### **Key Steps**: +1. **Preprocessing**: + ```python + triu_mask = np.triu(np.ones_like(affinity_matrix, dtype=bool), 1) + unfrozen_mask = (affinities > min_affinity) & (affinities < max_affinity) + ``` + - Focus on upper-triangle to avoid duplicates. + - Freeze edges outside `[min_affinity, max_affinity]`. + +2. **Constraint Generation**: + ```python + triplets = itertools.combinations(range(n_nodes), 3) + ``` + - Generate constraints for all triplets to enforce cycle-consistency. + +3. **Solver**: + ```python + status, solution = ilp(c, G, h, B=set(...)) + ``` + - GLPK solver finds the optimal binary edge assignments. + +4. **Cluster Extraction**: + ```python + labels = np.arange(1, n + 1) + for i in range(n): + for j in range(i+1, n): + if solution_mat[i,j] > 0: + labels[j] = labels[i] + ``` + - Label propagation to find connected components (clusters). + +--- + +### **7. Practical Considerations** +- **Scalability**: For \(N\) detections, triplet constraints grow as \(O(N^3)\), becoming infeasible for large \(N\). Use approximations or sampling. +- **Parameter Tuning**: `min_affinity` and `max_affinity` are critical to avoid over/under-clustering. +- **Robustness**: Cycle-consistency improves reliability but depends on accurate affinity computation. + +--- + +### **8. Summary** +- **Affinity Scores**: Drive the objective function (maximize total similarity). +- **Cycle-Consistency Constraints**: Ensure logical coherence in clusters. +- **Frozen Edges**: Prevent spurious connections between different trackings. +- **Result**: Accurate, logically consistent clusters even with noisy data. + +--- + + +The user is asking for a more thorough explanation of the graph theory and convex optimization concepts behind the cross-view tracking algorithm, particularly focusing on the mathematical implications rather than just the code implementation. Let me provide a comprehensive explanation starting from the basics, using LaTeX formatting for mathematical expressions. + +I'll cover: +1. Basic graph theory concepts +2. The graph partitioning problem +3. Cycle consistency and why it's important +4. Binary Integer Programming formulation +5. How this applies to the cross-view tracking problem + + +# Understanding Graph Partitioning with Cycle Consistency in Cross-View Tracking + +Let me explain the mathematical concepts behind the graph partitioning with cycle consistency as described in the supplementary material, starting from the basics. + +## 1. Basic Graph Theory Concepts + +In graph theory, a graph $G = (V, E)$ consists of: +- A set of vertices (or nodes) $V$ +- A set of edges $E$ that connect pairs of vertices + +In the context of the cross-view tracking problem: +- Each vertex represents a detected human body (a 2D pose) from a camera view +- An edge between two vertices indicates that these two detections might correspond to the same person in 3D space +- The weight of an edge (or affinity) represents how likely these two detections belong to the same person + +## 2. The Graph Partitioning Problem + +Graph partitioning aims to divide the vertices of a graph into clusters, such that: +- Vertices within the same cluster are strongly connected (high affinity) +- Vertices from different clusters are weakly connected (low affinity) + +In the tracking context, each cluster should contain all the 2D detections of the same person from different camera views. + +### Mathematical Formulation + +For each pair of detections $D_i$ and $D_j$, we define a binary variable: +$$y_{ij} = \begin{cases} +1 & \text{if } D_i \text{ and } D_j \text{ belong to the same person} \\ +0 & \text{otherwise} +\end{cases}$$ + +We also have an affinity value $a_{ij}$ for each pair, indicating how likely they belong to the same person. The objective is to maximize the total affinity: +$$\max \sum_{\langle D_i, D_j \rangle} a_{ij} y_{ij}$$ + +## 3. Cycle Consistency Constraint + +The cycle consistency constraint is a key concept in multi-view matching. Let's understand why it's necessary: + +Imagine three detections $D_i$, $D_j$, and $D_k$ from different camera views. If: +- $D_i$ and $D_j$ belong to the same person ($y_{ij} = 1$) +- $D_j$ and $D_k$ belong to the same person ($y_{jk} = 1$) + +Then logically, $D_i$ and $D_k$ must also belong to the same person ($y_{ik} = 1$). + +This is expressed mathematically as: +$$y_{ij} + y_{jk} \leq 1 + y_{ik}$$ + +Let's verify this constraint works: +- If $y_{ij} = 1$ and $y_{jk} = 1$, the left side equals 2, forcing $y_{ik} = 1$ (as required) +- If either $y_{ij} = 0$ or $y_{jk} = 0$, the constraint is satisfied regardless of $y_{ik}$ + +Without this constraint, the optimization might produce inconsistent groupings that violate transitivity. + +## 4. Binary Integer Programming (BIP) Formulation + +The complete formulation becomes a Binary Integer Programming problem: + +$$\max \sum_{\langle D_i, D_j \rangle} a_{ij} y_{ij}$$ + +Subject to: +$$y_{ij} \in \{0, 1\}$$ +$$y_{ij} + y_{jk} \leq 1 + y_{ik} \quad \forall i,j,k$$ + +This is equivalent to the constraint form in the code: +$$y_{ij} + y_{ik} - y_{jk} \leq 1 \quad \forall i,j,k$$ + +(Just rearrange the original constraint to get this form) + +## 5. Computing the Affinity Matrix + +In cross-view tracking, the affinity between detections from different cameras is computed using epipolar geometry: + +### Epipolar Geometry + +When a 3D point is viewed from two cameras, the corresponding 2D points in the two images must satisfy the epipolar constraint: +$$\mathbf{x}_R^T F \mathbf{x}_L = 0$$ + +Where: +- $\mathbf{x}_L$ and $\mathbf{x}_R$ are the homogeneous coordinates of the 2D points +- $F$ is the fundamental matrix between the two cameras + +The affinity between two detections is then calculated as: +$$A_e(\mathbf{x}_L, \mathbf{x}_R) = 1 - \frac{d_l(\mathbf{x}_L, l_L) + d_l(\mathbf{x}_R, l_R)}{2 \cdot \alpha_{2D}}$$ + +Where: +- $l_R = F\mathbf{x}_L$ is the epipolar line in the right view +- $l_L = F^T\mathbf{x}_R$ is the epipolar line in the left view +- $d_l(\mathbf{x}, l)$ is the distance from point $\mathbf{x}$ to line $l$ +- $\alpha_{2D}$ is a threshold parameter + +## 6. Solving the BIP Problem + +Binary Integer Programming is NP-hard, but can be solved using specialized solvers like GLPK (GNU Linear Programming Kit). The algorithm: + +1. Creates variables for all pairs of detections +2. Sets up the objective function using the affinity values +3. Adds cycle consistency constraints for all triplets of detections +4. Solves the BIP problem to obtain the optimal assignment +5. Extracts clusters based on the resulting binary variables + +## 7. From Mathematics to Implementation + +In the implementation: + +1. The `_create_bip` method: + - Processes the affinity matrix + - Creates all necessary cycle consistency constraints + - Prepares the optimization problem + +2. The `_solve_bip` method: + - Uses the GLPK solver to find the optimal solution + +3. The `solution_mat_clusters` method: + - Extracts the clusters from the solution matrix + - Each cluster represents a unique person detected in multiple views + +## Conclusion + +The graph partitioning approach with cycle consistency provides a principled way to associate detections across different camera views. It ensures that the resulting associations are consistent, which is crucial for accurate 3D pose reconstruction. The BIP formulation transforms this complex matching problem into a well-defined optimization problem that can be solved efficiently with specialized solvers. +