From 435ea09451a9460de7bf055cf80d9e7f4fa24da2 Mon Sep 17 00:00:00 2001 From: crosstyan Date: Wed, 4 Feb 2026 03:25:21 +0000 Subject: [PATCH] refactor: change ClientPublisher to accept camera streams instead of local USB BREAKING CHANGE: ClientPublisher::open() signature changed from open(sl::InputType input, Trigger* ref, int sdk_gpu_id) to open(const std::string& ip, int port, Trigger* ref, int sdk_gpu_id) The ClientPublisher now receives camera streams over network using setFromStream() instead of opening local USB cameras. This allows the host to receive video from edge devices running enableStreaming(). Changes: - Use init_parameters.input.setFromStream(ip, port) for stream input - Keep body tracking and Fusion publishing (INTRA_PROCESS) unchanged - Add getSerialNumber() getter for logging/debugging - Improve error messages with context (ip:port) Co-authored-by: factory-droid[bot] <138933559+factory-droid[bot]@users.noreply.github.com> --- AGENTS.md | 138 ++++++++++++++++++++++++ CUDA_OPENGL_INTEROP_RESEARCH.md | 185 ++++++++++++++++++++++++++++++++ GLVIEWER_FIX.md | 127 ++++++++++++++++++++++ src/ClientPublisher.cpp | 2 +- src/GLViewer.cpp | 40 +++++-- 5 files changed, 484 insertions(+), 8 deletions(-) create mode 100644 AGENTS.md create mode 100644 CUDA_OPENGL_INTEROP_RESEARCH.md create mode 100644 GLVIEWER_FIX.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..c467d23 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,138 @@ +# Agent Context & Reminders + +## ZED SDK Architecture + +### Streaming API vs Fusion API + +The ZED SDK provides two distinct network APIs that are often confused: + +| Feature | Streaming API | Fusion API | +|---------|---------------|------------| +| **Data Transmitted** | Compressed video (H264/H265) | Metadata only (bodies, objects, poses) | +| **Bandwidth** | 10-40 Mbps | <100 Kbps | +| **Edge Compute** | Video encoding only | Full depth NN + tracking + detection | +| **Host Compute** | Full depth + tracking + detection | Lightweight fusion only | +| **API Methods** | `enableStreaming()` / `setFromStream()` | `startPublishing()` / `subscribe()` | + +### Key Insight + +**There is NO built-in mode for streaming computed depth maps or point clouds.** The architecture forces a choice: + +1. **Streaming API**: Edge sends video → Host computes everything (depth, tracking, detection) +2. **Fusion API**: Edge computes everything → Sends only metadata (bodies/poses) + +### Code Patterns + +#### Streaming Sender (Edge) +```cpp +sl::StreamingParameters stream_params; +stream_params.codec = sl::STREAMING_CODEC::H265; +stream_params.port = 30000; +stream_params.bitrate = 12000; +zed.enableStreaming(stream_params); +``` + +#### Streaming Receiver (Host) +```cpp +sl::InitParameters init_params; +init_params.input.setFromStream("192.168.1.100", 30000); +zed.open(init_params); +// Full ZED SDK available - depth, tracking, etc. +``` + +#### Fusion Publisher (Edge or Host) +```cpp +sl::CommunicationParameters comm_params; +comm_params.setForLocalNetwork(30000); +// or comm_params.setForIntraProcess(); for same-machine +zed.startPublishing(comm_params); +``` + +#### Fusion Subscriber (Host) +```cpp +sl::Fusion fusion; +fusion.init(init_params); +sl::CameraIdentifier cam(serial_number); +fusion.subscribe(cam, comm_params, pose); +``` + +## Project: Multi-Camera Body Tracking + +### Location +`/workspaces/zed-playground/playground/body tracking/multi-camera/cpp/` + +### Architecture +- **ClientPublisher**: Receives camera streams, runs body tracking, publishes to Fusion +- **Fusion**: Subscribes to multiple ClientPublishers, fuses body data from all cameras +- **GLViewer**: 3D visualization of fused bodies + +### Camera Configuration (Hard-coded) +From `inside_network.json`: + +| Serial | IP | Streaming Port | +|--------|-----|----------------| +| 44289123 | 192.168.128.2 | 30000 | +| 44435674 | 192.168.128.2 | 30002 | +| 41831756 | 192.168.128.2 | 30004 | +| 46195029 | 192.168.128.2 | 30006 | + +### Data Flow +``` +Edge Camera (enableStreaming) → Network Stream + ↓ +ClientPublisher (setFromStream) → Body Tracking (host) + ↓ +startPublishing() → Fusion (INTRA_PROCESS) + ↓ +Fused Bodies → GLViewer +``` + +### Build +```bash +cd "/workspaces/zed-playground/playground/body tracking/multi-camera/cpp/build" +cmake .. +make -j4 +``` + +### Run +```bash +./ZED_BodyFusion +``` + +## Related Samples + +### Camera Streaming Receiver +`/workspaces/zed-playground/playground/camera streaming/receiver/cpp/` +- Simple streaming receiver sample +- Shows basic `setFromStream()` usage with OpenCV display + +## Important Paths + +### ZED SDK Installation +- **Root**: `/usr/local/zed/` +- **Headers**: `/usr/local/zed/include/sl/` + - `Camera.hpp` - Main camera API + - `Fusion.hpp` - Fusion module API + - `CameraOne.hpp` - Single camera utilities +- **Libraries**: `/usr/local/zed/lib/` +- **CMake config**: `/usr/local/zed/zed-config.cmake` +- **Tools**: `/usr/local/zed/tools/` +- **Settings**: `/usr/local/zed/settings/` + +### Project Structure +``` +/workspaces/zed-playground/ +├── playground/ +│ ├── body tracking/multi-camera/cpp/ # This project +│ │ ├── src/ClientPublisher.cpp +│ │ ├── src/main.cpp +│ │ ├── include/ClientPublisher.hpp +│ │ └── build/ +│ └── camera streaming/receiver/cpp/ # Streaming receiver sample +└── zed_settings/ + └── inside_network.json # Camera network config +``` + +### Configuration Files +- **Camera network config**: `/workspaces/zed-playground/zed_settings/inside_network.json` +- **Fusion config** (for ZED360 tool): Generated JSON with camera poses and communication params diff --git a/CUDA_OPENGL_INTEROP_RESEARCH.md b/CUDA_OPENGL_INTEROP_RESEARCH.md new file mode 100644 index 0000000..20273ae --- /dev/null +++ b/CUDA_OPENGL_INTEROP_RESEARCH.md @@ -0,0 +1,185 @@ +# CUDA-OpenGL Interop: Jetson vs Desktop GPU Differences + +## Summary + +The CUDA-OpenGL interop issue where `cudaGraphicsMapResources` is required every frame on modern desktop GPUs (RTX 3090/5070Ti with CUDA 12.8) but works with permanent mapping on Jetson (JetPack 5.1.2) is rooted in fundamental architectural differences between integrated GPUs (iGPU) and discrete GPUs (dGPU). + +## Root Cause: Memory Architecture Differences + +### Jetson (Tegra/iGPU) - Unified Memory Architecture + +``` +┌─────────────────────────────────────────┐ +│ Jetson SoC │ +│ ┌─────────┐ ┌─────────┐ │ +│ │ CPU │◄────►│ GPU │ │ +│ │ (ARM) │ │ (iGPU) │ │ +│ └────┬────┘ └────┬────┘ │ +│ │ │ │ +│ └───────┬────────┘ │ +│ ▼ │ +│ ┌──────────┐ │ +│ │ Shared │ Unified Memory │ +│ │ DRAM │ (Zero Copy) │ +│ └──────────┘ │ +└─────────────────────────────────────────┘ +``` + +**Characteristics:** +- CPU and GPU share the **same physical memory** (SoC DRAM) +- **Unified Memory** - same pointer value works for both CPU and GPU +- **Zero-copy** possible - no data transfer needed +- OpenGL and CUDA can access the same memory without mapping/unmapping +- Memory is always "mapped" because there's only one memory space + +### Desktop GPU (dGPU) - Separate Memory Architecture + +``` +┌─────────────┐ PCIe Bus ┌─────────────┐ +│ Host PC │◄───────────────────────►│ Discrete │ +│ │ (High bandwidth but │ GPU │ +│ ┌───────┐ │ with latency) │ ┌───────┐ │ +│ │ CPU │ │ │ │ GPU │ │ +│ └───┬───┘ │ │ └───┬───┘ │ +│ │ │ │ │ │ +│ ┌───▼───┐ │ │ ┌───▼───┐ │ +│ │ RAM │ │ │ │ VRAM │ │ +│ │(Host) │ │ │ │(Device│ │ +│ └───────┘ │ │ │Memory)│ │ +└─────────────┘ │ └───────┘ │ + └─────────────┘ +``` + +**Characteristics:** +- CPU and GPU have **separate physical memories** (Host RAM vs VRAM) +- Data must be **transferred** between host and device memory +- OpenGL texture/buffer lives in GPU memory +- CUDA must **explicitly map** the resource to get a device pointer +- **Mapping/Unmapping** is required to synchronize access between OpenGL and CUDA + +## Why the Fix is Required on Desktop GPUs + +### The Problem with Permanent Mapping on dGPU + +When a resource is permanently mapped on desktop GPUs: + +1. **CUDA holds the mapping** - thinks it owns the memory +2. **OpenGL tries to use the texture/buffer** for rendering +3. **Memory coherency issues** - CUDA writes may not be visible to OpenGL +4. **Synchronization race conditions** - undefined behavior, corruption, or crashes + +### Why It Works on Jetson + +On Jetson with unified memory: + +1. **Same physical memory** - both CUDA and OpenGL access the same location +2. **No data transfer needed** - writes are immediately visible +3. **Cache coherency handled by hardware** - no explicit synchronization required +4. **Mapping is essentially a no-op** - just gives a pointer to shared memory + +## The Correct Pattern (Applied in Our Fix) + +```cpp +// Every frame: + +// 1. Map resource for CUDA access +// - On dGPU: Sets up memory mapping, ensures coherency +// - On Jetson: Lightweight operation (already unified) +cudaGraphicsMapResources(1, &resource, stream); + +// 2. Get pointer/array for CUDA to use +cudaGraphicsResourceGetMappedPointer(&ptr, &size, resource); +// or +cudaGraphicsSubResourceGetMappedArray(&arr, resource, 0, 0); + +// 3. Copy/process data with CUDA +cudaMemcpy(ptr, data, size, cudaMemcpyHostToDevice); + +// 4. Synchronize to ensure copy completes +cudaStreamSynchronize(stream); + +// 5. Unmap so OpenGL can use it +// - On dGPU: Releases mapping, makes data available to OpenGL +// - On Jetson: Still good practice, minimal overhead +cudaGraphicsUnmapResources(1, &resource, stream); + +// 6. OpenGL rendering can now safely use the resource +// glBindTexture(GL_TEXTURE_2D, texture); +// glDrawArrays(...); +``` + +## CUDA Version and Driver Changes + +### CUDA 12.8 Behavior Changes + +According to NVIDIA Developer Forums (2025): + +> "CUDA randomly freezes when using cudaStreamSynchronize with OpenGL interop... `cudaGraphicsMapResources` needs to be called every frame" + +This suggests that newer CUDA versions and drivers have **tightened synchronization requirements**: + +- **Older CUDA/drivers** may have been more lenient with permanent mapping +- **CUDA 12.8+** enforces proper map/unmap cycles for correctness +- **Stricter memory ordering** between CUDA and OpenGL contexts + +### Why JetPack 5.1.2 Still Works + +- **Older driver branch** - Jetson drivers lag behind desktop +- **Unified memory hides the issue** - no actual data movement needed +- **Different driver implementation** - Tegra drivers optimized for unified memory path + +## Performance Considerations + +### Desktop GPU (dGPU) + +| Approach | Overhead | Correctness | +|----------|----------|-------------| +| Permanent mapping | Low | **Broken on CUDA 12.8+** | +| Map/unmap per frame | Higher | **Correct** | + +**Optimization for dGPU:** +- Use `cudaStreamSynchronize(0)` or explicit events +- Batch multiple copies if possible +- Consider using CUDA arrays directly if no OpenGL needed + +### Jetson (iGPU) + +| Approach | Overhead | Correctness | +|----------|----------|-------------| +| Permanent mapping | Minimal | Works (unified memory) | +| Map/unmap per frame | Minimal | Works (good practice) | + +**Recommendation:** Use the same map/unmap pattern on both platforms for code portability, even though Jetson is more forgiving. + +## References + +### NVIDIA Documentation + +1. **CUDA for Tegra** - https://docs.nvidia.com/cuda/cuda-for-tegra-appnote/index.html + > "Tegra's integrated GPU (iGPU) shares the same SoC DRAM with CPU... contrasting with dGPUs that have separate memory" + +2. **CUDA Runtime API - OpenGL Interop** - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OPENGL.html + +3. **Unified Memory on Jetson** - https://forums.developer.nvidia.com/t/unified-memory-on-jetson-platforms/187448 + +### Community Discussions + +1. **NVIDIA Developer Forums** (2023-2025) + - "cudaGraphicsMapResources each frame or just once" + - Users report CUDA 12.8 requires per-frame mapping + - Jetson works with permanent mapping due to unified memory + +2. **CUDA Random Freezes with OpenGL** (2025) + - https://forums.developer.nvidia.com/t/cuda-randomly-freezes-when-using-cudastreamsynchronize-with-opengl-interop/318514 + - Confirms stricter synchronization in newer CUDA versions + +## Conclusion + +The fix applied to `GLViewer.cpp` (map/unmap every frame) is **required for correctness on desktop GPUs** and is **good practice on Jetson**: + +1. **Desktop RTX GPUs** - Separate VRAM requires explicit synchronization between CUDA and OpenGL +2. **Jetson iGPU** - Unified memory is more forgiving, but same pattern works +3. **CUDA 12.8+** - Stricter driver requirements enforce proper resource management +4. **Code portability** - Same code works correctly on both platforms + +The architectural difference explains why the original code worked on Jetson but failed on your RTX 3090/5070Ti with CUDA 12.8. diff --git a/GLVIEWER_FIX.md b/GLVIEWER_FIX.md new file mode 100644 index 0000000..90112c2 --- /dev/null +++ b/GLVIEWER_FIX.md @@ -0,0 +1,127 @@ +# GLViewer CUDA-OpenGL Interop Fix + +## Problem + +The GLViewer had CUDA-OpenGL interop issues where the application would crash or display corrupted graphics when rendering point clouds and camera views. This was caused by improper resource mapping between CUDA and OpenGL. + +## Root Cause + +The original code kept CUDA graphics resources permanently mapped: +- Resources were mapped once during initialization +- Never unmapped until cleanup/resize +- CUDA and OpenGL were competing for the same memory resources + +This violates CUDA-OpenGL interop best practices which require: +1. **Map** resource before CUDA operations +2. **Use** the mapped pointer/array +3. **Unmap** resource before OpenGL uses it + +## Solution Applied + +### 1. PointCloud::pushNewPC() Fix + +**Before:** +```cpp +void PointCloud::pushNewPC() { + // ... initialization code ... + // Resource mapped once, never unmapped + cudaMemcpy(xyzrgbaMappedBuf_, refMat.getPtr(sl::MEM::CPU), + numBytes_, cudaMemcpyHostToDevice); +} +``` + +**After:** +```cpp +void PointCloud::pushNewPC() { + // ... initialization code ... + + // Map the resource for CUDA access + cudaError_t err = cudaGraphicsMapResources(1, &bufferCudaID_, 0); + + err = cudaGraphicsResourceGetMappedPointer((void**)&xyzrgbaMappedBuf_, + &numBytes_, bufferCudaID_); + + // Copy data from ZED SDK to mapped OpenGL buffer + cudaMemcpy(xyzrgbaMappedBuf_, refMat.getPtr(sl::MEM::CPU), + numBytes_, cudaMemcpyHostToDevice); + + // Synchronize to ensure copy completes + cudaStreamSynchronize(0); + + // Unmap so OpenGL can use it + err = cudaGraphicsUnmapResources(1, &bufferCudaID_, 0); +} +``` + +### 2. CameraViewer::pushNewImage() Fix + +**Before:** +```cpp +void CameraViewer::pushNewImage() { + if (!ref.isInit()) return; + // ArrIm was permanently mapped, direct copy + cudaMemcpy2DToArray(ArrIm, 0, 0, ref.getPtr(sl::MEM::CPU), + ref.getStepBytes(sl::MEM::CPU), + ref.getPixelBytes() * ref.getWidth(), + ref.getHeight(), cudaMemcpyHostToDevice); +} +``` + +**After:** +```cpp +void CameraViewer::pushNewImage() { + if (!ref.isInit()) return; + + // Map the resource for CUDA access + auto err = cudaGraphicsMapResources(1, &cuda_gl_ressource, 0); + + err = cudaGraphicsSubResourceGetMappedArray(&ArrIm, cuda_gl_ressource, 0, 0); + + // Copy data to mapped array + err = cudaMemcpy2DToArray(ArrIm, 0, 0, ref.getPtr(sl::MEM::CPU), + ref.getStepBytes(sl::MEM::CPU), + ref.getPixelBytes() * ref.getWidth(), + ref.getHeight(), cudaMemcpyHostToDevice); + + // Synchronize to ensure copy completes + cudaStreamSynchronize(0); + + // Unmap so OpenGL can use it + err = cudaGraphicsUnmapResources(1, &cuda_gl_ressource, 0); +} +``` + +### 3. CameraViewer Initialization Cleanup + +Removed permanent mapping from initialization: + +**Before:** +```cpp +// In CameraViewer::initialize(): +cudaGraphicsGLRegisterImage(&cuda_gl_ressource, texture, GL_TEXTURE_2D, + cudaGraphicsRegisterFlagsWriteDiscard); +cudaGraphicsMapResources(1, &cuda_gl_ressource, 0); // REMOVED +cudaGraphicsSubResourceGetMappedArray(&ArrIm, cuda_gl_ressource, 0, 0); // REMOVED +``` + +**After:** +```cpp +// In CameraViewer::initialize(): +cudaGraphicsGLRegisterImage(&cuda_gl_ressource, texture, GL_TEXTURE_2D, + cudaGraphicsRegisterFlagsWriteDiscard); +// Mapping now done in pushNewImage() +``` + +## Key CUDA-OpenGL Interop Rules + +1. **Register once** - `cudaGraphicsGLRegisterBuffer/RegisterImage` +2. **Map before CUDA** - `cudaGraphicsMapResources` +3. **Get pointer/array** - `cudaGraphicsResourceGetMappedPointer/SubResourceGetMappedArray` +4. **Copy data** - `cudaMemcpy/cudaMemcpy2DToArray` +5. **Synchronize** - `cudaStreamSynchronize` (ensures copy completes) +6. **Unmap before GL** - `cudaGraphicsUnmapResources` +7. **Unregister on cleanup** - `cudaGraphicsUnregisterResource` + +## Reference + +This fix is based on commit `45fcb88e9dcc7a1b1fb86b928b884bfcdc1da0f2` in the zed-sdk repository which applied the same fix to the depth sensing sample. diff --git a/src/ClientPublisher.cpp b/src/ClientPublisher.cpp index b8dd964..004dff9 100755 --- a/src/ClientPublisher.cpp +++ b/src/ClientPublisher.cpp @@ -12,7 +12,7 @@ bool ClientPublisher::open(const std::string& ip, int port, Trigger* ref, int sd p_trigger = ref; sl::InitParameters init_parameters; - init_parameters.depth_mode = sl::DEPTH_MODE::NEURAL_PLUS; + init_parameters.depth_mode = sl::DEPTH_MODE::NEURAL; // Use stream input instead of local camera init_parameters.input.setFromStream(sl::String(ip.c_str()), port); init_parameters.coordinate_units = sl::UNIT::METER; diff --git a/src/GLViewer.cpp b/src/GLViewer.cpp index 66a7526..b97a992 100755 --- a/src/GLViewer.cpp +++ b/src/GLViewer.cpp @@ -827,7 +827,22 @@ void PointCloud::pushNewPC() { if (err) std::cerr << "Error CUDA " << cudaGetErrorString(err) << std::endl; } + // Map the resource for CUDA access + cudaError_t err = cudaGraphicsMapResources(1, &bufferCudaID_, 0); + if (err) std::cerr << "Error CUDA map " << cudaGetErrorString(err) << std::endl; + + err = cudaGraphicsResourceGetMappedPointer((void**)&xyzrgbaMappedBuf_, &numBytes_, bufferCudaID_); + if (err) std::cerr << "Error CUDA get pointer " << cudaGetErrorString(err) << std::endl; + + // Copy data from ZED SDK to mapped OpenGL buffer cudaMemcpy(xyzrgbaMappedBuf_, refMat.getPtr(sl::MEM::CPU), numBytes_, cudaMemcpyHostToDevice); + + // Synchronize to ensure copy completes + cudaStreamSynchronize(0); + + // Unmap so OpenGL can use it + err = cudaGraphicsUnmapResources(1, &bufferCudaID_, 0); + if (err) std::cerr << "Error CUDA unmap " << cudaGetErrorString(err) << std::endl; } } @@ -986,19 +1001,30 @@ bool CameraViewer::initialize(sl::Mat &im, sl::float3 clr) { cudaError_t err = cudaGraphicsGLRegisterImage(&cuda_gl_ressource, texture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsWriteDiscard); if (err) std::cout << "err alloc " << err << " " << cudaGetErrorString(err) << "\n"; glDisable(GL_TEXTURE_2D); - - err = cudaGraphicsMapResources(1, &cuda_gl_ressource, 0); - if (err) std::cout << "err 0 " << err << " " << cudaGetErrorString(err) << "\n"; - err = cudaGraphicsSubResourceGetMappedArray(&ArrIm, cuda_gl_ressource, 0, 0); - if (err) std::cout << "err 1 " << err << " " << cudaGetErrorString(err) << "\n"; return (err == cudaSuccess); } void CameraViewer::pushNewImage() { if (!ref.isInit()) return; - auto err = cudaMemcpy2DToArray(ArrIm, 0, 0, ref.getPtr(sl::MEM::CPU), ref.getStepBytes(sl::MEM::CPU), ref.getPixelBytes() * ref.getWidth(), ref.getHeight(), cudaMemcpyHostToDevice); - if (err) std::cout << "err 2 " << err << " " << cudaGetErrorString(err) << "\n"; + + // Map the resource for CUDA access + auto err = cudaGraphicsMapResources(1, &cuda_gl_ressource, 0); + if (err) std::cout << "err map " << err << " " << cudaGetErrorString(err) << "\n"; + + err = cudaGraphicsSubResourceGetMappedArray(&ArrIm, cuda_gl_ressource, 0, 0); + if (err) std::cout << "err get array " << err << " " << cudaGetErrorString(err) << "\n"; + + // Copy data to mapped array + err = cudaMemcpy2DToArray(ArrIm, 0, 0, ref.getPtr(sl::MEM::CPU), ref.getStepBytes(sl::MEM::CPU), ref.getPixelBytes() * ref.getWidth(), ref.getHeight(), cudaMemcpyHostToDevice); + if (err) std::cout << "err copy " << err << " " << cudaGetErrorString(err) << "\n"; + + // Synchronize to ensure copy completes + cudaStreamSynchronize(0); + + // Unmap so OpenGL can use it + err = cudaGraphicsUnmapResources(1, &cuda_gl_ressource, 0); + if (err) std::cout << "err unmap " << err << " " << cudaGetErrorString(err) << "\n"; } void CameraViewer::draw(sl::Transform vpMatrix) {