feat(zed): improve MCAP export batching and defaults

Default ZED MCAP export to neural_plus depth across the CLI and Python wrappers, and add tail-frame handling plus better corrupted-frame diagnostics in zed_svo_to_mcap.

Add mixed hardware/software worker pools to the batch MCAP wrapper, replace tqdm with progress-table on TTYs, keep text event logging and heartbeats for non-TTY runs, and document the NVENC session-limit rationale for mixed mode in the README.

Also refresh Python dependencies for the batch tooling and move the OpenSSL lookup in CMake so the local workspace build remains compatible with the vendored cnats setup.
This commit is contained in:
2026-03-23 09:07:38 +00:00
parent 2f74a9561d
commit a0b9c95d5b
7 changed files with 909 additions and 69 deletions
+522 -55
View File
@@ -9,11 +9,12 @@ import os
import re
import subprocess
import sys
import time
from dataclasses import dataclass
from pathlib import Path
import click
from tqdm import tqdm
from progress_table import ProgressTable
SCRIPT_PATH = Path(__file__).resolve()
@@ -27,11 +28,9 @@ SEGMENT_FILE_PATTERN = re.compile(r".*_zed([0-9]+)\.svo2?$", re.IGNORECASE)
class BatchConfig:
zed_bin: Path | None
probe_existing: bool
cuda_visible_devices: str | None
overwrite: bool
fail_fast: bool
codec: str
encoder_device: str
mcap_compression: str
depth_mode: str
depth_size: str
@@ -41,6 +40,7 @@ class BatchConfig:
start_frame: int | None
end_frame: int | None
sync_tolerance_ms: float | None
progress_ui: str
@dataclass(slots=True, frozen=True)
@@ -48,6 +48,14 @@ class ConversionJob:
segment_dir: Path
output_path: Path
camera_labels: tuple[str, ...]
display_name: str
@dataclass(slots=True, frozen=True)
class WorkerSlot:
label: str
encoder_device: str
cuda_visible_devices: str | None
@dataclass(slots=True, frozen=True)
@@ -84,7 +92,175 @@ class OutputProbeResult:
reason: str = ""
@dataclass(slots=True)
class ActiveJobState:
submission_index: int
job: ConversionJob
slot: WorkerSlot
started_at_monotonic: float
row_index: int | None = None
_MCAP_READER_MODULE = None
TABLE_REFRESH_SECONDS = 1.0
TEXT_HEARTBEAT_SECONDS = 30.0
def format_elapsed(seconds: float) -> str:
    """Format a duration as ``MM:SS``, switching to ``H:MM:SS`` from one hour up.

    The input is rounded to the nearest whole second; negative durations are
    clamped to zero.
    """
    total_seconds = int(round(seconds))
    if total_seconds < 0:
        total_seconds = 0

    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    if hours:
        return f"{hours:d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"
class ProgressReporter:
    """Base progress reporter whose hooks all do nothing.

    Subclasses override the hooks they care about. ``heartbeat_interval_seconds``
    is read by the batch loop as the wait timeout between heartbeat calls.
    """

    heartbeat_interval_seconds: float

    def __init__(self, total_jobs: int) -> None:
        """Remember the total job count and start with the text heartbeat interval."""
        self.total_jobs = total_jobs
        self.heartbeat_interval_seconds = TEXT_HEARTBEAT_SECONDS

    def job_started(self, state: ActiveJobState) -> None:
        """Hook invoked when a job is submitted; no-op here."""

    def job_finished(self, state: ActiveJobState, result: JobResult) -> None:
        """Hook invoked when a job's result is available; no-op here."""

    def heartbeat(
        self,
        *,
        completed_count: int,
        failed_count: int,
        active_states: list[ActiveJobState],
    ) -> None:
        """Periodic hook invoked while jobs are still running; no-op here."""

    def close(self) -> None:
        """Hook invoked once the batch is over; no-op here."""
class TextProgressReporter(ProgressReporter):
    """Reporter that writes one plain-text line per event to stderr via ``click.echo``."""

    def __init__(self, total_jobs: int) -> None:
        super().__init__(total_jobs)
        self.heartbeat_interval_seconds = TEXT_HEARTBEAT_SECONDS

    def job_started(self, state: ActiveJobState) -> None:
        """Log the slot, encoder, CUDA assignment and segment of a newly started job."""
        slot = state.slot
        # "-" stands in for "no CUDA_VISIBLE_DEVICES override".
        cuda_label = slot.cuda_visible_devices or "-"
        message = (
            f"started: [{state.submission_index}/{self.total_jobs}] "
            f"{slot.label} encoder={slot.encoder_device} cuda={cuda_label} "
            f"segment={state.job.display_name}"
        )
        click.echo(message, err=True)

    def job_finished(self, state: ActiveJobState, result: JobResult) -> None:
        """Log completion or failure; on failure also echo an excerpt of the output."""
        elapsed = format_elapsed(time.monotonic() - state.started_at_monotonic)
        if result.status == "converted":
            prefix, exit_text = "completed", ""
        else:
            prefix, exit_text = "failed", f" exit={result.return_code}"
        message = (
            f"{prefix}: [{state.submission_index}/{self.total_jobs}] "
            f"{state.slot.label} elapsed={elapsed}{exit_text} segment={state.job.display_name}"
        )
        click.echo(message, err=True)

        if result.status != "failed":
            return
        for line in failure_excerpt(result):
            click.echo(f" {line}", err=True)

    def heartbeat(
        self,
        *,
        completed_count: int,
        failed_count: int,
        active_states: list[ActiveJobState],
    ) -> None:
        """Log a one-line summary of completed, failed, active and remaining jobs."""
        active_count = len(active_states)
        # Remaining = jobs that are neither finished (either way) nor running.
        finished_count = completed_count + failed_count
        remaining_count = self.total_jobs - finished_count - active_count
        message = (
            f"progress: completed={completed_count} failed={failed_count} "
            f"active={active_count} remaining={remaining_count}"
        )
        click.echo(message, err=True)
class TableProgressReporter(ProgressReporter):
    """Reporter that shows one ``ProgressTable`` row per started job on stderr."""

    def __init__(self, total_jobs: int) -> None:
        super().__init__(total_jobs)
        self.heartbeat_interval_seconds = TABLE_REFRESH_SECONDS

        # (column name, width, alignment) in display order.
        column_specs = (
            ("#", 4, "right"),
            ("segment", 44, "left"),
            ("worker", 8, "left"),
            ("encoder", 10, "left"),
            ("cuda", 6, "left"),
            ("status", 12, "left"),
            ("elapsed_s", 10, "right"),
        )
        self.table = ProgressTable(
            *(name for name, _, _ in column_specs),
            interactive=2,
            refresh_rate=10,
            default_column_alignment="left",
            default_column_width=12,
            pbar_show_throughput=False,
            pbar_show_progress=False,
            pbar_show_percents=False,
            pbar_show_eta=False,
            print_header_every_n_rows=30,
            file=sys.stderr,
        )
        # Columns are declared above and then configured one by one with their
        # individual width/alignment.
        for name, width, alignment in column_specs:
            self.table.add_column(name, width=width, alignment=alignment)

    def job_started(self, state: ActiveJobState) -> None:
        """Append a "running" row for the job and remember its row index on the state."""
        slot = state.slot
        self.table.add_row(
            state.submission_index,
            state.job.display_name,
            slot.label,
            slot.encoder_device,
            slot.cuda_visible_devices or "-",
            "running",
            format_elapsed(0.0),
        )
        # NOTE(review): assumes the row just added is the last one reported by
        # num_rows(); confirm against the progress_table version in use.
        state.row_index = self.table.num_rows() - 1

    def job_finished(self, state: ActiveJobState, result: JobResult) -> None:
        """Write the final status and elapsed time into the job's row, if it has one."""
        row = state.row_index
        if row is None:
            return
        if result.status == "converted":
            status_text = "converted"
        else:
            status_text = f"failed({result.return_code})"
        self.table.update("status", status_text, row=row)
        elapsed_text = format_elapsed(time.monotonic() - state.started_at_monotonic)
        self.table.update("elapsed_s", elapsed_text, row=row)

    def heartbeat(
        self,
        *,
        completed_count: int,
        failed_count: int,
        active_states: list[ActiveJobState],
    ) -> None:
        """Refresh the elapsed-time cell of every active job that owns a row."""
        states_with_rows = (state for state in active_states if state.row_index is not None)
        for state in states_with_rows:
            elapsed_text = format_elapsed(time.monotonic() - state.started_at_monotonic)
            self.table.update("elapsed_s", elapsed_text, row=state.row_index)

    def close(self) -> None:
        """Close the underlying table."""
        self.table.close()
def locate_binary(override: Path | None) -> Path:
@@ -272,7 +448,41 @@ def output_path_for(segment_dir: Path) -> Path:
return segment_dir / f"{segment_dir.name}.mcap"
def command_for_job(job: ConversionJob, config: BatchConfig) -> list[str]:
def common_segment_parent(segment_dirs: tuple[Path, ...]) -> Path | None:
if len(segment_dirs) <= 1:
return None
try:
return Path(os.path.commonpath([str(path) for path in segment_dirs]))
except ValueError:
return None
def display_name_for_segment(
segment_dir: Path,
*,
source_mode: str,
input_root: Path | None,
common_parent: Path | None,
) -> str:
if source_mode == "discovery" and input_root is not None:
try:
return str(segment_dir.relative_to(input_root))
except ValueError:
pass
if common_parent is not None:
try:
relative = segment_dir.relative_to(common_parent)
if str(relative) != ".":
return str(relative)
except ValueError:
pass
parent_name = segment_dir.parent.name
if parent_name:
return str(Path(parent_name) / segment_dir.name)
return segment_dir.name
def command_for_job(job: ConversionJob, config: BatchConfig, encoder_device: str) -> list[str]:
if config.zed_bin is None:
raise RuntimeError("zed_svo_to_mcap binary is not configured")
@@ -283,7 +493,7 @@ def command_for_job(job: ConversionJob, config: BatchConfig) -> list[str]:
"--codec",
config.codec,
"--encoder-device",
config.encoder_device,
encoder_device,
"--mcap-compression",
config.mcap_compression,
"--depth-mode",
@@ -307,12 +517,33 @@ def command_for_job(job: ConversionJob, config: BatchConfig) -> list[str]:
def env_for_job(config: BatchConfig) -> dict[str, str]:
return env_for_job_with_cuda(None)
def env_for_job_with_cuda(assigned_cuda_visible_devices: str | None) -> dict[str, str]:
env = dict(os.environ)
if config.cuda_visible_devices is not None:
env["CUDA_VISIBLE_DEVICES"] = config.cuda_visible_devices
if assigned_cuda_visible_devices is not None:
env["CUDA_VISIBLE_DEVICES"] = assigned_cuda_visible_devices
return env
def parse_cuda_device_pool(raw_value: str | None) -> tuple[str, ...]:
if raw_value is None:
return ()
devices = tuple(device.strip() for device in raw_value.split(",") if device.strip())
return devices
def choose_progress_reporter(progress_ui: str, total_jobs: int) -> ProgressReporter:
    """Create the reporter matching ``progress_ui``.

    ``"table"`` and ``"text"`` select their reporter directly. Any other value
    picks the table reporter when stderr is a TTY and the text reporter
    otherwise.
    """
    if progress_ui == "table":
        use_table = True
    elif progress_ui == "text":
        use_table = False
    else:
        use_table = sys.stderr.isatty()

    reporter_cls = TableProgressReporter if use_table else TextProgressReporter
    return reporter_cls(total_jobs)
def load_mcap_reader():
global _MCAP_READER_MODULE
if _MCAP_READER_MODULE is not None:
@@ -368,13 +599,25 @@ def probe_output(output_path: Path, camera_labels: tuple[str, ...]) -> OutputPro
def run_conversion(job: ConversionJob, config: BatchConfig) -> JobResult:
command = command_for_job(job, config)
return run_conversion_on_slot(
job,
config,
WorkerSlot(label="job-1", encoder_device="auto", cuda_visible_devices=None),
)
def run_conversion_on_slot(
job: ConversionJob,
config: BatchConfig,
slot: WorkerSlot,
) -> JobResult:
command = command_for_job(job, config, slot.encoder_device)
completed = subprocess.run(
command,
check=False,
capture_output=True,
text=True,
env=env_for_job(config),
env=env_for_job_with_cuda(slot.cuda_visible_devices),
)
status = "converted" if completed.returncode == 0 else "failed"
return JobResult(
@@ -388,6 +631,23 @@ def run_conversion(job: ConversionJob, config: BatchConfig) -> JobResult:
)
def split_lines_for_excerpt(text: str, max_lines: int = 8) -> list[str]:
    """Return the first ``max_lines`` non-blank lines of ``text``.

    Blank lines are dropped and trailing whitespace is stripped from the rest.
    When lines are cut off, a final ``"... (N more lines)"`` marker reports how
    many were omitted.
    """
    kept = [line.rstrip() for line in text.splitlines() if line.strip()]
    overflow = len(kept) - max_lines
    if overflow <= 0:
        return kept
    return [*kept[:max_lines], f"... ({overflow} more lines)"]
def failure_excerpt(result: JobResult, max_lines: int = 8) -> list[str]:
    """Return a short excerpt of a job's output, preferring stderr over stdout.

    The first stream that contains non-whitespace text is excerpted with
    ``split_lines_for_excerpt``. An empty list is returned when both are blank.
    """
    for stream_text in (result.stderr, result.stdout):
        if stream_text.strip():
            return split_lines_for_excerpt(stream_text, max_lines=max_lines)
    return []
def summarize_failures(results: list[JobResult]) -> None:
failed_results = [result for result in results if result.status == "failed"]
if not failed_results:
@@ -437,64 +697,216 @@ def report_dry_run_plan(
click.echo(line, err=True)
def run_batch(jobs: list[ConversionJob], config: BatchConfig, jobs_limit: int) -> tuple[list[JobResult], int]:
def run_batch(
jobs: list[ConversionJob],
config: BatchConfig,
worker_slots: list[WorkerSlot],
) -> tuple[list[JobResult], int]:
results: list[JobResult] = []
aborted_count = 0
if not jobs:
return results, aborted_count
if not worker_slots:
raise click.ClickException("no worker slots configured")
future_to_job: dict[concurrent.futures.Future[JobResult], ConversionJob] = {}
available_slots = list(worker_slots)
max_parallel_jobs = len(worker_slots)
future_to_job: dict[concurrent.futures.Future[JobResult], ActiveJobState] = {}
job_iter = iter(jobs)
stop_submitting = False
completed_count = 0
failed_count = 0
submission_index = 0
reporter = choose_progress_reporter(config.progress_ui, len(jobs))
last_heartbeat_at = time.monotonic()
with concurrent.futures.ThreadPoolExecutor(max_workers=jobs_limit) as executor:
with tqdm(total=len(jobs), unit="segment", dynamic_ncols=True) as progress:
with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel_jobs) as executor:
def submit_next() -> bool:
if stop_submitting:
return False
try:
job = next(job_iter)
except StopIteration:
return False
future = executor.submit(run_conversion, job, config)
future_to_job[future] = job
return True
def submit_next() -> bool:
nonlocal submission_index
if stop_submitting or not available_slots:
return False
slot = available_slots.pop(0)
try:
job = next(job_iter)
except StopIteration:
available_slots.insert(0, slot)
return False
for _ in range(min(jobs_limit, len(jobs))):
submit_next()
submission_index += 1
state = ActiveJobState(
submission_index=submission_index,
job=job,
slot=slot,
started_at_monotonic=time.monotonic(),
)
reporter.job_started(state)
future = executor.submit(run_conversion_on_slot, job, config, slot)
future_to_job[future] = state
return True
while future_to_job:
done, _ = concurrent.futures.wait(
future_to_job,
return_when=concurrent.futures.FIRST_COMPLETED,
for _ in range(min(max_parallel_jobs, len(jobs))):
submit_next()
while future_to_job:
done, _ = concurrent.futures.wait(
future_to_job,
timeout=reporter.heartbeat_interval_seconds,
return_when=concurrent.futures.FIRST_COMPLETED,
)
if not done:
reporter.heartbeat(
completed_count=completed_count,
failed_count=failed_count,
active_states=list(future_to_job.values()),
)
for future in done:
job = future_to_job.pop(future)
result = future.result()
results.append(result)
progress.update(1)
last_heartbeat_at = time.monotonic()
continue
if result.status == "failed":
tqdm.write(
f"failed: {job.segment_dir} (exit {result.return_code})",
file=sys.stderr,
)
if config.fail_fast:
stop_submitting = True
for future in done:
state = future_to_job.pop(future)
available_slots.append(state.slot)
result = future.result()
results.append(result)
reporter.job_finished(state, result)
if not stop_submitting:
submit_next()
if result.status == "failed":
failed_count += 1
if config.fail_fast:
stop_submitting = True
else:
completed_count += 1
if stop_submitting:
remaining = sum(1 for _ in job_iter)
aborted_count = remaining
progress.total = progress.n + len(future_to_job)
progress.refresh()
if not stop_submitting:
submit_next()
now = time.monotonic()
if now - last_heartbeat_at >= reporter.heartbeat_interval_seconds:
reporter.heartbeat(
completed_count=completed_count,
failed_count=failed_count,
active_states=list(future_to_job.values()),
)
last_heartbeat_at = now
if stop_submitting:
aborted_count = sum(1 for _ in job_iter)
reporter.close()
return results, aborted_count
def build_uniform_worker_slots(
    jobs: int,
    encoder_device: str,
    cuda_visible_devices: str | None,
) -> list[WorkerSlot]:
    """Create ``jobs`` identical slots labelled ``job-1`` .. ``job-N``.

    Without ``cuda_visible_devices`` no slot overrides CUDA_VISIBLE_DEVICES.
    Otherwise the value is parsed as a comma-separated pool and the first
    ``jobs`` entries are assigned one per slot.

    Raises:
        click.ClickException: if ``jobs`` is below 1, or the device pool has
            fewer entries than ``jobs``.
    """
    if jobs < 1:
        raise click.ClickException("--jobs must be at least 1")

    assigned_devices: tuple[str | None, ...]
    if cuda_visible_devices is None:
        assigned_devices = (None,) * jobs
    else:
        device_pool = parse_cuda_device_pool(cuda_visible_devices)
        if len(device_pool) < jobs:
            raise click.ClickException(
                f"--cuda-visible-devices must provide at least {jobs} entries when --jobs={jobs}"
            )
        # Surplus pool entries beyond the job count are not used.
        assigned_devices = device_pool[:jobs]

    return [
        WorkerSlot(
            label=f"job-{position}",
            encoder_device=encoder_device,
            cuda_visible_devices=device,
        )
        for position, device in enumerate(assigned_devices, start=1)
    ]
def parse_required_device_pool(raw_value: str | None, expected_count: int, flag_name: str) -> tuple[str, ...]:
if expected_count == 0:
if raw_value is None:
return ()
raise click.ClickException(f"{flag_name} cannot be used when the matching job count is 0")
device_pool = parse_cuda_device_pool(raw_value)
if len(device_pool) != expected_count:
raise click.ClickException(
f"{flag_name} must provide exactly {expected_count} entries when the matching job count is {expected_count}"
)
return device_pool
def build_worker_slots(
    *,
    jobs: int,
    encoder_device: str,
    cuda_visible_devices: str | None,
    hardware_jobs: int,
    hardware_cuda_visible_devices: str | None,
    software_jobs: int,
    software_cuda_visible_devices: str | None,
) -> list[WorkerSlot]:
    """Translate the CLI parallelism options into a list of worker slots.

    If none of the hardware/software options is used, this defers to
    ``build_uniform_worker_slots``. Otherwise mixed mode applies: ``jobs``,
    ``cuda_visible_devices`` and ``encoder_device`` must be left at 1, ``None``
    and ``"auto"``, and each pool needs exactly one device entry per worker.
    Hardware slots (``hw-N``, encoder ``"nvidia"``) come first, followed by
    software slots (``sw-N``, encoder ``"software"``).

    Raises:
        click.ClickException: on conflicting flags, an empty mixed pool, or a
            device list whose length does not match its job count.
    """
    mixed_mode_requested = (
        hardware_jobs > 0
        or software_jobs > 0
        or hardware_cuda_visible_devices is not None
        or software_cuda_visible_devices is not None
    )
    if not mixed_mode_requested:
        return build_uniform_worker_slots(jobs, encoder_device, cuda_visible_devices)

    # Uniform-mode options must stay at their defaults in mixed mode; the first
    # violated rule (in this order) is reported.
    conflicts = (
        (jobs != 1, "--jobs cannot be combined with mixed worker pool flags"),
        (
            cuda_visible_devices is not None,
            "--cuda-visible-devices cannot be combined with mixed worker pool flags",
        ),
        (
            encoder_device != "auto",
            "--encoder-device cannot be combined with mixed worker pool flags",
        ),
    )
    for is_conflict, message in conflicts:
        if is_conflict:
            raise click.ClickException(message)

    if hardware_jobs + software_jobs < 1:
        raise click.ClickException("mixed worker pool flags require at least one hardware or software job")

    hardware_device_pool = parse_required_device_pool(
        hardware_cuda_visible_devices,
        hardware_jobs,
        "--hardware-cuda-visible-devices",
    )
    software_device_pool = parse_required_device_pool(
        software_cuda_visible_devices,
        software_jobs,
        "--software-cuda-visible-devices",
    )

    hardware_slots = [
        WorkerSlot(label=f"hw-{position}", encoder_device="nvidia", cuda_visible_devices=device)
        for position, device in enumerate(hardware_device_pool, start=1)
    ]
    software_slots = [
        WorkerSlot(label=f"sw-{position}", encoder_device="software", cuda_visible_devices=device)
        for position, device in enumerate(software_device_pool, start=1)
    ]
    return hardware_slots + software_slots
@click.command()
@click.argument(
"input_dir",
@@ -520,6 +932,28 @@ def run_batch(jobs: list[ConversionJob], config: BatchConfig, jobs_limit: int) -
)
@click.option("--recursive/--no-recursive", default=True, show_default=True, help="Recurse when discovering segment directories from INPUT_DIR.")
@click.option("--jobs", default=1, show_default=True, type=click.IntRange(min=1), help="Parallel conversion jobs.")
@click.option(
"--hardware-jobs",
default=0,
show_default=True,
type=click.IntRange(min=0),
help="Mixed mode: number of hardware-encoded workers.",
)
@click.option(
"--hardware-cuda-visible-devices",
help="Mixed mode: comma-separated CUDA_VISIBLE_DEVICES assignments for hardware workers, one entry per worker.",
)
@click.option(
"--software-jobs",
default=0,
show_default=True,
type=click.IntRange(min=0),
help="Mixed mode: number of software-encoded workers.",
)
@click.option(
"--software-cuda-visible-devices",
help="Mixed mode: comma-separated CUDA_VISIBLE_DEVICES assignments for software workers, one entry per worker.",
)
@click.option(
"--zed-bin",
type=click.Path(path_type=Path, dir_okay=False),
@@ -527,7 +961,7 @@ def run_batch(jobs: list[ConversionJob], config: BatchConfig, jobs_limit: int) -
)
@click.option(
"--cuda-visible-devices",
help="Optional CUDA_VISIBLE_DEVICES value exported for each conversion subprocess.",
help="Optional CUDA_VISIBLE_DEVICES value. A comma-separated list is distributed across concurrent jobs one GPU per subprocess.",
)
@click.option("--overwrite/--skip-existing", default=False, show_default=True, help="Overwrite existing MCAP files.")
@click.option(
@@ -568,7 +1002,7 @@ def run_batch(jobs: list[ConversionJob], config: BatchConfig, jobs_limit: int) -
@click.option(
"--depth-mode",
type=click.Choice(("neural_light", "neural", "neural_plus")),
default="neural",
default="neural_plus",
show_default=True,
)
@click.option(
@@ -606,6 +1040,13 @@ def run_batch(jobs: list[ConversionJob], config: BatchConfig, jobs_limit: int) -
default=None,
help="Override the maximum timestamp delta used for bundled multi-camera sync.",
)
@click.option(
"--progress-ui",
type=click.Choice(("auto", "table", "text")),
default="auto",
show_default=True,
help="Progress output mode. Auto uses a table on TTY and text logging otherwise.",
)
def main(
input_dir: Path | None,
segment_dirs: tuple[Path, ...],
@@ -613,6 +1054,10 @@ def main(
csv_root: Path | None,
recursive: bool,
jobs: int,
hardware_jobs: int,
hardware_cuda_visible_devices: str | None,
software_jobs: int,
software_cuda_visible_devices: str | None,
zed_bin: Path | None,
cuda_visible_devices: str | None,
overwrite: bool,
@@ -631,6 +1076,7 @@ def main(
start_frame: int | None,
end_frame: int | None,
sync_tolerance_ms: float | None,
progress_ui: str,
) -> None:
"""Batch-convert multi-camera ZED segments into bundled MCAP files."""
if report_existing and dry_run:
@@ -638,14 +1084,21 @@ def main(
binary_path = None if report_existing else locate_binary(zed_bin)
sources = resolve_sources(input_dir, segment_dirs, segments_csv, csv_root, recursive)
worker_slots = build_worker_slots(
jobs=jobs,
encoder_device=encoder_device,
cuda_visible_devices=cuda_visible_devices,
hardware_jobs=hardware_jobs,
hardware_cuda_visible_devices=hardware_cuda_visible_devices,
software_jobs=software_jobs,
software_cuda_visible_devices=software_cuda_visible_devices,
)
config = BatchConfig(
zed_bin=binary_path,
probe_existing=probe_existing or report_existing,
cuda_visible_devices=cuda_visible_devices,
overwrite=overwrite,
fail_fast=fail_fast,
codec=codec,
encoder_device=encoder_device,
mcap_compression=mcap_compression,
depth_mode=depth_mode,
depth_size=depth_size,
@@ -655,7 +1108,10 @@ def main(
start_frame=start_frame,
end_frame=end_frame,
sync_tolerance_ms=sync_tolerance_ms,
progress_ui=progress_ui,
)
input_root = input_dir.expanduser().resolve() if input_dir is not None else None
display_parent = common_segment_parent(sources.segment_dirs)
skipped_results: list[JobResult] = []
failed_results: list[JobResult] = []
@@ -673,8 +1129,19 @@ def main(
segment_dir=segment_dir,
output_path=output_path,
camera_labels=scan.camera_labels,
display_name=display_name_for_segment(
segment_dir,
source_mode=sources.mode,
input_root=input_root,
common_parent=display_parent,
),
)
default_encoder_device = worker_slots[0].encoder_device if worker_slots else encoder_device
command = (
tuple(command_for_job(job, config, default_encoder_device))
if config.zed_bin is not None
else ()
)
command = tuple(command_for_job(job, config)) if config.zed_bin is not None else ()
if not scan.is_valid:
failed_results.append(
JobResult(
@@ -762,7 +1229,7 @@ def main(
click.echo(
(
f"source={sources.mode} matched={len(sources.segment_dirs)} pending={len(pending_jobs)} "
f"skipped={len(skipped_results)} invalid={len(failed_results)} jobs={jobs} "
f"skipped={len(skipped_results)} invalid={len(failed_results)} jobs={len(worker_slots)} "
f"dry_run={'yes' if dry_run else 'no'}"
),
err=True,
@@ -787,7 +1254,7 @@ def main(
results = list(skipped_results)
results.extend(failed_results)
conversion_results, aborted_count = run_batch(pending_jobs, config, jobs)
conversion_results, aborted_count = run_batch(pending_jobs, config, worker_slots)
results.extend(conversion_results)
converted_count = sum(1 for result in results if result.status == "converted")