cvmmap-streamer/scripts/zed_batch_svo_to_mcap.py

#!/usr/bin/env python3

from __future__ import annotations

import concurrent.futures
import importlib
import os
import re
import subprocess
import sys
import time
from collections import Counter
from dataclasses import dataclass
from pathlib import Path

import click
from progress_table import ProgressTable

try:
    from scripts import zed_batch_segment_sources as segment_sources
except ModuleNotFoundError:
    import zed_batch_segment_sources as segment_sources


SCRIPT_PATH = Path(__file__).resolve()
REPO_ROOT = SCRIPT_PATH.parents[1]
WORKSPACE_ROOT = REPO_ROOT.parent
MCAP_PYTHON_ROOT = WORKSPACE_ROOT / "mcap" / "python" / "mcap"
SEGMENT_FILE_PATTERN = re.compile(r".*_zed([0-9]+)\.svo2?$", re.IGNORECASE)


@dataclass(slots=True, frozen=True)
class BatchConfig:
    zed_bin: Path | None
    probe_existing: bool
    overwrite: bool
    fail_fast: bool
    codec: str
    mcap_compression: str
    depth_mode: str
    depth_size: str
    bundle_policy: str
    copy_range: str
    bundle_topic: str | None
    with_pose: bool
    pose_config: Path | None
    world_frame_id: str | None
    start_frame: int | None
    end_frame: int | None
    sync_tolerance_ms: float | None
    progress_ui: str


@dataclass(slots=True, frozen=True)
class ConversionJob:
    segment_dir: Path
    output_path: Path
    camera_labels: tuple[str, ...]
    display_name: str


@dataclass(slots=True, frozen=True)
class WorkerSlot:
    label: str
    encoder_device: str
    cuda_visible_devices: str | None


@dataclass(slots=True, frozen=True)
class JobResult:
    status: str
    segment_dir: Path
    output_path: Path
    command: tuple[str, ...]
    return_code: int = 0
    stdout: str = ""
    stderr: str = ""


@dataclass(slots=True, frozen=True)
class SegmentScan:
    segment_dir: Path
    matched_files: int
    camera_labels: tuple[str, ...]
    is_valid: bool
    reason: str | None = None


@dataclass(slots=True, frozen=True)
class OutputProbeResult:
    output_path: Path
    status: str
    reason: str = ""


@dataclass(slots=True)
class ActiveJobState:
    submission_index: int
    job: ConversionJob
    slot: WorkerSlot
    started_at_monotonic: float
    row_index: int | None = None


_MCAP_READER_MODULE = None
_BUNDLE_MANIFEST_CLASS_CACHE: dict[bytes, tuple[object, int | None]] = {}
TABLE_REFRESH_SECONDS = 1.0
TEXT_HEARTBEAT_SECONDS = 30.0


def format_elapsed(seconds: float) -> str:
    rounded = max(0, int(round(seconds)))
    minutes, secs = divmod(rounded, 60)
    hours, minutes = divmod(minutes, 60)
    if hours > 0:
        return f"{hours:d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"


class ProgressReporter:
    heartbeat_interval_seconds: float

    def __init__(self, total_jobs: int) -> None:
        self.total_jobs = total_jobs
        self.heartbeat_interval_seconds = TEXT_HEARTBEAT_SECONDS

    def job_started(self, state: ActiveJobState) -> None:
        return

    def job_finished(self, state: ActiveJobState, result: JobResult) -> None:
        return

    def heartbeat(
        self,
        *,
        completed_count: int,
        failed_count: int,
        active_states: list[ActiveJobState],
    ) -> None:
        return

    def close(self) -> None:
        return


class TextProgressReporter(ProgressReporter):
    def __init__(self, total_jobs: int) -> None:
        super().__init__(total_jobs)
        self.heartbeat_interval_seconds = TEXT_HEARTBEAT_SECONDS

    def job_started(self, state: ActiveJobState) -> None:
        cuda_label = state.slot.cuda_visible_devices or "-"
        click.echo(
            (
                f"started: [{state.submission_index}/{self.total_jobs}] "
                f"{state.slot.label} encoder={state.slot.encoder_device} cuda={cuda_label} "
                f"segment={state.job.display_name}"
            ),
            err=True,
        )

    def job_finished(self, state: ActiveJobState, result: JobResult) -> None:
        elapsed = format_elapsed(time.monotonic() - state.started_at_monotonic)
        prefix = "completed" if result.status == "converted" else "failed"
        exit_text = "" if result.status == "converted" else f" exit={result.return_code}"
        click.echo(
            (
                f"{prefix}: [{state.submission_index}/{self.total_jobs}] "
                f"{state.slot.label} elapsed={elapsed}{exit_text} segment={state.job.display_name}"
            ),
            err=True,
        )
        if result.status == "failed":
            for line in failure_excerpt(result):
                click.echo(f"  {line}", err=True)

    def heartbeat(
        self,
        *,
        completed_count: int,
        failed_count: int,
        active_states: list[ActiveJobState],
    ) -> None:
        active_count = len(active_states)
        remaining_count = self.total_jobs - completed_count - failed_count - active_count
        click.echo(
            (
                f"progress: completed={completed_count} failed={failed_count} "
                f"active={active_count} remaining={remaining_count}"
            ),
            err=True,
        )


class TableProgressReporter(ProgressReporter):
    def __init__(self, total_jobs: int) -> None:
        super().__init__(total_jobs)
        self.heartbeat_interval_seconds = TABLE_REFRESH_SECONDS
        self.table = ProgressTable(
            "#",
            "segment",
            "worker",
            "encoder",
            "cuda",
            "status",
            "elapsed_s",
            interactive=2,
            refresh_rate=10,
            default_column_alignment="left",
            default_column_width=12,
            pbar_show_throughput=False,
            pbar_show_progress=False,
            pbar_show_percents=False,
            pbar_show_eta=False,
            print_header_every_n_rows=30,
            file=sys.stderr,
        )
        self.table.add_column("#", width=4, alignment="right")
        self.table.add_column("segment", width=44, alignment="left")
        self.table.add_column("worker", width=8, alignment="left")
        self.table.add_column("encoder", width=10, alignment="left")
        self.table.add_column("cuda", width=6, alignment="left")
        self.table.add_column("status", width=12, alignment="left")
        self.table.add_column("elapsed_s", width=10, alignment="right")

    def job_started(self, state: ActiveJobState) -> None:
        self.table.add_row(
            state.submission_index,
            state.job.display_name,
            state.slot.label,
            state.slot.encoder_device,
            state.slot.cuda_visible_devices or "-",
            "running",
            format_elapsed(0.0),
        )
        state.row_index = self.table.num_rows() - 1

    def job_finished(self, state: ActiveJobState, result: JobResult) -> None:
        if state.row_index is None:
            return
        self.table.update("status", "converted" if result.status == "converted" else f"failed({result.return_code})", row=state.row_index)
        self.table.update(
            "elapsed_s",
            format_elapsed(time.monotonic() - state.started_at_monotonic),
            row=state.row_index,
        )

    def heartbeat(
        self,
        *,
        completed_count: int,
        failed_count: int,
        active_states: list[ActiveJobState],
    ) -> None:
        for state in active_states:
            if state.row_index is None:
                continue
            self.table.update(
                "elapsed_s",
                format_elapsed(time.monotonic() - state.started_at_monotonic),
                row=state.row_index,
            )

    def close(self) -> None:
        self.table.close()


def locate_binary(override: Path | None) -> Path:
    if override is not None:
        candidate = override.expanduser().resolve()
        if not candidate.is_file():
            raise click.ClickException(f"binary not found: {candidate}")
        return candidate

    candidates = (
        REPO_ROOT / "build" / "bin" / "zed_svo_to_mcap",
        REPO_ROOT / "build" / "zed_svo_to_mcap",
    )
    for candidate in candidates:
        if candidate.is_file():
            return candidate
    raise click.ClickException(f"could not find zed_svo_to_mcap under {REPO_ROOT / 'build'}")


def sorted_camera_labels(labels: set[str]) -> tuple[str, ...]:
    return tuple(sorted(labels, key=lambda label: int(label[3:])))


def scan_segment_dir(segment_dir: Path) -> SegmentScan:
    if not segment_dir.is_dir():
        return SegmentScan(
            segment_dir=segment_dir,
            matched_files=0,
            camera_labels=(),
            is_valid=False,
            reason=f"segment directory does not exist: {segment_dir}",
        )

    matched_by_camera: dict[str, list[Path]] = {}
    for child in segment_dir.iterdir():
        if not child.is_file():
            continue
        match = SEGMENT_FILE_PATTERN.fullmatch(child.name)
        if match is None:
            continue
        label = f"zed{int(match.group(1))}"
        matched_by_camera.setdefault(label, []).append(child)

    matched_files = sum(len(paths) for paths in matched_by_camera.values())
    camera_labels = sorted_camera_labels(set(matched_by_camera))
    duplicate_cameras = [label for label, paths in sorted(matched_by_camera.items()) if len(paths) > 1]

    if duplicate_cameras:
        duplicate_text = ", ".join(duplicate_cameras)
        return SegmentScan(
            segment_dir=segment_dir,
            matched_files=matched_files,
            camera_labels=camera_labels,
            is_valid=False,
            reason=f"duplicate camera inputs under {segment_dir}: {duplicate_text}",
        )
    if len(camera_labels) < 2:
        return SegmentScan(
            segment_dir=segment_dir,
            matched_files=matched_files,
            camera_labels=camera_labels,
            is_valid=False,
            reason=f"expected at least 2 camera inputs under {segment_dir}, found {len(camera_labels)}",
        )

    return SegmentScan(
        segment_dir=segment_dir,
        matched_files=matched_files,
        camera_labels=camera_labels,
        is_valid=True,
    )


def output_path_for(segment_dir: Path) -> Path:
    return segment_dir / f"{segment_dir.name}.mcap"


def common_segment_parent(segment_dirs: tuple[Path, ...]) -> Path | None:
    if len(segment_dirs) <= 1:
        return None
    try:
        return Path(os.path.commonpath([str(path) for path in segment_dirs]))
    except ValueError:
        return None


def display_name_for_segment(
    segment_dir: Path,
    *,
    source_mode: str,
    input_root: Path | None,
    common_parent: Path | None,
) -> str:
    if source_mode == "dataset-root" and input_root is not None:
        try:
            return str(segment_dir.relative_to(input_root))
        except ValueError:
            pass
    if common_parent is not None:
        try:
            relative = segment_dir.relative_to(common_parent)
            if str(relative) != ".":
                return str(relative)
        except ValueError:
            pass
    parent_name = segment_dir.parent.name
    if parent_name:
        return str(Path(parent_name) / segment_dir.name)
    return segment_dir.name


def command_for_job(job: ConversionJob, config: BatchConfig, encoder_device: str) -> list[str]:
    if config.zed_bin is None:
        raise RuntimeError("zed_svo_to_mcap binary is not configured")

    command = [
        str(config.zed_bin),
        "--segment-dir",
        str(job.segment_dir),
        "--codec",
        config.codec,
        "--encoder-device",
        encoder_device,
        "--mcap-compression",
        config.mcap_compression,
        "--depth-mode",
        config.depth_mode,
        "--depth-size",
        config.depth_size,
        "--bundle-policy",
        config.bundle_policy,
        "--copy-range",
        config.copy_range,
    ]
    if config.bundle_topic and config.bundle_policy != "copy":
        command.extend(["--bundle-topic", config.bundle_topic])
    if config.with_pose:
        command.append("--with-pose")
    if config.pose_config is not None:
        command.extend(["--pose-config", str(config.pose_config)])
    if config.world_frame_id is not None:
        command.extend(["--world-frame-id", config.world_frame_id])
    if config.start_frame is not None:
        command.extend(["--start-frame", str(config.start_frame)])
    if config.end_frame is not None:
        command.extend(["--end-frame", str(config.end_frame)])
    if config.sync_tolerance_ms is not None:
        command.extend(["--sync-tolerance-ms", str(config.sync_tolerance_ms)])
    return command


def env_for_job(config: BatchConfig) -> dict[str, str]:
    return env_for_job_with_cuda(None)


def env_for_job_with_cuda(assigned_cuda_visible_devices: str | None) -> dict[str, str]:
    env = dict(os.environ)
    if assigned_cuda_visible_devices is not None:
        env["CUDA_VISIBLE_DEVICES"] = assigned_cuda_visible_devices
    return env


def parse_cuda_device_pool(raw_value: str | None) -> tuple[str, ...]:
    if raw_value is None:
        return ()
    devices = tuple(device.strip() for device in raw_value.split(",") if device.strip())
    return devices


def choose_progress_reporter(progress_ui: str, total_jobs: int) -> ProgressReporter:
    if progress_ui == "table":
        return TableProgressReporter(total_jobs)
    if progress_ui == "text":
        return TextProgressReporter(total_jobs)
    if sys.stderr.isatty():
        return TableProgressReporter(total_jobs)
    return TextProgressReporter(total_jobs)


def load_mcap_reader():
    global _MCAP_READER_MODULE
    if _MCAP_READER_MODULE is not None:
        return _MCAP_READER_MODULE

    if str(MCAP_PYTHON_ROOT) not in sys.path:
        sys.path.insert(0, str(MCAP_PYTHON_ROOT))
    try:
        _MCAP_READER_MODULE = importlib.import_module("mcap.reader")
    except ModuleNotFoundError as error:
        raise click.ClickException(
            f"could not import mcap.reader from {MCAP_PYTHON_ROOT}"
        ) from error
    return _MCAP_READER_MODULE


def required_topics_for(camera_labels: tuple[str, ...]) -> set[str]:
    topics: set[str] = set()
    for label in camera_labels:
        topics.add(f"/{label}/video")
        topics.add(f"/{label}/depth")
        topics.add(f"/{label}/calibration")
    return topics


def load_bundle_manifest_type(schema_data: bytes) -> tuple[object, int | None]:
    cached = _BUNDLE_MANIFEST_CLASS_CACHE.get(schema_data)
    if cached is not None:
        return cached

    from google.protobuf import descriptor_pb2, descriptor_pool, message_factory, timestamp_pb2

    descriptor_set = descriptor_pb2.FileDescriptorSet()
    descriptor_set.ParseFromString(schema_data)
    pool = descriptor_pool.DescriptorPool()
    has_embedded_timestamp = any(
        file_descriptor.name == "google/protobuf/timestamp.proto"
        for file_descriptor in descriptor_set.file
    )
    if has_embedded_timestamp:
        for file_descriptor in descriptor_set.file:
            if file_descriptor.name == "google/protobuf/timestamp.proto":
                pool.Add(file_descriptor)
                break
    else:
        pool.AddSerializedFile(timestamp_pb2.DESCRIPTOR.serialized_pb)
    for file_descriptor in descriptor_set.file:
        if file_descriptor.name == "google/protobuf/timestamp.proto":
            continue
        pool.Add(file_descriptor)
    message_descriptor = pool.FindMessageTypeByName("cvmmap_streamer.BundleManifest")
    message_class = message_factory.GetMessageClass(message_descriptor)
    present_value = None
    if "BundleMemberStatus" in message_descriptor.enum_types_by_name:
        status_enum = message_descriptor.enum_types_by_name["BundleMemberStatus"]
        present_value = status_enum.values_by_name["BUNDLE_MEMBER_STATUS_PRESENT"].number
    _BUNDLE_MANIFEST_CLASS_CACHE[schema_data] = (message_class, present_value)
    return message_class, present_value


def probe_output(
    output_path: Path,
    camera_labels: tuple[str, ...],
    *,
    layout: str,
    bundle_topic: str | None,
) -> OutputProbeResult:
    if not output_path.is_file():
        return OutputProbeResult(output_path=output_path, status="missing")

    reader_module = load_mcap_reader()
    expected_topics = required_topics_for(camera_labels)
    require_bundle = layout == "bundled" and len(camera_labels) > 1 and bool(bundle_topic)
    if require_bundle:
        expected_topics.add(bundle_topic or "/bundle")
    found_topics: set[str] = set()
    video_counts: Counter[str] = Counter()
    depth_counts: Counter[str] = Counter()
    bundle_present_counts: Counter[str] = Counter()
    expected_camera_labels = set(camera_labels)

    try:
        with output_path.open("rb") as stream:
            reader = reader_module.make_reader(stream)
            for schema, channel, message in reader.iter_messages():
                if layout == "copy" and channel.topic == "/bundle":
                    return OutputProbeResult(
                        output_path=output_path,
                        status="invalid",
                        reason="copy-layout MCAP must not contain /bundle",
                    )
                if channel.topic in expected_topics:
                    found_topics.add(channel.topic)
                if channel.topic.endswith("/video"):
                    video_counts[channel.topic.removeprefix("/").removesuffix("/video")] += 1
                    continue
                if channel.topic.endswith("/depth"):
                    depth_counts[channel.topic.removeprefix("/").removesuffix("/depth")] += 1
                    continue
                if require_bundle and channel.topic == bundle_topic:
                    if schema is None or schema.name != "cvmmap_streamer.BundleManifest":
                        return OutputProbeResult(
                            output_path=output_path,
                            status="invalid",
                            reason=f"bundle topic '{bundle_topic}' is missing the BundleManifest schema",
                        )
                    try:
                        bundle_class, present_value = load_bundle_manifest_type(schema.data)
                        bundle = bundle_class()
                        bundle.ParseFromString(message.data)
                    except Exception as error:  # noqa: BLE001
                        return OutputProbeResult(
                            output_path=output_path,
                            status="invalid",
                            reason=f"failed to parse bundle manifest: {error}",
                        )

                    bundle_labels: set[str] = set()
                    for member in bundle.members:
                        label = str(member.camera_label)
                        if label not in expected_camera_labels:
                            return OutputProbeResult(
                                output_path=output_path,
                                status="invalid",
                                reason=f"bundle manifest referenced unknown camera label '{label}'",
                            )
                        if label in bundle_labels:
                            return OutputProbeResult(
                                output_path=output_path,
                                status="invalid",
                                reason=f"bundle manifest duplicated camera label '{label}'",
                            )
                        bundle_labels.add(label)
                        is_present = member.HasField("timestamp")
                        if present_value is not None:
                            is_present = member.status == present_value
                            if is_present and not member.HasField("timestamp"):
                                return OutputProbeResult(
                                    output_path=output_path,
                                    status="invalid",
                                    reason=f"bundle member '{label}' is present but missing a timestamp",
                                )
                        if is_present:
                            bundle_present_counts[label] += 1
                    if bundle_labels != expected_camera_labels:
                        missing_labels = sorted(expected_camera_labels - bundle_labels)
                        extra_labels = sorted(bundle_labels - expected_camera_labels)
                        details = []
                        if missing_labels:
                            details.append("missing=" + ",".join(missing_labels))
                        if extra_labels:
                            details.append("extra=" + ",".join(extra_labels))
                        return OutputProbeResult(
                            output_path=output_path,
                            status="invalid",
                            reason="bundle manifest camera coverage mismatch: " + " ".join(details),
                        )
    except Exception as error:  # noqa: BLE001
        return OutputProbeResult(output_path=output_path, status="invalid", reason=str(error))

    missing_topics = sorted(expected_topics - found_topics)
    if missing_topics:
        return OutputProbeResult(
            output_path=output_path,
            status="invalid",
            reason="missing expected topics: " + ", ".join(missing_topics),
        )
    if require_bundle:
        for label in camera_labels:
            present_count = bundle_present_counts[label]
            if video_counts[label] != present_count:
                return OutputProbeResult(
                    output_path=output_path,
                    status="invalid",
                    reason=(
                        f"video count mismatch for {label}: "
                        f"bundle_present={present_count} video_messages={video_counts[label]}"
                    ),
                )
            if depth_counts[label] != present_count:
                return OutputProbeResult(
                    output_path=output_path,
                    status="invalid",
                    reason=(
                        f"depth count mismatch for {label}: "
                        f"bundle_present={present_count} depth_messages={depth_counts[label]}"
                    ),
                )
    else:
        for label in camera_labels:
            if video_counts[label] != depth_counts[label]:
                return OutputProbeResult(
                    output_path=output_path,
                    status="invalid",
                    reason=(
                        f"video/depth count mismatch for {label}: "
                        f"video_messages={video_counts[label]} depth_messages={depth_counts[label]}"
                    ),
                )
    return OutputProbeResult(output_path=output_path, status="valid")


def run_conversion(job: ConversionJob, config: BatchConfig) -> JobResult:
    return run_conversion_on_slot(
        job,
        config,
        WorkerSlot(label="job-1", encoder_device="auto", cuda_visible_devices=None),
    )


def run_conversion_on_slot(
    job: ConversionJob,
    config: BatchConfig,
    slot: WorkerSlot,
) -> JobResult:
    command = command_for_job(job, config, slot.encoder_device)
    completed = subprocess.run(
        command,
        check=False,
        capture_output=True,
        text=True,
        env=env_for_job_with_cuda(slot.cuda_visible_devices),
    )
    status = "converted" if completed.returncode == 0 else "failed"
    return JobResult(
        status=status,
        segment_dir=job.segment_dir,
        output_path=job.output_path,
        command=tuple(command),
        return_code=completed.returncode,
        stdout=completed.stdout,
        stderr=completed.stderr,
    )


def split_lines_for_excerpt(text: str, max_lines: int = 8) -> list[str]:
    lines = [line.rstrip() for line in text.splitlines() if line.strip()]
    if len(lines) <= max_lines:
        return lines
    head_count = max(1, max_lines // 2)
    tail_count = max_lines - head_count
    excerpt = lines[:head_count]
    omitted = len(lines) - head_count - tail_count
    if omitted > 0:
        excerpt.append(f"... ({omitted} omitted line(s))")
    excerpt.extend(lines[-tail_count:])
    return excerpt


def failure_excerpt(result: JobResult, max_lines: int = 8) -> list[str]:
    if result.stderr.strip():
        return split_lines_for_excerpt(result.stderr, max_lines=max_lines)
    if result.stdout.strip():
        return split_lines_for_excerpt(result.stdout, max_lines=max_lines)
    return []


def summarize_failures(results: list[JobResult]) -> None:
    failed_results = [result for result in results if result.status == "failed"]
    if not failed_results:
        return

    click.echo("\nFailed conversions:", err=True)
    for result in failed_results:
        click.echo(f"- {result.segment_dir} (exit {result.return_code})", err=True)
        if result.stderr.strip():
            click.echo(result.stderr.rstrip(), err=True)
        elif result.stdout.strip():
            click.echo(result.stdout.rstrip(), err=True)


def report_invalid_existing_outputs(
    invalid_existing: list[tuple[ConversionJob, OutputProbeResult]],
) -> None:
    if not invalid_existing:
        return

    click.echo("\nInvalid existing outputs:", err=True)
    for job, probe in invalid_existing:
        click.echo(f"- {job.segment_dir}", err=True)
        click.echo(f"  output: {probe.output_path}", err=True)
        reason_lines = probe.reason.splitlines() or [probe.reason]
        click.echo(f"  reason: {reason_lines[0]}", err=True)
        for line in reason_lines[1:]:
            click.echo(f"          {line}", err=True)


def report_dry_run_plan(
    pending_jobs: list[ConversionJob],
    pending_reasons: dict[Path, str],
    pending_details: dict[Path, str],
) -> None:
    if not pending_jobs:
        click.echo("dry-run: no conversions would be launched", err=True)
        return

    click.echo("\nDry-run plan:", err=True)
    for job in pending_jobs:
        reason = pending_reasons[job.segment_dir]
        detail = pending_details.get(job.segment_dir)
        line = f"- {job.segment_dir} [{reason}]"
        if detail:
            line = f"{line}: {detail.replace(chr(10), ' | ')}"
        click.echo(line, err=True)


def run_batch(
    jobs: list[ConversionJob],
    config: BatchConfig,
    worker_slots: list[WorkerSlot],
) -> tuple[list[JobResult], int]:
    results: list[JobResult] = []
    aborted_count = 0
    if not jobs:
        return results, aborted_count
    if not worker_slots:
        raise click.ClickException("no worker slots configured")

    available_slots = list(worker_slots)
    max_parallel_jobs = len(worker_slots)
    future_to_job: dict[concurrent.futures.Future[JobResult], ActiveJobState] = {}
    job_iter = iter(jobs)
    stop_submitting = False
    completed_count = 0
    failed_count = 0
    submission_index = 0
    reporter = choose_progress_reporter(config.progress_ui, len(jobs))
    last_heartbeat_at = time.monotonic()

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_parallel_jobs) as executor:

        def submit_next() -> bool:
            nonlocal submission_index
            if stop_submitting or not available_slots:
                return False
            slot = available_slots.pop(0)
            try:
                job = next(job_iter)
            except StopIteration:
                available_slots.insert(0, slot)
                return False

            submission_index += 1
            state = ActiveJobState(
                submission_index=submission_index,
                job=job,
                slot=slot,
                started_at_monotonic=time.monotonic(),
            )
            reporter.job_started(state)
            future = executor.submit(run_conversion_on_slot, job, config, slot)
            future_to_job[future] = state
            return True

        for _ in range(min(max_parallel_jobs, len(jobs))):
            submit_next()

        while future_to_job:
            done, _ = concurrent.futures.wait(
                future_to_job,
                timeout=reporter.heartbeat_interval_seconds,
                return_when=concurrent.futures.FIRST_COMPLETED,
            )
            if not done:
                reporter.heartbeat(
                    completed_count=completed_count,
                    failed_count=failed_count,
                    active_states=list(future_to_job.values()),
                )
                last_heartbeat_at = time.monotonic()
                continue

            for future in done:
                state = future_to_job.pop(future)
                available_slots.append(state.slot)
                result = future.result()
                results.append(result)
                reporter.job_finished(state, result)

                if result.status == "failed":
                    failed_count += 1
                    if config.fail_fast:
                        stop_submitting = True
                else:
                    completed_count += 1

                if not stop_submitting:
                    submit_next()

            now = time.monotonic()
            if now - last_heartbeat_at >= reporter.heartbeat_interval_seconds:
                reporter.heartbeat(
                    completed_count=completed_count,
                    failed_count=failed_count,
                    active_states=list(future_to_job.values()),
                )
                last_heartbeat_at = now

        if stop_submitting:
            aborted_count = sum(1 for _ in job_iter)

    reporter.close()

    return results, aborted_count


def build_uniform_worker_slots(
    jobs: int,
    encoder_device: str,
    cuda_visible_devices: str | None,
) -> list[WorkerSlot]:
    if jobs < 1:
        raise click.ClickException("--jobs must be at least 1")

    if cuda_visible_devices is None:
        return [
            WorkerSlot(
                label=f"job-{index + 1}",
                encoder_device=encoder_device,
                cuda_visible_devices=None,
            )
            for index in range(jobs)
        ]

    device_pool = parse_cuda_device_pool(cuda_visible_devices)
    if len(device_pool) < jobs:
        raise click.ClickException(
            f"--cuda-visible-devices must provide at least {jobs} entries when --jobs={jobs}"
        )
    return [
        WorkerSlot(
            label=f"job-{index + 1}",
            encoder_device=encoder_device,
            cuda_visible_devices=device_pool[index],
        )
        for index in range(jobs)
    ]


def parse_required_device_pool(raw_value: str | None, expected_count: int, flag_name: str) -> tuple[str, ...]:
    if expected_count == 0:
        if raw_value is None:
            return ()
        raise click.ClickException(f"{flag_name} cannot be used when the matching job count is 0")

    device_pool = parse_cuda_device_pool(raw_value)
    if len(device_pool) != expected_count:
        raise click.ClickException(
            f"{flag_name} must provide exactly {expected_count} entries when the matching job count is {expected_count}"
        )
    return device_pool


def build_worker_slots(
    *,
    jobs: int,
    encoder_device: str,
    cuda_visible_devices: str | None,
    hardware_jobs: int,
    hardware_cuda_visible_devices: str | None,
    software_jobs: int,
    software_cuda_visible_devices: str | None,
) -> list[WorkerSlot]:
    mixed_mode_requested = any(
        (
            hardware_jobs > 0,
            software_jobs > 0,
            hardware_cuda_visible_devices is not None,
            software_cuda_visible_devices is not None,
        )
    )
    if not mixed_mode_requested:
        return build_uniform_worker_slots(jobs, encoder_device, cuda_visible_devices)

    if jobs != 1:
        raise click.ClickException("--jobs cannot be combined with mixed worker pool flags")
    if cuda_visible_devices is not None:
        raise click.ClickException("--cuda-visible-devices cannot be combined with mixed worker pool flags")
    if encoder_device != "auto":
        raise click.ClickException("--encoder-device cannot be combined with mixed worker pool flags")

    total_jobs = hardware_jobs + software_jobs
    if total_jobs < 1:
        raise click.ClickException("mixed worker pool flags require at least one hardware or software job")

    hardware_device_pool = parse_required_device_pool(
        hardware_cuda_visible_devices,
        hardware_jobs,
        "--hardware-cuda-visible-devices",
    )
    software_device_pool = parse_required_device_pool(
        software_cuda_visible_devices,
        software_jobs,
        "--software-cuda-visible-devices",
    )

    worker_slots: list[WorkerSlot] = []
    worker_slots.extend(
        WorkerSlot(
            label=f"hw-{index + 1}",
            encoder_device="nvidia",
            cuda_visible_devices=device,
        )
        for index, device in enumerate(hardware_device_pool)
    )
    worker_slots.extend(
        WorkerSlot(
            label=f"sw-{index + 1}",
            encoder_device="software",
            cuda_visible_devices=device,
        )
        for index, device in enumerate(software_device_pool)
    )
    return worker_slots


@click.command(context_settings={"allow_extra_args": True})
@click.option(
    "--dataset-root",
    type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
    help="Dataset root containing segment directories. Mutually exclusive with --segment and --segments-csv.",
)
@click.option(
    "--segment",
    "segment_dirs",
    multiple=True,
    type=click.Path(exists=True, path_type=Path, file_okay=False, dir_okay=True),
    help=(
        "Explicit segment directory. Repeatable. The directory must directly contain "
        "*_zedN.svo or *_zedN.svo2 files. Mutually exclusive with --dataset-root and --segments-csv."
    ),
)
@click.option(
    "--segment-dir",
    "legacy_segment_dirs",
    multiple=True,
    type=click.Path(path_type=Path, file_okay=False, dir_okay=True),
    hidden=True,
)
@click.option(
    "--segments-csv",
    type=click.Path(path_type=Path, dir_okay=False),
    help="CSV file containing a segment_dir column. Mutually exclusive with --dataset-root and --segment.",
)
@click.option(
    "--csv-root",
    type=click.Path(path_type=Path, file_okay=False, dir_okay=True),
    help="Base directory for relative segment_dir entries in --segments-csv. Defaults to the CSV parent directory.",
)
@click.option(
    "--recursive/--no-recursive",
    default=True,
    show_default=True,
    help="Recurse when discovering segment directories from --dataset-root.",
)
@click.option("--jobs", default=1, show_default=True, type=click.IntRange(min=1), help="Parallel conversion jobs.")
@click.option(
    "--hardware-jobs",
    default=0,
    show_default=True,
    type=click.IntRange(min=0),
    help="Mixed mode: number of hardware-encoded workers.",
)
@click.option(
    "--hardware-cuda-visible-devices",
    help="Mixed mode: comma-separated CUDA_VISIBLE_DEVICES assignments for hardware workers, one entry per worker.",
)
@click.option(
    "--software-jobs",
    default=0,
    show_default=True,
    type=click.IntRange(min=0),
    help="Mixed mode: number of software-encoded workers.",
)
@click.option(
    "--software-cuda-visible-devices",
    help="Mixed mode: comma-separated CUDA_VISIBLE_DEVICES assignments for software workers, one entry per worker.",
)
@click.option(
    "--zed-bin",
    type=click.Path(path_type=Path, dir_okay=False),
    help="Explicit path to the zed_svo_to_mcap binary.",
)
@click.option(
    "--cuda-visible-devices",
    help="Optional CUDA_VISIBLE_DEVICES value. A comma-separated list is distributed across concurrent jobs one GPU per subprocess.",
)
@click.option("--overwrite/--skip-existing", default=False, show_default=True, help="Overwrite existing MCAP files.")
@click.option(
    "--probe-existing/--trust-existing",
    default=False,
    show_default=True,
    help="Validate existing MCAP files before skipping them. Invalid outputs are treated as missing.",
)
@click.option(
    "--report-existing",
    is_flag=True,
    help="Probe existing MCAP files, report invalid ones, and do not launch conversions.",
)
@click.option(
    "--dry-run",
    is_flag=True,
    help="Show which segments would be converted after applying skip/probe logic, without launching conversions.",
)
@click.option(
    "--fail-fast/--continue-on-error",
    default=False,
    show_default=True,
    help="Stop submitting new work after the first failed conversion.",
)
@click.option("--codec", type=click.Choice(("h264", "h265")), default="h265", show_default=True)
@click.option(
    "--encoder-device",
    type=click.Choice(("auto", "nvidia", "software")),
    default="auto",
    show_default=True,
)
@click.option(
    "--mcap-compression",
    type=click.Choice(("none", "lz4", "zstd")),
    default="zstd",
    show_default=True,
)
@click.option(
    "--depth-mode",
    type=click.Choice(("neural_light", "neural", "neural_plus")),
    default="neural_plus",
    show_default=True,
)
@click.option(
    "--depth-size",
    type=str,
    default="optimal",
    show_default=True,
)
@click.option(
    "--bundle-policy",
    type=click.Choice(("nearest", "strict", "copy")),
    default="nearest",
    show_default=True,
    help="Bundling policy for multi-camera MCAP export.",
)
@click.option(
    "--copy-range",
    type=click.Choice(("common", "full")),
    default="common",
    show_default=True,
    help="Timestamp range used when --bundle-policy copy is selected.",
)
@click.option(
    "--bundle-topic",
    default="/bundle",
    show_default=True,
    help="Topic used for bundled multi-camera manifest messages.",
)
@click.option("--with-pose", is_flag=True, help="Enable per-camera positional tracking export when available.")
@click.option(
    "--pose-config",
    type=click.Path(path_type=Path, dir_okay=False),
    help="TOML config passed to zed_svo_to_mcap for pose tracking settings.",
)
@click.option(
    "--world-frame-id",
    default=None,
    help="Optional pose reference frame id passed through to zed_svo_to_mcap.",
)
@click.option(
    "--start-frame",
    type=click.IntRange(min=0),
    default=None,
    help="First bundle index to export (inclusive) in bundled multi-camera mode.",
)
@click.option(
    "--end-frame",
    type=click.IntRange(min=0),
    default=None,
    help="Last bundle index to export (inclusive) in bundled multi-camera mode.",
)
@click.option(
    "--sync-tolerance-ms",
    type=click.FloatRange(min=0.0, min_open=True),
    default=None,
    help="Override the maximum timestamp delta used by strict bundled sync.",
)
@click.option(
    "--progress-ui",
    type=click.Choice(("auto", "table", "text")),
    default="auto",
    show_default=True,
    help="Progress output mode. Auto uses a table on TTY and text logging otherwise.",
)
@click.pass_context
def main(
    ctx: click.Context,
    dataset_root: Path | None,
    segment_dirs: tuple[Path, ...],
    legacy_segment_dirs: tuple[Path, ...],
    segments_csv: Path | None,
    csv_root: Path | None,
    recursive: bool,
    jobs: int,
    hardware_jobs: int,
    hardware_cuda_visible_devices: str | None,
    software_jobs: int,
    software_cuda_visible_devices: str | None,
    zed_bin: Path | None,
    cuda_visible_devices: str | None,
    overwrite: bool,
    probe_existing: bool,
    report_existing: bool,
    dry_run: bool,
    fail_fast: bool,
    codec: str,
    encoder_device: str,
    mcap_compression: str,
    depth_mode: str,
    depth_size: str,
    bundle_policy: str,
    copy_range: str,
    bundle_topic: str,
    with_pose: bool,
    pose_config: Path | None,
    world_frame_id: str | None,
    start_frame: int | None,
    end_frame: int | None,
    sync_tolerance_ms: float | None,
    progress_ui: str,
) -> None:
    """Batch-convert multi-camera ZED segments into grouped MCAP files."""
    segment_sources.raise_for_legacy_extra_args(ctx.args)
    segment_sources.raise_for_legacy_source_args(None, legacy_segment_dirs)
    segment_sources.raise_if_recursive_flag_is_incompatible(ctx, dataset_root)

    if report_existing and dry_run:
        raise click.ClickException("--report-existing and --dry-run are mutually exclusive")
    if bundle_policy == "copy":
        if start_frame is not None or end_frame is not None:
            raise click.ClickException("--start-frame/--end-frame cannot be used with --bundle-policy copy")
        if sync_tolerance_ms is not None:
            raise click.ClickException("--sync-tolerance-ms cannot be used with --bundle-policy copy")
        if bundle_topic != "/bundle":
            raise click.ClickException("--bundle-topic cannot be customized with --bundle-policy copy")

    sources = segment_sources.resolve_sources(
        dataset_root,
        segment_dirs,
        segments_csv,
        csv_root,
        recursive,
        scan_segment_dir=scan_segment_dir,
        no_matches_message=lambda root: f"no multi-camera segments found under {root}",
    )
    binary_path = None if report_existing else locate_binary(zed_bin)
    worker_slots = build_worker_slots(
        jobs=jobs,
        encoder_device=encoder_device,
        cuda_visible_devices=cuda_visible_devices,
        hardware_jobs=hardware_jobs,
        hardware_cuda_visible_devices=hardware_cuda_visible_devices,
        software_jobs=software_jobs,
        software_cuda_visible_devices=software_cuda_visible_devices,
    )
    config = BatchConfig(
        zed_bin=binary_path,
        probe_existing=probe_existing or report_existing,
        overwrite=overwrite,
        fail_fast=fail_fast,
        codec=codec,
        mcap_compression=mcap_compression,
        depth_mode=depth_mode,
        depth_size=depth_size,
        bundle_policy=bundle_policy,
        copy_range=copy_range,
        bundle_topic=None if bundle_policy == "copy" else bundle_topic,
        with_pose=with_pose,
        pose_config=pose_config.expanduser().resolve() if pose_config is not None else None,
        world_frame_id=world_frame_id,
        start_frame=start_frame,
        end_frame=end_frame,
        sync_tolerance_ms=sync_tolerance_ms,
        progress_ui=progress_ui,
    )
    input_root = dataset_root.expanduser().resolve() if dataset_root is not None else None
    display_parent = common_segment_parent(sources.segment_dirs)

    skipped_results: list[JobResult] = []
    failed_results: list[JobResult] = []
    pending_jobs: list[ConversionJob] = []
    pending_reasons: dict[Path, str] = {}
    pending_details: dict[Path, str] = {}
    valid_existing: list[OutputProbeResult] = []
    invalid_existing: list[tuple[ConversionJob, OutputProbeResult]] = []
    missing_outputs: list[ConversionJob] = []

    for segment_dir in sources.segment_dirs:
        scan = scan_segment_dir(segment_dir)
        output_path = output_path_for(segment_dir)
        job = ConversionJob(
            segment_dir=segment_dir,
            output_path=output_path,
            camera_labels=scan.camera_labels,
            display_name=display_name_for_segment(
                segment_dir,
                source_mode=sources.mode,
                input_root=input_root,
                common_parent=display_parent,
            ),
        )
        default_encoder_device = worker_slots[0].encoder_device if worker_slots else encoder_device
        command = (
            tuple(command_for_job(job, config, default_encoder_device))
            if config.zed_bin is not None
            else ()
        )
        if not scan.is_valid:
            failed_results.append(
                JobResult(
                    status="failed",
                    segment_dir=segment_dir,
                    output_path=output_path,
                    command=command,
                    return_code=2,
                    stderr=scan.reason or "",
                )
            )
            continue

        if report_existing:
            probe_result = probe_output(
                output_path,
                job.camera_labels,
                layout="copy" if config.bundle_policy == "copy" else "bundled",
                bundle_topic=config.bundle_topic,
            )
            if probe_result.status == "valid":
                valid_existing.append(probe_result)
            elif probe_result.status == "invalid":
                invalid_existing.append((job, probe_result))
            else:
                missing_outputs.append(job)
            continue

        if overwrite:
            pending_jobs.append(job)
            pending_reasons[segment_dir] = "overwrite"
            continue

        if config.probe_existing:
            probe_result = probe_output(
                output_path,
                job.camera_labels,
                layout="copy" if config.bundle_policy == "copy" else "bundled",
                bundle_topic=config.bundle_topic,
            )
            if probe_result.status == "valid":
                valid_existing.append(probe_result)
                skipped_results.append(
                    JobResult(
                        status="skipped",
                        segment_dir=segment_dir,
                        output_path=output_path,
                        command=command,
                    )
                )
                continue
            if probe_result.status == "invalid":
                invalid_existing.append((job, probe_result))
                pending_jobs.append(job)
                pending_reasons[segment_dir] = "invalid-existing-output"
                pending_details[segment_dir] = probe_result.reason
                continue
            missing_outputs.append(job)
            pending_jobs.append(job)
            pending_reasons[segment_dir] = "missing-output"
            continue

        if output_path.exists():
            skipped_results.append(
                JobResult(
                    status="skipped",
                    segment_dir=segment_dir,
                    output_path=output_path,
                    command=command,
                )
            )
            continue

        missing_outputs.append(job)
        pending_jobs.append(job)
        pending_reasons[segment_dir] = "missing-output"

    if report_existing:
        click.echo(
            (
                f"source={sources.mode} matched={len(sources.segment_dirs)} valid={len(valid_existing)} "
                f"invalid={len(invalid_existing)} missing={len(missing_outputs)} "
                f"invalid-segments={len(failed_results)}"
            ),
            err=True,
        )
        if sources.ignored_partial_dirs:
            click.echo(f"ignored_incomplete={len(sources.ignored_partial_dirs)}", err=True)
        report_invalid_existing_outputs(invalid_existing)
        summarize_failures(failed_results)
        if failed_results or invalid_existing:
            raise SystemExit(1)
        return

    click.echo(
        (
            f"source={sources.mode} matched={len(sources.segment_dirs)} pending={len(pending_jobs)} "
            f"skipped={len(skipped_results)} invalid={len(failed_results)} jobs={len(worker_slots)} "
            f"dry_run={'yes' if dry_run else 'no'}"
        ),
        err=True,
    )
    if sources.ignored_partial_dirs:
        click.echo(f"ignored_incomplete={len(sources.ignored_partial_dirs)}", err=True)
    if config.probe_existing:
        click.echo(
            (
                f"probed-existing: valid={len(valid_existing)} invalid={len(invalid_existing)} "
                f"missing={len(missing_outputs)}"
            ),
            err=True,
        )

    if dry_run:
        report_dry_run_plan(pending_jobs, pending_reasons, pending_details)
        summarize_failures(failed_results)
        if failed_results:
            raise SystemExit(1)
        return

    results = list(skipped_results)
    results.extend(failed_results)
    conversion_results, aborted_count = run_batch(pending_jobs, config, worker_slots)
    results.extend(conversion_results)

    converted_count = sum(1 for result in results if result.status == "converted")
    skipped_count = sum(1 for result in results if result.status == "skipped")
    failed_count = sum(1 for result in results if result.status == "failed")

    click.echo(
        (
            f"summary: matched={len(sources.segment_dirs)} converted={converted_count} "
            f"skipped={skipped_count} failed={failed_count} aborted={aborted_count}"
        ),
        err=True,
    )
    summarize_failures(results)

    if failed_count > 0 or aborted_count > 0:
        raise SystemExit(1)


if __name__ == "__main__":
    main()