Redesign batch segment source selection

This commit is contained in:
2026-04-08 08:07:05 +00:00
parent c320bf01af
commit 0a3da46f19
7 changed files with 642 additions and 268 deletions
+47 -130
View File
@@ -3,7 +3,6 @@
from __future__ import annotations
import concurrent.futures
import csv
import importlib
import os
import re
@@ -17,6 +16,11 @@ from pathlib import Path
import click
from progress_table import ProgressTable
try:
from scripts import zed_batch_segment_sources as segment_sources
except ModuleNotFoundError:
import zed_batch_segment_sources as segment_sources
SCRIPT_PATH = Path(__file__).resolve()
REPO_ROOT = SCRIPT_PATH.parents[1]
@@ -82,13 +86,6 @@ class SegmentScan:
reason: str | None = None
@dataclass(slots=True, frozen=True)
class SourceResolution:
mode: str
segment_dirs: tuple[Path, ...]
ignored_partial_dirs: tuple[SegmentScan, ...]
@dataclass(slots=True, frozen=True)
class OutputProbeResult:
output_path: Path
@@ -339,116 +336,6 @@ def scan_segment_dir(segment_dir: Path) -> SegmentScan:
)
def dedupe_paths(paths: list[Path]) -> list[Path]:
    """Expand and resolve *paths*, dropping later duplicates.

    Each path is passed through ``expanduser().resolve()``; the first
    occurrence of each resolved path wins and original order is kept.
    """
    # dict preserves insertion order, so it doubles as an ordered set.
    unique: dict[Path, None] = {}
    for candidate in paths:
        unique.setdefault(candidate.expanduser().resolve(), None)
    return list(unique)
def discover_segment_dirs(root: Path, recursive: bool) -> SourceResolution:
    """Find valid segment directories under *root*.

    *root* itself plus every subdirectory (recursively when *recursive*)
    is scanned with ``scan_segment_dir``. Directories that only partially
    match are collected separately so callers can report them.

    Raises ``click.ClickException`` when *root* is not a directory or no
    valid segment directory is found.
    """
    if not root.is_dir():
        raise click.ClickException(f"input directory does not exist: {root}")

    walker = root.rglob("*") if recursive else root.iterdir()
    candidates = {root.resolve()}
    candidates.update(entry.resolve() for entry in walker if entry.is_dir())

    valid: list[Path] = []
    partial: list[SegmentScan] = []
    for directory in sorted(candidates):
        scan = scan_segment_dir(directory)
        if scan.is_valid:
            valid.append(directory)
        elif scan.matched_files > 0:
            # Looks like a segment dir but is missing files; surface later.
            partial.append(scan)

    if not valid:
        raise click.ClickException(f"no multi-camera segments found under {root}")
    return SourceResolution(
        mode="discovery",
        segment_dirs=tuple(valid),
        ignored_partial_dirs=tuple(partial),
    )
def parse_segments_csv(csv_path: Path, csv_root: Path | None) -> tuple[Path, ...]:
    """Read segment directories from the ``segment_dir`` column of *csv_path*.

    Relative entries are resolved against *csv_root* (when given) or the
    CSV's parent directory. Duplicates are dropped, keeping first-seen
    order. Raises ``click.ClickException`` for a missing file, a missing
    header, an empty cell, or an empty CSV.
    """
    csv_path = csv_path.expanduser().resolve()
    if not csv_path.is_file():
        raise click.ClickException(f"CSV not found: {csv_path}")

    base_dir = csv_path.parent
    if csv_root is not None:
        base_dir = csv_root.expanduser().resolve()
        if not base_dir.is_dir():
            raise click.ClickException(f"CSV root is not a directory: {base_dir}")

    # Ordered-set via dict: first occurrence of each resolved path wins.
    collected: dict[Path, None] = {}
    with csv_path.open(newline="") as stream:
        reader = csv.DictReader(stream)
        if reader.fieldnames is None or "segment_dir" not in reader.fieldnames:
            raise click.ClickException(f"{csv_path} must contain a 'segment_dir' header")
        # Data starts at physical line 2 (line 1 is the header).
        for row_number, row in enumerate(reader, start=2):
            raw = (row.get("segment_dir") or "").strip()
            if not raw:
                raise click.ClickException(f"{csv_path}:{row_number} has an empty segment_dir value")
            entry = Path(raw)
            if not entry.is_absolute():
                entry = base_dir / entry
            collected.setdefault(entry.expanduser().resolve(), None)

    if not collected:
        raise click.ClickException(f"{csv_path} did not contain any segment_dir rows")
    return tuple(collected)
def resolve_sources(
    input_dir: Path | None,
    segment_dirs: tuple[Path, ...],
    segments_csv: Path | None,
    csv_root: Path | None,
    recursive: bool,
) -> SourceResolution:
    """Turn exactly one of the three source modes into a ``SourceResolution``.

    Exactly one of *input_dir* (discovery), *segment_dirs* (explicit), or
    *segments_csv* (CSV listing) must be provided; otherwise a
    ``click.ClickException`` is raised.
    """
    chosen = [input_dir is not None, bool(segment_dirs), segments_csv is not None]
    if chosen.count(True) != 1:
        raise click.ClickException(
            "provide exactly one source mode: INPUT_DIR, --segment-dir, or --segments-csv"
        )

    if input_dir is not None:
        return discover_segment_dirs(input_dir.expanduser().resolve(), recursive)

    if segment_dirs:
        return SourceResolution(
            mode="segment-dir",
            segment_dirs=tuple(dedupe_paths(list(segment_dirs))),
            ignored_partial_dirs=(),
        )

    return SourceResolution(
        mode="segments-csv",
        segment_dirs=parse_segments_csv(segments_csv, csv_root),
        ignored_partial_dirs=(),
    )
def output_path_for(segment_dir: Path) -> Path:
    """Return the MCAP output path ``<segment_dir>/<dirname>.mcap``."""
    stem = segment_dir.name
    return segment_dir.joinpath(f"{stem}.mcap")
@@ -469,7 +356,7 @@ def display_name_for_segment(
input_root: Path | None,
common_parent: Path | None,
) -> str:
if source_mode == "discovery" and input_root is not None:
if source_mode == "dataset-root" and input_root is not None:
try:
return str(segment_dir.relative_to(input_root))
except ValueError:
@@ -1071,30 +958,45 @@ def build_worker_slots(
return worker_slots
@click.command()
@click.argument(
"input_dir",
required=False,
@click.command(context_settings={"allow_extra_args": True})
@click.option(
"--dataset-root",
type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path),
help="Dataset root containing segment directories. Mutually exclusive with --segment and --segments-csv.",
)
@click.option(
"--segment",
"segment_dirs",
multiple=True,
type=click.Path(exists=True, path_type=Path, file_okay=False, dir_okay=True),
help=(
"Explicit segment directory. Repeatable. The directory must directly contain "
"*_zedN.svo or *_zedN.svo2 files. Mutually exclusive with --dataset-root and --segments-csv."
),
)
@click.option(
"--segment-dir",
"segment_dirs",
"legacy_segment_dirs",
multiple=True,
type=click.Path(path_type=Path, file_okay=False, dir_okay=True),
help="Explicit segment directory. Repeatable. Mutually exclusive with INPUT_DIR and --segments-csv.",
hidden=True,
)
@click.option(
"--segments-csv",
type=click.Path(path_type=Path, dir_okay=False),
help="CSV file containing a segment_dir column. Mutually exclusive with INPUT_DIR and --segment-dir.",
help="CSV file containing a segment_dir column. Mutually exclusive with --dataset-root and --segment.",
)
@click.option(
"--csv-root",
type=click.Path(path_type=Path, file_okay=False, dir_okay=True),
help="Base directory for relative segment_dir entries in --segments-csv. Defaults to the CSV parent directory.",
)
@click.option("--recursive/--no-recursive", default=True, show_default=True, help="Recurse when discovering segment directories from INPUT_DIR.")
@click.option(
"--recursive/--no-recursive",
default=True,
show_default=True,
help="Recurse when discovering segment directories from --dataset-root.",
)
@click.option("--jobs", default=1, show_default=True, type=click.IntRange(min=1), help="Parallel conversion jobs.")
@click.option(
"--hardware-jobs",
@@ -1231,9 +1133,12 @@ def build_worker_slots(
show_default=True,
help="Progress output mode. Auto uses a table on TTY and text logging otherwise.",
)
@click.pass_context
def main(
input_dir: Path | None,
ctx: click.Context,
dataset_root: Path | None,
segment_dirs: tuple[Path, ...],
legacy_segment_dirs: tuple[Path, ...],
segments_csv: Path | None,
csv_root: Path | None,
recursive: bool,
@@ -1266,6 +1171,10 @@ def main(
progress_ui: str,
) -> None:
"""Batch-convert multi-camera ZED segments into grouped MCAP files."""
segment_sources.raise_for_legacy_extra_args(ctx.args)
segment_sources.raise_for_legacy_source_args(None, legacy_segment_dirs)
segment_sources.raise_if_recursive_flag_is_incompatible(ctx, dataset_root)
if report_existing and dry_run:
raise click.ClickException("--report-existing and --dry-run are mutually exclusive")
if bundle_policy == "copy":
@@ -1276,8 +1185,16 @@ def main(
if bundle_topic != "/bundle":
raise click.ClickException("--bundle-topic cannot be customized with --bundle-policy copy")
sources = segment_sources.resolve_sources(
dataset_root,
segment_dirs,
segments_csv,
csv_root,
recursive,
scan_segment_dir=scan_segment_dir,
no_matches_message=lambda root: f"no multi-camera segments found under {root}",
)
binary_path = None if report_existing else locate_binary(zed_bin)
sources = resolve_sources(input_dir, segment_dirs, segments_csv, csv_root, recursive)
worker_slots = build_worker_slots(
jobs=jobs,
encoder_device=encoder_device,
@@ -1307,7 +1224,7 @@ def main(
sync_tolerance_ms=sync_tolerance_ms,
progress_ui=progress_ui,
)
input_root = input_dir.expanduser().resolve() if input_dir is not None else None
input_root = dataset_root.expanduser().resolve() if dataset_root is not None else None
display_parent = common_segment_parent(sources.segment_dirs)
skipped_results: list[JobResult] = []