diff --git a/docs/0.get_started.md b/docs/0.get_started.md index fff74bc..d677bcf 100644 --- a/docs/0.get_started.md +++ b/docs/0.get_started.md @@ -49,6 +49,8 @@ CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch --nproc_per_node=2 o You can run commands in [train.sh](train.sh) for training different models. +For long-running local jobs, prefer the supervised `systemd-run --user` workflow documented in [systemd-run-training.md](systemd-run-training.md). It uses `torchrun`, UUID-based GPU selection, real log files, and survives shell/session teardown more reliably than `nohup ... &`. + ## Test Evaluate the trained model by ``` diff --git a/docs/sconet-drf-status-and-training.md b/docs/sconet-drf-status-and-training.md index 1c77a69..00f8ecb 100644 --- a/docs/sconet-drf-status-and-training.md +++ b/docs/sconet-drf-status-and-training.md @@ -3,6 +3,7 @@ This note records the current Scoliosis1K implementation status in this repo and the main conclusions from the recent reproduction/debugging work. For a stricter paper-vs-local reproducibility breakdown, see [scoliosis_reproducibility_audit.md](scoliosis_reproducibility_audit.md). +For the recommended long-running local launch workflow, see [systemd-run-training.md](systemd-run-training.md). 
## Current status @@ -79,6 +80,9 @@ The current working conclusion is: - the main remaining suspect is the skeleton-map representation and preprocessing path - for practical model development, `1:1:2` is currently the better working split than `1:1:8` - for practical model development, the current best skeleton recipe is still `body-only + plain CE + SGD` on `1:1:2` +- the first practical DRF bridge on that same winning `1:1:2` recipe did not improve on the plain skeleton baseline: + - best retained DRF checkpoint (`2000`) on the full test set: `80.21 Acc / 58.92 Prec / 59.23 Rec / 57.84 F1` + - current best plain skeleton checkpoint (`7000`) on the full test set: `83.16 Acc / 68.24 Prec / 80.02 Rec / 68.47 F1` For readability in this repo's docs, `ScoNet-MT-ske` refers to the skeleton-map variant that the DRF paper writes as `ScoNet-MT^{ske}`. diff --git a/docs/systemd-run-training.md b/docs/systemd-run-training.md new file mode 100644 index 0000000..faeb3bd --- /dev/null +++ b/docs/systemd-run-training.md @@ -0,0 +1,146 @@ +# Stable Long-Running Training with `systemd-run --user` + +This note documents the recommended way to launch long OpenGait jobs on a local workstation. + +## Why use `systemd-run --user` + +For long training runs, `systemd-run --user` is more reliable than shell background tricks like: + +- `nohup ... &` +- `disown` +- one-shot detached shell wrappers + +Why: + +- the training process is supervised by the user systemd instance instead of a transient shell process +- stdout/stderr can be sent to a real log file and the systemd journal +- you can query status with `systemctl --user` +- you can stop the job cleanly with `systemctl --user stop ...` +- the job is no longer tied to the lifetime of a tool process tree + +In this repo, detached shell launches were observed to die unexpectedly even when the training code itself was healthy. `systemd-run --user` avoids that failure mode. 
+
+## Prerequisites
+
+Check that user services are available:
+
+```bash
+systemd-run --user --version
+```
+
+If you want jobs to survive logout, enable linger:
+
+```bash
+loginctl enable-linger "$USER"
+```
+
+This is optional if you only need the job to survive shell/session teardown while you stay logged in.
+
+## Recommended launcher
+
+Use the helper script:
+
+- [scripts/systemd_run_opengait.py](../scripts/systemd_run_opengait.py)
+
+It:
+
+- uses `torchrun` (`python -m torch.distributed.run`), not deprecated `torch.distributed.launch`
+- accepts GPU UUIDs instead of ordinal indices
+- launches a transient user service with `systemd-run --user`
+- writes stdout/stderr to a real log file
+- provides `status`, `logs`, and `stop` helpers
+
+## Launch examples
+
+Single-GPU train:
+
+```bash
+uv run python scripts/systemd_run_opengait.py launch \
+  --cfgs configs/sconet/sconet_scoliosis1k_skeleton_112_sigma15_joint8_bodyonly_plaince_bridge_1gpu_10k.yaml \
+  --phase train \
+  --gpu-uuids GPU-9cc7b26e-90d4-0c49-4d4c-060e528ffba6
+```
+
+Single-GPU eval:
+
+```bash
+uv run python scripts/systemd_run_opengait.py launch \
+  --cfgs configs/sconet/sconet_scoliosis1k_local_eval_1gpu.yaml \
+  --phase test \
+  --gpu-uuids GPU-9cc7b26e-90d4-0c49-4d4c-060e528ffba6
+```
+
+Two-GPU train:
+
+```bash
+uv run python scripts/systemd_run_opengait.py launch \
+  --cfgs configs/baseline/baseline.yaml \
+  --phase train \
+  --gpu-uuids GPU-9cc7b26e-90d4-0c49-4d4c-060e528ffba6,GPU-1155e14e-6097-5942-7feb-20453868b202
+```
+
+Dry run:
+
+```bash
+uv run python scripts/systemd_run_opengait.py launch \
+  --cfgs configs/baseline/baseline.yaml \
+  --phase train \
+  --gpu-uuids GPU-9cc7b26e-90d4-0c49-4d4c-060e528ffba6 \
+  --dry-run
+```
+
+## Monitoring and control
+
+Show service status:
+
+```bash
+uv run python scripts/systemd_run_opengait.py status opengait-baseline-train
+```
+
+Show recent journal lines:
+
+```bash
+uv run python scripts/systemd_run_opengait.py logs opengait-baseline-train -n 200
+```
+
+Follow the journal directly:
+
+```bash
+journalctl --user -u opengait-baseline-train -f
+```
+
+Stop the run:
+
+```bash
+uv run python scripts/systemd_run_opengait.py stop opengait-baseline-train
+```
+
+## Logging behavior
+
+The launcher configures both:
+
+- a file log under `/tmp` by default
+- the systemd journal for the transient unit
+
+This makes it easier to recover logs even if the original shell or tool session disappears.
+
+## GPU selection
+
+Prefer GPU UUIDs, not ordinal indices.
+
+Reason:
+
+- local `CUDA_VISIBLE_DEVICES` ordinal mapping can be unstable or surprising
+- UUIDs make the intended device explicit
+
+Get UUIDs with:
+
+```bash
+nvidia-smi -L
+```
+
+## Notes
+
+- The helper uses `torchrun` through `python -m torch.distributed.run`.
+- `--nproc_per_node` is inferred from the number of UUIDs passed to `--gpu-uuids`.
+- OpenGait evaluator constraints still apply: test batch/world-size settings must match the visible GPU count.
diff --git a/scripts/systemd_run_opengait.py b/scripts/systemd_run_opengait.py new file mode 100644 index 0000000..846e216 --- /dev/null +++ b/scripts/systemd_run_opengait.py @@ -0,0 +1,194 @@ +from __future__ import annotations + +import re +import subprocess +from collections.abc import Sequence +from pathlib import Path + +import click + + +REPO_ROOT = Path(__file__).resolve().parents[1] +DEFAULT_LOG_DIR = Path("/tmp") + + +def _sanitize_unit_name(raw: str) -> str: + sanitized = re.sub(r"[^A-Za-z0-9_.@-]+", "-", raw).strip("-") + if not sanitized: + raise click.ClickException("Unit name cannot be empty after sanitization.") + return sanitized + + +def _split_gpu_uuids(value: str) -> list[str]: + uuids = [part.strip() for part in value.split(",") if part.strip()] + if not uuids: + raise click.ClickException("At least one GPU UUID is required.") + return uuids + + +def _run_command( + args: Sequence[str], + *, + cwd: Path | None = None, + check: bool = True, +) -> subprocess.CompletedProcess[str]: + return subprocess.run( + list(args), + cwd=str(cwd) if cwd is not None else None, + text=True, + capture_output=True, + check=check, + ) + + +def _default_unit_name(cfgs: Path, phase: str) -> str: + stem = cfgs.stem + return _sanitize_unit_name(f"opengait-{stem}-{phase}") + + +@click.group() +def cli() -> None: + """Launch and manage OpenGait runs under systemd user services.""" + + +@cli.command("launch") +@click.option("--cfgs", type=click.Path(path_type=Path, exists=True, dir_okay=False), required=True) +@click.option("--phase", type=click.Choice(["train", "test"]), required=True) +@click.option( + "--gpu-uuids", + required=True, + help="Comma-separated GPU UUID list for CUDA_VISIBLE_DEVICES.", +) +@click.option("--unit", type=str, default=None, help="systemd unit name. Defaults to a name derived from cfgs + phase.") +@click.option( + "--log-path", + type=click.Path(path_type=Path, dir_okay=False), + default=None, + help="Optional file to append stdout/stderr to. 
Defaults to /tmp/.log", +) +@click.option( + "--workdir", + type=click.Path(path_type=Path, file_okay=False), + default=REPO_ROOT, + show_default=True, +) +@click.option("--description", type=str, default=None, help="Optional systemd unit description.") +@click.option("--dry-run", is_flag=True, help="Print the resolved systemd-run command without launching it.") +def launch( + cfgs: Path, + phase: str, + gpu_uuids: str, + unit: str | None, + log_path: Path | None, + workdir: Path, + description: str | None, + dry_run: bool, +) -> None: + """Launch an OpenGait run via systemd-run --user using torchrun.""" + resolved_cfgs = cfgs if cfgs.is_absolute() else (workdir / cfgs).resolve() + if not resolved_cfgs.exists(): + raise click.ClickException(f"Config not found: {resolved_cfgs}") + + unit_name = _sanitize_unit_name(unit) if unit is not None else _default_unit_name(resolved_cfgs, phase) + resolved_log_path = (log_path if log_path is not None else DEFAULT_LOG_DIR / f"{unit_name}.log").resolve() + resolved_log_path.parent.mkdir(parents=True, exist_ok=True) + + gpu_uuid_list = _split_gpu_uuids(gpu_uuids) + nproc = len(gpu_uuid_list) + + command = [ + "systemd-run", + "--user", + "--unit", + unit_name, + "--collect", + "--same-dir", + "--property", + "KillMode=mixed", + "--property", + f"StandardOutput=append:{resolved_log_path}", + "--property", + f"StandardError=append:{resolved_log_path}", + "--setenv", + f"CUDA_VISIBLE_DEVICES={','.join(gpu_uuid_list)}", + ] + if description: + command.extend(["--description", description]) + + command.extend( + [ + "uv", + "run", + "python", + "-m", + "torch.distributed.run", + "--nproc_per_node", + str(nproc), + "opengait/main.py", + "--cfgs", + str(resolved_cfgs), + "--phase", + phase, + ] + ) + + if dry_run: + click.echo(" ".join(command)) + return + + result = _run_command(command, cwd=workdir, check=False) + if result.returncode != 0: + raise click.ClickException( + f"systemd-run launch 
failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}" + ) + + click.echo(f"unit={unit_name}") + click.echo(f"log={resolved_log_path}") + click.echo("journal: journalctl --user -u " + unit_name + " -f") + if result.stdout.strip(): + click.echo(result.stdout.strip()) + + +@cli.command("status") +@click.argument("unit") +def status(unit: str) -> None: + """Show systemd user-unit status.""" + result = _run_command(["systemctl", "--user", "status", unit], check=False) + click.echo(result.stdout, nl=False) + if result.stderr: + click.echo(result.stderr, err=True, nl=False) + if result.returncode != 0: + raise SystemExit(result.returncode) + + +@cli.command("logs") +@click.argument("unit") +@click.option("-n", "--lines", type=int, default=200, show_default=True) +def logs(unit: str, lines: int) -> None: + """Show recent journal lines for a unit.""" + result = _run_command( + ["journalctl", "--user", "-u", unit, "-n", str(lines), "--no-pager"], + check=False, + ) + click.echo(result.stdout, nl=False) + if result.stderr: + click.echo(result.stderr, err=True, nl=False) + if result.returncode != 0: + raise SystemExit(result.returncode) + + +@cli.command("stop") +@click.argument("unit") +def stop(unit: str) -> None: + """Stop a systemd user unit.""" + result = _run_command(["systemctl", "--user", "stop", unit], check=False) + if result.stdout: + click.echo(result.stdout, nl=False) + if result.stderr: + click.echo(result.stderr, err=True, nl=False) + if result.returncode != 0: + raise SystemExit(result.returncode) + + +if __name__ == "__main__": + cli()