# File: OpenGait/scripts/monitor_drf_jobs.py
# (146 lines, 4.5 KiB, Python)
from __future__ import annotations
import argparse
import os
import sys
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Final
# Case-insensitive substrings that mark a job as failed when seen in a log
# tail. detect_error() lowercases the log text and returns the FIRST pattern
# that matches, so the order here decides which pattern gets reported.
ERROR_PATTERNS: Final[tuple[str, ...]] = (
    "traceback",
    "runtimeerror",
    "error:",
    "exception",
    "failed",
    "segmentation fault",
    "killed",
)
@dataclass(frozen=True)
class JobSpec:
    """Immutable description of one monitored background job."""

    # Human-readable label used in status/alert lines (e.g. "preprocess").
    name: str
    # OS process id polled for liveness.
    pid: int
    # Log file scanned for ERROR_PATTERNS and progress markers.
    log_path: Path
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the DRF job monitor.

    Returns:
        argparse.Namespace holding both job PIDs, both job log paths, the
        sentinel path, the monitor's own status-log path, and the polling
        interval in seconds.

    Raises:
        SystemExit: via argparse when required options are missing or when
            ``--poll-seconds`` is not strictly positive.
    """
    parser = argparse.ArgumentParser(description="Monitor long-running DRF preprocess/train jobs.")
    parser.add_argument("--preprocess-pid", type=int, required=True,
                        help="PID of the preprocess job to watch.")
    parser.add_argument("--preprocess-log", type=Path, required=True,
                        help="Log file written by the preprocess job.")
    parser.add_argument("--launcher-pid", type=int, required=True,
                        help="PID of the training launcher to watch.")
    parser.add_argument("--launcher-log", type=Path, required=True,
                        help="Log file written by the launcher.")
    parser.add_argument("--sentinel-path", type=Path, required=True,
                        help="File whose existence marks preprocess completion.")
    parser.add_argument("--status-log", type=Path, required=True,
                        help="File the monitor appends its own status lines to.")
    parser.add_argument("--poll-seconds", type=float, default=30.0,
                        help="Seconds to sleep between polls (default: 30).")
    args = parser.parse_args()
    # A zero/negative interval would make the main loop spin hot (or fail in
    # time.sleep); reject it up front with a normal argparse usage error.
    if args.poll_seconds <= 0:
        parser.error("--poll-seconds must be positive")
    return args
def pid_alive(pid: int) -> bool:
    """Return True if a process with *pid* currently exists.

    Uses the portable ``os.kill(pid, 0)`` probe (signal 0 performs only the
    existence/permission check) instead of peeking at ``/proc``, which exists
    only on Linux. A PermissionError means the process exists but belongs to
    another user, so it still counts as alive.
    """
    if pid <= 0:
        # 0 / negatives address process groups for os.kill; never a real job.
        return False
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        return True
    return True
def read_tail(path: Path, limit: int = 8192) -> str:
    """Return up to the last *limit* bytes of *path*, decoded as UTF-8.

    A missing file yields "" instead of raising, so callers can poll a log
    before the job has created it. Undecodable bytes are replaced.
    """
    # EAFP: the original exists()-then-open() had a TOCTOU race with a log
    # file that is rotated or deleted between the two calls.
    try:
        with path.open("rb") as handle:
            handle.seek(0, os.SEEK_END)
            size = handle.tell()
            # Clamp so files smaller than *limit* are read from the start.
            handle.seek(max(size - limit, 0), os.SEEK_SET)
            data = handle.read()
    except FileNotFoundError:
        return ""
    return data.decode("utf-8", errors="replace")
def detect_error(log_text: str) -> str | None:
    """Return the first ERROR_PATTERNS entry found in *log_text*, else None.

    Matching is a case-insensitive substring search; patterns are tried in
    declaration order.
    """
    haystack = log_text.lower()
    return next((needle for needle in ERROR_PATTERNS if needle in haystack), None)
def append_status(path: Path, line: str) -> None:
    """Append *line* to *path*, prefixed with a second-resolution timestamp.

    Parent directories are created on demand; the file is opened in append
    mode so concurrent runs interleave rather than truncate.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().isoformat(timespec="seconds")
    with path.open("a", encoding="utf-8") as sink:
        sink.write(f"{stamp} {line}\n")
def monitor_job(job: JobSpec, status_log: Path) -> str | None:
    """Scan the job's log tail for error patterns.

    On a match, record an alert line in *status_log* and return a short
    human-readable failure summary; otherwise return None.
    """
    found = detect_error(read_tail(job.log_path))
    if found is None:
        return None
    append_status(status_log, f"[alert] {job.name}: detected `{found}` in {job.log_path}")
    return f"{job.name} log shows `{found}`"
def main() -> int:
    """Poll both jobs until a failure is detected or the launcher exits.

    Returns 0 when the launcher exits after training started (or after the
    sentinel appeared); returns 1 on any detected failure: an error pattern
    in either log, preprocess dying before writing its sentinel, or the
    launcher dying before training started.
    """
    args = parse_args()
    preprocess = JobSpec("preprocess", args.preprocess_pid, args.preprocess_log)
    launcher = JobSpec("launcher", args.launcher_pid, args.launcher_log)
    append_status(
        args.status_log,
        (
            "[start] monitoring "
            f"preprocess_pid={preprocess.pid} launcher_pid={launcher.pid} "
            f"sentinel={args.sentinel_path}"
        ),
    )
    # Track whether each PID was ever observed alive, so "never started yet"
    # on the first polls is not mistaken for "already exited".
    preprocess_seen_alive = False
    launcher_seen_alive = False
    while True:
        preprocess_alive = pid_alive(preprocess.pid)
        launcher_alive = pid_alive(launcher.pid)
        preprocess_seen_alive = preprocess_seen_alive or preprocess_alive
        launcher_seen_alive = launcher_seen_alive or launcher_alive
        # Error patterns in either log abort monitoring immediately.
        preprocess_error = monitor_job(preprocess, args.status_log)
        if preprocess_error is not None:
            print(preprocess_error, file=sys.stderr)
            return 1
        launcher_error = monitor_job(launcher, args.status_log)
        if launcher_error is not None:
            print(launcher_error, file=sys.stderr)
            return 1
        # The sentinel file marks successful preprocess completion.
        sentinel_ready = args.sentinel_path.exists()
        append_status(
            args.status_log,
            (
                "[ok] "
                f"preprocess_alive={preprocess_alive} "
                f"launcher_alive={launcher_alive} "
                f"sentinel_ready={sentinel_ready}"
            ),
        )
        # Preprocess was seen running but died without its sentinel: failure.
        if preprocess_seen_alive and not preprocess_alive and not sentinel_ready:
            append_status(args.status_log, "[alert] preprocess exited before sentinel was written")
            print("preprocess exited before sentinel was written", file=sys.stderr)
            return 1
        # NOTE(review): a literal "[start]" in the launcher log is taken to
        # mean training has begun — confirm the launcher emits that marker.
        launcher_tail = read_tail(launcher.log_path)
        train_started = "[start]" in launcher_tail
        if launcher_seen_alive and not launcher_alive:
            if not train_started and not sentinel_ready:
                append_status(args.status_log, "[alert] launcher exited before training started")
                print("launcher exited before training started", file=sys.stderr)
                return 1
            # Launcher finished after training started: normal completion.
            append_status(args.status_log, "[done] launcher exited; monitoring complete")
            return 0
        time.sleep(args.poll_seconds)
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status.
    # sys.exit raises SystemExit, identical to the previous form.
    sys.exit(main())