Refine DRF preprocessing and body-prior pipeline
This commit is contained in:
@@ -0,0 +1,145 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Final
|
||||
|
||||
|
||||
ERROR_PATTERNS: Final[tuple[str, ...]] = (
|
||||
"traceback",
|
||||
"runtimeerror",
|
||||
"error:",
|
||||
"exception",
|
||||
"failed",
|
||||
"segmentation fault",
|
||||
"killed",
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class JobSpec:
|
||||
name: str
|
||||
pid: int
|
||||
log_path: Path
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Monitor long-running DRF preprocess/train jobs.")
|
||||
parser.add_argument("--preprocess-pid", type=int, required=True)
|
||||
parser.add_argument("--preprocess-log", type=Path, required=True)
|
||||
parser.add_argument("--launcher-pid", type=int, required=True)
|
||||
parser.add_argument("--launcher-log", type=Path, required=True)
|
||||
parser.add_argument("--sentinel-path", type=Path, required=True)
|
||||
parser.add_argument("--status-log", type=Path, required=True)
|
||||
parser.add_argument("--poll-seconds", type=float, default=30.0)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def pid_alive(pid: int) -> bool:
|
||||
return Path(f"/proc/{pid}").exists()
|
||||
|
||||
|
||||
def read_tail(path: Path, limit: int = 8192) -> str:
|
||||
if not path.exists():
|
||||
return ""
|
||||
with path.open("rb") as handle:
|
||||
handle.seek(0, os.SEEK_END)
|
||||
size = handle.tell()
|
||||
handle.seek(max(size - limit, 0), os.SEEK_SET)
|
||||
data = handle.read()
|
||||
return data.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def detect_error(log_text: str) -> str | None:
|
||||
lowered = log_text.lower()
|
||||
for pattern in ERROR_PATTERNS:
|
||||
if pattern in lowered:
|
||||
return pattern
|
||||
return None
|
||||
|
||||
|
||||
def append_status(path: Path, line: str) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with path.open("a", encoding="utf-8") as handle:
|
||||
handle.write(f"{datetime.now().isoformat(timespec='seconds')} {line}\n")
|
||||
|
||||
|
||||
def monitor_job(job: JobSpec, status_log: Path) -> str | None:
|
||||
tail = read_tail(job.log_path)
|
||||
error = detect_error(tail)
|
||||
if error is not None:
|
||||
append_status(status_log, f"[alert] {job.name}: detected `{error}` in {job.log_path}")
|
||||
return f"{job.name} log shows `{error}`"
|
||||
return None
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
preprocess = JobSpec("preprocess", args.preprocess_pid, args.preprocess_log)
|
||||
launcher = JobSpec("launcher", args.launcher_pid, args.launcher_log)
|
||||
|
||||
append_status(
|
||||
args.status_log,
|
||||
(
|
||||
"[start] monitoring "
|
||||
f"preprocess_pid={preprocess.pid} launcher_pid={launcher.pid} "
|
||||
f"sentinel={args.sentinel_path}"
|
||||
),
|
||||
)
|
||||
|
||||
preprocess_seen_alive = False
|
||||
launcher_seen_alive = False
|
||||
|
||||
while True:
|
||||
preprocess_alive = pid_alive(preprocess.pid)
|
||||
launcher_alive = pid_alive(launcher.pid)
|
||||
preprocess_seen_alive = preprocess_seen_alive or preprocess_alive
|
||||
launcher_seen_alive = launcher_seen_alive or launcher_alive
|
||||
|
||||
preprocess_error = monitor_job(preprocess, args.status_log)
|
||||
if preprocess_error is not None:
|
||||
print(preprocess_error, file=sys.stderr)
|
||||
return 1
|
||||
|
||||
launcher_error = monitor_job(launcher, args.status_log)
|
||||
if launcher_error is not None:
|
||||
print(launcher_error, file=sys.stderr)
|
||||
return 1
|
||||
|
||||
sentinel_ready = args.sentinel_path.exists()
|
||||
append_status(
|
||||
args.status_log,
|
||||
(
|
||||
"[ok] "
|
||||
f"preprocess_alive={preprocess_alive} "
|
||||
f"launcher_alive={launcher_alive} "
|
||||
f"sentinel_ready={sentinel_ready}"
|
||||
),
|
||||
)
|
||||
|
||||
if preprocess_seen_alive and not preprocess_alive and not sentinel_ready:
|
||||
append_status(args.status_log, "[alert] preprocess exited before sentinel was written")
|
||||
print("preprocess exited before sentinel was written", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
launcher_tail = read_tail(launcher.log_path)
|
||||
train_started = "[start]" in launcher_tail
|
||||
|
||||
if launcher_seen_alive and not launcher_alive:
|
||||
if not train_started and not sentinel_ready:
|
||||
append_status(args.status_log, "[alert] launcher exited before training started")
|
||||
print("launcher exited before training started", file=sys.stderr)
|
||||
return 1
|
||||
append_status(args.status_log, "[done] launcher exited; monitoring complete")
|
||||
return 0
|
||||
|
||||
time.sleep(args.poll_seconds)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
Reference in New Issue
Block a user