From cc551a6922a72899e1017deccb70a0cc4b6034ea Mon Sep 17 00:00:00 2001
From: "anthony.wen"
Date: Fri, 27 Mar 2026 11:00:11 -0400
Subject: [PATCH] Recover watcher results when run log is missing

---
 atvm/docs/automation/run-learnings.md    | 10 ++++
 atvm/watcher-service/atvm_run_watcher.py | 66 +++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/atvm/docs/automation/run-learnings.md b/atvm/docs/automation/run-learnings.md
index ee3cf5e..1e781eb 100644
--- a/atvm/docs/automation/run-learnings.md
+++ b/atvm/docs/automation/run-learnings.md
@@ -230,3 +230,13 @@ This file stores run-specific examples only when a run produced a new learning r
 - When the watcher is approved, start the watcher before `run-sorry-cypress.py`.
 - Keep the order as: template generation, verification, watcher start, runner start.
 - Do not launch the runner first when the watcher is part of the approved command set.
+
+## Run Learning: 2026-03-27 (Watcher must recover when the consolidated run log is missing)
+- Observed failure mode:
+  - A non-categorized watcher run can finish without posting Mattermost even when the ATVM test itself passed.
+  - In this case the watcher service expected `/tmp/<run_id>.log`, but that consolidated run log was never written.
+  - The run still produced the final `check-xml-files.ts` XML and fresh per-host reporter artifacts under `cmcReporter/logs/<host>/`.
+- Action for future runs:
+  - Do not rely only on `/tmp/<run_id>.log` for non-categorized watcher result recovery.
+  - When final `check-xml-files.ts` validation is present but host XML is absent, recover host completion from the latest matching per-host reporter artifact within the run window.
+  - Keep non-categorized watcher notes accurate; do not describe that failure as a categorized sub-run issue.
diff --git a/atvm/watcher-service/atvm_run_watcher.py b/atvm/watcher-service/atvm_run_watcher.py
index e15bed5..f192b88 100644
--- a/atvm/watcher-service/atvm_run_watcher.py
+++ b/atvm/watcher-service/atvm_run_watcher.py
@@ -430,6 +430,72 @@ def collect_latest_host_result(
     return latest


+def collect_latest_host_reporter_artifact(
+    reporter_root: Path,
+    expected_hosts: List[str],
+    kernels: Dict[str, str],
+    run_started_at: datetime,
+    run_ended_at: Optional[datetime] = None,
+) -> Optional[Tuple[str, HostResult]]:
+    """Recover a host result from the newest per-host reporter artifact.
+
+    Scans ``<reporter_root>/logs/<host>/`` for ``.txt``/``.json`` files whose
+    mtime falls in ``[run_started_at, run_ended_at)`` and picks the host that
+    owns the most recently modified one. Only ``atvm*`` host directories are
+    considered, narrowed to ``expected_hosts`` when that list is non-empty.
+
+    The synthesized ``HostResult`` is always ``PASS``/``completed``: an
+    artifact only shows that the host reported, not how its tests went, so
+    callers should use this only after final validation has succeeded.
+
+    Returns ``(host, result)``, or ``None`` when nothing matches.
+    """
+    logs_dir = reporter_root / "logs"
+    if not logs_dir.is_dir():
+        return None
+
+    latest_host: Optional[str] = None
+    latest_mtime: Optional[datetime] = None
+    for host_dir in sorted(logs_dir.iterdir()):
+        if not host_dir.is_dir():
+            continue
+        host = host_dir.name
+        if not host.startswith("atvm"):
+            continue
+        if expected_hosts and host not in expected_hosts:
+            continue
+
+        for artifact_path in sorted(host_dir.iterdir()):
+            if artifact_path.suffix not in {".txt", ".json"}:
+                continue
+            if not artifact_path.is_file():
+                # A directory that merely ends in .txt/.json is not an artifact.
+                continue
+            try:
+                mtime_epoch = artifact_path.stat().st_mtime
+            except OSError:
+                # Removed or rotated while the scan was in progress; skip it.
+                continue
+            artifact_mtime = datetime.fromtimestamp(mtime_epoch, tz=timezone.utc)
+            if artifact_mtime < run_started_at:
+                continue
+            if run_ended_at and artifact_mtime >= run_ended_at:
+                continue
+            # ">=" lets the later entry in sorted order win an mtime tie.
+            if latest_mtime is None or artifact_mtime >= latest_mtime:
+                latest_host, latest_mtime = host, artifact_mtime
+
+    if latest_host is None:
+        return None
+    return latest_host, HostResult(
+        host=latest_host,
+        kernel=kernels.get(latest_host, "unknown"),
+        status="PASS",
+        detail="completed",
+        timestamp=latest_mtime,
+    )
+
+
 def find_check_xml_end(
     reporter_root: Path,
     started_at: datetime,
@@ -853,12 +919,27 @@ def evaluate_subrun(
         )
         return "RUNNING", host_results, start_ts, end_ts, subrun.currents_url, notes

+    if check_end and not host_results:
+        latest_host = collect_latest_host_reporter_artifact(
+            reporter_root=reporter_root,
+            expected_hosts=subrun.expected_hosts,
+            kernels=inventory,
+            run_started_at=subrun.started_at,
+            run_ended_at=check_end + timedelta(seconds=5),
+        )
+        if latest_host:
+            host, result = latest_host
+            host_results = {host: result}
+
     if host_results:
-        notes.append("Categorized sub-run completed after the parent runner exited.")
+        notes.append("Run completed after the parent runner exited.")
         if check_end:
             notes.append("Final `check-xml-files.ts` validation passed.")
+            latest_artifact_note = "Host result details were derived from the latest matching host reporter artifact written before final validation."
+            if latest_artifact_note not in notes and all(result.tests == 0 for result in host_results.values()):
+                notes.append(latest_artifact_note)
         state = "FAILED" if any(result.failures for result in host_results.values()) else "COMPLETED"
         return state, host_results, start_ts, end_ts, subrun.currents_url, notes

-    notes.append("Parent run exited before this categorized sub-run produced host results.")
+    notes.append("Run process exited before host results were detected.")
     return "TERMINATED", host_results, start_ts, end_ts, subrun.currents_url, notes