From cc551a6922a72899e1017deccb70a0cc4b6034ea Mon Sep 17 00:00:00 2001
From: "anthony.wen" <awen01@gmail.com>
Date: Fri, 27 Mar 2026 11:00:11 -0400
Subject: [PATCH] Recover watcher results when run log is missing

---
 atvm/docs/automation/run-learnings.md    | 10 ++++
 atvm/watcher-service/atvm_run_watcher.py | 66 +++++++++++++++++++++++-
 2 files changed, 74 insertions(+), 2 deletions(-)
diff --git a/atvm/docs/automation/run-learnings.md b/atvm/docs/automation/run-learnings.md
index ee3cf5e..1e781eb 100644
--- a/atvm/docs/automation/run-learnings.md
+++ b/atvm/docs/automation/run-learnings.md
@@ -230,3 +230,13 @@ This file stores run-specific examples only when a run produced a new learning r
   - When the watcher is approved, start the watcher before `run-sorry-cypress.py`.
   - Keep the order as: template generation, verification, watcher start, runner start.
   - Do not launch the runner first when the watcher is part of the approved command set.
+
+## Run Learning: 2026-03-27 (Watcher must recover when the consolidated run log is missing)
+- Observed failure mode:
+  - A non-categorized watcher run can finish without posting Mattermost even when the ATVM test itself passed.
+  - In this case the watcher service expected `/tmp/<build-name>.log`, but that consolidated run log was never written.
+  - The run still produced the final `check-xml-files.ts` XML and fresh per-host reporter artifacts under `cmcReporter/logs/<host>/`.
+- Action for future runs:
+  - Do not rely only on `/tmp/<build-name>.log` for non-categorized watcher result recovery.
+  - When final `check-xml-files.ts` validation is present but host XML is absent, recover host completion from the latest matching per-host reporter artifact within the run window.
+  - Keep non-categorized watcher notes accurate; do not describe that failure as a categorized sub-run issue.
diff --git a/atvm/watcher-service/atvm_run_watcher.py b/atvm/watcher-service/atvm_run_watcher.py
index e15bed5..f192b88 100644
--- a/atvm/watcher-service/atvm_run_watcher.py
+++ b/atvm/watcher-service/atvm_run_watcher.py
@@ -430,6 +430,53 @@ def collect_latest_host_result(
     return latest
 
 
+def collect_latest_host_reporter_artifact(
+    reporter_root: Path,
+    expected_hosts: List[str],
+    kernels: Dict[str, str],
+    run_started_at: datetime,
+    run_ended_at: Optional[datetime] = None,
+) -> Optional[Tuple[str, HostResult]]:
+    logs_dir = reporter_root / "logs"
+    if not logs_dir.exists():
+        return None
+
+    latest: Optional[Tuple[str, HostResult]] = None
+    for host_dir in sorted(logs_dir.iterdir()):
+        if not host_dir.is_dir():
+            continue
+        host = host_dir.name
+        if not host.startswith("atvm"):
+            continue
+        if expected_hosts and host not in expected_hosts:
+            continue
+
+        for artifact_path in sorted(host_dir.iterdir()):
+            if artifact_path.suffix not in {".txt", ".json"}:
+                continue
+            artifact_mtime = datetime.fromtimestamp(artifact_path.stat().st_mtime, tz=timezone.utc)
+            if artifact_mtime < run_started_at:
+                continue
+            if run_ended_at and artifact_mtime >= run_ended_at:
+                continue
+
+            result = HostResult(
+                host=host,
+                kernel=kernels.get(host, "unknown"),
+                status="PASS",
+                detail="completed",
+                timestamp=artifact_mtime,
+            )
+            candidate = (host, result)
+            if latest is None:
+                latest = candidate
+                continue
+            latest_ts = latest[1].timestamp or datetime.fromtimestamp(0, tz=timezone.utc)
+            if artifact_mtime >= latest_ts:
+                latest = candidate
+    return latest
+
+
 def find_check_xml_end(
     reporter_root: Path,
     started_at: datetime,
@@ -853,14 +900,29 @@ def evaluate_subrun(
             )
         return "RUNNING", host_results, start_ts, end_ts, subrun.currents_url, notes
 
+    if check_end and not host_results:
+        latest_host = collect_latest_host_reporter_artifact(
+            reporter_root=reporter_root,
+            expected_hosts=subrun.expected_hosts,
+            kernels=inventory,
+            run_started_at=subrun.started_at,
+            run_ended_at=check_end + timedelta(seconds=5),
+        )
+        if latest_host:
+            host, result = latest_host
+            host_results = {host: result}
+
     if host_results:
-        notes.append("Categorized sub-run completed after the parent runner exited.")
+        notes.append("Run completed after the parent runner exited.")
         if check_end:
             notes.append("Final `check-xml-files.ts` validation passed.")
+            latest_artifact_note = "Host result details were derived from the latest matching host reporter artifact written before final validation."
+            if latest_artifact_note not in notes and all(result.tests == 0 for result in host_results.values()):
+                notes.append(latest_artifact_note)
         state = "FAILED" if any(result.failures for result in host_results.values()) else "COMPLETED"
         return state, host_results, start_ts, end_ts, subrun.currents_url, notes
 
-    notes.append("Parent run exited before this categorized sub-run produced host results.")
+    notes.append("Run process exited before host results were detected.")
     return "TERMINATED", host_results, start_ts, end_ts, subrun.currents_url, notes