Recover watcher results when run log is missing
This commit is contained in:
@@ -230,3 +230,13 @@ This file stores run-specific examples only when a run produced a new learning r
|
||||
- When the watcher is approved, start the watcher before `run-sorry-cypress.py`.
|
||||
- Keep the order as: template generation, verification, watcher start, runner start.
|
||||
- Do not launch the runner first when the watcher is part of the approved command set.
|
||||
|
||||
## Run Learning: 2026-03-27 (Watcher must recover when the consolidated run log is missing)
|
||||
- Observed failure mode:
|
||||
- A non-categorized watcher run can finish without posting to Mattermost even when the ATVM test itself passed.
|
||||
- In this case the watcher service expected `/tmp/<build-name>.log`, but that consolidated run log was never written.
|
||||
- The run still produced the final `check-xml-files.ts` XML and fresh per-host reporter artifacts under `cmcReporter/logs/<host>/`.
|
||||
- Action for future runs:
|
||||
- Do not rely only on `/tmp/<build-name>.log` for non-categorized watcher result recovery.
|
||||
- When final `check-xml-files.ts` validation is present but host XML is absent, recover host completion from the latest matching per-host reporter artifact within the run window.
|
||||
- Keep non-categorized watcher notes accurate; do not describe that failure as a categorized sub-run issue.
|
||||
|
||||
@@ -430,6 +430,53 @@ def collect_latest_host_result(
|
||||
return latest
|
||||
|
||||
|
||||
def collect_latest_host_reporter_artifact(
    reporter_root: Path,
    expected_hosts: List[str],
    kernels: Dict[str, str],
    run_started_at: datetime,
    run_ended_at: Optional[datetime] = None,
) -> Optional[Tuple[str, "HostResult"]]:
    """Recover a host result from the newest per-host reporter artifact.

    Scans ``<reporter_root>/logs/<host>/`` for ``.txt`` / ``.json`` files whose
    modification time falls inside ``[run_started_at, run_ended_at)`` and
    returns the host that owns the most recently modified one.

    Args:
        reporter_root: Directory that contains the ``logs`` sub-directory.
        expected_hosts: When non-empty, only these host directory names are
            considered; when empty, every ``atvm*`` directory is eligible.
        kernels: Host name to kernel string; missing hosts map to
            ``"unknown"``.
        run_started_at: Inclusive lower bound for the artifact mtime. Must be
            timezone-aware, because it is compared against UTC-aware mtimes.
        run_ended_at: Optional exclusive upper bound for the artifact mtime.

    Returns:
        ``(host, HostResult)`` for the newest matching artifact, or ``None``
        when the logs directory is missing or nothing matches. The result is
        always reported as ``PASS`` / ``completed``: artifact contents are not
        inspected, only their presence and mtime.
    """
    logs_dir = reporter_root / "logs"
    # is_dir() rather than exists(): a stray file named "logs" must not reach
    # iterdir() and raise NotADirectoryError.
    if not logs_dir.is_dir():
        return None

    newest_host: Optional[str] = None
    newest_mtime: Optional[datetime] = None

    for host_dir in sorted(logs_dir.iterdir()):
        if not host_dir.is_dir():
            continue
        host = host_dir.name
        if not host.startswith("atvm"):
            continue
        if expected_hosts and host not in expected_hosts:
            continue

        for artifact_path in sorted(host_dir.iterdir()):
            if artifact_path.suffix not in {".txt", ".json"}:
                continue
            try:
                mtime_epoch = artifact_path.stat().st_mtime
            except OSError:
                # The reporter may rotate or remove files while we scan; a
                # vanished artifact should not abort the whole recovery.
                continue
            artifact_mtime = datetime.fromtimestamp(mtime_epoch, tz=timezone.utc)
            if artifact_mtime < run_started_at:
                continue
            if run_ended_at and artifact_mtime >= run_ended_at:
                continue
            # ">=" keeps the original tie-break: on equal mtimes the entry
            # visited later in sorted order wins.
            if newest_mtime is None or artifact_mtime >= newest_mtime:
                newest_host = host
                newest_mtime = artifact_mtime

    if newest_host is None or newest_mtime is None:
        return None

    # Build the result once for the winner instead of once per candidate.
    result = HostResult(
        host=newest_host,
        kernel=kernels.get(newest_host, "unknown"),
        status="PASS",
        detail="completed",
        timestamp=newest_mtime,
    )
    return newest_host, result
|
||||
|
||||
|
||||
def find_check_xml_end(
|
||||
reporter_root: Path,
|
||||
started_at: datetime,
|
||||
@@ -853,14 +900,29 @@ def evaluate_subrun(
|
||||
)
|
||||
return "RUNNING", host_results, start_ts, end_ts, subrun.currents_url, notes
|
||||
|
||||
if check_end and not host_results:
|
||||
latest_host = collect_latest_host_reporter_artifact(
|
||||
reporter_root=reporter_root,
|
||||
expected_hosts=subrun.expected_hosts,
|
||||
kernels=inventory,
|
||||
run_started_at=subrun.started_at,
|
||||
run_ended_at=check_end + timedelta(seconds=5),
|
||||
)
|
||||
if latest_host:
|
||||
host, result = latest_host
|
||||
host_results = {host: result}
|
||||
|
||||
if host_results:
|
||||
notes.append("Categorized sub-run completed after the parent runner exited.")
|
||||
notes.append("Run completed after the parent runner exited.")
|
||||
if check_end:
|
||||
notes.append("Final `check-xml-files.ts` validation passed.")
|
||||
latest_artifact_note = "Host result details were derived from the latest matching host reporter artifact written before final validation."
|
||||
if latest_artifact_note not in notes and all(result.tests == 0 for result in host_results.values()):
|
||||
notes.append(latest_artifact_note)
|
||||
state = "FAILED" if any(result.failures for result in host_results.values()) else "COMPLETED"
|
||||
return state, host_results, start_ts, end_ts, subrun.currents_url, notes
|
||||
|
||||
notes.append("Parent run exited before this categorized sub-run produced host results.")
|
||||
notes.append("Run process exited before host results were detected.")
|
||||
return "TERMINATED", host_results, start_ts, end_ts, subrun.currents_url, notes
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user