From 1405a2e8793bdad5944cb5ca3c081a213d6ac3b6 Mon Sep 17 00:00:00 2001 From: "anthony.wen" Date: Mon, 30 Mar 2026 14:02:32 -0400 Subject: [PATCH] Fix categorized ATVM watcher host result recovery --- atvm/docs/automation/guide.md | 8 +- atvm/docs/automation/run-learnings.md | 17 ++++ atvm/watcher-service/README.md | 9 +- atvm/watcher-service/atvm_run_watcher.py | 100 ++++++++++++++++++++--- 4 files changed, 117 insertions(+), 17 deletions(-) diff --git a/atvm/docs/automation/guide.md b/atvm/docs/automation/guide.md index afb80e5..9bcd9ac 100644 --- a/atvm/docs/automation/guide.md +++ b/atvm/docs/automation/guide.md @@ -73,6 +73,9 @@ Run ATVM CMC automation tests on the designated automation VM without unintended - per-run HTML reports - When a machine fails, use the matching `logs/` entry first to capture the detailed failure context for that host. - When reconstructing historical status, prefer `cmcReporter` artifacts over less-specific runner output because they preserve per-host results after the live run has ended. +- Do not treat the existence of a per-host reporter artifact by itself as proof that the host passed. +- For categorized grouped recovery, prefer the matching per-host reporter JSON or mochawesome result and carry through the real `failures`, `pending`, and failure message instead of assuming `PASS completed`. +- If grouped XML only contains `check-xml-files.ts`, cross-check the grouped result against the per-host reporter artifacts before posting or repeating status for that grouped sub-run. Typical sequence: 1. Build the exact `cmc-templates.py` and `run-sorry-cypress.py` commands for the request. @@ -81,8 +84,9 @@ Typical sequence: 4. Run `cmc-templates.py` with the approved options. 5. Wait for `cmc-templates.py` to fully finish and confirm success. 6. Verify the generated `.ts` files and the config `specPattern` include every requested VM before starting the runner. -7. If the watcher is approved, start the watcher before launching `run-sorry-cypress.py`. -8. Run `run-sorry-cypress.py` with the matching approved config and build name. +7. If the watcher is approved, make sure the controller's deployed watcher code is the intended version before relying on its posts. +8. If the watcher is approved, start the watcher before launching `run-sorry-cypress.py`. +9. Run `run-sorry-cypress.py` with the matching approved config and build name. ## Config File / Gold Disk Mapping - `cypress.atvm-config-gold.ts` -> Gold Disk 1 diff --git a/atvm/docs/automation/run-learnings.md b/atvm/docs/automation/run-learnings.md index 79f7817..97388c7 100644 --- a/atvm/docs/automation/run-learnings.md +++ b/atvm/docs/automation/run-learnings.md @@ -356,3 +356,20 @@ This file stores run-specific examples only when a run produced a new learning r - Keep the maintained `--exclude_partial_match` list for broad selectors such as `--containsVm` or `--randomize`. - When the operator uses `--specify_vms`, do not auto-add the blacklist unless they explicitly request it. - Even when the operator uses `--specify_vms`, first check whether any requested VM is on the maintained blacklist and stop instead of launching it if one is included. + +## Run Learning: 2026-03-30 (Controller watcher deployment must match the repo watcher before trusting live posts) +- Observed failure mode: + - The repo watcher had the corrected `cmc-reboot` flow, but the controller install at `/opt/atvm-watcher-service/atvm_run_watcher.py` still had the old generic 5-step fallback. + - A live categorized reboot subrun therefore posted the stale 5-step `TEST FLOW:` even though the repo copy had already been fixed. +- Action for future runs: + - Before trusting watcher-generated live posts for new watcher behavior, verify that the controller install matches the intended repo watcher version. + - If the controller install is stale and the operator approves it, deploy the updated watcher code to `/opt/atvm-watcher-service` and restart only the watcher instance for the active build. + +## Run Learning: 2026-03-30 (Categorized grouped recovery must parse real per-host reporter status, not assume pass) +- Observed failure mode: + - A categorized Red Hat reboot subrun posted both hosts as passed even though `atvm71-redhat9.1` actually failed during `1. Verifying set up`. + - The grouped XML only contained `check-xml-files.ts`, and the watcher incorrectly treated the presence of a per-host reporter artifact as `PASS completed`. +- Action for future runs: + - When grouped XML lacks explicit host testcase results, recover grouped host status from the per-host reporter JSON or equivalent detailed artifact. + - Carry through the real `failures`, `pending`, and failure message from that host artifact instead of assuming `PASS completed`. + - If a correction post is needed because stale or reconstructed state was wrong, mark it explicitly as a correction that supersedes the earlier result. diff --git a/atvm/watcher-service/README.md b/atvm/watcher-service/README.md index 41d81e1..2a5a873 100644 --- a/atvm/watcher-service/README.md +++ b/atvm/watcher-service/README.md @@ -62,9 +62,9 @@ Typical workflow: - mark the final state - exit 5. For categorized runs: - - detect each grouped sub-run in sequence from the parent run log - - wait for that grouped sub-run to finish - - send one Mattermost post for that grouped sub-run if it reached `COMPLETED` or `FAILED` + - detect each grouped sub-run in sequence from the parent run log + - wait for that grouped sub-run to finish + - send one Mattermost post for that grouped sub-run if it reached `COMPLETED` or `FAILED` - keep the watcher alive while the parent categorized runner or related child Cypress process is still active - do not treat one completed grouped sub-run as proof that the whole parent request is finished - continue to the next grouped sub-run @@ -129,5 +129,8 @@ This writes a cancellation marker, updates `state.json` to `CANCELLED`, and stop - Categorized execution is treated as sequential grouped ATVM sub-runs, not as one parent run with internal phases. - In categorized mode, the watcher writes per-subrun state under `subruns/` and posts each completed grouped run separately. - In categorized mode, if the child build id label does not match the host/spec actually being executed, the watcher reports the grouped run using the inferred host-based group name instead of trusting the raw child build id label. +- In categorized mode, grouped XML can finish with only `check-xml-files.ts`; when that happens, the watcher must recover per-host results from the matching host reporter artifacts. +- Do not infer `PASS completed` from host artifact presence alone. Parse the per-host reporter result and preserve real `FAIL` and `RUN/pending` state when reconstructing grouped results. +- When the repo copy of the watcher changes, the controller install under `/opt/atvm-watcher-service` must be updated before expecting the new reporting behavior from live runs. - Best-practice controller install path: `/opt/atvm-watcher-service`. - This package is local-only right now. Nothing here is installed on the controller yet. diff --git a/atvm/watcher-service/atvm_run_watcher.py b/atvm/watcher-service/atvm_run_watcher.py index 25929bf..6da4bed 100644 --- a/atvm/watcher-service/atvm_run_watcher.py +++ b/atvm/watcher-service/atvm_run_watcher.py @@ -607,6 +607,78 @@ def collect_latest_host_reporter_artifact( return latest +def parse_host_reporter_json(artifact_path: Path, host: str, kernels: Dict[str, str]) -> Optional[HostResult]: + try: + payload = json.loads(artifact_path.read_text(encoding="utf-8")) + except (json.JSONDecodeError, OSError): + return None + if not isinstance(payload, dict): + return None + + stats = payload.get("stats") + metadata = payload.get("metadata") + if not isinstance(stats, dict): + return None + + tests = int(stats.get("tests", 0) or 0) + failures = int(stats.get("failures", 0) or 0) + pending = int(stats.get("pending", 0) or 0) + duration_ms = stats.get("duration", 0) or 0 + timestamp = None + if isinstance(metadata, dict): + timestamp = parse_reporter_metadata_timestamp(metadata.get("timestamp")) + if timestamp is None: + timestamp = reporter_artifact_run_timestamp(artifact_path) + + detail_parts = [f"{tests} tests", f"{failures} failures"] + if pending: + detail_parts.append(f"{pending} pending") + + if failures: + status = "FAIL" + elif pending: + status = "RUN" + else: + status = "PASS" + + return HostResult( + host=host, + kernel=kernels.get(host, "unknown"), + status=status, + detail=", ".join(detail_parts), + tests=tests, + failures=failures, + duration_seconds=float(duration_ms) / 1000 if duration_ms else None, + timestamp=timestamp, + ) + + +def parse_host_reporter_artifact(artifact_path: Path, host: str, kernels: Dict[str, str]) -> Optional[HostResult]: + if artifact_path.suffix == ".json": + return parse_host_reporter_json(artifact_path, host, kernels) + + artifact_ts = reporter_artifact_run_timestamp(artifact_path) + if artifact_ts is None: + artifact_ts = datetime.fromtimestamp(artifact_path.stat().st_mtime, tz=timezone.utc) + + try: + text = artifact_path.read_text(encoding="utf-8", errors="replace") + except OSError: + text = "" + + failures = 1 if re.search(r"\bTimed out!\b|\bAssertionError\b|\bFAIL\b|\berror\b", text, re.I) else 0 + status = "FAIL" if failures else "PASS" + detail = "1 failures" if failures else "completed" + return HostResult( + host=host, + kernel=kernels.get(host, "unknown"), + status=status, + detail=detail, + failures=failures, + timestamp=artifact_ts, + ) + + def collect_group_host_reporter_artifacts( reporter_root: Path, group_label: Optional[str], @@ -632,7 +704,7 @@ def collect_group_host_reporter_artifacts( continue latest_artifact_mtime: Optional[datetime] = None - latest_run_ts: Optional[datetime] = None + latest_result: Optional[HostResult] = None for artifact_path in sorted(host_dir.iterdir()): if artifact_path.suffix not in {".txt", ".json"}: continue @@ -641,20 +713,24 @@ def collect_group_host_reporter_artifacts( continue if run_ended_at and artifact_mtime >= run_ended_at: continue - if latest_artifact_mtime is None or artifact_mtime >= latest_artifact_mtime: - latest_artifact_mtime = artifact_mtime - latest_run_ts = reporter_artifact_run_timestamp(artifact_path) or artifact_mtime - if latest_artifact_mtime is None: + parsed_result = parse_host_reporter_artifact(artifact_path, host, kernels) + if parsed_result is None: + continue + + # Prefer the newest artifact, and prefer JSON over TXT when timestamps tie. + if latest_artifact_mtime is None or artifact_mtime > latest_artifact_mtime or ( + artifact_mtime == latest_artifact_mtime and artifact_path.suffix == ".json" + ): + latest_artifact_mtime = artifact_mtime + latest_result = parsed_result + + if latest_artifact_mtime is None or latest_result is None: continue - results[host] = HostResult( - host=host, - kernel=kernels.get(host, "unknown"), - status="PASS", - detail="completed", - timestamp=latest_run_ts or latest_artifact_mtime, - ) + if latest_result.timestamp is None: + latest_result.timestamp = latest_artifact_mtime + results[host] = latest_result return results