From 18dcbc89f9ae2469619f1691d65648b8c45d74de Mon Sep 17 00:00:00 2001 From: "anthony.wen" Date: Mon, 30 Mar 2026 21:38:59 -0400 Subject: [PATCH] Improve ATVM failed-host detail recovery --- atvm/AGENTS.md | 3 + atvm/docs/automation/guide.md | 8 + atvm/docs/automation/run-learnings.md | 27 ++++ atvm/watcher-service/atvm_run_watcher.py | 191 ++++++++++++++++++++--- 4 files changed, 209 insertions(+), 20 deletions(-) diff --git a/atvm/AGENTS.md b/atvm/AGENTS.md index 2e26005..f74f05f 100644 --- a/atvm/AGENTS.md +++ b/atvm/AGENTS.md @@ -78,6 +78,9 @@ This file defines how to operate and maintain the ATVM workspace in `/home/aw/co - When the watcher is requested, start the watcher before `run-sorry-cypress.py`. - Do not start the runner before the watcher, because the watcher helper clears stale `/tmp/.log` and can delete the fresh live runner log if the runner starts first. - For host-level test detail and failed-test investigation, use `/root/cdc-e2e-cyp-12.17.4/cypress/cmcReporter`, especially `logs/`, `xml/`, and `mochawesome/`. +- Apply failed-host detail recovery consistently for every ATVM template run, not just `cmc-reboot`. +- For any failed ATVM host, recover failure detail in this order when available: consolidated run log, `mochawesome`, structured reporter artifacts (`json`/`xml`), then text reporter artifacts. +- Keep the `HOSTS` detail column compact with the failing step plus a short error summary, and put the longer trimmed failure excerpt in `NOTES:`. - When reporting `TEST FLOW:` for an ATVM run, prefer the numbered steps extracted from the generated spec for that exact run. - If the generated spec exists, do not rely on a static template flow list for `TEST FLOW:`. - Only fall back to template-level or static flow definitions when the generated spec cannot be located or parsed. diff --git a/atvm/docs/automation/guide.md b/atvm/docs/automation/guide.md index d29af13..556e89e 100644 --- a/atvm/docs/automation/guide.md +++ b/atvm/docs/automation/guide.md @@ -76,6 +76,12 @@ Run ATVM CMC automation tests on the designated automation VM without unintended - `mochawesome/` - per-run HTML reports - When a machine fails, use the matching `logs/` entry first to capture the detailed failure context for that host. +- Apply the failed-host detail recovery path to every ATVM template type, not just reboot. +- For any failed host, recover detail in this order when available: + - consolidated run log + - matching `mochawesome` HTML + - structured reporter artifacts such as per-host JSON or XML + - text reporter artifacts - When reconstructing historical status, prefer `cmcReporter` artifacts over less-specific runner output because they preserve per-host results after the live run has ended. - Do not treat the existence of a per-host reporter artifact by itself as proof that the host passed. - For categorized grouped recovery, prefer the matching per-host reporter JSON or mochawesome result and carry through the real `failures`, `pending`, and failure message instead of assuming `PASS completed`. @@ -269,6 +275,8 @@ Status-report expectations: - Do not include generic watcher bookkeeping messages in `NOTES:` such as artifact-detection confirmations. - Do not include internal watcher fallback notes in `NOTES:` such as `check-xml-files.ts` validation confirmations or reporter-artifact recovery details. - The `HOSTS:` table includes `Host`, `Kernel`, `Status`, and `Detail` columns in that order. +- For any failed host, keep the `Detail` column compact by showing the failing step plus a short error summary, not the full raw stack trace. +- If richer failure text is available, put the longer trimmed excerpt in `NOTES:` so the result stays readable in Mattermost and local status output. - In `COVERAGE:`, describe the important `cmc-templates.py` command inputs such as template, categorize mode, datastore/config family, config filename, migration style, any real plugin/integration path, and other operator-relevant run options, but do not list target hosts there or include verbose prose scope descriptions. - Only include coverage fields that the template command actually used. Do not show empty or irrelevant fields such as an integration/plugin path for templates that did not use one. - If `categorize mode: enabled` is already shown in `COVERAGE:`, do not also repeat `--categorize` under `run options`. diff --git a/atvm/docs/automation/run-learnings.md b/atvm/docs/automation/run-learnings.md index beedc8e..ce0634a 100644 --- a/atvm/docs/automation/run-learnings.md +++ b/atvm/docs/automation/run-learnings.md @@ -451,3 +451,30 @@ This file stores run-specific examples only when a run produced a new learning r - Resolve `TEST FLOW:` from the generated `.ts` spec for the actual run whenever that spec exists. - Extract the numbered `it(...)` steps from the generated spec referenced by the run's `specPattern`. - Only use template-level or static fallback flow definitions when the generated spec cannot be found or parsed. + +## Run Learning: 2026-03-30 (Event-log reporter JSON must not be ignored in non-categorized fallback) +- Observed failure mode: + - A failed non-categorized run still posted/saved host detail as only `1 failures` even though the per-host reporter artifacts preserved the failing step. + - The per-host `.json` artifact used an event-log format with `metadata` plus `tests`, but no top-level `stats` block. + - The watcher ignored that JSON format, fell back to the `.txt`, and lost structured test counts/detail. +- Action for future runs: + - Support the event-log JSON format directly when parsing per-host reporter artifacts. + - In non-categorized fallback, prefer the structured `.json` artifact over the matching `.txt` when they belong to the same run timestamp. + - Recover at least the failing testcase name and a nonzero test count from those artifacts even when the consolidated run log is missing. + +## Run Learning: 2026-03-30 (Use `mochawesome` as the rich fallback for host failure detail) +- Observed failure mode: + - The full UI-visible Cypress error text for a failed ATVM host run existed in `cypress/cmcReporter/mochawesome/*.html`, but the lower-fidelity host-level `.json` and `.txt` reporter artifacts only preserved the failing step boundary. + - That made the host detail fall back to a thin summary even though a richer error payload was available on the controller. +- Action for future runs: + - When the consolidated run log is missing, use `mochawesome` as the rich fallback source for per-host failure text before settling for lower-fidelity reporter artifacts. + - Keep the `HOSTS` table compact by showing the failing step plus a short error summary. + - Put the longer trimmed failure excerpt in `NOTES:` instead of dumping the full raw stack trace into the host-detail column. + +## Run Learning: 2026-03-30 (Apply rich failed-host detail recovery to every ATVM template) +- Observed operator requirement: + - The same failed-host recovery and formatting rules should apply across all ATVM template runs, not only reboot scenarios. + - If any ATVM test template fails, the result should still recover the best available failure detail and present it consistently. +- Action for future runs: + - Use the same failure-detail recovery order for every ATVM template: consolidated run log, `mochawesome`, structured reporter artifacts, then text reporter artifacts. + - Keep failed-host `Detail` compact and put the longer trimmed excerpt in `NOTES:` for every template type. diff --git a/atvm/watcher-service/atvm_run_watcher.py b/atvm/watcher-service/atvm_run_watcher.py index 99c16e2..3b920e8 100644 --- a/atvm/watcher-service/atvm_run_watcher.py +++ b/atvm/watcher-service/atvm_run_watcher.py @@ -3,6 +3,7 @@ from __future__ import annotations import argparse import ast +import html import json import os import re @@ -439,6 +440,12 @@ def append_failure_detail(detail: str, failure_detail: Optional[str]) -> str: return f"{detail} - {failure_detail}" +def concise_testcase_name(raw: str) -> str: + if "->" in raw: + return raw.split("->", 1)[1].strip() + return raw.strip() + + def extract_failure_detail_from_xml_suite(suite: ET.Element) -> Optional[str]: for testcase in suite.findall("testcase"): failure = testcase.find("failure") @@ -470,6 +477,39 @@ def extract_failure_detail_from_text_blob(text: str) -> Optional[str]: return None +def extract_failure_from_reporter_events(testcase_name: str, testcase_events: object) -> Optional[str]: + if not isinstance(testcase_events, list): + return None + for event in testcase_events: + if not isinstance(event, list) or len(event) < 3: + continue + event_type = str(event[0]).lower() + message_value = str(event[1]) if len(event) > 1 else "" + status_value = str(event[2]).lower() + if event_type in {"cy:command", "cy:task"} and status_value in {"failed", "fail", "error"}: + return compact_failure_detail(f"{concise_testcase_name(testcase_name)} - {message_value}") + return None + + +def extract_failure_from_reporter_text(text: str) -> Optional[str]: + sections = re.split(r"^=== (.+?) ===\s*$", text, flags=re.M) + if len(sections) < 3: + return None + for index in range(1, len(sections), 2): + testcase_name = sections[index] + section_body = sections[index + 1] if index + 1 < len(sections) else "" + for line in section_body.splitlines(): + parts = line.split("\t") + if len(parts) < 3: + continue + event_type = parts[0].strip().lower() + status_value = parts[1].strip().lower() + message_value = "\t".join(parts[2:]).strip() + if event_type in {"cy:command", "cy:task"} and status_value in {"failed", "fail", "error"}: + return compact_failure_detail(f"{concise_testcase_name(testcase_name)} - {message_value}") + return None + + def extract_failure_detail_from_log_text(log_text: str, host: str) -> Optional[str]: pattern = ( rf"\d+\)\s+Testing .*?{re.escape(host)}.*?\n" @@ -486,6 +526,66 @@ def extract_failure_detail_from_log_text(log_text: str, host: str) -> Optional[s return compact_failure_detail(testcase) +def decode_json_string_fragment(raw: str) -> str: + try: + return json.loads(f'"{raw}"') + except json.JSONDecodeError: + return raw + + +def extract_first_json_string(block: str, key: str) -> Optional[str]: + match = re.search(rf'"{re.escape(key)}":"((?:\\.|[^"])*)"', block, re.S) + if not match: + return None + return decode_json_string_fragment(match.group(1)) + + +def extract_failure_from_mochawesome( + reporter_root: Path, + build_name: str, + host: str, +) -> Optional[Tuple[str, str, str]]: + mochawesome_dir = reporter_root / "mochawesome" + if not mochawesome_dir.exists(): + return None + + candidates = sorted( + mochawesome_dir.glob(f"*{build_name}*.html"), + key=lambda path: (path.stat().st_mtime, 0 if path.name.endswith("_001.html") else 1), + reverse=True, + ) + host_pattern = re.escape(host) + + for html_path in candidates: + try: + text = html.unescape(html_path.read_text(encoding="utf-8", errors="replace")) + except OSError: + continue + for match in re.finditer(r'"fullTitle":"((?:\\.|[^"])*)"', text, re.S): + full_title = decode_json_string_fragment(match.group(1)) + if host not in full_title: + continue + block_start = max(0, match.start() - 1200) + block_end = min(len(text), match.end() + 8000) + block = text[block_start:block_end] + if not re.search(r'"state":"failed"', block): + continue + testcase = extract_first_json_string(block, "title") or "failed testcase" + message = extract_first_json_string(block, "message") or "" + estack = extract_first_json_string(block, "estack") or "" + if testcase or message or estack: + return testcase, message, estack + return None + + +def summarize_host_detail_with_mochawesome(detail: str, testcase: str, message: str) -> str: + prefix_match = re.match(r"^(\d+ tests, \d+ failures(?:, \d+ pending)?)", detail) + prefix = prefix_match.group(1) if prefix_match else detail + message_summary = compact_failure_detail(message or testcase, limit=260) + testcase_summary = compact_failure_detail(testcase, limit=140) + return f"{prefix} - {testcase_summary} - {message_summary}" + + def extract_host_results_from_run_finished_segment(segment_text: str, inventory: Dict[str, str]) -> Dict[str, HostResult]: host_results: Dict[str, HostResult] = {} normalized = re.sub(r"\n\s*│\s*s\s*│", "s", segment_text) @@ -637,7 +737,7 @@ def collect_latest_host_reporter_artifact( if not logs_dir.exists(): return None - latest: Optional[Tuple[str, HostResult]] = None + latest: Optional[Tuple[str, HostResult, datetime, str]] = None for host_dir in sorted(logs_dir.iterdir()): if not host_dir.is_dir(): continue @@ -661,14 +761,23 @@ def collect_latest_host_reporter_artifact( continue artifact_ts = result.timestamp or reporter_artifact_run_timestamp(artifact_path) or artifact_mtime result.timestamp = artifact_ts - candidate = (host, result) + candidate = (host, result, artifact_ts, artifact_path.suffix) if latest is None: latest = candidate continue - latest_ts = latest[1].timestamp or datetime.fromtimestamp(0, tz=timezone.utc) - if artifact_mtime >= latest_ts: + + latest_result_ts = latest[2] + latest_suffix = latest[3] + # Prefer the newest logical run artifact, and prefer JSON over TXT + # when both artifacts represent the same run timestamp. + if artifact_ts > latest_result_ts: latest = candidate - return latest + continue + if artifact_ts == latest_result_ts and artifact_path.suffix == ".json" and latest_suffix != ".json": + latest = candidate + if latest is None: + return None + return latest[0], latest[1] def parse_host_reporter_json(artifact_path: Path, host: str, kernels: Dict[str, str]) -> Optional[HostResult]: @@ -681,8 +790,42 @@ def parse_host_reporter_json(artifact_path: Path, host: str, kernels: Dict[str, stats = payload.get("stats") metadata = payload.get("metadata") + tests_payload = payload.get("tests") if not isinstance(stats, dict): - return None + if not isinstance(tests_payload, dict): + return None + tests = len([name for name, events in tests_payload.items() if isinstance(name, str) and isinstance(events, list)]) + failures = 0 + pending = 0 + duration_ms = 0 + failure_detail = None + for testcase_name, testcase_events in tests_payload.items(): + current_failure = extract_failure_from_reporter_events(testcase_name, testcase_events) + if current_failure: + failures += 1 + if failure_detail is None: + failure_detail = current_failure + timestamp = None + if isinstance(metadata, dict): + timestamp = parse_reporter_metadata_timestamp(metadata.get("timestamp")) + if timestamp is None: + timestamp = reporter_artifact_run_timestamp(artifact_path) + + detail_parts = [f"{tests} tests", f"{failures} failures"] + detail = ", ".join(detail_parts) + if failures: + detail = append_failure_detail(detail, failure_detail) + status = "FAIL" if failures else "PASS" + return HostResult( + host=host, + kernel=kernels.get(host, "unknown"), + status=status, + detail=detail, + tests=tests, + failures=failures, + duration_seconds=None, + timestamp=timestamp, + ) tests = int(stats.get("tests", 0) or 0) failures = int(stats.get("failures", 0) or 0) @@ -699,20 +842,10 @@ def parse_host_reporter_json(artifact_path: Path, host: str, kernels: Dict[str, detail_parts.append(f"{pending} pending") failure_detail = None - tests_payload = payload.get("tests") if failures and isinstance(tests_payload, dict): for testcase_name, testcase_events in tests_payload.items(): - if not isinstance(testcase_name, str) or not isinstance(testcase_events, list): - continue - for event in testcase_events: - if not isinstance(event, list) or len(event) < 3: - continue - status_value = str(event[2]).lower() - message_value = str(event[1]) if len(event) > 1 else "" - if status_value in {"failed", "fail", "error"} or re.search(r"\b(Error|AssertionError|Timed out!)\b", message_value): - failure_detail = compact_failure_detail(f"{testcase_name} - {message_value}") - break - if failure_detail: + failure_detail = extract_failure_from_reporter_events(testcase_name, testcase_events) + if failure_detail is not None: break if failures: @@ -751,11 +884,13 @@ def parse_host_reporter_artifact(artifact_path: Path, host: str, kernels: Dict[s except OSError: text = "" - failure_detail = extract_failure_detail_from_text_blob(text) + sectioned_failure_detail = extract_failure_from_reporter_text(text) + failure_detail = extract_failure_detail_from_text_blob(text) or sectioned_failure_detail structured_failure = re.search(r"^(?:cy:command|cy:task)\terror\t", text, re.I | re.M) failures = 1 if failure_detail or structured_failure else 0 + tests = len(re.findall(r"^=== .+? ===\s*$", text, re.M)) status = "FAIL" if failures else "PASS" - detail = "1 failures" if failures else "completed" + detail = f"{tests} tests, 1 failures" if failures and tests else ("1 failures" if failures else "completed") if failures: detail = append_failure_detail(detail, failure_detail) return HostResult( @@ -763,6 +898,7 @@ def parse_host_reporter_artifact(artifact_path: Path, host: str, kernels: Dict[s kernel=kernels.get(host, "unknown"), status=status, detail=detail, + tests=tests, failures=failures, timestamp=artifact_ts, ) @@ -1044,6 +1180,20 @@ def build_status_markdown( longest = max((h for h in ordered_hosts if h.duration_seconds is not None), key=lambda h: h.duration_seconds, default=None) average = (sum(durations) / len(durations)) if durations else None + additional_failure_notes: List[str] = [] + for host in ordered_hosts: + if host.status != "FAIL": + continue + mochawesome_failure = extract_failure_from_mochawesome(reporter_root, build_name, host.host) + if not mochawesome_failure: + continue + testcase, message, estack = mochawesome_failure + host.detail = summarize_host_detail_with_mochawesome(host.detail, testcase, message) + failure_excerpt_source = estack or message + additional_failure_notes.append( + f"{host.host} failure excerpt: `{compact_failure_detail(failure_excerpt_source, limit=420)}`" + ) + host_lines = ["| Host | Kernel | Status | Detail |", "| --- | --- | --- | --- |"] for host in ordered_hosts: icon = { @@ -1070,6 +1220,7 @@ def build_status_markdown( notes = notes + [ "Both iscsi and fc disks were used for the reboot test. As a result, iscsi disks may not have attached before the mtdi started. So if the test failed, that is most likely the issue." ] + notes = notes + additional_failure_notes notes_block = "\n".join(f"- {note}" for note in notes) if notes else "- none" resolved_flow = extract_test_flow_from_generated_spec(reporter_root, log_text) or get_test_flow(metadata.get("template"))