diff --git a/atvm/AGENTS.md b/atvm/AGENTS.md index f74f05f..a13f66d 100644 --- a/atvm/AGENTS.md +++ b/atvm/AGENTS.md @@ -80,7 +80,8 @@ This file defines how to operate and maintain the ATVM workspace in `/home/aw/co - For host-level test detail and failed-test investigation, use `/root/cdc-e2e-cyp-12.17.4/cypress/cmcReporter`, especially `logs/`, `xml/`, and `mochawesome/`. - Apply failed-host detail recovery consistently for every ATVM template run, not just `cmc-reboot`. - For any failed ATVM host, recover failure detail in this order when available: consolidated run log, `mochawesome`, structured reporter artifacts (`json`/`xml`), then text reporter artifacts. -- Keep the `HOSTS` detail column compact with the failing step plus a short error summary, and put the longer trimmed failure excerpt in `NOTES:`. +- Keep the `HOSTS` detail column compact with the failing step plus a short error summary only. +- Put richer per-host error excerpts in a dedicated `FAILURE NOTES:` section, and reserve `NOTES:` for non-failure context such as the template command, Currents URL, and operator-facing caveats. - When reporting `TEST FLOW:` for an ATVM run, prefer the numbered steps extracted from the generated spec for that exact run. - If the generated spec exists, do not rely on a static template flow list for `TEST FLOW:`. - Only fall back to template-level or static flow definitions when the generated spec cannot be located or parsed. diff --git a/atvm/docs/automation/guide.md b/atvm/docs/automation/guide.md index 556e89e..d957f21 100644 --- a/atvm/docs/automation/guide.md +++ b/atvm/docs/automation/guide.md @@ -253,7 +253,8 @@ When the operator asks for the status of an ATVM automation run, report in this 4. `TIMING:` section with start, end, total, quickest, longest, and average. 5. `COVERAGE:` section describing what the run was intended to cover, excluding the target-host list. 6. `TEST FLOW:` section describing the template-specific numbered run flow for the test. -7. `NOTES:` section for broader context and anomalies. +7. `FAILURE NOTES:` section for detailed per-host error excerpts when failures exist. +8. `NOTES:` section for broader non-failure context and anomalies. 7. Remaining machines still to run. 8. Summary counts for finished, passed, failed, and skipped machines. 9. Timing details: @@ -268,15 +269,15 @@ When the operator asks for the status of an ATVM automation run, report in this Status-report expectations: - Use the same display layout for every ATVM automation status response regardless of test type (`e2e`, `systemOS`, `reboot`, `migrateops`, and others). - Use `/home/aw/code/cds/atvm/docs/automation/status-template.md` as the default template for both local status output and Mattermost status posts. -- The default ATVM status template uses flat bullet-list sections for `COVERAGE:` and `TEST FLOW:`, Markdown tables for `SUMMARY:`, `HOSTS:`, and `TIMING:`, and uses `NOTES:` for flat operator-facing notes. -- Order the status sections as `SUMMARY:`, `HOSTS:`, `TIMING:`, `COVERAGE:`, `TEST FLOW:`, then `NOTES:`. -- Keep `NOTES:` focused on operator-facing value such as the Currents run URL, real anomalies, failure context, or material fallback behavior. +- The default ATVM status template uses flat bullet-list sections for `COVERAGE:`, `TEST FLOW:`, `FAILURE NOTES:`, and `NOTES:`, and Markdown tables for `SUMMARY:`, `HOSTS:`, and `TIMING:`. +- Order the status sections as `SUMMARY:`, `HOSTS:`, `TIMING:`, `COVERAGE:`, `TEST FLOW:`, `FAILURE NOTES:`, then `NOTES:`. +- Keep `NOTES:` focused on non-failure operator-facing value such as the Currents run URL, real anomalies unrelated to the direct failure text, or material fallback behavior. - Include the exact `cmc-templates.py` command used to trigger the ATVM automation run in `NOTES:`, without the outer `sshpass`/`ssh` wrapper and without trimming it. - Do not include generic watcher bookkeeping messages in `NOTES:` such as artifact-detection confirmations. - Do not include internal watcher fallback notes in `NOTES:` such as `check-xml-files.ts` validation confirmations or reporter-artifact recovery details. - The `HOSTS:` table includes `Host`, `Kernel`, `Status`, and `Detail` columns in that order. - For any failed host, keep the `Detail` column compact by showing the failing step plus a short error summary, not the full raw stack trace. -- If richer failure text is available, put the longer trimmed excerpt in `NOTES:` so the result stays readable in Mattermost and local status output. +- If richer failure text is available, put the longer trimmed excerpt in `FAILURE NOTES:` so the result stays readable in Mattermost and local status output. - In `COVERAGE:`, describe the important `cmc-templates.py` command inputs such as template, categorize mode, datastore/config family, config filename, migration style, any real plugin/integration path, and other operator-relevant run options, but do not list target hosts there or include verbose prose scope descriptions. - Only include coverage fields that the template command actually used. Do not show empty or irrelevant fields such as an integration/plugin path for templates that did not use one. - If `categorize mode: enabled` is already shown in `COVERAGE:`, do not also repeat `--categorize` under `run options`. @@ -289,7 +290,7 @@ Status-report expectations: - If fallback is required, resolve it from the run template name before using any generic default flow. - `cmc-e2e` currently uses the 22-step migration flow documented in `/home/aw/code/cds/atvm/docs/automation/status-template.md`. - `cmc-systemOS` currently uses the 21-step boot-disk migration flow documented in `/home/aw/code/cds/atvm/docs/automation/status-template.md`. -- Keep `NOTES:` behavior consistent across template types; do not add template-specific internal-source notes such as parent-log-summary recovery details. +- Keep `FAILURE NOTES:` and `NOTES:` behavior consistent across template types; do not add template-specific internal-source notes such as parent-log-summary recovery details. - For the `Kernel` column, cross-reference the host name against `/home/aw/code/cds/atvm/inventory/vm-inventory.md`. - If the hostname is not present in `vm-inventory.md`, report the kernel value as `unknown`. - Treat references to the "ATVM automation run" or "automation run" as referring to this ATVM folder workflow and the automation VM at `192.168.3.190`, not to Cirrus project operations such as the `atvm - cypress` project. diff --git a/atvm/docs/automation/run-learnings.md b/atvm/docs/automation/run-learnings.md index ce0634a..48131c4 100644 --- a/atvm/docs/automation/run-learnings.md +++ b/atvm/docs/automation/run-learnings.md @@ -477,4 +477,13 @@ This file stores run-specific examples only when a run produced a new learning r - If any ATVM test template fails, the result should still recover the best available failure detail and present it consistently. - Action for future runs: - Use the same failure-detail recovery order for every ATVM template: consolidated run log, `mochawesome`, structured reporter artifacts, then text reporter artifacts. - - Keep failed-host `Detail` compact and put the longer trimmed excerpt in `NOTES:` for every template type. + - Keep failed-host `Detail` compact and put the longer trimmed excerpt in `FAILURE NOTES:` for every template type. + +## Run Learning: 2026-03-30 (Separate failure detail from general notes in ATVM status output) +- Observed operator requirement: + - The `HOSTS` detail column should stay short and scannable. + - Detailed per-host error text should not crowd the host table or mix with general `NOTES:`. +- Action for future runs: + - Keep `HOSTS` detail to the failing step plus a short error summary only. + - Put richer per-host error excerpts in `FAILURE NOTES:`. + - Reserve `NOTES:` for non-failure context such as template command, Currents URL, and operator-facing caveats. diff --git a/atvm/docs/automation/status-template.md b/atvm/docs/automation/status-template.md index c351918..1267b7f 100644 --- a/atvm/docs/automation/status-template.md +++ b/atvm/docs/automation/status-template.md @@ -51,13 +51,16 @@ Use this as the default ATVM automation run-status template for: **TEST FLOW:** - +**FAILURE NOTES:** +- + **NOTES:** - - ``` ## Rules -- Keep `SUMMARY:`, `HOSTS:`, `TIMING:`, `COVERAGE:`, `TEST FLOW:`, and `NOTES:` in that order. +- Keep `SUMMARY:`, `HOSTS:`, `TIMING:`, `COVERAGE:`, `TEST FLOW:`, `FAILURE NOTES:`, and `NOTES:` in that order. - Use the title format: - `## ATVM Run Status` - `### ` @@ -72,10 +75,12 @@ Use this as the default ATVM automation run-status template for: - `⏳ RUN` - `⏭️ SKIP` - Keep `Detail` concise. -- Put broader context under `NOTES:`, not in the host table. +- Put richer per-host failure excerpts under `FAILURE NOTES:`, not in the host table. +- Put broader non-failure context under `NOTES:`. - When available, put the persistent Currents run URL in `NOTES:` so operators can open the exact recorded run directly. - Include the exact `cmc-templates.py` command used to trigger the run in `NOTES:`, without the outer `sshpass`/`ssh` wrapper. -- Keep `NOTES:` limited to meaningful operator-facing items such as the Currents link, real anomalies, failure context, or important fallback behavior. +- Keep `FAILURE NOTES:` limited to detailed per-host error excerpts. +- Keep `NOTES:` limited to meaningful non-failure operator-facing items such as the Currents link, real anomalies, or important fallback behavior. - Do not include generic watcher bookkeeping lines in `NOTES:` such as "run artifacts were detected" or "final reporting artifacts were detected." - Do not include internal fallback notes in `NOTES:` such as "`check-xml-files.ts` validation passed" or "host details were derived from reporter artifacts." - `COVERAGE:` should describe what the run was intended to cover without listing target hosts. diff --git a/atvm/watcher-service/atvm_run_watcher.py b/atvm/watcher-service/atvm_run_watcher.py index 3b920e8..72a9782 100644 --- a/atvm/watcher-service/atvm_run_watcher.py +++ b/atvm/watcher-service/atvm_run_watcher.py @@ -581,7 +581,22 @@ def extract_failure_from_mochawesome( def summarize_host_detail_with_mochawesome(detail: str, testcase: str, message: str) -> str: prefix_match = re.match(r"^(\d+ tests, \d+ failures(?:, \d+ pending)?)", detail) prefix = prefix_match.group(1) if prefix_match else detail - message_summary = compact_failure_detail(message or testcase, limit=260) + normalized_message = " ".join((message or "").split()) + message_summary = "" + for pattern in ( + r"(md5sum:\s.*?)(?:$)", + r"(sshpass does not contain OK(?:\.\s*Output:)?)(?:$)", + r"(AssertionError:\s.*?)(?:$)", + r"(Timed out!? .*?)(?:$)", + r"(Error:\s.*?)(?:$)", + ): + match = re.search(pattern, normalized_message, re.I) + if match: + message_summary = match.group(1) + break + if not message_summary: + message_summary = normalized_message or testcase + message_summary = compact_failure_detail(message_summary, limit=120) testcase_summary = compact_failure_detail(testcase, limit=140) return f"{prefix} - {testcase_summary} - {message_summary}" @@ -1180,7 +1195,7 @@ def build_status_markdown( longest = max((h for h in ordered_hosts if h.duration_seconds is not None), key=lambda h: h.duration_seconds, default=None) average = (sum(durations) / len(durations)) if durations else None - additional_failure_notes: List[str] = [] + failure_notes: List[str] = [] for host in ordered_hosts: if host.status != "FAIL": continue @@ -1190,7 +1205,7 @@ def build_status_markdown( testcase, message, estack = mochawesome_failure host.detail = summarize_host_detail_with_mochawesome(host.detail, testcase, message) failure_excerpt_source = estack or message - additional_failure_notes.append( + failure_notes.append( f"{host.host} failure excerpt: `{compact_failure_detail(failure_excerpt_source, limit=420)}`" ) @@ -1220,8 +1235,7 @@ def build_status_markdown( notes = notes + [ "Both iscsi and fc disks were used for the reboot test. As a result, iscsi disks may not have attached before the mtdi started. So if the test failed, that is most likely the issue." ] - notes = notes + additional_failure_notes - + failure_notes_block = "\n".join(f"- {note}" for note in failure_notes) if failure_notes else "- none" notes_block = "\n".join(f"- {note}" for note in notes) if notes else "- none" resolved_flow = extract_test_flow_from_generated_spec(reporter_root, log_text) or get_test_flow(metadata.get("template")) test_flow_lines = [f"- {step}" for step in resolved_flow] @@ -1261,6 +1275,9 @@ def build_status_markdown( "**TEST FLOW:**", *test_flow_lines, "", + "**FAILURE NOTES:**", + failure_notes_block, + "", "**NOTES:**", notes_block, ]