diff --git a/atvm/docs/automation/run-learnings.md b/atvm/docs/automation/run-learnings.md index b9d5a90..5cfcc90 100644 --- a/atvm/docs/automation/run-learnings.md +++ b/atvm/docs/automation/run-learnings.md @@ -658,3 +658,10 @@ This file stores run-specific examples only when a run produced a new learning r - When parsing parent `Cloud Run Finished` tables, treat standalone wrapped `s` rows as duration-cell continuations and remove those rows instead of appending `s` to the end of the host line. - Rely on the existing duration parser to accept wrapped values without the trailing `s`. - Replay the exact launch log through the current watcher code after this fix before trusting a corrected host count. + +## Run Learning: 2026-05-07 (Synthesize failed host row when hang-kill occurs before reporter artifacts) +- Observed failure mode: + - Some hang-killed runs exit before host-level reporter artifacts are emitted, which can leave Mattermost statuses with a `FAILED` summary but no host rows. +- Action for future runs: + - When a run is marked `FAILED` from hang-kill markers or non-zero runner exit and no host results are available, synthesize one failed host row from current host/spec inference. + - Use a clear failure detail such as `hang timeout killed runner` so the operator-facing status always includes a concrete host failure line. 
diff --git a/atvm/inventory/vm-inventory.md b/atvm/inventory/vm-inventory.md index 2181376..e7c4518 100644 --- a/atvm/inventory/vm-inventory.md +++ b/atvm/inventory/vm-inventory.md @@ -162,6 +162,7 @@ For current membership on `AutomatedTest-VMBootImg-Gold` and `AutomatedTest-VMBo | Ubuntu 20.04 | atvm4-ubuntu20.04 | 5.4.0-144-generic | | | Ubuntu 22.04 | atvm5-ubuntu22.04 | 5.15.0-25-generic | | | Ubuntu 24.04 | atvm121-ubuntu24.04 | 6.8.0-31-generic | | +| Ubuntu 26.04 | atvm166-ubuntu26.04 | 7.0.0-15-generic | | | Windows Server 2008R2 | atvm108-w2k8r2 | 6.1.7601 Service Pack 1 Build 7601 | | | Windows Server 2012 | atvm109-w2k12R2 | 6.3.9600 Build 9600 | | | Windows Server 2016 | atvm110-w2k16 | 10.0.14393 Build 14393 | | diff --git a/atvm/watcher-service/atvm_run_watcher.py b/atvm/watcher-service/atvm_run_watcher.py index 8ab4b69..4d88137 100644 --- a/atvm/watcher-service/atvm_run_watcher.py +++ b/atvm/watcher-service/atvm_run_watcher.py @@ -1212,6 +1212,30 @@ def infer_host_from_subrun_build( return remaining_hosts[0] if remaining_hosts else None +def synthesize_failed_host_result( + build_name: str, + log_text: str, + inventory: Dict[str, str], + reason: str, +) -> Dict[str, HostResult]: + expected_hosts = extract_expected_hosts(log_text) + host = find_current_running_host(log_text, []) + if not host and expected_hosts: + host = expected_hosts[0] + if not host: + host = build_name + return { + host: HostResult( + host=host, + kernel=inventory.get(host, "unknown"), + status="FAIL", + detail=reason, + tests=1, + failures=1, + ) + } + + def infer_metadata(build_name: str, log_text: str) -> Dict[str, object]: try: extra_options = json.loads(os.environ.get("ATVM_WATCHER_EXTRA_OPTIONS", "[]")) @@ -2131,6 +2155,15 @@ def determine_state( return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes if hang_kill_detected: + if not parent_host_results: + parent_host_results = synthesize_failed_host_result( + build_name=build_name, + 
log_text=log_text, + inventory=inventory, + reason="hang timeout killed runner", + ) + if subrun_states: + subrun_states[-1]["host_results"] = dict(parent_host_results) notes.append("Runner hang monitor terminated the run (`SIGKILL`) after inactivity.") notes.append("Treating this as explicit failure.") return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes @@ -2146,6 +2179,15 @@ def determine_state( if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds: if runner_result == "exit-code" or (runner_exit_code is not None and runner_exit_code != 0): + if not parent_host_results: + parent_host_results = synthesize_failed_host_result( + build_name=build_name, + log_text=log_text, + inventory=inventory, + reason="runner exited non-zero before host results were emitted", + ) + if subrun_states: + subrun_states[-1]["host_results"] = dict(parent_host_results) detail = f"Runner service ended with non-zero exit ({runner_exit_code})" if runner_active_state: detail += f"; state={runner_active_state}"