fix(atvm-watcher): synthesize failed host result on hang-kill/nonzero exit; update run learning and vm inventory

This commit is contained in:
2026-05-12 14:42:11 -04:00
parent 222fb1aaa2
commit 8c4985d33a
3 changed files with 50 additions and 0 deletions

View File

@@ -658,3 +658,10 @@ This file stores run-specific examples only when a run produced a new learning r
- When parsing parent `Cloud Run Finished` tables, treat standalone wrapped `s` rows as duration-cell continuations and remove those rows instead of appending `s` to the end of the host line.
- Rely on the existing duration parser to accept wrapped values without the trailing `s`.
- Replay the exact launch log through the current watcher code after this fix before trusting a corrected host count.
## Run Learning: 2026-05-07 (Synthesize failed host row when hang-kill occurs before reporter artifacts)
- Observed failure mode:
- Some hang-killed runs exit before host-level reporter artifacts are emitted, which can leave Mattermost statuses with a `FAILED` summary but no host rows.
- Action for future runs:
- When a run is marked `FAILED` from hang-kill markers or non-zero runner exit and no host results are available, synthesize one failed host row from current host/spec inference.
- Use a clear failure detail such as `hang timeout killed runner` so operator-facing status always includes a concrete host failure line.

View File

@@ -162,6 +162,7 @@ For current membership on `AutomatedTest-VMBootImg-Gold` and `AutomatedTest-VMBo
| Ubuntu 20.04 | atvm4-ubuntu20.04 | 5.4.0-144-generic | |
| Ubuntu 22.04 | atvm5-ubuntu22.04 | 5.15.0-25-generic | |
| Ubuntu 24.04 | atvm121-ubuntu24.04 | 6.8.0-31-generic | |
| Ubuntu 26.04 | atvm166-ubuntu26.04 | 7.0.0-15-generic | |
| Windows Server 2008R2 | atvm108-w2k8r2 | 6.1.7601 Service Pack 1 Build 7601 | |
| Windows Server 2012 | atvm109-w2k12R2 | 6.3.9600 Build 9600 | |
| Windows Server 2016 | atvm110-w2k16 | 10.0.14393 Build 14393 | |

View File

@@ -1212,6 +1212,30 @@ def infer_host_from_subrun_build(
return remaining_hosts[0] if remaining_hosts else None return remaining_hosts[0] if remaining_hosts else None
def synthesize_failed_host_result(
    build_name: str,
    log_text: str,
    inventory: Dict[str, str],
    reason: str,
) -> Dict[str, HostResult]:
    """Build a one-entry host-result mapping describing a failed run.

    The host name is chosen in this order of preference:

    1. whatever ``find_current_running_host(log_text, [])`` returns,
    2. the first entry returned by ``extract_expected_hosts(log_text)``,
    3. ``build_name`` as a last resort.

    Args:
        build_name: Fallback key used when no host can be derived from the log.
        log_text: Log text handed to the host-inference helpers.
        inventory: Mapping looked up by host name to fill the ``kernel``
            field; ``"unknown"`` is used when the host is not present.
        reason: Text stored as the ``detail`` of the synthesized result.

    Returns:
        A dict with a single key (the chosen host) whose value is a
        ``HostResult`` with ``status="FAIL"``, ``tests=1`` and ``failures=1``.
    """
    candidates = extract_expected_hosts(log_text)
    chosen = find_current_running_host(log_text, [])
    if not chosen:
        # No running host detected: fall back to the first expected host.
        chosen = candidates[0] if candidates else None
    # Still nothing usable: key the row by the build name.
    chosen = chosen or build_name

    failed = HostResult(
        host=chosen,
        kernel=inventory.get(chosen, "unknown"),
        status="FAIL",
        detail=reason,
        tests=1,
        failures=1,
    )
    return {chosen: failed}
def infer_metadata(build_name: str, log_text: str) -> Dict[str, object]: def infer_metadata(build_name: str, log_text: str) -> Dict[str, object]:
try: try:
extra_options = json.loads(os.environ.get("ATVM_WATCHER_EXTRA_OPTIONS", "[]")) extra_options = json.loads(os.environ.get("ATVM_WATCHER_EXTRA_OPTIONS", "[]"))
@@ -2131,6 +2155,15 @@ def determine_state(
return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
if hang_kill_detected: if hang_kill_detected:
if not parent_host_results:
parent_host_results = synthesize_failed_host_result(
build_name=build_name,
log_text=log_text,
inventory=inventory,
reason="hang timeout killed runner",
)
if subrun_states:
subrun_states[-1]["host_results"] = dict(parent_host_results)
notes.append("Runner hang monitor terminated the run (`SIGKILL`) after inactivity.") notes.append("Runner hang monitor terminated the run (`SIGKILL`) after inactivity.")
notes.append("Treating this as explicit failure.") notes.append("Treating this as explicit failure.")
return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
@@ -2146,6 +2179,15 @@ def determine_state(
if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds: if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds:
if runner_result == "exit-code" or (runner_exit_code is not None and runner_exit_code != 0): if runner_result == "exit-code" or (runner_exit_code is not None and runner_exit_code != 0):
if not parent_host_results:
parent_host_results = synthesize_failed_host_result(
build_name=build_name,
log_text=log_text,
inventory=inventory,
reason="runner exited non-zero before host results were emitted",
)
if subrun_states:
subrun_states[-1]["host_results"] = dict(parent_host_results)
detail = f"Runner service ended with non-zero exit ({runner_exit_code})" detail = f"Runner service ended with non-zero exit ({runner_exit_code})"
if runner_active_state: if runner_active_state:
detail += f"; state={runner_active_state}" detail += f"; state={runner_active_state}"