fix(atvm-watcher): synthesize failed host result on hang-kill/nonzero exit; update run learning and vm inventory
This commit is contained in:
@@ -658,3 +658,10 @@ This file stores run-specific examples only when a run produced a new learning r
|
||||
- When parsing parent `Cloud Run Finished` tables, treat standalone wrapped `s` rows as duration-cell continuations and remove those rows instead of appending `s` to the end of the host line.
|
||||
- Rely on the existing duration parser to accept wrapped values without the trailing `s`.
|
||||
- Replay the exact launch log through the current watcher code after this fix before trusting a corrected host count.
|
||||
|
||||
## Run Learning: 2026-05-07 (Synthesize failed host row when hang-kill occurs before reporter artifacts)
|
||||
- Observed failure mode:
|
||||
- Some hang-killed runs exit before host-level reporter artifacts are emitted, which can leave Mattermost statuses with a `FAILED` summary but no host rows.
|
||||
- Action for future runs:
|
||||
- When a run is marked `FAILED` from hang-kill markers or a non-zero runner exit and no host results are available, synthesize one failed host row from the current host/spec inference.
|
||||
- Use a clear failure detail such as `hang timeout killed runner` so operator-facing status always includes a concrete host failure line.
|
||||
|
||||
@@ -162,6 +162,7 @@ For current membership on `AutomatedTest-VMBootImg-Gold` and `AutomatedTest-VMBo
|
||||
| Ubuntu 20.04 | atvm4-ubuntu20.04 | 5.4.0-144-generic | |
|
||||
| Ubuntu 22.04 | atvm5-ubuntu22.04 | 5.15.0-25-generic | |
|
||||
| Ubuntu 24.04 | atvm121-ubuntu24.04 | 6.8.0-31-generic | |
|
||||
| Ubuntu 26.04 | atvm166-ubuntu26.04 | 7.0.0-15-generic | |
|
||||
| Windows Server 2008R2 | atvm108-w2k8r2 | 6.1.7601 Service Pack 1 Build 7601 | |
|
||||
| Windows Server 2012 | atvm109-w2k12R2 | 6.3.9600 Build 9600 | |
|
||||
| Windows Server 2016 | atvm110-w2k16 | 10.0.14393 Build 14393 | |
|
||||
|
||||
@@ -1212,6 +1212,30 @@ def infer_host_from_subrun_build(
|
||||
return remaining_hosts[0] if remaining_hosts else None
|
||||
|
||||
|
||||
def synthesize_failed_host_result(
    build_name: str,
    log_text: str,
    inventory: Dict[str, str],
    reason: str,
) -> Dict[str, HostResult]:
    """Build a one-entry host-result mapping describing a failed host.

    The host name is chosen by the first truthy candidate, in order:

    1. the value returned by ``find_current_running_host(log_text, [])``;
    2. the first entry of ``extract_expected_hosts(log_text)``, if any;
    3. ``build_name`` as the last resort.

    Args:
        build_name: Fallback used as the host name when the log yields none.
        log_text: Log text handed to the host-inference helpers.
        inventory: Mapping looked up by host name to obtain the kernel
            string; ``"unknown"`` is used when the host is absent.
        reason: Text stored as the ``detail`` of the synthesized result.

    Returns:
        A dict with a single key (the chosen host) whose value is a
        ``HostResult`` with status ``"FAIL"``, ``tests=1`` and ``failures=1``.
    """
    # Both helpers are always invoked, expected-hosts first, so the call
    # sequence stays the same regardless of which candidate wins.
    candidates = extract_expected_hosts(log_text)
    running = find_current_running_host(log_text, [])

    first_expected = candidates[0] if candidates else None
    chosen = running or first_expected or build_name

    failed = HostResult(
        host=chosen,
        kernel=inventory.get(chosen, "unknown"),
        status="FAIL",
        detail=reason,
        tests=1,
        failures=1,
    )
    return {chosen: failed}
|
||||
|
||||
|
||||
def infer_metadata(build_name: str, log_text: str) -> Dict[str, object]:
|
||||
try:
|
||||
extra_options = json.loads(os.environ.get("ATVM_WATCHER_EXTRA_OPTIONS", "[]"))
|
||||
@@ -2131,6 +2155,15 @@ def determine_state(
|
||||
return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
|
||||
if hang_kill_detected:
|
||||
if not parent_host_results:
|
||||
parent_host_results = synthesize_failed_host_result(
|
||||
build_name=build_name,
|
||||
log_text=log_text,
|
||||
inventory=inventory,
|
||||
reason="hang timeout killed runner",
|
||||
)
|
||||
if subrun_states:
|
||||
subrun_states[-1]["host_results"] = dict(parent_host_results)
|
||||
notes.append("Runner hang monitor terminated the run (`SIGKILL`) after inactivity.")
|
||||
notes.append("Treating this as explicit failure.")
|
||||
return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
@@ -2146,6 +2179,15 @@ def determine_state(
|
||||
|
||||
if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds:
|
||||
if runner_result == "exit-code" or (runner_exit_code is not None and runner_exit_code != 0):
|
||||
if not parent_host_results:
|
||||
parent_host_results = synthesize_failed_host_result(
|
||||
build_name=build_name,
|
||||
log_text=log_text,
|
||||
inventory=inventory,
|
||||
reason="runner exited non-zero before host results were emitted",
|
||||
)
|
||||
if subrun_states:
|
||||
subrun_states[-1]["host_results"] = dict(parent_host_results)
|
||||
detail = f"Runner service ended with non-zero exit ({runner_exit_code})"
|
||||
if runner_active_state:
|
||||
detail += f"; state={runner_active_state}"
|
||||
|
||||
Reference in New Issue
Block a user