fix(atvm-watcher): synthesize failed host result on hang-kill/nonzero exit; update run learning and vm inventory

This commit is contained in:
2026-05-12 14:42:11 -04:00
parent 222fb1aaa2
commit 8c4985d33a
3 changed files with 50 additions and 0 deletions

View File

@@ -1212,6 +1212,30 @@ def infer_host_from_subrun_build(
return remaining_hosts[0] if remaining_hosts else None
def synthesize_failed_host_result(
    build_name: str,
    log_text: str,
    inventory: Dict[str, str],
    reason: str,
) -> Dict[str, HostResult]:
    """Fabricate a one-entry ``FAIL`` host-result map.

    The host name used as the key (and as ``HostResult.host``) is the first
    truthy value among:

    1. the result of ``find_current_running_host(log_text, [])``,
    2. the first entry of ``extract_expected_hosts(log_text)``,
    3. ``build_name``.

    Args:
        build_name: Last-resort host label when the log yields no host.
        log_text: Log text handed to the host-detection helpers.
        inventory: Mapping consulted for the host's kernel; hosts missing
            from it are reported with kernel ``"unknown"``.
        reason: Text stored as the result's ``detail``.

    Returns:
        A dict with exactly one item, keyed by the chosen host, whose value
        is a ``HostResult`` with status ``"FAIL"``, one test and one failure.
    """
    # Both helpers are invoked unconditionally, in this order.
    candidates = extract_expected_hosts(log_text)
    running = find_current_running_host(log_text, [])

    # Short-circuit chain: each stage is only reached when every earlier
    # stage produced a falsy value.
    first_candidate = candidates[0] if candidates else None
    chosen = running or first_candidate or build_name

    failure = HostResult(
        host=chosen,
        kernel=inventory.get(chosen, "unknown"),
        status="FAIL",
        detail=reason,
        tests=1,
        failures=1,
    )
    return {chosen: failure}
def infer_metadata(build_name: str, log_text: str) -> Dict[str, object]:
try:
extra_options = json.loads(os.environ.get("ATVM_WATCHER_EXTRA_OPTIONS", "[]"))
@@ -2131,6 +2155,15 @@ def determine_state(
return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
if hang_kill_detected:
if not parent_host_results:
parent_host_results = synthesize_failed_host_result(
build_name=build_name,
log_text=log_text,
inventory=inventory,
reason="hang timeout killed runner",
)
if subrun_states:
subrun_states[-1]["host_results"] = dict(parent_host_results)
notes.append("Runner hang monitor terminated the run (`SIGKILL`) after inactivity.")
notes.append("Treating this as explicit failure.")
return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
@@ -2146,6 +2179,15 @@ def determine_state(
if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds:
if runner_result == "exit-code" or (runner_exit_code is not None and runner_exit_code != 0):
if not parent_host_results:
parent_host_results = synthesize_failed_host_result(
build_name=build_name,
log_text=log_text,
inventory=inventory,
reason="runner exited non-zero before host results were emitted",
)
if subrun_states:
subrun_states[-1]["host_results"] = dict(parent_host_results)
detail = f"Runner service ended with non-zero exit ({runner_exit_code})"
if runner_active_state:
detail += f"; state={runner_active_state}"