fix(atvm-watcher): synthesize failed host result on hang-kill/nonzero exit; update run learning and vm inventory
This commit is contained in:
@@ -1212,6 +1212,30 @@ def infer_host_from_subrun_build(
|
||||
return remaining_hosts[0] if remaining_hosts else None
|
||||
|
||||
|
||||
def synthesize_failed_host_result(
    build_name: str,
    log_text: str,
    inventory: Dict[str, str],
    reason: str,
) -> Dict[str, HostResult]:
    """Build a one-entry host-result mapping that records a single failure.

    Used when a run ended without emitting any per-host results, so that
    downstream consumers still see an explicit ``FAIL`` entry.

    The host the failure is attributed to is chosen in this order:

    1. whatever ``find_current_running_host(log_text, [])`` reports,
    2. the first entry returned by ``extract_expected_hosts(log_text)``,
    3. ``build_name`` as a last resort.

    Args:
        build_name: Name of the build; used as the host key when no host
            can be derived from the log.
        log_text: Runner log text handed to the host-detection helpers.
        inventory: Mapping consulted with the chosen host to fill in the
            ``kernel`` field; ``"unknown"`` is used when the host is absent.
        reason: Text stored as the ``detail`` of the synthesized result.

    Returns:
        A dict with exactly one key (the chosen host) whose value is a
        ``HostResult`` with ``status="FAIL"``, ``tests=1`` and ``failures=1``.
    """
    # Both helpers are invoked up front, in the same order as before, so any
    # exception they raise surfaces identically.
    candidates = extract_expected_hosts(log_text)
    running = find_current_running_host(log_text, [])

    first_expected = candidates[0] if candidates else None
    chosen = running or first_expected or build_name

    failed = HostResult(
        host=chosen,
        kernel=inventory.get(chosen, "unknown"),
        status="FAIL",
        detail=reason,
        tests=1,
        failures=1,
    )
    return {chosen: failed}
|
||||
|
||||
|
||||
def infer_metadata(build_name: str, log_text: str) -> Dict[str, object]:
|
||||
try:
|
||||
extra_options = json.loads(os.environ.get("ATVM_WATCHER_EXTRA_OPTIONS", "[]"))
|
||||
@@ -2131,6 +2155,15 @@ def determine_state(
|
||||
return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
|
||||
if hang_kill_detected:
|
||||
if not parent_host_results:
|
||||
parent_host_results = synthesize_failed_host_result(
|
||||
build_name=build_name,
|
||||
log_text=log_text,
|
||||
inventory=inventory,
|
||||
reason="hang timeout killed runner",
|
||||
)
|
||||
if subrun_states:
|
||||
subrun_states[-1]["host_results"] = dict(parent_host_results)
|
||||
notes.append("Runner hang monitor terminated the run (`SIGKILL`) after inactivity.")
|
||||
notes.append("Treating this as explicit failure.")
|
||||
return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
|
||||
@@ -2146,6 +2179,15 @@ def determine_state(
|
||||
|
||||
if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds:
|
||||
if runner_result == "exit-code" or (runner_exit_code is not None and runner_exit_code != 0):
|
||||
if not parent_host_results:
|
||||
parent_host_results = synthesize_failed_host_result(
|
||||
build_name=build_name,
|
||||
log_text=log_text,
|
||||
inventory=inventory,
|
||||
reason="runner exited non-zero before host results were emitted",
|
||||
)
|
||||
if subrun_states:
|
||||
subrun_states[-1]["host_results"] = dict(parent_host_results)
|
||||
detail = f"Runner service ended with non-zero exit ({runner_exit_code})"
|
||||
if runner_active_state:
|
||||
detail += f"; state={runner_active_state}"
|
||||
|
||||
Reference in New Issue
Block a user