atvm: fail runs explicitly on hang-kill and runner non-zero exit

This commit is contained in:
2026-05-07 13:34:37 -04:00
parent e3497111dd
commit 65330ee9f8
3 changed files with 65 additions and 0 deletions

View File

@@ -196,6 +196,41 @@ def run_ps() -> str:
return proc.stdout
def read_runner_service_status(build_name: str) -> Tuple[Optional[str], Optional[int], Optional[str]]:
    """Query systemd for the status of a build's runner service unit.

    Runs ``systemctl show`` on ``atvm-runner@<build_name>.service`` and
    reads the ``Result``, ``ExecMainStatus`` and ``ActiveState`` properties.

    This is a best-effort probe and never raises for systemctl problems:
    if the command cannot be started, exits non-zero, or exceeds the
    5 second timeout, ``(None, None, None)`` is returned.

    Args:
        build_name: Instance name substituted into the unit template.

    Returns:
        A ``(result, exec_main_status, active_state)`` tuple. Each element
        is ``None`` when the property is missing or empty;
        ``exec_main_status`` is also ``None`` when it is not an integer.
    """
    unit = f"atvm-runner@{build_name}.service"
    try:
        proc = subprocess.run(
            # No --value: keep the KEY= prefix so each line identifies itself.
            ["systemctl", "show", unit, "--property=Result,ExecMainStatus,ActiveState"],
            capture_output=True,
            text=True,
            check=True,
            timeout=5,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: systemctl missing / not executable.
        # ValueError: argument cannot be passed to exec (e.g. NUL byte).
        # SubprocessError: non-zero exit (check=True) or timeout.
        return None, None, None

    # Look properties up by name so the outcome does not depend on the
    # order in which systemctl happens to print them, nor on all three
    # being present.
    props = {}
    for line in proc.stdout.splitlines():
        key, sep, value = line.partition("=")
        if sep:
            props[key.strip()] = value.strip()

    result = props.get("Result") or None
    active_state = props.get("ActiveState") or None

    exec_main_status: Optional[int] = None
    raw_status = props.get("ExecMainStatus")
    if raw_status:
        try:
            exec_main_status = int(raw_status)
        except ValueError:
            exec_main_status = None

    return result, exec_main_status, active_state
def detect_hang_kill(log_text: str) -> bool:
    """Return True if *log_text* contains a known hang-kill marker string.

    The check is a plain, case-sensitive substring search for either of
    the two marker messages below.
    """
    for needle in (
        "Sending SIGKILL to cy2 command process group due to no change",
        "Max hang retries reached.",
    ):
        if needle in log_text:
            return True
    return False
def normalize_logged_command(raw: str, command_name: str) -> Optional[str]:
patterns = {
"cmc-templates.py": r"((?:python3?\s+)?(?:\./)?cmc-templates\.py\b.*)",
@@ -2081,6 +2116,8 @@ def determine_state(
start_ts = min(parent_start_candidates) if parent_start_candidates else started_at
end_ts = max(parent_end_candidates) if parent_end_candidates else find_check_xml_end(reporter_root, started_at)
currents_url = extract_currents_url(log_text) or latest_currents_url(build_dir)
hang_kill_detected = detect_hang_kill(log_text)
runner_result, runner_exit_code, runner_active_state = read_runner_service_status(build_name)
if cancelled:
notes.append("Cancellation marker detected.")
@@ -2093,6 +2130,11 @@ def determine_state(
return "HUNG", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
if hang_kill_detected:
notes.append("Runner hang monitor terminated the run (`SIGKILL`) after inactivity.")
notes.append("Treating this as explicit failure.")
return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
if metadata.get("categorized") and process_gone_since and (now_utc() - process_gone_since).total_seconds() < process_exit_grace_seconds:
notes.append("Categorized parent runner has not been gone long enough to treat the request as finished.")
return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
@@ -2103,6 +2145,13 @@ def determine_state(
return state, subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds:
if runner_result == "exit-code" or (runner_exit_code is not None and runner_exit_code != 0):
detail = f"Runner service ended with non-zero exit ({runner_exit_code})"
if runner_active_state:
detail += f"; state={runner_active_state}"
notes.append(detail + ".")
notes.append("Treating this as explicit failure.")
return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes
notes.append("Run process exited without a clean completion signal.")
return "TERMINATED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes