From 65330ee9f8a4eeb74d73544bd08fb2f7c09f4e65 Mon Sep 17 00:00:00 2001 From: "anthony.wen" Date: Thu, 7 May 2026 13:34:37 -0400 Subject: [PATCH] atvm: fail runs explicitly on hang-kill and runner non-zero exit --- atvm/docs/automation/guide.md | 2 + atvm/docs/automation/run-learnings.md | 14 +++++++ atvm/watcher-service/atvm_run_watcher.py | 49 ++++++++++++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/atvm/docs/automation/guide.md b/atvm/docs/automation/guide.md index c4742e7..c49a45d 100644 --- a/atvm/docs/automation/guide.md +++ b/atvm/docs/automation/guide.md @@ -63,6 +63,8 @@ Run ATVM CMC automation tests on the designated automation VM without unintended - If monitoring is requested, allow long runtime windows (15-30+ minutes) and continue until completion unless operator instructs otherwise. - Report command errors immediately. - `sshpass` may be used where password-based SSH automation is required. +- Treat runner hang-kill events (`Sending SIGKILL ... due to no change` / `Max hang retries reached`) as explicit `FAILED` outcomes, not `RUNNING` or ambiguous termination. +- For manual `run-sorry-cypress.py` execution, treat `ATVM_HANG_FAIL ...` log markers and `/tmp/atvm-runner-state-.json` terminal state files as the source of truth for hang-failure terminal status. ## Core Scripts - Template prep: `/root/cdc-e2e-cyp-12.17.4/cmc-templates.py` diff --git a/atvm/docs/automation/run-learnings.md b/atvm/docs/automation/run-learnings.md index dae998a..b9d5a90 100644 --- a/atvm/docs/automation/run-learnings.md +++ b/atvm/docs/automation/run-learnings.md @@ -33,6 +33,20 @@ This file stores run-specific examples only when a run produced a new learning r - For Windows-involved ATVM automation runs, add `--hang_retries 0` to `run-sorry-cypress.py` by default unless the operator explicitly requests a different value. - Keep this as an operator-default behavior even though the underlying runner option is generic and not Windows-only in code. +## Run Learning: 2026-05-07 (Treat hang-kill as explicit failure) +- Observed failure mode: + - A run can stall long enough for `run-sorry-cypress.py` to force-kill Cypress (`Sending SIGKILL ... due to no change`) and still be reported as an ambiguous terminated state. +- Action for future runs: + - When run logs contain hang-kill markers (`Sending SIGKILL ... due to no change` and `Max hang retries reached.`), classify the run as `FAILED`. + - When the runner service exits non-zero, classify the run as `FAILED` instead of generic terminated. + +## Run Learning: 2026-05-07 (Manual runner emits explicit hang-fail markers and terminal state) +- Observed failure mode: + - Manual `run-sorry-cypress.py` execution can appear "still running" after hang-kill handling because failure state was not emitted in a machine-readable terminal marker. +- Action for future runs: + - `run-sorry-cypress.py` now emits `ATVM_HANG_FAIL ...` on hang-kill paths and writes terminal state JSON under `/tmp/atvm-runner-state-.json`. + - Max hang-retry exhaustion now writes terminal failure state before exiting non-zero, including categorized and non-categorized flows. + ## Run Learning: 2026-05-02 (Do not reuse the previous controller status check for a new ATVM request) - Observed failure mode: - A later ATVM run request was blocked because the assistant reused the immediately previous controller status result instead of performing a fresh live running-state check at request time. diff --git a/atvm/watcher-service/atvm_run_watcher.py b/atvm/watcher-service/atvm_run_watcher.py index d6cc135..8ab4b69 100644 --- a/atvm/watcher-service/atvm_run_watcher.py +++ b/atvm/watcher-service/atvm_run_watcher.py @@ -196,6 +196,41 @@ def run_ps() -> str: return proc.stdout +def read_runner_service_status(build_name: str) -> Tuple[Optional[str], Optional[int], Optional[str]]: + unit = f"atvm-runner@{build_name}.service" + try: + proc = subprocess.run( + ["systemctl", "show", unit, "--property=Result,ExecMainStatus,ActiveState", "--value"], + capture_output=True, + text=True, + check=True, + timeout=5, + ) + except Exception: + return None, None, None + + lines = [line.strip() for line in proc.stdout.splitlines()] + if len(lines) < 3: + return None, None, None + result = lines[0] or None + exec_main_status: Optional[int] = None + if lines[1]: + try: + exec_main_status = int(lines[1]) + except ValueError: + exec_main_status = None + active_state = lines[2] or None + return result, exec_main_status, active_state + + +def detect_hang_kill(log_text: str) -> bool: + markers = ( + "Sending SIGKILL to cy2 command process group due to no change", + "Max hang retries reached.", + ) + return any(marker in log_text for marker in markers) + + def normalize_logged_command(raw: str, command_name: str) -> Optional[str]: patterns = { "cmc-templates.py": r"((?:python3?\s+)?(?:\./)?cmc-templates\.py\b.*)", @@ -2081,6 +2116,8 @@ def determine_state( start_ts = min(parent_start_candidates) if parent_start_candidates else started_at end_ts = max(parent_end_candidates) if parent_end_candidates else find_check_xml_end(reporter_root, started_at) currents_url = extract_currents_url(log_text) or latest_currents_url(build_dir) + hang_kill_detected = detect_hang_kill(log_text) + runner_result, runner_exit_code, runner_active_state = read_runner_service_status(build_name) if cancelled: notes.append("Cancellation marker detected.") @@ -2093,6 +2130,11 @@ def determine_state( return "HUNG", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes + if hang_kill_detected: + notes.append("Runner hang monitor terminated the run (`SIGKILL`) after inactivity.") + notes.append("Treating this as explicit failure.") + return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes + if metadata.get("categorized") and process_gone_since and (now_utc() - process_gone_since).total_seconds() < process_exit_grace_seconds: notes.append("Categorized parent runner has not been gone long enough to treat the request as finished.") return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes @@ -2103,6 +2145,13 @@ def determine_state( return state, subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes if process_gone_since and (now_utc() - process_gone_since).total_seconds() >= process_exit_grace_seconds: + if runner_result == "exit-code" or (runner_exit_code is not None and runner_exit_code != 0): + detail = f"Runner service ended with non-zero exit ({runner_exit_code})" + if runner_active_state: + detail += f"; state={runner_active_state}" + notes.append(detail + ".") + notes.append("Treating this as explicit failure.") + return "FAILED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes notes.append("Run process exited without a clean completion signal.") return "TERMINATED", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes