From 44e6e0e6539259e77757d71e767ff1b0552d9c3b Mon Sep 17 00:00:00 2001 From: "anthony.wen" Date: Thu, 26 Mar 2026 12:39:23 -0400 Subject: [PATCH] Keep categorized ATVM watcher alive until parent run finishes - update the watcher to treat categorized parent-run activity as the authoritative signal for whether the overall request is still running - prevent the watcher from exiting early just because one categorized grouped sub-run completed and wrote artifacts - document that categorized watcher instances must remain alive between grouped runs until the parent request has actually gone inactive past the grace window - update the ATVM guide, watcher design, and install docs to reflect the stricter categorized parent-run completion rule --- atvm/docs/automation/guide.md | 2 ++ .../docs/automation/mattermost-watcher-design.md | 2 ++ atvm/watcher-service/INSTALL.md | 2 ++ atvm/watcher-service/README.md | 2 ++ atvm/watcher-service/atvm_run_watcher.py | 16 +++++++++++++++- 5 files changed, 23 insertions(+), 1 deletion(-) diff --git a/atvm/docs/automation/guide.md b/atvm/docs/automation/guide.md index f484f05..3f2848d 100644 --- a/atvm/docs/automation/guide.md +++ b/atvm/docs/automation/guide.md @@ -45,6 +45,8 @@ Run ATVM CMC automation tests on the designated automation VM without unintended - Treat `approve with watcher` as approval to run and also start the per-run watcher service for that build. - When `--categorize` is used with watcher enabled, treat the watcher as a sequential grouped-run watcher: - it must post one final Mattermost status per completed categorized group/sub-run + - it must stay active between grouped sub-runs while the parent categorized request is still running + - it must not stop after the first grouped run simply because one grouped run completed - it must not wait and replace those with one single parent-only post - After execution, report immediate success/failure only. - Do not actively monitor completion unless explicitly requested. diff --git a/atvm/docs/automation/mattermost-watcher-design.md b/atvm/docs/automation/mattermost-watcher-design.md index ae1e8b4..cceb4f4 100644 --- a/atvm/docs/automation/mattermost-watcher-design.md +++ b/atvm/docs/automation/mattermost-watcher-design.md @@ -42,6 +42,8 @@ A categorized run must be treated differently: - the watcher must wait for that grouped sub-run to complete - then send that grouped sub-run's final Mattermost status - then continue watching for the next grouped sub-run +- the watcher must remain alive while the parent categorized request or related child Cypress process is still active +- one completed grouped sub-run must not be treated as proof that the parent categorized request is finished - the watcher must not wait until the very end to send one single parent-only post Evidence sources: diff --git a/atvm/watcher-service/INSTALL.md b/atvm/watcher-service/INSTALL.md index c46af09..aba3419 100644 --- a/atvm/watcher-service/INSTALL.md +++ b/atvm/watcher-service/INSTALL.md @@ -120,6 +120,7 @@ Recommended permissions: - if the run uses `--categorize`, also pass `--categorize` to the watcher start helper - confirm final Mattermost delivery for a completed run - confirm categorized execution sends one post per completed grouped sub-run + - confirm the watcher stays alive between categorized grouped runs while the parent request is still active - confirm reused parent build names do not inherit stale `cancelled.marker`, `posted.marker`, or `subruns/` state from older runs ## Recommended Validation Commands @@ -191,6 +192,7 @@ The cancel helper should: - This is not a daemon. - One watcher instance is started per ATVM run. - Categorized execution is treated as one watcher instance tracking sequential grouped ATVM sub-runs. +- In categorized execution, the watcher must remain alive until the parent request has actually gone inactive past the grace window, even if one grouped sub-run already completed. - The watcher exits after the run reaches a terminal state. - The watcher writes state under `/var/lib/atvm-run-watcher/`. - The watcher prevents duplicate Mattermost posts by writing posted markers. diff --git a/atvm/watcher-service/README.md b/atvm/watcher-service/README.md index ca8dd5b..90a4a25 100644 --- a/atvm/watcher-service/README.md +++ b/atvm/watcher-service/README.md @@ -65,6 +65,8 @@ Typical workflow: - detect each grouped sub-run in sequence from the parent run log - wait for that grouped sub-run to finish - send one Mattermost post for that grouped sub-run if it reached `COMPLETED` or `FAILED` + - keep the watcher alive while the parent categorized runner or related child Cypress process is still active + - do not treat one completed grouped sub-run as proof that the whole parent request is finished - continue to the next grouped sub-run - exit after the parent request reaches a terminal state diff --git a/atvm/watcher-service/atvm_run_watcher.py b/atvm/watcher-service/atvm_run_watcher.py index 75ca8a7..79f2332 100644 --- a/atvm/watcher-service/atvm_run_watcher.py +++ b/atvm/watcher-service/atvm_run_watcher.py @@ -108,6 +108,16 @@ def process_active(build_name: str) -> bool: return False +def related_process_active(build_name: str) -> bool: + output = run_ps() + for line in output.splitlines(): + if build_name not in line: + continue + if any(token in line for token in ("run-sorry-cypress.py", "cypress-cloud", "node ")): + return True + return False + + def extract_active_subrun_build(build_name: str) -> Optional[str]: output = run_ps() matches: List[str] = [] @@ -713,7 +723,7 @@ def determine_state( ) -> Tuple[str, List[Dict[str, object]], Dict[str, HostResult], Optional[datetime], Optional[datetime], Optional[str], List[str]]: cancelled_marker = build_dir / "cancelled.marker" log_text = read_text(run_log) - active = process_active(build_name) + active = related_process_active(build_name) if metadata.get("categorized") else process_active(build_name) cancelled = cancelled_marker.exists() notes: List[str] = [] subrun_states: List[Dict[str, object]] = [] @@ -776,6 +786,10 @@ def determine_state( return "HUNG", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes + if metadata.get("categorized") and process_gone_since and (now_utc() - process_gone_since).total_seconds() < process_exit_grace_seconds: + notes.append("Categorized parent runner has not been gone long enough to treat the request as finished.") + return "RUNNING", subrun_states, parent_host_results, start_ts, end_ts, currents_url, notes + terminal_subruns = [subrun for subrun in subrun_states if subrun["state"] in {"COMPLETED", "FAILED"}] if terminal_subruns: state = "FAILED" if any(result.failures for result in parent_host_results.values()) else "COMPLETED"