Infer categorized watcher group names from actual host execution

- update the watcher to stop trusting misleading categorized child build labels when they do not match the host/spec actually being executed
- infer the reported categorized group name from the actual host being run, so mismatched labels like ubuntu-batch for a Red Hat host are corrected in status reporting
- document the categorized watcher workaround in the ATVM guide, watcher design, and watcher README without changing the underlying ATVM runner scripts
This commit is contained in:
2026-03-26 14:20:22 -04:00
parent 7d49896ac2
commit f5eb21cccd
4 changed files with 49 additions and 5 deletions

View File

@@ -47,6 +47,7 @@ Run ATVM CMC automation tests on the designated automation VM without unintended
- it must post one final Mattermost status per completed categorized group/sub-run
- it must stay active between grouped sub-runs while the parent categorized request is still running
- it must not stop after the first grouped run simply because one grouped run completed
- if the child build id label does not match the actual host/spec being executed, report the grouped run using the inferred host-based group instead of the raw child build id label
- it must not wait and replace the per-group status posts with one single parent-only post
- After execution, report immediate success/failure only.
- Do not actively monitor completion unless explicitly requested.

View File

@@ -44,6 +44,7 @@ A categorized run must be treated differently:
- then continue watching for the next grouped sub-run
- the watcher must remain alive while the parent categorized request or related child Cypress process is still active
- one completed grouped sub-run must not be treated as proof that the parent categorized request is finished
- if the child build id label does not match the actual host/spec being executed, the watcher must infer the real group from host execution and use that inferred group for reporting
- the watcher must not wait until the very end to send one single parent-only post
Evidence sources:

View File

@@ -128,5 +128,6 @@ This writes a cancellation marker, updates `state.json` to `CANCELLED`, and stop
- Kernel values are resolved from `atvm/inventory/vm-inventory.md`.
- Categorized execution is treated as sequential grouped ATVM sub-runs, not as one parent run with internal phases.
- In categorized mode, the watcher writes per-subrun state under `subruns/` and posts each completed grouped run separately.
- In categorized mode, if the child build id label does not match the host/spec actually being executed, the watcher reports the grouped run using the inferred host-based group name instead of trusting the raw child build id label.
- Best-practice controller install path: `/opt/atvm-watcher-service`.
- This package is local-only right now. Nothing here is installed on the controller yet.

View File

@@ -553,6 +553,38 @@ def infer_group_label(hosts: List[str], index: int) -> str:
return "-".join(labels) if labels else f"group{index}"
def infer_group_from_host(host: str) -> str:
    """Map a host name to the categorized group name used in reporting.

    The portion of ``host`` after its first hyphen (or the whole name when
    there is no hyphen) is lower-cased and matched against known OS
    prefixes. When nothing matches, that lower-cased portion is returned
    as-is, or ``"group"`` if it is empty.
    """
    _head, hyphen, tail = host.partition("-")
    name = (tail if hyphen else host).lower()

    # Checked in order; the first matching prefix decides the group.
    known_prefixes = (
        ("w2k", "windows"),
        ("amazonlinux", "amazonlinux"),
        ("centos", "centos"),
        ("ubuntu", "ubuntu"),
        ("rocky", "rocky"),
        ("redhat", "redhat"),
        ("oracle", "oracle"),
        ("fedora", "fedora"),
        ("debian", "debian"),
        ("suse", "suse"),
    )
    for prefix, group_name in known_prefixes:
        if name.startswith(prefix):
            return group_name

    if name:
        return name
    return "group"
def corrected_categorized_display_name(raw_display_name: str, hosts: List[str]) -> str:
    """Return the child build label with its OS group replaced by the inferred one.

    Args:
        raw_display_name: The child build label as reported.
        hosts: Host names seen for this grouped run. Only the first entry
            is used to infer the group.

    Returns:
        ``raw_display_name`` unchanged when ``hosts`` is empty or when the
        label contains no ``-<known-os>-batch`` segment; otherwise the label
        with every such segment rewritten to ``-<inferred-group>-batch``.
    """
    if not hosts:
        return raw_display_name
    group = infer_group_from_host(hosts[0])
    # Use a callable replacement so the inferred group is inserted literally.
    # A plain replacement string is parsed as a template, so a backslash in
    # an unrecognized host-derived group would be treated as an escape or
    # group reference and could raise re.error or corrupt the label.
    return re.sub(
        r"-(amazonlinux|centos|ubuntu|rocky|redhat|oracle|fedora|debian|suse|windows|w2k)-batch",
        lambda _match: f"-{group}-batch",
        raw_display_name,
    )
def extract_segment_build_name(segment_text: str, parent_build_name: str) -> Optional[str]:
patterns = [
rf"({re.escape(parent_build_name)}-[A-Za-z0-9_.-]*batch\d+_\d+)",
@@ -702,8 +734,8 @@ def discover_categorized_subruns(
xml_mtime = datetime.fromtimestamp(xml_path.stat().st_mtime, tz=timezone.utc)
if xml_mtime < started_at:
continue
display_name = xml_path.stem[len("test-result-"):]
discovered_builds.append(display_name)
raw_display_name = xml_path.stem[len("test-result-"):]
discovered_builds.append(raw_display_name)
parsed = parse_host_xml(xml_path)
host_results: Dict[str, HostResult] = {}
if parsed:
@@ -721,13 +753,16 @@ def discover_categorized_subruns(
state = "RUNNING"
if cancelled:
state = "CANCELLED"
elif check_ts or display_name != current_subrun_build or not parent_active:
elif check_ts or raw_display_name != current_subrun_build or not parent_active:
state = "FAILED" if any(result.failures for result in host_results.values()) else "COMPLETED"
display_name = corrected_categorized_display_name(raw_display_name, list(host_results))
notes = [f"Categorized sub-run discovered from reporter file `{xml_path.name}`."]
if check_ts:
notes.append("Final `check-xml-files.ts` validation passed.")
if summary and host_results:
notes.append("Host result details were derived from the parent categorized run log summary.")
if display_name != raw_display_name:
notes.append(f"Child build id was reported as `{raw_display_name}`, but the actual grouped run was inferred from host execution as `{display_name}`.")
if cancelled:
notes.append("Cancellation marker detected.")
end_ts = check_ts or next((result.timestamp for result in host_results.values() if result.timestamp), xml_mtime)
@@ -766,16 +801,22 @@ def discover_categorized_subruns(
status="RUN",
detail="in progress",
)
display_name = corrected_categorized_display_name(current_subrun_build, [current_host] if current_host else [])
notes = ["Active categorized sub-run inferred from live `--ci-build-id` process state."]
if display_name != current_subrun_build:
notes.append(f"Child build id is `{current_subrun_build}`, but the actual grouped run was inferred from host execution as `{display_name}`.")
if cancelled:
notes.append("Cancellation marker detected.")
subrun_states.append(
{
"key": sanitize_key(current_subrun_build),
"display_name": current_subrun_build,
"display_name": display_name,
"state": "CANCELLED" if cancelled else "RUNNING",
"host_results": host_results,
"start_ts": started_at,
"end_ts": None,
"currents_url": None,
"notes": ["Active categorized sub-run inferred from live `--ci-build-id` process state."] + (["Cancellation marker detected."] if cancelled else []),
"notes": notes,
}
)