Fix categorized watcher subrun host recovery
This commit is contained in:
@@ -275,6 +275,14 @@ This file stores run-specific examples only when a run produced a new learning r
|
||||
- Reserve `NOTES:` for meaningful operator-facing content such as the Currents run URL, real anomalies, failure context, and important fallback behavior.
|
||||
- Do not include generic artifact-detection confirmations in the posted `NOTES:` section.
|
||||
|
||||
## Run Learning: 2026-03-27 (Categorized grouped XML may need host recovery from the subrun's per-host artifact)
|
||||
- Observed failure mode:
|
||||
- A categorized subrun can finish and write its grouped `test-result-<build>.xml`, but that XML may only contain `check-xml-files.ts`.
|
||||
- In that case the watcher may know the grouped batch completed and even know its Currents URL, but still miss the host result unless it recovers the host from the matching per-host reporter artifact.
|
||||
- Action for future runs:
|
||||
- For categorized runs, when grouped XML only shows `check-xml-files.ts`, infer the subrun host from the categorized build id and recover the result from the latest matching per-host reporter artifact within the grouped completion window.
|
||||
- Do not keep a completed grouped subrun in `RUNNING` just because the grouped XML lacked a host testcase entry.
|
||||
|
||||
## Run Learning: 2026-03-27 (Default ATVM approval should include the watcher)
|
||||
- Observed requirement:
|
||||
- The operator wants `approve` to mean run with watcher by default.
|
||||
|
||||
@@ -514,6 +514,56 @@ def collect_latest_host_reporter_artifact(
|
||||
return latest
|
||||
|
||||
|
||||
def collect_latest_group_host_reporter_artifact(
    reporter_root: Path,
    group_label: Optional[str],
    kernels: Dict[str, str],
    run_started_at: datetime,
    run_ended_at: Optional[datetime] = None,
) -> Optional[Tuple[str, HostResult]]:
    """Recover a host result for a group from its newest per-host reporter artifact.

    Walks ``reporter_root / "logs"`` and considers only sub-directories whose
    name starts with ``"atvm"`` and for which ``infer_group_from_host`` returns
    ``group_label``. Inside each such directory, every ``.txt`` / ``.json``
    entry whose modification time falls in ``[run_started_at, run_ended_at)``
    is a candidate (the upper bound is skipped when ``run_ended_at`` is falsy).

    Args:
        reporter_root: Directory that contains the ``logs`` folder.
        group_label: Group the host must belong to; falsy means "nothing to do".
        kernels: Host name -> kernel string; missing hosts are reported as
            ``"unknown"``.
        run_started_at: Inclusive lower bound for the artifact mtime.
        run_ended_at: Exclusive upper bound for the artifact mtime, if any.

    Returns:
        ``(host, HostResult)`` for the candidate with the newest mtime, or
        ``None`` when ``group_label`` is falsy, the ``logs`` directory does not
        exist, or no artifact qualifies. On equal mtimes the candidate visited
        later (hosts and artifacts are visited in sorted order) wins.

    Note:
        Artifact contents are not read here: the recovered result is always
        built with ``status="PASS"`` and ``detail="completed"``, and its
        timestamp is the artifact's mtime converted to UTC.
    """
    if not group_label:
        return None

    logs_root = reporter_root / "logs"
    if not logs_root.exists():
        return None

    accepted_suffixes = (".txt", ".json")
    # Fallback used only if a stored result carries a falsy timestamp.
    epoch = datetime.fromtimestamp(0, tz=timezone.utc)
    newest: Optional[Tuple[str, HostResult]] = None

    for host_dir in sorted(logs_root.iterdir()):
        host = host_dir.name
        belongs_to_group = (
            host_dir.is_dir()
            and host.startswith("atvm")
            and infer_group_from_host(host) == group_label
        )
        if not belongs_to_group:
            continue

        for entry in sorted(host_dir.iterdir()):
            if entry.suffix not in accepted_suffixes:
                continue

            written_at = datetime.fromtimestamp(entry.stat().st_mtime, tz=timezone.utc)
            # Keep only artifacts written inside the run window.
            if written_at < run_started_at or (run_ended_at and written_at >= run_ended_at):
                continue

            recovered = HostResult(
                host=host,
                kernel=kernels.get(host, "unknown"),
                status="PASS",
                detail="completed",
                timestamp=written_at,
            )

            # ">=" semantics: an equally recent artifact replaces the current pick.
            if newest is not None and written_at < (newest[1].timestamp or epoch):
                continue
            newest = (host, recovered)

    return newest
|
||||
|
||||
|
||||
def find_check_xml_end(
|
||||
reporter_root: Path,
|
||||
started_at: datetime,
|
||||
@@ -1003,17 +1053,29 @@ def discover_categorized_subruns(
|
||||
completed_hosts.append(host)
|
||||
check_ts = extract_check_xml_timestamp_from_file(xml_path)
|
||||
summary = completed_summaries[current_summary_index] if current_summary_index < len(completed_summaries) else None
|
||||
inferred_host = infer_host_from_subrun_build(raw_display_name, expected_hosts, completed_hosts)
|
||||
if summary and (not host_results or all(result.host == "check-xml-files" for result in host_results.values())):
|
||||
host_results = summary["host_results"]
|
||||
completed_hosts.extend([host for host in host_results if host not in completed_hosts])
|
||||
if not host_results and check_ts:
|
||||
latest_host = collect_latest_host_result(
|
||||
scoped_expected_hosts = [inferred_host] if inferred_host else expected_hosts
|
||||
latest_host = collect_latest_host_reporter_artifact(
|
||||
reporter_root=reporter_root,
|
||||
expected_hosts=expected_hosts,
|
||||
expected_hosts=scoped_expected_hosts,
|
||||
kernels=inventory,
|
||||
run_started_at=started_at,
|
||||
run_ended_at=check_ts + timedelta(seconds=5),
|
||||
)
|
||||
if not latest_host:
|
||||
display_group_match = re.search(r"-(amazonlinux|centos|ubuntu|rocky|redhat|oracle|fedora|debian|suse|windows)-batch", raw_display_name)
|
||||
display_group = display_group_match.group(1) if display_group_match else None
|
||||
latest_host = collect_latest_group_host_reporter_artifact(
|
||||
reporter_root=reporter_root,
|
||||
group_label=display_group,
|
||||
kernels=inventory,
|
||||
run_started_at=started_at,
|
||||
run_ended_at=check_ts + timedelta(seconds=5),
|
||||
)
|
||||
if latest_host:
|
||||
host, result = latest_host
|
||||
host_results = {host: result}
|
||||
@@ -1042,6 +1104,8 @@ def discover_categorized_subruns(
|
||||
notes.append("Host result details were derived from the parent categorized run log summary.")
|
||||
elif host_results and check_ts:
|
||||
notes.append("Host result details were derived from the latest matching host reporter artifact written before grouped finalization.")
|
||||
if inferred_host:
|
||||
notes.append(f"Grouped sub-run host scope was inferred as `{inferred_host}` from the categorized build id.")
|
||||
elif check_ts and not host_results and parent_active:
|
||||
notes.append("Grouped reporter XML arrived before the parent run log exposed the final host summary; waiting to post until host details are available.")
|
||||
if display_name != raw_display_name:
|
||||
|
||||
Reference in New Issue
Block a user