| import argparse |
| import json |
| import math |
| import os |
| import time |
| import traceback |
| import zipfile |
| from collections import Counter |
|
|
| import requests |
|
|
|
|
def get_jobs(workflow_run_id, token=None):
    """Extract the jobs in a GitHub Actions workflow run.

    Args:
        workflow_run_id (`str`): The id of the workflow run.
        token (`str`, *optional*): A GitHub token with `actions:read` permission. Without it,
            the request is sent unauthenticated (subject to stricter rate limits).

    Returns:
        `list[dict]`: The job payloads returned by the GitHub API, or an empty list if any
        request or response parsing fails.
    """
    headers = None
    if token is not None:
        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{workflow_run_id}/jobs?per_page=100"
    # A timeout prevents the script from hanging forever if GitHub does not answer.
    result = requests.get(url, headers=headers, timeout=60).json()
    jobs = []

    try:
        jobs.extend(result["jobs"])
        # The first page (up to 100 jobs) is already fetched; compute how many extra pages remain.
        pages_to_iterate_over = math.ceil((result["total_count"] - 100) / 100)

        for i in range(pages_to_iterate_over):
            # Page numbering is 1-based and page 1 was already fetched above.
            result = requests.get(url + f"&page={i + 2}", headers=headers, timeout=60).json()
            jobs.extend(result["jobs"])

        return jobs
    except Exception:
        # Fixed copy-paste from `get_job_links`: this function fetches jobs, not links.
        print(f"Unknown error, could not fetch jobs:\n{traceback.format_exc()}")

    return []
|
|
|
|
def get_job_links(workflow_run_id, token=None):
    """Extract job names and their job links in a GitHub Actions workflow run.

    Args:
        workflow_run_id (`str`): The id of the workflow run.
        token (`str`, *optional*): A GitHub token with `actions:read` permission. Without it,
            the request is sent unauthenticated (subject to stricter rate limits).

    Returns:
        `dict[str, str]`: A mapping from job name to its `html_url`, or an empty dict if any
        request or response parsing fails.
    """
    headers = None
    if token is not None:
        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{workflow_run_id}/jobs?per_page=100"
    # A timeout prevents the script from hanging forever if GitHub does not answer.
    result = requests.get(url, headers=headers, timeout=60).json()
    job_links = {}

    try:
        job_links.update({job["name"]: job["html_url"] for job in result["jobs"]})
        # The first page (up to 100 jobs) is already fetched; compute how many extra pages remain.
        pages_to_iterate_over = math.ceil((result["total_count"] - 100) / 100)

        for i in range(pages_to_iterate_over):
            # Page numbering is 1-based and page 1 was already fetched above.
            result = requests.get(url + f"&page={i + 2}", headers=headers, timeout=60).json()
            job_links.update({job["name"]: job["html_url"] for job in result["jobs"]})

        return job_links
    except Exception:
        print(f"Unknown error, could not fetch links:\n{traceback.format_exc()}")

    return {}
|
|
|
|
def get_artifacts_links(worflow_run_id, token=None):
    """Get all artifact links from a workflow run.

    Args:
        worflow_run_id (`str`): The id of the workflow run. (NOTE: parameter name keeps the
            historical `worflow` typo for backward compatibility with keyword callers.)
        token (`str`, *optional*): A GitHub token with `actions:read` permission. Without it,
            the request is sent unauthenticated (subject to stricter rate limits).

    Returns:
        `dict[str, str]`: A mapping from artifact name to its `archive_download_url`, or an
        empty dict if any request or response parsing fails.
    """
    headers = None
    if token is not None:
        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

    url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{worflow_run_id}/artifacts?per_page=100"
    # A timeout prevents the script from hanging forever if GitHub does not answer.
    result = requests.get(url, headers=headers, timeout=60).json()
    artifacts = {}

    try:
        artifacts.update({artifact["name"]: artifact["archive_download_url"] for artifact in result["artifacts"]})
        # The first page (up to 100 artifacts) is already fetched; compute how many extra pages remain.
        pages_to_iterate_over = math.ceil((result["total_count"] - 100) / 100)

        for i in range(pages_to_iterate_over):
            # Page numbering is 1-based and page 1 was already fetched above.
            result = requests.get(url + f"&page={i + 2}", headers=headers, timeout=60).json()
            artifacts.update({artifact["name"]: artifact["archive_download_url"] for artifact in result["artifacts"]})

        return artifacts
    except Exception:
        print(f"Unknown error, could not fetch links:\n{traceback.format_exc()}")

    return {}
|
|
|
|
def download_artifact(artifact_name, artifact_url, output_dir, token):
    """Download a GitHub Action artifact from a URL.

    The URL is of the form `https://api.github.com/repos/huggingface/transformers/actions/artifacts/{ARTIFACT_ID}/zip`,
    but it can't be used to download directly. We need to get a redirect URL first.
    See https://docs.github.com/en/rest/actions/artifacts#download-an-artifact

    Args:
        artifact_name (`str`): Name used for the local `<artifact_name>.zip` file.
        artifact_url (`str`): The API archive-download URL of the artifact.
        output_dir (`str`): Directory in which to save the zip file.
        token (`str` or `None`): A GitHub token with `actions:read` permission.
    """
    headers = None
    if token is not None:
        headers = {"Accept": "application/vnd.github+json", "Authorization": f"Bearer {token}"}

    # The API answers with a redirect; read the real download location from the `Location`
    # header instead of following it, so the auth header is not forwarded to the CDN.
    # A timeout prevents the script from hanging forever if GitHub does not answer.
    result = requests.get(artifact_url, headers=headers, allow_redirects=False, timeout=60)
    download_url = result.headers["Location"]
    response = requests.get(download_url, allow_redirects=True, timeout=60)
    file_path = os.path.join(output_dir, f"{artifact_name}.zip")
    with open(file_path, "wb") as fp:
        fp.write(response.content)
|
|
|
|
def get_errors_from_single_artifact(artifact_zip_path, job_links=None):
    """Extract errors from a downloaded artifact (in .zip format).

    Args:
        artifact_zip_path (`str`): Path to the artifact zip, expected to contain
            `failures_line.txt`, `summary_short.txt` and/or `job_name.txt`.
        job_links (`dict[str, str]`, *optional*): Mapping from job name to job URL, used to
            attach a link to each extracted error.

    Returns:
        `list[list]`: One entry per failure, of the form `[error_line, error, failed_test, job_link]`.

    Raises:
        ValueError: If the number of parsed errors and failed tests differ.
    """
    errors = []
    failed_tests = []
    job_name = None

    with zipfile.ZipFile(artifact_zip_path) as z:
        for member in z.infolist():
            # Skip directory entries *inside the archive*. The previous check used
            # `os.path.isdir(filename)`, which inspects the local filesystem and never
            # looks at the zip itself; `ZipInfo.is_dir()` is the correct test.
            if member.is_dir():
                continue
            filename = member.filename
            if filename in ["failures_line.txt", "summary_short.txt", "job_name.txt"]:
                with z.open(filename) as f:
                    for line in f:
                        line = line.decode("UTF-8").strip()
                        if filename == "failures_line.txt":
                            try:
                                # `line` looks like `<location>: <error message>`; split on
                                # the first `": "` occurrence.
                                error_line = line[: line.index(": ")]
                                error = line[line.index(": ") + len(": ") :]
                                errors.append([error_line, error])
                            except Exception:
                                # Skip un-parsable lines (no `": "` separator).
                                pass
                        elif filename == "summary_short.txt" and line.startswith("FAILED "):
                            # `line` looks like `FAILED <test path>`
                            test = line[len("FAILED ") :]
                            failed_tests.append(test)
                        elif filename == "job_name.txt":
                            job_name = line

    if len(errors) != len(failed_tests):
        raise ValueError(
            f"`errors` and `failed_tests` should have the same number of elements. Got {len(errors)} for `errors` "
            f"and {len(failed_tests)} for `failed_tests` instead. The test reports in {artifact_zip_path} have some"
            " problem."
        )

    job_link = None
    if job_name and job_links:
        job_link = job_links.get(job_name, None)

    # Pair each error with its failed test (same order) and the job link.
    result = [x + [y] + [job_link] for x, y in zip(errors, failed_tests)]

    return result
|
|
|
|
def get_all_errors(artifact_dir, job_links=None):
    """Collect the errors from every `.zip` artifact found in `artifact_dir`."""
    all_errors = []
    for entry in os.listdir(artifact_dir):
        if entry.endswith(".zip"):
            archive_path = os.path.join(artifact_dir, entry)
            all_errors.extend(get_errors_from_single_artifact(archive_path, job_links=job_links))

    return all_errors
|
|
|
|
def reduce_by_error(logs, error_filter=None):
    """Aggregate log entries by error message, most frequent error first.

    Each log entry is `[error_line, error, failed_test, ...]`; errors listed in
    `error_filter` are dropped from the result.
    """
    tally = Counter(entry[1] for entry in logs)

    reduced = {}
    for error, count in tally.most_common():
        if error_filter is not None and error in error_filter:
            continue
        reduced[error] = {
            "count": count,
            "failed_tests": [(entry[2], entry[0]) for entry in logs if entry[1] == error],
        }

    # Order entries by descending occurrence count.
    return dict(sorted(reduced.items(), key=lambda item: item[1]["count"], reverse=True))
|
|
|
|
def get_model(test):
    """Return the model name for a test under `tests/models/`, otherwise `None`."""
    path = test.split("::")[0]
    if not path.startswith("tests/models/"):
        return None

    # `tests/models/<model>/...` -> `<model>`
    return path.split("/")[2]
|
|
|
|
def reduce_by_model(logs, error_filter=None):
    """Aggregate errors per model, model with the most errors first.

    Each log entry is `[error_line, error, failed_test, ...]`; entries whose test does not
    belong to a model (per `get_model`) are ignored, as are errors in `error_filter`.
    """
    # Replace the test path with its model name, dropping entries without one.
    annotated = [(entry[0], entry[1], get_model(entry[2])) for entry in logs]
    annotated = [entry for entry in annotated if entry[2] is not None]

    reduced = {}
    for model in {entry[2] for entry in annotated}:
        # Count the (non-filtered) errors seen for this model, most frequent first.
        tally = Counter(error for _, error, m in annotated if m == model)
        error_counts = {
            error: count
            for error, count in tally.most_common()
            if error_filter is None or error not in error_filter
        }
        total = sum(error_counts.values())
        if total > 0:
            reduced[model] = {"count": total, "errors": error_counts}

    # Order models by descending total error count.
    return dict(sorted(reduced.items(), key=lambda item: item[1]["count"], reverse=True))
|
|
|
|
def make_github_table(reduced_by_error):
    """Render the per-error summary as a GitHub markdown table.

    Columns: occurrence count, error message (truncated to 100 chars), empty status column.
    """
    rows = ["| no. | error | status |", "|-:|:-|:-|"]
    for error, info in reduced_by_error.items():
        rows.append(f"| {info['count']} | {error[:100]} | |")

    return "\n".join(rows)
|
|
|
|
def make_github_table_per_model(reduced_by_model):
    """Render the per-model summary as a GitHub markdown table.

    Columns: model name, total error count, most frequent error (truncated to 60 chars)
    and its count. The first key of `errors` is assumed to be the most frequent one.
    """
    rows = ["| model | no. of errors | major error | count |", "|-:|-:|-:|-:|"]
    for model, info in reduced_by_model.items():
        top_error, top_count = next(iter(info["errors"].items()))
        rows.append(f"| {model} | {info['count']} | {top_error[:60]} | {top_count} |")

    return "\n".join(rows)
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--workflow_run_id", type=str, required=True, help="A GitHub Actions workflow run id.")
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Where to store the downloaded artifacts and other result files.",
    )
    parser.add_argument("--token", default=None, type=str, help="A token that has actions:read permission.")
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    _job_links = get_job_links(args.workflow_run_id, token=args.token)
    job_links = {}
    if _job_links:
        for name, link in _job_links.items():
            # Jobs triggered through `workflow_call` are named `<caller> / <callee>`;
            # keep only the part after the first ` / ` separator.
            _, sep, remainder = name.partition(" / ")
            if sep:
                name = remainder
            job_links[name] = link
        with open(os.path.join(args.output_dir, "job_links.json"), "w", encoding="UTF-8") as fp:
            json.dump(job_links, fp, ensure_ascii=False, indent=4)

    artifacts = get_artifacts_links(args.workflow_run_id, token=args.token)
    with open(os.path.join(args.output_dir, "artifacts.json"), "w", encoding="UTF-8") as fp:
        json.dump(artifacts, fp, ensure_ascii=False, indent=4)

    for name, url in artifacts.items():
        download_artifact(name, url, args.output_dir, args.token)
        # Pause between downloads to stay clear of GitHub's rate limiting.
        time.sleep(1)

    errors = get_all_errors(args.output_dir, job_links=job_links)

    # Print the 30 most frequent error messages for a quick console overview.
    counter = Counter(e[1] for e in errors)
    for item in counter.most_common(30):
        print(item)

    with open(os.path.join(args.output_dir, "errors.json"), "w", encoding="UTF-8") as fp:
        json.dump(errors, fp, ensure_ascii=False, indent=4)

    reduced_by_error = reduce_by_error(errors)
    reduced_by_model = reduce_by_model(errors)

    s1 = make_github_table(reduced_by_error)
    s2 = make_github_table_per_model(reduced_by_model)

    with open(os.path.join(args.output_dir, "reduced_by_error.txt"), "w", encoding="UTF-8") as fp:
        fp.write(s1)
    with open(os.path.join(args.output_dir, "reduced_by_model.txt"), "w", encoding="UTF-8") as fp:
        fp.write(s2)
|