| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
"""
Run benchmark using the `optimum-benchmark` library with some customization in `transformers`.

Assume we are under `transformers` root directory: (make sure the commits are valid commits)
```bash
python benchmark/benchmark.py --config-dir benchmark/config --config-name generation --commit=9b9c7f03da625b13643e99205c691fe046461724 --metrics=decode.latency.mean,per_token.latency.mean,per_token.throughput.value backend.model=google/gemma-2b benchmark.input_shapes.sequence_length=5,7 benchmark.input_shapes.batch_size=1,2 --multirun
```
"""
|
|
| import argparse |
| import glob |
| import json |
| import os.path |
| import re |
| import tempfile |
| from contextlib import contextmanager |
| from pathlib import Path |
|
|
| from git import Repo |
|
|
| from huggingface_hub import HfApi |
|
|
| from optimum_benchmark import Benchmark |
| from optimum_benchmark_wrapper import main |
|
|
|
|
# Root of the `transformers` checkout: this file lives in `<root>/benchmark/`, so two `.parent` hops.
PATH_TO_REPO = Path(__file__).parent.parent.resolve()
|
|
|
|
@contextmanager
def checkout_commit(repo: Repo, commit_id: str):
    """
    Context manager that checks out a given commit when entered, but gets back to the reference it was at on exit.

    Args:
        repo (`git.Repo`): A git repository (for instance the Transformers repo).
        commit_id (`str`): The commit reference to checkout inside the context manager.
    """
    # Remember where HEAD points right now so we can restore it afterwards:
    # a branch reference when on a branch, the bare commit when detached.
    original_ref = repo.head.ref if not repo.head.is_detached else repo.head.commit

    try:
        repo.git.checkout(commit_id)
        yield
    finally:
        # Always go back to the starting reference, even if the body raised.
        repo.git.checkout(original_ref)
|
|
|
|
def summarize(run_dir, metrics, expand_metrics=False):
    """Produce a summary for each optimum-benchmark launched job's output directory found in `run_dir`.

    Each summary's format is as follows (for `expand_metrics=False`):
    ```
    {
        "model": "google/gemma-2b",
        "commit": "3cd6ed22e4d49219f300f5055e71e3929aba20d7",
        "config": "benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5",
        "metrics": {
            "decode.latency.mean": 1.624666809082031,
            "per_token.latency.mean": 0.012843788806628804,
            "per_token.throughput.value": 77.85864553330948
        }
    }
    ```

    Args:
        run_dir (`str`): Directory tree to scan (recursively) for `benchmark_report.json` files.
        metrics (`list[str]`): Dotted metric paths (e.g. `"decode.latency.mean"`) to extract
            from each report.
        expand_metrics (`bool`, defaults to `False`): If `True`, metrics are stored as a nested
            dict mirroring the dotted path instead of flat `"a.b.c"` keys.

    Returns:
        `list[dict]`: one summary per job directory; each summary is also written to
        `summary.json` inside its own report directory.
    """
    # Every directory containing a `benchmark_report.json` is one launched job's output.
    reports = glob.glob(os.path.join(run_dir, "**/benchmark_report.json"), recursive=True)
    report_dirs = [str(Path(report).parent) for report in reports]

    summaries = []
    for report_dir in report_dirs:
        # The commit sha is encoded in the path as `.../commit=<sha>/...` (see `__main__`).
        commit = re.search(r"/commit=([^/]+)", report_dir).groups()[0]

        # Skip jobs that crashed before writing their benchmark config/result file.
        if not os.path.isfile(os.path.join(report_dir, "benchmark.json")):
            continue
        benchmark = Benchmark.from_json(os.path.join(report_dir, "benchmark.json"))
        report = benchmark.report

        model = benchmark.config.backend["model"]

        # Derive a human-readable config name from the directory name by stripping the
        # `backend.model=...` fragment; fall back to the benchmark's own config name when
        # the remaining component is just the commit directory.
        benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
        benchmark_name = str(Path(benchmark_name).parts[-1])
        if benchmark_name.startswith("commit="):
            benchmark_name = benchmark.config.name

        metrics_values = {}
        # Walk each dotted metric path down into the report dict.
        for metric in metrics:
            keys = metric.split(".")
            value = report.to_dict()
            current = metrics_values
            for key in keys:
                # Avoid KeyError when a user-specified metric has a typo.
                # NOTE(review): `continue` skips only this key, not the whole metric, so a
                # missing intermediate key leaves `value` as the last dict reached — in the
                # non-expanded case that partial dict is then stored below. Presumably a
                # best-effort choice upstream; confirm before changing.
                if key not in value:
                    continue
                value = value[key]

                if expand_metrics:
                    # Rebuild the nested structure: descend for dicts, assign at the leaf.
                    if isinstance(value, dict):
                        if key not in current:
                            current[key] = {}
                        current = current[key]
                    else:
                        current[key] = value

            # Flat mode: one `"a.b.c" -> leaf value` entry per requested metric.
            if not expand_metrics:
                metrics_values[metric] = value

        # Show the selected information for this job on stdout.
        print(f"model: {model}")
        print(f"commit: {commit}")
        print(f"config: {benchmark_name}")
        if len(metrics_values) > 0:
            print("metrics:")
            if expand_metrics:
                print(metrics_values)
            else:
                for metric, value in metrics_values.items():
                    print(f" - {metric}: {value}")
        print("-" * 80)

        summary = {
            "model": model,
            "commit": commit,
            "config": benchmark_name,
            "metrics": metrics_values,
        }
        summaries.append(summary)

        # Persist the per-job summary next to its report.
        with open(os.path.join(report_dir, "summary.json"), "w") as fp:
            json.dump(summary, fp, indent=4)

    return summaries
|
|
|
|
def combine_summaries(summaries, save_dir=None):
    """Combine a list of summaries obtained from the function `summarize`.

    The combined summary's format is as follows:
    ```
    "google/gemma-2b": {
        "benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5": {
            "3cd6ed22e4d49219f300f5055e71e3929aba20d7": {
                "metrics": {"decode.latency.mean": 1.624666809082031}
            },
            "c97ee28b117c0abe8e08891f402065e4df6d72aa": {
                "metrics": {"decode.latency.mean": 1.6278163452148438}
            }
        }
    }
    ```

    Args:
        summaries (`list[dict]`): Summary dicts as produced by `summarize`, each with the keys
            `model`, `config`, `commit` and `metrics`.
        save_dir (`str`, *optional*): Directory in which to write the combined `summary.json`.
            If not given, falls back to the module-level `exp_run_dir` set by the `__main__`
            block; when neither is available, nothing is written to disk.

    Returns:
        `dict`: nested mapping `model -> config -> commit -> {"metrics": ...}`. The first
        summary wins when the same (model, config, commit) triple appears more than once.
    """
    combined = {}
    for summary in summaries:
        # Build the nested model -> config -> commit structure in one pass;
        # `setdefault` on the commit level keeps the first occurrence, matching
        # the original `if commit not in ...` guard.
        model_entry = combined.setdefault(summary["model"], {})
        config_entry = model_entry.setdefault(summary["config"], {})
        config_entry.setdefault(summary["commit"], {"metrics": summary["metrics"]})

    # Bug fix: the original referenced the global `exp_run_dir` directly, which is only
    # defined when this file runs as a script — importing and calling this function from
    # elsewhere raised NameError. Keep the global as a fallback for backward compatibility.
    if save_dir is None:
        save_dir = globals().get("exp_run_dir")
    if save_dir is not None:
        with open(os.path.join(save_dir, "summary.json"), "w") as fp:
            json.dump(combined, fp, indent=4)

    print(json.dumps(combined, indent=4))

    return combined
|
|
|
|
if __name__ == "__main__":

    def list_str(values):
        # argparse `type` converter: turn "a,b,c" into ["a", "b", "c"].
        return values.split(",")

    parser = argparse.ArgumentParser()

    parser.add_argument("--config-dir", type=str, required=True, help="The path to the config directory.")
    parser.add_argument("--config-name", type=str, required=True, help="The config name.")

    # Arguments specific to this wrapper (everything unknown is forwarded to optimum-benchmark).
    # NOTE(review): `type=bool` is an argparse foot-gun — any non-empty string value
    # (e.g. `--ensure_empty False`) parses as True; confirm this flag is only ever
    # used with its default or as `--ensure_empty ""`.
    parser.add_argument("--ensure_empty", type=bool, default=True, help="If to create a temporary directory.")
    parser.add_argument(
        "--commit",
        type=list_str,
        default="",
        help="Comma-separated list of branch names and/or commit sha values on which the benchmark will run. If `diff` is specified, it will run on both the current head and the `main` branch.",
    )
    parser.add_argument("--metrics", type=str, help="The metrics to be included in the summary.")

    # Optional Hub upload target for the whole result directory.
    parser.add_argument("--repo_id", type=str, default=None, help="The repository to which the file will be uploaded.")
    parser.add_argument("--path_in_repo", type=str, default=None, help="Relative filepath in the repo.")
    parser.add_argument("--token", type=str, default=None, help="A valid user access token (string).")

    # Unrecognized arguments are passed through to optimum-benchmark / hydra.
    args, optimum_benchmark_args = parser.parse_known_args()

    repo = Repo(PATH_TO_REPO)

    # Default metric selection; overridden by `--metrics` when provided.
    metrics = [
        "prefill.latency.mean",
        "prefill.throughput.value",
        "decode.latency.mean",
        "decode.throughput.value",
        "per_token.latency.mean",
        "per_token.throughput.value",
    ]
    if args.metrics is not None:
        metrics = args.metrics.split(",")

    # Pull `backend.model=...` out of the pass-through args so we can launch one run per
    # model ourselves (instead of letting hydra sweep over them).
    models = [""]
    for idx, arg in enumerate(optimum_benchmark_args):
        if arg.startswith("backend.model="):
            models = arg[len("backend.model=") :]
            models = models.split(",")
            break
    optimum_benchmark_args = [arg for arg in optimum_benchmark_args if not arg.startswith("backend.model=")]

    # Resolve the commit list: default to the current HEAD; `--commit=diff` means
    # "compare current HEAD against main".
    current_head = str(repo.head.commit) if repo.head.is_detached else str(repo.head.ref)
    commits = [x for x in args.commit if x != ""]
    if len(commits) == 0:
        commits = [current_head]
    elif len(commits) == 1 and commits[0] == "diff":
        # Compare to `main`.
        commits = ["main", current_head]

    # Find a user-specified hydra output directory, if any. In `--multirun` mode hydra
    # uses `hydra.sweep.dir`, otherwise `hydra.run.dir`.
    run_dir_arg_idx, run_dir = -1, None
    sweep_dir_arg_idx, sweep_dir = -1, None
    for idx, arg in enumerate(optimum_benchmark_args):
        if arg.startswith("hydra.run.dir="):
            run_dir = arg[len("hydra.run.dir=") :]
            run_dir_arg_idx = idx
        elif arg.startswith("hydra.sweep.dir="):
            sweep_dir = arg[len("hydra.sweep.dir=") :]
            sweep_dir_arg_idx = idx
    # NOTE(review): `arg_dix` looks like a typo for `arg_idx`; kept as-is.
    exp_run_dir, arg_dix, arg_name = (
        (sweep_dir, sweep_dir_arg_idx, "hydra.sweep.dir")
        if "--multirun" in optimum_benchmark_args
        else (run_dir, run_dir_arg_idx, "hydra.run.dir")
    )

    # Fallback output directory when none was specified (hard-coded for now).
    if exp_run_dir is None and args.ensure_empty:
        exp_run_dir = "_benchmark"

    # `ensure_empty` puts each invocation in a fresh temporary subdirectory.
    if args.ensure_empty:
        os.makedirs(exp_run_dir, exist_ok=True)
        exp_run_dir = tempfile.mkdtemp(dir=exp_run_dir)

    run_summaries = []
    for commit in commits:
        # Temporarily check out the target commit; restored on exiting the context.
        with checkout_commit(repo, commit):
            # Resolve branch names (e.g. `main`) to the concrete commit sha.
            commit = str(repo.head.commit)

            commit_run_dir = exp_run_dir
            if exp_run_dir is not None:
                # `\=` escapes the `=` so hydra does not treat the path component as an override.
                commit_run_dir = os.path.join(exp_run_dir, rf"commit\={commit}")

            print(f"Run benchmark on commit: {commit}")

            for model in models:
                model_arg = [f"backend.model={model}"] if model != "" else []
                dir_args = []
                if commit_run_dir is not None:
                    if arg_dix > -1:
                        # The user already passed a dir override — rewrite it in place.
                        optimum_benchmark_args[arg_dix] = f"{arg_name}={commit_run_dir}"
                    else:
                        # No user override: set both sweep and run dirs ourselves.
                        dir_args = [
                            f"hydra.sweep.dir={commit_run_dir}",
                            f"hydra.run.dir={commit_run_dir}/" + "${hydra.job.override_dirname}",
                        ]
                main(args.config_dir, args.config_name, model_arg + dir_args + optimum_benchmark_args)

            if commit_run_dir is not None:
                # Need to remove the `\` escape character before globbing the results.
                summaries = summarize(commit_run_dir.replace("\\", ""), metrics)
                run_summaries.extend(summaries)

    # Aggregate the information across the commits and optionally upload everything.
    if exp_run_dir is not None:
        with open(os.path.join(exp_run_dir, "summaries.json"), "w") as fp:
            json.dump(run_summaries, fp, indent=4)

        combined_summary = combine_summaries(run_summaries)

        if args.repo_id is not None and args.path_in_repo is not None:
            # Upload to Hub (dataset repo).
            api = HfApi()
            api.upload_folder(
                folder_path=exp_run_dir,
                path_in_repo=args.path_in_repo,
                repo_id=args.repo_id,
                repo_type="dataset",
                token=args.token,
            )
|