| |
|
|
| from __future__ import annotations |
|
|
| from pathlib import Path |
| from urllib import request |
| import os |
| import shlex |
| import shutil |
| import subprocess |
| import sys |
| from typing import Any, Sequence |
| import logging |
| import json |
| import argparse |
|
|
# Directory containing this script; all default paths below are relative to it.
curdir = Path(os.path.dirname(__file__))


logger = logging.getLogger("bench")


# Where source and quantized models are stored (overridable via --model-dir).
MODEL_DIR = curdir / "bench-TriLMs-models"
# Path to the llama.cpp checkout to build and bench (overridable via --llama-cpp-path).
LLAMA_CPP_PATH = curdir / "."
# TriLM model sizes, in billions of parameters.
MODEL_SIZES = ("1.5", "2.4", "3.9")
# Quantization types benchmarked on CPU, and the subset benchmarked on GPU.
ALL_TYPES = ("TQ1_0", "TQ2_0", "Q4_K_M", "Q8_0", "F16", "BF16")
GPU_TYPES = ("TQ2_0", "Q4_K_M", "Q8_0", "F16")
|
|
|
|
def gather_models(sizes: Sequence[str] = MODEL_SIZES):
    """Download the source TriLM GGUF models into MODEL_DIR if absent.

    Parameters
    ----------
    sizes:
        Model sizes to fetch, as strings of billions of parameters
        (e.g. "1.5" for the 1.5B model).
    """
    logger.info("Gathering models")
    # mkdir with exist_ok=True already tolerates a pre-existing directory,
    # so no separate exists() check is needed.
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    for size in sizes:
        filename = f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
        file = MODEL_DIR / filename
        if not file.exists():
            # BUG FIX: the URL and the log message previously contained the
            # literal placeholder "(unknown)" instead of the model filename,
            # so every download request pointed at a nonexistent file.
            url = f"https://huggingface.co/compilade/quant-tests/resolve/main/{filename}"
            logger.info("Fetching %s from %s", filename, url)
            request.urlretrieve(url, file)
|
|
|
|
def build_llama_cpp(options: Sequence[str]):
    """Configure and build llama-bench, llama-quantize and test-backend-ops.

    Parameters
    ----------
    options:
        Extra CMake options, e.g. ("-DGGML_CUDA=ON",).
    """
    logger.info("Building llama.cpp")
    builddir = LLAMA_CPP_PATH / "build"
    if builddir.exists():
        # Force a fresh CMake configuration (the backend options may have
        # changed between CPU and GPU runs) while keeping built objects.
        cmake_cache = builddir / "CMakeCache.txt"
        cmake_files = builddir / "CMakeFiles"
        logger.info("Removing %s and %s", cmake_cache, cmake_files)
        # Use stdlib calls instead of shelling out to `rm -rf`.
        shutil.rmtree(cmake_files, ignore_errors=True)
        cmake_cache.unlink(missing_ok=True)
    builddir.mkdir(exist_ok=True)
    # BUG FIX: the original saved os.path.curdir (the literal "."), so after
    # os.chdir(builddir) the final os.chdir(".") was a no-op and the working
    # directory was never restored. Save the absolute path instead.
    old_cwd = os.getcwd()
    os.chdir(builddir)
    try:
        os.system(shlex.join(("cmake", "..", *options)))
        os.system(f"make -j{os.cpu_count()} llama-bench llama-quantize test-backend-ops")
    finally:
        # Restore the working directory even if the build commands raise.
        os.chdir(old_cwd)
|
|
|
|
def quantize(types: Sequence[str] = ALL_TYPES, sizes: Sequence[str] = MODEL_SIZES):
    """Produce every requested (size, type) quantized model that is missing.

    Re-runs quantization when the target exists but is a zero-byte file,
    which is treated as the residue of a previously failed attempt.
    """
    logger.info("Make all model types we'll test")
    quantize_bin = LLAMA_CPP_PATH / "build" / "bin" / "llama-quantize"
    for size in sizes:
        source = MODEL_DIR / f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
        for ty in types:
            target = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
            is_empty_file = target.is_file() and target.stat().st_size == 0
            if target.exists() and not is_empty_file:
                # Already quantized successfully; skip.
                continue
            command = shlex.join(
                (
                    str(quantize_bin),
                    "--allow-requantize",
                    str(source),
                    str(target),
                    ty,
                )
            )
            logger.info("Running: %s", command)
            ret = os.system(command)
            # A nonzero exit status or an empty output file means failure.
            if ret != 0 or (target.is_file() and target.stat().st_size == 0):
                logger.error("Failed to quantize to %s", target)
| |
|
|
|
|
def llama_bench(
    repetitions: int = 5,
    types: Sequence[str] = ALL_TYPES,
    sizes: Sequence[str] = MODEL_SIZES,
) -> list[dict[str, Any]]:
    """Run llama-bench for every (size, type, thread-count) combination.

    Parameters
    ----------
    repetitions:
        Number of repetitions per benchmark (llama-bench's ``-r``).
    types:
        Quantization types to benchmark.
    sizes:
        Model sizes (billions of parameters) to benchmark.

    Returns
    -------
    The concatenated JSON records emitted by llama-bench.
    """
    logger.info("Test each model one by one for different numbers of threads")

    # Powers of two up to 16, capped at the available core count.
    # BUG FIX: os.cpu_count() can return None on some platforms, which would
    # crash the comparison below; fall back to a single core in that case.
    cpu_count = os.cpu_count() or 1
    threads = [2**i for i in range(5) if 2**i <= cpu_count]
    logger.info("Numbers of threads to be tested: %s", threads)

    out: list[dict[str, Any]] = []

    # Loop-invariant paths are computed once, outside the loops.
    bench_bin = str(LLAMA_CPP_PATH / "build" / "bin" / "llama-bench")
    for size in sizes:
        for ty in types:
            model_path = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
            for th in threads:
                command = [
                    bench_bin,
                    "-v",
                    "-m",
                    str(model_path),
                    "-t",
                    str(th),
                    "-r",
                    str(repetitions),
                    "-p",
                    "512",
                    "-n",
                    "128",
                    "-o",
                    "json",
                ]
                logger.info("Running: %s", " ".join(command))
                result = subprocess.run(command, capture_output=True)
                logger.debug(result.stderr.decode(errors="ignore"))
                if result.returncode != 0 or len(result.stdout) == 0:
                    logger.error("Failed to run %s", " ".join(command))
                    # A failing model won't succeed with more threads either;
                    # skip its remaining thread counts.
                    break

                new_output = json.loads(result.stdout)
                logger.info(json.dumps(new_output, indent=4))
                out.extend(new_output)
    return out
|
|
|
|
def test_backend_perf() -> str:
    """Benchmark MUL_MAT via test-backend-ops and return its decoded stdout."""
    logger.info("Test MUL_MAT performance")
    backend_ops_bin = LLAMA_CPP_PATH / "build" / "bin" / "test-backend-ops"
    command = [str(backend_ops_bin), "perf", "-o", "MUL_MAT"]
    result = subprocess.run(command, capture_output=True)
    output = result.stdout.decode(encoding="utf-8")
    logger.debug(output)
    return output
|
|
|
|
def parse_args(args: Sequence[str]):
    """Build the CLI and parse *args*; args[0] is used as the program name."""
    parser = argparse.ArgumentParser(
        prog=args[0], description="Benchmark ternary models"
    )
    add = parser.add_argument
    # Backend selection flags (both may be given to bench CPU then GPU).
    add("--gpu", action="store_true", help="Run benchmarks on GPU")
    add("--cpu", action="store_true", help="Run benchmarks on CPU")
    # Paths default to the module-level constants.
    add(
        "--llama-cpp-path",
        type=Path,
        default=LLAMA_CPP_PATH,
        help="Path to a llama.cpp checkout",
    )
    add(
        "--model-dir",
        type=Path,
        default=MODEL_DIR,
        help="Where the tested models will be stored",
    )
    add(
        "--repetitions",
        type=int,
        default=5,
        required=False,
        help="How many repetitions are run for each test",
    )
    add(
        "--out",
        type=Path,
        default=Path(os.path.curdir) / "result.json",
        help="Path of the benchmark results to be written",
    )
    add(
        "--force", action="store_true", help="Overwrite the result file without asking"
    )
    # args[0] is the program name, so only parse the rest.
    return parser.parse_args(args[1:])
|
|
|
|
if __name__ == "__main__":
    args = parse_args(sys.argv)

    logging.basicConfig(level=logging.DEBUG)

    # Override the module-level path constants with the CLI values; all the
    # helper functions above read these globals.
    LLAMA_CPP_PATH = args.llama_cpp_path
    MODEL_DIR = args.model_dir

    output_file = Path(args.out).absolute()

    # Refuse to clobber an existing result file unless --force was given
    # or the user confirms interactively.
    if output_file.exists() and not args.force:
        ask = input("Result file exists. Do you want to overwrite it? [y/N]")
        if not ask.strip().lower().startswith("y"):
            logger.info("Not running, leaving output file intact")
            exit()

    results = []
    mulmat_perf = []
    repetitions: int = args.repetitions

    # CPU pass: native CPU build, all quantization types.
    if args.cpu:
        gather_models()
        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CPU=ON"])
        quantize()
        mulmat_perf.append(test_backend_perf())
        results.extend(llama_bench(repetitions=repetitions))

    # GPU pass: CUDA build, GPU-supported types only. Rebuilds and
    # re-quantizes so both passes use the same freshly-built binaries.
    if args.gpu:
        gather_models()
        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CUDA=ON", "-DGGML_CUDA_F16=ON"])
        quantize()
        mulmat_perf.append(test_backend_perf())
        results.extend(llama_bench(repetitions=repetitions, types=GPU_TYPES))

    final_result: dict[str, Any] = {
        "mulmat_perf": mulmat_perf,
        "results": results,
    }

    # Attach hardware descriptions to the results when the tools are present.
    if shutil.which("lscpu") is not None:
        logger.info("Getting CPU info")
        final_result["cpuinfo"] = subprocess.run(
            ["lscpu"], capture_output=True
        ).stdout.decode(encoding="utf-8")

    if args.gpu and shutil.which("nvidia-smi") is not None:
        logger.info("Getting NVIDIA GPU info")
        final_result["gpuinfo"] = subprocess.run(
            ["nvidia-smi", "-q"], capture_output=True
        ).stdout.decode(encoding="utf-8")

    logger.info("Writing output to: %s", output_file)
    logger.debug("Final results: %s", json.dumps(final_result, indent=4))
    with open(output_file, "w") as f:
        json.dump(final_result, f, indent=4)
        f.flush()
|
|