Spaces:
Running
Running
// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
| ABSL_FLAG(std::optional<std::string>, vision_backend, std::nullopt, | |
| "Backend to use for the vision model (cpu or gpu). If not specified, " | |
| "the vision backend will be chosen based on the main backend."); | |
| ABSL_FLAG(std::optional<std::string>, audio_backend, std::nullopt, | |
| "Backend to use for the audio model (cpu or gpu). If not specified, " | |
| "the audio backend will be chosen based on the main backend."); | |
| ABSL_FLAG(std::string, sampler_backend, "", | |
| "Sampler backend to use for LLM execution (cpu, gpu, etc.). If " | |
| "empty, the sampler backend will be chosen for the best according to " | |
| "the main executor, for example, gpu for gpu main executor."); | |
| ABSL_FLAG(std::string, expected_output, "", | |
| "If not empty, the output will be checked against this string. If " | |
| "the output does not contain the string, the program will exit with " | |
| "an error."); | |
| ABSL_FLAG(std::optional<std::string>, log_sink_file, std::nullopt, | |
| "If specified, the logs will be written to this file."); | |
| ABSL_FLAG(int, max_num_tokens, 0, | |
| "Maximum number of tokens or context length to use for LLM execution " | |
| "of a graph with dynamic context length. If 0, the maximum context " | |
| "length will be determined by some heuristic. On benchmark mode, it " | |
| "will be set to one equal to or greater than " | |
| "benchmark_prefill_tokens + benchmark_decode_tokens."); | |
| ABSL_FLAG(int, max_output_tokens, -1, | |
| "Maximum number of output tokens for generation."); | |
| ABSL_FLAG(int, max_num_images, 1, | |
| "Maximum number of images to use for LLM execution."); | |
| ABSL_FLAG(std::vector<std::string>, prefill_batch_sizes, {}, | |
| "A list of maximum numbers of prefill tokens processed at once. If " | |
| "empty, it will be the list of one entry with the length of input " | |
| "prompt tokens or benchmark_prefill_tokens when benchmark mode is " | |
| "enabled."); | |
| ABSL_FLAG(int, num_output_candidates, 1, | |
| "The number of candidates generated for the given prompt, or the " | |
| "batch size of the decode signature."); | |
| ABSL_FLAG(bool, benchmark, false, "Benchmark the LLM execution."); | |
| ABSL_FLAG(int, benchmark_prefill_tokens, 0, | |
| "If benchmark is true and the value is larger than 0, the benchmark " | |
| "will use this number to set the number of prefill tokens " | |
| "(regardless of the input prompt)."); | |
| ABSL_FLAG(int, benchmark_decode_tokens, 0, | |
| "If benchmark is true and the value is larger than 0, the benchmark " | |
| "will use this number to set the number of decode steps (regardless " | |
| "of the input prompt)."); | |
| ABSL_FLAG(bool, async, true, "Run the LLM execution asynchronously."); | |
| ABSL_FLAG(bool, report_peak_memory_footprint, false, | |
| "Report peak memory footprint."); | |
| ABSL_FLAG(bool, force_f32, false, | |
| "Force float 32 precision for the activation data type."); | |
| ABSL_FLAG(bool, multi_turns, false, | |
| "If true, the command line will ask for multi-turns input."); | |
| ABSL_FLAG(int, num_cpu_threads, 0, | |
| "If greater than 0, the number of CPU threads to use for the LLM " | |
| "execution with CPU backend."); | |
| ABSL_FLAG(bool, gpu_external_tensor_mode, false, | |
| "If false (by default), the GPU backend will use no external tensor " | |
| "mode which runs slightly faster during decode. It should be set " | |
| "true when GPU backend doesn't support no external tensor mode, " | |
| "e.g. Vulkan or OpenGL."); | |
| ABSL_FLAG(bool, configure_magic_numbers, true, | |
| "If true and the model contains magic numbers, present magic number " | |
| "configs when the model is initialized."); | |
| ABSL_FLAG(bool, verify_magic_numbers, false, | |
| "If true and the model contains magic numbers and test signatures, " | |
| "verify magic number configs when the real dimensions that replaced " | |
| "magic numbers match with ones of test signatures."); | |
| ABSL_FLAG(bool, clear_kv_cache_before_prefill, true, | |
| "If true, clear kv cache before the first prefill step. This may " | |
| "help to disclose any issues related to kv cache."); | |
| ABSL_FLAG(int, num_logits_to_print_after_decode, 0, | |
| "The number of values at the beginning of logits, in the middle of " | |
| "logits, and at the end of logits to print after each decode step. " | |
| "If 0, disables printing logits."); | |
| ABSL_FLAG(std::string, score_target_text, "", "Target text to score."); | |
| ABSL_FLAG(bool, gpu_madvise_original_shared_tensors, true, | |
| "If true, the GPU backend will madvise the original shared tensors " | |
| "after use."); | |
| ABSL_FLAG(bool, disable_cache, false, "Disable weight cache."); | |
| ABSL_FLAG(std::string, preferred_device_substr, "", | |
| "Preferred WebGPU device name substring, case-insensitive. " | |
| "If not empty, the adapter which the device name contains the " | |
| "substring will be chosen. " | |
| "If empty, the device will be determined by other factors."); | |
| ABSL_FLAG(int, num_threads_to_upload, -1, | |
| "Number of threads for WebGPU weight upload. By default (-1), it's " | |
| "determined by the runtime."); | |
| ABSL_FLAG(int, num_threads_to_compile, -1, | |
| "Number of threads for WebGPU kernel compilation. By default (-1), " | |
| "it's determined by the runtime."); | |
| ABSL_FLAG(bool, convert_weights_on_gpu, true, | |
| "If true, the executor will convert weights on GPU. It is not " | |
| "supported by the all backends so this flag is ignored when using " | |
| "non-OpenCL and non-WebGPU backends."); | |
| ABSL_FLAG(bool, wait_for_weights_conversion_complete_in_benchmark, true, | |
| "If false, the executor does not wait for weights conversion on GPU " | |
| "to complete during benchmark. It's meaningful only when benchmark " | |
| "and convert_weights_on_gpu are true."); | |
| ABSL_FLAG(bool, optimize_shader_compilation, true, | |
| "If true, optimize Vulkan shader compilation."); | |
| ABSL_FLAG(bool, share_constant_tensors, true, | |
| "If true, the executor will enable constant tensor sharing."); | |
| ABSL_FLAG(int, num_iterations, 1, | |
| "Number of iterations to run the model. By default, it's 1."); | |
| ABSL_FLAG(std::string, litert_dispatch_lib_dir, "", | |
| "Directory of the LiteRT dispatch library. If not set, the runtime " | |
| "will look for the library in the path defined as the environment " | |
| "variables."); | |
| ABSL_FLAG(bool, sampler_handles_input, true, | |
| "If true and the sampler supports, the sampler manipulates decode " | |
| "input tensors including tokens, positions, and mask."); | |
| ABSL_FLAG(std::string, conv_type, "auto", | |
| "Convolution data type. It can be auto, float, or int8. float will " | |
| "be either float32 or float16 depending on the activation data type. " | |
| "See --force_f32. int8 would have better latency with lower " | |
| "accuracy. auto will choose the best type based on the model."); | |
| ABSL_FLAG(bool, cache_compiled_shaders_only, false, | |
| "If true, only the compiled shaders will be cached. If false, gpu " | |
| "graph info including work group sizes (and all compiled shaders " | |
| "depending on backend) will be cached."); | |
| ABSL_FLAG(std::string, constraint_regex, "", | |
| "Regular expression to constrain the output generation."); | |
| ABSL_FLAG(bool, use_submodel, false, | |
| "Whether the submodel should be used if available."); | |
| ABSL_FLAG(bool, enable_speculative_decoding, false, | |
| "Whether to use speculative decoding."); | |