// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "runtime/engine/shared_flags.h"
#include <optional>
#include <string>
#include <vector>
#include "absl/flags/flag.h" // from @com_google_absl
// --vision_backend: optional string flag; unset (std::nullopt) by default.
ABSL_FLAG(
    std::optional<std::string>, vision_backend, std::nullopt,
    "Backend to use for the vision model (cpu or gpu). "
    "If not specified, the vision backend will be chosen based on the main "
    "backend.");
// --audio_backend: optional string flag; unset (std::nullopt) by default.
ABSL_FLAG(
    std::optional<std::string>, audio_backend, std::nullopt,
    "Backend to use for the audio model (cpu or gpu). "
    "If not specified, the audio backend will be chosen based on the main "
    "backend.");
// --sampler_backend: string flag; defaults to the empty string.
ABSL_FLAG(
    std::string, sampler_backend, "",
    "Sampler backend to use for LLM execution (cpu, gpu, etc.). "
    "If empty, the sampler backend will be chosen for the best according "
    "to the main executor, for example, gpu for gpu main executor.");
ABSL_FLAG(std::string, expected_output, "",
"If not empty, the output will be checked against this string. If "
"the output does not contain the string, the program will exit with "
"an error.");
// --log_sink_file: optional string flag; unset (std::nullopt) by default.
ABSL_FLAG(
    std::optional<std::string>, log_sink_file, std::nullopt,
    "If specified, the logs will be "
    "written to this file.");
// --max_num_tokens: int flag; defaults to 0.
ABSL_FLAG(
    int, max_num_tokens, 0,
    "Maximum number of tokens or context length to use for LLM execution of "
    "a graph with dynamic context length. "
    "If 0, the maximum context length will be determined by some heuristic. "
    "On benchmark mode, it will be set to one equal to or greater than "
    "benchmark_prefill_tokens + benchmark_decode_tokens.");
// --max_output_tokens: int flag; defaults to -1.
ABSL_FLAG(
    int, max_output_tokens, -1,
    "Maximum number of output tokens "
    "for generation.");
// --max_num_images: int flag; defaults to 1.
ABSL_FLAG(
    int, max_num_images, 1,
    "Maximum number of images to use "
    "for LLM execution.");
// --prefill_batch_sizes: list-of-strings flag; defaults to an empty list.
ABSL_FLAG(
    std::vector<std::string>, prefill_batch_sizes, {},
    "A list of maximum numbers of prefill tokens processed at once. "
    "If empty, it will be the list of one entry with the length of "
    "input prompt tokens or benchmark_prefill_tokens when benchmark mode "
    "is enabled.");
// --num_output_candidates: int flag; defaults to 1.
ABSL_FLAG(
    int, num_output_candidates, 1,
    "The number of candidates generated for the given prompt, "
    "or the batch size of the decode signature.");
// --benchmark: bool flag; off by default.
ABSL_FLAG(
    bool, benchmark, false,
    "Benchmark the LLM execution.");
// --benchmark_prefill_tokens: int flag; defaults to 0.
ABSL_FLAG(
    int, benchmark_prefill_tokens, 0,
    "If benchmark is true and the value is larger than 0, "
    "the benchmark will use this number to set the number of prefill "
    "tokens (regardless of the input prompt).");
// --benchmark_decode_tokens: int flag; defaults to 0.
ABSL_FLAG(
    int, benchmark_decode_tokens, 0,
    "If benchmark is true and the value is larger than 0, "
    "the benchmark will use this number to set the number of decode "
    "steps (regardless of the input prompt).");
// --async: bool flag; on by default.
ABSL_FLAG(
    bool, async, true,
    "Run the LLM execution asynchronously.");
// --report_peak_memory_footprint: bool flag; off by default.
ABSL_FLAG(
    bool, report_peak_memory_footprint, false,
    "Report peak memory footprint.");
// --force_f32: bool flag; off by default.
ABSL_FLAG(
    bool, force_f32, false,
    "Force float 32 precision for "
    "the activation data type.");
// --multi_turns: bool flag; off by default.
ABSL_FLAG(
    bool, multi_turns, false,
    "If true, the command line will ask "
    "for multi-turns input.");
// --num_cpu_threads: int flag; defaults to 0.
ABSL_FLAG(
    int, num_cpu_threads, 0,
    "If greater than 0, the number of CPU threads to use "
    "for the LLM execution with CPU backend.");
// --gpu_external_tensor_mode: bool flag; off by default.
ABSL_FLAG(
    bool, gpu_external_tensor_mode, false,
    "If false (by default), the GPU backend will use no external "
    "tensor mode which runs slightly faster during decode. "
    "It should be set true when GPU backend doesn't support no external "
    "tensor mode, e.g. Vulkan or OpenGL.");
// --configure_magic_numbers: bool flag; on by default.
ABSL_FLAG(
    bool, configure_magic_numbers, true,
    "If true and the model contains magic numbers, "
    "present magic number configs when the model is initialized.");
// --verify_magic_numbers: bool flag; off by default.
ABSL_FLAG(
    bool, verify_magic_numbers, false,
    "If true and the model contains magic numbers and test "
    "signatures, verify magic number configs when the real dimensions "
    "that replaced magic numbers match with ones of test signatures.");
// --clear_kv_cache_before_prefill: bool flag; on by default.
ABSL_FLAG(
    bool, clear_kv_cache_before_prefill, true,
    "If true, clear kv cache before the first prefill step. "
    "This may help to disclose any issues related to kv cache.");
// --num_logits_to_print_after_decode: int flag; defaults to 0.
ABSL_FLAG(
    int, num_logits_to_print_after_decode, 0,
    "The number of values at the beginning of logits, "
    "in the middle of logits, and at the end of logits "
    "to print after each decode step. "
    "If 0, disables printing logits.");
// --score_target_text: string flag; defaults to the empty string.
ABSL_FLAG(
    std::string, score_target_text, "",
    "Target text to score.");
// --gpu_madvise_original_shared_tensors: bool flag; on by default.
ABSL_FLAG(
    bool, gpu_madvise_original_shared_tensors, true,
    "If true, the GPU backend will madvise the original "
    "shared tensors after use.");
// --disable_cache: bool flag; off by default.
ABSL_FLAG(
    bool, disable_cache, false,
    "Disable weight cache.");
// --preferred_device_substr: string flag; defaults to the empty string.
ABSL_FLAG(
    std::string, preferred_device_substr, "",
    "Preferred WebGPU device name substring, case-insensitive. If not "
    "empty, the adapter which the device name contains the substring "
    "will be chosen. If empty, the device will be determined by other "
    "factors.");
// --num_threads_to_upload: int flag; defaults to -1.
ABSL_FLAG(
    int, num_threads_to_upload, -1,
    "Number of threads for WebGPU weight upload. "
    "By default (-1), it's determined by the runtime.");
// --num_threads_to_compile: int flag; defaults to -1.
ABSL_FLAG(
    int, num_threads_to_compile, -1,
    "Number of threads for WebGPU kernel compilation. "
    "By default (-1), it's determined by the runtime.");
// --convert_weights_on_gpu: bool flag; on by default.
ABSL_FLAG(
    bool, convert_weights_on_gpu, true,
    "If true, the executor will convert weights on GPU. "
    "It is not supported by the all backends so this flag is ignored "
    "when using non-OpenCL and non-WebGPU backends.");
// --wait_for_weights_conversion_complete_in_benchmark: bool flag; on by
// default.
ABSL_FLAG(
    bool, wait_for_weights_conversion_complete_in_benchmark, true,
    "If false, the executor does not wait for weights conversion on "
    "GPU to complete during benchmark. "
    "It's meaningful only when benchmark and convert_weights_on_gpu are "
    "true.");
// --optimize_shader_compilation: bool flag; on by default.
ABSL_FLAG(
    bool, optimize_shader_compilation, true,
    "If true, optimize Vulkan "
    "shader compilation.");
// --share_constant_tensors: bool flag; on by default.
ABSL_FLAG(
    bool, share_constant_tensors, true,
    "If true, the executor will enable "
    "constant tensor sharing.");
// --num_iterations: int flag; defaults to 1.
ABSL_FLAG(
    int, num_iterations, 1,
    "Number of iterations to run the model. "
    "By default, it's 1.");
// --litert_dispatch_lib_dir: string flag; defaults to the empty string.
ABSL_FLAG(
    std::string, litert_dispatch_lib_dir, "",
    "Directory of the LiteRT dispatch library. "
    "If not set, the runtime will look for the library in the path "
    "defined as the environment variables.");
// --sampler_handles_input: bool flag; on by default.
ABSL_FLAG(
    bool, sampler_handles_input, true,
    "If true and the sampler supports, the sampler manipulates "
    "decode input tensors including tokens, positions, and mask.");
// --conv_type: string flag; defaults to "auto".
ABSL_FLAG(
    std::string, conv_type, "auto",
    "Convolution data type. It can be auto, float, or int8. "
    "float will be either float32 or float16 depending on the activation "
    "data type. See --force_f32. "
    "int8 would have better latency with lower accuracy. "
    "auto will choose the best type based on the model.");
// --cache_compiled_shaders_only: bool flag; off by default.
ABSL_FLAG(
    bool, cache_compiled_shaders_only, false,
    "If true, only the compiled shaders will be cached. "
    "If false, gpu graph info including work group sizes (and all "
    "compiled shaders depending on backend) will be cached.");
// --constraint_regex: string flag; defaults to the empty string.
ABSL_FLAG(
    std::string, constraint_regex, "",
    "Regular expression to constrain "
    "the output generation.");
// --use_submodel: bool flag; off by default.
ABSL_FLAG(
    bool, use_submodel, false,
    "Whether the submodel should be used "
    "if available.");
// --enable_speculative_decoding: bool flag; off by default.
ABSL_FLAG(
    bool, enable_speculative_decoding, false,
    "Whether to use speculative decoding.");