// Copyright 2025 The ODML Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "runtime/engine/shared_flags.h" #include #include #include #include "absl/flags/flag.h" // from @com_google_absl ABSL_FLAG(std::optional, vision_backend, std::nullopt, "Backend to use for the vision model (cpu or gpu). If not specified, " "the vision backend will be chosen based on the main backend."); ABSL_FLAG(std::optional, audio_backend, std::nullopt, "Backend to use for the audio model (cpu or gpu). If not specified, " "the audio backend will be chosen based on the main backend."); ABSL_FLAG(std::string, sampler_backend, "", "Sampler backend to use for LLM execution (cpu, gpu, etc.). If " "empty, the sampler backend will be chosen for the best according to " "the main executor, for example, gpu for gpu main executor."); ABSL_FLAG(std::string, expected_output, "", "If not empty, the output will be checked against this string. If " "the output does not contain the string, the program will exit with " "an error."); ABSL_FLAG(std::optional, log_sink_file, std::nullopt, "If specified, the logs will be written to this file."); ABSL_FLAG(int, max_num_tokens, 0, "Maximum number of tokens or context length to use for LLM execution " "of a graph with dynamic context length. If 0, the maximum context " "length will be determined by some heuristic. On benchmark mode, it " "will be set to one equal to or greater than " "benchmark_prefill_tokens + benchmark_decode_tokens."); ABSL_FLAG(int, max_output_tokens, -1, "Maximum number of output tokens for generation."); ABSL_FLAG(int, max_num_images, 1, "Maximum number of images to use for LLM execution."); ABSL_FLAG(std::vector, prefill_batch_sizes, {}, "A list of maximum numbers of prefill tokens processed at once. If " "empty, it will be the list of one entry with the length of input " "prompt tokens or benchmark_prefill_tokens when benchmark mode is " "enabled."); ABSL_FLAG(int, num_output_candidates, 1, "The number of candidates generated for the given prompt, or the " "batch size of the decode signature."); ABSL_FLAG(bool, benchmark, false, "Benchmark the LLM execution."); ABSL_FLAG(int, benchmark_prefill_tokens, 0, "If benchmark is true and the value is larger than 0, the benchmark " "will use this number to set the number of prefill tokens " "(regardless of the input prompt)."); ABSL_FLAG(int, benchmark_decode_tokens, 0, "If benchmark is true and the value is larger than 0, the benchmark " "will use this number to set the number of decode steps (regardless " "of the input prompt)."); ABSL_FLAG(bool, async, true, "Run the LLM execution asynchronously."); ABSL_FLAG(bool, report_peak_memory_footprint, false, "Report peak memory footprint."); ABSL_FLAG(bool, force_f32, false, "Force float 32 precision for the activation data type."); ABSL_FLAG(bool, multi_turns, false, "If true, the command line will ask for multi-turns input."); ABSL_FLAG(int, num_cpu_threads, 0, "If greater than 0, the number of CPU threads to use for the LLM " "execution with CPU backend."); ABSL_FLAG(bool, gpu_external_tensor_mode, false, "If false (by default), the GPU backend will use no external tensor " "mode which runs slightly faster during decode. It should be set " "true when GPU backend doesn't support no external tensor mode, " "e.g. Vulkan or OpenGL."); ABSL_FLAG(bool, configure_magic_numbers, true, "If true and the model contains magic numbers, present magic number " "configs when the model is initialized."); ABSL_FLAG(bool, verify_magic_numbers, false, "If true and the model contains magic numbers and test signatures, " "verify magic number configs when the real dimensions that replaced " "magic numbers match with ones of test signatures."); ABSL_FLAG(bool, clear_kv_cache_before_prefill, true, "If true, clear kv cache before the first prefill step. This may " "help to disclose any issues related to kv cache."); ABSL_FLAG(int, num_logits_to_print_after_decode, 0, "The number of values at the beginning of logits, in the middle of " "logits, and at the end of logits to print after each decode step. " "If 0, disables printing logits."); ABSL_FLAG(std::string, score_target_text, "", "Target text to score."); ABSL_FLAG(bool, gpu_madvise_original_shared_tensors, true, "If true, the GPU backend will madvise the original shared tensors " "after use."); ABSL_FLAG(bool, disable_cache, false, "Disable weight cache."); ABSL_FLAG(std::string, preferred_device_substr, "", "Preferred WebGPU device name substring, case-insensitive. " "If not empty, the adapter which the device name contains the " "substring will be chosen. " "If empty, the device will be determined by other factors."); ABSL_FLAG(int, num_threads_to_upload, -1, "Number of threads for WebGPU weight upload. By default (-1), it's " "determined by the runtime."); ABSL_FLAG(int, num_threads_to_compile, -1, "Number of threads for WebGPU kernel compilation. By default (-1), " "it's determined by the runtime."); ABSL_FLAG(bool, convert_weights_on_gpu, true, "If true, the executor will convert weights on GPU. It is not " "supported by the all backends so this flag is ignored when using " "non-OpenCL and non-WebGPU backends."); ABSL_FLAG(bool, wait_for_weights_conversion_complete_in_benchmark, true, "If false, the executor does not wait for weights conversion on GPU " "to complete during benchmark. It's meaningful only when benchmark " "and convert_weights_on_gpu are true."); ABSL_FLAG(bool, optimize_shader_compilation, true, "If true, optimize Vulkan shader compilation."); ABSL_FLAG(bool, share_constant_tensors, true, "If true, the executor will enable constant tensor sharing."); ABSL_FLAG(int, num_iterations, 1, "Number of iterations to run the model. By default, it's 1."); ABSL_FLAG(std::string, litert_dispatch_lib_dir, "", "Directory of the LiteRT dispatch library. If not set, the runtime " "will look for the library in the path defined as the environment " "variables."); ABSL_FLAG(bool, sampler_handles_input, true, "If true and the sampler supports, the sampler manipulates decode " "input tensors including tokens, positions, and mask."); ABSL_FLAG(std::string, conv_type, "auto", "Convolution data type. It can be auto, float, or int8. float will " "be either float32 or float16 depending on the activation data type. " "See --force_f32. int8 would have better latency with lower " "accuracy. auto will choose the best type based on the model."); ABSL_FLAG(bool, cache_compiled_shaders_only, false, "If true, only the compiled shaders will be cached. If false, gpu " "graph info including work group sizes (and all compiled shaders " "depending on backend) will be cached."); ABSL_FLAG(std::string, constraint_regex, "", "Regular expression to constrain the output generation."); ABSL_FLAG(bool, use_submodel, false, "Whether the submodel should be used if available."); ABSL_FLAG(bool, enable_speculative_decoding, false, "Whether to use speculative decoding.");