// LiteRT-LM/runtime/engine/shared_flags.cc
// SeaWolf-AI's picture
// Upload full LiteRT-LM codebase
// 5f923cd verified
// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "runtime/engine/shared_flags.h"
#include <optional>
#include <string>
#include <vector>
#include "absl/flags/flag.h" // from @com_google_absl
// --vision_backend: optional backend override for the vision model.
// Defaults to std::nullopt (unset); per the help text, an unset value means
// the vision backend is derived from the main backend.
ABSL_FLAG(std::optional<std::string>, vision_backend, std::nullopt,
"Backend to use for the vision model (cpu or gpu). If not specified, "
"the vision backend will be chosen based on the main backend.");
// --audio_backend: optional backend override for the audio model.
// Defaults to std::nullopt (unset); per the help text, an unset value means
// the audio backend is derived from the main backend.
ABSL_FLAG(std::optional<std::string>, audio_backend, std::nullopt,
"Backend to use for the audio model (cpu or gpu). If not specified, "
"the audio backend will be chosen based on the main backend.");
// --sampler_backend: sampler backend for LLM execution.
// Unlike the vision/audio backend flags this is a plain std::string, so the
// empty default (not std::nullopt) is what signals "pick automatically from
// the main executor".
ABSL_FLAG(std::string, sampler_backend, "",
"Sampler backend to use for LLM execution (cpu, gpu, etc.). If "
"empty, the sampler backend will be chosen for the best according to "
"the main executor, for example, gpu for gpu main executor.");
// --expected_output: substring the generated output is expected to contain.
// The empty default disables the check; per the help text a non-empty value
// that is not found in the output makes the program exit with an error.
ABSL_FLAG(std::string, expected_output, "",
"If not empty, the output will be checked against this string. If "
"the output does not contain the string, the program will exit with "
"an error.");
// --log_sink_file: optional file that logs are written to.
// Defaults to std::nullopt (no file specified).
ABSL_FLAG(std::optional<std::string>, log_sink_file, std::nullopt,
"If specified, the logs will be written to this file.");
// --max_num_tokens: maximum number of tokens / context length for graphs with
// a dynamic context length.
// The default 0 is a sentinel meaning "decide heuristically"; per the help
// text, benchmark mode sets it to at least
// benchmark_prefill_tokens + benchmark_decode_tokens.
ABSL_FLAG(int, max_num_tokens, 0,
"Maximum number of tokens or context length to use for LLM execution "
"of a graph with dynamic context length. If 0, the maximum context "
"length will be determined by some heuristic. On benchmark mode, it "
"will be set to one equal to or greater than "
"benchmark_prefill_tokens + benchmark_decode_tokens.");
// --max_output_tokens: maximum number of output tokens for generation.
// NOTE(review): the help text does not say what the -1 default means;
// presumably "no explicit limit" -- confirm against the code that reads it.
ABSL_FLAG(int, max_output_tokens, -1,
"Maximum number of output tokens for generation.");
// --max_num_images: maximum number of images used for LLM execution.
// Defaults to 1.
ABSL_FLAG(int, max_num_images, 1,
"Maximum number of images to use for LLM execution.");
// --prefill_batch_sizes: list of maximum numbers of prefill tokens processed
// at once. Defaults to an empty list; per the help text an empty list is
// replaced by a single entry (the prompt token count, or
// benchmark_prefill_tokens in benchmark mode).
// NOTE(review): entries are stored as strings here; presumably the consumer
// converts them to integers -- confirm where the flag is read.
ABSL_FLAG(std::vector<std::string>, prefill_batch_sizes, {},
"A list of maximum numbers of prefill tokens processed at once. If "
"empty, it will be the list of one entry with the length of input "
"prompt tokens or benchmark_prefill_tokens when benchmark mode is "
"enabled.");
// --num_output_candidates: number of candidates generated per prompt, which
// the help text equates with the batch size of the decode signature.
// Defaults to 1.
ABSL_FLAG(int, num_output_candidates, 1,
"The number of candidates generated for the given prompt, or the "
"batch size of the decode signature.");
// --benchmark: enables benchmarking of the LLM execution. Off by default.
// Several other flags in this file (benchmark_prefill_tokens,
// benchmark_decode_tokens, ...) are documented as meaningful only when this
// is true.
ABSL_FLAG(bool, benchmark, false, "Benchmark the LLM execution.");
// --benchmark_prefill_tokens: number of prefill tokens to use in benchmark
// mode, overriding whatever the input prompt would give.
// The default 0 leaves the override inactive (it applies only for values > 0).
ABSL_FLAG(int, benchmark_prefill_tokens, 0,
"If benchmark is true and the value is larger than 0, the benchmark "
"will use this number to set the number of prefill tokens "
"(regardless of the input prompt).");
// --benchmark_decode_tokens: number of decode steps to use in benchmark mode,
// overriding whatever the input prompt would give.
// The default 0 leaves the override inactive (it applies only for values > 0).
ABSL_FLAG(int, benchmark_decode_tokens, 0,
"If benchmark is true and the value is larger than 0, the benchmark "
"will use this number to set the number of decode steps (regardless "
"of the input prompt).");
// --async: run the LLM execution asynchronously. On by default, so pass
// --noasync (or --async=false) to get synchronous execution.
ABSL_FLAG(bool, async, true, "Run the LLM execution asynchronously.");
// --report_peak_memory_footprint: report the peak memory footprint.
// Off by default.
ABSL_FLAG(bool, report_peak_memory_footprint, false,
"Report peak memory footprint.");
// --force_f32: force float32 precision for the activation data type.
// Off by default. The conv_type flag's help text refers to this flag when
// explaining what its "float" option resolves to.
ABSL_FLAG(bool, force_f32, false,
"Force float 32 precision for the activation data type.");
// --multi_turns: have the command line prompt for multi-turn input.
// Off by default.
ABSL_FLAG(bool, multi_turns, false,
"If true, the command line will ask for multi-turns input.");
// --num_cpu_threads: CPU thread count for LLM execution on the CPU backend.
// Only values greater than 0 take effect; the default 0 leaves the thread
// count unspecified by this flag.
ABSL_FLAG(int, num_cpu_threads, 0,
"If greater than 0, the number of CPU threads to use for the LLM "
"execution with CPU backend.");
// --gpu_external_tensor_mode: selects external tensor mode on the GPU backend.
// The default (false) uses "no external tensor mode", which the help text
// describes as slightly faster during decode; set it to true for GPU backends
// that do not support that mode (the help text names Vulkan and OpenGL).
ABSL_FLAG(bool, gpu_external_tensor_mode, false,
"If false (by default), the GPU backend will use no external tensor "
"mode which runs slightly faster during decode. It should be set "
"true when GPU backend doesn't support no external tensor mode, "
"e.g. Vulkan or OpenGL.");
// --configure_magic_numbers: when the model contains magic numbers, present
// magic number configs at model initialization. On by default.
ABSL_FLAG(bool, configure_magic_numbers, true,
"If true and the model contains magic numbers, present magic number "
"configs when the model is initialized.");
// --verify_magic_numbers: verify magic number configs against the model's
// test signatures. Off by default; per the help text it only applies to
// models that contain both magic numbers and test signatures.
ABSL_FLAG(bool, verify_magic_numbers, false,
"If true and the model contains magic numbers and test signatures, "
"verify magic number configs when the real dimensions that replaced "
"magic numbers match with ones of test signatures.");
// --clear_kv_cache_before_prefill: clear the KV cache before the first
// prefill step. On by default; the help text frames this as a way to surface
// KV-cache-related issues.
ABSL_FLAG(bool, clear_kv_cache_before_prefill, true,
"If true, clear kv cache before the first prefill step. This may "
"help to disclose any issues related to kv cache.");
// --num_logits_to_print_after_decode: debugging aid that prints this many
// logit values from each of three places (beginning, middle, end of the
// logits) after every decode step. The default 0 disables printing.
ABSL_FLAG(int, num_logits_to_print_after_decode, 0,
"The number of values at the beginning of logits, in the middle of "
"logits, and at the end of logits to print after each decode step. "
"If 0, disables printing logits.");
// --score_target_text: target text to score. Defaults to the empty string.
// NOTE(review): presumably an empty value means scoring is skipped -- the
// help text does not say; confirm against the consumer.
ABSL_FLAG(std::string, score_target_text, "", "Target text to score.");
// --gpu_madvise_original_shared_tensors: have the GPU backend madvise the
// original shared tensors after use. On by default.
ABSL_FLAG(bool, gpu_madvise_original_shared_tensors, true,
"If true, the GPU backend will madvise the original shared tensors "
"after use.");
// --disable_cache: disable the weight cache. Off by default, i.e. the weight
// cache is not disabled unless this flag is set.
ABSL_FLAG(bool, disable_cache, false, "Disable weight cache.");
// --preferred_device_substr: case-insensitive substring used to pick a WebGPU
// adapter by device name. The empty default leaves the choice to other
// factors.
// Help text fix: "the adapter which the device name contains" was
// ungrammatical; it now reads "the adapter whose device name contains".
ABSL_FLAG(std::string, preferred_device_substr, "",
          "Preferred WebGPU device name substring, case-insensitive. "
          "If not empty, the adapter whose device name contains the "
          "substring will be chosen. "
          "If empty, the device will be determined by other factors.");
// --num_threads_to_upload: thread count for WebGPU weight upload.
// The default -1 is a sentinel meaning "let the runtime decide".
ABSL_FLAG(int, num_threads_to_upload, -1,
"Number of threads for WebGPU weight upload. By default (-1), it's "
"determined by the runtime.");
// --num_threads_to_compile: thread count for WebGPU kernel compilation.
// The default -1 is a sentinel meaning "let the runtime decide".
ABSL_FLAG(int, num_threads_to_compile, -1,
"Number of threads for WebGPU kernel compilation. By default (-1), "
"it's determined by the runtime.");
// --convert_weights_on_gpu: have the executor convert weights on the GPU.
// On by default; ignored for backends other than OpenCL and WebGPU.
// Help text fix: "supported by the all backends" -> "supported by all
// backends" (stray article), with a comma added before "so".
ABSL_FLAG(bool, convert_weights_on_gpu, true,
          "If true, the executor will convert weights on GPU. It is not "
          "supported by all backends, so this flag is ignored when using "
          "non-OpenCL and non-WebGPU backends.");
// --wait_for_weights_conversion_complete_in_benchmark: whether the executor
// waits for GPU weight conversion to finish during a benchmark.
// On by default (wait). Per the help text it only matters when both
// benchmark and convert_weights_on_gpu are true.
ABSL_FLAG(bool, wait_for_weights_conversion_complete_in_benchmark, true,
"If false, the executor does not wait for weights conversion on GPU "
"to complete during benchmark. It's meaningful only when benchmark "
"and convert_weights_on_gpu are true.");
// --optimize_shader_compilation: optimize Vulkan shader compilation.
// On by default.
ABSL_FLAG(bool, optimize_shader_compilation, true,
"If true, optimize Vulkan shader compilation.");
// --share_constant_tensors: enable constant tensor sharing in the executor.
// On by default.
ABSL_FLAG(bool, share_constant_tensors, true,
"If true, the executor will enable constant tensor sharing.");
// --num_iterations: how many times to run the model. Defaults to 1.
ABSL_FLAG(int, num_iterations, 1,
"Number of iterations to run the model. By default, it's 1.");
// --litert_dispatch_lib_dir: directory containing the LiteRT dispatch library.
// With the empty default, the help text says the runtime falls back to
// locating the library via environment variables.
ABSL_FLAG(std::string, litert_dispatch_lib_dir, "",
"Directory of the LiteRT dispatch library. If not set, the runtime "
"will look for the library in the path defined as the environment "
"variables.");
// --sampler_handles_input: let the sampler manipulate the decode input
// tensors (tokens, positions, mask) when the sampler supports doing so.
// On by default.
// Help text fix: "If true and the sampler supports," had a dangling verb;
// it now reads "supports it".
ABSL_FLAG(bool, sampler_handles_input, true,
          "If true and the sampler supports it, the sampler manipulates "
          "decode input tensors including tokens, positions, and mask.");
// --conv_type: convolution data type, given as a string.
// Documented values are "auto" (the default; chosen based on the model),
// "float" (float32 or float16 depending on the activation data type, see
// --force_f32) and "int8" (better latency, lower accuracy).
// NOTE(review): nothing in this file validates the value; presumably the
// consumer rejects unknown strings -- confirm.
ABSL_FLAG(std::string, conv_type, "auto",
"Convolution data type. It can be auto, float, or int8. float will "
"be either float32 or float16 depending on the activation data type. "
"See --force_f32. int8 would have better latency with lower "
"accuracy. auto will choose the best type based on the model.");
// --cache_compiled_shaders_only: restrict caching to compiled shaders.
// Off by default, in which case GPU graph info including work group sizes
// (and, depending on backend, all compiled shaders) is cached instead.
ABSL_FLAG(bool, cache_compiled_shaders_only, false,
"If true, only the compiled shaders will be cached. If false, gpu "
"graph info including work group sizes (and all compiled shaders "
"depending on backend) will be cached.");
// --constraint_regex: regular expression used to constrain output generation.
// Defaults to the empty string.
// NOTE(review): presumably an empty value means "unconstrained" -- the help
// text does not say; confirm against the consumer.
ABSL_FLAG(std::string, constraint_regex, "",
"Regular expression to constrain the output generation.");
// --use_submodel: use the submodel when one is available. Off by default.
ABSL_FLAG(bool, use_submodel, false,
"Whether the submodel should be used if available.");
// --enable_speculative_decoding: turn on speculative decoding.
// Off by default.
ABSL_FLAG(bool, enable_speculative_decoding, false,
"Whether to use speculative decoding.");