Spaces:

FINAL-Bench
/

LiteRT-LM

Running

App Files Files Community

LiteRT-LM / runtime /engine /shared_flags.cc

SeaWolf-AI

Upload full LiteRT-LM codebase

5f923cd verified 3 days ago

raw

history blame contribute delete

8.36 kB

	// Copyright 2025 The ODML Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "runtime/engine/shared_flags.h"

	#include <optional>
	#include <string>
	#include <vector>

	#include "absl/flags/flag.h" // from @com_google_absl

	// --- Backend selection flags ---
	//
	// Optional backend override for the vision model. Unset (std::nullopt) by
	// default; per the help text, the backend is then derived from the main
	// backend.
	ABSL_FLAG(std::optional<std::string>, vision_backend, std::nullopt,
	"Backend to use for the vision model (cpu or gpu). If not specified, "
	"the vision backend will be chosen based on the main backend.");
	// Optional backend override for the audio model. Unset (std::nullopt) by
	// default; per the help text, the backend is then derived from the main
	// backend.
	ABSL_FLAG(std::optional<std::string>, audio_backend, std::nullopt,
	"Backend to use for the audio model (cpu or gpu). If not specified, "
	"the audio backend will be chosen based on the main backend.");
	// Sampler backend name. Unlike the two flags above, "not specified" is
	// represented by an empty string rather than std::nullopt.
	ABSL_FLAG(std::string, sampler_backend, "",
	"Sampler backend to use for LLM execution (cpu, gpu, etc.). If "
	"empty, the sampler backend will be chosen for the best according to "
	"the main executor, for example, gpu for gpu main executor.");
	// --- Output verification and logging flags ---
	//
	// Substring the generated output is expected to contain. The empty default
	// disables the check, per the help text.
	ABSL_FLAG(std::string, expected_output, "",
	"If not empty, the output will be checked against this string. If "
	"the output does not contain the string, the program will exit with "
	"an error.");
	// Optional path of a file that receives the logs. Unset (std::nullopt) by
	// default.
	ABSL_FLAG(std::optional<std::string>, log_sink_file, std::nullopt,
	"If specified, the logs will be written to this file.");
	// --- Token, image, and batch sizing flags ---
	//
	// Context length for graphs with dynamic context length. The default of 0
	// means "pick by heuristic" according to the help text.
	ABSL_FLAG(int, max_num_tokens, 0,
	"Maximum number of tokens or context length to use for LLM execution "
	"of a graph with dynamic context length. If 0, the maximum context "
	"length will be determined by some heuristic. On benchmark mode, it "
	"will be set to one equal to or greater than "
	"benchmark_prefill_tokens + benchmark_decode_tokens.");
	// Cap on generated output tokens. Defaults to -1.
	// NOTE(review): the help text does not say what -1 means; presumably
	// "no explicit limit" -- confirm against the code that reads this flag.
	ABSL_FLAG(int, max_output_tokens, -1,
	"Maximum number of output tokens for generation.");
	// Cap on the number of images. Defaults to 1.
	ABSL_FLAG(int, max_num_images, 1,
	"Maximum number of images to use for LLM execution.");
	// List of prefill batch sizes. Empty by default.
	// NOTE(review): the flag type is a list of strings although the help text
	// describes numeric sizes; presumably the consumer converts each entry to
	// an integer -- confirm.
	ABSL_FLAG(std::vector<std::string>, prefill_batch_sizes, {},
	"A list of maximum numbers of prefill tokens processed at once. If "
	"empty, it will be the list of one entry with the length of input "
	"prompt tokens or benchmark_prefill_tokens when benchmark mode is "
	"enabled.");
	// Number of output candidates (decode batch size per the help text).
	// Defaults to 1.
	ABSL_FLAG(int, num_output_candidates, 1,
	"The number of candidates generated for the given prompt, or the "
	"batch size of the decode signature.");
	// --- Benchmark flags ---
	//
	// Switch for benchmark mode. Off by default.
	ABSL_FLAG(bool, benchmark, false, "Benchmark the LLM execution.");
	// Prefill token count used in benchmark mode. The default of 0 leaves the
	// count unset, since the help text only applies values larger than 0.
	ABSL_FLAG(int, benchmark_prefill_tokens, 0,
	"If benchmark is true and the value is larger than 0, the benchmark "
	"will use this number to set the number of prefill tokens "
	"(regardless of the input prompt).");
	// Decode step count used in benchmark mode. The default of 0 leaves the
	// count unset, since the help text only applies values larger than 0.
	ABSL_FLAG(int, benchmark_decode_tokens, 0,
	"If benchmark is true and the value is larger than 0, the benchmark "
	"will use this number to set the number of decode steps (regardless "
	"of the input prompt).");
	// --- General execution mode flags ---
	//
	// Asynchronous execution switch. On by default.
	ABSL_FLAG(bool, async, true, "Run the LLM execution asynchronously.");
	// Peak memory footprint reporting switch. Off by default.
	ABSL_FLAG(bool, report_peak_memory_footprint, false,
	"Report peak memory footprint.");
	// Forces float32 activations. Off by default.
	ABSL_FLAG(bool, force_f32, false,
	"Force float 32 precision for the activation data type.");
	// Multi-turn interactive input switch. Off by default.
	ABSL_FLAG(bool, multi_turns, false,
	"If true, the command line will ask for multi-turns input.");
	// CPU thread count for the CPU backend. The default of 0 leaves it unset,
	// since the help text only applies values greater than 0.
	ABSL_FLAG(int, num_cpu_threads, 0,
	"If greater than 0, the number of CPU threads to use for the LLM "
	"execution with CPU backend.");
	// --- GPU tensor mode, magic number, and debugging flags ---
	//
	// Selects the GPU external tensor mode. Off by default, which the help
	// text describes as the slightly faster "no external tensor" mode.
	ABSL_FLAG(bool, gpu_external_tensor_mode, false,
	"If false (by default), the GPU backend will use no external tensor "
	"mode which runs slightly faster during decode. It should be set "
	"true when GPU backend doesn't support no external tensor mode, "
	"e.g. Vulkan or OpenGL.");
	// Controls whether magic number configs are presented at model
	// initialization. On by default.
	ABSL_FLAG(bool, configure_magic_numbers, true,
	"If true and the model contains magic numbers, present magic number "
	"configs when the model is initialized.");
	// Controls verification of magic number configs against test signatures.
	// Off by default.
	ABSL_FLAG(bool, verify_magic_numbers, false,
	"If true and the model contains magic numbers and test signatures, "
	"verify magic number configs when the real dimensions that replaced "
	"magic numbers match with ones of test signatures.");
	// Controls clearing the kv cache ahead of the first prefill step. On by
	// default.
	ABSL_FLAG(bool, clear_kv_cache_before_prefill, true,
	"If true, clear kv cache before the first prefill step. This may "
	"help to disclose any issues related to kv cache.");
	// Count of logit values printed from the beginning, middle, and end after
	// each decode step. The default of 0 disables printing.
	ABSL_FLAG(int, num_logits_to_print_after_decode, 0,
	"The number of values at the beginning of logits, in the middle of "
	"logits, and at the end of logits to print after each decode step. "
	"If 0, disables printing logits.");
	// --- Scoring and caching flags ---
	//
	// Target text to score. Empty by default.
	// NOTE(review): presumably an empty value means scoring is skipped; the
	// help text does not say -- confirm against the consumer.
	ABSL_FLAG(std::string, score_target_text, "", "Target text to score.");
	// Controls whether the GPU backend madvises the original shared tensors
	// after use. On by default.
	ABSL_FLAG(bool, gpu_madvise_original_shared_tensors, true,
	"If true, the GPU backend will madvise the original shared tensors "
	"after use.");
	// Disables the weight cache when true. Off by default, i.e. the cache is
	// not disabled.
	ABSL_FLAG(bool, disable_cache, false, "Disable weight cache.");
	// --- WebGPU / GPU tuning flags ---
	//
	// Case-insensitive substring used to pick a WebGPU device by name. Empty
	// by default, in which case the help text says other factors decide.
	ABSL_FLAG(std::string, preferred_device_substr, "",
	"Preferred WebGPU device name substring, case-insensitive. "
	"If not empty, the adapter which the device name contains the "
	"substring will be chosen. "
	"If empty, the device will be determined by other factors.");
	// Thread count for WebGPU weight upload. The default of -1 lets the
	// runtime decide.
	ABSL_FLAG(int, num_threads_to_upload, -1,
	"Number of threads for WebGPU weight upload. By default (-1), it's "
	"determined by the runtime.");
	// Thread count for WebGPU kernel compilation. The default of -1 lets the
	// runtime decide.
	ABSL_FLAG(int, num_threads_to_compile, -1,
	"Number of threads for WebGPU kernel compilation. By default (-1), "
	"it's determined by the runtime.");
	// Controls weight conversion on the GPU. On by default; the help text
	// says it is ignored for backends other than OpenCL and WebGPU.
	// NOTE(review): "by the all backends" in the help text looks like a typo
	// for "by all backends"; left untouched here because it is runtime text.
	ABSL_FLAG(bool, convert_weights_on_gpu, true,
	"If true, the executor will convert weights on GPU. It is not "
	"supported by the all backends so this flag is ignored when using "
	"non-OpenCL and non-WebGPU backends.");
	// Controls whether a benchmark waits for GPU weight conversion to finish.
	// On by default; only meaningful when both benchmark and
	// convert_weights_on_gpu are true, per the help text.
	ABSL_FLAG(bool, wait_for_weights_conversion_complete_in_benchmark, true,
	"If false, the executor does not wait for weights conversion on GPU "
	"to complete during benchmark. It's meaningful only when benchmark "
	"and convert_weights_on_gpu are true.");
	// Vulkan shader compilation optimization switch. On by default.
	ABSL_FLAG(bool, optimize_shader_compilation, true,
	"If true, optimize Vulkan shader compilation.");
	// Constant tensor sharing switch. On by default.
	ABSL_FLAG(bool, share_constant_tensors, true,
	"If true, the executor will enable constant tensor sharing.");
	// Number of times the model is run. Defaults to 1.
	ABSL_FLAG(int, num_iterations, 1,
	"Number of iterations to run the model. By default, it's 1.");
	// --- Dispatch library, sampler, convolution, and decoding flags ---
	//
	// Directory holding the LiteRT dispatch library. Empty by default, in
	// which case the help text says the library is looked up through
	// environment variables.
	ABSL_FLAG(std::string, litert_dispatch_lib_dir, "",
	"Directory of the LiteRT dispatch library. If not set, the runtime "
	"will look for the library in the path defined as the environment "
	"variables.");
	// Lets the sampler manipulate the decode input tensors (tokens,
	// positions, mask) when the sampler supports it. On by default.
	ABSL_FLAG(bool, sampler_handles_input, true,
	"If true and the sampler supports, the sampler manipulates decode "
	"input tensors including tokens, positions, and mask.");
	// Convolution data type selector. Defaults to "auto"; the help text lists
	// auto, float, and int8 as the accepted values.
	ABSL_FLAG(std::string, conv_type, "auto",
	"Convolution data type. It can be auto, float, or int8. float will "
	"be either float32 or float16 depending on the activation data type. "
	"See --force_f32. int8 would have better latency with lower "
	"accuracy. auto will choose the best type based on the model.");
	// Restricts caching to compiled shaders only. Off by default, in which
	// case GPU graph info is cached as described in the help text.
	ABSL_FLAG(bool, cache_compiled_shaders_only, false,
	"If true, only the compiled shaders will be cached. If false, gpu "
	"graph info including work group sizes (and all compiled shaders "
	"depending on backend) will be cached.");
	// Regular expression constraining generated output. Empty by default.
	// NOTE(review): presumably an empty value means no constraint is applied;
	// the help text does not say -- confirm against the consumer.
	ABSL_FLAG(std::string, constraint_regex, "",
	"Regular expression to constrain the output generation.");
	// Selects the submodel when one is available. Off by default.
	ABSL_FLAG(bool, use_submodel, false,
	"Whether the submodel should be used if available.");
	// Speculative decoding switch. Off by default.
	ABSL_FLAG(bool, enable_speculative_decoding, false,
	"Whether to use speculative decoding.");