// Copyright 2024 The ODML Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef THIRD_PARTY_ODML_LITE_RT_LLM_EXECUTOR_LLM_EXECUTOR_SETTINGS_H_ #define THIRD_PARTY_ODML_LITE_RT_LLM_EXECUTOR_LLM_EXECUTOR_SETTINGS_H_ #include #include #include #include #include #include #include #include #include #include #include "absl/log/absl_log.h" // from @com_google_absl #include "absl/log/log.h" // from @com_google_absl #include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/str_cat.h" // from @com_google_absl #include "runtime/executor/executor_settings_base.h" namespace litert::lm { struct GpuArtisanConfig { // Number of output candidates. uint32_t num_output_candidates = 1; // Whether to wait for weight uploads before prefilling. bool wait_for_weight_uploads = false; // Number of decode steps per sync. Used by GPU only. uint32_t num_decode_steps_per_sync = 1; // Sequence batch size for encoding. Used by GPU only. Number of input // tokens to process at a time for batch processing. Setting this value to 1 // means both the encoding and decoding share the same graph of sequence // length of 1. Setting this value to 0 means the batch size will be // optimized programmatically. uint32_t sequence_batch_size = 0; // The supported lora ranks for the base model. Used by GPU only. By default // it will be empty, meaning not supporting any lora ranks. std::vector supported_lora_ranks = {}; // Maximum top k, which is the max Top-K value supported for all // sessions created with the engine, used by GPU only. If a session with // Top-K value larger than this is being asked to be created, it will be // rejected(throw error). The max top k will be 1, which means only greedy // decoding is supported for any sessions created with this engine. uint32_t max_top_k = 1; // Enables decode logits. // AiCore uses decode logits, so this is enabled for AiCore. // LLM Engine defaults to disabling decode logits. bool enable_decode_logits = false; // Enables external embeddings. // AiCore uses external embeddings, so this is enabled for AiCore. // LLM Engine defaults to disabling external embeddings. bool enable_external_embeddings = false; // Whether the submodel should be used if available. bool use_submodel = false; }; std::ostream& operator<<(std::ostream& os, const GpuArtisanConfig& config); struct GpuConfig { // Maximum top k, which is the max Top-K value supported for all // sessions created with the engine, used by GPU only. If a session with // Top-K value larger than this is being asked to be created, it will be // rejected(throw error). The default max top k will be 1, which // means only greedy decoding is supported for any sessions created with // this engine. uint32_t max_top_k = 1; // Whether to use external tensor mode. bool external_tensor_mode = false; }; std::ostream& operator<<(std::ostream& os, const GpuConfig& config); struct CpuConfig { // The increment size of the kv-cache. This is used by dynamically exported // models. Each time during decode, the kv-cache size is increased by this // size. uint32_t kv_increment_size = 16; // The maximum number of tokens to process in a single prefill chunk. This // setting is only applicable to dynamically exported models. Using smaller // chunk sizes can reduce peak memory usage and allow for more timely // cancellation of long input sequences. A value of -1 indicates that no // chunking is applied, and the entire prefill is processed at once. int prefill_chunk_size = -1; // Number of threads. The default value is 4. uint32_t number_of_threads = 4; }; std::ostream& operator<<(std::ostream& os, const CpuConfig& config); // Optional advanced settings for the LLM executor. struct AdvancedSettings { // Ordered set of the maximum number of prefill tokens processed at once when // the graph has dynamic prefill lengths. std::set prefill_batch_sizes; // The number of output candidates, or the decode batch size. int num_output_candidates = 1; // Whether to configure magic numbers when the model contains magic numbers. // Magic number for the context length will be replaced with max_num_tokens_ // in LlmExecutorSettings. // Magic numbers of the prefill lengths will be replaced with the // prefill_batch_sizes above with best match which means, a subgraph of // prefill will be chosen to have the smallest magic number greater than or // equal to the given prefill batch size. // The numbers that replaced magic numbers must be less than magic numbers. // Otherwise, default values less than magic numbers will be used that are // chosen by some heuristics. bool configure_magic_numbers = true; // Whether to verify magic numbers when the model contains magic numbers and // test signatures. // If true, the subgraphs replacing magic numbers with real dimensions must be // the same as or supersets of the subgraphs in test signatures of the same // dimensions. bool verify_magic_numbers = false; // Whether to clear kv cache before the first prefill step which may help to // disclose any issues related to kv cache. // When mask is in floating point and KV cache is not cleared, some // uninitialized values in KV cache, .e.g. NaN, may disrupt calculations // improperly. // Disable it if it's safe to keep the KV cache uninitialized, e.g. quantized, // so, they can't be NaN. bool clear_kv_cache_before_prefill = true; // For debugging purpose, the number of values at the beginning of logits, in // the middle of logits, and at the end of logits to print after each decode // step. If 0, disables printing logits. uint32_t num_logits_to_print_after_decode = 0; // If true, the GPU backend will madvise the original shared tensors after // use. bool gpu_madvise_original_shared_tensors = true; // If true, the executor is running a benchmark. bool is_benchmark = false; // Preferred WebGPU device name substring, case-insensitive. // If not empty, the adapter which the device name contains the substring will // be chosen. // If empty, the device will be determined by other factors. std::string preferred_device_substr; // Number of threads for WebGPU weight upload. -1 means it's determined by // the runtime. int num_threads_to_upload = -1; // Number of threads for WebGPU kernel shader compilation. -1 means it's // determined by the runtime. int num_threads_to_compile = -1; // If true, the executor will convert weights on GPU. // It is not supported by the all backends so this flag is ignored when using // non-OpenCL and non-WebGPU backends. bool convert_weights_on_gpu = true; // If false, the executor does not wait for weights conversion on GPU to // complete during benchmark. It's meaningful only when both is_benchmark and // convert_weights_on_gpu are true. bool wait_for_weights_conversion_complete_in_benchmark = true; // If true (by default), the executor enables Vulkan kernel shader // optimization. // Some GPU backends like Vulkan don't get much performance benefit from the // shader optimization but just increase initialization time with longer // shader compilation time. bool optimize_shader_compilation = true; // If true, the executor only cache the compiled shaders. If false, gpu graph // info including work group sizes (and compiled shaders depending on backend, // e.g. OpenCL includes compiled shaders, but WebGPU doesn't) will be cached. bool cache_compiled_shaders_only = false; // If true (by default), the executor enables constant tensor sharing. // Some GPU backends like Vulkan may degrade the performance when constant // tensor sharing is enabled. bool share_constant_tensors = true; // If true and the sampler supports, the sampler manipulates decode input // tensors including tokens, positions, and mask. bool sampler_handles_input = true; // If true, the executor allows src quantized fc conv ops on the GPU. // This feature is only supported by some GPUs. It can greatly improve // performance at the risk of reducing quality. std::optional allow_src_quantized_fc_conv_ops; // If true, the executor hints waiting for completion. This is to wait for all // the enqueued commands to be completed after each invoke. // This feature is only applied to the OpenCL backend and the goal is to fix // a known quality issue on AMD and Mali GPUs. // This flag is by default nullopt, which means the decision is made by the // runtime. // And for runtime, by default, it is false. But if we are running a Generic // model (most OSS models) on AMD or Mali GPU, we would set this flag to true. std::optional hint_waiting_for_completion; // If true, the GPU context priority will be set to low. // This flag is by default nullopt, which means the decision is made by the // runtime. // And for runtime, by default, it is false. If we are running a Generic model // (most OSS models), we would set this flag to true to ensure smooth UI. std::optional gpu_context_low_priority; // If true, the executor enables speculative decoding. bool enable_speculative_decoding = false; // If true, the executor disables delegate clustering. Can be useful for cases // where the default model delegate partitioning is not optimal. bool disable_delegate_clustering = false; bool operator==(const AdvancedSettings& other) const { return prefill_batch_sizes == other.prefill_batch_sizes && num_output_candidates == other.num_output_candidates && configure_magic_numbers == other.configure_magic_numbers && verify_magic_numbers == other.verify_magic_numbers && clear_kv_cache_before_prefill == other.clear_kv_cache_before_prefill && num_logits_to_print_after_decode == other.num_logits_to_print_after_decode && gpu_madvise_original_shared_tensors == other.gpu_madvise_original_shared_tensors && is_benchmark == other.is_benchmark && preferred_device_substr == other.preferred_device_substr && num_threads_to_upload == other.num_threads_to_upload && num_threads_to_compile == other.num_threads_to_compile && convert_weights_on_gpu == other.convert_weights_on_gpu && wait_for_weights_conversion_complete_in_benchmark == other.wait_for_weights_conversion_complete_in_benchmark && optimize_shader_compilation == other.optimize_shader_compilation && cache_compiled_shaders_only == other.cache_compiled_shaders_only && share_constant_tensors == other.share_constant_tensors && sampler_handles_input == other.sampler_handles_input && allow_src_quantized_fc_conv_ops == other.allow_src_quantized_fc_conv_ops && hint_waiting_for_completion == other.hint_waiting_for_completion && gpu_context_low_priority == other.gpu_context_low_priority && enable_speculative_decoding == other.enable_speculative_decoding && disable_delegate_clustering == other.disable_delegate_clustering; } }; std::ostream& operator<<(std::ostream& os, const AdvancedSettings& settings); // Settings for the LLM executor. // // This class holds the settings for the LLM executor, including the // model assets, cache directory, maximum number of tokens, backend, // activation data type, and backend-specific settings. // // The user should construct the class using ModelAssets and then set the // remaining settings using the setter APIs. class LlmExecutorSettings : public ExecutorSettingsBase { public: // Creates a LlmExecutorSettings with default values using the provided // ModelAssets. static absl::StatusOr CreateDefault( ModelAssets model_assets, Backend backend = Backend::CPU, std::optional sampler_backend = std::nullopt); uint32_t GetMaxNumTokens() const { return max_num_tokens_; } void SetMaxNumTokens(uint32_t max_num_tokens) { max_num_tokens_ = max_num_tokens; } uint32_t GetMaxNumImages() const { return max_num_images_; } void SetMaxNumImages(uint32_t max_num_images) { max_num_images_ = max_num_images; } uint32_t GetLoraRank() const { return lora_rank_; } void SetLoraRank(uint32_t lora_rank) { lora_rank_ = lora_rank; } template absl::StatusOr GetBackendConfig() const { if (std::holds_alternative(backend_config_)) { return std::get(backend_config_); } return absl::InvalidArgumentError("Backend config is not valid."); } template absl::StatusOr MutableBackendConfig() { if (std::holds_alternative(backend_config_)) { return std::get(backend_config_); } return absl::InvalidArgumentError("Backend config is not valid."); } void SetBackendConfig( const std::variant& config) { backend_config_ = config; } Backend GetSamplerBackend() const { return sampler_backend_; } void SetSamplerBackend(Backend sampler_backend) { sampler_backend_ = sampler_backend; } const std::optional& GetAdvancedSettings() const { return advanced_settings_; } void SetAdvancedSettings(const AdvancedSettings& advanced_settings) { advanced_settings_ = advanced_settings; } absl::Status SetSupportedLoraRanks(const std::vector& lora_ranks) { if (std::holds_alternative(backend_config_)) { std::get(backend_config_).supported_lora_ranks = lora_ranks; return absl::OkStatus(); } else if (!lora_ranks.empty()) { // If lora_ranks is not empty, but the backend is not GpuArtisanConfig, // we log a warning and ignore the lora ranks. LOG(ERROR) << "supported_lora_ranks is only supported for " "GpuArtisanConfig. The provided lora ranks will be " "ignored."; } return absl::OkStatus(); } private: explicit LlmExecutorSettings(ModelAssets model_assets) : ExecutorSettingsBase(std::move(model_assets)) {} // Maximum number of the sum of input and output tokens. It is equivalent to // the size of the kv-cache. uint32_t max_num_tokens_; // Maximum number of images the model can handle. uint32_t max_num_images_; // LoRA rank. 0 means LoRA is disabled. uint32_t lora_rank_ = 0; // Backend specific config. std::variant backend_config_; // Backend to use for sampling. Backend sampler_backend_ = Backend::UNSPECIFIED; // Optional advanced settings. std::optional advanced_settings_; // Declare the output stream operator as a friend such that it can be used // to print the LlmExecutorSettings private member. friend std::ostream& operator<<(std::ostream& os, const LlmExecutorSettings& config); }; std::ostream& operator<<(std::ostream& os, const LlmExecutorSettings& config); // Struct to host the runtime settings for the executor. // Settings will not be changed by the executor while executing task. // TODO: b/404279705 - Set default values in LLM Executor RuntimeConfig struct RuntimeConfig { // The number of output heads. // Multiple output heads might be supported in the future. For now, it is // always 1. std::optional output_heads; // The number of tokens per decode function call. std::optional tokens_per_decode; }; } // namespace litert::lm #endif // THIRD_PARTY_ODML_LITE_RT_LLM_EXECUTOR_LLM_EXECUTOR_SETTINGS_H_