// LiteRT-LM: runtime/executor/llm_executor_settings.h
// Copyright 2024 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_ODML_LITE_RT_LLM_EXECUTOR_LLM_EXECUTOR_SETTINGS_H_
#define THIRD_PARTY_ODML_LITE_RT_LLM_EXECUTOR_LLM_EXECUTOR_SETTINGS_H_
#include <cstdint>
#include <iostream>
#include <memory>
#include <optional>
#include <ostream>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <variant>
#include <vector>
#include "absl/log/absl_log.h" // from @com_google_absl
#include "absl/log/log.h" // from @com_google_absl
#include "absl/status/status.h" // from @com_google_absl
#include "absl/status/statusor.h" // from @com_google_absl
#include "absl/strings/str_cat.h" // from @com_google_absl
#include "runtime/executor/executor_settings_base.h"
namespace litert::lm {
// Configuration for the GPU "artisan" executor backend (used by AiCore-style
// deployments; see enable_decode_logits / enable_external_embeddings below).
struct GpuArtisanConfig {
  // Number of output candidates.
  uint32_t num_output_candidates = 1;
  // Whether to wait for weight uploads before prefilling.
  bool wait_for_weight_uploads = false;
  // Number of decode steps per sync. Used by GPU only.
  uint32_t num_decode_steps_per_sync = 1;
  // Sequence batch size for encoding. Used by GPU only. Number of input
  // tokens to process at a time for batch processing. Setting this value to 1
  // means both the encoding and decoding share the same graph of sequence
  // length of 1. Setting this value to 0 means the batch size will be
  // optimized programmatically.
  uint32_t sequence_batch_size = 0;
  // The supported LoRA ranks for the base model. Used by GPU only. By default
  // it is empty, meaning no LoRA ranks are supported.
  std::vector<uint32_t> supported_lora_ranks = {};
  // Maximum top k, which is the max Top-K value supported for all
  // sessions created with the engine. Used by GPU only. If a session with a
  // Top-K value larger than this is asked to be created, it will be
  // rejected (throws an error). The default max top k of 1 means only greedy
  // decoding is supported for any session created with this engine.
  uint32_t max_top_k = 1;
  // Enables decode logits.
  // AiCore uses decode logits, so this is enabled for AiCore.
  // LLM Engine defaults to disabling decode logits.
  bool enable_decode_logits = false;
  // Enables external embeddings.
  // AiCore uses external embeddings, so this is enabled for AiCore.
  // LLM Engine defaults to disabling external embeddings.
  bool enable_external_embeddings = false;
  // Whether the submodel should be used if available.
  bool use_submodel = false;
};

// Prints a human-readable representation of the config.
std::ostream& operator<<(std::ostream& os, const GpuArtisanConfig& config);
// Configuration for the (non-artisan) GPU executor backend.
struct GpuConfig {
  // Maximum top k, which is the max Top-K value supported for all
  // sessions created with the engine, used by GPU only. If a session with a
  // Top-K value larger than this is asked to be created, it will be
  // rejected (throws an error). The default max top k of 1 means only greedy
  // decoding is supported for any session created with this engine.
  uint32_t max_top_k = 1;
  // Whether to use external tensor mode.
  bool external_tensor_mode = false;
};

// Prints a human-readable representation of the config.
std::ostream& operator<<(std::ostream& os, const GpuConfig& config);
// Configuration for the CPU executor backend.
struct CpuConfig {
  // The increment size of the kv-cache. This is used by dynamically exported
  // models. Each time during decode, the kv-cache size is increased by this
  // size.
  uint32_t kv_increment_size = 16;
  // The maximum number of tokens to process in a single prefill chunk. This
  // setting is only applicable to dynamically exported models. Using smaller
  // chunk sizes can reduce peak memory usage and allow for more timely
  // cancellation of long input sequences. A value of -1 indicates that no
  // chunking is applied, and the entire prefill is processed at once.
  int prefill_chunk_size = -1;
  // Number of CPU threads used for inference. The default value is 4.
  uint32_t number_of_threads = 4;
};

// Prints a human-readable representation of the config.
std::ostream& operator<<(std::ostream& os, const CpuConfig& config);
// Optional advanced settings for the LLM executor.
struct AdvancedSettings {
// Ordered set of the maximum number of prefill tokens processed at once when
// the graph has dynamic prefill lengths.
std::set<int> prefill_batch_sizes;
// The number of output candidates, or the decode batch size.
int num_output_candidates = 1;
// Whether to configure magic numbers when the model contains magic numbers.
// Magic number for the context length will be replaced with max_num_tokens_
// in LlmExecutorSettings.
// Magic numbers of the prefill lengths will be replaced with the
// prefill_batch_sizes above with best match which means, a subgraph of
// prefill will be chosen to have the smallest magic number greater than or
// equal to the given prefill batch size.
// The numbers that replaced magic numbers must be less than magic numbers.
// Otherwise, default values less than magic numbers will be used that are
// chosen by some heuristics.
bool configure_magic_numbers = true;
// Whether to verify magic numbers when the model contains magic numbers and
// test signatures.
// If true, the subgraphs replacing magic numbers with real dimensions must be
// the same as or supersets of the subgraphs in test signatures of the same
// dimensions.
bool verify_magic_numbers = false;
// Whether to clear kv cache before the first prefill step which may help to
// disclose any issues related to kv cache.
// When mask is in floating point and KV cache is not cleared, some
// uninitialized values in KV cache, .e.g. NaN, may disrupt calculations
// improperly.
// Disable it if it's safe to keep the KV cache uninitialized, e.g. quantized,
// so, they can't be NaN.
bool clear_kv_cache_before_prefill = true;
// For debugging purpose, the number of values at the beginning of logits, in
// the middle of logits, and at the end of logits to print after each decode
// step. If 0, disables printing logits.
uint32_t num_logits_to_print_after_decode = 0;
// If true, the GPU backend will madvise the original shared tensors after
// use.
bool gpu_madvise_original_shared_tensors = true;
// If true, the executor is running a benchmark.
bool is_benchmark = false;
// Preferred WebGPU device name substring, case-insensitive.
// If not empty, the adapter which the device name contains the substring will
// be chosen.
// If empty, the device will be determined by other factors.
std::string preferred_device_substr;
// Number of threads for WebGPU weight upload. -1 means it's determined by
// the runtime.
int num_threads_to_upload = -1;
// Number of threads for WebGPU kernel shader compilation. -1 means it's
// determined by the runtime.
int num_threads_to_compile = -1;
// If true, the executor will convert weights on GPU.
// It is not supported by the all backends so this flag is ignored when using
// non-OpenCL and non-WebGPU backends.
bool convert_weights_on_gpu = true;
// If false, the executor does not wait for weights conversion on GPU to
// complete during benchmark. It's meaningful only when both is_benchmark and
// convert_weights_on_gpu are true.
bool wait_for_weights_conversion_complete_in_benchmark = true;
// If true (by default), the executor enables Vulkan kernel shader
// optimization.
// Some GPU backends like Vulkan don't get much performance benefit from the
// shader optimization but just increase initialization time with longer
// shader compilation time.
bool optimize_shader_compilation = true;
// If true, the executor only cache the compiled shaders. If false, gpu graph
// info including work group sizes (and compiled shaders depending on backend,
// e.g. OpenCL includes compiled shaders, but WebGPU doesn't) will be cached.
bool cache_compiled_shaders_only = false;
// If true (by default), the executor enables constant tensor sharing.
// Some GPU backends like Vulkan may degrade the performance when constant
// tensor sharing is enabled.
bool share_constant_tensors = true;
// If true and the sampler supports, the sampler manipulates decode input
// tensors including tokens, positions, and mask.
bool sampler_handles_input = true;
// If true, the executor allows src quantized fc conv ops on the GPU.
// This feature is only supported by some GPUs. It can greatly improve
// performance at the risk of reducing quality.
std::optional<bool> allow_src_quantized_fc_conv_ops;
// If true, the executor hints waiting for completion. This is to wait for all
// the enqueued commands to be completed after each invoke.
// This feature is only applied to the OpenCL backend and the goal is to fix
// a known quality issue on AMD and Mali GPUs.
// This flag is by default nullopt, which means the decision is made by the
// runtime.
// And for runtime, by default, it is false. But if we are running a Generic
// model (most OSS models) on AMD or Mali GPU, we would set this flag to true.
std::optional<bool> hint_waiting_for_completion;
// If true, the GPU context priority will be set to low.
// This flag is by default nullopt, which means the decision is made by the
// runtime.
// And for runtime, by default, it is false. If we are running a Generic model
// (most OSS models), we would set this flag to true to ensure smooth UI.
std::optional<bool> gpu_context_low_priority;
// If true, the executor enables speculative decoding.
bool enable_speculative_decoding = false;
// If true, the executor disables delegate clustering. Can be useful for cases
// where the default model delegate partitioning is not optimal.
bool disable_delegate_clustering = false;
bool operator==(const AdvancedSettings& other) const {
return prefill_batch_sizes == other.prefill_batch_sizes &&
num_output_candidates == other.num_output_candidates &&
configure_magic_numbers == other.configure_magic_numbers &&
verify_magic_numbers == other.verify_magic_numbers &&
clear_kv_cache_before_prefill ==
other.clear_kv_cache_before_prefill &&
num_logits_to_print_after_decode ==
other.num_logits_to_print_after_decode &&
gpu_madvise_original_shared_tensors ==
other.gpu_madvise_original_shared_tensors &&
is_benchmark == other.is_benchmark &&
preferred_device_substr == other.preferred_device_substr &&
num_threads_to_upload == other.num_threads_to_upload &&
num_threads_to_compile == other.num_threads_to_compile &&
convert_weights_on_gpu == other.convert_weights_on_gpu &&
wait_for_weights_conversion_complete_in_benchmark ==
other.wait_for_weights_conversion_complete_in_benchmark &&
optimize_shader_compilation == other.optimize_shader_compilation &&
cache_compiled_shaders_only == other.cache_compiled_shaders_only &&
share_constant_tensors == other.share_constant_tensors &&
sampler_handles_input == other.sampler_handles_input &&
allow_src_quantized_fc_conv_ops ==
other.allow_src_quantized_fc_conv_ops &&
hint_waiting_for_completion == other.hint_waiting_for_completion &&
gpu_context_low_priority == other.gpu_context_low_priority &&
enable_speculative_decoding == other.enable_speculative_decoding &&
disable_delegate_clustering == other.disable_delegate_clustering;
}
};
std::ostream& operator<<(std::ostream& os, const AdvancedSettings& settings);
// Settings for the LLM executor.
//
// This class holds the settings for the LLM executor, including the
// model assets, cache directory, maximum number of tokens, backend,
// activation data type, and backend-specific settings.
//
// The user should construct the class using ModelAssets (via CreateDefault)
// and then set the remaining settings using the setter APIs.
class LlmExecutorSettings : public ExecutorSettingsBase {
 public:
  // Creates a LlmExecutorSettings with default values using the provided
  // ModelAssets.
  static absl::StatusOr<LlmExecutorSettings> CreateDefault(
      ModelAssets model_assets, Backend backend = Backend::CPU,
      std::optional<Backend> sampler_backend = std::nullopt);

  // Maximum number of the sum of input and output tokens (kv-cache size).
  uint32_t GetMaxNumTokens() const { return max_num_tokens_; }
  void SetMaxNumTokens(uint32_t max_num_tokens) {
    max_num_tokens_ = max_num_tokens;
  }

  // Maximum number of images the model can handle.
  uint32_t GetMaxNumImages() const { return max_num_images_; }
  void SetMaxNumImages(uint32_t max_num_images) {
    max_num_images_ = max_num_images;
  }

  // LoRA rank. 0 means LoRA is disabled.
  uint32_t GetLoraRank() const { return lora_rank_; }
  void SetLoraRank(uint32_t lora_rank) { lora_rank_ = lora_rank; }

  // Returns a copy of the backend config if it currently holds type T,
  // otherwise an InvalidArgument error.
  template <typename T>
  absl::StatusOr<const T> GetBackendConfig() const {
    if (std::holds_alternative<T>(backend_config_)) {
      return std::get<T>(backend_config_);
    }
    return absl::InvalidArgumentError("Backend config is not valid.");
  }

  // Returns the backend config if it currently holds type T, otherwise an
  // InvalidArgument error. NOTE: despite the name, this returns the config by
  // value; mutate the returned copy and write it back via SetBackendConfig()
  // for the change to take effect.
  template <typename T>
  absl::StatusOr<T> MutableBackendConfig() {
    if (std::holds_alternative<T>(backend_config_)) {
      return std::get<T>(backend_config_);
    }
    return absl::InvalidArgumentError("Backend config is not valid.");
  }

  // Replaces the backend-specific config.
  void SetBackendConfig(
      const std::variant<GpuArtisanConfig, GpuConfig, CpuConfig>& config) {
    backend_config_ = config;
  }

  // Backend used for sampling.
  Backend GetSamplerBackend() const { return sampler_backend_; }
  void SetSamplerBackend(Backend sampler_backend) {
    sampler_backend_ = sampler_backend;
  }

  // Optional advanced settings; nullopt when never set.
  const std::optional<AdvancedSettings>& GetAdvancedSettings() const {
    return advanced_settings_;
  }
  void SetAdvancedSettings(const AdvancedSettings& advanced_settings) {
    advanced_settings_ = advanced_settings;
  }

  // Stores the supported LoRA ranks on the backend config. Only effective
  // when the backend config is GpuArtisanConfig; for other backends a
  // non-empty `lora_ranks` is logged as an error and ignored. Always returns
  // OkStatus.
  absl::Status SetSupportedLoraRanks(const std::vector<uint32_t>& lora_ranks) {
    if (std::holds_alternative<GpuArtisanConfig>(backend_config_)) {
      std::get<GpuArtisanConfig>(backend_config_).supported_lora_ranks =
          lora_ranks;
      return absl::OkStatus();
    } else if (!lora_ranks.empty()) {
      // If lora_ranks is not empty, but the backend is not GpuArtisanConfig,
      // we log an error and ignore the lora ranks.
      LOG(ERROR) << "supported_lora_ranks is only supported for "
                    "GpuArtisanConfig. The provided lora ranks will be "
                    "ignored.";
    }
    return absl::OkStatus();
  }

 private:
  explicit LlmExecutorSettings(ModelAssets model_assets)
      : ExecutorSettingsBase(std::move(model_assets)) {}

  // Maximum number of the sum of input and output tokens. It is equivalent to
  // the size of the kv-cache. Zero-initialized so a getter call before
  // CreateDefault()/SetMaxNumTokens() does not read an indeterminate value.
  uint32_t max_num_tokens_ = 0;
  // Maximum number of images the model can handle. Zero-initialized for the
  // same reason as max_num_tokens_.
  uint32_t max_num_images_ = 0;
  // LoRA rank. 0 means LoRA is disabled.
  uint32_t lora_rank_ = 0;
  // Backend specific config.
  std::variant<GpuArtisanConfig, GpuConfig, CpuConfig> backend_config_;
  // Backend to use for sampling.
  Backend sampler_backend_ = Backend::UNSPECIFIED;
  // Optional advanced settings.
  std::optional<AdvancedSettings> advanced_settings_;

  // Declare the output stream operator as a friend such that it can be used
  // to print the LlmExecutorSettings private members.
  friend std::ostream& operator<<(std::ostream& os,
                                  const LlmExecutorSettings& config);
};

// Prints a human-readable representation of the settings.
std::ostream& operator<<(std::ostream& os, const LlmExecutorSettings& config);
// Struct to host the runtime settings for the executor.
// Settings will not be changed by the executor while executing a task.
// TODO: b/404279705 - Set default values in LLM Executor RuntimeConfig
struct RuntimeConfig {
  // The number of output heads. Multiple output heads might be supported in
  // the future. For now, it is always 1. nullopt means unset.
  std::optional<int> output_heads;
  // The number of tokens produced per decode function call. nullopt means
  // unset.
  std::optional<int> tokens_per_decode;
};
} // namespace litert::lm
#endif // THIRD_PARTY_ODML_LITE_RT_LLM_EXECUTOR_LLM_EXECUTOR_SETTINGS_H_