// Copyright 2024 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_ODML_LITE_RT_LLM_EXECUTOR_LLM_EXECUTOR_SETTINGS_H_
#define THIRD_PARTY_ODML_LITE_RT_LLM_EXECUTOR_LLM_EXECUTOR_SETTINGS_H_
#include <cstdint>
#include <iostream>
#include <memory>
#include <optional>
#include <ostream>
#include <set>
#include <string>
#include <utility>
#include <variant>
#include <vector>
#include "absl/log/absl_log.h" // from @com_google_absl
#include "absl/log/log.h" // from @com_google_absl
#include "absl/status/status.h" // from @com_google_absl
#include "absl/status/statusor.h" // from @com_google_absl
#include "absl/strings/str_cat.h" // from @com_google_absl
#include "runtime/executor/executor_settings_base.h"
namespace litert::lm {
struct GpuArtisanConfig {
// Number of output candidates.
uint32_t num_output_candidates = 1;
// Whether to wait for weight uploads before prefilling.
bool wait_for_weight_uploads = false;
// Number of decode steps per sync. Used by GPU only.
uint32_t num_decode_steps_per_sync = 1;
// Sequence batch size for encoding. Used by GPU only. This is the number of
// input tokens to process at a time for batch processing. Setting this value
// to 1 means encoding and decoding share the same graph with a sequence
// length of 1. Setting this value to 0 means the batch size will be chosen
// automatically.
uint32_t sequence_batch_size = 0;
// The supported LoRA ranks for the base model. Used by GPU only. By default
// it is empty, meaning no LoRA ranks are supported.
std::vector<uint32_t> supported_lora_ranks = {};
// Maximum Top-K value supported for all sessions created with the engine.
// Used by GPU only. Creating a session with a Top-K value larger than this
// will be rejected with an error. The default of 1 means only greedy
// decoding is supported for any sessions created with this engine.
uint32_t max_top_k = 1;
// Enables decode logits.
// AiCore uses decode logits, so this is enabled for AiCore.
// LLM Engine defaults to disabling decode logits.
bool enable_decode_logits = false;
// Enables external embeddings.
// AiCore uses external embeddings, so this is enabled for AiCore.
// LLM Engine defaults to disabling external embeddings.
bool enable_external_embeddings = false;
// Whether the submodel should be used if available.
bool use_submodel = false;
};
std::ostream& operator<<(std::ostream& os, const GpuArtisanConfig& config);
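// Illustrative sketch (not part of this header's API surface): populating a
// GpuArtisanConfig before passing it to LlmExecutorSettings::SetBackendConfig.
// The field values below are hypothetical.
//
//   GpuArtisanConfig artisan_config;
//   artisan_config.max_top_k = 40;                 // Allow Top-K up to 40.
//   artisan_config.supported_lora_ranks = {4, 8};  // Accept LoRA ranks 4, 8.
//   artisan_config.wait_for_weight_uploads = true;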
struct GpuConfig {
// Maximum Top-K value supported for all sessions created with the engine.
// Used by GPU only. Creating a session with a Top-K value larger than this
// will be rejected with an error. The default of 1 means only greedy
// decoding is supported for any sessions created with this engine.
uint32_t max_top_k = 1;
// Whether to use external tensor mode.
bool external_tensor_mode = false;
};
std::ostream& operator<<(std::ostream& os, const GpuConfig& config);
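// Illustrative sketch: a GpuConfig that permits sampled decoding. The value
// 40 is hypothetical.
//
//   GpuConfig gpu_config;
//   gpu_config.max_top_k = 40;  // Sessions may request Top-K up to 40.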
struct CpuConfig {
// The increment size of the kv-cache. This is used by dynamically exported
// models. During decode, the kv-cache size is increased in increments of
// this size.
uint32_t kv_increment_size = 16;
// The maximum number of tokens to process in a single prefill chunk. This
// setting is only applicable to dynamically exported models. Using smaller
// chunk sizes can reduce peak memory usage and allow for more timely
// cancellation of long input sequences. A value of -1 indicates that no
// chunking is applied, and the entire prefill is processed at once.
int prefill_chunk_size = -1;
// Number of threads. The default value is 4.
uint32_t number_of_threads = 4;
};
std::ostream& operator<<(std::ostream& os, const CpuConfig& config);
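// Illustrative sketch: a CpuConfig that bounds peak memory during prefill by
// chunking. The values below are hypothetical.
//
//   CpuConfig cpu_config;
//   cpu_config.prefill_chunk_size = 128;  // Prefill in 128-token chunks.
//   cpu_config.number_of_threads = 8;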
// Optional advanced settings for the LLM executor.
struct AdvancedSettings {
// Ordered set of the maximum number of prefill tokens processed at once when
// the graph has dynamic prefill lengths.
std::set<int> prefill_batch_sizes;
// The number of output candidates, or the decode batch size.
int num_output_candidates = 1;
// Whether to configure magic numbers when the model contains them.
// The magic number for the context length is replaced with max_num_tokens_
// from LlmExecutorSettings.
// Magic numbers for the prefill lengths are replaced with best matches from
// the prefill_batch_sizes above; that is, the prefill subgraph chosen is the
// one whose magic number is the smallest value greater than or equal to the
// given prefill batch size.
// The values substituted for magic numbers must be less than the magic
// numbers themselves; otherwise, heuristically chosen defaults smaller than
// the magic numbers are used instead.
bool configure_magic_numbers = true;
// Whether to verify magic numbers when the model contains magic numbers and
// test signatures.
// If true, the subgraphs replacing magic numbers with real dimensions must be
// the same as or supersets of the subgraphs in test signatures of the same
// dimensions.
bool verify_magic_numbers = false;
// Whether to clear the KV cache before the first prefill step, which may
// help surface any issues related to the KV cache.
// When the mask is floating point and the KV cache is not cleared,
// uninitialized values in the KV cache (e.g. NaN) may corrupt calculations.
// Disable this if it is safe to keep the KV cache uninitialized, e.g. when
// it is quantized and therefore cannot contain NaN.
bool clear_kv_cache_before_prefill = true;
// For debugging purposes, the number of values at the beginning, middle, and
// end of the logits to print after each decode step. If 0, logits printing
// is disabled.
uint32_t num_logits_to_print_after_decode = 0;
// If true, the GPU backend will madvise the original shared tensors after
// use.
bool gpu_madvise_original_shared_tensors = true;
// If true, the executor is running a benchmark.
bool is_benchmark = false;
// Preferred WebGPU device name substring, case-insensitive.
// If not empty, the adapter whose device name contains the substring will be
// chosen.
// If empty, the device is determined by other factors.
std::string preferred_device_substr;
// Number of threads for WebGPU weight upload. -1 means it's determined by
// the runtime.
int num_threads_to_upload = -1;
// Number of threads for WebGPU kernel shader compilation. -1 means it's
// determined by the runtime.
int num_threads_to_compile = -1;
// If true, the executor will convert weights on the GPU.
// Not all backends support this, so the flag is ignored for backends other
// than OpenCL and WebGPU.
bool convert_weights_on_gpu = true;
// If false, the executor does not wait for weights conversion on GPU to
// complete during benchmark. It's meaningful only when both is_benchmark and
// convert_weights_on_gpu are true.
bool wait_for_weights_conversion_complete_in_benchmark = true;
// If true (by default), the executor enables kernel shader optimization.
// Some GPU backends, such as Vulkan, gain little performance from the shader
// optimization and only pay for it with longer shader compilation and thus
// longer initialization time.
bool optimize_shader_compilation = true;
// If true, the executor caches only the compiled shaders. If false, GPU
// graph info including work group sizes is cached, plus compiled shaders
// depending on the backend (e.g. OpenCL includes compiled shaders, but
// WebGPU does not).
bool cache_compiled_shaders_only = false;
// If true (by default), the executor enables constant tensor sharing.
// Some GPU backends like Vulkan may degrade the performance when constant
// tensor sharing is enabled.
bool share_constant_tensors = true;
// If true and the sampler supports it, the sampler manipulates the decode
// input tensors, including tokens, positions, and mask.
bool sampler_handles_input = true;
// If true, the executor allows src quantized fc conv ops on the GPU.
// This feature is only supported by some GPUs. It can greatly improve
// performance at the risk of reducing quality.
std::optional<bool> allow_src_quantized_fc_conv_ops;
// If true, the executor hints the backend to wait for completion, i.e. to
// wait for all enqueued commands to finish after each invoke.
// This only applies to the OpenCL backend; its goal is to fix a known
// quality issue on AMD and Mali GPUs.
// This flag defaults to nullopt, which means the runtime decides. The
// runtime defaults to false, but sets it to true when running a Generic
// model (most OSS models) on an AMD or Mali GPU.
std::optional<bool> hint_waiting_for_completion;
// If true, the GPU context priority will be set to low.
// This flag defaults to nullopt, which means the runtime decides. The
// runtime defaults to false, but sets it to true when running a Generic
// model (most OSS models) to keep the UI smooth.
std::optional<bool> gpu_context_low_priority;
// If true, the executor enables speculative decoding.
bool enable_speculative_decoding = false;
// If true, the executor disables delegate clustering. Can be useful for cases
// where the default model delegate partitioning is not optimal.
bool disable_delegate_clustering = false;
bool operator==(const AdvancedSettings& other) const {
return prefill_batch_sizes == other.prefill_batch_sizes &&
num_output_candidates == other.num_output_candidates &&
configure_magic_numbers == other.configure_magic_numbers &&
verify_magic_numbers == other.verify_magic_numbers &&
clear_kv_cache_before_prefill ==
other.clear_kv_cache_before_prefill &&
num_logits_to_print_after_decode ==
other.num_logits_to_print_after_decode &&
gpu_madvise_original_shared_tensors ==
other.gpu_madvise_original_shared_tensors &&
is_benchmark == other.is_benchmark &&
preferred_device_substr == other.preferred_device_substr &&
num_threads_to_upload == other.num_threads_to_upload &&
num_threads_to_compile == other.num_threads_to_compile &&
convert_weights_on_gpu == other.convert_weights_on_gpu &&
wait_for_weights_conversion_complete_in_benchmark ==
other.wait_for_weights_conversion_complete_in_benchmark &&
optimize_shader_compilation == other.optimize_shader_compilation &&
cache_compiled_shaders_only == other.cache_compiled_shaders_only &&
share_constant_tensors == other.share_constant_tensors &&
sampler_handles_input == other.sampler_handles_input &&
allow_src_quantized_fc_conv_ops ==
other.allow_src_quantized_fc_conv_ops &&
hint_waiting_for_completion == other.hint_waiting_for_completion &&
gpu_context_low_priority == other.gpu_context_low_priority &&
enable_speculative_decoding == other.enable_speculative_decoding &&
disable_delegate_clustering == other.disable_delegate_clustering;
}
};
std::ostream& operator<<(std::ostream& os, const AdvancedSettings& settings);
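// Illustrative sketch: advanced settings that enable several dynamic prefill
// batch sizes and print a few logits after each decode step for debugging.
// All values are hypothetical, and `settings` stands for an
// LlmExecutorSettings instance (declared below).
//
//   AdvancedSettings advanced;
//   advanced.prefill_batch_sizes = {32, 128, 512};
//   advanced.num_logits_to_print_after_decode = 8;
//   settings.SetAdvancedSettings(advanced);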
// Settings for the LLM executor.
//
// This class holds the settings for the LLM executor, including the
// model assets, cache directory, maximum number of tokens, backend,
// activation data type, and backend-specific settings.
//
// The user should construct the class using ModelAssets and then set the
// remaining settings using the setter APIs.
class LlmExecutorSettings : public ExecutorSettingsBase {
public:
// Creates a LlmExecutorSettings with default values using the provided
// ModelAssets.
static absl::StatusOr<LlmExecutorSettings> CreateDefault(
ModelAssets model_assets, Backend backend = Backend::CPU,
std::optional<Backend> sampler_backend = std::nullopt);
uint32_t GetMaxNumTokens() const { return max_num_tokens_; }
void SetMaxNumTokens(uint32_t max_num_tokens) {
max_num_tokens_ = max_num_tokens;
}
uint32_t GetMaxNumImages() const { return max_num_images_; }
void SetMaxNumImages(uint32_t max_num_images) {
max_num_images_ = max_num_images;
}
uint32_t GetLoraRank() const { return lora_rank_; }
void SetLoraRank(uint32_t lora_rank) { lora_rank_ = lora_rank; }
// Returns a copy of the backend config of type T, or an
// InvalidArgumentError if the currently held config has a different type.
template <typename T>
absl::StatusOr<const T> GetBackendConfig() const {
if (std::holds_alternative<T>(backend_config_)) {
return std::get<T>(backend_config_);
}
return absl::InvalidArgumentError("Backend config is not valid.");
}
// Returns a mutable pointer into the stored backend config of type T, or an
// InvalidArgumentError if the currently held config has a different type.
template <typename T>
absl::StatusOr<T*> MutableBackendConfig() {
if (std::holds_alternative<T>(backend_config_)) {
return &std::get<T>(backend_config_);
}
return absl::InvalidArgumentError("Backend config is not valid.");
}
void SetBackendConfig(
const std::variant<GpuArtisanConfig, GpuConfig, CpuConfig>& config) {
backend_config_ = config;
}
Backend GetSamplerBackend() const { return sampler_backend_; }
void SetSamplerBackend(Backend sampler_backend) {
sampler_backend_ = sampler_backend;
}
const std::optional<AdvancedSettings>& GetAdvancedSettings() const {
return advanced_settings_;
}
void SetAdvancedSettings(const AdvancedSettings& advanced_settings) {
advanced_settings_ = advanced_settings;
}
absl::Status SetSupportedLoraRanks(const std::vector<uint32_t>& lora_ranks) {
if (std::holds_alternative<GpuArtisanConfig>(backend_config_)) {
std::get<GpuArtisanConfig>(backend_config_).supported_lora_ranks =
lora_ranks;
return absl::OkStatus();
} else if (!lora_ranks.empty()) {
// If lora_ranks is not empty but the backend config is not
// GpuArtisanConfig, log an error and ignore the lora ranks.
LOG(ERROR) << "supported_lora_ranks is only supported for "
"GpuArtisanConfig. The provided lora ranks will be "
"ignored.";
}
return absl::OkStatus();
}
private:
explicit LlmExecutorSettings(ModelAssets model_assets)
: ExecutorSettingsBase(std::move(model_assets)) {}
// Maximum total number of input and output tokens. This is equivalent to
// the size of the kv-cache.
uint32_t max_num_tokens_;
// Maximum number of images the model can handle.
uint32_t max_num_images_;
// LoRA rank. 0 means LoRA is disabled.
uint32_t lora_rank_ = 0;
// Backend specific config.
std::variant<GpuArtisanConfig, GpuConfig, CpuConfig> backend_config_;
// Backend to use for sampling.
Backend sampler_backend_ = Backend::UNSPECIFIED;
// Optional advanced settings.
std::optional<AdvancedSettings> advanced_settings_;
// Declare the output stream operator as a friend so that it can print the
// private members of LlmExecutorSettings.
friend std::ostream& operator<<(std::ostream& os,
const LlmExecutorSettings& config);
};
std::ostream& operator<<(std::ostream& os, const LlmExecutorSettings& config);
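// Illustrative usage sketch, assuming a valid ModelAssets instance named
// `assets`; the token and thread counts are hypothetical.
//
//   absl::StatusOr<LlmExecutorSettings> settings =
//       LlmExecutorSettings::CreateDefault(std::move(assets), Backend::CPU);
//   if (settings.ok()) {
//     settings->SetMaxNumTokens(2048);  // kv-cache sized for 2048 tokens.
//     CpuConfig cpu_config;
//     cpu_config.number_of_threads = 8;
//     settings->SetBackendConfig(cpu_config);
//     // Read the config back; fails if a different config type is held.
//     auto current = settings->GetBackendConfig<CpuConfig>();
//   }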
// Struct to hold the runtime settings for the executor.
// Settings will not be changed by the executor while executing a task.
// TODO: b/404279705 - Set default values in LLM Executor RuntimeConfig
struct RuntimeConfig {
// The number of output heads.
// Multiple output heads might be supported in the future. For now, it is
// always 1.
std::optional<int> output_heads;
// The number of tokens per decode function call.
std::optional<int> tokens_per_decode;
};
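// Illustrative sketch: a RuntimeConfig with the single output head that is
// currently supported. The tokens_per_decode value is hypothetical.
//
//   RuntimeConfig runtime_config;
//   runtime_config.output_heads = 1;  // Only 1 is supported for now.
//   runtime_config.tokens_per_decode = 1;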
} // namespace litert::lm
#endif // THIRD_PARTY_ODML_LITE_RT_LLM_EXECUTOR_LLM_EXECUTOR_SETTINGS_H_