Spaces:

FINAL-Bench
/

LiteRT-LM

Running

File size: 7,588 Bytes

5f923cd

// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "runtime/executor/llm_executor_settings.h"

#include <iostream>
#include <memory>
#include <optional>
#include <ostream>
#include <utility>
#include <variant>

#include "absl/status/status.h"  // from @com_google_absl
#include "absl/status/statusor.h"  // from @com_google_absl
#include "absl/strings/str_cat.h"  // from @com_google_absl
#include "absl/strings/str_join.h"  // from @com_google_absl
#include "runtime/executor/executor_settings_base.h"
#include "runtime/util/logging.h"
#include "runtime/util/status_macros.h"  // NOLINT

namespace litert::lm {

// Writes every field of a GpuArtisanConfig to `os`, one "name: value" pair per
// line, and returns `os` so that stream insertions can be chained.
std::ostream& operator<<(std::ostream& os, const GpuArtisanConfig& config) {
  // Emits a single "<label><value>\n" line.
  const auto print_field = [&os](const char* label, const auto& value) {
    os << label << value << "\n";
  };

  print_field("num_output_candidates: ", config.num_output_candidates);
  print_field("wait_for_weight_uploads: ", config.wait_for_weight_uploads);
  print_field("num_decode_steps_per_sync: ", config.num_decode_steps_per_sync);
  print_field("sequence_batch_size: ", config.sequence_batch_size);
  print_field("supported_lora_ranks: ", config.supported_lora_ranks);
  print_field("max_top_k: ", config.max_top_k);
  print_field("enable_decode_logits: ", config.enable_decode_logits);
  print_field("enable_external_embeddings: ",
              config.enable_external_embeddings);
  print_field("use_submodel: ", config.use_submodel);
  return os;
}

// Writes the single GpuConfig field (`max_top_k`) to `os` as a "name: value"
// line and returns `os` so that stream insertions can be chained.
std::ostream& operator<<(std::ostream& os, const GpuConfig& config) {
  return os << "max_top_k: " << config.max_top_k << "\n";
}

// Writes the three CpuConfig fields to `os`, one "name: value" pair per line,
// and returns `os` so that stream insertions can be chained.
std::ostream& operator<<(std::ostream& os, const CpuConfig& config) {
  os << "kv_increment_size: " << config.kv_increment_size << "\n"
     << "prefill_chunk_size: " << config.prefill_chunk_size << "\n"
     << "number_of_threads: " << config.number_of_threads << "\n";
  return os;
}

// Writes a human-readable dump of AdvancedSettings to `os`, one "name: value"
// pair per line. Optional fields that hold no value are reported as "Not set".
// Returns `os` so that stream insertions can be chained.
std::ostream& operator<<(std::ostream& os, const AdvancedSettings& settings) {
  // Emits a single "<label><value>\n" line.
  const auto print_field = [&os](const char* label, const auto& value) {
    os << label << value << "\n";
  };

  // The batch sizes are a sequence, so they are joined into a bracketed list.
  os << "prefill_batch_sizes: ["
     << absl::StrJoin(settings.prefill_batch_sizes, ", ") << "]\n";

  print_field("num_output_candidates: ", settings.num_output_candidates);
  print_field("configure_magic_numbers: ", settings.configure_magic_numbers);
  print_field("verify_magic_numbers: ", settings.verify_magic_numbers);
  print_field("clear_kv_cache_before_prefill: ",
              settings.clear_kv_cache_before_prefill);
  print_field("num_logits_to_print_after_decode: ",
              settings.num_logits_to_print_after_decode);
  print_field("gpu_madvise_original_shared_tensors: ",
              settings.gpu_madvise_original_shared_tensors);
  print_field("is_benchmark: ", settings.is_benchmark);
  print_field("preferred_device_substr: ", settings.preferred_device_substr);
  print_field("num_threads_to_upload: ", settings.num_threads_to_upload);
  print_field("num_threads_to_compile: ", settings.num_threads_to_compile);
  print_field("convert_weights_on_gpu: ", settings.convert_weights_on_gpu);
  print_field("wait_for_weights_conversion_complete_in_benchmark: ",
              settings.wait_for_weights_conversion_complete_in_benchmark);
  print_field("optimize_shader_compilation: ",
              settings.optimize_shader_compilation);
  print_field("cache_compiled_shaders_only: ",
              settings.cache_compiled_shaders_only);
  print_field("share_constant_tensors: ", settings.share_constant_tensors);
  print_field("sampler_handles_input: ", settings.sampler_handles_input);

  // The following three fields are optionals: print the contained value when
  // one is present, otherwise say so explicitly.
  if (const auto& allow_src_quantized =
          settings.allow_src_quantized_fc_conv_ops;
      !allow_src_quantized.has_value()) {
    os << "allow_src_quantized_fc_conv_ops: Not set\n";
  } else {
    os << "allow_src_quantized_fc_conv_ops: " << *allow_src_quantized << "\n";
  }
  if (const auto& hint_waiting = settings.hint_waiting_for_completion;
      !hint_waiting.has_value()) {
    os << "hint_waiting_for_completion: Not set\n";
  } else {
    os << "hint_waiting_for_completion: " << *hint_waiting << "\n";
  }
  if (const auto& low_priority = settings.gpu_context_low_priority;
      !low_priority.has_value()) {
    os << "gpu_context_low_priority: Not set\n";
  } else {
    os << "gpu_context_low_priority: " << *low_priority << "\n";
  }

  print_field("enable_speculative_decoding: ",
              settings.enable_speculative_decoding);
  print_field("disable_delegate_clustering: ",
              settings.disable_delegate_clustering);
  return os;
}

// Writes a human-readable dump of LlmExecutorSettings to `os`: the backend,
// the currently held backend config, the scalar limits, cache locations, model
// assets and the advanced settings. Values that are absent (no scoped cache
// file, empty dispatch lib dir, no advanced settings) are reported as
// "Not set". Returns `os` so that stream insertions can be chained.
std::ostream& operator<<(std::ostream& os, const LlmExecutorSettings& config) {
  os << "backend: " << config.GetBackend() << "\n";

  // Print whichever alternative the backend config variant currently holds.
  const auto print_backend_config = [&os](const auto& active_config) {
    os << "backend_config:\n" << active_config << "\n";
  };
  std::visit(print_backend_config, config.backend_config_);

  os << "max_tokens: " << config.GetMaxNumTokens() << "\n"
     << "activation_data_type: " << config.GetActivationDataType() << "\n"
     << "max_num_images: " << config.GetMaxNumImages() << "\n"
     << "lora_rank: " << config.GetLoraRank() << "\n"
     << "cache_dir: " << config.GetCacheDir() << "\n";

  if (const auto& scoped_cache_file = config.GetScopedCacheFile();
      scoped_cache_file) {
    os << "cache_file: " << scoped_cache_file->file() << "\n";
  } else {
    os << "cache_file: Not set\n";
  }

  if (const auto& dispatch_lib_dir = config.GetLitertDispatchLibDir();
      !dispatch_lib_dir.empty()) {
    os << "litert_dispatch_lib_dir: " << dispatch_lib_dir << "\n";
  } else {
    os << "litert_dispatch_lib_dir: Not set\n";
  }

  os << "model_assets: " << config.GetModelAssets() << "\n";

  if (const auto& advanced = config.GetAdvancedSettings();
      advanced.has_value()) {
    os << "advanced_settings: " << *advanced << "\n";
  } else {
    os << "advanced_settings: Not set\n";
  }
  return os;
}

// static
// Builds an LlmExecutorSettings for `model_assets` populated with the default
// values for `backend`.
//
// Backend-specific configuration applied here:
//   - CPU:         kv_increment_size = 16, prefill_chunk_size = -1,
//                  number_of_threads = 4.
//   - GPU:         max_top_k = 1.
//   - GPU_ARTISAN: a default-constructed GpuArtisanConfig.
//   - NPU:         no backend config is set by this function.
// Any other backend yields an InvalidArgumentError, and an error reported by
// SetBackend() is propagated to the caller.
//
// The maximum number of tokens, the maximum number of images and the LoRA rank
// are all initialized to 0. The sampler backend is only set when
// `sampler_backend` holds a value other than Backend::UNSPECIFIED.
absl::StatusOr<LlmExecutorSettings> LlmExecutorSettings::CreateDefault(
    ModelAssets model_assets, Backend backend,
    std::optional<Backend> sampler_backend) {
  LlmExecutorSettings settings(std::move(model_assets));

  switch (backend) {
    case Backend::CPU: {
      CpuConfig cpu_config;
      cpu_config.kv_increment_size = 16;
      cpu_config.prefill_chunk_size = -1;
      cpu_config.number_of_threads = 4;
      settings.SetBackendConfig(cpu_config);
      break;
    }
    case Backend::GPU: {
      GpuConfig gpu_config;
      // The GPU backend defaults max top k to 1.
      gpu_config.max_top_k = 1;
      settings.SetBackendConfig(gpu_config);
      break;
    }
    case Backend::NPU:
      // Accepted backend, but no backend config is set for it here.
      break;
    case Backend::GPU_ARTISAN:
      settings.SetBackendConfig(GpuArtisanConfig());
      break;
    default:
      return absl::InvalidArgumentError(
          absl::StrCat("Unsupported backend: ", backend));
  }
  RETURN_IF_ERROR(settings.SetBackend(backend));

  // Assign the field explicitly to avoid undefined behavior. A value of 0
  // means the maximum number of tokens is not set and could be inferred from
  // the model assets (note that a model or backend which does not support
  // this will report an error during initialization).
  settings.SetMaxNumTokens(0);
  // Image input is disabled by default.
  settings.SetMaxNumImages(0);
  // LoRA is disabled by default.
  settings.SetLoraRank(0);

  // An absent sampler backend is treated the same as an UNSPECIFIED one:
  // in both cases the sampler backend is left untouched.
  if (sampler_backend.value_or(Backend::UNSPECIFIED) != Backend::UNSPECIFIED) {
    settings.SetSamplerBackend(*sampler_backend);
  }
  return settings;
}

}  // namespace litert::lm