// LiteRT-LM: runtime/executor/llm_executor_settings.cc
// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "runtime/executor/llm_executor_settings.h"
#include <iostream>
#include <memory>
#include <optional>
#include <ostream>
#include <utility>
#include <variant>
#include "absl/status/status.h" // from @com_google_absl
#include "absl/status/statusor.h" // from @com_google_absl
#include "absl/strings/str_cat.h" // from @com_google_absl
#include "absl/strings/str_join.h" // from @com_google_absl
#include "runtime/executor/executor_settings_base.h"
#include "runtime/util/logging.h"
#include "runtime/util/status_macros.h" // NOLINT
namespace litert::lm {
// Prints each GpuArtisanConfig field as a "name: value" line.
std::ostream& operator<<(std::ostream& os, const GpuArtisanConfig& config) {
  return os << "num_output_candidates: " << config.num_output_candidates
            << "\n"
            << "wait_for_weight_uploads: " << config.wait_for_weight_uploads
            << "\n"
            << "num_decode_steps_per_sync: "
            << config.num_decode_steps_per_sync << "\n"
            << "sequence_batch_size: " << config.sequence_batch_size << "\n"
            << "supported_lora_ranks: " << config.supported_lora_ranks << "\n"
            << "max_top_k: " << config.max_top_k << "\n"
            << "enable_decode_logits: " << config.enable_decode_logits << "\n"
            << "enable_external_embeddings: "
            << config.enable_external_embeddings << "\n"
            << "use_submodel: " << config.use_submodel << "\n";
}
// Prints the GpuConfig as a single "name: value" line.
std::ostream& operator<<(std::ostream& os, const GpuConfig& config) {
  return os << "max_top_k: " << config.max_top_k << "\n";
}
// Prints each CpuConfig field as a "name: value" line.
std::ostream& operator<<(std::ostream& os, const CpuConfig& config) {
  return os << "kv_increment_size: " << config.kv_increment_size << "\n"
            << "prefill_chunk_size: " << config.prefill_chunk_size << "\n"
            << "number_of_threads: " << config.number_of_threads << "\n";
}
// Prints each AdvancedSettings field as a "name: value" line. Optional
// fields print their value when present, "Not set" otherwise.
std::ostream& operator<<(std::ostream& os, const AdvancedSettings& settings) {
  // Shared printer for std::optional fields; replaces three copy-pasted
  // has_value()/else stanzas with one definition.
  const auto stream_optional = [&os](const char* name, const auto& field) {
    if (field.has_value()) {
      os << name << ": " << field.value() << "\n";
    } else {
      os << name << ": Not set\n";
    }
  };
  os << "prefill_batch_sizes: ["
     << absl::StrJoin(settings.prefill_batch_sizes, ", ") << "]\n";
  os << "num_output_candidates: " << settings.num_output_candidates << "\n";
  os << "configure_magic_numbers: " << settings.configure_magic_numbers << "\n";
  os << "verify_magic_numbers: " << settings.verify_magic_numbers << "\n";
  os << "clear_kv_cache_before_prefill: "
     << settings.clear_kv_cache_before_prefill << "\n";
  os << "num_logits_to_print_after_decode: "
     << settings.num_logits_to_print_after_decode << "\n";
  os << "gpu_madvise_original_shared_tensors: "
     << settings.gpu_madvise_original_shared_tensors << "\n";
  os << "is_benchmark: " << settings.is_benchmark << "\n";
  os << "preferred_device_substr: " << settings.preferred_device_substr << "\n";
  os << "num_threads_to_upload: " << settings.num_threads_to_upload << "\n";
  os << "num_threads_to_compile: " << settings.num_threads_to_compile << "\n";
  os << "convert_weights_on_gpu: " << settings.convert_weights_on_gpu << "\n";
  os << "wait_for_weights_conversion_complete_in_benchmark: "
     << settings.wait_for_weights_conversion_complete_in_benchmark << "\n";
  os << "optimize_shader_compilation: " << settings.optimize_shader_compilation
     << "\n";
  os << "cache_compiled_shaders_only: " << settings.cache_compiled_shaders_only
     << "\n";
  os << "share_constant_tensors: " << settings.share_constant_tensors << "\n";
  os << "sampler_handles_input: " << settings.sampler_handles_input << "\n";
  stream_optional("allow_src_quantized_fc_conv_ops",
                  settings.allow_src_quantized_fc_conv_ops);
  stream_optional("hint_waiting_for_completion",
                  settings.hint_waiting_for_completion);
  stream_optional("gpu_context_low_priority",
                  settings.gpu_context_low_priority);
  os << "enable_speculative_decoding: " << settings.enable_speculative_decoding
     << "\n";
  os << "disable_delegate_clustering: " << settings.disable_delegate_clustering
     << "\n";
  return os;
}
// Prints the full executor settings: backend, the active backend config
// (dispatched through the variant), and every scalar/optional field.
std::ostream& operator<<(std::ostream& os, const LlmExecutorSettings& config) {
  os << "backend: " << config.GetBackend() << "\n";
  std::visit(
      [&os](const auto& cfg) { os << "backend_config:\n" << cfg << "\n"; },
      config.backend_config_);
  os << "max_tokens: " << config.GetMaxNumTokens() << "\n"
     << "activation_data_type: " << config.GetActivationDataType() << "\n"
     << "max_num_images: " << config.GetMaxNumImages() << "\n"
     << "lora_rank: " << config.GetLoraRank() << "\n"
     << "cache_dir: " << config.GetCacheDir() << "\n";
  if (!config.GetScopedCacheFile()) {
    os << "cache_file: Not set\n";
  } else {
    os << "cache_file: " << config.GetScopedCacheFile()->file() << "\n";
  }
  if (!config.GetLitertDispatchLibDir().empty()) {
    os << "litert_dispatch_lib_dir: " << config.GetLitertDispatchLibDir()
       << "\n";
  } else {
    os << "litert_dispatch_lib_dir: Not set\n";
  }
  os << "model_assets: " << config.GetModelAssets() << "\n";
  if (!config.GetAdvancedSettings().has_value()) {
    os << "advanced_settings: Not set\n";
  } else {
    os << "advanced_settings: " << *config.GetAdvancedSettings() << "\n";
  }
  return os;
}
// static
// Builds an LlmExecutorSettings seeded with per-backend default configs.
// Returns InvalidArgumentError for backends with no defaults defined here.
// When `sampler_backend` is provided and not UNSPECIFIED, it is forwarded
// to the settings; otherwise the sampler backend is left untouched.
absl::StatusOr<LlmExecutorSettings> LlmExecutorSettings::CreateDefault(
    ModelAssets model_assets, Backend backend,
    std::optional<Backend> sampler_backend) {
  LlmExecutorSettings settings(std::move(model_assets));
  switch (backend) {
    case Backend::CPU: {
      CpuConfig config;
      config.kv_increment_size = 16;
      config.prefill_chunk_size = -1;
      config.number_of_threads = 4;
      settings.SetBackendConfig(config);
      break;
    }
    case Backend::GPU: {
      GpuConfig config;
      // Default max top k to 1 for GPU.
      config.max_top_k = 1;
      settings.SetBackendConfig(config);
      break;
    }
    case Backend::NPU:
      // No default backend config is set for NPU.
      break;
    case Backend::GPU_ARTISAN:
      settings.SetBackendConfig(GpuArtisanConfig());
      break;
    default:
      return absl::InvalidArgumentError(
          absl::StrCat("Unsupported backend: ", backend));
  }
  RETURN_IF_ERROR(settings.SetBackend(backend));
  // Explicitly set the field value to avoid undefined behavior. Setting to 0
  // means that the maximum number of tokens is not set and could be inferred
  // from the model assets (but note that for the model or backend which does
  // not support this, an error will be thrown during initialization).
  settings.SetMaxNumTokens(0);
  // Disable image input by default.
  settings.SetMaxNumImages(0);
  // Disable LoRA by default.
  settings.SetLoraRank(0);
  if (sampler_backend.has_value() && *sampler_backend != Backend::UNSPECIFIED) {
    settings.SetSamplerBackend(*sampler_backend);
  }
  return settings;
}
} // namespace litert::lm