// LiteRT-LM: runtime/executor/llm_executor_settings.cc
// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "runtime/executor/llm_executor_settings.h"
#include <iostream>
#include <memory>
#include <optional>
#include <ostream>
#include <utility>
#include <variant>
#include "absl/status/status.h" // from @com_google_absl
#include "absl/status/statusor.h" // from @com_google_absl
#include "absl/strings/str_cat.h" // from @com_google_absl
#include "absl/strings/str_join.h" // from @com_google_absl
#include "runtime/executor/executor_settings_base.h"
#include "runtime/util/logging.h"
#include "runtime/util/status_macros.h" // NOLINT
namespace litert::lm {
// Prints each GpuArtisanConfig field as a "name: value" line.
std::ostream& operator<<(std::ostream& os, const GpuArtisanConfig& config) {
  return os << "num_output_candidates: " << config.num_output_candidates
            << "\n"
            << "wait_for_weight_uploads: " << config.wait_for_weight_uploads
            << "\n"
            << "num_decode_steps_per_sync: "
            << config.num_decode_steps_per_sync << "\n"
            << "sequence_batch_size: " << config.sequence_batch_size << "\n"
            << "supported_lora_ranks: " << config.supported_lora_ranks << "\n"
            << "max_top_k: " << config.max_top_k << "\n"
            << "enable_decode_logits: " << config.enable_decode_logits << "\n"
            << "enable_external_embeddings: "
            << config.enable_external_embeddings << "\n"
            << "use_submodel: " << config.use_submodel << "\n";
}
// Prints the GpuConfig as a single "name: value" line.
std::ostream& operator<<(std::ostream& os, const GpuConfig& config) {
  return os << "max_top_k: " << config.max_top_k << "\n";
}
// Prints each CpuConfig field as a "name: value" line.
std::ostream& operator<<(std::ostream& os, const CpuConfig& config) {
  return os << "kv_increment_size: " << config.kv_increment_size << "\n"
            << "prefill_chunk_size: " << config.prefill_chunk_size << "\n"
            << "number_of_threads: " << config.number_of_threads << "\n";
}
// Prints each AdvancedSettings field as a "name: value" line. Optional
// fields print their value when present, "Not set" otherwise.
std::ostream& operator<<(std::ostream& os, const AdvancedSettings& settings) {
  // Shared printer for std::optional fields; replaces three copy-pasted
  // has_value()/else stanzas with one definition.
  const auto stream_optional = [&os](const char* name, const auto& field) {
    if (field.has_value()) {
      os << name << ": " << field.value() << "\n";
    } else {
      os << name << ": Not set\n";
    }
  };
  os << "prefill_batch_sizes: ["
     << absl::StrJoin(settings.prefill_batch_sizes, ", ") << "]\n";
  os << "num_output_candidates: " << settings.num_output_candidates << "\n";
  os << "configure_magic_numbers: " << settings.configure_magic_numbers << "\n";
  os << "verify_magic_numbers: " << settings.verify_magic_numbers << "\n";
  os << "clear_kv_cache_before_prefill: "
     << settings.clear_kv_cache_before_prefill << "\n";
  os << "num_logits_to_print_after_decode: "
     << settings.num_logits_to_print_after_decode << "\n";
  os << "gpu_madvise_original_shared_tensors: "
     << settings.gpu_madvise_original_shared_tensors << "\n";
  os << "is_benchmark: " << settings.is_benchmark << "\n";
  os << "preferred_device_substr: " << settings.preferred_device_substr << "\n";
  os << "num_threads_to_upload: " << settings.num_threads_to_upload << "\n";
  os << "num_threads_to_compile: " << settings.num_threads_to_compile << "\n";
  os << "convert_weights_on_gpu: " << settings.convert_weights_on_gpu << "\n";
  os << "wait_for_weights_conversion_complete_in_benchmark: "
     << settings.wait_for_weights_conversion_complete_in_benchmark << "\n";
  os << "optimize_shader_compilation: " << settings.optimize_shader_compilation
     << "\n";
  os << "cache_compiled_shaders_only: " << settings.cache_compiled_shaders_only
     << "\n";
  os << "share_constant_tensors: " << settings.share_constant_tensors << "\n";
  os << "sampler_handles_input: " << settings.sampler_handles_input << "\n";
  stream_optional("allow_src_quantized_fc_conv_ops",
                  settings.allow_src_quantized_fc_conv_ops);
  stream_optional("hint_waiting_for_completion",
                  settings.hint_waiting_for_completion);
  stream_optional("gpu_context_low_priority",
                  settings.gpu_context_low_priority);
  os << "enable_speculative_decoding: " << settings.enable_speculative_decoding
     << "\n";
  os << "disable_delegate_clustering: " << settings.disable_delegate_clustering
     << "\n";
  return os;
}
// Prints the full executor settings: backend, the active backend config
// (dispatched through the variant), and every scalar/optional field.
std::ostream& operator<<(std::ostream& os, const LlmExecutorSettings& config) {
  os << "backend: " << config.GetBackend() << "\n";
  std::visit(
      [&os](const auto& cfg) { os << "backend_config:\n" << cfg << "\n"; },
      config.backend_config_);
  os << "max_tokens: " << config.GetMaxNumTokens() << "\n"
     << "activation_data_type: " << config.GetActivationDataType() << "\n"
     << "max_num_images: " << config.GetMaxNumImages() << "\n"
     << "lora_rank: " << config.GetLoraRank() << "\n"
     << "cache_dir: " << config.GetCacheDir() << "\n";
  if (!config.GetScopedCacheFile()) {
    os << "cache_file: Not set\n";
  } else {
    os << "cache_file: " << config.GetScopedCacheFile()->file() << "\n";
  }
  if (!config.GetLitertDispatchLibDir().empty()) {
    os << "litert_dispatch_lib_dir: " << config.GetLitertDispatchLibDir()
       << "\n";
  } else {
    os << "litert_dispatch_lib_dir: Not set\n";
  }
  os << "model_assets: " << config.GetModelAssets() << "\n";
  if (!config.GetAdvancedSettings().has_value()) {
    os << "advanced_settings: Not set\n";
  } else {
    os << "advanced_settings: " << *config.GetAdvancedSettings() << "\n";
  }
  return os;
}
// static
// Builds an LlmExecutorSettings seeded with per-backend default configs.
// Returns InvalidArgumentError for backends with no defaults defined here.
// When `sampler_backend` is provided and not UNSPECIFIED, it is forwarded
// to the settings; otherwise the sampler backend is left untouched.
absl::StatusOr<LlmExecutorSettings> LlmExecutorSettings::CreateDefault(
    ModelAssets model_assets, Backend backend,
    std::optional<Backend> sampler_backend) {
  LlmExecutorSettings settings(std::move(model_assets));
  switch (backend) {
    case Backend::CPU: {
      CpuConfig config;
      config.kv_increment_size = 16;
      config.prefill_chunk_size = -1;
      config.number_of_threads = 4;
      settings.SetBackendConfig(config);
      break;
    }
    case Backend::GPU: {
      GpuConfig config;
      // Default max top k to 1 for GPU.
      config.max_top_k = 1;
      settings.SetBackendConfig(config);
      break;
    }
    case Backend::NPU:
      // No default backend config is set for NPU.
      break;
    case Backend::GPU_ARTISAN:
      settings.SetBackendConfig(GpuArtisanConfig());
      break;
    default:
      return absl::InvalidArgumentError(
          absl::StrCat("Unsupported backend: ", backend));
  }
  RETURN_IF_ERROR(settings.SetBackend(backend));
  // Explicitly set the field value to avoid undefined behavior. Setting to 0
  // means that the maximum number of tokens is not set and could be inferred
  // from the model assets (but note that for the model or backend which does
  // not support this, an error will be thrown during initialization).
  settings.SetMaxNumTokens(0);
  // Disable image input by default.
  settings.SetMaxNumImages(0);
  // Disable LoRA by default.
  settings.SetLoraRank(0);
  if (sampler_backend.has_value() && *sampler_backend != Backend::UNSPECIFIED) {
    settings.SetSamplerBackend(*sampler_backend);
  }
  return settings;
}
} // namespace litert::lm