Spaces:
Running
Running
| // Copyright 2025 The ODML Authors. | |
| // | |
| // Licensed under the Apache License, Version 2.0 (the "License"); | |
| // you may not use this file except in compliance with the License. | |
| // You may obtain a copy of the License at | |
| // | |
| // http://www.apache.org/licenses/LICENSE-2.0 | |
| // | |
| // Unless required by applicable law or agreed to in writing, software | |
| // distributed under the License is distributed on an "AS IS" BASIS, | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| // See the License for the specific language governing permissions and | |
| // limitations under the License. | |
namespace litert::lm {
namespace {
// Default number of threads for WebGPU weight upload and kernel compilation.
// Used by CreateCompilationOptions when AdvancedSettings does not override
// the thread counts (i.e. num_threads_to_upload/compile is negative).
constexpr int kDefaultNumThreadsToUpload = 2;
constexpr int kDefaultNumThreadsToCompile = 1;
}  // namespace
| absl::StatusOr<Backend> GetSamplerBackend( | |
| const LlmExecutorSettings& executor_settings) { | |
| Backend backend = executor_settings.GetBackend(); | |
| Backend sampler_backend = executor_settings.GetSamplerBackend(); | |
| if (sampler_backend == Backend::UNSPECIFIED) { | |
| sampler_backend = backend; | |
| } | |
| if (sampler_backend != Backend::CPU && sampler_backend != Backend::GPU) { | |
| return absl::InvalidArgumentError( | |
| absl::StrCat("Unsupported sampler backend: ", sampler_backend, | |
| " for backend: ", backend)); | |
| } | |
| return sampler_backend; | |
| } | |
| absl::StatusOr<litert::Options> CreateCompilationOptions( | |
| const LlmExecutorSettings& executor_settings, | |
| const ActivationDataType& activation_data_type, | |
| std::optional<ModelSignatures*> signatures, | |
| std::optional<std::string> cache_suffix) { | |
| LITERT_ASSIGN_OR_RETURN(auto compilation_options, Options::Create()); | |
| std::string cache_path = executor_settings.GetCacheDir(); | |
| switch (executor_settings.GetBackend()) { | |
| case Backend::GPU: { | |
| // TODO: b/403132820 - Add accelerator compilation options for ML_DRIFT. | |
| LITERT_ASSIGN_OR_RETURN(auto& gpu_compilation_options, | |
| compilation_options.GetGpuOptions()); | |
| gpu_compilation_options.EnableInfiniteFloatCapping(true); | |
| if (activation_data_type == ActivationDataType::FLOAT32) { | |
| gpu_compilation_options.SetPrecision(GpuOptions::Precision::kFp32); | |
| } else { | |
| gpu_compilation_options.SetPrecision(GpuOptions::Precision::kFp16); | |
| } | |
| gpu_compilation_options.SetPreferTextureWeights(false); | |
| gpu_compilation_options.SetUseMetalArgumentBuffers(true); | |
| gpu_compilation_options.SetPreferTextureWeights(true); | |
| bool has_valid_model_fd = | |
| executor_settings.GetModelAssets().GetScopedFile().ok() && | |
| executor_settings.GetModelAssets().GetScopedFile().value()->IsValid(); | |
| auto program_cache_file = | |
| executor_settings.GetProgramCacheFile(".mldrift_program_cache.bin"); | |
| bool has_valid_program_cache_fd = | |
| program_cache_file.ok() && | |
| !std::holds_alternative<std::string>(*program_cache_file); | |
| auto model_path_or_status = executor_settings.GetModelAssets().GetPath(); | |
| if (model_path_or_status.ok()) { | |
| // If the model path is available, use the model name as the cache key. | |
| absl::string_view model_path = *model_path_or_status; | |
| absl::string_view model_name = Basename(model_path); | |
| gpu_compilation_options.SetModelCacheKey(model_name.data()); | |
| } else if (has_valid_model_fd && has_valid_program_cache_fd) { | |
| // If the model is loaded from an fd, there is no way to automatically | |
| // generate a cache key. But if we are loading a model from an fd, it is | |
| // likely that our program cache is also loaded from an fd which does | |
| // not require a cache key to prevent collisions. The GPU delegate will | |
| // still expect a cache key, so we set it to a constant value. | |
| gpu_compilation_options.SetModelCacheKey("fd_token"); | |
| } | |
| AdvancedSettings advanced_settings; | |
| if (executor_settings.GetAdvancedSettings()) { | |
| advanced_settings = *executor_settings.GetAdvancedSettings(); | |
| } | |
| bool serialization_dir_set = false; | |
| if (cache_path != ":nocache") { | |
| if (cache_path.empty()) { | |
| ASSIGN_OR_RETURN(auto model_path, | |
| executor_settings.GetModelAssets().GetPath()); | |
| cache_path = std::filesystem::path(std::string(model_path)) | |
| .parent_path() | |
| .string(); | |
| if (cache_path.empty()) { | |
| cache_path = std::filesystem::current_path().string(); | |
| } | |
| } | |
| ABSL_LOG(INFO) << "Setting serialization dir: " << cache_path; | |
| gpu_compilation_options.SetSerializationDir(cache_path.c_str()); | |
| serialization_dir_set = true; | |
| gpu_compilation_options.SetSerializeExternalTensors(true); | |
| gpu_compilation_options.CacheCompiledProgramsOnly( | |
| advanced_settings.cache_compiled_shaders_only); | |
| } else { | |
| gpu_compilation_options.SetSerializeExternalTensors(false); | |
| } | |
| if (program_cache_file.ok()) { | |
| if (std::holds_alternative<std::string>(*program_cache_file)) { | |
| if (!serialization_dir_set) { | |
| cache_path = std::filesystem::path( | |
| std::get<std::string>(*program_cache_file)) | |
| .parent_path() | |
| .string(); | |
| ABSL_LOG(INFO) << "Setting program cache dir: " << cache_path; | |
| gpu_compilation_options.SetSerializationDir(cache_path.c_str()); | |
| } | |
| } else { | |
| auto scoped_cache_file = | |
| std::get<std::shared_ptr<lm::ScopedFile>>(*program_cache_file); | |
| ASSIGN_OR_RETURN(auto duplicated, scoped_cache_file->Duplicate()); | |
| ASSIGN_OR_RETURN(int fd, duplicated.Release()); | |
| gpu_compilation_options.SetProgramCacheFd(fd); | |
| } | |
| gpu_compilation_options.SetSerializeProgramCache(true); | |
| } else { | |
| gpu_compilation_options.SetSerializeProgramCache(false); | |
| } | |
| // Use NoExternalTensorsMode to get better performance. | |
| ASSIGN_OR_RETURN(const GpuConfig gpu_config, | |
| executor_settings.GetBackendConfig<GpuConfig>()); | |
| bool external_tensor_mode = gpu_config.external_tensor_mode; | |
| gpu_compilation_options.EnableExternalTensorsMode(external_tensor_mode); | |
| if (!external_tensor_mode) { | |
| // This option prevents KVCache handling from being affected by | |
| // BHWC conversion in NoExternalTensorsMode. | |
| gpu_compilation_options.AddExternalTensorPattern("kv_cache_"); | |
| if (signatures.has_value() && | |
| signatures.value()->input_int32_param.has_value()) { | |
| gpu_compilation_options.AddBufferStorageTensorPattern("kv_cache_"); | |
| gpu_compilation_options.AddExternalTensorPattern("param_tensor"); | |
| gpu_compilation_options.AddBufferStorageTensorPattern("param_tensor"); | |
| } | |
| ASSIGN_OR_RETURN(auto sampler_backend, | |
| GetSamplerBackend(executor_settings)); | |
| if (sampler_backend == Backend::GPU) { | |
| // GPU Sampler requires logits to be external tensors (PHWC4 format). | |
| gpu_compilation_options.AddExternalTensorPattern("logits"); | |
| } | |
| } | |
| // Prefill and decode are always fully delegated to single delegate. | |
| gpu_compilation_options.SetHintFullyDelegatedToSingleDelegate(true); | |
| gpu_compilation_options.SetMadviseOriginalSharedTensors( | |
| advanced_settings.gpu_madvise_original_shared_tensors); | |
| gpu_compilation_options.SetConvertWeightsOnGpu( | |
| advanced_settings.convert_weights_on_gpu); | |
| gpu_compilation_options.EnableConstantTensorSharing( | |
| advanced_settings.share_constant_tensors); | |
| gpu_compilation_options.EnableAllowSrcQuantizedFcConvOps( | |
| !advanced_settings.allow_src_quantized_fc_conv_ops.has_value() || | |
| advanced_settings.allow_src_quantized_fc_conv_ops.value()); | |
| gpu_compilation_options.HintWaitingForCompletion( | |
| advanced_settings.hint_waiting_for_completion.has_value() && | |
| advanced_settings.hint_waiting_for_completion.value()); | |
| if (advanced_settings.is_benchmark) { | |
| gpu_compilation_options.SetSyncExecutionModeWaitType( | |
| GpuOptions::SyncExecutionModeWaitType::kActive); | |
| gpu_compilation_options.WaitForWeightsConversionComplete( | |
| advanced_settings | |
| .wait_for_weights_conversion_complete_in_benchmark); | |
| } | |
| if (advanced_settings.gpu_context_low_priority.has_value() && | |
| advanced_settings.gpu_context_low_priority.value()) { | |
| gpu_compilation_options.SetPriority(GpuOptions::Priority::kLow); | |
| } | |
| if (!advanced_settings.preferred_device_substr.empty()) { | |
| gpu_compilation_options.SetPreferredDeviceSubstr( | |
| advanced_settings.preferred_device_substr.c_str()); | |
| } | |
| gpu_compilation_options.DisableShaderOptimization( | |
| !advanced_settings.optimize_shader_compilation); | |
| // TODO b/441627719 - Select backend by runtime options. | |
| gpu_compilation_options.SetBackend(GpuOptions::Backend::kWebGpu); | |
| // Prepare WebGPU or Vulkan command buffers ahead to reduce the overhead | |
| // of command buffer preparation. 2 steps ahead because KV cache is | |
| // swapped and the GPU resource bindings are the same as the previous | |
| // previous step. | |
| gpu_compilation_options.SetNumStepsOfCommandBufferPreparations(2); | |
| gpu_compilation_options.SetNumThreadsToUpload( | |
| advanced_settings.num_threads_to_upload >= 0 | |
| ? advanced_settings.num_threads_to_upload | |
| : kDefaultNumThreadsToUpload); | |
| gpu_compilation_options.SetNumThreadsToCompile( | |
| advanced_settings.num_threads_to_compile >= 0 | |
| ? advanced_settings.num_threads_to_compile | |
| : kDefaultNumThreadsToCompile); | |
| compilation_options.SetHardwareAccelerators(HwAccelerators::kGpu); | |
| break; | |
| } | |
| case Backend::CPU: { | |
| LITERT_ASSIGN_OR_RETURN(auto& cpu_compilation_options, | |
| compilation_options.GetCpuOptions()); | |
| ASSIGN_OR_RETURN(const CpuConfig cpu_config, | |
| executor_settings.GetBackendConfig<CpuConfig>()); | |
| const uint32_t num_threads = cpu_config.number_of_threads; | |
| cpu_compilation_options.SetNumThreads(num_threads); | |
| auto weight_cache_file = executor_settings.GetWeightCacheFile( | |
| cache_suffix.value_or("") + ".xnnpack_cache"); | |
| if (weight_cache_file.ok()) { | |
| if (std::holds_alternative<std::string>(*weight_cache_file)) { | |
| cache_path = std::get<std::string>(*weight_cache_file); | |
| cpu_compilation_options.SetXNNPackWeightCachePath(cache_path.c_str()); | |
| } else { | |
| auto scoped_cache_file = | |
| std::get<std::shared_ptr<ScopedFile>>(*weight_cache_file); | |
| ASSIGN_OR_RETURN(auto duplicated, scoped_cache_file->Duplicate()); | |
| ASSIGN_OR_RETURN(int fd, duplicated.Release()); | |
| cpu_compilation_options.SetXNNPackWeightCacheFileDescriptor(fd); | |
| } | |
| } else { | |
| ABSL_LOG(WARNING) << "Can't use cache: " << weight_cache_file.status(); | |
| } | |
| auto default_xnn_options = TfLiteXNNPackDelegateOptionsDefault(); | |
| cpu_compilation_options.SetXNNPackFlags( | |
| default_xnn_options.flags | | |
| TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS); | |
| LITERT_ASSIGN_OR_RETURN(auto& runtime_options, | |
| compilation_options.GetRuntimeOptions()); | |
| runtime_options.SetCompressQuantizationZeroPoints(true); | |
| AdvancedSettings advanced_settings; | |
| if (executor_settings.GetAdvancedSettings()) { | |
| advanced_settings = *executor_settings.GetAdvancedSettings(); | |
| } | |
| runtime_options.SetDisableDelegateClustering( | |
| advanced_settings.disable_delegate_clustering); | |
| compilation_options.SetHardwareAccelerators(HwAccelerators::kCpu); | |
| break; | |
| } | |
| default: | |
| return absl::InvalidArgumentError(absl::StrCat( | |
| "Unsupported backend: ", executor_settings.GetBackend())); | |
| } | |
| return compilation_options; | |
| } | |
| } // namespace litert::lm | |