// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// NOTE(review): the header names of the first seven includes below and the
// template argument lists throughout this file (e.g. `std::unique_ptr`,
// `std::make_unique(`, `absl::StatusOr>`) appear to be missing from this copy
// of the file. Confirm against the canonical source before building.

// TODO(b/417209286): Remove this once the model assets are stored in the
// litertlm file format.
#include  // NOLINT: Required for path manipulation.
#include  // NOLINT(build/c++11)
#include
#include
#include
#include
#include

#include "absl/base/no_destructor.h"  // from @com_google_absl
#include "absl/log/absl_check.h"  // from @com_google_absl
#include "absl/log/absl_log.h"  // from @com_google_absl
#include "absl/log/check.h"  // from @com_google_absl
#include "absl/status/status.h"  // from @com_google_absl
#include "absl/status/statusor.h"  // from @com_google_absl
#include "absl/strings/string_view.h"  // from @com_google_absl
#include "absl/time/clock.h"  // from @com_google_absl
#include "absl/time/time.h"  // from @com_google_absl
#include "litert/cc/litert_environment.h"  // from @litert
#include "litert/cc/litert_macros.h"  // from @litert
#include "runtime/components/model_resources.h"
#include "runtime/components/tokenizer.h"
#include "runtime/core/session_factory.h"
#include "runtime/engine/engine.h"
#include "runtime/engine/engine_factory.h"
#include "runtime/engine/engine_settings.h"
#include "runtime/engine/io_types.h"
#include "runtime/executor/audio_executor.h"
#include "runtime/executor/audio_executor_settings.h"
#include "runtime/executor/audio_litert_compiled_model_executor.h"
#include "runtime/executor/executor_settings_base.h"
#include "runtime/executor/litert_compiled_model_executor_utils.h"
#include "runtime/executor/llm_executor.h"
#include "runtime/executor/llm_executor_settings.h"
#include "runtime/executor/llm_litert_compiled_model_executor_factory.h"
#include "runtime/executor/magic_number_configs_helper.h"
#include "runtime/executor/vision_executor.h"
#include "runtime/executor/vision_litert_compiled_model_executor.h"
#include "runtime/framework/threadpool.h"
#include "runtime/proto/llm_metadata.pb.h"
#include "runtime/proto/sampler_params.pb.h"
#include "runtime/util/status_macros.h"  // NOLINT

namespace litert::lm {
namespace {

// Gets the singleton Environment, initializing it on the first call
// with the provided settings. This ensures we maintain the same LiteRT
// environment during the whole application lifetime. This is required for GPU
// LiteRT environment. See b/454383477 for more details.
//
// The environment (or the error produced while creating it) is stored in a
// function-local static that is initialized by an immediately-invoked lambda.
// Consequently only the arguments of the FIRST call are ever read; later calls
// return the stored result regardless of what they pass in, and a creation
// failure is returned again on every subsequent call.
//
// Args:
//   engine_settings: source of the main executor settings (backend, advanced
//     settings, dispatch library directory, model assets path).
//   model_resources: forwarded to the magic-number helper when building the
//     CPU/GPU environment options.
//
// Returns:
//   The stored environment on success, otherwise the stored error status.
absl::StatusOr GetEnvironment(EngineSettings& engine_settings,
                              ModelResources& model_resources) {
  // Helper must be available until LlmLiteRtCompiledModelExecutor::Create() is
  // called. Since env is used multiple times, it should also be static.
  static absl::NoDestructor helper;
  // The lambda captures by reference but is invoked right here as part of the
  // static's initializer, so the captures are only used during this call.
  static absl::NoDestructor> kEnvironment(
      [&]() -> absl::StatusOr {
        std::vector env_options;
        const auto& main_executor_settings =
            engine_settings.GetMainExecutorSettings();
        if ((main_executor_settings.GetBackend() == Backend::CPU) ||
            (main_executor_settings.GetBackend() == Backend::GPU)) {
          // Magic-number options are requested when no advanced settings are
          // present, or when they are present with `configure_magic_numbers`
          // set.
          if (!main_executor_settings
                   .GetAdvancedSettings() ||  // Default is true.
              main_executor_settings.GetAdvancedSettings()
                  ->configure_magic_numbers) {
            env_options = helper->GetLiteRtEnvOptions(model_resources,
                                                      main_executor_settings);
          }
        } else {
          // Any backend other than CPU/GPU: rejected outright when
          // LITERT_DISABLE_NPU is defined, otherwise a dispatch library
          // directory is configured.
#if defined(LITERT_DISABLE_NPU)
          return absl::InvalidArgumentError(
              "Only CPU and GPU backends are supported.");
#else
          if (!main_executor_settings.GetLitertDispatchLibDir().empty()) {
            // If the dispatch library directory is provided, use it.
            env_options.push_back(::litert::Environment::Option{
                ::litert::Environment::OptionTag::DispatchLibraryDir,
                main_executor_settings.GetLitertDispatchLibDir()});
            ABSL_LOG(INFO) << "Setting dispatch library path from "
                              "main_executor_settings: "
                           << main_executor_settings.GetLitertDispatchLibDir();
          } else {
            // Otherwise, use the directory of the model file.
            std::string model_path(
                main_executor_settings.GetModelAssets().GetPath().value_or(""));
            std::filesystem::path path(model_path);
            // Note: Existence check for path was here, but it's better to check
            // before calling this function if needed.
            // The option below only receives a string_view of this string.
            // NOTE(review): presumably it is static so the viewed storage
            // outlives the option/environment -- confirm.
            static const absl::NoDestructor kDispatchLibraryPath(
                path.parent_path().string());
            if (!kDispatchLibraryPath->empty()) {
              ABSL_LOG(INFO) << "Setting dispatch library path: "
                             << *kDispatchLibraryPath;
              env_options.push_back(::litert::Environment::Option{
                  ::litert::Environment::OptionTag::DispatchLibraryDir,
                  absl::string_view(*kDispatchLibraryPath)});
            } else {
              ABSL_LOG(INFO) << "No dispatch library path provided.";
            }
          }
#endif  // defined(LITERT_DISABLE_NPU)
        }
        LITERT_ASSIGN_OR_RETURN(auto env, Environment::Create(env_options));
        return std::move(env);
      }());
  if (!kEnvironment->ok()) {
    return kEnvironment->status();
  }
  // First `*` unwraps NoDestructor, second `*` unwraps the StatusOr.
  return **kEnvironment;
}

// Engine implementation that owns the model resources, the tokenizer, the
// LLM / vision / audio executors and a worker thread pool, and hands raw
// pointers to those shared components to every session it creates.
class EngineImpl : public Engine {
 public:
  // Waits (up to Engine::kDefaultTimeout) for the worker thread pool to finish.
  // The result goes through ABSL_QCHECK_OK, i.e. a non-OK status here is a
  // fatal check failure rather than a recoverable error.
  ~EngineImpl() override {
    ABSL_QCHECK_OK(WaitUntilDone(Engine::kDefaultTimeout));
  }

  // Builds a fully initialized engine from `engine_settings`.
  // `input_prompt_as_hint` is forwarded to
  // EngineSettings::MaybeUpdateAndValidate(). Defined out of line below.
  static absl::StatusOr> Create(
      EngineSettings engine_settings, absl::string_view input_prompt_as_hint);

  // Takes ownership of every component by moving it into the matching member.
  // `stop_token_ids_` and `sampler_params_` are value-initialized and are not
  // populated from any constructor argument.
  EngineImpl(EngineSettings engine_settings,
             std::unique_ptr litert_model_resources,
             std::unique_ptr tokenizer,
             std::unique_ptr executor,
             std::unique_ptr vision_executor,
             std::unique_ptr audio_executor,
             std::optional benchmark_info,
             std::unique_ptr worker_thread_pool)
      : engine_settings_(std::move(engine_settings)),
        litert_model_resources_(std::move(litert_model_resources)),
        tokenizer_(std::move(tokenizer)),
        executor_(std::move(executor)),
        vision_executor_(std::move(vision_executor)),
        audio_executor_(std::move(audio_executor)),
        stop_token_ids_(),
        sampler_params_(),
        benchmark_info_(std::move(benchmark_info)),
        worker_thread_pool_(std::move(worker_thread_pool)) {}

  // Creates a Session that shares this engine's executors, tokenizer and
  // worker thread pool.
  //
  // `session_config` is copied and the copy is updated/validated against the
  // engine settings before use; the caller's object is not modified.
  // Returns the new session, or the first error raised by benchmark timing,
  // config validation or InitializeSessionBasic().
  absl::StatusOr> CreateSession(
      const SessionConfig& session_config) override {
    std::optional session_benchmark_info;
    if (benchmark_info_.has_value()) {
      // Each session will have its own benchmark info, which will be populated
      // with the session-specific information.
      session_benchmark_info = benchmark_info_;
      RETURN_IF_ERROR(session_benchmark_info->TimeInitPhaseStart(
          BenchmarkInfo::InitPhase::kSession));
    }
    SessionConfig config = session_config;
    // TODO(b/418794726): Move this logic to be part of the SessionConfig
    // class.
    RETURN_IF_ERROR(config.MaybeUpdateAndValidate(engine_settings_));

    ABSL_CHECK(litert_model_resources_ != nullptr);
    // The vision/audio executor pointers are null when that modality was not
    // configured at engine creation.
    ASSIGN_OR_RETURN(
        auto session,
        InitializeSessionBasic(executor_.get(), tokenizer_.get(),
                               /*vision_executor=*/vision_executor_.get(),
                               /*audio_executor=*/audio_executor_.get(), config,
                               std::move(session_benchmark_info),
                               worker_thread_pool_.get()));

    if (benchmark_info_.has_value()) {
      // The benchmark info was moved into the session above, so the kSession
      // phase is closed on the session's copy. If the session cannot provide
      // it, the phase end is silently skipped.
      auto session_benchmark_info_or = session->GetMutableBenchmarkInfo();
      if (session_benchmark_info_or.ok()) {
        RETURN_IF_ERROR(session_benchmark_info_or.value()->TimeInitPhaseEnd(
            BenchmarkInfo::InitPhase::kSession));
      }
    }
    return session;
  }

  // Forwards to the worker thread pool's WaitUntilDone() with `timeout`.
  absl::Status WaitUntilDone(absl::Duration timeout) override {
    return worker_thread_pool_->WaitUntilDone(timeout);
  }

  // Returns the engine settings stored at construction.
  const EngineSettings& GetEngineSettings() const override {
    return engine_settings_;
  }

  // Returns the tokenizer shared by all sessions.
  const Tokenizer& GetTokenizer() const override { return *tokenizer_; }

  // Returns the audio executor's properties, or FailedPrecondition when the
  // engine was created without an audio executor.
  absl::StatusOr GetAudioExecutorProperties() const override {
    if (audio_executor_ == nullptr) {
      return absl::FailedPreconditionError("Audio modality is not enabled.");
    }
    return audio_executor_->GetAudioExecutorProperties();
  }

  // Returns the vision executor's properties, or FailedPrecondition when the
  // engine was created without a vision executor.
  absl::StatusOr GetVisionExecutorProperties() const override {
    if (vision_executor_ == nullptr) {
      return absl::FailedPreconditionError("Vision modality is not enabled.");
    }
    return vision_executor_->GetVisionExecutorProperties();
  }

 private:
  // Stored engine settings.
  EngineSettings engine_settings_;
  // Model resources, which must outlive `executor_`.
  // (Declared before the executors, so it is destroyed after them.)
  std::unique_ptr litert_model_resources_;
  // Tokenizer shared by all sessions.
  std::unique_ptr tokenizer_;
  // Shared executor for all sessions.
  std::unique_ptr executor_;
  // Shared vision executor for all sessions. Null when vision is not enabled.
  std::unique_ptr vision_executor_;
  // Shared audio executor for all sessions. Null when audio is not enabled.
  std::unique_ptr audio_executor_;
  // Default stop token ids for all sessions loaded from the model file.
  // NOTE(review): only value-initialized in the visible code; never filled.
  std::vector> stop_token_ids_;
  // NOTE(review): only value-initialized in the visible code; never filled.
  proto::SamplerParameters sampler_params_;
  // Benchmark info for the engine. Empty when benchmarking is disabled.
  std::optional benchmark_info_;

  // Thread pool for the engine to execute work. Declared last, so it is the
  // first member to be destroyed.
  std::unique_ptr worker_thread_pool_;
};

// Method to create Engine.
//
// Initialization order (each step returns early on error):
//   1. Build the model resources from the main executor's model assets.
//   2. Read the LLM metadata.
//   3. Load the tokenizer -- synchronously for legacy models (metadata has no
//      llm_model_type), otherwise through a std::future so it can overlap
//      with executor creation.
//   4. Update/validate the engine settings (without a tokenizer in the
//      asynchronous case).
//   5. Obtain the process-wide environment and create the LLM executor plus
//      the optional vision and audio executors.
//   6. In the asynchronous case: collect the tokenizer, update/validate the
//      settings again with it and push the settings to the executor.
//   7. Create the single-threaded worker pool and assemble the EngineImpl.
// When benchmarking is enabled, the kTotal, kModelAssets, kLlmMetadata,
// kExecutor and kTokenizer init phases are recorded along the way.
//
// Args:
//   engine_settings: settings to build the engine from; taken by value,
//     updated in place and finally moved into the engine.
//   input_prompt_as_hint: forwarded to MaybeUpdateAndValidate().
absl::StatusOr> EngineImpl::Create(
    EngineSettings engine_settings, absl::string_view input_prompt_as_hint) {
  std::optional benchmark_info =
      engine_settings.IsBenchmarkEnabled()
          ? std::make_optional(
                engine_settings.GetBenchmarkParams().value())
          : std::nullopt;
  if (benchmark_info.has_value()) {
    RETURN_IF_ERROR(
        benchmark_info->TimeInitPhaseStart(BenchmarkInfo::InitPhase::kTotal));
    RETURN_IF_ERROR(benchmark_info->TimeInitPhaseStart(
        BenchmarkInfo::InitPhase::kModelAssets));
  }
  const auto& model_assets =
      engine_settings.GetMutableMainExecutorSettings().GetModelAssets();
  ASSIGN_OR_RETURN(auto model_resources,
                   BuildLiteRtCompiledModelResources(model_assets));
  if (benchmark_info.has_value()) {
    RETURN_IF_ERROR(benchmark_info->TimeInitPhaseEnd(
        BenchmarkInfo::InitPhase::kModelAssets));
  }

  if (benchmark_info.has_value()) {
    RETURN_IF_ERROR(benchmark_info->TimeInitPhaseStart(
        BenchmarkInfo::InitPhase::kLlmMetadata));
  }
  ASSIGN_OR_RETURN(auto* llm_metadata, model_resources->GetLlmMetadata());
  if (benchmark_info.has_value()) {
    RETURN_IF_ERROR(benchmark_info->TimeInitPhaseEnd(
        BenchmarkInfo::InitPhase::kLlmMetadata));
  }

  bool hasLlmModelType = llm_metadata->has_llm_model_type();
  // Written by `create_tokenizer` (possibly on another thread) and read only
  // after the tokenizer has been obtained.
  absl::Duration tokenizer_duration = absl::ZeroDuration();
  // This lambda is used to create the tokenizer asynchronously if the model
  // type is available, such that the tokenizer can be created in parallel with
  // the executor.
  // It captures `tokenizer_duration` and `model_resources` by reference; both
  // are declared before `tokenizer_future` below and therefore outlive it.
  auto create_tokenizer =
      [&tokenizer_duration,
       &model_resources]() -> absl::StatusOr> {
    absl::Time start_time = absl::Now();
    ASSIGN_OR_RETURN(std::unique_ptr tokenizer,
                     model_resources->GetTokenizer());
    tokenizer_duration = absl::Now() - start_time;
    return tokenizer;
  };

  const auto& main_executor_settings =
      engine_settings.GetMainExecutorSettings();
  std::future>> tokenizer_future;
  std::unique_ptr tokenizer;
  if (!hasLlmModelType) {
    ABSL_LOG(INFO)
        << "Legacy model files don't have LlmModelType, loading tokenizer now";
    ASSIGN_OR_RETURN(tokenizer, create_tokenizer());
    // Update and load the parameters from the model file and convert the
    // tokens to ids.
    RETURN_IF_ERROR(engine_settings.MaybeUpdateAndValidate(
        tokenizer.get(), llm_metadata, input_prompt_as_hint,
        model_resources->GetTFLiteModelBackendConstraint(
            ModelType::kTfLitePrefillDecode),
        model_resources->GetTFLiteModelBackendConstraint(
            ModelType::kTfLiteVisionEncoder),
        model_resources->GetTFLiteModelBackendConstraint(
            ModelType::kTfLiteAudioEncoderHw)));
  } else {
    // If the model type is available, wait for the tokenizer to be created
    // after the model is loaded.
    ABSL_LOG(INFO) << "New model files have LlmModelType, loading tokenizer "
                      "asynchronously";
    // std::launch::async starts the load on another thread now;
    // std::launch::deferred postpones it until `tokenizer_future.get()`.
    if (engine_settings.GetParallelFileSectionLoading()) {
      tokenizer_future = std::async(std::launch::async, create_tokenizer);
    } else {
      tokenizer_future = std::async(std::launch::deferred, create_tokenizer);
    }
    // First validation pass without a tokenizer (nullptr); it is repeated with
    // the real tokenizer once the future has been collected.
    RETURN_IF_ERROR(engine_settings.MaybeUpdateAndValidate(
        nullptr, llm_metadata, input_prompt_as_hint,
        model_resources->GetTFLiteModelBackendConstraint(
            ModelType::kTfLitePrefillDecode),
        model_resources->GetTFLiteModelBackendConstraint(
            ModelType::kTfLiteVisionEncoder),
        model_resources->GetTFLiteModelBackendConstraint(
            ModelType::kTfLiteAudioEncoderHw)));
  }

  if (benchmark_info.has_value()) {
    RETURN_IF_ERROR(benchmark_info->TimeInitPhaseStart(
        BenchmarkInfo::InitPhase::kExecutor));
  }
  std::unique_ptr executor;
  ASSIGN_OR_RETURN(auto& env,
                   GetEnvironment(engine_settings, *model_resources));
  // Only a `default` label exists, so every backend takes this path.
  switch (main_executor_settings.GetBackend()) {
    default: {
      ASSIGN_OR_RETURN(executor, CreateLlmLiteRtCompiledModelExecutor(
                                     main_executor_settings, env,
                                     *model_resources));
    }
  };

  // TODO - b/436674053: Modularize the executor creation logic into a
  // separate executor class, and have unit test for it.
  // The vision and audio executors are created only when their settings are
  // present; otherwise the pointers stay null.
  std::unique_ptr vision_executor;
  if (engine_settings.GetVisionExecutorSettings().has_value()) {
    ASSIGN_OR_RETURN(
        vision_executor,
        VisionLiteRtCompiledModelExecutor::Create(
            engine_settings.GetMutableVisionExecutorSettings().value(), env));
  }

  std::unique_ptr audio_executor;
  if (engine_settings.GetAudioExecutorSettings().has_value()) {
    ASSIGN_OR_RETURN(
        audio_executor,
        AudioLiteRtCompiledModelExecutor::Create(
            engine_settings.GetAudioExecutorSettings().value(), env));
  }

  if (benchmark_info.has_value()) {
    RETURN_IF_ERROR(
        benchmark_info->TimeInitPhaseEnd(BenchmarkInfo::InitPhase::kExecutor));
  }

  if (hasLlmModelType) {
    // Now load the tokenizer and update the engine settings.
    ASSIGN_OR_RETURN(tokenizer, tokenizer_future.get());
    RETURN_IF_ERROR(engine_settings.MaybeUpdateAndValidate(
        tokenizer.get(), llm_metadata, input_prompt_as_hint,
        model_resources->GetTFLiteModelBackendConstraint(
            ModelType::kTfLitePrefillDecode),
        model_resources->GetTFLiteModelBackendConstraint(
            ModelType::kTfLiteVisionEncoder),
        model_resources->GetTFLiteModelBackendConstraint(
            ModelType::kTfLiteAudioEncoderHw)));
    // As we load the tokenizer asynchronously, we need to update the executor
    // settings after the tokenizer is loaded.
    RETURN_IF_ERROR(executor->UpdateExecutorSettings(
        engine_settings.GetMainExecutorSettings()));
  }

  if (benchmark_info.has_value()) {
    // The tokenizer phase is recorded from the duration measured inside
    // `create_tokenizer` rather than with a start/end pair, since the load may
    // have overlapped with executor creation.
    RETURN_IF_ERROR(benchmark_info->InitPhaseRecord(
        BenchmarkInfo::InitPhase::kTokenizer, tokenizer_duration));
  }

  // Creating the thread pool of a single thread to execute work.
  auto worker_thread_pool =
      std::make_unique(/*name_prefix=*/"engine",
                       /*max_num_threads=*/1);

  if (benchmark_info.has_value()) {
    RETURN_IF_ERROR(
        benchmark_info->TimeInitPhaseEnd(BenchmarkInfo::InitPhase::kTotal));
  }
  auto llm_impl = std::make_unique(
      std::move(engine_settings), std::move(model_resources),
      std::move(tokenizer), std::move(executor), std::move(vision_executor),
      std::move(audio_executor), std::move(benchmark_info),
      std::move(worker_thread_pool));

  return llm_impl;
};

// Registers EngineImpl::Create as the factory callback for the
// kLiteRTCompiledModel engine type.
LITERT_LM_REGISTER_ENGINE(EngineFactory::EngineType::kLiteRTCompiledModel,
                          [](EngineSettings settings,
                             absl::string_view input_prompt_as_hint) {
                            return EngineImpl::Create(std::move(settings),
                                                      input_prompt_as_hint);
                          });

}  // namespace
}  // namespace litert::lm