Spaces:
Running
Running
| // Copyright 2025 The ODML Authors. | |
| // | |
| // Licensed under the Apache License, Version 2.0 (the "License"); | |
| // you may not use this file except in compliance with the License. | |
| // You may obtain a copy of the License at | |
| // | |
| // http://www.apache.org/licenses/LICENSE-2.0 | |
| // | |
| // Unless required by applicable law or agreed to in writing, software | |
| // distributed under the License is distributed on an "AS IS" BASIS, | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| // See the License for the specific language governing permissions and | |
| // limitations under the License. | |
namespace litert::lm {
namespace {
// Default number of threads for WebGPU weight upload and kernel compilation.
// Used by CreateCompilationOptions when AdvancedSettings does not override
// the thread counts (i.e. num_threads_to_upload/compile is negative).
constexpr int kDefaultNumThreadsToUpload = 2;
constexpr int kDefaultNumThreadsToCompile = 1;
}  // namespace
| absl::StatusOr<Backend> GetSamplerBackend( | |
| const LlmExecutorSettings& executor_settings) { | |
| Backend backend = executor_settings.GetBackend(); | |
| Backend sampler_backend = executor_settings.GetSamplerBackend(); | |
| if (sampler_backend == Backend::UNSPECIFIED) { | |
| sampler_backend = backend; | |
| } | |
| if (sampler_backend != Backend::CPU && sampler_backend != Backend::GPU) { | |
| return absl::InvalidArgumentError( | |
| absl::StrCat("Unsupported sampler backend: ", sampler_backend, | |
| " for backend: ", backend)); | |
| } | |
| return sampler_backend; | |
| } | |
| absl::StatusOr<litert::Options> CreateCompilationOptions( | |
| const LlmExecutorSettings& executor_settings, | |
| const ActivationDataType& activation_data_type, | |
| std::optional<ModelSignatures*> signatures, | |
| std::optional<std::string> cache_suffix) { | |
| LITERT_ASSIGN_OR_RETURN(auto compilation_options, Options::Create()); | |
| std::string cache_path = executor_settings.GetCacheDir(); | |
| switch (executor_settings.GetBackend()) { | |
| case Backend::GPU: { | |
| // TODO: b/403132820 - Add accelerator compilation options for ML_DRIFT. | |
| LITERT_ASSIGN_OR_RETURN(auto& gpu_compilation_options, | |
| compilation_options.GetGpuOptions()); | |
| gpu_compilation_options.EnableInfiniteFloatCapping(true); | |
| if (activation_data_type == ActivationDataType::FLOAT32) { | |
| gpu_compilation_options.SetPrecision(GpuOptions::Precision::kFp32); | |
| } else { | |
| gpu_compilation_options.SetPrecision(GpuOptions::Precision::kFp16); | |
| } | |
| gpu_compilation_options.SetPreferTextureWeights(false); | |
| gpu_compilation_options.SetUseMetalArgumentBuffers(true); | |
| gpu_compilation_options.SetPreferTextureWeights(true); | |
| bool has_valid_model_fd = | |
| executor_settings.GetModelAssets().GetScopedFile().ok() && | |
| executor_settings.GetModelAssets().GetScopedFile().value()->IsValid(); | |
| auto program_cache_file = | |
| executor_settings.GetProgramCacheFile(".mldrift_program_cache.bin"); | |
| bool has_valid_program_cache_fd = | |
| program_cache_file.ok() && | |
| !std::holds_alternative<std::string>(*program_cache_file); | |
| auto model_path_or_status = executor_settings.GetModelAssets().GetPath(); | |
| if (model_path_or_status.ok()) { | |
| // If the model path is available, use the model name as the cache key. | |
| absl::string_view model_path = *model_path_or_status; | |
| absl::string_view model_name = Basename(model_path); | |
| gpu_compilation_options.SetModelCacheKey(model_name.data()); | |
| } else if (has_valid_model_fd && has_valid_program_cache_fd) { | |
| // If the model is loaded from an fd, there is no way to automatically | |
| // generate a cache key. But if we are loading a model from an fd, it is | |
| // likely that our program cache is also loaded from an fd which does | |
| // not require a cache key to prevent collisions. The GPU delegate will | |
| // still expect a cache key, so we set it to a constant value. | |
| gpu_compilation_options.SetModelCacheKey("fd_token"); | |
| } | |
| AdvancedSettings advanced_settings; | |
| if (executor_settings.GetAdvancedSettings()) { | |
| advanced_settings = *executor_settings.GetAdvancedSettings(); | |
| } | |
| bool serialization_dir_set = false; | |
| if (cache_path != ":nocache") { | |
| if (cache_path.empty()) { | |
| ASSIGN_OR_RETURN(auto model_path, | |
| executor_settings.GetModelAssets().GetPath()); | |
| cache_path = std::filesystem::path(std::string(model_path)) | |
| .parent_path() | |
| .string(); | |
| if (cache_path.empty()) { | |
| cache_path = std::filesystem::current_path().string(); | |
| } | |
| } | |
| ABSL_LOG(INFO) << "Setting serialization dir: " << cache_path; | |
| gpu_compilation_options.SetSerializationDir(cache_path.c_str()); | |
| serialization_dir_set = true; | |
| gpu_compilation_options.SetSerializeExternalTensors(true); | |
| gpu_compilation_options.CacheCompiledProgramsOnly( | |
| advanced_settings.cache_compiled_shaders_only); | |
| } else { | |
| gpu_compilation_options.SetSerializeExternalTensors(false); | |
| } | |
| if (program_cache_file.ok()) { | |
| if (std::holds_alternative<std::string>(*program_cache_file)) { | |
| if (!serialization_dir_set) { | |
| cache_path = std::filesystem::path( | |
| std::get<std::string>(*program_cache_file)) | |
| .parent_path() | |
| .string(); | |
| ABSL_LOG(INFO) << "Setting program cache dir: " << cache_path; | |
| gpu_compilation_options.SetSerializationDir(cache_path.c_str()); | |
| } | |
| } else { | |
| auto scoped_cache_file = | |
| std::get<std::shared_ptr<lm::ScopedFile>>(*program_cache_file); | |
| ASSIGN_OR_RETURN(auto duplicated, scoped_cache_file->Duplicate()); | |
| ASSIGN_OR_RETURN(int fd, duplicated.Release()); | |
| gpu_compilation_options.SetProgramCacheFd(fd); | |
| } | |
| gpu_compilation_options.SetSerializeProgramCache(true); | |
| } else { | |
| gpu_compilation_options.SetSerializeProgramCache(false); | |
| } | |
| // Use NoExternalTensorsMode to get better performance. | |
| ASSIGN_OR_RETURN(const GpuConfig gpu_config, | |
| executor_settings.GetBackendConfig<GpuConfig>()); | |
| bool external_tensor_mode = gpu_config.external_tensor_mode; | |
| gpu_compilation_options.EnableExternalTensorsMode(external_tensor_mode); | |
| if (!external_tensor_mode) { | |
| // This option prevents KVCache handling from being affected by | |
| // BHWC conversion in NoExternalTensorsMode. | |
| gpu_compilation_options.AddExternalTensorPattern("kv_cache_"); | |
| if (signatures.has_value() && | |
| signatures.value()->input_int32_param.has_value()) { | |
| gpu_compilation_options.AddBufferStorageTensorPattern("kv_cache_"); | |
| gpu_compilation_options.AddExternalTensorPattern("param_tensor"); | |
| gpu_compilation_options.AddBufferStorageTensorPattern("param_tensor"); | |
| } | |
| ASSIGN_OR_RETURN(auto sampler_backend, | |
| GetSamplerBackend(executor_settings)); | |
| if (sampler_backend == Backend::GPU) { | |
| // GPU Sampler requires logits to be external tensors (PHWC4 format). | |
| gpu_compilation_options.AddExternalTensorPattern("logits"); | |
| } | |
| } | |
| // Prefill and decode are always fully delegated to single delegate. | |
| gpu_compilation_options.SetHintFullyDelegatedToSingleDelegate(true); | |
| gpu_compilation_options.SetMadviseOriginalSharedTensors( | |
| advanced_settings.gpu_madvise_original_shared_tensors); | |
| gpu_compilation_options.SetConvertWeightsOnGpu( | |
| advanced_settings.convert_weights_on_gpu); | |
| gpu_compilation_options.EnableConstantTensorSharing( | |
| advanced_settings.share_constant_tensors); | |
| gpu_compilation_options.EnableAllowSrcQuantizedFcConvOps( | |
| !advanced_settings.allow_src_quantized_fc_conv_ops.has_value() || | |
| advanced_settings.allow_src_quantized_fc_conv_ops.value()); | |
| gpu_compilation_options.HintWaitingForCompletion( | |
| advanced_settings.hint_waiting_for_completion.has_value() && | |
| advanced_settings.hint_waiting_for_completion.value()); | |
| if (advanced_settings.is_benchmark) { | |
| gpu_compilation_options.SetSyncExecutionModeWaitType( | |
| GpuOptions::SyncExecutionModeWaitType::kActive); | |
| gpu_compilation_options.WaitForWeightsConversionComplete( | |
| advanced_settings | |
| .wait_for_weights_conversion_complete_in_benchmark); | |
| } | |
| if (advanced_settings.gpu_context_low_priority.has_value() && | |
| advanced_settings.gpu_context_low_priority.value()) { | |
| gpu_compilation_options.SetPriority(GpuOptions::Priority::kLow); | |
| } | |
| if (!advanced_settings.preferred_device_substr.empty()) { | |
| gpu_compilation_options.SetPreferredDeviceSubstr( | |
| advanced_settings.preferred_device_substr.c_str()); | |
| } | |
| gpu_compilation_options.DisableShaderOptimization( | |
| !advanced_settings.optimize_shader_compilation); | |
| // TODO b/441627719 - Select backend by runtime options. | |
| gpu_compilation_options.SetBackend(GpuOptions::Backend::kWebGpu); | |
| // Prepare WebGPU or Vulkan command buffers ahead to reduce the overhead | |
| // of command buffer preparation. 2 steps ahead because KV cache is | |
| // swapped and the GPU resource bindings are the same as the previous | |
| // previous step. | |
| gpu_compilation_options.SetNumStepsOfCommandBufferPreparations(2); | |
| gpu_compilation_options.SetNumThreadsToUpload( | |
| advanced_settings.num_threads_to_upload >= 0 | |
| ? advanced_settings.num_threads_to_upload | |
| : kDefaultNumThreadsToUpload); | |
| gpu_compilation_options.SetNumThreadsToCompile( | |
| advanced_settings.num_threads_to_compile >= 0 | |
| ? advanced_settings.num_threads_to_compile | |
| : kDefaultNumThreadsToCompile); | |
| compilation_options.SetHardwareAccelerators(HwAccelerators::kGpu); | |
| break; | |
| } | |
| case Backend::CPU: { | |
| LITERT_ASSIGN_OR_RETURN(auto& cpu_compilation_options, | |
| compilation_options.GetCpuOptions()); | |
| ASSIGN_OR_RETURN(const CpuConfig cpu_config, | |
| executor_settings.GetBackendConfig<CpuConfig>()); | |
| const uint32_t num_threads = cpu_config.number_of_threads; | |
| cpu_compilation_options.SetNumThreads(num_threads); | |
| auto weight_cache_file = executor_settings.GetWeightCacheFile( | |
| cache_suffix.value_or("") + ".xnnpack_cache"); | |
| if (weight_cache_file.ok()) { | |
| if (std::holds_alternative<std::string>(*weight_cache_file)) { | |
| cache_path = std::get<std::string>(*weight_cache_file); | |
| cpu_compilation_options.SetXNNPackWeightCachePath(cache_path.c_str()); | |
| } else { | |
| auto scoped_cache_file = | |
| std::get<std::shared_ptr<ScopedFile>>(*weight_cache_file); | |
| ASSIGN_OR_RETURN(auto duplicated, scoped_cache_file->Duplicate()); | |
| ASSIGN_OR_RETURN(int fd, duplicated.Release()); | |
| cpu_compilation_options.SetXNNPackWeightCacheFileDescriptor(fd); | |
| } | |
| } else { | |
| ABSL_LOG(WARNING) << "Can't use cache: " << weight_cache_file.status(); | |
| } | |
| auto default_xnn_options = TfLiteXNNPackDelegateOptionsDefault(); | |
| cpu_compilation_options.SetXNNPackFlags( | |
| default_xnn_options.flags | | |
| TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS); | |
| LITERT_ASSIGN_OR_RETURN(auto& runtime_options, | |
| compilation_options.GetRuntimeOptions()); | |
| runtime_options.SetCompressQuantizationZeroPoints(true); | |
| AdvancedSettings advanced_settings; | |
| if (executor_settings.GetAdvancedSettings()) { | |
| advanced_settings = *executor_settings.GetAdvancedSettings(); | |
| } | |
| runtime_options.SetDisableDelegateClustering( | |
| advanced_settings.disable_delegate_clustering); | |
| compilation_options.SetHardwareAccelerators(HwAccelerators::kCpu); | |
| break; | |
| } | |
| default: | |
| return absl::InvalidArgumentError(absl::StrCat( | |
| "Unsupported backend: ", executor_settings.GetBackend())); | |
| } | |
| return compilation_options; | |
| } | |
| } // namespace litert::lm | |