Spaces:

FINAL-Bench
/

LiteRT-LM

Running

File size: 13,481 Bytes

5f923cd

// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "runtime/executor/llm_executor_settings_utils.h"

#include <cstdint>
#include <filesystem>  // NOLINT
#include <memory>
#include <optional>
#include <string>
#include <variant>

#include "absl/log/absl_log.h"  // from @com_google_absl
#include "absl/status/status.h"  // from @com_google_absl
#include "absl/status/statusor.h"  // from @com_google_absl
#include "absl/strings/str_cat.h"  // from @com_google_absl
#include "absl/strings/string_view.h"  // from @com_google_absl
#include "litert/cc/litert_common.h"  // from @litert
#include "litert/cc/litert_macros.h"  // from @litert
#include "litert/cc/litert_options.h"  // from @litert
#include "litert/cc/options/litert_gpu_options.h"  // from @litert
#include "runtime/executor/executor_settings_base.h"
#include "runtime/executor/litert_compiled_model_executor_utils.h"
#include "runtime/executor/llm_executor_settings.h"
#include "runtime/util/file_util.h"
#include "runtime/util/scoped_file.h"
#include "runtime/util/status_macros.h"
#include "tflite/delegates/xnnpack/xnnpack_delegate.h"  // from @litert

namespace litert::lm {

namespace {

// Fallback thread counts for WebGPU weight upload and kernel compilation.
// CreateCompilationOptions() uses them whenever the corresponding
// AdvancedSettings field (num_threads_to_upload / num_threads_to_compile) is
// negative.

// Threads used to upload weights.
constexpr int kDefaultNumThreadsToUpload{2};

// Threads used to compile kernels.
constexpr int kDefaultNumThreadsToCompile{1};

}  // namespace

// Resolves the backend the sampler should run on.
//
// When the settings leave the sampler backend as Backend::UNSPECIFIED, the
// executor's own backend is used instead. Only CPU and GPU are accepted as a
// resolved sampler backend; anything else yields an InvalidArgumentError that
// names both the sampler backend and the executor backend.
absl::StatusOr<Backend> GetSamplerBackend(
    const LlmExecutorSettings& executor_settings) {
  const Backend executor_backend = executor_settings.GetBackend();
  const Backend requested_backend = executor_settings.GetSamplerBackend();

  // An unspecified sampler backend inherits the executor backend.
  const Backend resolved_backend = requested_backend == Backend::UNSPECIFIED
                                       ? executor_backend
                                       : requested_backend;

  const bool is_supported =
      resolved_backend == Backend::CPU || resolved_backend == Backend::GPU;
  if (is_supported) {
    return resolved_backend;
  }

  return absl::InvalidArgumentError(
      absl::StrCat("Unsupported sampler backend: ", resolved_backend,
                   " for backend: ", executor_backend));
}

// Builds the LiteRT compilation options for the backend selected in
// `executor_settings`.
//
// Args:
//   executor_settings: source of the backend, cache directory, model assets,
//     backend config and (optional) advanced settings.
//   activation_data_type: only read on the GPU path. FLOAT32 selects fp32
//     precision; every other value selects fp16.
//   signatures: only read on the GPU path when external tensor mode is off.
//     An empty optional and an optional holding a null pointer are treated
//     the same way (no int32-param specific patterns are added).
//   cache_suffix: only read on the CPU path. Its value (or "" when empty) is
//     prepended to ".xnnpack_cache" to form the argument passed to
//     GetWeightCacheFile().
//
// Returns:
//   The populated options, an InvalidArgumentError for any backend other than
//   GPU or CPU, or whatever error one of the fallible calls below reports.
absl::StatusOr<litert::Options> CreateCompilationOptions(
    const LlmExecutorSettings& executor_settings,
    const ActivationDataType& activation_data_type,
    std::optional<ModelSignatures*> signatures,
    std::optional<std::string> cache_suffix) {
  LITERT_ASSIGN_OR_RETURN(auto compilation_options, Options::Create());
  std::string cache_path = executor_settings.GetCacheDir();

  switch (executor_settings.GetBackend()) {
    case Backend::GPU: {
      // TODO: b/403132820 - Add accelerator compilation options for ML_DRIFT.
      LITERT_ASSIGN_OR_RETURN(auto& gpu_compilation_options,
                              compilation_options.GetGpuOptions());
      gpu_compilation_options.EnableInfiniteFloatCapping(true);
      if (activation_data_type == ActivationDataType::FLOAT32) {
        gpu_compilation_options.SetPrecision(GpuOptions::Precision::kFp32);
      } else {
        gpu_compilation_options.SetPrecision(GpuOptions::Precision::kFp16);
      }
#if defined(__APPLE__)
      gpu_compilation_options.SetPreferTextureWeights(false);
      gpu_compilation_options.SetUseMetalArgumentBuffers(true);
#else   // !__APPLE__
      gpu_compilation_options.SetPreferTextureWeights(true);
#endif  // !__APPLE__

      // Query the scoped file once; guard against a null handle before
      // asking whether it is valid.
      const auto& model_scoped_file =
          executor_settings.GetModelAssets().GetScopedFile();
      const bool has_valid_model_fd = model_scoped_file.ok() &&
                                      *model_scoped_file != nullptr &&
                                      (*model_scoped_file)->IsValid();

      // The program cache is either a path (std::string alternative) or an
      // already opened file; only the latter counts as a "valid fd" here.
      auto program_cache_file =
          executor_settings.GetProgramCacheFile(".mldrift_program_cache.bin");
      const bool has_valid_program_cache_fd =
          program_cache_file.ok() &&
          !std::holds_alternative<std::string>(*program_cache_file);

      auto model_path_or_status = executor_settings.GetModelAssets().GetPath();
      if (model_path_or_status.ok()) {
        // If the model path is available, use the model name as the cache key.
        // A string_view is not guaranteed to be NUL-terminated, so copy the
        // basename into a std::string before handing out a C string.
        // NOTE(review): like SetSerializationDir() below, this passes a
        // pointer into a local buffer and so assumes the options object
        // copies the string - confirm against the GpuOptions implementation.
        absl::string_view model_path = *model_path_or_status;
        const std::string model_cache_key(Basename(model_path));
        gpu_compilation_options.SetModelCacheKey(model_cache_key.c_str());
      } else if (has_valid_model_fd && has_valid_program_cache_fd) {
        // If the model is loaded from an fd, there is no way to automatically
        // generate a cache key. But if we are loading a model from an fd, it is
        // likely that our program cache is also loaded from an fd which does
        // not require a cache key to prevent collisions. The GPU delegate will
        // still expect a cache key, so we set it to a constant value.
        gpu_compilation_options.SetModelCacheKey("fd_token");
      }

      // Start from default-constructed advanced settings and overwrite them
      // only when the executor settings actually carry a value.
      AdvancedSettings advanced_settings;
      if (executor_settings.GetAdvancedSettings()) {
        advanced_settings = *executor_settings.GetAdvancedSettings();
      }

      // ":nocache" is the sentinel that disables the serialization dir. An
      // empty cache dir falls back to the model's parent directory, and to
      // the current working directory when that parent is empty too.
      bool serialization_dir_set = false;
      if (cache_path != ":nocache") {
        if (cache_path.empty()) {
          // Without an explicit cache dir a model path is required; the
          // error from GetPath() is propagated to the caller otherwise.
          ASSIGN_OR_RETURN(auto model_path,
                           executor_settings.GetModelAssets().GetPath());
          cache_path = std::filesystem::path(std::string(model_path))
                           .parent_path()
                           .string();
          if (cache_path.empty()) {
            cache_path = std::filesystem::current_path().string();
          }
        }
        ABSL_LOG(INFO) << "Setting serialization dir: " << cache_path;
        gpu_compilation_options.SetSerializationDir(cache_path.c_str());
        serialization_dir_set = true;
        gpu_compilation_options.SetSerializeExternalTensors(true);
        gpu_compilation_options.CacheCompiledProgramsOnly(
            advanced_settings.cache_compiled_shaders_only);
      } else {
        gpu_compilation_options.SetSerializeExternalTensors(false);
      }

      if (program_cache_file.ok()) {
        if (std::holds_alternative<std::string>(*program_cache_file)) {
          // Path-based program cache: its directory is used as the
          // serialization dir only when none was set above.
          if (!serialization_dir_set) {
            cache_path = std::filesystem::path(
                             std::get<std::string>(*program_cache_file))
                             .parent_path()
                             .string();
            ABSL_LOG(INFO) << "Setting program cache dir: " << cache_path;
            gpu_compilation_options.SetSerializationDir(cache_path.c_str());
          }
        } else {
          // File-based program cache: hand the options a duplicate
          // descriptor so the caller's ScopedFile is left untouched.
          auto scoped_cache_file =
              std::get<std::shared_ptr<lm::ScopedFile>>(*program_cache_file);
          ASSIGN_OR_RETURN(auto duplicated, scoped_cache_file->Duplicate());
          ASSIGN_OR_RETURN(int fd, duplicated.Release());
          gpu_compilation_options.SetProgramCacheFd(fd);
        }
        gpu_compilation_options.SetSerializeProgramCache(true);
      } else {
        gpu_compilation_options.SetSerializeProgramCache(false);
      }

      // Use NoExternalTensorsMode to get better performance.
      ASSIGN_OR_RETURN(const GpuConfig gpu_config,
                       executor_settings.GetBackendConfig<GpuConfig>());
      const bool external_tensor_mode = gpu_config.external_tensor_mode;
      gpu_compilation_options.EnableExternalTensorsMode(external_tensor_mode);
      if (!external_tensor_mode) {
        // This option prevents KVCache handling from being affected by
        // BHWC conversion in NoExternalTensorsMode.
        gpu_compilation_options.AddExternalTensorPattern("kv_cache_");
        // Treat an engaged optional that holds a null pointer like an empty
        // optional instead of dereferencing it.
        const ModelSignatures* model_signatures = signatures.value_or(nullptr);
        if (model_signatures != nullptr &&
            model_signatures->input_int32_param.has_value()) {
          gpu_compilation_options.AddBufferStorageTensorPattern("kv_cache_");
          gpu_compilation_options.AddExternalTensorPattern("param_tensor");
          gpu_compilation_options.AddBufferStorageTensorPattern("param_tensor");
        }
        ASSIGN_OR_RETURN(auto sampler_backend,
                         GetSamplerBackend(executor_settings));
        if (sampler_backend == Backend::GPU) {
          // GPU Sampler requires logits to be external tensors (PHWC4 format).
          gpu_compilation_options.AddExternalTensorPattern("logits");
        }
      }
      // Prefill and decode are always fully delegated to single delegate.
      gpu_compilation_options.SetHintFullyDelegatedToSingleDelegate(true);

      gpu_compilation_options.SetMadviseOriginalSharedTensors(
          advanced_settings.gpu_madvise_original_shared_tensors);
      gpu_compilation_options.SetConvertWeightsOnGpu(
          advanced_settings.convert_weights_on_gpu);
      gpu_compilation_options.EnableConstantTensorSharing(
          advanced_settings.share_constant_tensors);
      // Unset means "allowed": only an explicit false disables it.
      gpu_compilation_options.EnableAllowSrcQuantizedFcConvOps(
          !advanced_settings.allow_src_quantized_fc_conv_ops.has_value() ||
          advanced_settings.allow_src_quantized_fc_conv_ops.value());
      // Unset means "off": only an explicit true enables it.
      gpu_compilation_options.HintWaitingForCompletion(
          advanced_settings.hint_waiting_for_completion.has_value() &&
          advanced_settings.hint_waiting_for_completion.value());
      if (advanced_settings.is_benchmark) {
        gpu_compilation_options.SetSyncExecutionModeWaitType(
            GpuOptions::SyncExecutionModeWaitType::kActive);
        gpu_compilation_options.WaitForWeightsConversionComplete(
            advanced_settings
                .wait_for_weights_conversion_complete_in_benchmark);
      }
      if (advanced_settings.gpu_context_low_priority.has_value() &&
          advanced_settings.gpu_context_low_priority.value()) {
        gpu_compilation_options.SetPriority(GpuOptions::Priority::kLow);
      }
      if (!advanced_settings.preferred_device_substr.empty()) {
        gpu_compilation_options.SetPreferredDeviceSubstr(
            advanced_settings.preferred_device_substr.c_str());
      }
      gpu_compilation_options.DisableShaderOptimization(
          !advanced_settings.optimize_shader_compilation);
      // TODO b/441627719 - Select backend by runtime options.
#if defined(LITERT_USE_WEBGPU_ACCELERATOR)
      gpu_compilation_options.SetBackend(GpuOptions::Backend::kWebGpu);
#endif  // defined(LITERT_USE_WEBGPU_ACCELERATOR)
      // Prepare WebGPU or Vulkan command buffers ahead to reduce the overhead
      // of command buffer preparation. 2 steps ahead because KV cache is
      // swapped and the GPU resource bindings are the same as in the step
      // before the previous one.
      gpu_compilation_options.SetNumStepsOfCommandBufferPreparations(2);
      // Negative thread counts select the file-level defaults.
      gpu_compilation_options.SetNumThreadsToUpload(
          advanced_settings.num_threads_to_upload >= 0
              ? advanced_settings.num_threads_to_upload
              : kDefaultNumThreadsToUpload);
      gpu_compilation_options.SetNumThreadsToCompile(
          advanced_settings.num_threads_to_compile >= 0
              ? advanced_settings.num_threads_to_compile
              : kDefaultNumThreadsToCompile);
      compilation_options.SetHardwareAccelerators(HwAccelerators::kGpu);
      break;
    }
    case Backend::CPU: {
      LITERT_ASSIGN_OR_RETURN(auto& cpu_compilation_options,
                              compilation_options.GetCpuOptions());
      ASSIGN_OR_RETURN(const CpuConfig cpu_config,
                       executor_settings.GetBackendConfig<CpuConfig>());
      const uint32_t num_threads = cpu_config.number_of_threads;
      cpu_compilation_options.SetNumThreads(num_threads);
      // The weight cache is either a path (std::string alternative) or an
      // already opened file. Failing to obtain one is not fatal: a warning is
      // logged and no weight cache is configured.
      auto weight_cache_file = executor_settings.GetWeightCacheFile(
          cache_suffix.value_or("") + ".xnnpack_cache");
      if (weight_cache_file.ok()) {
        if (std::holds_alternative<std::string>(*weight_cache_file)) {
          cache_path = std::get<std::string>(*weight_cache_file);
          cpu_compilation_options.SetXNNPackWeightCachePath(cache_path.c_str());
        } else {
          // Hand the options a duplicate descriptor so the caller's
          // ScopedFile is left untouched.
          auto scoped_cache_file =
              std::get<std::shared_ptr<ScopedFile>>(*weight_cache_file);
          ASSIGN_OR_RETURN(auto duplicated, scoped_cache_file->Duplicate());
          ASSIGN_OR_RETURN(int fd, duplicated.Release());
          cpu_compilation_options.SetXNNPackWeightCacheFileDescriptor(fd);
        }
      } else {
        ABSL_LOG(WARNING) << "Can't use cache: " << weight_cache_file.status();
      }
      // Keep the XNNPack default flags and additionally enable the latest
      // operators.
      auto default_xnn_options = TfLiteXNNPackDelegateOptionsDefault();
      cpu_compilation_options.SetXNNPackFlags(
          default_xnn_options.flags |
          TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS);
      LITERT_ASSIGN_OR_RETURN(auto& runtime_options,
                              compilation_options.GetRuntimeOptions());
      runtime_options.SetCompressQuantizationZeroPoints(true);
      // Start from default-constructed advanced settings and overwrite them
      // only when the executor settings actually carry a value.
      AdvancedSettings advanced_settings;
      if (executor_settings.GetAdvancedSettings()) {
        advanced_settings = *executor_settings.GetAdvancedSettings();
      }
      runtime_options.SetDisableDelegateClustering(
          advanced_settings.disable_delegate_clustering);
      compilation_options.SetHardwareAccelerators(HwAccelerators::kCpu);
      break;
    }
    default:
      return absl::InvalidArgumentError(absl::StrCat(
          "Unsupported backend: ", executor_settings.GetBackend()));
  }

  return compilation_options;
}

}  // namespace litert::lm