Spaces:
Running
Running
// Copyright 2025 The ODML Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
| namespace litert::lm { | |
| std::ostream& operator<<(std::ostream& os, const GpuArtisanConfig& config) { | |
| os << "num_output_candidates: " << config.num_output_candidates << "\n"; | |
| os << "wait_for_weight_uploads: " << config.wait_for_weight_uploads << "\n"; | |
| os << "num_decode_steps_per_sync: " << config.num_decode_steps_per_sync | |
| << "\n"; | |
| os << "sequence_batch_size: " << config.sequence_batch_size << "\n"; | |
| os << "supported_lora_ranks: " << config.supported_lora_ranks << "\n"; | |
| os << "max_top_k: " << config.max_top_k << "\n"; | |
| os << "enable_decode_logits: " << config.enable_decode_logits << "\n"; | |
| os << "enable_external_embeddings: " << config.enable_external_embeddings | |
| << "\n"; | |
| os << "use_submodel: " << config.use_submodel << "\n"; | |
| return os; | |
| } | |
| std::ostream& operator<<(std::ostream& os, const GpuConfig& config) { | |
| os << "max_top_k: " << config.max_top_k << "\n"; | |
| return os; | |
| } | |
| std::ostream& operator<<(std::ostream& os, const CpuConfig& config) { | |
| os << "kv_increment_size: " << config.kv_increment_size << "\n"; | |
| os << "prefill_chunk_size: " << config.prefill_chunk_size << "\n"; | |
| os << "number_of_threads: " << config.number_of_threads << "\n"; | |
| return os; | |
| } | |
| std::ostream& operator<<(std::ostream& os, const AdvancedSettings& settings) { | |
| os << "prefill_batch_sizes: [" | |
| << absl::StrJoin(settings.prefill_batch_sizes, ", ") << "]\n"; | |
| os << "num_output_candidates: " << settings.num_output_candidates << "\n"; | |
| os << "configure_magic_numbers: " << settings.configure_magic_numbers << "\n"; | |
| os << "verify_magic_numbers: " << settings.verify_magic_numbers << "\n"; | |
| os << "clear_kv_cache_before_prefill: " | |
| << settings.clear_kv_cache_before_prefill << "\n"; | |
| os << "num_logits_to_print_after_decode: " | |
| << settings.num_logits_to_print_after_decode << "\n"; | |
| os << "gpu_madvise_original_shared_tensors: " | |
| << settings.gpu_madvise_original_shared_tensors << "\n"; | |
| os << "is_benchmark: " << settings.is_benchmark << "\n"; | |
| os << "preferred_device_substr: " << settings.preferred_device_substr << "\n"; | |
| os << "num_threads_to_upload: " << settings.num_threads_to_upload << "\n"; | |
| os << "num_threads_to_compile: " << settings.num_threads_to_compile << "\n"; | |
| os << "convert_weights_on_gpu: " << settings.convert_weights_on_gpu << "\n"; | |
| os << "wait_for_weights_conversion_complete_in_benchmark: " | |
| << settings.wait_for_weights_conversion_complete_in_benchmark << "\n"; | |
| os << "optimize_shader_compilation: " << settings.optimize_shader_compilation | |
| << "\n"; | |
| os << "cache_compiled_shaders_only: " << settings.cache_compiled_shaders_only | |
| << "\n"; | |
| os << "share_constant_tensors: " << settings.share_constant_tensors << "\n"; | |
| os << "sampler_handles_input: " << settings.sampler_handles_input << "\n"; | |
| if (settings.allow_src_quantized_fc_conv_ops.has_value()) { | |
| os << "allow_src_quantized_fc_conv_ops: " | |
| << settings.allow_src_quantized_fc_conv_ops.value() << "\n"; | |
| } else { | |
| os << "allow_src_quantized_fc_conv_ops: Not set\n"; | |
| } | |
| if (settings.hint_waiting_for_completion.has_value()) { | |
| os << "hint_waiting_for_completion: " | |
| << settings.hint_waiting_for_completion.value() << "\n"; | |
| } else { | |
| os << "hint_waiting_for_completion: Not set\n"; | |
| } | |
| if (settings.gpu_context_low_priority.has_value()) { | |
| os << "gpu_context_low_priority: " | |
| << settings.gpu_context_low_priority.value() << "\n"; | |
| } else { | |
| os << "gpu_context_low_priority: Not set\n"; | |
| } | |
| os << "enable_speculative_decoding: " << settings.enable_speculative_decoding | |
| << "\n"; | |
| os << "disable_delegate_clustering: " << settings.disable_delegate_clustering | |
| << "\n"; | |
| return os; | |
| } | |
| std::ostream& operator<<(std::ostream& os, const LlmExecutorSettings& config) { | |
| os << "backend: " << config.GetBackend() << "\n"; | |
| std::visit( | |
| [&os](const auto& backend_config) { | |
| os << "backend_config:\n" << backend_config << "\n"; | |
| }, | |
| config.backend_config_); | |
| os << "max_tokens: " << config.GetMaxNumTokens() << "\n"; | |
| os << "activation_data_type: " << config.GetActivationDataType() << "\n"; | |
| os << "max_num_images: " << config.GetMaxNumImages() << "\n"; | |
| os << "lora_rank: " << config.GetLoraRank() << "\n"; | |
| os << "cache_dir: " << config.GetCacheDir() << "\n"; | |
| if (config.GetScopedCacheFile()) { | |
| os << "cache_file: " << config.GetScopedCacheFile()->file() << "\n"; | |
| } else { | |
| os << "cache_file: Not set\n"; | |
| } | |
| if (config.GetLitertDispatchLibDir().empty()) { | |
| os << "litert_dispatch_lib_dir: Not set\n"; | |
| } else { | |
| os << "litert_dispatch_lib_dir: " << config.GetLitertDispatchLibDir() | |
| << "\n"; | |
| } | |
| os << "model_assets: " << config.GetModelAssets() << "\n"; | |
| if (config.GetAdvancedSettings().has_value()) { | |
| os << "advanced_settings: " << *config.GetAdvancedSettings() << "\n"; | |
| } else { | |
| os << "advanced_settings: Not set\n"; | |
| } | |
| return os; | |
| } | |
| // static | |
| absl::StatusOr<LlmExecutorSettings> LlmExecutorSettings::CreateDefault( | |
| ModelAssets model_assets, Backend backend, | |
| std::optional<Backend> sampler_backend) { | |
| LlmExecutorSettings settings(std::move(model_assets)); | |
| if (backend == Backend::CPU) { | |
| CpuConfig config; | |
| config.kv_increment_size = 16; | |
| config.prefill_chunk_size = -1; | |
| config.number_of_threads = 4; | |
| settings.SetBackendConfig(config); | |
| } else if (backend == Backend::GPU) { | |
| GpuConfig config; | |
| // Default max top k to 1 for GPU. | |
| config.max_top_k = 1; | |
| settings.SetBackendConfig(config); | |
| } else if (backend == Backend::NPU) { | |
| } else if (backend == Backend::GPU_ARTISAN) { | |
| settings.SetBackendConfig(GpuArtisanConfig()); | |
| } else { | |
| return absl::InvalidArgumentError( | |
| absl::StrCat("Unsupported backend: ", backend)); | |
| } | |
| RETURN_IF_ERROR(settings.SetBackend(backend)); | |
| // Explicitly set the field value to avoid undefined behavior. Setting to 0 | |
| // means that the maximum number of tokens is not set can could be inferred | |
| // from the model assets (but note that for the model or backend which does | |
| // not support this, an error will be thrown during initialization). | |
| settings.SetMaxNumTokens(0); | |
| // Disable image input by default. | |
| settings.SetMaxNumImages(0); | |
| // Disable LoRA by default. | |
| settings.SetLoraRank(0); | |
| if (sampler_backend.has_value() && *sampler_backend != Backend::UNSPECIFIED) { | |
| settings.SetSamplerBackend(*sampler_backend); | |
| } | |
| return settings; | |
| } | |
| } // namespace litert::lm | |