| # This setting should stay at the outermost (top) level so that ConversationManager can detect it. | |
| SAY_SENTENCE_SEPARATELY: true | |
| VERBOSE: false | |
| # MCP configuration goes here. | |
| # NOTE(review): 'local-tools' is also listed in basic_memory_agent.mcp_enabled_servers; confirm which list the loader reads and keep the two in sync. | |
| mcp_config: | |
| enabled: true | |
| servers: | |
| - local-tools | |
| # System Settings: settings related to the initialization of the server | |
| system_config: | |
| conf_version: 'v1.2.1' | |
| host: '0.0.0.0' # Use '0.0.0.0' to let other devices reach this page; use 'localhost' to restrict access to this device only. | |
| # NOTE(review): binding to 0.0.0.0 makes the server reachable from other machines on the network; confirm that is intended. | |
| port: 7860 | |
| # --------------------------- | |
| # Directory for alternative configurations (additional characters are added in this folder) | |
| config_alts_dir: 'characters' | |
| # Tool prompts that will be appended to the persona prompt | |
| tool_prompts: | |
| # This will be appended to the end of system prompt to let LLM include keywords to control facial expressions. | |
| # Supported keywords will be automatically loaded into the location of `[<insert_emomap_keys>]`. | |
| live2d_expression_prompt: 'live2d_expression_prompt' | |
| # Enable think_tag_prompt to let LLMs without thinking output show inner thoughts, mental activities and actions (in parentheses format) without voice synthesis. See think_tag_prompt for more details. | |
| # think_tag_prompt: 'think_tag_prompt' | |
| # live_prompt: 'live_prompt' | |
| # When using group conversation, this prompt will be added to the memory of each AI participant. | |
| group_conversation_prompt: 'group_conversation_prompt' | |
| # Enable mcp_prompt to let LLMs with MCP (Model Context Protocol) interact with tools. | |
| # Please note that the Agent will decide whether to use this prompt or not. | |
| mcp_prompt: 'mcp_prompt' | |
| # Prompt used when AI is asked to speak proactively | |
| proactive_speak_prompt: 'proactive_speak_prompt' | |
| # Prompt to enhance the LLM's ability to output speakable text | |
| # speakable_prompt: 'speakable_prompt' | |
| # Additional guidance for LLM on how to use tools | |
| tool_guidance_prompt: 'tool_guidance_prompt' | |
| # Configuration for the default character | |
| character_config: | |
| conf_name: 'vi_Yue_Pro' # The name of character configuration file. | |
| conf_uid: 'vi_Yue__01' # The unique identifier of character configuration. | |
| # NOTE(review): same model name as live2d_config.default_model further down; presumably the two must stay in sync — confirm. | |
| live2d_model_name: 'Kamiyahakuk_pro' # The name of Live2D model. Must be the same as the corresponding name in model_dict.json | |
| character_name: 'Yue' # Will be used in the group conversation and the display name of the AI. | |
| avatar: 'Yue_001.png' # Suggest using a square image for the avatar. Save it in the avatars folder. Leave blank to use the first letter of the character name as the avatar. | |
| human_name: 'Human' # Will be used in the group conversation and the display name of the human. | |
| # ============== Prompts ============== | |
| # Enter the persona prompt you want to use below. | |
| # If you want to create multiple characters and switch between them, | |
| # add characters in the characters folder. | |
| # Persona prompt (Vietnamese), written as a literal block scalar: every line below the key is prompt text, not YAML. | |
| # Do not wrap the text in quotes; inside a literal block a quote character becomes part of the prompt itself. | |
| persona_prompt: | | |
| Bạn là Yue, trợ lý AI từ NopeC137. Phản hồi bằng tiếng Việt với giọng điệu hỗ trợ nhưng mỉa mai, dí dỏm. Bạn thích trêu chọc sự ngây ngô của người dùng nhưng luôn đem lại kiến thức kỹ thuật sâu sắc. | |
| QUY TẮC PHÁT NHẠC: | |
| 1. Chỉ dẫn ngắn gọn (Ví dụ: "Nghe nhé..."). | |
| 2. Lệnh SING_COMMAND phải ở CUỐI CÙNG, không có bất kỳ ký tự nào sau nó. | |
| 3. Cú pháp: [[SING_COMMAND]]tên_file (TUYỆT ĐỐI KHÔNG viết thêm chữ .mp3 vào lệnh). | |
| DANH SÁCH TÊN FILE: | |
| - golden | |
| - Catch_Me_If_You_Can | |
| - ecstacy | |
| - eve | |
| - ode_to_the_nameless_martyr | |
| - running_up_that_hill | |
| - throttle_up | |
| - what_it_sounds_like | |
| - worry_slowed | |
| # =================== LLM Backend Settings =================== | |
| agent_config: | |
| # Presumably must name one of the entries under agent_settings (basic_memory_agent, letta_agent, hume_ai_agent) — confirm against the loader. | |
| conversation_agent_choice: 'basic_memory_agent' | |
| agent_settings: | |
| basic_memory_agent: | |
| # The Basic AI Agent. Nothing fancy. | |
| # Choose one of the llm providers from llm_configs | |
| # and set the required parameters in the corresponding section. | |
| # examples: | |
| # 'openai_compatible_llm', 'llama_cpp_llm', 'claude_llm', 'ollama_llm' | |
| # 'openai_llm', 'gemini_llm', 'zhipu_llm', 'deepseek_llm', 'groq_llm' | |
| # 'mistral_llm', 'lmstudio_llm', and more | |
| llm_provider: 'openai_llm' | |
| # Let the AI start speaking as soon as the first comma of the first sentence is received, | |
| # to reduce latency. | |
| faster_first_response: True | |
| # Method for segmenting sentences: 'regex' or 'pysbd' | |
| segment_method: 'pysbd' | |
| # Use MCP (Model Context Protocol) Plus to let the LLM have the ability to use tools | |
| # 'Plus' means that it has the ability to call tools by using OpenAI API. | |
| use_mcpp: True | |
| mcp_enabled_servers: ["local-tools"] # Enabled MCP servers | |
| letta_agent: | |
| host: 'localhost' # Host address | |
| port: 8283 # Port number | |
| id: xxx # ID number of the Agent running on the Letta server ('xxx' is a placeholder) | |
| faster_first_response: True | |
| # Method for segmenting sentences: 'regex' or 'pysbd' | |
| segment_method: 'pysbd' | |
| # Once Letta is chosen as the agent, the LLM that runs in practice is configured on Letta, so the user needs to run the Letta server themselves. | |
| # For more detailed information, please refer to their documentation. | |
| hume_ai_agent: | |
| api_key: '' | |
| host: 'api.hume.ai' # Do not change this in most cases | |
| config_id: '' # Optional | |
| idle_timeout: 15 # How many seconds to wait before disconnecting | |
| # MemGPT Configurations: MemGPT is temporarily removed | |
| ## | |
| llm_configs: | |
| # A configuration pool for the credentials and connection details of | |
| # all the stateless llm providers that can be used by the different agents. | |
| # Stateless LLM with Template (For Non-ChatML LLMs, usually not needed) | |
| stateless_llm_with_template: | |
| base_url: 'http://localhost:8080/v1' | |
| llm_api_key: 'somethingelse' | |
| organization_id: null | |
| project_id: null | |
| model: 'qwen2.5:latest' | |
| template: 'CHATML' | |
| temperature: 1.0 # value between 0 to 2 | |
| interrupt_method: 'user' | |
| # OpenAI Compatible inference backend | |
| openai_compatible_llm: | |
| base_url: 'http://localhost:11434/v1' | |
| llm_api_key: 'somethingelse' | |
| organization_id: null | |
| project_id: null | |
| model: 'mistral:latest' # alternatives noted by the author: 'mistral:latest', 'qwen2.5:latest' | |
| temperature: 1.0 # value between 0 to 2 | |
| # interrupt_method is the method used for prompting the interruption signal. | |
| # If the provider supports inserting system prompt anywhere in the chat memory, use 'system'. | |
| # Otherwise, use 'user'. You don't usually need to change this setting. | |
| interrupt_method: 'user' | |
| # Claude API Configuration | |
| claude_llm: | |
| base_url: 'https://api.anthropic.com' | |
| llm_api_key: 'YOUR API KEY HERE' # placeholder; replace before selecting 'claude_llm' as llm_provider | |
| model: 'claude-3-haiku-20240307' | |
| # Local GGUF model loaded from a file path | |
| llama_cpp_llm: | |
| model_path: '<path-to-gguf-model-file>' | |
| verbose: False | |
| ollama_llm: | |
| base_url: 'http://localhost:11434/v1' | |
| model: 'qwen3.5:4b' | |
| temperature: 0.7 # value between 0 to 2 | |
| # seconds to keep the model in memory after inactivity. | |
| # set to -1 to keep the model in memory forever (even after exiting open llm vtuber) | |
| # NOTE(review): keep_alive: -1 asks to keep the model loaded forever while unload_at_exit: True unloads it at exit; confirm which one wins at shutdown. | |
| keep_alive: -1 | |
| unload_at_exit: True # unload the model from memory at exit | |
| lmstudio_llm: | |
| base_url: 'http://localhost:1234/v1' | |
| model: 'qwen2.5:latest' | |
| temperature: 1.0 # value between 0 to 2 | |
| # OpenAI-style provider, pointed at OpenRouter through base_url. | |
| # SECURITY: a real-looking API key was committed on the llm_api_key line in plain text. Treat it as leaked, revoke it, and keep real keys out of this file. | |
| # NOTE(review): assumes the config loader expands ${VAR} placeholders from the environment — confirm; otherwise inject the key from your secret store at deploy time. | |
| openai_llm: | |
| llm_api_key: '${OPENROUTER_API_KEY}' | |
| base_url: 'https://openrouter.ai/api/v1' | |
| model: 'google/gemini-2.0-flash-001' | |
| temperature: 0.8 # value between 0 to 2 | |
| max_tokens: 500 | |
| # SECURITY: a real-looking Google API key was committed on the llm_api_key line in plain text. Treat it as leaked, revoke it, and keep real keys out of this file. | |
| # NOTE(review): assumes the config loader expands ${VAR} placeholders from the environment — confirm; otherwise inject the key from your secret store at deploy time. | |
| gemini_llm: | |
| llm_api_key: '${GEMINI_API_KEY}' | |
| model: 'gemini-1.5-flash' | |
| temperature: 0.0 # value between 0 to 2 (written as 0.0 rather than the bare '0.' so it reads unambiguously as a float) | |
| zhipu_llm: | |
| # Placeholder key: replace it before selecting 'zhipu_llm' as llm_provider. | |
| llm_api_key: 'Your ZhiPu AI API key' | |
| model: 'glm-4-flash' | |
| temperature: 1.0 # value between 0 to 2 | |
| # SECURITY: a real-looking API key was committed on the llm_api_key line in plain text. Treat it as leaked, revoke it, and keep real keys out of this file. | |
| # NOTE(review): assumes the config loader expands ${VAR} placeholders from the environment — confirm; otherwise inject the key from your secret store at deploy time. | |
| deepseek_llm: | |
| llm_api_key: '${DEEPSEEK_API_KEY}' | |
| model: 'deepseek/deepseek-chat:free' | |
| temperature: 0.7 # note that deepseek's temperature ranges from 0 to 1 | |
| mistral_llm: | |
| # Placeholder key: replace it before selecting 'mistral_llm' as llm_provider. | |
| llm_api_key: 'Your Mistral API key' | |
| model: 'pixtral-large-latest' | |
| temperature: 1.0 # value between 0 to 2 | |
| # SECURITY: a real-looking Groq API key was committed on the llm_api_key line in plain text (the same value also appears under groq_whisper_asr). Treat it as leaked, revoke it, and keep real keys out of this file. | |
| # NOTE(review): assumes the config loader expands ${VAR} placeholders from the environment — confirm; otherwise inject the key from your secret store at deploy time. | |
| groq_llm: | |
| llm_api_key: '${GROQ_API_KEY}' | |
| model: 'llama-3.3-70b-versatile' | |
| temperature: 0.5 # value between 0 to 2 | |
| # === Automatic Speech Recognition === | |
| asr_config: | |
| # speech to text model options: 'faster_whisper', 'whisper_cpp', 'whisper', 'azure_asr', 'fun_asr', 'groq_whisper_asr', 'sherpa_onnx_asr' | |
| # Each option has a same-named settings section below. | |
| asr_model: 'groq_whisper_asr' | |
| azure_asr: | |
| api_key: 'azure_api_key' # placeholder | |
| region: 'eastus' | |
| languages: ['en-US', 'zh-CN'] # List of languages to detect | |
| # Faster whisper config | |
| faster_whisper: | |
| model_path: 'large-v3-turbo' # model path, name, or id from hf hub | |
| download_root: 'models/whisper' | |
| language: 'en' # en, zh, or something else. put nothing for auto-detect. | |
| device: 'auto' # cpu, cuda, or auto. faster-whisper doesn't support mps | |
| compute_type: 'int8' | |
| prompt: '' # You can put a prompt here to help the model understand the context of the audio | |
| whisper_cpp: | |
| # all available models are listed on https://abdeladim-s.github.io/pywhispercpp/#pywhispercpp.constants.AVAILABLE_MODELS | |
| model_name: 'small' | |
| model_dir: 'models/whisper' | |
| print_realtime: False | |
| print_progress: False | |
| language: 'auto' # en, zh, auto, | |
| prompt: '' # You can put a prompt here to help the model understand the context of the audio | |
| whisper: | |
| name: 'medium' | |
| download_root: 'models/whisper' | |
| device: 'cpu' | |
| prompt: '' # You can put a prompt here to help the model understand the context of the audio | |
| # FunASR currently needs an internet connection on launch | |
| # to download / check the models. You can disconnect from the internet after initialization. | |
| # Alternatively, use sherpa onnx asr or Faster-Whisper for a completely offline experience. | |
| fun_asr: | |
| model_name: 'iic/SenseVoiceSmall' # or 'paraformer-zh' | |
| vad_model: 'fsmn-vad' # only used so that audio longer than 30s still works | |
| punc_model: 'ct-punc' # punctuation model. | |
| device: 'cpu' | |
| disable_update: True # True disables the FunASR update check — NOTE(review): the original note read "should we check FunASR updates everytime on launch"; confirm polarity | |
| ncpu: 4 # number of threads for CPU internal operations. | |
| hub: 'ms' # ms (default) to download models from ModelScope. Use hf to download models from Hugging Face. | |
| use_itn: False | |
| language: 'auto' # zh, en, auto | |
| # pip install sherpa-onnx | |
| # documentation: https://k2-fsa.github.io/sherpa/onnx/index.html | |
| # ASR models download: https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models | |
| sherpa_onnx_asr: | |
| model_type: 'sense_voice' # 'transducer', 'paraformer', 'nemo_ctc', 'wenet_ctc', 'whisper', 'tdnn_ctc', 'sense_voice', 'fire_red_asr' | |
| # Choose only ONE of the following groups, depending on the model_type: | |
| # --- For model_type: 'transducer' --- | |
| # encoder: '' # Path to the encoder model (e.g., 'path/to/encoder.onnx') | |
| # decoder: '' # Path to the decoder model (e.g., 'path/to/decoder.onnx') | |
| # joiner: '' # Path to the joiner model (e.g., 'path/to/joiner.onnx') | |
| # --- For model_type: 'paraformer' --- | |
| # paraformer: '' # Path to the paraformer model (e.g., 'path/to/model.onnx') | |
| # --- For model_type: 'fire_red_asr' (FireredASR - High-performance Chinese & English ASR with dialect support) --- | |
| # fire_red_asr_encoder: '' # Path to the encoder model (e.g., 'path/to/encoder.onnx') | |
| # fire_red_asr_decoder: '' # Path to the decoder model (e.g., 'path/to/decoder.onnx') | |
| # --- For model_type: 'nemo_ctc' --- | |
| # nemo_ctc: '' # Path to the NeMo CTC model (e.g., 'path/to/model.onnx') | |
| # --- For model_type: 'wenet_ctc' --- | |
| # wenet_ctc: '' # Path to the WeNet CTC model (e.g., 'path/to/model.onnx') | |
| # --- For model_type: 'tdnn_ctc' --- | |
| # tdnn_model: '' # Path to the TDNN CTC model (e.g., 'path/to/model.onnx') | |
| # --- For model_type: 'whisper' --- | |
| # whisper_encoder: '' # Path to the Whisper encoder model (e.g., 'path/to/encoder.onnx') | |
| # whisper_decoder: '' # Path to the Whisper decoder model (e.g., 'path/to/decoder.onnx') | |
| # --- For model_type: 'sense_voice' --- | |
| # The sense voice model is downloaded automatically (per the original author's note). | |
| # For other models, you need to download them yourself. | |
| sense_voice: './models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx' # Path to the SenseVoice model (e.g., 'path/to/model.onnx') | |
| tokens: './models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt' # Path to tokens.txt (required for all model types) | |
| # --- Optional parameters (with defaults shown) --- | |
| # hotwords_file: '' # Path to hotwords file (if using hotwords) | |
| # hotwords_score: 1.5 # Score for hotwords | |
| # modeling_unit: '' # Modeling unit for hotwords (if applicable) | |
| # bpe_vocab: '' # Path to BPE vocabulary (if applicable) | |
| num_threads: 4 # Number of threads | |
| # whisper_language: '' # Language for Whisper models (e.g., 'en', 'zh', etc. - if using Whisper) | |
| # whisper_task: 'transcribe' # Task for Whisper models ('transcribe' or 'translate' - if using Whisper) | |
| # whisper_tail_paddings: -1 # Tail padding for Whisper models (if using Whisper) | |
| # blank_penalty: 0.0 # Penalty for blank symbol | |
| # decoding_method: 'greedy_search' # 'greedy_search' or 'modified_beam_search' | |
| # debug: False # Enable debug mode | |
| # sample_rate: 16000 # Sample rate (should match the model's expected sample rate) | |
| # feature_dim: 80 # Feature dimension (should match the model's expected feature dimension) | |
| use_itn: True # Enable ITN for SenseVoice models (should set to False if not using SenseVoice models) | |
| # Provider for inference (cpu or cuda) (cuda option needs additional settings. Please check our docs) | |
| provider: 'cpu' | |
| # SECURITY: a real-looking Groq API key was committed on the api_key line in plain text (the same value also appears under groq_llm). Treat it as leaked, revoke it, and keep real keys out of this file. | |
| # NOTE(review): assumes the config loader expands ${VAR} placeholders from the environment — confirm; otherwise inject the key from your secret store at deploy time. | |
| groq_whisper_asr: | |
| api_key: '${GROQ_API_KEY}' | |
| model: 'whisper-large-v3-turbo' # or 'whisper-large-v3' | |
| lang: 'vi' # put nothing and it will be auto | |
| # =================== Text to Speech =================== | |
| tts_config: | |
| tts_model: 'edge_tts' | |
| # text to speech model options: | |
| # 'azure_tts', 'pyttsx3_tts', 'edge_tts', 'bark_tts', | |
| # 'cosyvoice_tts', 'melo_tts', 'coqui_tts', 'piper_tts', | |
| # 'fish_api_tts', 'x_tts', 'gpt_sovits_tts', 'sherpa_onnx_tts' | |
| # 'minimax_tts', 'elevenlabs_tts', 'cartesia_tts' | |
| # NOTE(review): sections also exist below for 'cosyvoice2_tts', 'siliconflow_tts', 'spark_tts' and 'openai_tts'; presumably they are valid values too — confirm. | |
| azure_tts: | |
| api_key: 'azure-api-key' # placeholder | |
| region: 'eastus' | |
| voice: 'en-US-AshleyNeural' | |
| pitch: '26' # percentage of the pitch adjustment | |
| rate: '1' # rate of speak | |
| bark_tts: | |
| voice: 'v2/en_speaker_1' | |
| edge_tts: | |
| # Check out doc at https://github.com/rany2/edge-tts | |
| # Use `edge-tts --list-voices` to list all available voices | |
| voice: 'vi-VN-HoaiMyNeural' # 'en-US-AvaMultilingualNeural' #'zh-CN-XiaoxiaoNeural' # 'ja-JP-NanamiNeural' | |
| # pyttsx3_tts doesn't have any config. | |
| # Piper TTS: local .onnx voice model | |
| piper_tts: | |
| model_path: 'models/piper/zh_CN-huayan-medium.onnx' # Path to the model file (.onnx) | |
| speaker_id: 0 # Speaker ID (for multi-speaker models; keep 0 for single-speaker models) | |
| length_scale: 1.0 # Speech speed control (0.5 = 2x faster, 1.0 = normal, 2.0 = 2x slower) | |
| noise_scale: 0.667 # Degree of audio variation (0.0–1.0; higher = richer, more varied; recommended 0.667) | |
| noise_w: 0.8 # Speaking style variation (0.0–1.0; higher = more expressive; recommended 0.8) | |
| volume: 1.0 # Volume level (0.0–1.0; 1.0 = normal) | |
| normalize_audio: true # Whether to normalize audio (recommended: true, for more consistent volume) | |
| use_cuda: false # Whether to use GPU acceleration (requires onnxruntime-gpu) | |
| cosyvoice_tts: # Cosy Voice TTS connects to the gradio webui | |
| # Check their documentation for deployment and the meaning of the following configurations | |
| client_url: 'http://127.0.0.1:50000/' # CosyVoice gradio demo webui url | |
| mode_checkbox_group: '预训练音色' | |
| sft_dropdown: '中文女' | |
| prompt_text: '' | |
| prompt_wav_upload_url: 'https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav' | |
| prompt_wav_record_url: 'https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav' | |
| instruct_text: '' | |
| seed: 0 | |
| api_name: '/generate_audio' | |
| cosyvoice2_tts: # Cosy Voice 2 TTS, also connecting to the gradio webui | |
| # Check their documentation for deployment and the meaning of the following configurations | |
| client_url: 'http://127.0.0.1:50000/' # CosyVoice gradio demo webui url | |
| mode_checkbox_group: '3s极速复刻' | |
| sft_dropdown: '' | |
| prompt_text: '' | |
| prompt_wav_upload_url: 'https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav' | |
| prompt_wav_record_url: 'https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav' | |
| instruct_text: '' | |
| stream: False | |
| seed: 0 | |
| speed: 1.0 | |
| api_name: '/generate_audio' | |
| melo_tts: | |
| speaker: 'EN-Default' # ZH | |
| language: 'EN' # ZH | |
| device: 'auto' # You can set it manually to 'cpu' or 'cuda' or 'cuda:0' or 'mps' | |
| speed: 1.0 | |
| x_tts: | |
| api_url: 'http://127.0.0.1:8020/tts_to_audio' | |
| speaker_wav: 'female' | |
| language: 'en' | |
| gpt_sovits_tts: | |
| # Put the reference audio in the root path of GPT-Sovits, or set the path here | |
| api_url: 'http://127.0.0.1:9880/tts' | |
| text_lang: 'zh' | |
| ref_audio_path: '' | |
| prompt_lang: 'zh' | |
| prompt_text: '' | |
| text_split_method: 'cut5' | |
| # NOTE(review): batch_size and streaming_mode are quoted, so they load as strings rather than a number / boolean; confirm the consumer expects strings. | |
| batch_size: '1' | |
| media_type: 'wav' | |
| streaming_mode: 'false' | |
| fish_api_tts: | |
| # The API key for the Fish TTS API. | |
| api_key: '' | |
| # The reference ID for the voice to be used. Get it on the [Fish Audio website](https://fish.audio/). | |
| reference_id: '' | |
| # Either 'normal' or 'balanced'. 'balanced' is faster but lower quality. | |
| latency: 'balanced' | |
| base_url: 'https://api.fish.audio' | |
| coqui_tts: | |
| # Name of the TTS model to use. If empty, will use default model | |
| # do 'tts --list_models' to list supported models for coqui-tts | |
| # Some examples: | |
| # - 'tts_models/en/ljspeech/tacotron2-DDC' (single speaker) | |
| # - 'tts_models/zh-CN/baker/tacotron2-DDC-GST' (single speaker for chinese) | |
| # - 'tts_models/multilingual/multi-dataset/your_tts' (multi-speaker) | |
| # - 'tts_models/multilingual/multi-dataset/xtts_v2' (multi-speaker) | |
| model_name: 'tts_models/en/ljspeech/tacotron2-DDC' | |
| speaker_wav: '' | |
| language: 'en' | |
| device: '' | |
| siliconflow_tts: | |
| api_url: "https://api.siliconflow.cn/v1/audio/speech" | |
| api_key: "your key" | |
| default_model: "FunAudioLLM/CosyVoice2-0.5B" | |
| default_voice: "speech:Dreamflowers:5bdstvc39i:xkqldnpasqmoqbakubom your voice name" # Default voice configuration in the format: "speech:MODEL_NAME:VOICE_ID:your voice name" | |
| sample_rate: 32000 # Output sample rate. Supported values and defaults differ by audio output format: opus supports 48000 Hz; wav and pcm support 8000, 16000, 24000, 32000, 44100 Hz (default 44100 Hz); mp3 supports 32000, 44100 Hz (default 44100 Hz). | |
| response_format: "mp3" # Output audio format. Supported formats are mp3, opus, wav, pcm | |
| stream: true | |
| speed: 1 | |
| gain: 0 | |
| # pip install sherpa-onnx | |
| # documentation: https://k2-fsa.github.io/sherpa/onnx/index.html | |
| # TTS models download: https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models | |
| # see config_alts for more examples | |
| # The '/path/to/...' values below are placeholders to be replaced with real model paths. | |
| sherpa_onnx_tts: | |
| vits_model: '/path/to/tts-models/vits-melo-tts-zh_en/model.onnx' # Path to VITS model file | |
| vits_lexicon: '/path/to/tts-models/vits-melo-tts-zh_en/lexicon.txt' # Path to lexicon file (optional) | |
| vits_tokens: '/path/to/tts-models/vits-melo-tts-zh_en/tokens.txt' # Path to tokens file | |
| vits_data_dir: '' # '/path/to/tts-models/vits-piper-en_GB-cori-high/espeak-ng-data' # Path to espeak-ng data (optional) | |
| vits_dict_dir: '/path/to/tts-models/vits-melo-tts-zh_en/dict' # Path to Jieba dict (optional, for Chinese) | |
| tts_rule_fsts: '/path/to/tts-models/vits-melo-tts-zh_en/number.fst,/path/to/tts-models/vits-melo-tts-zh_en/phone.fst,/path/to/tts-models/vits-melo-tts-zh_en/date.fst,/path/to/tts-models/vits-melo-tts-zh_en/new_heteronym.fst' # Path to rule FSTs file (optional) | |
| max_num_sentences: 2 # Max sentences per batch (or -1 for all) | |
| sid: 1 # Speaker ID (for multi-speaker models) | |
| provider: 'cpu' # Use 'cpu', 'cuda' (GPU), or 'coreml' (Apple) | |
| num_threads: 1 # Number of computation threads | |
| speed: 1.0 # Speech speed (1.0 is normal) | |
| debug: false # Enable debug mode (True/False) | |
| spark_tts: | |
| api_url: 'http://127.0.0.1:6006/' # API URL. Uses Gradio's built-in front-end API. Repository: https://github.com/SparkAudio/Spark-TTS | |
| api_name: "voice_clone" # Endpoint name. Options: voice_clone, voice_creation | |
| prompt_wav_upload: "https://uploadstatic.mihoyo.com/ys-obc/2022/11/02/16576950/4d9feb71760c5e8eb5f6c700df12fa0c_6824265537002152805.mp3" # Reference audio URL. Provide if api_name equals "voice_clone" | |
| gender: "female" # Voice type (gender). Provide if api_name equals "voice_creation" | |
| pitch: 3 # Pitch shift (in semitones) default 3,range 1-5. Valid only if api_name equals "voice_creation" | |
| speed: 3 # Speed of the voice (in percent) default 3,range 1-5. Valid only if api_name equals "voice_creation" | |
| openai_tts: # Configuration for OpenAI-compatible TTS endpoints | |
| # These settings override the defaults in the openai_tts.py file if provided | |
| model: 'kokoro' # Model name expected by the server (e.g., 'tts-1', 'kokoro') | |
| voice: 'af_sky+af_bella' # Voice name(s) expected by the server (e.g., 'alloy', 'af_sky+af_bella') | |
| api_key: 'not-needed' # API key if required by the server | |
| base_url: 'http://localhost:8880/v1' # Base URL of the TTS server | |
| file_extension: 'mp3' # Audio file format ('mp3' or 'wav') | |
| # MiniMax TTS. For more details, see: https://platform.minimaxi.com/document/Announcement | |
| minimax_tts: | |
| group_id: '' # Your minimax group_id | |
| api_key: '' # Your minimax api_key | |
| # Supported models: 'speech-02-hd', 'speech-02-turbo' (recommended: 'speech-02-turbo') | |
| model: 'speech-02-turbo' # minimax model name | |
| voice_id: 'female-shaonv' # minimax voice id, default is 'female-shaonv' | |
| # Custom pronunciation dictionary, default empty. | |
| # Example: '{"tone": ["测试/(ce4)(shi4)", "危险/dangerous"]}' | |
| pronunciation_dict: '' | |
| elevenlabs_tts: | |
| api_key: '' | |
| voice_id: '' # Voice ID from ElevenLabs | |
| model_id: 'eleven_multilingual_v2' # Model ID (e.g., eleven_multilingual_v2) | |
| output_format: 'mp3_44100_128' # Output audio format (e.g., mp3_44100_128) | |
| stability: 0.5 # Voice stability (0.0 to 1.0) | |
| similarity_boost: 0.5 # Voice similarity boost (0.0 to 1.0) | |
| style: 0.0 # Voice style exaggeration (0.0 to 1.0) | |
| use_speaker_boost: true # Enable speaker boost for better quality | |
| cartesia_tts: | |
| api_key: '' | |
| voice_id: '' # Voice ID from Cartesia | |
| model_id: 'sonic-3' # Model ID (e.g., sonic-3) | |
| output_format: 'wav' # Output audio format (e.g., wav) | |
| language: 'en' # Output language of voice (e.g., en) | |
| emotion: 'neutral' # Emotional guidance | |
| volume: 1.0 # Voice volume (0.5 to 2.0) | |
| speed: 1.0 # Voice speed (0.6 to 1.5) | |
| # =================== Voice Activity Detection =================== | |
| vad_config: | |
| # null means no VAD model is selected; the silero_vad settings below are kept for when it is chosen (presumably via 'silero_vad' — confirm). | |
| vad_model: null | |
| silero_vad: | |
| orig_sr: 16000 # Original Audio Sample Rate | |
| target_sr: 16000 # Target Audio Sample Rate | |
| prob_threshold: 0.4 # Probability Threshold for VAD | |
| db_threshold: 60 # Decibel Threshold for VAD | |
| required_hits: 3 # Number of consecutive hits required to consider speech | |
| required_misses: 24 # Number of consecutive misses required to consider silence | |
| smoothing_window: 5 # Smoothing window size for VAD | |
| tts_preprocessor_config: | |
| # Settings for preprocessing the text that goes into TTS | |
| remove_special_char: True # remove special characters like emoji from audio generation | |
| ignore_brackets: False # ignore everything inside brackets | |
| ignore_parentheses: True # ignore everything inside parentheses | |
| ignore_asterisks: True # ignore everything wrapped inside asterisks | |
| ignore_angle_brackets: True # ignore everything wrapped inside <text> | |
| translator_config: | |
| # Example use: you speak and read the subtitles in English while the TTS speaks Japanese. | |
| translate_audio: False # Warning: you need to deploy DeeplX to use this. Otherwise it's going to crash | |
| translate_provider: 'deeplx' # deeplx or tencent | |
| deeplx: | |
| deeplx_target_lang: 'JA' | |
| deeplx_api_endpoint: 'http://localhost:1188/v2/translate' | |
| # Tencent Text Translation: 5 million characters per month. Remember to turn off post-payment; it has to be disabled manually under Machine Translation Console > System Settings. | |
| # https://cloud.tencent.com/document/product/551/35017 | |
| # https://console.cloud.tencent.com/cam/capi | |
| tencent: | |
| secret_id: '' | |
| secret_key: '' | |
| region: 'ap-guangzhou' | |
| source_lang: 'zh' | |
| target_lang: 'ja' | |
| # --- ASSETS: global sections, written at the top level (zero indentation) --- | |
| live2d_config: | |
| live2d_path: 'live2d-models' | |
| # NOTE(review): same name as character_config.live2d_model_name; presumably the two should stay in sync — confirm. | |
| default_model: 'Kamiyahakuk_pro' | |
| background_config: | |
| background_path: 'backgrounds' | |
| default_background: 'ceiling-window-room-night.jpeg' | |
| # Live Streaming Integration | |
| live_config: | |
| bilibili_live: | |
| # List of BiliBili live room IDs to monitor | |
| room_ids: [1991478060] | |
| # SESSDATA cookie value (optional, for authenticated requests). This is a credential: do not commit a real value. | |
| sessdata: "" | |