| | import numpy as np |
| | from fastapi import HTTPException |
| |
|
| | from modules.api.impl.handler.AudioHandler import AudioHandler |
| | from modules.api.impl.model.audio_model import AdjustConfig |
| | from modules.api.impl.model.chattts_model import InferConfig |
| | from modules.api.impl.model.enhancer_model import EnhancerConfig |
| | from modules.Enhancer.ResembleEnhance import apply_audio_enhance_full |
| | from modules.normalization import text_normalize |
| | from modules.ssml_parser.SSMLParser import create_ssml_parser |
| | from modules.SynthesizeSegments import SynthesizeSegments, combine_audio_segments |
| | from modules.utils import audio |
| |
|
| |
|
class SSMLHandler(AudioHandler):
    """Audio handler that turns an SSML document into processed audio.

    The SSML is parsed into segments, each segment's text is normalized,
    the segments are synthesized and concatenated, and the result is then
    passed through the optional enhancement, prosody adjustment and
    normalization steps selected by the supplied config objects.
    """

    def __init__(
        self,
        ssml_content: str,
        infer_config: InferConfig,
        adjust_config: AdjustConfig,
        enhancer_config: EnhancerConfig,
    ) -> None:
        """Type-check and store the request parameters, then run ``validate``.

        Args:
            ssml_content: SSML markup to synthesize.
            infer_config: Provides ``batch_size``, ``eos`` and
                ``spliter_threshold`` for segment synthesis.
            adjust_config: Provides ``speed_rate``, ``pitch``,
                ``volume_gain_db``, ``normalize`` and ``headroom``.
            enhancer_config: Provides ``enabled``, ``nfe``, ``solver``,
                ``lambd`` and ``tau``.

        Raises:
            AssertionError: If an argument is not of the expected type.
        """
        # NOTE: these checks are `assert` statements, so they are skipped when
        # Python runs with -O. They are kept as asserts so that the exception
        # type seen by existing callers does not change.
        assert isinstance(ssml_content, str), "ssml_content must be a string."
        assert isinstance(
            infer_config, InferConfig
        ), "infer_config must be an InferConfig object."
        assert isinstance(
            adjust_config, AdjustConfig
        ), "adjust_config must be an AdjustConfig object."
        assert isinstance(
            enhancer_config, EnhancerConfig
        ), "enhancer_config must be an EnhancerConfig object."

        self.ssml_content = ssml_content
        self.infer_config = infer_config
        # The attribute name is misspelled ("adjest"); it is kept unchanged
        # because code outside this class may already read it.
        self.adjest_config = adjust_config
        self.enhancer_config = enhancer_config

        self.validate()

    def validate(self):
        """Hook for additional parameter validation; currently does nothing."""

    def enqueue(self) -> tuple[np.ndarray, int]:
        """Synthesize the stored SSML and return the processed audio.

        Returns:
            A ``(audio_data, sample_rate)`` tuple.

        Raises:
            HTTPException: With status code 422 when parsing the SSML
                produces no segments.
        """
        infer_config = self.infer_config
        adjust_config = self.adjest_config
        enhancer_config = self.enhancer_config

        segments = create_ssml_parser().parse(self.ssml_content)

        # Reject an empty parse result before doing any per-segment work.
        if len(segments) == 0:
            raise HTTPException(
                status_code=422, detail="The SSML text is empty or parsing failed."
            )

        for seg in segments:
            seg["text"] = text_normalize(seg["text"], is_end=True)

        synthesize = SynthesizeSegments(
            batch_size=infer_config.batch_size,
            eos=infer_config.eos,
            spliter_thr=infer_config.spliter_threshold,
        )
        audio_segments = synthesize.synthesize_segments(segments)
        combined_audio = combine_audio_segments(audio_segments)

        # pydub_to_np yields (sample_rate, data), while the enhancer below
        # yields (data, sample_rate) -- keep the unpacking order per helper.
        sample_rate, audio_data = audio.pydub_to_np(combined_audio)

        if enhancer_config.enabled:
            audio_data, sample_rate = apply_audio_enhance_full(
                audio_data=audio_data,
                sr=sample_rate,
                nfe=enhancer_config.nfe,
                solver=enhancer_config.solver,
                lambd=enhancer_config.lambd,
                tau=enhancer_config.tau,
            )

        audio_data = audio.apply_prosody_to_audio_data(
            audio_data=audio_data,
            rate=adjust_config.speed_rate,
            pitch=adjust_config.pitch,
            volume=adjust_config.volume_gain_db,
            sr=sample_rate,
        )

        if adjust_config.normalize:
            sample_rate, audio_data = audio.apply_normalize(
                audio_data=audio_data, headroom=adjust_config.headroom, sr=sample_rate
            )

        return audio_data, sample_rate
| |
|