| """ |
| Audio saving and transcoding utility module |
| |
| Independent audio file operations outside of handler, supporting: |
| - Save audio tensor/numpy to files (default FLAC format, fast) |
| - Format conversion (FLAC/WAV/MP3) |
| - Batch processing |
| """ |
|
|
| import os |
| import hashlib |
| import json |
| from pathlib import Path |
| from typing import Union, Optional, List, Tuple |
| import torch |
| import numpy as np |
| import torchaudio |
| from loguru import logger |
|
|
|
|
class AudioSaver:
    """Save, transcode and batch-write audio clips as FLAC, WAV or MP3 files.

    Writing is attempted with ``torchaudio.save`` first; if that raises, the
    same data is written with the ``soundfile`` package instead.
    """

    # Formats accepted by every public method of this class.
    SUPPORTED_FORMATS = ("flac", "wav", "mp3")

    # torchaudio backend requested for each supported format.
    _BACKENDS = {"flac": "soundfile", "wav": "soundfile", "mp3": "ffmpeg"}

    def __init__(self, default_format: str = "flac"):
        """
        Initialize audio saver

        Args:
            default_format: Default save format ('flac', 'wav', 'mp3').
                Matching is case-insensitive; an unsupported value is
                replaced by 'flac' after logging a warning.
        """
        self.default_format = default_format.lower()
        if self.default_format not in self.SUPPORTED_FORMATS:
            logger.warning(f"Unsupported format {default_format}, using 'flac'")
            self.default_format = "flac"

    @staticmethod
    def _to_channels_first(
        audio_data: Union[torch.Tensor, np.ndarray],
        channels_first: bool,
    ) -> torch.Tensor:
        """Return ``audio_data`` as a contiguous float32 CPU tensor [channels, samples]."""
        if isinstance(audio_data, np.ndarray):
            tensor = torch.from_numpy(audio_data).float()
            if tensor.dim() == 2:
                if channels_first:
                    # NOTE(review): 2-D arrays are transposed even though
                    # channels_first is True, i.e. they are treated as
                    # [samples, channels]. Kept for backward compatibility;
                    # confirm against callers.
                    tensor = tensor.T
                elif tensor.shape[0] > tensor.shape[1]:
                    # Same heuristic as the tensor branch: the longer axis is
                    # taken to be the sample axis.
                    tensor = tensor.T
        else:
            # detach() so tensors that still require grad can reach numpy().
            tensor = audio_data.detach().cpu().float()
            if (
                not channels_first
                and tensor.dim() == 2
                and tensor.shape[0] > tensor.shape[1]
            ):
                tensor = tensor.T

        if tensor.dim() == 1:
            # Mono signal without a channel axis -> [1, samples].
            tensor = tensor.unsqueeze(0)
        return tensor.contiguous()

    def save_audio(
        self,
        audio_data: Union[torch.Tensor, np.ndarray],
        output_path: Union[str, Path],
        sample_rate: int = 48000,
        format: Optional[str] = None,
        channels_first: bool = True,
    ) -> str:
        """
        Save audio data to file

        Args:
            audio_data: Audio data as torch.Tensor or numpy.ndarray. A 1-D
                input is treated as a single channel.
            output_path: Output file path. When its suffix is not one of
                .flac/.wav/.mp3 the suffix is replaced by ``.{format}``; an
                existing supported suffix is kept even if it differs from
                ``format``. Missing parent directories are created.
            sample_rate: Sample rate
            format: Audio format ('flac', 'wav', 'mp3'), defaults to
                default_format. Unsupported values fall back to
                default_format with a warning.
            channels_first: If True, a tensor is expected as
                [channels, samples]. If False, a 2-D input is transposed when
                its first axis is the longer one. 2-D ndarrays are always
                transposed when this flag is True.

        Returns:
            Actual saved file path

        Raises:
            Exception: Whatever the soundfile fallback raised, if both the
                torchaudio write and the fallback fail.
        """
        format = (format or self.default_format).lower()
        if format not in self.SUPPORTED_FORMATS:
            logger.warning(f"Unsupported format {format}, using {self.default_format}")
            format = self.default_format

        output_path = Path(output_path)
        known_suffixes = tuple(f".{fmt}" for fmt in self.SUPPORTED_FORMATS)
        if output_path.suffix.lower() not in known_suffixes:
            output_path = output_path.with_suffix(f'.{format}')
        # save_batch creates its directory; do the same for single files.
        output_path.parent.mkdir(parents=True, exist_ok=True)

        audio_tensor = self._to_channels_first(audio_data, channels_first)

        try:
            # NOTE(review): `format` is not forwarded to torchaudio.save, so
            # with a mismatching suffix the container presumably follows the
            # file extension - confirm.
            torchaudio.save(
                str(output_path),
                audio_tensor,
                sample_rate,
                channels_first=True,
                backend=self._BACKENDS[format],
            )
        except Exception as primary_error:
            # Keep the first failure visible instead of silently dropping it.
            logger.warning(
                f"[AudioSaver] torchaudio save failed ({primary_error}), "
                f"falling back to soundfile"
            )
            try:
                import soundfile as sf

                # soundfile expects [samples, channels].
                audio_np = audio_tensor.transpose(0, 1).numpy()
                sf.write(str(output_path), audio_np, sample_rate, format=format.upper())
            except Exception as e:
                logger.error(f"[AudioSaver] Failed to save audio: {e}")
                raise
            logger.debug(f"[AudioSaver] Fallback soundfile Saved audio to {output_path} ({format}, {sample_rate}Hz)")
            return str(output_path)

        logger.debug(f"[AudioSaver] Saved audio to {output_path} ({format}, {sample_rate}Hz)")
        return str(output_path)

    def convert_audio(
        self,
        input_path: Union[str, Path],
        output_path: Union[str, Path],
        output_format: str,
        remove_input: bool = False,
    ) -> str:
        """
        Convert audio format

        Args:
            input_path: Input audio file path
            output_path: Output audio file path
            output_format: Target format ('flac', 'wav', 'mp3')
            remove_input: Whether to delete input file. The input is kept
                when the written file turns out to be the input file itself.

        Returns:
            Output file path

        Raises:
            FileNotFoundError: If ``input_path`` does not exist.
        """
        input_path = Path(input_path)
        output_path = Path(output_path)

        if not input_path.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        audio_tensor, sample_rate = torchaudio.load(str(input_path))

        saved_path = self.save_audio(
            audio_tensor,
            output_path,
            sample_rate=sample_rate,
            format=output_format,
            channels_first=True,
        )

        if remove_input:
            if input_path.samefile(saved_path):
                # Output overwrote the input; unlinking would delete the result.
                logger.warning(
                    f"[AudioSaver] Output is the input file, not removing: {input_path}"
                )
            else:
                input_path.unlink()
                logger.debug(f"[AudioSaver] Removed input file: {input_path}")

        return saved_path

    def save_batch(
        self,
        audio_batch: Union[List[torch.Tensor], torch.Tensor],
        output_dir: Union[str, Path],
        file_prefix: str = "audio",
        sample_rate: int = 48000,
        format: Optional[str] = None,
        channels_first: bool = True,
    ) -> List[str]:
        """
        Save audio batch

        Args:
            audio_batch: Audio batch: a list/tuple of clips, or a 3-D tensor
                or ndarray that is split along its first axis. Anything else
                is saved as a single clip.
            output_dir: Output directory (created if missing)
            file_prefix: File prefix; files are named
                ``{file_prefix}_{index:04d}`` plus the format suffix
            sample_rate: Sample rate
            format: Audio format
            channels_first: Tensor format flag, forwarded to save_audio

        Returns:
            List of saved file paths
        """
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        if isinstance(audio_batch, torch.Tensor) and audio_batch.dim() == 3:
            audio_list = list(audio_batch.unbind(0))
        elif isinstance(audio_batch, np.ndarray) and audio_batch.ndim == 3:
            audio_list = list(audio_batch)
        elif isinstance(audio_batch, (list, tuple)):
            audio_list = list(audio_batch)
        else:
            audio_list = [audio_batch]

        return [
            self.save_audio(
                audio,
                output_dir / f"{file_prefix}_{i:04d}",
                sample_rate=sample_rate,
                format=format,
                channels_first=channels_first,
            )
            for i, audio in enumerate(audio_list)
        ]
|
|
|
|
def get_audio_file_hash(audio_file) -> str:
    """
    Get hash identifier for an audio file.

    A path (``str`` or ``os.PathLike``) that points to an existing file is
    hashed by content, read in chunks so large files are not loaded into
    memory at once. A path that is not an existing file is hashed by its
    text. Any other object is hashed by ``str(obj.name)`` when it has a
    ``name`` attribute, otherwise by ``str(obj)``.

    Args:
        audio_file: Path to audio file (str or os.PathLike) or file-like object

    Returns:
        MD5 hex digest, or empty string when ``audio_file`` is None
    """
    if audio_file is None:
        return ""

    try:
        if isinstance(audio_file, (str, os.PathLike)):
            path = os.fspath(audio_file)
            if os.path.isfile(path):
                digest = hashlib.md5()
                with open(path, 'rb') as f:
                    # 1 MiB chunks keep memory use flat for long recordings.
                    for chunk in iter(lambda: f.read(1 << 20), b""):
                        digest.update(chunk)
                return digest.hexdigest()
            return hashlib.md5(str(path).encode('utf-8')).hexdigest()
        if hasattr(audio_file, 'name'):
            return hashlib.md5(str(audio_file.name).encode('utf-8')).hexdigest()
        return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
    except Exception:
        # Best effort: an unreadable file still gets a stable identifier.
        return hashlib.md5(str(audio_file).encode('utf-8')).hexdigest()
|
|
|
|
def generate_uuid_from_params(params_dict) -> str:
    """
    Derive a deterministic UUID-shaped identifier from generation parameters.

    The parameters are serialized to JSON with sorted keys, hashed with
    SHA-256, and the first 32 hex digits are grouped as 8-4-4-4-12, so equal
    parameters always give the same identifier regardless of key order.

    Args:
        params_dict: Dictionary of parameters (must be JSON-serializable)

    Returns:
        UUID-formatted string
    """
    canonical = json.dumps(params_dict, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256(canonical.encode('utf-8')).hexdigest()

    # Slice boundaries of the 8-4-4-4-12 layout.
    cuts = (0, 8, 12, 16, 20, 32)
    return "-".join(digest[start:stop] for start, stop in zip(cuts, cuts[1:]))
|
|
|
|
def generate_uuid_from_audio_data(
    audio_data: Union[torch.Tensor, np.ndarray],
    seed: Optional[int] = None
) -> str:
    """
    Generate an identifier from audio data (for caching/deduplication)

    The identifier is the MD5 hex digest of the raw sample bytes. When a seed
    is given, the digest of ``"{data_hash}_{seed}"`` is returned instead.
    Only the bytes are hashed; shape and dtype are not mixed in separately.

    Args:
        audio_data: Audio data (tensor or array; tensors are detached and
            moved to CPU first)
        seed: Optional seed value

    Returns:
        32-character MD5 hex string
    """
    if isinstance(audio_data, torch.Tensor):
        # detach() first: numpy() raises on tensors that require grad.
        audio_np = audio_data.detach().cpu().numpy()
    else:
        audio_np = np.asarray(audio_data)

    data_hash = hashlib.md5(audio_np.tobytes()).hexdigest()

    if seed is not None:
        combined = f"{data_hash}_{seed}"
        return hashlib.md5(combined.encode()).hexdigest()

    return data_hash
|
|
|
|
| |
# Shared module-level saver used by the save_audio() convenience function.
_default_saver = AudioSaver("flac")
|
|
|
|
def save_audio(
    audio_data: Union[torch.Tensor, np.ndarray],
    output_path: Union[str, Path],
    sample_rate: int = 48000,
    format: Optional[str] = None,
    channels_first: bool = True,
) -> str:
    """
    Convenience function: save audio through the module-level default saver

    All arguments are forwarded unchanged to ``_default_saver.save_audio``.

    Args:
        audio_data: Audio data
        output_path: Output path
        sample_rate: Sample rate
        format: Format (None selects the default saver's format, flac)
        channels_first: Tensor format flag

    Returns:
        Saved file path
    """
    saved_path = _default_saver.save_audio(
        audio_data=audio_data,
        output_path=output_path,
        sample_rate=sample_rate,
        format=format,
        channels_first=channels_first,
    )
    return saved_path
|
|
|
|