| | import os |
| | import sys |
| | import time |
| | from scipy import signal |
| | from scipy.io import wavfile |
| | import numpy as np |
| | import concurrent.futures |
| | from tqdm import tqdm |
| | import json |
| | from distutils.util import strtobool |
| | import librosa |
| | import multiprocessing |
| | import noisereduce as nr |
| |
|
# Make the repository root importable so the rvc.* packages below resolve
# when this file is run as a script (assumes CWD is the project root —
# TODO confirm against the launcher).
now_directory = os.getcwd()
sys.path.append(now_directory)


from rvc.lib.utils import load_audio
from rvc.train.preprocess.slicer import Slicer


import logging

# Silence numba's very chatty compiler loggers (pulled in via librosa);
# only warnings and above get through.
logging.getLogger("numba.core.byteflow").setLevel(logging.WARNING)
logging.getLogger("numba.core.ssa").setLevel(logging.WARNING)
logging.getLogger("numba.core.interpreter").setLevel(logging.WARNING)
| |
|
| | |
# Seconds shared between consecutive slice windows (windows advance by per - OVERLAP).
OVERLAP = 0.3
# Target peak amplitude after normalization in PreProcess._normalize_audio.
MAX_AMPLITUDE = 0.9
# Blend weight: output = ALPHA * normalized + (1 - ALPHA) * raw signal.
ALPHA = 0.75
# High-pass filter cutoff in Hz (passed as Wn with fs to signal.butter).
HIGH_PASS_CUTOFF = 48
# Sample rate of the secondary (feature-extraction) copies written to sliced_audios_16k.
SAMPLE_RATE_16K = 16000
| |
|
| |
|
class PreProcess:
    """Slice, filter and resample raw training audio for one experiment.

    For every input file two sets of artifacts are written:
      * <exp_dir>/sliced_audios/     — slices at the native sample rate
      * <exp_dir>/sliced_audios_16k/ — the same slices resampled to 16 kHz
    """

    def __init__(self, sr: int, exp_dir: str, per: float):
        """
        Args:
            sr: Sample rate the input audio is loaded at.
            exp_dir: Experiment directory that receives the output folders.
            per: Target slice length in seconds.
        """
        # Silence-based slicer; thresholds are the pipeline's established defaults.
        self.slicer = Slicer(
            sr=sr,
            threshold=-42,
            min_length=1500,
            min_interval=400,
            hop_size=15,
            max_sil_kept=500,
        )
        self.sr = sr
        # 5th-order Butterworth high-pass at HIGH_PASS_CUTOFF Hz to strip DC
        # offset and low-frequency rumble before normalization.
        self.b_high, self.a_high = signal.butter(
            N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr
        )
        self.per = per
        self.exp_dir = exp_dir
        self.device = "cpu"  # not used in this module; kept for compatibility
        self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios")
        self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k")
        os.makedirs(self.gt_wavs_dir, exist_ok=True)
        os.makedirs(self.wavs16k_dir, exist_ok=True)

    def _normalize_audio(self, audio: np.ndarray):
        """Peak-normalize `audio`, blended with the raw signal via ALPHA.

        Returns None when the clip should be discarded: its peak is either
        implausibly large (> 2.5 — presumably clipped/corrupt input; verify
        threshold against upstream) or exactly zero (silent clip).
        """
        tmp_max = np.abs(audio).max()
        if tmp_max > 2.5:
            return None
        if tmp_max == 0:
            # Bug fix: a silent clip previously caused a division by zero.
            return None
        return (audio / tmp_max * (MAX_AMPLITUDE * ALPHA)) + (1 - ALPHA) * audio

    def process_audio_segment(
        self,
        normalized_audio: np.ndarray,
        sid: int,
        idx0: int,
        idx1: int,
    ):
        """Write one slice at the native rate and a 16 kHz resampled copy.

        Args:
            normalized_audio: Slice samples, or None if the slice was filtered.
            sid: Speaker id (derived from the source folder name).
            idx0: Index of the source file.
            idx1: Index of this slice within the file.
        """
        if normalized_audio is None:
            print(f"{sid}-{idx0}-{idx1}-filtered")
            return
        wavfile.write(
            os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{idx1}.wav"),
            self.sr,
            normalized_audio.astype(np.float32),
        )
        audio_16k = librosa.resample(
            normalized_audio, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K
        )
        wavfile.write(
            os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{idx1}.wav"),
            SAMPLE_RATE_16K,
            audio_16k.astype(np.float32),
        )

    def process_audio(
        self,
        path: str,
        idx0: int,
        sid: int,
        cut_preprocess: bool,
        process_effects: bool,
        noise_reduction: bool,
        reduction_strength: float,
    ):
        """Load one file, optionally filter/normalize/denoise, slice and save.

        Returns:
            The file's duration in seconds, or 0 when the whole file was
            rejected by normalization. On any other error the exception is
            printed and the duration read so far is returned.
        """
        audio_length = 0
        try:
            audio = load_audio(path, self.sr)
            audio_length = librosa.get_duration(y=audio, sr=self.sr)
            if process_effects:
                audio = signal.lfilter(self.b_high, self.a_high, audio)
                audio = self._normalize_audio(audio)
                if audio is None:
                    # Bug fix: a rejected clip previously flowed on as None and
                    # crashed in noise reduction / slicing, while its duration
                    # was still (incorrectly) counted toward the dataset total.
                    print(f"{sid}-{idx0}-filtered")
                    return 0
            if noise_reduction:
                audio = nr.reduce_noise(
                    y=audio, sr=self.sr, prop_decrease=reduction_strength
                )
            idx1 = 0
            if cut_preprocess:
                for audio_segment in self.slicer.slice(audio):
                    i = 0
                    while True:
                        # Windows advance by (per - OVERLAP) seconds, so
                        # adjacent slices share OVERLAP seconds of audio.
                        start = int(self.sr * (self.per - OVERLAP) * i)
                        i += 1
                        if len(audio_segment[start:]) > (self.per + OVERLAP) * self.sr:
                            tmp_audio = audio_segment[
                                start : start + int(self.per * self.sr)
                            ]
                            self.process_audio_segment(
                                tmp_audio,
                                sid,
                                idx0,
                                idx1,
                            )
                            idx1 += 1
                        else:
                            # Remainder is shorter than per + OVERLAP: emit the
                            # tail as the final slice of this segment.
                            tmp_audio = audio_segment[start:]
                            self.process_audio_segment(
                                tmp_audio,
                                sid,
                                idx0,
                                idx1,
                            )
                            idx1 += 1
                            break
            else:
                self.process_audio_segment(
                    audio,
                    sid,
                    idx0,
                    idx1,
                )
        except Exception as error:
            print(f"Error processing audio: {error}")
        return audio_length
| |
|
| |
|
def format_duration(seconds):
    """Render a duration given in seconds as a zero-padded HH:MM:SS string."""
    hrs, rem = divmod(seconds, 3600)
    mins, secs = divmod(rem, 60)
    return f"{int(hrs):02}:{int(mins):02}:{int(secs):02}"
| |
|
| |
|
def save_dataset_duration(file_path, dataset_duration):
    """Persist the dataset's total duration into a JSON info file.

    Merges into any existing JSON at `file_path`, setting
    "total_dataset_duration" (HH:MM:SS) and "total_seconds" (raw seconds).

    Args:
        file_path: Path to the JSON info file (e.g. model_info.json).
        dataset_duration: Total dataset length in seconds.
    """
    try:
        with open(file_path, "r") as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # Bug fix: a corrupt/empty info file previously raised JSONDecodeError
        # and crashed the run at the very end; start fresh instead.
        data = {}

    formatted_duration = format_duration(dataset_duration)
    new_data = {
        "total_dataset_duration": formatted_duration,
        "total_seconds": dataset_duration,
    }
    data.update(new_data)

    with open(file_path, "w") as f:
        json.dump(data, f, indent=4)
| |
|
| |
|
def process_audio_wrapper(args):
    """Unpack one work item and run PreProcess.process_audio on it.

    Exists so a complete job — (preprocessor, (path, idx0, sid), options...)
    — travels to a ProcessPoolExecutor worker as a single picklable argument.
    """
    pp = args[0]
    file_path, idx0, sid = args[1]
    # Remaining items: cut_preprocess, process_effects, noise_reduction,
    # reduction_strength — forwarded positionally, in order.
    options = args[2:]
    return pp.process_audio(file_path, idx0, sid, *options)
| |
|
| |
|
def preprocess_training_set(
    input_root: str,
    sr: int,
    num_processes: int,
    exp_dir: str,
    per: float,
    cut_preprocess: bool,
    process_effects: bool,
    noise_reduction: bool,
    reduction_strength: float,
):
    """Preprocess every audio file under input_root in parallel.

    Walks input_root, slices/filters each file via PreProcess, and records
    the total dataset duration in <exp_dir>/model_info.json.

    Args:
        input_root: Dataset root; files directly inside get speaker id 0,
            subfolders must be named with an integer speaker id.
        sr: Native sample rate to load/slice at.
        num_processes: Worker-process count for the pool.
        exp_dir: Experiment output directory.
        per: Target slice length in seconds.
        cut_preprocess: Slice on silence when True; keep whole files otherwise.
        process_effects: Apply high-pass filter + normalization when True.
        noise_reduction: Run noisereduce on each file when True.
        reduction_strength: prop_decrease passed to noisereduce.
    """
    start_time = time.time()
    pp = PreProcess(sr, exp_dir, per)
    print(f"Starting preprocess with {num_processes} processes...")

    files = []
    idx = 0

    # Collect (path, running-index, speaker-id) triples; folders whose name
    # is not an integer are reported and skipped.
    for root, _, filenames in os.walk(input_root):
        try:
            sid = 0 if root == input_root else int(os.path.basename(root))
            for f in filenames:
                if f.lower().endswith((".wav", ".mp3", ".flac", ".ogg")):
                    files.append((os.path.join(root, f), idx, sid))
                    idx += 1
        except ValueError:
            print(
                f'Speaker ID folder is expected to be integer, got "{os.path.basename(root)}" instead.'
            )

    # Fan out to worker processes; each future yields the processed file's
    # duration in seconds (0 if it was filtered out or errored early).
    audio_length = []
    with tqdm(total=len(files)) as pbar:
        with concurrent.futures.ProcessPoolExecutor(
            max_workers=num_processes
        ) as executor:
            futures = [
                executor.submit(
                    process_audio_wrapper,
                    (
                        pp,
                        file,
                        cut_preprocess,
                        process_effects,
                        noise_reduction,
                        reduction_strength,
                    ),
                )
                for file in files
            ]
            # Advance the progress bar as results arrive (completion order,
            # not submission order).
            for future in concurrent.futures.as_completed(futures):
                audio_length.append(future.result())
                pbar.update(1)

    audio_length = sum(audio_length)
    save_dataset_duration(
        os.path.join(exp_dir, "model_info.json"), dataset_duration=audio_length
    )
    elapsed_time = time.time() - start_time
    print(
        f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(audio_length)} seconds of audio."
    )
| |
|
| |
|
| | if __name__ == "__main__": |
| | experiment_directory = str(sys.argv[1]) |
| | input_root = str(sys.argv[2]) |
| | sample_rate = int(sys.argv[3]) |
| | percentage = float(sys.argv[4]) |
| | num_processes = sys.argv[5] |
| | if num_processes.lower() == "none": |
| | num_processes = multiprocessing.cpu_count() |
| | else: |
| | num_processes = int(num_processes) |
| | cut_preprocess = strtobool(sys.argv[6]) |
| | process_effects = strtobool(sys.argv[7]) |
| | noise_reduction = strtobool(sys.argv[8]) |
| | reduction_strength = float(sys.argv[9]) |
| |
|
| | preprocess_training_set( |
| | input_root, |
| | sample_rate, |
| | num_processes, |
| | experiment_directory, |
| | percentage, |
| | cut_preprocess, |
| | process_effects, |
| | noise_reduction, |
| | reduction_strength, |
| | ) |
| |
|