import numpy as np import torch import torch.nn.functional as F import torchaudio class MelodySpectrogram(torch.nn.Module): def __init__( self, n_mel_channels=80, sampling_rate=44100, win_length=2048, hop_length=512, n_fft=None, mel_fmin=0, mel_fmax=None, clamp=1e-5, ): from librosa.filters import mel super().__init__() n_fft = win_length if n_fft is None else n_fft self.hann_window = {} mel_basis = mel( sr=sampling_rate, n_fft=n_fft, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax, htk=True, ) mel_basis = torch.from_numpy(mel_basis).float() self.register_buffer("mel_basis", mel_basis) self.n_fft = n_fft self.hop_length = hop_length self.win_length = win_length self.sampling_rate = sampling_rate self.n_mel_channels = n_mel_channels self.clamp = clamp def _mel_forward(self, audio, keyshift=0, speed=1, center=True): factor = 2 ** (keyshift / 12) n_fft_new = int(np.round(self.n_fft * factor)) win_length_new = int(np.round(self.win_length * factor)) hop_length_new = int(np.round(self.hop_length * speed)) keyshift_key = str(keyshift) + "_" + str(audio.device) if keyshift_key not in self.hann_window: self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( audio.device ) fft = torch.stft( audio, n_fft=n_fft_new, hop_length=hop_length_new, win_length=win_length_new, window=self.hann_window[keyshift_key], center=center, return_complex=True, ) magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) if keyshift != 0: size = self.n_fft // 2 + 1 resize = magnitude.size(1) if resize < size: magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) magnitude = magnitude[:, :size, :] * self.win_length / win_length_new mel_output = torch.matmul(self.mel_basis, magnitude) log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) return log_mel_spec @torch.no_grad() def forward(self, audio, sr, sil_len_to_end=None, keyshift=0, speed=1): # audio, sr = torchaudio.load(audio_path) if sil_len_to_end is not None: silence = torch.zeros(audio.shape[0], int(sr * sil_len_to_end)) audio = torch.cat([audio, silence], dim=1) if sr != self.sampling_rate: audio = torchaudio.transforms.Resample(sr, self.sampling_rate)(audio) if audio.shape[0] > 1: audio = torch.mean(audio, dim=0, keepdim=True) audio = audio.to(self.mel_basis.device) return self._mel_forward(audio, keyshift=keyshift, speed=speed)