YingMusic-Singer / src /YingMusicSinger /utils /mel_spectrogram.py
xjsc0's picture
1
61e6f25
import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
class MelodySpectrogram(torch.nn.Module):
    """Log-mel spectrogram extractor with optional key-shift and speed change.

    The mel filterbank is built once with ``librosa.filters.mel`` (HTK
    formula) and stored as the ``mel_basis`` buffer so it follows the module
    across devices. Hann windows are created lazily and cached per
    ``(keyshift, device)`` pair in ``self.hann_window``.
    """

    def __init__(
        self,
        n_mel_channels=80,
        sampling_rate=44100,
        win_length=2048,
        hop_length=512,
        n_fft=None,
        mel_fmin=0,
        mel_fmax=None,
        clamp=1e-5,
    ):
        """Build the mel filterbank and store the STFT configuration.

        Args:
            n_mel_channels: Number of mel bands in the filterbank.
            sampling_rate: Sample rate the filterbank is built for; ``forward``
                resamples its input to this rate.
            win_length: STFT window length in samples.
            hop_length: STFT hop length in samples.
            n_fft: FFT size; falls back to ``win_length`` when ``None``.
            mel_fmin: Lowest filterbank frequency, passed to librosa as ``fmin``.
            mel_fmax: Highest filterbank frequency, passed to librosa as ``fmax``.
            clamp: Lower bound applied to the mel energies before the log.
        """
        # Imported lazily so librosa is only required when an instance is built.
        from librosa.filters import mel

        super().__init__()
        n_fft = win_length if n_fft is None else n_fft
        # Cache of Hann windows keyed by "<keyshift>_<device>".
        self.hann_window = {}
        mel_basis = mel(
            sr=sampling_rate,
            n_fft=n_fft,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
            htk=True,
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer("mel_basis", mel_basis)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sampling_rate = sampling_rate
        self.n_mel_channels = n_mel_channels
        self.clamp = clamp

    def _mel_forward(self, audio, keyshift=0, speed=1, center=True):
        """Compute the log-mel spectrogram of ``audio``.

        Args:
            audio: Waveform tensor handed to ``torch.stft``. The key-shift
                branch indexes the magnitude as a 3-D tensor, so a 2-D
                ``(batch, samples)`` input is expected.
            keyshift: Pitch shift in semitones; the FFT size and window length
                are scaled by ``2 ** (keyshift / 12)``.
            speed: Multiplier applied to the hop length.
            center: Forwarded to ``torch.stft``.

        Returns:
            ``log(clamp(mel_basis @ magnitude, min=self.clamp))``.
        """
        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(self.n_fft * factor))
        win_length_new = int(np.round(self.win_length * factor))
        hop_length_new = int(np.round(self.hop_length * speed))

        # The window length depends only on keyshift, so keyshift + device
        # identifies a cached window.
        keyshift_key = str(keyshift) + "_" + str(audio.device)
        if keyshift_key not in self.hann_window:
            self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
                audio.device
            )

        fft = torch.stft(
            audio,
            n_fft=n_fft_new,
            hop_length=hop_length_new,
            win_length=win_length_new,
            window=self.hann_window[keyshift_key],
            center=center,
            return_complex=True,
        )
        magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))

        if keyshift != 0:
            # Bring the frequency axis back to the bin count the mel
            # filterbank was built for: zero-pad when the shifted FFT is
            # smaller, crop when it is larger.
            size = self.n_fft // 2 + 1
            resize = magnitude.size(1)
            if resize < size:
                magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
            # Rescale amplitudes to compensate for the changed window length.
            magnitude = magnitude[:, :size, :] * self.win_length / win_length_new

        mel_output = torch.matmul(self.mel_basis, magnitude)
        log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
        return log_mel_spec

    @torch.no_grad()
    def forward(self, audio, sr, sil_len_to_end=None, keyshift=0, speed=1):
        """Convert a waveform into a mono log-mel spectrogram.

        Args:
            audio: Waveform tensor of shape ``(channels, samples)``; a 1-D
                ``(samples,)`` tensor is treated as a single channel.
            sr: Sample rate of ``audio``.
            sil_len_to_end: Optional length, in seconds, of silence appended
                to the end of the waveform (measured at ``sr``, i.e. before
                resampling).
            keyshift: Forwarded to ``_mel_forward``.
            speed: Forwarded to ``_mel_forward``.

        Returns:
            Log-mel spectrogram computed on the device of ``mel_basis``.
        """
        # A 1-D waveform would otherwise be misread as `samples` separate
        # channels and collapsed to a single value by the mono down-mix.
        if audio.dim() == 1:
            audio = audio.unsqueeze(0)

        if sil_len_to_end is not None:
            # Allocate the padding on the waveform's device so the
            # concatenation also works for non-CPU input.
            silence = torch.zeros(
                audio.shape[0], int(sr * sil_len_to_end), device=audio.device
            )
            audio = torch.cat([audio, silence], dim=1)

        if sr != self.sampling_rate:
            # The resampling kernel is created on CPU; move it next to the
            # waveform before applying it.
            resampler = torchaudio.transforms.Resample(sr, self.sampling_rate)
            audio = resampler.to(audio.device)(audio)

        # Down-mix multi-channel audio to mono.
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        audio = audio.to(self.mel_basis.device)
        return self._mel_forward(audio, keyshift=keyshift, speed=speed)