Spaces:

ASLP-lab
/

YingMusic-Singer

Runtime error

YingMusic-Singer / src /YingMusicSinger /utils /mel_spectrogram.py

61e6f25 2 days ago

3 kB

	import numpy as np
	import torch
	import torch.nn.functional as F
	import torchaudio


	class MelodySpectrogram(torch.nn.Module):
	def __init__(
	self,
	n_mel_channels=80,
	sampling_rate=44100,
	win_length=2048,
	hop_length=512,
	n_fft=None,
	mel_fmin=0,
	mel_fmax=None,
	clamp=1e-5,
	):
	from librosa.filters import mel

	super().__init__()
	n_fft = win_length if n_fft is None else n_fft
	self.hann_window = {}
	mel_basis = mel(
	sr=sampling_rate,
	n_fft=n_fft,
	n_mels=n_mel_channels,
	fmin=mel_fmin,
	fmax=mel_fmax,
	htk=True,
	)
	mel_basis = torch.from_numpy(mel_basis).float()
	self.register_buffer("mel_basis", mel_basis)
	self.n_fft = n_fft
	self.hop_length = hop_length
	self.win_length = win_length
	self.sampling_rate = sampling_rate
	self.n_mel_channels = n_mel_channels
	self.clamp = clamp

	def _mel_forward(self, audio, keyshift=0, speed=1, center=True):
	factor = 2 ** (keyshift / 12)
	n_fft_new = int(np.round(self.n_fft * factor))
	win_length_new = int(np.round(self.win_length * factor))
	hop_length_new = int(np.round(self.hop_length * speed))

	keyshift_key = str(keyshift) + "_" + str(audio.device)
	if keyshift_key not in self.hann_window:
	self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
	audio.device
	)

	fft = torch.stft(
	audio,
	n_fft=n_fft_new,
	hop_length=hop_length_new,
	win_length=win_length_new,
	window=self.hann_window[keyshift_key],
	center=center,
	return_complex=True,
	)
	magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))

	if keyshift != 0:
	size = self.n_fft // 2 + 1
	resize = magnitude.size(1)
	if resize < size:
	magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
	magnitude = magnitude[:, :size, :] * self.win_length / win_length_new

	mel_output = torch.matmul(self.mel_basis, magnitude)
	log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
	return log_mel_spec

	@torch.no_grad()
	def forward(self, audio, sr, sil_len_to_end=None, keyshift=0, speed=1):
	# audio, sr = torchaudio.load(audio_path)
	if sil_len_to_end is not None:
	silence = torch.zeros(audio.shape[0], int(sr * sil_len_to_end))
	audio = torch.cat([audio, silence], dim=1)
	if sr != self.sampling_rate:
	audio = torchaudio.transforms.Resample(sr, self.sampling_rate)(audio)
	if audio.shape[0] > 1:
	audio = torch.mean(audio, dim=0, keepdim=True)
	audio = audio.to(self.mel_basis.device)
	return self._mel_forward(audio, keyshift=keyshift, speed=speed)