Spaces:

telegram191
/

MelodyFlow

Sleeping

App Files Files Community

telegram191 committed on Mar 13

Commit

8be5e63

verified ·

1 Parent(s): 44ea95d

Fallback to ffmpeg when PyAV is unavailable

Browse files

Files changed (1) hide show

audiocraft/data/audio.py +69 -2

audiocraft/data/audio.py CHANGED Viewed

@@ -19,19 +19,25 @@ import soundfile
 import torch
 from torch.nn import functional as F
-import av
 import subprocess as sp
 from .audio_utils import f32_pcm, normalize_audio
 _av_initialized = False
 def _init_av():
     """Set the 'libav.mp3' logger level to ERROR, at most once.
     The module-level `_av_initialized` flag turns every later call into a no-op.
     """
     global _av_initialized
     if _av_initialized:
         return
     # Only records at ERROR level or above pass the 'libav.mp3' logger from now on.
     logger = logging.getLogger('libav.mp3')
     logger.setLevel(logging.ERROR)
     _av_initialized = True
@@ -46,6 +52,8 @@ class AudioFileInfo:
 def _av_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
     _init_av()
     with av.open(str(filepath)) as af:
         stream = af.streams.audio[0]
         sample_rate = stream.codec_context.sample_rate
@@ -59,6 +67,24 @@ def _soundfile_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
     return AudioFileInfo(info.samplerate, info.duration, info.channels)
 def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
     # torchaudio no longer returns useful duration informations for some formats like mp3s.
     filepath = Path(filepath)
@@ -66,6 +92,8 @@ def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
         # ffmpeg has some weird issue with flac.
         return _soundfile_info(filepath)
     else:
         return _av_info(filepath)
@@ -81,6 +109,8 @@ def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: floa
         tuple of torch.Tensor, int: Tuple containing audio data and sample rate
     """
     _init_av()
     with av.open(str(filepath)) as af:
         stream = af.streams.audio[0]
         sr = stream.codec_context.sample_rate
@@ -113,6 +143,40 @@ def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: floa
         return f32_pcm(wav), sr
 def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
                duration: float = -1.0, pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
     """Read audio by picking the most appropriate backend tool based on the audio format.
@@ -137,7 +201,10 @@ def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
         if len(wav.shape) == 1:
             wav = torch.unsqueeze(wav, 0)
     else:
-        wav, sr = _av_read(filepath, seek_time, duration)
     if pad and duration > 0:
         expected_frames = int(duration * sr)
         wav = F.pad(wav, (0, expected_frames - wav.shape[-1]))

 import torch
 from torch.nn import functional as F
 import subprocess as sp
 from .audio_utils import f32_pcm, normalize_audio
 _av_initialized = False
+try:
+    import av
+except Exception:
+    av = None
 def _init_av():
     """Set the 'libav.mp3' logger level to ERROR, at most once.
     The module-level `_av_initialized` flag turns every later call into a no-op.
     When the guarded `import av` failed (`av is None`), the flag is set without
     touching the logging configuration.
     """
     global _av_initialized
     if _av_initialized:
         return
+    # PyAV is missing: nothing to configure, just record that init already ran.
+    if av is None:
+        _av_initialized = True
+        return
     logger = logging.getLogger('libav.mp3')
     logger.setLevel(logging.ERROR)
     _av_initialized = True
 def _av_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
     _init_av()
+    if av is None:
+        raise RuntimeError("PyAV is not available")
     with av.open(str(filepath)) as af:
         stream = af.streams.audio[0]
         sample_rate = stream.codec_context.sample_rate
     return AudioFileInfo(info.samplerate, info.duration, info.channels)
+def _ffmpeg_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
+    """Probe the first audio stream of a file with the `ffprobe` command line tool.
+
+    `audio_info` uses this in place of `_av_info` when PyAV could not be imported.
+
+    Args:
+        filepath (str or Path): Path of the audio file to inspect.
+    Returns:
+        AudioFileInfo: built from (sample_rate, duration, channels).
+    Raises:
+        RuntimeError: if ffprobe reports no audio stream, or omits the sample
+            rate, channel count or a usable duration.
+        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
+    """
+    # Local import so this block does not depend on the file-level import list.
+    import json
+
+    command = [
+        "ffprobe",
+        "-v", "error",
+        "-select_streams", "a:0",
+        # Also request the container duration: some formats report no
+        # per-stream duration.
+        "-show_entries", "stream=sample_rate,channels,duration:format=duration",
+        # JSON keeps field names, so parsing does not depend on output order.
+        "-of", "json",
+        str(filepath),
+    ]
+    probe = json.loads(sp.check_output(command).decode("utf-8", "replace"))
+    streams = probe.get("streams") or []
+    if not streams:
+        raise RuntimeError("ffprobe did not return enough audio info")
+    stream = streams[0]
+    if "sample_rate" not in stream or "channels" not in stream:
+        raise RuntimeError("ffprobe did not return enough audio info")
+    sample_rate = int(float(stream["sample_rate"]))
+    channels = int(float(stream["channels"]))
+    # Prefer the stream duration, fall back to the container one; either may be
+    # absent or the literal "N/A".
+    duration = None
+    for candidate in (stream.get("duration"), (probe.get("format") or {}).get("duration")):
+        try:
+            duration = float(candidate)
+        except (TypeError, ValueError):
+            continue
+        break
+    if duration is None:
+        raise RuntimeError("ffprobe did not return enough audio info")
+    return AudioFileInfo(sample_rate, duration, channels)
 def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
     # torchaudio no longer returns useful duration informations for some formats like mp3s.
     filepath = Path(filepath)
         # ffmpeg has some weird issue with flac.
         return _soundfile_info(filepath)
     else:
+        if av is None:
+            return _ffmpeg_info(filepath)
         return _av_info(filepath)
         tuple of torch.Tensor, int: Tuple containing audio data and sample rate
     """
     _init_av()
+    if av is None:
+        raise RuntimeError("PyAV is not available")
     with av.open(str(filepath)) as af:
         stream = af.streams.audio[0]
         sr = stream.codec_context.sample_rate
         return f32_pcm(wav), sr
+def _ffmpeg_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: float = -1.) -> tp.Tuple[torch.Tensor, int]:
+    """Decode an audio file by piping raw float32 PCM out of the `ffmpeg` command line tool.
+
+    `audio_read` uses this in place of `_av_read` when PyAV could not be imported.
+
+    Args:
+        filepath (str or Path): Path of the audio file to read.
+        seek_time (float): Start offset in seconds; values <= 0 read from the beginning.
+        duration (float): Length to read in seconds; values <= 0 read until the end.
+    Returns:
+        tuple of torch.Tensor, int: Audio of shape [channels, frames] (passed
+            through `f32_pcm`) and the sample rate used for decoding.
+    Raises:
+        RuntimeError: if ffmpeg produced no samples.
+        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
+    """
+    try:
+        info = _ffmpeg_info(filepath)
+        sr = info.sample_rate
+        channels = info.channels
+    except Exception as exc:
+        # Best effort: keep decoding, but make the forced output format visible,
+        # since ffmpeg will resample/remix the file to these values.
+        sr = 44100
+        channels = 2
+        logging.getLogger(__name__).warning(
+            "Could not probe %s (%s); decoding as %d Hz, %d channels",
+            filepath, exc, sr, channels)
+    command = [
+        "ffmpeg",
+        "-loglevel", "error",
+        "-nostdin",
+    ]
+    # `-ss` before `-i` seeks in the input rather than decoding and discarding.
+    if seek_time > 0:
+        command += ["-ss", str(seek_time)]
+    command += ["-i", str(filepath)]
+    if duration and duration > 0:
+        command += ["-t", str(duration)]
+    command += [
+        "-f", "f32le",
+        "-acodec", "pcm_f32le",
+        "-ar", str(sr),
+        "-ac", str(channels),
+        "-",
+    ]
+    raw = sp.check_output(command)
+    audio = np.frombuffer(raw, dtype=np.float32)
+    if audio.size == 0:
+        raise RuntimeError("ffmpeg returned empty audio")
+    # Samples arrive interleaved; reshape to [frames, channels], then transpose.
+    # `frombuffer` over bytes is read-only, so copy explicitly: the tensor must
+    # own writable, contiguous memory (mono input would otherwise alias `raw`).
+    audio = audio.reshape(-1, channels).T.copy()
+    wav = torch.from_numpy(audio)
+    return f32_pcm(wav), sr
 def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
                duration: float = -1.0, pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
     """Read audio by picking the most appropriate backend tool based on the audio format.
         if len(wav.shape) == 1:
             wav = torch.unsqueeze(wav, 0)
     else:
+        if av is None:
+            wav, sr = _ffmpeg_read(filepath, seek_time, duration)
+        else:
+            wav, sr = _av_read(filepath, seek_time, duration)
     if pad and duration > 0:
         expected_frames = int(duration * sr)
         wav = F.pad(wav, (0, expected_frames - wav.shape[-1]))