Update app.py

app.py CHANGED

@@ -263,7 +263,7 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
 
         max_new_tokens = generation_config.max_new_tokens
 
-        while current_generated_length +
+        while current_generated_length + 4 <= max_longform_generation_length:
             generation_config.max_new_tokens = min(max_new_tokens, max_longform_generation_length - current_generated_length)
             if is_greedy_gen_mode:
                 if generation_config.num_return_sequences > 1:
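The new loop above extends the audio in bounded chunks, shrinking the per-call token budget as it approaches the long-form limit. A minimal sketch of the same chunking idea, with hypothetical names (generate_chunk, chunk_size) standing in for the bounded generate() call; the + 4 headroom is copied from the diff (plausibly related to MusicGen's 4 codebooks, though the diff itself does not say):

def generate_longform(generate_chunk, chunk_size, total_length):
    # Keep issuing bounded generation calls until the long-form budget is spent.
    generated, chunks = 0, []
    while generated + 4 <= total_length:
        # Never overshoot the remaining budget, mirroring the min(...) above.
        n = min(chunk_size, total_length - generated)
        chunks.append(generate_chunk(n))
        generated += n
    return chunks
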
@@ -378,7 +378,7 @@ class MusicgenMelodyForLongFormConditionalGeneration(MusicgenMelodyForConditiona
 
         # Specific to this gradio demo
         if streamer is not None:
-            streamer.end(True)
+            streamer.end(final_end=True)
 
         audio_scales = model_kwargs.get("audio_scales")
         if audio_scales is None:
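Since the long-form wrapper triggers several inner generate() calls, each of which ends its own streaming pass, the explicit keyword appears to distinguish the one call that should actually close the stream (see the new end() signature further down). Roughly, under that reading:

streamer.end()                # inner pass: flush the cache, keep the stream open
streamer.end(final_end=True)  # long-form loop done: flush and enqueue the stop signal
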
@@ -414,7 +414,7 @@ title = "Streaming Long-form MusicGen"
 description = """
 Stream the outputs of the MusicGen Melody text-to-music model by playing the generated audio as soon as the first chunk is ready.
 
-The generation loop is adapted to perform **long-form** music generation. In this demo, we limit the duration of the music generated, but in theory, it could run **endlessly**.
+The generation loop is adapted to perform **long-form** music generation. In this demo, we limit the duration of the music generated to 1min 20s, but in theory, it could run **endlessly**.
 
 Demo uses [MusicGen Melody](https://huggingface.co/facebook/musicgen-melody) in the 🤗 Transformers library. Note that the
 demo works best on the Chrome browser. If there is no audio output, try switching browser to Chrome.
@@ -468,6 +468,7 @@ class MusicgenStreamer(BaseStreamer):
         stride: Optional[int] = None,
         timeout: Optional[float] = None,
         is_longform: Optional[bool] = False,
+        longform_stride: Optional[float] = 10,
     ):
         """
         Streamer that stores playback-ready audio in a queue, to be used by a downstream application as an iterator. This is
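The default of 10 is hard to interpret on its own, but the call site further down passes longform_stride=15*32000; at the 32 kHz output rate of facebook/musicgen-melody that reads as a stride measured in audio samples:

sampling_rate = 32000                   # output rate of facebook/musicgen-melody
longform_stride = 15 * sampling_rate
print(longform_stride / sampling_rate)  # 15.0 -> a 15-second long-form stride
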
@@ -496,6 +497,7 @@ class MusicgenStreamer(BaseStreamer):
         self.audio_encoder = model.audio_encoder
         self.generation_config = model.generation_config
         self.device = device if device is not None else model.device
+        self.longform_stride = longform_stride
 
         # variables used in the streaming process
         self.play_steps = play_steps
@@ -509,6 +511,8 @@ class MusicgenStreamer(BaseStreamer):
 
         self.is_longform = is_longform
 
+        self.previous_len = -1
+
         # varibles used in the thread process
         self.audio_queue = Queue()
         self.stop_signal = None
@@ -565,19 +569,19 @@ class MusicgenStreamer(BaseStreamer):
 
         if self.token_cache.shape[-1] % self.play_steps == 0:
             audio_values = self.apply_delay_pattern_mask(self.token_cache)
-
-
-
+            self.on_finalized_audio(audio_values[self.to_yield : -self.stride])
+            self.to_yield = len(audio_values) - self.stride
+            self.previous_len = len(audio_values)
 
-    def end(self, stream_end=False):
+    def end(self, stream_end=False, final_end=False):
         """Flushes any remaining cache and appends the stop symbol."""
         if self.token_cache is not None:
             audio_values = self.apply_delay_pattern_mask(self.token_cache)
         else:
             audio_values = np.zeros(self.to_yield)
 
-
-
+        if final_end:
+            self.on_finalized_audio(audio_values[self.to_yield :], stream_end=True)
 
     def on_finalized_audio(self, audio: np.ndarray, stream_end: bool = False):
         """Put the new audio in the queue. If the stream is ending, also put a stop signal in the queue."""
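For orientation, the bookkeeping above decodes the whole token cache every play_steps tokens but forwards only the samples not yet yielded, holding back stride samples for the next, longer decode to refine. A minimal sketch of that emit logic under the same assumptions (emit_new_audio is a hypothetical helper, not in the diff):

import numpy as np

def emit_new_audio(decoded: np.ndarray, to_yield: int, stride: int):
    # Forward only what has not been sent yet, minus a `stride`-sample
    # tail that a later, longer decode will overwrite.
    new_chunk = decoded[to_yield : len(decoded) - stride]
    next_to_yield = len(decoded) - stride
    return new_chunk, next_to_yield
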
@@ -618,8 +622,10 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
     return wav_buf.read()
 
 @spaces.GPU(duration=90)
-def generate_audio(text_prompt, audio, audio_length_in_s=10.0, play_steps_in_s=2
+def generate_audio(text_prompt, audio, seed=0):
+    audio_length_in_s = 60
     max_new_tokens = int(frame_rate * audio_length_in_s)
+    play_steps_in_s = 2.0
     play_steps = int(frame_rate * play_steps_in_s)
 
     if audio is not None:
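The seconds-to-tokens conversion above depends on the codec frame rate; frame_rate itself is defined outside this diff. Assuming the 50 frames-per-second rate of the 32 kHz MusicGen checkpoints, the hard-coded values work out as:

frame_rate = 50                    # assumed EnCodec frame rate (Hz)
audio_length_in_s = 60
play_steps_in_s = 2.0
max_new_tokens = int(frame_rate * audio_length_in_s)  # 3000 tokens -> about 60 s
play_steps = int(frame_rate * play_steps_in_s)        # 100 tokens -> 2 s chunks
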
@@ -649,7 +655,8 @@ def generate_audio(text_prompt, audio, audio_length_in_s=10.0, play_steps_in_s=2
         return_tensors="pt",
     )
 
-    streamer = MusicgenStreamer(model, device=device, play_steps=play_steps, is_longform=True)
+    streamer = MusicgenStreamer(model, device=device, play_steps=play_steps, is_longform=True,
+                                longform_stride=15*32000)
 
     generation_kwargs = dict(
         **inputs.to(device),
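The streamer is designed to be drained as an iterator while generate() runs on a worker thread (its docstring above says as much). A sketch of that consumption pattern, assuming the rest of generate_audio follows the usual Transformers streaming recipe (stream_audio is a hypothetical wrapper):

from threading import Thread

def stream_audio(model, generation_kwargs, streamer):
    # Run generation in the background; the streamer's queue fills up as
    # playback-ready chunks are decoded.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    for new_audio in streamer:  # blocks until the next chunk is ready
        yield new_audio         # e.g. wrapped as WAV bytes via wave_header_chunk()
    thread.join()
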
@@ -678,19 +685,17 @@ demo = gr.Interface(
     inputs=[
         gr.Text(label="Prompt", value="80s pop track with synth and instrumentals"),
         gr.Audio(type="filepath", label="Conditioning audio. Use this for melody-guided generation."),
-        gr.Slider(30, 60, value=45, step=5, label="(Approximate) Audio length in seconds."),
-        gr.Slider(0.5, 2.5, value=1.5, step=0.5, label="Streaming interval in seconds.", info="Lower = shorter chunks, lower latency, more codec steps."),
         gr.Number(value=5, precision=0, step=1, minimum=0, label="Seed for random generations."),
     ],
     outputs=[
         gr.Audio(label="Generated Music", autoplay=True, interactive=False, streaming=True)
     ],
     examples=[
-        ["An 80s driving pop song with heavy drums and synth pads in the background", None,
-        ["Bossa nova with guitars and synthesizer", "./assets/assets_bolero_ravel.mp3",
-        ["90s rock song with electric guitar and heavy drums", "./assets/assets_bach.mp3",
-        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None,
-        ["lofi slow bpm electro chill with organic samples", None,
+        ["An 80s driving pop song with heavy drums and synth pads in the background", None, 5],
+        ["Bossa nova with guitars and synthesizer", "./assets/assets_bolero_ravel.mp3", 5],
+        ["90s rock song with electric guitar and heavy drums", "./assets/assets_bach.mp3", 5],
+        ["a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130", None, 5],
+        ["lofi slow bpm electro chill with organic samples", None, 5],
     ],
     title=title,
     description=description,