| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import unittest |
|
|
| import numpy as np |
|
|
| from transformers import ( |
| MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING, |
| AutoProcessor, |
| TextToAudioPipeline, |
| pipeline, |
| ) |
| from transformers.testing_utils import ( |
| is_pipeline_test, |
| require_torch, |
| require_torch_accelerator, |
| require_torch_or_tf, |
| slow, |
| torch_device, |
| ) |
| from transformers.trainer_utils import set_seed |
|
|
| from .test_pipelines_common import ANY |
|
|
|
|
| @is_pipeline_test |
| @require_torch_or_tf |
| class TextToAudioPipelineTests(unittest.TestCase): |
| model_mapping = MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING |
| |
|
|
| @slow |
| @require_torch |
| def test_small_musicgen_pt(self): |
| music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt") |
|
|
| forward_params = { |
| "do_sample": False, |
| "max_new_tokens": 250, |
| } |
|
|
| outputs = music_generator("This is a test", forward_params=forward_params) |
| self.assertEqual({"audio": ANY(np.ndarray), "sampling_rate": 32000}, outputs) |
|
|
| |
| outputs = music_generator(["This is a test", "This is a second test"], forward_params=forward_params) |
| audio = [output["audio"] for output in outputs] |
| self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) |
|
|
| |
| outputs = music_generator( |
| ["This is a test", "This is a second test"], forward_params=forward_params, batch_size=2 |
| ) |
| audio = [output["audio"] for output in outputs] |
| self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) |
|
|
| @slow |
| @require_torch |
| def test_medium_seamless_m4t_pt(self): |
| speech_generator = pipeline(task="text-to-audio", model="facebook/hf-seamless-m4t-medium", framework="pt") |
|
|
| for forward_params in [{"tgt_lang": "eng"}, {"return_intermediate_token_ids": True, "tgt_lang": "eng"}]: |
| outputs = speech_generator("This is a test", forward_params=forward_params) |
| self.assertEqual({"audio": ANY(np.ndarray), "sampling_rate": 16000}, outputs) |
|
|
| |
| outputs = speech_generator(["This is a test", "This is a second test"], forward_params=forward_params) |
| audio = [output["audio"] for output in outputs] |
| self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) |
|
|
| |
| outputs = speech_generator( |
| ["This is a test", "This is a second test"], forward_params=forward_params, batch_size=2 |
| ) |
| audio = [output["audio"] for output in outputs] |
| self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) |
|
|
| @slow |
| @require_torch |
| def test_small_bark_pt(self): |
| speech_generator = pipeline(task="text-to-audio", model="suno/bark-small", framework="pt") |
|
|
| forward_params = { |
| |
| "do_sample": False, |
| "semantic_max_new_tokens": 100, |
| } |
|
|
| outputs = speech_generator("This is a test", forward_params=forward_params) |
| self.assertEqual( |
| {"audio": ANY(np.ndarray), "sampling_rate": 24000}, |
| outputs, |
| ) |
|
|
| |
| outputs = speech_generator( |
| ["This is a test", "This is a second test"], |
| forward_params=forward_params, |
| ) |
| audio = [output["audio"] for output in outputs] |
| self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) |
|
|
| |
| forward_params = { |
| "do_sample": True, |
| "semantic_max_new_tokens": 100, |
| "semantic_num_return_sequences": 2, |
| } |
|
|
| outputs = speech_generator("This is a test", forward_params=forward_params) |
| audio = outputs["audio"] |
| self.assertEqual(ANY(np.ndarray), audio) |
|
|
| |
| processor = AutoProcessor.from_pretrained("suno/bark-small") |
| temp_inp = processor("hey, how are you?", voice_preset="v2/en_speaker_5") |
| history_prompt = temp_inp["history_prompt"] |
| forward_params["history_prompt"] = history_prompt |
|
|
| outputs = speech_generator( |
| ["This is a test", "This is a second test"], |
| forward_params=forward_params, |
| batch_size=2, |
| ) |
| audio = [output["audio"] for output in outputs] |
| self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) |
|
|
| @slow |
| @require_torch_accelerator |
| def test_conversion_additional_tensor(self): |
| speech_generator = pipeline(task="text-to-audio", model="suno/bark-small", framework="pt", device=torch_device) |
| processor = AutoProcessor.from_pretrained("suno/bark-small") |
|
|
| forward_params = { |
| "do_sample": True, |
| "semantic_max_new_tokens": 100, |
| } |
|
|
| |
| preprocess_params = { |
| "max_length": 256, |
| "add_special_tokens": False, |
| "return_attention_mask": True, |
| "return_token_type_ids": False, |
| "padding": "max_length", |
| } |
| outputs = speech_generator( |
| "This is a test", |
| forward_params=forward_params, |
| preprocess_params=preprocess_params, |
| ) |
|
|
| temp_inp = processor("hey, how are you?", voice_preset="v2/en_speaker_5") |
| history_prompt = temp_inp["history_prompt"] |
| forward_params["history_prompt"] = history_prompt |
|
|
| |
| |
| outputs = speech_generator( |
| "This is a test", forward_params=forward_params, preprocess_params=preprocess_params |
| ) |
| self.assertEqual( |
| {"audio": ANY(np.ndarray), "sampling_rate": 24000}, |
| outputs, |
| ) |
|
|
| @slow |
| @require_torch |
| def test_vits_model_pt(self): |
| speech_generator = pipeline(task="text-to-audio", model="facebook/mms-tts-eng", framework="pt") |
|
|
| outputs = speech_generator("This is a test") |
| self.assertEqual(outputs["sampling_rate"], 16000) |
|
|
| audio = outputs["audio"] |
| self.assertEqual(ANY(np.ndarray), audio) |
|
|
| |
| outputs = speech_generator(["This is a test", "This is a second test"]) |
| audio = [output["audio"] for output in outputs] |
| self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) |
|
|
| |
| outputs = speech_generator(["This is a test", "This is a second test"], batch_size=2) |
| self.assertEqual(ANY(np.ndarray), outputs[0]["audio"]) |
|
|
| @slow |
| @require_torch |
| def test_forward_model_kwargs(self): |
| |
| speech_generator = pipeline(task="text-to-audio", model="kakao-enterprise/vits-vctk", framework="pt") |
|
|
| |
| set_seed(555) |
| outputs = speech_generator("This is a test", forward_params={"speaker_id": 5}) |
| audio = outputs["audio"] |
|
|
| with self.assertRaises(TypeError): |
| |
| outputs = speech_generator("This is a test", forward_params={"speaker_id": 5, "do_sample": True}) |
|
|
| forward_params = {"speaker_id": 5} |
| generate_kwargs = {"do_sample": True} |
|
|
| with self.assertRaises(ValueError): |
| |
| outputs = speech_generator( |
| "This is a test", forward_params=forward_params, generate_kwargs=generate_kwargs |
| ) |
| self.assertTrue(np.abs(outputs["audio"] - audio).max() < 1e-5) |
|
|
| @slow |
| @require_torch |
| def test_generative_model_kwargs(self): |
| |
| music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt") |
|
|
| forward_params = { |
| "do_sample": True, |
| "max_new_tokens": 250, |
| } |
|
|
| |
| set_seed(555) |
| outputs = music_generator("This is a test", forward_params=forward_params) |
| audio = outputs["audio"] |
| self.assertEqual(ANY(np.ndarray), audio) |
|
|
| |
| forward_params = { |
| "do_sample": False, |
| "max_new_tokens": 250, |
| } |
| generate_kwargs = {"do_sample": True} |
|
|
| |
| set_seed(555) |
| outputs = music_generator("This is a test", forward_params=forward_params, generate_kwargs=generate_kwargs) |
| self.assertListEqual(outputs["audio"].tolist(), audio.tolist()) |
|
|
| def get_test_pipeline( |
| self, |
| model, |
| tokenizer=None, |
| image_processor=None, |
| feature_extractor=None, |
| processor=None, |
| torch_dtype="float32", |
| ): |
| speech_generator = TextToAudioPipeline( |
| model=model, |
| tokenizer=tokenizer, |
| feature_extractor=feature_extractor, |
| image_processor=image_processor, |
| processor=processor, |
| torch_dtype=torch_dtype, |
| ) |
| return speech_generator, ["This is a test", "Another test"] |
|
|
| def run_pipeline_test(self, speech_generator, _): |
| outputs = speech_generator("This is a test") |
| self.assertEqual(ANY(np.ndarray), outputs["audio"]) |
|
|
| forward_params = ( |
| {"num_return_sequences": 2, "do_sample": True} if speech_generator.model.can_generate() else {} |
| ) |
| outputs = speech_generator(["This is great !", "Something else"], forward_params=forward_params) |
| audio = [output["audio"] for output in outputs] |
| self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) |
|
|