| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import unittest |
|
|
| from datasets import load_dataset |
|
|
| from transformers import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, is_vision_available |
| from transformers.pipelines import pipeline |
| from transformers.testing_utils import ( |
| is_pipeline_test, |
| is_torch_available, |
| nested_simplify, |
| require_tf, |
| require_torch, |
| require_torch_accelerator, |
| require_vision, |
| slow, |
| torch_device, |
| ) |
|
|
| from .test_pipelines_common import ANY |
|
|
|
|
| if is_torch_available(): |
| import torch |
|
|
| from transformers.pipelines.pt_utils import KeyDataset |
|
|
|
|
if is_vision_available():
    from PIL import Image
else:
    # Minimal stand-in so module-level references to ``Image.open`` do not fail
    # at import time when Pillow is not installed; the @require_vision-gated
    # tests below are skipped in that case anyway, so this stub is never
    # meaningfully exercised.
    class Image:
        @staticmethod
        def open(*args, **kwargs):
            pass
|
|
|
|
@is_pipeline_test
@require_torch
@require_vision
class VisualQuestionAnsweringPipelineTests(unittest.TestCase):
    """Tests for the ``visual-question-answering`` pipeline.

    Covers single/batched inputs (kwargs, dict, list-of-dict forms), list
    broadcasting over images and questions, ``KeyDataset`` streaming, and both
    classification-style (ViLT) and generative (BLIP-2) model heads.
    """

    model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING

    def get_test_pipeline(
        self,
        model,
        tokenizer=None,
        image_processor=None,
        feature_extractor=None,
        processor=None,
        torch_dtype="float32",
    ):
        """Build a pipeline + example inputs for the shared pipeline test harness.

        NOTE(review): the incoming ``model``/``tokenizer``/processor arguments
        are deliberately ignored and a tiny fixed checkpoint is used instead —
        presumably to keep the harness fast and deterministic. Confirm this is
        intended before wiring the arguments through.
        """
        vqa_pipeline = pipeline(
            "visual-question-answering",
            model="hf-internal-testing/tiny-vilt-random-vqa",
            torch_dtype=torch_dtype,
        )
        # One example passes a pre-opened PIL image, the other a path string,
        # so both accepted image input types are exercised.
        examples = [
            {
                "image": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
                "question": "How many cats are there?",
            },
            {
                "image": "./tests/fixtures/tests_samples/COCO/000000039769.png",
                "question": "How many cats are there?",
            },
        ]
        return vqa_pipeline, examples

    def run_pipeline_test(self, vqa_pipeline, examples):
        """Shared-harness check: one ``[{score, answer}]`` list per example."""
        outputs = vqa_pipeline(examples, top_k=1)
        self.assertEqual(
            outputs,
            [
                [{"score": ANY(float), "answer": ANY(str)}],
                [{"score": ANY(float), "answer": ANY(str)}],
            ],
        )

    @require_torch
    def test_small_model_pt(self):
        """Tiny ViLT model: kwargs form and dict form produce top-2 score/answer pairs."""
        vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        question = "How many cats are there?"

        # Fix: use the ``question`` local (the literal was previously duplicated
        # here, leaving the variable unused in this call).
        outputs = vqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(
            outputs, [{"score": ANY(float), "answer": ANY(str)}, {"score": ANY(float), "answer": ANY(str)}]
        )

        outputs = vqa_pipeline({"image": image, "question": question}, top_k=2)
        self.assertEqual(
            outputs, [{"score": ANY(float), "answer": ANY(str)}, {"score": ANY(float), "answer": ANY(str)}]
        )

    @require_torch
    @require_torch_accelerator
    def test_small_model_pt_blip2(self):
        """Tiny BLIP-2 (generative head): outputs carry ``answer`` only, no score."""
        vqa_pipeline = pipeline(
            "visual-question-answering", model="hf-internal-testing/tiny-random-Blip2ForConditionalGeneration"
        )
        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        question = "How many cats are there?"

        outputs = vqa_pipeline(image=image, question=question)
        self.assertEqual(outputs, [{"answer": ANY(str)}])

        outputs = vqa_pipeline({"image": image, "question": question})
        self.assertEqual(outputs, [{"answer": ANY(str)}])

        outputs = vqa_pipeline([{"image": image, "question": question}, {"image": image, "question": question}])
        self.assertEqual(outputs, [[{"answer": ANY(str)}]] * 2)

        # Re-create the pipeline in fp16 on the accelerator and verify both
        # sub-models were actually cast and placed on the requested device.
        vqa_pipeline = pipeline(
            "visual-question-answering",
            model="hf-internal-testing/tiny-random-Blip2ForConditionalGeneration",
            model_kwargs={"torch_dtype": torch.float16},
            device=torch_device,
        )
        self.assertEqual(vqa_pipeline.model.device, torch.device(f"{torch_device}:0"))
        self.assertEqual(vqa_pipeline.model.language_model.dtype, torch.float16)
        self.assertEqual(vqa_pipeline.model.vision_model.dtype, torch.float16)

        outputs = vqa_pipeline(image=image, question=question)
        self.assertEqual(outputs, [{"answer": ANY(str)}])

    @slow
    @require_torch
    def test_large_model_pt(self):
        """Real ViLT checkpoint: pin exact top-2 scores/answers for the COCO cats image."""
        vqa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        question = "How many cats are there?"

        outputs = vqa_pipeline(image=image, question=question, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4), [{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]
        )

        outputs = vqa_pipeline({"image": image, "question": question}, top_k=2)
        self.assertEqual(
            nested_simplify(outputs, decimals=4), [{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]
        )

        outputs = vqa_pipeline(
            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
        )
        self.assertEqual(
            nested_simplify(outputs, decimals=4),
            [[{"score": 0.8799, "answer": "2"}, {"score": 0.296, "answer": "1"}]] * 2,
        )

    @slow
    @require_torch
    @require_torch_accelerator
    def test_large_model_pt_blip2(self):
        """Real BLIP-2 checkpoint in fp16: pin the generated answer for the cats image."""
        vqa_pipeline = pipeline(
            "visual-question-answering",
            model="Salesforce/blip2-opt-2.7b",
            model_kwargs={"torch_dtype": torch.float16},
            device=torch_device,
        )
        self.assertEqual(vqa_pipeline.model.device, torch.device(f"{torch_device}:0"))
        self.assertEqual(vqa_pipeline.model.language_model.dtype, torch.float16)

        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        # BLIP-2 is prompted in "Question: ... Answer:" format rather than a bare question.
        question = "Question: how many cats are there? Answer:"

        outputs = vqa_pipeline(image=image, question=question)
        self.assertEqual(outputs, [{"answer": "two"}])

        outputs = vqa_pipeline({"image": image, "question": question})
        self.assertEqual(outputs, [{"answer": "two"}])

        outputs = vqa_pipeline([{"image": image, "question": question}, {"image": image, "question": question}])
        self.assertEqual(outputs, [[{"answer": "two"}]] * 2)

    @require_torch
    def test_small_model_pt_image_list(self):
        """A list of images with one question yields one result list per image."""
        vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
        images = [
            "./tests/fixtures/tests_samples/COCO/000000039769.png",
            "./tests/fixtures/tests_samples/COCO/000000004016.png",
        ]

        outputs = vqa_pipeline(image=images, question="How many cats are there?", top_k=1)
        self.assertEqual(
            outputs, [[{"score": ANY(float), "answer": ANY(str)}], [{"score": ANY(float), "answer": ANY(str)}]]
        )

    @require_torch
    def test_small_model_pt_question_list(self):
        """One image with a list of questions yields one result list per question."""
        vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
        questions = ["How many cats are there?", "Are there any dogs?"]

        outputs = vqa_pipeline(image=image, question=questions, top_k=1)
        self.assertEqual(
            outputs, [[{"score": ANY(float), "answer": ANY(str)}], [{"score": ANY(float), "answer": ANY(str)}]]
        )

    @require_torch
    def test_small_model_pt_both_list(self):
        """Lists for both image and question broadcast to the 2x2 cross product (4 results)."""
        vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
        images = [
            "./tests/fixtures/tests_samples/COCO/000000039769.png",
            "./tests/fixtures/tests_samples/COCO/000000004016.png",
        ]
        questions = ["How many cats are there?", "Are there any dogs?"]

        outputs = vqa_pipeline(image=images, question=questions, top_k=1)
        self.assertEqual(
            outputs,
            [
                [{"score": ANY(float), "answer": ANY(str)}],
                [{"score": ANY(float), "answer": ANY(str)}],
                [{"score": ANY(float), "answer": ANY(str)}],
                [{"score": ANY(float), "answer": ANY(str)}],
            ],
        )

    @require_torch
    def test_small_model_pt_dataset(self):
        """Images streamed from a ``KeyDataset`` are consumed like a plain list."""
        vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa")
        dataset = load_dataset("hf-internal-testing/dummy_image_text_data", split="train[:2]")
        question = "What's in the image?"

        outputs = vqa_pipeline(image=KeyDataset(dataset, "image"), question=question, top_k=1)
        self.assertEqual(
            outputs,
            [
                [{"score": ANY(float), "answer": ANY(str)}],
                [{"score": ANY(float), "answer": ANY(str)}],
            ],
        )

    @require_tf
    @unittest.skip(reason="Visual question answering not implemented in TF")
    def test_small_model_tf(self):
        pass
|
|