Upload 2 files
- app.py +163 -0
- requirements.txt +4 -0
app.py
ADDED
@@ -0,0 +1,163 @@
+
+# import gradio as gr
+# from transformers import BlipProcessor, BlipForConditionalGeneration
+# from PIL import Image
+# import torch
+# import requests
+
+# # Load model & processor
+# processor = BlipProcessor.from_pretrained(
+#     "Salesforce/blip-image-captioning-base"
+# )
+# model = BlipForConditionalGeneration.from_pretrained(
+#     "Salesforce/blip-image-captioning-base"
+# )
+
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# model.to(device)
+
+# def caption_image(image, prompt="", openai_api_key=""):
+#     if not prompt or not prompt.strip():
+#         return "Please enter a prompt/question for the image."
+#     image = image.convert("RGB")
+
+#     # Use OpenAI API if key provided (unchanged)
+#     if openai_api_key:
+#         try:
+#             import base64
+#             from io import BytesIO
+#             buffered = BytesIO()
+#             image.save(buffered, format="PNG")
+#             img_b64 = base64.b64encode(buffered.getvalue()).decode()
+#             headers = {
+#                 "Authorization": f"Bearer {openai_api_key}",
+#                 "Content-Type": "application/json"
+#             }
+#             data = {
+#                 "model": "gpt-4-vision-preview",
+#                 "messages": [
+#                     {
+#                         "role": "user",
+#                         "content": [
+#                             {"type": "text", "text": prompt.strip()},
+#                             {"type": "image_url", "image_url": f"data:image/png;base64,{img_b64}"}
+#                         ]
+#                     }
+#                 ],
+#                 "max_tokens": 100
+#             }
+#             resp = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=data)
+#             if resp.status_code == 200:
+#                 result = resp.json()
+#                 return result["choices"][0]["message"]["content"].strip()
+#             else:
+#                 return f"OpenAI API error: {resp.status_code} {resp.text}"
+#         except Exception as e:
+#             return f"OpenAI API error: {e}"
+
+#     # BLIP: always use prompt as instruction, no retry, fast settings
+#     p = prompt.strip()
+#     prompt_text = f"Question: {p} Answer:"
+#     inputs = processor(images=image, text=prompt_text, return_tensors="pt").to(device)
+#     # Speed up: reduce beams and max_length
+#     gen_kwargs = {"max_length": 25, "num_beams": 1, "early_stopping": True}
+#     output = model.generate(**inputs, **gen_kwargs)
+#     caption = processor.decode(output[0], skip_special_tokens=True)
+#     # Extract answer after 'Answer:' if present
+#     idx = caption.lower().find("answer:")
+#     if idx != -1:
+#         ans = caption[idx + len("answer:"):].strip()
+#         if ans:
+#             return ans
+#     # Otherwise, return the raw caption
+#     return caption.strip()
+
+# # Gradio UI: horizontal layout with image, prompt, button left; output right
+# with gr.Blocks() as demo:
+#     gr.Markdown("## 🖼️ Image Captioning (Prompt-driven)\nUpload an image, enter a prompt, and click Submit. Output depends on both image and prompt.")
+#     with gr.Row():
+#         with gr.Column(scale=2):
+#             img = gr.Image(type="pil", label="Upload Image")
+#             prompt = gr.Textbox(label="Prompt (ask a question)", placeholder="What is the color of the t-shirt?")
+#             openai_api_key = gr.Textbox(label="OpenAI API Key (optional)", type="password", placeholder="sk-...", lines=1)
+#             btn = gr.Button("Submit")
+#         with gr.Column(scale=1):
+#             out = gr.Textbox(label="Answer", lines=6)
+#     btn.click(fn=caption_image, inputs=[img, prompt, openai_api_key], outputs=out)
+# demo.launch()
+
+import gradio as gr
+import torch
+from transformers import BlipProcessor, BlipForQuestionAnswering
+from PIL import Image
+
+# ---------------------------
+# Load BLIP VQA model
+# ---------------------------
+MODEL_NAME = "Salesforce/blip-vqa-base"
+
+processor = BlipProcessor.from_pretrained(MODEL_NAME)
+model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+model.eval()
+
+# ---------------------------
+# Inference function
+# ---------------------------
+def answer_image_question(image, question):
+    if image is None:
+        return "Please upload an image."
+    if not question.strip():
+        return "Please enter a question."
+
+    image = image.convert("RGB")
+
+    inputs = processor(
+        images=image,
+        text=question,
+        return_tensors="pt"
+    ).to(device)
+
+    with torch.no_grad():
+        output = model.generate(
+            **inputs,
+            max_length=10,  # fast
+            num_beams=1     # faster
+        )
+
+    answer = processor.decode(output[0], skip_special_tokens=True)
+    return answer
+
+# ---------------------------
+# Gradio UI
+# ---------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("## 🖼️ Image Question Answering (Fast & Accurate)")
+    gr.Markdown(
+        "Upload an image and ask a question like:\n"
+        "- *What is the color of the shirt?*\n"
+        "- *How many people are there?*\n"
+        "- *Is the person wearing glasses?*"
+    )
+
+    with gr.Row():
+        with gr.Column():
+            img = gr.Image(type="pil", label="Upload Image")
+            question = gr.Textbox(
+                label="Question",
+                placeholder="What is the color of the shirt?"
+            )
+            btn = gr.Button("Submit")
+
+        with gr.Column():
+            answer = gr.Textbox(label="Answer", lines=3)
+
+    btn.click(
+        fn=answer_image_question,
+        inputs=[img, question],
+        outputs=answer
+    )
+
+demo.launch()
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+gradio
+transformers
+Pillow
+torch