Zaman01 committed
Commit ce418ea · verified · 1 Parent(s): f15d90c

Upload 2 files

Files changed (2)
  1. app.py +163 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,163 @@
+
+# import gradio as gr
+# from transformers import BlipProcessor, BlipForConditionalGeneration
+# from PIL import Image
+# import torch
+# import requests
+
+# # Load model & processor
+# processor = BlipProcessor.from_pretrained(
+#     "Salesforce/blip-image-captioning-base"
+# )
+# model = BlipForConditionalGeneration.from_pretrained(
+#     "Salesforce/blip-image-captioning-base"
+# )
+
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# model.to(device)
+
+# def caption_image(image, prompt="", openai_api_key=""):
+#     if not prompt or not prompt.strip():
+#         return "Please enter a prompt/question for the image."
+#     image = image.convert("RGB")
+
+#     # Use OpenAI API if key provided (unchanged)
+#     if openai_api_key:
+#         try:
+#             import base64
+#             from io import BytesIO
+#             buffered = BytesIO()
+#             image.save(buffered, format="PNG")
+#             img_b64 = base64.b64encode(buffered.getvalue()).decode()
+#             headers = {
+#                 "Authorization": f"Bearer {openai_api_key}",
+#                 "Content-Type": "application/json"
+#             }
+#             data = {
+#                 "model": "gpt-4-vision-preview",
+#                 "messages": [
+#                     {
+#                         "role": "user",
+#                         "content": [
+#                             {"type": "text", "text": prompt.strip()},
+#                             {"type": "image_url", "image_url": f"data:image/png;base64,{img_b64}"}
+#                         ]
+#                     }
+#                 ],
+#                 "max_tokens": 100
+#             }
+#             resp = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=data)
+#             if resp.status_code == 200:
+#                 result = resp.json()
+#                 return result["choices"][0]["message"]["content"].strip()
+#             else:
+#                 return f"OpenAI API error: {resp.status_code} {resp.text}"
+#         except Exception as e:
+#             return f"OpenAI API error: {e}"
+
+#     # BLIP: always use prompt as instruction, no retry, fast settings
+#     p = prompt.strip()
+#     prompt_text = f"Question: {p} Answer:"
+#     inputs = processor(images=image, text=prompt_text, return_tensors="pt").to(device)
+#     # Speed up: reduce beams and max_length
+#     gen_kwargs = {"max_length": 25, "num_beams": 1, "early_stopping": True}
+#     output = model.generate(**inputs, **gen_kwargs)
+#     caption = processor.decode(output[0], skip_special_tokens=True)
+#     # Extract answer after 'Answer:' if present
+#     idx = caption.lower().find("answer:")
+#     if idx != -1:
+#         ans = caption[idx + len("answer:"):].strip()
+#         if ans:
+#             return ans
+#     # Otherwise, return the raw caption
+#     return caption.strip()
+
+# # Gradio UI: horizontal layout with image, prompt, button left; output right
+# with gr.Blocks() as demo:
+#     gr.Markdown("## 🖼️ Image Captioning (Prompt-driven)\nUpload an image, enter a prompt, and click Submit. Output depends on both image and prompt.")
+#     with gr.Row():
+#         with gr.Column(scale=2):
+#             img = gr.Image(type="pil", label="Upload Image")
+#             prompt = gr.Textbox(label="Prompt (ask a question)", placeholder="What is the color of the t-shirt?")
+#             openai_api_key = gr.Textbox(label="OpenAI API Key (optional)", type="password", placeholder="sk-...", lines=1)
+#             btn = gr.Button("Submit")
+#         with gr.Column(scale=1):
+#             out = gr.Textbox(label="Answer", lines=6)
+#     btn.click(fn=caption_image, inputs=[img, prompt, openai_api_key], outputs=out)
+# demo.launch()
+
+import gradio as gr
+import torch
+from transformers import BlipProcessor, BlipForQuestionAnswering
+from PIL import Image
+
+# ---------------------------
+# Load BLIP VQA model
+# ---------------------------
+MODEL_NAME = "Salesforce/blip-vqa-base"
+
+processor = BlipProcessor.from_pretrained(MODEL_NAME)
+model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME)
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)
+model.eval()
+
+# ---------------------------
+# Inference function
+# ---------------------------
+def answer_image_question(image, question):
+    if image is None:
+        return "Please upload an image."
+    if not question.strip():
+        return "Please enter a question."
+
+    image = image.convert("RGB")
+
+    inputs = processor(
+        images=image,
+        text=question,
+        return_tensors="pt"
+    ).to(device)
+
+    with torch.no_grad():
+        output = model.generate(
+            **inputs,
+            max_length=10,  # fast
+            num_beams=1     # faster
+        )
+
+    answer = processor.decode(output[0], skip_special_tokens=True)
+    return answer
+
+# ---------------------------
+# Gradio UI
+# ---------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("## 🖼️ Image Question Answering (Fast & Accurate)")
+    gr.Markdown(
+        "Upload an image and ask a question like:\n"
+        "- *What is the color of the shirt?*\n"
+        "- *How many people are there?*\n"
+        "- *Is the person wearing glasses?*"
+    )
+
+    with gr.Row():
+        with gr.Column():
+            img = gr.Image(type="pil", label="Upload Image")
+            question = gr.Textbox(
+                label="Question",
+                placeholder="What is the color of the shirt?"
+            )
+            btn = gr.Button("Submit")
+
+        with gr.Column():
+            answer = gr.Textbox(label="Answer", lines=3)
+
+    btn.click(
+        fn=answer_image_question,
+        inputs=[img, question],
+        outputs=answer
+    )
+
+demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
+gradio
+transformers
+Pillow
+torch
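
For reference, the BLIP VQA path added in app.py can also be exercised without the Gradio UI. Below is a minimal standalone sketch that reuses the same model name and generation settings as app.py; the image path "example.jpg" and the sample question are placeholders, not part of the uploaded files.

# Standalone check of the BLIP VQA model used in app.py (illustrative only).
# "example.jpg" is a placeholder path; replace it with a real image.
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

MODEL_NAME = "Salesforce/blip-vqa-base"
processor = BlipProcessor.from_pretrained(MODEL_NAME)
model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME).eval()

image = Image.open("example.jpg").convert("RGB")
inputs = processor(images=image, text="What is the color of the shirt?", return_tensors="pt")

with torch.no_grad():
    output = model.generate(**inputs, max_length=10, num_beams=1)

print(processor.decode(output[0], skip_special_tokens=True))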