Mark8398 committed (verified) · Commit f634c61 · Parent(s): b51196c

Uploaded 6 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/beach.jpg filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,152 @@
+import gradio as gr
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import ViTModel, AutoModel, AutoTokenizer
+from torchvision import transforms
+from datasets import load_dataset
+from PIL import Image
+
+# --- 1. MODEL ARCHITECTURE ---
+class MultiModalEngine(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.image_model = ViTModel.from_pretrained("google/vit-base-patch16-224")
+        self.text_model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")
+        self.image_projection = nn.Linear(768, 256)
+        self.text_projection = nn.Linear(768, 256)
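+        # CLIP-style temperature: initialised to ln(1/0.07) ≈ 2.659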
+        self.logit_scale = nn.Parameter(torch.ones([]) * 2.659)
+
+    def encode_text(self, input_ids, attention_mask):
+        text_out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
+        text_embeds = self.text_projection(self.mean_pooling(text_out, attention_mask))
+        return F.normalize(text_embeds, dim=1)
+
+    def encode_image(self, images):
+        vision_out = self.image_model(pixel_values=images)
+        image_embeds = self.image_projection(vision_out.last_hidden_state[:, 0, :])
+        return F.normalize(image_embeds, dim=1)
+
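+    # Masked mean over token embeddings (the standard sentence-transformers pooling)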
+    def mean_pooling(self, model_output, attention_mask):
+        token_embeddings = model_output.last_hidden_state
+        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+        return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
+
+# --- 2. LOAD RESOURCES ---
+print("⏳ Loading resources...")
+device = "cpu"
+
+# Load Model
+model = MultiModalEngine()
+model.load_state_dict(torch.load("flickr8k_best_model_r1_27.pth", map_location=device))
+model.eval()
+
+# Load Index
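+# One row per dataset image; expected shape [N, 256], L2-normalised like the encoder outputs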
+image_embeddings = torch.load("flickr8k_best_index.pt", map_location=device)
+
+# Load Tokenizer & Transforms
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
+val_transform = transforms.Compose([
+    transforms.Resize((224, 224)),
+    transforms.ToTensor(),
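+    # Standard ImageNet normalisation stats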
+    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+])
+# Load Dataset (standard, non-streaming mode, so result images can be fetched by index)
+print("⏳ Downloading dataset (this may take a minute)...")
+dataset = load_dataset("tsystems/flickr8k", split="train")
+
+print("✅ Server Ready!")
+
+# --- 3. SEARCH LOGIC ---
+def search_text(query):
+    inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
+    with torch.no_grad():
+        text_emb = model.encode_text(inputs['input_ids'], inputs['attention_mask'])
+
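+    # text_emb is L2-normalised; if the index is too, this dot product ranks by cosine similarity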
+    scores = text_emb @ image_embeddings.T
+    scores = scores.squeeze()
+    _, indices = torch.topk(scores, 3)
+
+    return [dataset[int(idx)]['image'] for idx in indices]
+
+def search_image(query_img):
+    if query_img is None:
+        return []
+    # Ensure it's a PIL Image (Gradio handles this, but good safety)
+    if not isinstance(query_img, Image.Image):
+        query_img = Image.fromarray(query_img)
+
+    img_tensor = val_transform(query_img).unsqueeze(0)
+    with torch.no_grad():
+        img_emb = model.encode_image(img_tensor)
+
+    scores = img_emb @ image_embeddings.T
+    scores = scores.squeeze()
+    _, indices = torch.topk(scores, 3)
+
+    return [dataset[int(idx)]['image'] for idx in indices]
+
+# --- 4. UI WITH EXAMPLES ---
+with gr.Blocks(title="Flickr8k AI Search", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🔍 AI Super-Search")
+    gr.Markdown("Search for images using **Text** OR using another **Image**.")
+
+    with gr.Tabs():
+        # --- TAB 1: TEXT SEARCH ---
+        with gr.TabItem("Search by Text"):
+            with gr.Row():
+                txt_input = gr.Textbox(label="Type your query", placeholder="e.g. A dog running...")
+                txt_btn = gr.Button("Search", variant="primary")
+
+            txt_gallery = gr.Gallery(label="Top Matches", columns=3, height=300)
+
+            # CLICKABLE TEXT EXAMPLES
+            gr.Examples(
+                examples=[
+                    ["A dog running on grass"],
+                    ["Children playing in the water"],
+                    ["A girl in a pink dress"],
+                    ["A man climbing a rock"]
+                ],
+                inputs=txt_input,     # Clicking populates this box
+                outputs=txt_gallery,  # Results appear here
+                fn=search_text,       # Function to run
+                run_on_click=True,    # Run immediately when clicked
+                label="Try these examples:"
+            )
+
+            txt_btn.click(search_text, inputs=txt_input, outputs=txt_gallery)
+
+        # --- TAB 2: IMAGE SEARCH ---
+        with gr.TabItem("Search by Image"):
+            # 1. Define components first (but don't draw them yet).
+            #    We set render=False so we can place them visually later.
+            img_input = gr.Image(type="pil", label="Upload Source Image", sources=['upload', 'clipboard'], render=False)
+            img_gallery = gr.Gallery(label="Similar Images", columns=3, height=300, render=False)
+
+            # 2. Draw Examples first, so they appear at the very top
+            gr.Examples(
+                examples=[
+                    ["examples/dog.jpg"],
+                    ["examples/beach.jpg"]
+                ],
+                inputs=img_input,
+                outputs=img_gallery,
+                fn=search_image,
+                run_on_click=True,
+                label="Click an image to test:"
+            )
+
+            # 3. Draw the input and button, visually below the examples
+            with gr.Row():
+                img_input.render()  # Now we actually draw the input box
+                img_btn = gr.Button("Find Similar", variant="primary")
+
+            # 4. Draw the gallery, visually at the bottom
+            img_gallery.render()
+
+            # 5. Connect the button
+            img_btn.click(search_image, inputs=img_input, outputs=img_gallery)
+
+if __name__ == "__main__":
+    demo.launch()
examples/beach.jpg ADDED

Git LFS Details

  • SHA256: 9f957fcad6e690f37f99e5bba984de7d8958a4527288298759ee7754002dade9
  • Pointer size: 131 Bytes
  • Size of remote file: 102 kB
examples/dog.jpg ADDED
flickr8k_best_index.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c5c6aa5489f0d9ce320e37ab0ebbed3e49f25b5f769a1826c4684a363766b5b5
+size 8286845
flickr8k_best_model_r1_27.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1003c7746967816f6e1d02d58c199fb8c32cd94ae9c45366a38707a79d1c43b
+size 785252523
requirements.txt ADDED
@@ -0,0 +1,77 @@
+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.2
+aiosignal==1.4.0
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.12.0
+attrs==25.4.0
+brotli==1.2.0
+certifi==2025.11.12
+charset-normalizer==3.4.4
+click==8.3.1
+colorama==0.4.6
+datasets==4.4.2
+dill==0.4.0
+fastapi==0.125.0
+ffmpy==1.0.0
+filelock==3.20.1
+frozenlist==1.8.0
+fsspec==2025.10.0
+gradio==6.2.0
+gradio_client==2.0.2
+groovy==0.1.2
+h11==0.16.0
+hf-xet==1.2.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.36.0
+idna==3.11
+Jinja2==3.1.6
+markdown-it-py==4.0.0
+MarkupSafe==3.0.3
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.7.0
+multiprocess==0.70.18
+networkx==3.6.1
+numpy==2.3.5
+orjson==3.11.5
+packaging==25.0
+pandas==2.3.3
+pillow==12.0.0
+propcache==0.4.1
+pyarrow==22.0.0
+pydantic==2.12.5
+pydantic_core==2.41.5
+pydub==0.25.1
+Pygments==2.19.2
+python-dateutil==2.9.0.post0
+python-multipart==0.0.21
+pytz==2025.2
+PyYAML==6.0.3
+regex==2025.11.3
+requests==2.32.5
+rich==14.2.0
+safehttpx==0.1.7
+safetensors==0.7.0
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.17.0
+starlette==0.50.0
+sympy==1.14.0
+tokenizers==0.22.1
+tomlkit==0.13.3
+torch==2.9.1
+torchvision==0.24.1
+tqdm==4.67.1
+transformers==4.57.3
+typer==0.20.1
+typer-slim==0.20.1
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+tzdata==2025.3
+urllib3==2.6.2
+uvicorn==0.38.0
+xxhash==3.6.0
+yarl==1.22.0