Spaces: Sleeping
Mokhtar committed on
Commit · e4721a6
Parent(s): b4aa453
Deploying backend code
Browse files
- Dockerfile +29 -0
- README.md +58 -11
- config/config.py +39 -0
- requirements.txt +13 -0
- src/api/app.py +71 -0
- src/data/dataset.py +65 -0
- src/models/model.py +118 -0
- src/preprocessing/transforms.py +17 -0
- src/training/trainer.py +60 -0
Dockerfile
ADDED
@@ -0,0 +1,29 @@
# Use Python 3.9 as base image
FROM python:3.9

# Set the working directory inside the container
WORKDIR /code

# Copy requirements and install dependencies
# We use --no-cache-dir to keep the image small
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# Install python-multipart (required for UploadFile in FastAPI) if not in requirements.txt
RUN pip install python-multipart

# Copy the entire project into the container
COPY . /code

# Create a directory for Hugging Face cache to avoid permission errors
RUN mkdir -p /code/cache
ENV TRANSFORMERS_CACHE=/code/cache
ENV TORCH_HOME=/code/cache
RUN chmod -R 777 /code/cache

# Expose the default Hugging Face port
EXPOSE 7860

# Command to run the application
# We point to src.api.app:app because app.py is inside src/api/
CMD ["uvicorn", "src.api.app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md
CHANGED
@@ -1,11 +1,58 @@
# Image Captioning with SOTA Models

This project provides a unified API for Image Captioning using various State-of-the-Art (SOTA) models as well as a custom ResNet+GPT2 implementation.

## Supported Models

1. **BLIP (Bootstrapping Language-Image Pre-training)**
   * Model: `Salesforce/blip-image-captioning-large`
   * Status: **Default** (Best Performance)
   * Description: Produces highly accurate and detailed captions.

2. **ViT-GPT2**
   * Model: `nlpconnect/vit-gpt2-image-captioning`
   * Status: Available
   * Description: Uses a Vision Transformer (ViT) encoder and a GPT-2 decoder.

3. **ResNet50 + GPT-2 (Custom)**
   * Model: Custom implementation trained from scratch.
   * Status: Legacy / Experimental
   * Description: Good for learning purposes or custom datasets.

## Installation

1. Clone the repository.
2. Install dependencies:

```bash
pip install -r requirements.txt
```

## Configuration

Edit `config/config.py` to select the model:

```python
class Config:
    # ...
    MODEL_TYPE = "blip"  # Options: "blip", "vit_gpt2", "resnet_gpt2"
```

## Running the API

Start the FastAPI server:

```bash
python main.py --mode api
```

Open your browser at `http://localhost:8001` to use the drag-and-drop interface.

## Training (ResNet+GPT2 only)

To train the custom model:

1. Set `MODEL_TYPE = "resnet_gpt2"` in config.
2. Run:

```bash
python main.py --mode train
```
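For reference, a minimal client-side sketch of calling the `/predict` endpoint the README describes. The host/port (`localhost:8001` for a local run; the Docker image listens on 7860) and the `photo.jpg` filename are illustrative assumptions, not part of the repository:

```python
# Hypothetical client call against a running instance of this API.
import requests

API_URL = "http://localhost:8001/predict"  # assumption: local run; adjust to your deployment

with open("photo.jpg", "rb") as f:  # "photo.jpg" is a placeholder image path
    # /predict expects a multipart upload under the field name "file"
    response = requests.post(API_URL, files={"file": f})

response.raise_for_status()
print(response.json()["caption"])
```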
config/config.py
ADDED
@@ -0,0 +1,39 @@
import os
import torch

class Config:
    # Paths
    BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    # We do not need training data paths for the inference API
    DATA_DIR = None
    CAPTIONS_FILE = None

    # Model saving/loading directory
    MODEL_SAVE_DIR = os.path.join(BASE_DIR, 'models')
    LOG_DIR = os.path.join(BASE_DIR, 'logs')

    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
    os.makedirs(LOG_DIR, exist_ok=True)

    # Device: Force CPU if CUDA is not available (Hugging Face Free Tier is CPU)
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Hyperparameters (kept for reference, mostly unused in inference)
    BATCH_SIZE = 1
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 10
    NUM_WORKERS = 2

    # Model Config
    # Change this to "blip" or "vit_gpt2" for your deployment to ensure no custom weights are needed
    MODEL_TYPE = "blip"
    ENCODER_MODEL = "resnet50"
    DECODER_MODEL = "gpt2"
    EMBED_DIM = 768
    MAX_SEQ_LEN = 40

    # Image Config
    IMAGE_SIZE = (224, 224)

config = Config()
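A small usage sketch for the config module above; the print statements are purely illustrative and not part of the repository:

```python
# Other modules import the pre-built `config` instance rather than the Config class.
from config.config import config

print(config.MODEL_TYPE)      # "blip" unless edited
print(config.DEVICE)          # cuda if available, otherwise cpu
print(config.MODEL_SAVE_DIR)  # <repo>/models, created at import time
print(config.IMAGE_SIZE)      # (224, 224)
```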
requirements.txt
ADDED
@@ -0,0 +1,13 @@
torch
torchvision
fastapi
uvicorn
pillow
pandas
spacy
tqdm
matplotlib
gTTS
transformers
python-multipart
requests
src/api/app.py
ADDED
@@ -0,0 +1,71 @@
import io
import os
import torch
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
from transformers import GPT2Tokenizer

# Adjust imports based on your folder structure
from config.config import config
from src.models.model import get_model
from src.preprocessing.transforms import get_transforms

app = FastAPI(title="Object Captioning LLM API")

# --- CORS CONFIGURATION (Crucial for Vercel) ---
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # For production, replace "*" with your Vercel URL
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load Model
print(f"Loading model: {config.MODEL_TYPE} on {config.DEVICE}...")
device = config.DEVICE
model = get_model(config).to(device)

# Legacy support for ResNetGPT2
if config.MODEL_TYPE == "resnet_gpt2":
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # Use a relative path or ensure this file is uploaded to the Docker container
    model_path = os.path.join(config.MODEL_SAVE_DIR, "best_model_llm.pth")
    if os.path.exists(model_path):
        model.load_state_dict(torch.load(model_path, map_location=device))
        print("Loaded trained custom model.")
    else:
        print("Warning: No trained model found for ResNetGPT2. Using random weights.")
else:
    tokenizer = None

model.eval()
transform = get_transforms(train=False)

@app.get("/")
def home():
    return {"message": "Image Captioning API is running. Send POST requests to /predict"}

@app.post("/predict")
async def predict(file: UploadFile = File(...)):
    try:
        # Read Image
        image_data = await file.read()
        image = Image.open(io.BytesIO(image_data)).convert("RGB")

        # Generate Caption
        if config.MODEL_TYPE == "resnet_gpt2":
            img_tensor = transform(image).to(device)
            # Ensure generate_caption handles the tensor/tokenizer correctly
            caption = model.generate_caption(img_tensor, tokenizer)
        else:
            # SOTA models (BLIP/ViT) take the PIL image directly
            caption = model.generate_caption(image)

        return {
            "caption": caption
        }
    except Exception as e:
        print(f"Error during prediction: {e}")
        raise HTTPException(status_code=500, detail=str(e))
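A possible in-process smoke test for the endpoint above, using FastAPI's TestClient. This is a sketch: it assumes `httpx` is installed (TestClient depends on it and it is not in requirements.txt), `sample.jpg` is a placeholder image, and importing the app triggers the model download/load:

```python
# Sketch of an in-process test; not part of the deployed code.
from fastapi.testclient import TestClient

from src.api.app import app  # importing this loads the captioning model

client = TestClient(app)

# Root health check
assert client.get("/").status_code == 200

# Caption a local image ("sample.jpg" is a placeholder path)
with open("sample.jpg", "rb") as f:
    resp = client.post("/predict", files={"file": ("sample.jpg", f, "image/jpeg")})

print(resp.status_code, resp.json())
```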
src/data/dataset.py
ADDED
@@ -0,0 +1,65 @@
import os
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer

class CaptionDataset(Dataset):
    def __init__(self, root_dir, captions_file, transform=None, max_length=40):
        self.root_dir = root_dir
        self.transform = transform
        self.max_length = max_length

        # Load captions
        # Format: image,caption (csv)
        self.df = pd.read_csv(captions_file, delimiter=',')

        # Rename columns to match expected internal names if necessary, or just use them directly
        # The file has 'image' and 'caption' columns based on inspection
        self.df.rename(columns={'image': 'image_name', 'caption': 'comment'}, inplace=True)

        self.df['image_name'] = self.df['image_name'].str.strip()
        self.df['comment'] = self.df['comment'].str.strip()
        self.df = self.df.dropna()

        self.captions = self.df['comment'].tolist()
        self.images = self.df['image_name'].tolist()

        # Initialize Tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # GPT2 doesn't have a pad token, so we use eos_token as pad_token
        self.tokenizer.pad_token = self.tokenizer.eos_token

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, idx):
        caption = self.captions[idx]
        img_name = self.images[idx]
        img_path = os.path.join(self.root_dir, img_name)

        try:
            image = Image.open(img_path).convert("RGB")
        except Exception:
            # Fallback for missing images or errors, return next item
            return self.__getitem__((idx + 1) % len(self))

        if self.transform:
            image = self.transform(image)

        # Tokenize caption
        # We add a special prefix to prompt the model if desired, but for direct captioning:
        # Format: [Image Feature] -> Caption
        encoding = self.tokenizer(
            caption,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return image, input_ids, attention_mask
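A usage sketch for `CaptionDataset`. The `data/images` folder and `data/captions.csv` paths are assumptions about a Flickr-style layout with `image,caption` rows, not files shipped in this repo:

```python
# Illustrative wiring of the dataset with the project's transforms and a DataLoader.
from torch.utils.data import DataLoader

from src.data.dataset import CaptionDataset
from src.preprocessing.transforms import get_transforms

dataset = CaptionDataset(
    root_dir="data/images",             # assumed image folder
    captions_file="data/captions.csv",  # assumed CSV with image,caption columns
    transform=get_transforms(train=True),
    max_length=40,
)
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=2)

images, input_ids, attention_mask = next(iter(loader))
print(images.shape)          # torch.Size([8, 3, 224, 224])
print(input_ids.shape)       # torch.Size([8, 40])
print(attention_mask.shape)  # torch.Size([8, 40])
```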
src/models/model.py
ADDED
@@ -0,0 +1,118 @@
import torch
import torch.nn as nn
import torchvision.models as models
from transformers import (
    GPT2LMHeadModel,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    AutoTokenizer,
    BlipProcessor,
    BlipForConditionalGeneration
)

# -----------------------------------------------------------------------------
# 1. Custom ResNet + GPT-2 (Training from Scratch)
# -----------------------------------------------------------------------------
class ResNetEncoder(nn.Module):
    def __init__(self, embed_dim=768):
        super(ResNetEncoder, self).__init__()
        resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.projection = nn.Linear(2048, embed_dim)
        self.bn = nn.BatchNorm1d(embed_dim, momentum=0.01)

    def forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.projection(features)
        features = self.bn(features)
        return features

class ResNetGPT2(nn.Module):
    def __init__(self, max_seq_len=40):
        super(ResNetGPT2, self).__init__()
        self.encoder = ResNetEncoder(embed_dim=768)
        self.gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
        self.max_seq_len = max_seq_len

    def forward(self, images, input_ids, attention_mask):
        image_embeds = self.encoder(images)
        token_embeds = self.gpt2.transformer.wte(input_ids)
        inputs_embeds = torch.cat((image_embeds.unsqueeze(1), token_embeds), dim=1)
        batch_size = images.shape[0]
        ones = torch.ones(batch_size, 1).to(images.device)
        attention_mask = torch.cat((ones, attention_mask), dim=1)
        outputs = self.gpt2(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        return outputs.logits

    def generate_caption(self, image, tokenizer, max_length=20, temperature=1.0):
        self.eval()
        with torch.no_grad():
            image_embed = self.encoder(image.unsqueeze(0))
            inputs_embeds = image_embed.unsqueeze(1)
            generated_tokens = []
            for _ in range(max_length):
                outputs = self.gpt2(inputs_embeds=inputs_embeds)
                logits = outputs.logits[:, -1, :] / temperature
                next_token = torch.argmax(logits, dim=-1).unsqueeze(0)
                if next_token.item() == tokenizer.eos_token_id:
                    break
                generated_tokens.append(next_token.item())
                next_token_embed = self.gpt2.transformer.wte(next_token)
                inputs_embeds = torch.cat((inputs_embeds, next_token_embed), dim=1)
        return tokenizer.decode(generated_tokens, skip_special_tokens=True)

# -----------------------------------------------------------------------------
# 2. ViT + GPT-2 (Pre-trained SOTA 1)
# -----------------------------------------------------------------------------
class ViTGPT2Captioner(nn.Module):
    def __init__(self):
        super().__init__()
        print("Loading ViT-GPT2 model...")
        self.model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
        self.feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
        self.tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

    def generate_caption(self, image, **kwargs):
        self.eval()
        with torch.no_grad():
            pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values
            pixel_values = pixel_values.to(self.model.device)
            output_ids = self.model.generate(pixel_values, max_length=20, num_beams=4)
            preds = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return preds[0].strip()

# -----------------------------------------------------------------------------
# 3. BLIP (Pre-trained SOTA 2 - Best)
# -----------------------------------------------------------------------------
class BLIPCaptioner(nn.Module):
    def __init__(self):
        super().__init__()
        print("Loading BLIP model (Salesforce/blip-image-captioning-large)...")
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

    def generate_caption(self, image, **kwargs):
        self.eval()
        with torch.no_grad():
            inputs = self.processor(images=image, return_tensors="pt").to(self.model.device)
            output_ids = self.model.generate(**inputs, max_length=50, num_beams=5, repetition_penalty=1.2, min_length=5)
            caption = self.processor.decode(output_ids[0], skip_special_tokens=True)
        return caption

# -----------------------------------------------------------------------------
# Factory
# -----------------------------------------------------------------------------
def get_model(config):
    if config.MODEL_TYPE == "resnet_gpt2":
        return ResNetGPT2()
    elif config.MODEL_TYPE == "vit_gpt2":
        return ViTGPT2Captioner()
    elif config.MODEL_TYPE == "blip":
        return BLIPCaptioner()
    else:
        raise ValueError(f"Unknown model type: {config.MODEL_TYPE}")
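A usage sketch for the factory above with the default BLIP configuration; `cat.jpg` is a placeholder image path, and the first call downloads the BLIP weights:

```python
# Illustrative use of get_model(); mirrors what src/api/app.py does at startup.
from PIL import Image

from config.config import config
from src.models.model import get_model

model = get_model(config).to(config.DEVICE)  # BLIPCaptioner when MODEL_TYPE == "blip"
model.eval()

image = Image.open("cat.jpg").convert("RGB")  # placeholder image path
print(model.generate_caption(image))
```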
src/preprocessing/transforms.py
ADDED
@@ -0,0 +1,17 @@
import torchvision.transforms as transforms

def get_transforms(image_size=(224, 224), train=True):
    if train:
        return transforms.Compose([
            transforms.Resize(image_size),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.1, contrast=0.1),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet stats
        ])
    else:
        return transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
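A quick sanity-check sketch for the transforms (`example.jpg` is a placeholder):

```python
# The inference transform should yield a normalized 3x224x224 float tensor.
from PIL import Image

from src.preprocessing.transforms import get_transforms

transform = get_transforms(train=False)
img = Image.open("example.jpg").convert("RGB")  # placeholder image
tensor = transform(img)
print(tensor.shape, tensor.dtype)  # torch.Size([3, 224, 224]) torch.float32
```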
src/training/trainer.py
ADDED
@@ -0,0 +1,60 @@
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import os

def train_model(model, train_loader, val_loader, config, tokenizer):
    model = model.to(config.DEVICE)
    optimizer = optim.AdamW(model.parameters(), lr=config.LEARNING_RATE)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    best_loss = float('inf')

    for epoch in range(config.NUM_EPOCHS):
        model.train()
        train_loss = 0
        loop = tqdm(train_loader, total=len(train_loader), leave=True)

        for images, input_ids, attention_mask in loop:
            images = images.to(config.DEVICE)
            input_ids = input_ids.to(config.DEVICE)
            attention_mask = attention_mask.to(config.DEVICE)

            optimizer.zero_grad()

            # Forward pass
            # Logits: [batch, seq_len+1, vocab_size]
            logits = model(images, input_ids, attention_mask)

            # Shift logits and labels for next-token prediction
            # We want to predict input_ids based on previous context
            # The model output at index i corresponds to the prediction for token i+1
            # Input sequence to model: [Image, T1, T2, T3, ...]
            # Output logits: [P1, P2, P3, P4, ...]
            # Targets: [T1, T2, T3, T4, ...]

            # We discard the last logit because we don't have a target for it
            shift_logits = logits[:, :-1, :].contiguous()
            # We use input_ids as targets
            shift_labels = input_ids.contiguous()

            loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            loop.set_description(f"Epoch [{epoch+1}/{config.NUM_EPOCHS}]")
            loop.set_postfix(loss=loss.item())

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch+1} Loss: {avg_train_loss:.4f}")

        # Save checkpoint
        if avg_train_loss < best_loss:
            best_loss = avg_train_loss
            torch.save(model.state_dict(), os.path.join(config.MODEL_SAVE_DIR, "best_model_llm.pth"))
            print("Saved Best Model!")

        torch.save(model.state_dict(), os.path.join(config.MODEL_SAVE_DIR, "last_checkpoint_llm.pth"))