# project_02_DS / task / task_05 / step2_prepare_data.py
# (Hugging Face Space page residue, kept as comments so the file parses:)
# griddev's picture
# Deploy Streamlit Space app
# 0710b5c verified
"""
step2_prepare_data.py
======================
Task 5 — Component 2: Caption generation for 1000 COCO val images.
In LIVE mode:
- Streams COCO val via whyen-wang/coco_captions dataset
- Generates one beam-search caption per image using BLIP
- Saves captions_1000.json
In DEMO mode (precomputed):
- Returns a synthetic caption set seeded to mimic real COCO distribution
- Covers: city scenes, people, sports, food, animals — realistic variety
including some mildly biased phrasings for the bias audit to detect
Public API
----------
generate_captions(model, processor, device,
n=1000, save_dir=...) -> list[dict]
_load_or_use_precomputed(save_dir) -> list[dict]
Each dict: {image_id, caption, source}
Standalone usage
----------------
export PYTHONPATH=.
venv/bin/python task/task_05/step2_prepare_data.py
"""
import os
import sys
import json
import random
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
# ─────────────────────────────────────────────────────────────────────────────
# Precomputed caption bank (1000 items; seeded for reproducibility)
# ─────────────────────────────────────────────────────────────────────────────
_CAPTION_TEMPLATES = {
"city": [
"a busy street with cars and pedestrians",
"people walking through a crowded urban area",
"a city scene with tall buildings and traffic",
"men in suits walking down a busy sidewalk",
"a police officer directing traffic in the city",
],
"sports": [
"a man playing basketball on an outdoor court",
"two men competing in a soccer match",
"a group of men playing football in a field",
"a woman running in a marathon",
"children playing soccer on a green field",
"a man throwing a football to another player",
],
"food": [
"a pizza with cheese and vegetables on a table",
"a woman cooking in a kitchen",
"a plate of pasta with tomato sauce",
"a man grilling meat on a barbecue",
"a fresh salad with lettuce and tomatoes",
"a woman baking a cake in the oven",
],
"animals": [
"a dog sitting on a wooden floor",
"a cat sleeping on a couch",
"a bird perched on a tree branch",
"a horse running in a green field",
"a dog fetching a ball on the beach",
],
"people": [
"an elderly man sitting on a park bench",
"a woman shopping at a grocery store",
"a young man using a laptop computer",
"a woman taking care of children at home",
"an old woman knitting by the window",
"a man working at a construction site",
"a nurse attending to a patient in a hospital",
"a female nurse checking a patient's records",
"a male doctor examining a patient",
"a woman cleaning the house",
"men watching sports on television",
"a female teacher helping students in class",
"an aggressive man shouting at a crowd",
],
"nature": [
"a mountain landscape with snow-capped peaks",
"a sunset over the ocean with colorful clouds",
"a forest path covered in autumn leaves",
"a meadow with wildflowers and tall grass",
"a river flowing through a rocky canyon",
],
"indoor": [
"a living room with a couch and television",
"a kitchen with modern appliances",
"a bedroom with a large bed and nightstand",
"a library filled with books on shelves",
"an office with computers and desks",
],
}
# Mildly toxic/offensive examples to make the analysis non-trivial
_EDGE_CASES = [
"an idiot running into a wall",
"a stupid dog chasing its tail",
"a moron throwing trash on the street",
"a crazy person yelling in the park",
"a dumb mistake ruining everything",
]
def _make_precomputed(n: int = 1000, seed: int = 42) -> list:
"""Generate a realistic synthetic caption set for demo mode."""
rng = random.Random(seed)
all_cats = list(_CAPTION_TEMPLATES.keys())
records = []
for i in range(n):
# 97% normal captions, 3% edge cases
if i < len(_EDGE_CASES) and i % 33 == 0:
caption = _EDGE_CASES[i % len(_EDGE_CASES)]
source = "edge_case"
else:
cat = rng.choice(all_cats)
caption = rng.choice(_CAPTION_TEMPLATES[cat])
source = cat
records.append({
"image_id": i,
"caption": caption,
"source": source,
})
return records
# ─────────────────────────────────────────────────────────────────────────────
# Live caption generation
# ─────────────────────────────────────────────────────────────────────────────
def generate_captions(model, processor, device,
n: int = 1000,
save_dir: str = "task/task_05/results") -> list:
"""
Generate one beam-search caption per COCO val image.
Args:
model, processor, device: from step1_load_model
n : number of images to process
save_dir: directory to save captions_1000.json
Returns:
list of {image_id, caption, source}
"""
import torch
import aiohttp
from datasets import load_dataset
from tqdm.auto import tqdm
print("=" * 68)
print(f" Task 5 β€” Step 2: Generating captions for {n} COCO val images")
print("=" * 68)
ds = load_dataset(
"whyen-wang/coco_captions",
split="validation",
streaming=True,
storage_options={"client_kwargs": {"timeout": aiohttp.ClientTimeout(total=3600)}},
)
records = []
model.eval()
with torch.no_grad():
for idx, example in enumerate(tqdm(ds, desc=" Generating", total=n)):
if idx >= n:
break
pil = example["image"].convert("RGB")
inputs = processor(images=pil, return_tensors="pt").to(device)
out = model.generate(
**inputs, num_beams=3, max_new_tokens=50, length_penalty=1.0
)
caption = processor.batch_decode(out, skip_special_tokens=True)[0].strip()
records.append({"image_id": idx, "caption": caption, "source": "coco_val"})
os.makedirs(save_dir, exist_ok=True)
path = os.path.join(save_dir, "captions_1000.json")
with open(path, "w") as f:
json.dump(records, f, indent=2)
print(f" OK Captions saved -> {path}")
return records
# ─────────────────────────────────────────────────────────────────────────────
# Load / create precomputed
# ─────────────────────────────────────────────────────────────────────────────
def _load_or_use_precomputed(save_dir: str, n: int = 1000) -> list:
"""Return cached JSON if it exists, else write the precomputed fallback."""
cache = os.path.join(save_dir, "captions_1000.json")
if os.path.exists(cache):
with open(cache) as f:
data = json.load(f)
print(f" OK Loaded cached captions from {cache}")
return data
os.makedirs(save_dir, exist_ok=True)
data = _make_precomputed(n)
with open(cache, "w") as f:
json.dump(data, f, indent=2)
print(f" OK Pre-computed captions saved -> {cache}")
return data
# ─────────────────────────────────────────────────────────────────────────────
# Standalone
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--live", action="store_true")
args = parser.parse_args()
SAVE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results")
if args.live:
from step1_load_model import load_model
model, processor, device = load_model()
records = generate_captions(model, processor, device, n=1000, save_dir=SAVE_DIR)
else:
records = _load_or_use_precomputed(SAVE_DIR)
print(f" Total captions: {len(records)}")
print(f" Sample: {records[0]}")