# src/cloud_db.py — Enterprise Lens V4
# ════════════════════════════════════════════════════════════════
# NOTE: In the production FastAPI app (main.py), ALL Pinecone and
# Cloudinary operations are performed directly — this class is NOT
# called by main.py. It exists as a standalone utility / SDK wrapper
# for scripts, notebooks, or future use outside the API.
#
# If you use this class, ensure your Pinecone indexes match V4 dims:
#     enterprise-faces   → 1024-D (ArcFace-512 + AdaFace-512, fused)
#     enterprise-objects → 1536-D (SigLIP-768 + DINOv2-768, fused)
# ════════════════════════════════════════════════════════════════
import os
import uuid
from typing import Optional

import cloudinary
import cloudinary.uploader
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

# ── V4 Index constants — MUST match main.py and models.py ────────
IDX_FACES = "enterprise-faces"
IDX_OBJECTS = "enterprise-objects"
IDX_FACES_DIM = 1024     # ArcFace(512) + AdaFace(512) fused, always 1024
IDX_OBJECTS_DIM = 1536   # SigLIP(768) + DINOv2(768) fused, always 1536

# V4 face similarity thresholds (fused 1024-D cosine space)
# These MUST stay in sync with main.py FACE_THRESHOLD_* constants
FACE_THRESHOLD_HIGH = 0.40   # high-quality face (det_score >= 0.85)
FACE_THRESHOLD_LOW = 0.32    # lower-quality face (det_score < 0.85)
OBJECT_THRESHOLD = 0.45      # object/scene similarity threshold

# det_score at or above which the stricter FACE_THRESHOLD_HIGH applies.
# Named once so the quality cutoff cannot silently drift between call sites.
HIGH_QUALITY_DET_SCORE = 0.85


class CloudDB:
    """
    Utility wrapper around Pinecone + Cloudinary for Enterprise Lens V4.

    Index dimensions:
        enterprise-faces   : 1024-D cosine
        enterprise-objects : 1536-D cosine

    Face vectors:   ArcFace(512) + AdaFace(512) concatenated + L2-normalised
    Object vectors: SigLIP(768) + DINOv2(768) concatenated + L2-normalised
    """

    def __init__(self):
        """Configure Cloudinary from env vars, connect to Pinecone, and
        create/validate both V4 indexes before exposing them as attributes."""
        # ── Cloudinary ────────────────────────────────────────────
        cloudinary.config(
            cloud_name=os.getenv("CLOUDINARY_CLOUD_NAME"),
            api_key=os.getenv("CLOUDINARY_API_KEY"),
            api_secret=os.getenv("CLOUDINARY_API_SECRET"),
        )
        # ── Pinecone ──────────────────────────────────────────────
        self.pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        self._ensure_indexes()
        self.index_faces = self.pc.Index(IDX_FACES)
        self.index_objects = self.pc.Index(IDX_OBJECTS)

    # ── Index bootstrap ──────────────────────────────────────────
    def _ensure_indexes(self) -> None:
        """
        Create Pinecone indexes at correct V4 dimensions if they don't exist.
        Safe to call multiple times — skips existing indexes.

        Raises:
            ValueError: if an index already exists at the wrong dimension.
        """
        # List once and share the name set — avoids a second network round-trip.
        existing = {idx.name for idx in self.pc.list_indexes()}
        self._ensure_index(IDX_FACES, IDX_FACES_DIM, existing)
        self._ensure_index(IDX_OBJECTS, IDX_OBJECTS_DIM, existing)

    def _ensure_index(self, name: str, dim: int, existing: set) -> None:
        """Create index `name` at `dim` if absent from `existing`;
        otherwise validate that its dimension matches `dim`."""
        if name not in existing:
            print(f"📦 Creating {name} at {dim}-D...")
            self.pc.create_index(
                name=name,
                dimension=dim,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )
            print(f"   ✅ {name} created at {dim}-D")
            return
        # Index already exists — refuse to run against a stale-dimension index:
        # upserts/queries with mismatched vector lengths would fail downstream.
        actual_dim = self.pc.describe_index(name).dimension
        if actual_dim != dim:
            raise ValueError(
                f"❌ {name} exists at {actual_dim}-D but V4 needs "
                f"{dim}-D. Go to Settings → Danger Zone → "
                f"Reset Database to recreate at correct dimensions."
            )

    # ── Helpers ──────────────────────────────────────────────────
    @staticmethod
    def _as_list(vector) -> list:
        """Coerce an np.ndarray (or any sequence) into a plain Python list,
        as required by the Pinecone upsert/query API."""
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    # ── Upload image to Cloudinary ───────────────────────────────
    def upload_image(self, file_path: str, folder_name: str = "visual_search") -> str:
        """Upload image to Cloudinary, return secure_url."""
        response = cloudinary.uploader.upload(file_path, folder=folder_name)
        return response["secure_url"]

    # ── Store vector in correct Pinecone index ───────────────────
    def add_vector(self, data_dict: dict, image_url: str,
                   image_id: Optional[str] = None) -> None:
        """
        Upsert one vector into the correct Pinecone index.

        data_dict keys:
            type         : "face" or "object"
            vector       : np.ndarray or list — must match index dimension
            face_crop    : str (base64 JPEG thumbnail, face only)
            det_score    : float (InsightFace detection confidence, face only)
            face_quality : float (alias for det_score)
            face_width_px: int (face bounding box width in pixels)
            face_idx     : int (face index within the source image)
            bbox         : list [x, y, w, h]
            folder       : str (Cloudinary folder / category name)

        Args:
            data_dict: vector payload + metadata as described above.
            image_url: Cloudinary secure URL of the source image.
            image_id:  optional explicit vector id; a UUID4 is generated
                       when omitted.
        """
        vec_id = image_id or str(uuid.uuid4())
        vec_list = self._as_list(data_dict["vector"])

        if data_dict["type"] == "face":
            # ── V4 face metadata — full set required for UI ───────
            payload = [{
                "id": vec_id,
                "values": vec_list,
                "metadata": {
                    "image_url": image_url,
                    "url": image_url,  # alias for compatibility
                    "folder": data_dict.get("folder", ""),
                    "face_idx": data_dict.get("face_idx", 0),
                    # bbox stringified: Pinecone metadata forbids nested lists
                    # of numbers in some SDK versions — kept as in main.py.
                    "bbox": str(data_dict.get("bbox", [])),
                    "face_crop": data_dict.get("face_crop", ""),  # base64 thumb
                    "det_score": data_dict.get("det_score", 1.0),
                    "face_quality": data_dict.get(
                        "face_quality", data_dict.get("det_score", 1.0)),
                    "face_width_px": data_dict.get("face_width_px", 0),
                },
            }]
            self.index_faces.upsert(vectors=payload)
        else:
            # ── V4 object metadata ────────────────────────────────
            payload = [{
                "id": vec_id,
                "values": vec_list,
                "metadata": {
                    "image_url": image_url,
                    "url": image_url,
                    "folder": data_dict.get("folder", ""),
                },
            }]
            self.index_objects.upsert(vectors=payload)

    # ── Search ───────────────────────────────────────────────────
    def search(self, query_dict: dict, top_k: int = 10,
               min_score: Optional[float] = None) -> list:
        """
        Search the correct Pinecone index for one query vector.

        For face vectors: uses adaptive threshold based on det_score.
        For object vectors: uses OBJECT_THRESHOLD (default 0.45).

        Args:
            query_dict: {"type": "face"|"object", "vector": ...,
                         "det_score": float (faces only, optional)}
            top_k:      maximum number of results to return.
            min_score:  explicit similarity threshold; overrides the
                        type-specific default when given.

        Returns list of dicts: {url, score, caption, [face_crop, folder]}
        """
        vec_list = self._as_list(query_dict["vector"])
        if query_dict["type"] == "face":
            return self._search_faces(query_dict, vec_list, top_k, min_score)
        return self._search_objects(vec_list, top_k, min_score)

    def _search_faces(self, query_dict: dict, vec_list: list,
                      top_k: int, min_score: Optional[float]) -> list:
        """Face search: adaptive threshold, per-image dedupe, raw→UI remap."""
        # ── V4 face search ────────────────────────────────────────
        # Adaptive threshold: high-quality faces are stricter
        det_score = query_dict.get("det_score", 1.0)
        threshold = (FACE_THRESHOLD_HIGH
                     if det_score >= HIGH_QUALITY_DET_SCORE
                     else FACE_THRESHOLD_LOW)
        if min_score is not None:
            threshold = min_score

        response = self.index_faces.query(
            vector=vec_list,
            top_k=top_k * 3,  # over-fetch, filter below
            include_metadata=True,
        )

        # Deduplicate by image_url — keep best score per image
        image_map = {}
        for match in response.get("matches", []):
            raw = match["score"]
            if raw < threshold:
                continue
            url = (match["metadata"].get("url")
                   or match["metadata"].get("image_url", ""))
            if not url:
                continue
            if url not in image_map or raw > image_map[url]["raw"]:
                image_map[url] = {
                    "raw": raw,
                    "face_crop": match["metadata"].get("face_crop", ""),
                    "folder": match["metadata"].get("folder", ""),
                }

        # Remap raw cosine → UI percentage (75%–99%)
        lo = FACE_THRESHOLD_LOW
        results = []
        for url, best in image_map.items():
            ui = round(min(0.99,
                           0.75 + ((best["raw"] - lo) / (1.0 - lo)) * 0.24), 4)
            results.append({
                "url": url,
                "score": ui,
                "raw_score": round(best["raw"], 4),
                "face_crop": best["face_crop"],
                "folder": best["folder"],
                "caption": "👤 Verified Identity Match",
            })
        return sorted(results, key=lambda x: x["score"], reverse=True)[:top_k]

    def _search_objects(self, vec_list: list, top_k: int,
                        min_score: Optional[float]) -> list:
        """Object search: fixed threshold, no dedupe or score remapping."""
        # ── V4 object search ──────────────────────────────────────
        threshold = min_score if min_score is not None else OBJECT_THRESHOLD
        response = self.index_objects.query(
            vector=vec_list, top_k=top_k, include_metadata=True)
        results = []
        for match in response.get("matches", []):
            if match["score"] < threshold:
                continue
            results.append({
                "url": (match["metadata"].get("url")
                        or match["metadata"].get("image_url", "")),
                "score": round(match["score"], 4),
                "folder": match["metadata"].get("folder", ""),
                "caption": "🎯 Visual & Semantic Match",
            })
        return results