The example Python code just doesn't work anymore?

#9
by TheoryOfMadness - opened
"""
Test cross-modal text-to-image search with Nomic embeddings.

This test verifies that text embeddings with 'search_query:' prefix
can properly match against image embeddings.
"""
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
from PIL import Image
import requests
import sys
from pathlib import Path

# Prepend the directory two levels above this file to sys.path so that
# modules located there become importable.
# NOTE(review): nothing visible in this file imports from that directory --
# confirm the path tweak is still needed.
backend_dir = Path(__file__).parent.parent
sys.path.insert(0, str(backend_dir))


def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the mask.

    Args:
        model_output: Indexable model result whose element ``0`` holds the
            per-token embeddings (assumed ``(batch, seq, hidden)`` -- the
            reduction runs over dimension 1).
        attention_mask: Mask with one entry per token; it is broadcast over
            the trailing embedding dimension and cast to float.

    Returns:
        The mask-weighted sum of the token embeddings divided by the mask
        total. The divisor is clamped to at least ``1e-9`` so a fully
        masked row yields zeros instead of dividing by zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(1)
    token_counts = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts


def test_nomic_cross_modal_search():
    """Run an end-to-end text-to-image similarity check with the Nomic models.

    The function:
      1. loads ``nomic-ai/nomic-embed-vision-v1.5`` and its image processor;
      2. downloads one COCO validation image (labelled "cats" in the
         progress output);
      3. embeds the image as the L2-normalised first position of the vision
         model's ``last_hidden_state``;
      4. embeds three ``search_query:``-prefixed sentences with
         ``nomic-ai/nomic-embed-text-v1.5`` (mean pooling, layer norm, L2
         norm) and prints the dot product of each with the image embedding.

    Returns:
        bool: ``True`` when the better of the two cat-related queries scores
        strictly higher than the dog query, ``False`` otherwise.

    Raises:
        requests.exceptions.RequestException: if the image download times
            out, fails, or returns an HTTP error status.

    NOTE(review): the result is returned rather than asserted, so a test
    runner that ignores return values would not report a ``False`` outcome
    as a failure; the ``__main__`` guard is what turns it into an exit code.
    """
    print("\n" + "="*60)
    print("Testing Nomic Cross-Modal Text-to-Image Search")
    print("="*60)

    # Load vision model
    print("\n[1/4] Loading vision model...")
    processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
    vision_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
    vision_model.eval()
    print("✅ Vision model loaded")

    # Load image
    print("\n[2/4] Loading test image...")
    url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
    # A timeout keeps a stalled connection from hanging the run forever, and
    # raise_for_status() stops an HTTP error page from being handed to PIL.
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        image = Image.open(response.raw)
        # Force the pixel data to be read before the connection is released.
        image.load()
    print("✅ Image loaded (cats)")

    # Generate image embedding
    print("\n[3/4] Generating image embedding...")
    inputs = processor(image, return_tensors="pt")

    with torch.no_grad():
        img_emb = vision_model(**inputs).last_hidden_state
        # Only the first position of the sequence is used as the image vector.
        img_embeddings = F.normalize(img_emb[:, 0], p=2, dim=1)

    print(f"✅ Image embedding generated: shape={img_embeddings.shape}")

    # Load text model
    print("\n[4/4] Loading text model and generating query embeddings...")
    tokenizer = AutoTokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1.5')
    text_model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True)
    text_model.eval()

    # Keep the raw queries and the task prefix separate so the text that is
    # displayed never has to be recovered by string replacement.
    query_prefix = 'search_query: '
    queries = [
        'What are cute animals to cuddle with?',
        'What do cats look like?',
        'Dogs playing in a park',  # Negative example
    ]
    sentences = [query_prefix + query for query in queries]

    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        model_output = text_model(**encoded_input)

    # Apply mean pooling + layer norm + L2 norm
    text_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    text_embeddings = F.layer_norm(text_embeddings, normalized_shape=(text_embeddings.shape[1],))
    text_embeddings = F.normalize(text_embeddings, p=2, dim=1)

    print(f"✅ Text embeddings generated: shape={text_embeddings.shape}")

    # Compute similarity scores
    print("\n" + "="*60)
    print("Cross-Modal Similarity Scores")
    print("="*60)

    # Both sides are unit-length, so the dot product is the cosine similarity.
    scores = torch.matmul(img_embeddings, text_embeddings.T)
    # One image row only; pull the per-query scores out as plain floats once.
    query_scores = scores[0].tolist()

    # NOTE(review): the 0.3 / 0.1 labels are absolute cut-offs, while the
    # pass/fail verdict below depends only on the relative ranking -- confirm
    # the cut-offs suit the score range these models actually produce.
    for query_text, score in zip(queries, query_scores):
        print(f"\nQuery: {query_text}")
        print(f"Score: {score:.4f}")

        if score > 0.3:
            print("✅ GOOD - Strong similarity")
        elif score > 0.1:
            print("⚠️  MEDIUM - Moderate similarity")
        else:
            print("❌ LOW - Weak similarity")

    print("\n" + "="*60)
    print("Test complete!")
    print("="*60)

    # Verify that cat-related queries have higher scores than dog query
    cat_score = max(query_scores[0], query_scores[1])
    dog_score = query_scores[2]

    print(f"\nBest cat-related score: {cat_score:.4f}")
    print(f"Dog-related score: {dog_score:.4f}")

    if cat_score > dog_score:
        print("✅ PASS: Cat queries score higher than dog query")
        return True
    else:
        print("❌ FAIL: Expected cat queries to score higher")
        return False


if __name__ == "__main__":
    # Map the boolean verdict onto a process exit status: 0 on pass, 1 on fail.
    exit_code = 0 if test_nomic_cross_modal_search() else 1
    sys.exit(exit_code)

✅ Text embeddings generated: shape=torch.Size([3, 768])

============================================================
Cross-Modal Similarity Scores

Query: What are cute animals to cuddle with?
Score: 0.0751
❌ LOW - Weak similarity

Query: What do cats look like?
Score: 0.0684
❌ LOW - Weak similarity

Query: Dogs playing in a park
Score: 0.0535
❌ LOW - Weak similarity

============================================================
Test complete!

Sign up or log in to comment