# Smoke test: load the geolip-captionbert-8192 checkpoint, embed a word
# list, and report pairwise similarities between the embeddings.
from transformers import AutoModel, AutoTokenizer
import torch


REPO_ID = "AbstractPhil/geolip-captionbert-8192"


print("Loading model...")
# trust_remote_code is required because the repo ships a custom architecture.
# NOTE(review): this executes code downloaded from the hub — only acceptable
# for a repo you trust.
model = AutoModel.from_pretrained(REPO_ID, trust_remote_code=True)
model.eval()  # disable dropout/batch-norm updates for deterministic inference
print(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")


print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
print(f" Vocab: {tokenizer.vocab_size}")


# Probe words: gender terms, singular/plural variants, spatial relations,
# and arithmetic vocabulary.
# NOTE(review): "woman" appears twice (indices 2 and 7) — presumably an
# intentional self-similarity sanity check (that pair should score ~1.0);
# confirm it is not an accidental duplicate.
texts = [
    "girl",
    "boy",
    "woman",
    "man",
    "mans",
    "womens",
    "women",
    "woman",
    "adjacency",
    "adjacent",
    "nearby",
    "near",
    "away",
    "aways",
    "similar",
    "dissimilar",
    "solid",
    "liquid",
    "prophetic",
    "predictive",
    "similarity",
    "differentiation",
    "differential",
    "addition",
    "subtraction",
    "division",
    "multiplication",
]


inputs = tokenizer(texts, max_length=8192, padding=True,
                   truncation=True, return_tensors="pt")


with torch.no_grad():  # inference only — no autograd graph needed
    outputs = model(**inputs)


emb = outputs.last_hidden_state
# FIX: for standard token-level models, last_hidden_state is
# (batch, seq, hidden), and the pairwise matmul below would be malformed.
# Mean-pool over the sequence (attention-mask aware) so each text maps to a
# single vector. If this custom model already returns (batch, hidden), the
# guard is a no-op and behavior is unchanged.
if emb.dim() == 3:
    mask = inputs["attention_mask"].unsqueeze(-1).to(emb.dtype)
    emb = (emb * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

print(f"\n Output shape: {emb.shape}")
print(f" Norms: {emb.norm(dim=-1).tolist()}")


print(f"\n Pairwise cosine similarity:")
# FIX: the original printed raw dot products (emb @ emb.T) under the label
# "cosine similarity". Normalize rows first so the printed values are true
# cosines in [-1, 1] regardless of the embeddings' norms (the norms printed
# above show they are not guaranteed to be 1).
unit = torch.nn.functional.normalize(emb, dim=-1)
sim = unit @ unit.T
for i in range(len(texts)):
    for j in range(i + 1, len(texts)):
        print(f" [{i}]↔[{j}]: {sim[i,j]:.3f} ({texts[i][:40]}↔{texts[j][:40]})")


# Optional convenience API some custom checkpoints expose.
if hasattr(model, 'encode'):
    print(f"\n Testing encode() method...")
    e = model.encode(["Hello world", "Testing the encoder"])
    print(f" Shape: {e.shape}")
    # FIX: label says "Cosine" — compute an actual cosine rather than a dot
    # product, since encode() output is not guaranteed to be unit-norm.
    cos = torch.nn.functional.cosine_similarity(e[0], e[1], dim=-1)
    print(f" Cosine: {cos.item():.3f}")


print("\n✓ All tests passed")