# Smoke test: load the geolip-captionbert-8192 checkpoint, embed a word
# list, and report pairwise similarities between the embeddings.
from transformers import AutoModel, AutoTokenizer
import torch


REPO_ID = "AbstractPhil/geolip-captionbert-8192"


print("Loading model...")
# trust_remote_code is required because the repo ships a custom architecture.
# NOTE(review): this executes code downloaded from the hub — only acceptable
# for a repo you trust.
model = AutoModel.from_pretrained(REPO_ID, trust_remote_code=True)
model.eval()  # disable dropout/batch-norm updates for deterministic inference
print(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")


print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
print(f" Vocab: {tokenizer.vocab_size}")


# Probe words: gender terms, singular/plural variants, spatial relations,
# and arithmetic vocabulary.
# NOTE(review): "woman" appears twice (indices 2 and 7) — presumably an
# intentional self-similarity sanity check (that pair should score ~1.0);
# confirm it is not an accidental duplicate.
texts = [
    "girl",
    "boy",
    "woman",
    "man",
    "mans",
    "womens",
    "women",
    "woman",
    "adjacency",
    "adjacent",
    "nearby",
    "near",
    "away",
    "aways",
    "similar",
    "dissimilar",
    "solid",
    "liquid",
    "prophetic",
    "predictive",
    "similarity",
    "differentiation",
    "differential",
    "addition",
    "subtraction",
    "division",
    "multiplication",
]


inputs = tokenizer(texts, max_length=8192, padding=True,
                   truncation=True, return_tensors="pt")


with torch.no_grad():  # inference only — no autograd graph needed
    outputs = model(**inputs)


emb = outputs.last_hidden_state
# FIX: for standard token-level models, last_hidden_state is
# (batch, seq, hidden), and the pairwise matmul below would be malformed.
# Mean-pool over the sequence (attention-mask aware) so each text maps to a
# single vector. If this custom model already returns (batch, hidden), the
# guard is a no-op and behavior is unchanged.
if emb.dim() == 3:
    mask = inputs["attention_mask"].unsqueeze(-1).to(emb.dtype)
    emb = (emb * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

print(f"\n Output shape: {emb.shape}")
print(f" Norms: {emb.norm(dim=-1).tolist()}")


print(f"\n Pairwise cosine similarity:")
# FIX: the original printed raw dot products (emb @ emb.T) under the label
# "cosine similarity". Normalize rows first so the printed values are true
# cosines in [-1, 1] regardless of the embeddings' norms (the norms printed
# above show they are not guaranteed to be 1).
unit = torch.nn.functional.normalize(emb, dim=-1)
sim = unit @ unit.T
for i in range(len(texts)):
    for j in range(i + 1, len(texts)):
        print(f" [{i}]↔[{j}]: {sim[i,j]:.3f} ({texts[i][:40]}↔{texts[j][:40]})")


# Optional convenience API some custom checkpoints expose.
if hasattr(model, 'encode'):
    print(f"\n Testing encode() method...")
    e = model.encode(["Hello world", "Testing the encoder"])
    print(f" Shape: {e.shape}")
    # FIX: label says "Cosine" — compute an actual cosine rather than a dot
    # product, since encode() output is not guaranteed to be unit-norm.
    cos = torch.nn.functional.cosine_similarity(e[0], e[1], dim=-1)
    print(f" Cosine: {cos.item():.3f}")


print("\n✓ All tests passed")