"""Pre-train embeddings using gensim's optimized Word2Vec on full text8. This is the canonical script for generating pretrained_W.npy and pretrained_vocab.json. Run locally (not on HF Space): pip install gensim numpy python pretrain_gensim.py gensim is a dev-only dependency — the Space runtime uses only numpy, gradio, plotly, and scikit-learn (see requirements.txt). """ import json import numpy as np from gensim.models import Word2Vec from microembeddings import load_text8 # Load full text8 corpus (17M words) words = load_text8(max_words=17_000_000) # gensim expects list of sentences — split into ~1000-word chunks sentences = [words[i:i+1000] for i in range(0, len(words), 1000)] print(f"Training on {len(words)} words, {len(sentences)} sentences...") model = Word2Vec( sentences, vector_size=50, window=5, min_count=5, sg=1, # skip-gram negative=5, epochs=5, workers=4, max_final_vocab=10000, ) # Export to our format vocab_list = list(model.wv.index_to_key) W = np.array([model.wv[w] for w in vocab_list], dtype=np.float32) np.save("pretrained_W.npy", W) with open("pretrained_vocab.json", "w") as f: json.dump({"vocab": vocab_list, "losses": []}, f) print(f"Saved: {W.shape[0]} words x {W.shape[1]} dims") # Quick quality check from microembeddings import normalize, most_similar, analogy W_norm = normalize(W) word2idx = {w: i for i, w in enumerate(vocab_list)} idx2word = {i: w for i, w in enumerate(vocab_list)} print("\n--- Nearest Neighbors ---") for word in ["king", "france", "dog", "computer"]: neighbors = most_similar(word, W_norm, word2idx, idx2word, topn=5) print(f"{word}: {', '.join(f'{w} ({s:.3f})' for w, s in neighbors)}") print("\n--- Analogies ---") for a, b, c in [("man", "king", "woman"), ("france", "paris", "germany"), ("big", "bigger", "small")]: results = analogy(a, b, c, W_norm, word2idx, idx2word) ans = results[0][0] if results else "?" print(f"{a} : {b} :: {c} : {ans}")