gregtatum committed
Commit c133d59 · 1 Parent(s): 08d598e
Files changed (1)
  1. scripts/build_models.py +238 -32
scripts/build_models.py CHANGED
@@ -1,3 +1,6 @@
+import shutil
+from textwrap import dedent
+from typing import Any
 import numpy as np
 from zstandard import ZstdCompressor
 from pathlib import Path
@@ -5,62 +8,265 @@ import io
 from sentence_transformers import SentenceTransformer
 from torch.nn import EmbeddingBag
 import torch
+from model2vec import StaticModel
+from tokenizers import Tokenizer

+models_path = Path("models")


+def zst_compress_file(input: Path):
+    cctx = ZstdCompressor()
+    output = input.parent / f"{input.name}.zst"
+    print(f"Compressing {output}")
+    with open(input, "rb") as fin, open(output, "wb") as fout:
+        cctx.copy_stream(fin, fout)


 def save_data(path: Path, tensor: torch.Tensor):
-    """Writes out the static embeddings to a .npy.zst file"""
-    assert str(path).endswith(".npy.zst")
+    """Writes out the static embeddings to a .npy and .npy.zst file"""
     buffer = io.BytesIO()
-    np.save(buffer, tensor.detach().numpy())

-    with (
-        open(path, "wb") as outfile,
-        ZstdCompressor().stream_writer(outfile) as writer,
-    ):
-        writer.write(buffer.getvalue())
+    if tensor.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
+        # Store as the raw bytes.
+        np.save(buffer, tensor.detach().view(torch.uint8).numpy())
+    else:
+        np.save(buffer, tensor.detach().numpy())

+    print(f"Saving {path}")
+    with (open(path, "wb") as outfile,):
+        outfile.write(buffer.getvalue())

+    zst_compress_file(path)


-model_path = Path("model")
-model_name = "sentence-transformers/static-similarity-mrl-multilingual-v1"
-vocab_size = 105_879
-dimensions = 1024


-def load_embeddings():
-    model = SentenceTransformer(model_name, device="cpu")
-    embedding_bag: EmbeddingBag = model[0].embedding  # type: ignore
-    embeddings = torch.Tensor(embedding_bag.weight)

-    print(embeddings.shape)
-    assert embeddings.shape == torch.Size([vocab_size, dimensions])

-    print("float32")
-    print(f" 1024 dim - {embeddings.shape[0] * 1024 * 4 / 1024 / 1024:,.1f} MiB")
-    print(f" 512 dim - {embeddings.shape[0] * 512 * 4 / 1024 / 1024:,.1f} MiB")
-    print(f" 256 dim - {embeddings.shape[0] * 256 * 4 / 1024 / 1024:,.1f} MiB")

-    print("float16")
-    print(f" 1024 dim - {embeddings.shape[0] * 1024 * 2 / 1024 / 1024:,.1f} MiB")
-    print(f" 512 dim - {embeddings.shape[0] * 512 * 2 / 1024 / 1024:,.1f} MiB")
-    print(f" 256 dim - {embeddings.shape[0] * 256 * 2 / 1024 / 1024:,.1f} MiB")
+def quantization_loss_mse(tensor: torch.Tensor, dtype: torch.dtype):
+    """
+    Compute reconstruction loss when converting embeddings to a datatype and back using
+    the mean squared error, which punishes big errors more than small ones.
+    """

+    # Original → quantize → dequantize
+    roundtrip = tensor.detach().to(dtype).to(tensor.dtype)

+    # Mean squared error
+    return torch.mean((tensor - roundtrip) ** 2).item()


+def quantization_loss_mae(tensor: torch.Tensor, dtype: torch.dtype):
+    """
+    Compute reconstruction loss when converting embeddings to a datatype and back using
+    the mean absolute error, which is less sensitive to outliers than MSE.
+    """

+    # Original → quantize → dequantize
+    roundtrip = tensor.detach().to(dtype).to(tensor.dtype)

+    # Mean absolute error
+    return torch.mean(torch.abs(tensor - roundtrip)).item()


+def quantization_loss_cosine(tensor: torch.Tensor, dtype: torch.dtype):
+    """
+    Compute reconstruction loss when converting embeddings to a datatype and back using
+    cosine similarity. This measures whether the embedding directions are preserved
+    after quantization, independent of their magnitudes.
+    """

+    # Original → quantize → dequantize
+    roundtrip = tensor.detach().to(dtype).to(tensor.dtype)

+    # Flatten both to 2D (num_vectors, dimensions) in case tensor is 1D or higher-D
+    if tensor.ndim == 1:
+        orig = tensor.unsqueeze(0)
+        recon = roundtrip.unsqueeze(0)
+    else:
+        orig = tensor.view(tensor.shape[0], -1)
+        recon = roundtrip.view(roundtrip.shape[0], -1)

+    # Cosine similarity per vector, then average
+    cos = torch.nn.functional.cosine_similarity(orig, recon, dim=1)
+    return cos.mean().item()


+def export_embeddings(
+    hf_org: str, hf_repo: str, model_path: Path, embeddings: torch.Tensor
+) -> None:
+    vocab_size, dimensions = embeddings.shape

+    # This logic can always be adjusted for models with different shapes.
+    assert (
+        embeddings.dtype == torch.float32
+    ), f"The embeddings {embeddings.dtype} are assumed to be float32."
+    assert (
+        dimensions <= 1024
+    ), f"The embedding {dimensions} dimension is assumed to be at most 1024."

+    norms = torch.norm(embeddings, dim=1)  # shape: [vocab_size]

+    print(f" - vocab size {vocab_size:,.0f}")
+    print(f" - embedding dimension {dimensions:,.0f}")
+    print(f" - vector length (mean): {norms.mean().item():.2f}")
+    print(f" - vector length (median): {norms.median().item():.2f}")
+    print(f" - stddev ±{norms.std().item():.2f}")
+    print(f" - value (mean): {embeddings.mean().item():.2f}")
+    print(f" - value (median): {embeddings.median().item():.2f}")
+    print(f" - stddev ±{embeddings.std().item():.2f}")

+    model_path.mkdir(exist_ok=True, parents=True)

+    with (model_path / "README.md").open("wt") as file:
+        file.write(
+            dedent(
+                f"""
+                # [{hf_org}/{hf_repo}](https://huggingface.co/{hf_org}/{hf_repo})

+                Beyond the vocab size and embedding size, these are stats for the length
+                of the vectors and the distribution of the values.

+                | item | metric | value |
+                | --------------| ----------------------- | ----- |
+                | vocab | size | {vocab_size:,.0f} |
+                | embedding | dimensions | {dimensions:,.0f} |
+                | vector length | mean | {norms.mean().item():.2f} |
+                | vector length | median | {norms.median().item():.2f} |
+                | vector length | stddev | {norms.std().item():.2f} |
+                | values | mean | {embeddings.mean().item():.2f} |
+                | values | median | {embeddings.median().item():.2f} |
+                | values | stddev | {embeddings.std().item():.2f} |

+                ## Quantization Loss

+                | Precision | Metric | Value |
+                | ------------- | ------ | ----- |
+                | fp16 | mse | {quantization_loss_mse(embeddings, torch.float16):.5f} |
+                | fp8 e4m3 | mse | {quantization_loss_mse(embeddings, torch.float8_e4m3fn):.5f} |
+                | fp8 e5m2 | mse | {quantization_loss_mse(embeddings, torch.float8_e5m2):.5f} |
+                | fp16 | mae | {quantization_loss_mae(embeddings, torch.float16):.5f} |
+                | fp8 e4m3 | mae | {quantization_loss_mae(embeddings, torch.float8_e4m3fn):.5f} |
+                | fp8 e5m2 | mae | {quantization_loss_mae(embeddings, torch.float8_e5m2):.5f} |
+                | fp16 | cosine | {quantization_loss_cosine(embeddings, torch.float16):.5f} |
+                | fp8 e4m3 | cosine | {quantization_loss_cosine(embeddings, torch.float8_e4m3fn):.5f} |
+                | fp8 e5m2 | cosine | {quantization_loss_cosine(embeddings, torch.float8_e5m2):.5f} |

+                When embeddings are quantized to lower precision (e.g. FP8) and then dequantized
+                back to `float32`, some information is inevitably lost. To measure how much the
+                quantized embeddings differ from the originals, we report three complementary
+                metrics:

+                - **MSE (Mean Squared Error)** — emphasizes large errors by squaring the
+                  differences. Useful for detecting whether any values are badly distorted.
+                - **MAE (Mean Absolute Error)** — the average absolute difference between
+                  original and quantized values. Easier to interpret, less sensitive to outliers.
+                - **Cosine Similarity** — measures how well the *direction* of embedding vectors
+                  is preserved after quantization, independent of scale. This is especially
+                  relevant when embeddings are used for similarity search or retrieval.

+                Together, these metrics provide a more complete picture of quantization quality
+                than any one alone.

+                ### Interpreting Quantization Loss

+                - **Cosine similarity** is the most important metric for embedding use-cases
+                  such as similarity search, clustering, or retrieval. Values close to 1.0
+                  mean that embedding directions are preserved after quantization, so model
+                  quality is likely to hold up.

+                - **MSE and MAE** measure raw element-wise reconstruction error. They provide
+                  a sense of how much the numerical values change, but these shifts often have
+                  limited impact on cosine similarity once embeddings are pooled and
+                  normalized.

+                - **FP16** is effectively lossless and can be treated as a baseline.

+                - **FP8 E4M3** typically offers better precision (lower MSE/MAE) when values
+                  stay within a moderate range, making it a strong default for static
+                  embeddings.

+                - **FP8 E5M2** trades some precision for greater dynamic range. It can be
+                  preferable if embeddings occasionally contain very large values, but it will
+                  usually show higher MSE/MAE than E4M3.

+                In practice, if cosine similarity remains very close to 1.0, quantization is
+                unlikely to harm downstream tasks, even if MSE/MAE look relatively large.
+                """
+            ).strip()
+        )

     for dim in (1024, 512, 384, 256, 128):
+        if dim > dimensions:
+            print(f"Skipping output of {dim} as the max dimension is {dimensions}")
+            continue

         truncated = embeddings[:, :dim]
         assert truncated.shape == torch.Size([vocab_size, dim])

-        save_data(model_path / f"static-embeddings.{dim}.fp32.npy.zst", embeddings)
+        save_data(model_path / f"fp32.d{dim}.npy", truncated)
+        save_data(
+            model_path / f"fp16.d{dim}.npy",
+            truncated.to(dtype=torch.float16),
+        )
         save_data(
-            model_path / f"static-embeddings.{dim}.fp16.npy.zst",
-            embeddings.to(dtype=torch.float16),
+            model_path / f"fp8_e5m2.d{dim}.npy",
+            truncated.to(dtype=torch.float8_e5m2),
         )
         save_data(
-            model_path / f"static-embeddings.{dim}.int8.npy.zst",
-            embeddings.to(dtype=torch.int8),
+            model_path / f"fp8_e4m3.d{dim}.npy",
+            truncated.to(dtype=torch.float8_e4m3fn),
         )


+def export_tokenizer(model_path: Path, tokenizer: Tokenizer) -> None:
+    tokenizer_path = model_path / "tokenizer.json"
+    print(f"Exporting tokenizer: {tokenizer_path}")
+    tokenizer.save(str(tokenizer_path))
+    zst_compress_file(tokenizer_path)


+def export_sentence_transformers(hf_org: str, hf_repo: str) -> None:
+    """Extract the embeddings and tokenizer from SentenceTransformers"""

+    model_name = f"{hf_org}/{hf_repo}"
+    print("Processing", model_name)

+    model = SentenceTransformer(f"{hf_org}/{hf_repo}", device="cpu")
+    embedding_bag: EmbeddingBag = model[0].embedding  # type: ignore
+    model_path = models_path / hf_org / hf_repo

+    export_embeddings(hf_org, hf_repo, model_path, torch.Tensor(embedding_bag.weight))
+    export_tokenizer(model_path, model.tokenizer)


+def export_model2vec(hf_org: str, hf_repo: str) -> None:
+    """Extract the embeddings and tokenizer from model2vec"""

+    model = StaticModel.from_pretrained(f"{hf_org}/{hf_repo}")
+    model_path = models_path / hf_org / hf_repo
+    export_embeddings(hf_org, hf_repo, model_path, torch.from_numpy(model.embedding))
+    export_tokenizer(model_path, model.tokenizer)


 def main() -> None:
-    load_embeddings()
+    # Static embedders that use sentence_transformers models.
+    sentence_transformers_models = [
+        ("sentence-transformers", "static-similarity-mrl-multilingual-v1"),
+        ("sentence-transformers", "static-retrieval-mrl-en-v1"),
+    ]
+    # Static embedders that use model2vec.
+    model2vec_models = [
+        ("minishlab", "potion-multilingual-128M"),
+        ("minishlab", "potion-retrieval-32M"),
+    ]

+    if models_path.exists():
+        print(f"Removing the old models folder: {models_path}")
+        shutil.rmtree(models_path)
+    models_path.mkdir()

+    for hf_org, hf_repo in sentence_transformers_models:
+        export_sentence_transformers(hf_org, hf_repo)

+    for hf_org, hf_repo in model2vec_models:
+        export_model2vec(hf_org, hf_repo)


 if __name__ == "__main__":
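
Not part of the commit, but useful context for consumers of the exported artifacts: a minimal sketch of reading one of the .npy.zst files back into a tensor, assuming the models/&lt;org&gt;/&lt;repo&gt;/&lt;precision&gt;.d&lt;dim&gt;.npy.zst layout written by export_embeddings above. The helper name read_embeddings and the example paths are illustrative only. The fp8 exports were written as raw uint8 bytes (see save_data), so they have to be re-viewed as the matching float8 dtype after loading.

import io
from pathlib import Path

import numpy as np
import torch
from zstandard import ZstdDecompressor


def read_embeddings(path: Path, fp8_dtype: torch.dtype | None = None) -> torch.Tensor:
    """Decompress a .npy.zst file and return the embeddings as a torch.Tensor."""
    with open(path, "rb") as infile:
        # Decompress the whole payload, then hand the .npy bytes to numpy.
        raw = ZstdDecompressor().stream_reader(infile).read()
    tensor = torch.from_numpy(np.load(io.BytesIO(raw)))

    if fp8_dtype is not None:
        # The fp8 files hold raw uint8 bytes; reinterpret them and upcast to float32.
        tensor = tensor.view(fp8_dtype).to(torch.float32)

    return tensor


# Hypothetical usage against the layout produced by this script.
base = Path("models/sentence-transformers/static-similarity-mrl-multilingual-v1")
fp16_256 = read_embeddings(base / "fp16.d256.npy.zst")
fp8_256 = read_embeddings(base / "fp8_e4m3.d256.npy.zst", fp8_dtype=torch.float8_e4m3fn)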
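
Also not from the commit: a quick sketch for previewing the three round-trip metrics that feed the generated README tables, by calling the helpers defined above. It assumes the snippet is run from the scripts/ directory so build_models is importable, and it uses a random tensor as a stand-in for a real [vocab_size, dimensions] embedding table.

import torch

from build_models import (
    quantization_loss_cosine,
    quantization_loss_mae,
    quantization_loss_mse,
)

# Stand-in data; swap in real embeddings to reproduce the README numbers.
embeddings = torch.randn(1_000, 256)

for label, dtype in [
    ("fp16", torch.float16),
    ("fp8 e4m3", torch.float8_e4m3fn),
    ("fp8 e5m2", torch.float8_e5m2),
]:
    mse = quantization_loss_mse(embeddings, dtype)
    mae = quantization_loss_mae(embeddings, dtype)
    cosine = quantization_loss_cosine(embeddings, dtype)
    print(f"{label:>8}: mse={mse:.5f} mae={mae:.5f} cosine={cosine:.5f}")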
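
Finally, a tiny sketch (again, not part of the commit) that backs the E4M3 vs. E5M2 discussion in the generated README: torch.finfo reports the dynamic range and precision of each float8 format, which is why E4M3 usually wins on MSE/MAE while E5M2 tolerates larger magnitudes.

import torch

for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    # E4M3 spends more bits on the mantissa (precision), E5M2 on the exponent (range).
    print(f"{str(dtype):>22}: max={info.max} eps={info.eps}")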