"""
src/vector_store.py
────────────────────────────────────────────────────────────────────────────────
Responsible for:
1. Converting document chunks into vector embeddings
2. Storing them in a FAISS index (fast similarity search)
3. Persisting the index to disk (so you don't re-embed every time)
4. Loading an existing index from disk

What is an embedding?
An embedding is a numeric vector (list of floats) that represents the
semantic meaning of a text. Similar texts → close vectors in space.
This lets us find the most relevant document chunks for a user's question.

What is FAISS?
Facebook AI Similarity Search — an ultra-fast library to find the nearest
vectors to a query vector. Perfect for document retrieval.
"""
import os
from typing import List
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
def build_embedding_model(model_id: str) -> HuggingFaceEmbeddings:
    """Create a local sentence-transformer embedding model.

    The model runs entirely on this machine (no API call for embeddings),
    which makes it free, private, rate-limit-free, and fast for batch work.

    Args:
        model_id: Hugging Face model ID, e.g.
            "sentence-transformers/all-MiniLM-L6-v2"

    Returns:
        A HuggingFaceEmbeddings instance usable by LangChain.
    """
    print(f"[VectorStore] Loading embedding model: {model_id}")
    # normalize_embeddings=True produces unit vectors, so inner-product
    # search behaves like cosine similarity. Set device to "cuda" to use
    # a GPU instead of the CPU.
    return HuggingFaceEmbeddings(
        model_name=model_id,
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
def create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """Embed all document chunks into a new FAISS index and persist it.

    The index is written to disk so later sessions can reuse it without
    re-embedding everything.

    Args:
        chunks: Document chunks from document_loader.split_documents()
        embeddings: The embedding model to use.
        persist_path: Folder where the FAISS index will be saved.

    Returns:
        A FAISS vectorstore ready for similarity search.
    """
    print(f"[VectorStore] Embedding {len(chunks)} chunks... (this may take a moment)")
    index = FAISS.from_documents(chunks, embeddings)

    # Save to disk so the next startup can skip the embedding step entirely.
    os.makedirs(persist_path, exist_ok=True)
    index.save_local(persist_path)
    print(f"[VectorStore] Index saved to: {persist_path}")

    return index
def load_vectorstore(
    persist_path: str,
    embeddings: HuggingFaceEmbeddings,
) -> FAISS:
    """Restore a previously saved FAISS index from disk.

    Args:
        persist_path: Folder where the index was saved.
        embeddings: Must be the SAME embedding model used during creation.

    Returns:
        A FAISS vectorstore ready for similarity search.
    """
    print(f"[VectorStore] Loading existing index from: {persist_path}")
    # LangChain requires explicit opt-in to deserialize local pickle files.
    return FAISS.load_local(
        persist_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
def get_or_create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """
    Convenience function: loads existing index if available, else creates it.
    This avoids re-embedding documents on every restart.
    Args:
        chunks: Document chunks (only used if index doesn't exist yet).
        embeddings: Embedding model.
        persist_path: Where to save/load the FAISS index.
    Returns:
        A ready-to-use FAISS vectorstore.
    Raises:
        ValueError: If no saved index exists and `chunks` is empty, so a
            clear error is raised instead of a cryptic FAISS failure.
    """
    # FAISS.save_local writes BOTH index.faiss and index.pkl; require both
    # so a partially-written index falls through to a clean rebuild instead
    # of crashing inside load_local.
    index_complete = all(
        os.path.exists(os.path.join(persist_path, fname))
        for fname in ("index.faiss", "index.pkl")
    )
    if index_complete:
        return load_vectorstore(persist_path, embeddings)
    if not chunks:
        raise ValueError(
            f"No saved FAISS index found at {persist_path!r} and no chunks "
            "were provided to build one."
        )
    return create_vectorstore(chunks, embeddings, persist_path)