"""
src/vector_store.py
───────────────────────────────────────────────────────────────────────────────
Responsible for:
  1. Converting document chunks into vector embeddings
  2. Storing them in a FAISS index (fast similarity search)
  3. Persisting the index to disk (so you don't re-embed every time)
  4. Loading an existing index from disk

What is an embedding?
  An embedding is a numeric vector (list of floats) that represents the
  semantic meaning of a text. Similar texts → close vectors in space.
  This lets us find the most relevant document chunks for a user's question.

What is FAISS?
  Facebook AI Similarity Search — an ultra-fast library to find the nearest
  vectors to a query vector. Perfect for document retrieval.
"""

import os
from typing import List

from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings


def build_embedding_model(model_id: str) -> HuggingFaceEmbeddings:
    """
    Load a sentence-transformer embedding model from Hugging Face.

    The model runs locally (no API call for embeddings), which means:
      - It's free and private
      - Fast for batch processing
      - No rate limits

    Args:
        model_id: Hugging Face model ID,
                  e.g. "sentence-transformers/all-MiniLM-L6-v2"

    Returns:
        HuggingFaceEmbeddings object usable by LangChain.
    """
    print(f"[VectorStore] Loading embedding model: {model_id}")
    embeddings = HuggingFaceEmbeddings(
        model_name=model_id,
        model_kwargs={"device": "cpu"},  # use "cuda" if you have a GPU
        # Unit-length vectors, so similarity ranking behaves like cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
    )
    return embeddings


def create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """
    Embed all document chunks and store them in a FAISS index.

    The index is saved to disk for reuse across sessions.

    Args:
        chunks: Document chunks from document_loader.split_documents().
            Must contain at least one chunk.
        embeddings: The embedding model to use.
        persist_path: Folder where the FAISS index will be saved
            (created if it does not exist).

    Returns:
        A FAISS vectorstore ready for similarity search.

    Raises:
        ValueError: If ``chunks`` is empty. An index cannot be built from
            zero vectors, and failing here gives a clear message instead of
            an obscure error from inside the FAISS wrapper.
    """
    if not chunks:
        raise ValueError(
            "Cannot create a vector store from an empty list of chunks."
        )

    print(f"[VectorStore] Embedding {len(chunks)} chunks... (this may take a moment)")
    vectorstore = FAISS.from_documents(chunks, embeddings)

    # Persist to disk so we don't need to re-embed on next startup
    os.makedirs(persist_path, exist_ok=True)
    vectorstore.save_local(persist_path)
    print(f"[VectorStore] Index saved to: {persist_path}")
    return vectorstore


def load_vectorstore(
    persist_path: str,
    embeddings: HuggingFaceEmbeddings,
) -> FAISS:
    """
    Load a previously saved FAISS index from disk.

    SECURITY: the saved index includes a pickled docstore, and unpickling can
    execute arbitrary code. Only call this on a ``persist_path`` that this
    application wrote itself — never on files obtained from an untrusted source.

    Args:
        persist_path: Folder where the index was saved.
        embeddings: Must be the SAME embedding model used during creation.

    Returns:
        A FAISS vectorstore ready for similarity search.
    """
    print(f"[VectorStore] Loading existing index from: {persist_path}")
    vectorstore = FAISS.load_local(
        persist_path,
        embeddings,
        # Explicit opt-in to pickle deserialization, which LangChain refuses
        # to do by default. Safe only because the index is produced locally
        # by create_vectorstore(); see the security note in the docstring.
        allow_dangerous_deserialization=True,
    )
    return vectorstore


def get_or_create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """
    Convenience function: loads existing index if available, else creates it.

    This avoids re-embedding documents on every restart.

    Args:
        chunks: Document chunks (only used if a complete index doesn't
            exist yet).
        embeddings: Embedding model.
        persist_path: Where to save/load the FAISS index.

    Returns:
        A ready-to-use FAISS vectorstore.

    Raises:
        ValueError: If the index has to be created and ``chunks`` is empty.
    """
    # A saved LangChain FAISS store consists of two files: the raw FAISS index
    # and the pickled docstore. Require both, so that a partially written or
    # half-deleted folder is rebuilt instead of failing during load.
    index_file = os.path.join(persist_path, "index.faiss")
    docstore_file = os.path.join(persist_path, "index.pkl")

    if os.path.isfile(index_file) and os.path.isfile(docstore_file):
        return load_vectorstore(persist_path, embeddings)
    return create_vectorstore(chunks, embeddings, persist_path)