File size: 4,229 Bytes
659d6ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
src/vector_store.py
───────────────────────────────────────────────────────────────────────────────
Responsible for:
  1. Converting document chunks into vector embeddings
  2. Storing them in a FAISS index (fast similarity search)
  3. Persisting the index to disk (so you don't re-embed every time)
  4. Loading an existing index from disk

What is an embedding?
  An embedding is a numeric vector (list of floats) that represents the
  semantic meaning of a text. Similar texts β†’ close vectors in space.
  This lets us find the most relevant document chunks for a user's question.

What is FAISS?
  Facebook AI Similarity Search β€” an ultra-fast library to find the nearest
  vectors to a query vector. Perfect for document retrieval.
"""

import os
from typing import List

from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings


def build_embedding_model(model_id: str) -> HuggingFaceEmbeddings:
    """
    Construct a local sentence-transformer embedding model.

    Because the model is downloaded once from Hugging Face and then executed
    locally, embedding is private, free, and not subject to API rate limits —
    which also makes it fast for batch processing.

    Args:
        model_id: Hugging Face model identifier, e.g.
            "sentence-transformers/all-MiniLM-L6-v2".

    Returns:
        A HuggingFaceEmbeddings instance usable by LangChain.
    """
    print(f"[VectorStore] Loading embedding model: {model_id}")
    return HuggingFaceEmbeddings(
        model_name=model_id,
        # CPU keeps this portable; switch to "cuda" on a GPU machine.
        model_kwargs={"device": "cpu"},
        # Unit-length vectors make inner-product search equal cosine similarity.
        encode_kwargs={"normalize_embeddings": True},
    )


def create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """
    Build a FAISS index from document chunks and persist it to disk.

    Saving the index means subsequent sessions can reload it instead of
    paying the embedding cost again.

    Args:
        chunks:       Chunks produced by document_loader.split_documents().
        embeddings:   Embedding model used to vectorize each chunk.
        persist_path: Directory in which the FAISS index is saved.

    Returns:
        The freshly built FAISS vectorstore, ready for similarity search.
    """
    print(f"[VectorStore] Embedding {len(chunks)} chunks... (this may take a moment)")
    store = FAISS.from_documents(chunks, embeddings)

    # Write the index out so the next startup can skip re-embedding entirely.
    os.makedirs(persist_path, exist_ok=True)
    store.save_local(persist_path)
    print(f"[VectorStore] Index saved to: {persist_path}")

    return store


def load_vectorstore(
    persist_path: str,
    embeddings: HuggingFaceEmbeddings,
) -> FAISS:
    """
    Restore a FAISS index that was previously saved to disk.

    Args:
        persist_path: Directory the index was written to.
        embeddings:   Must be the SAME embedding model used at creation time;
                      otherwise query vectors won't live in the stored space.

    Returns:
        The loaded FAISS vectorstore, ready for similarity search.
    """
    print(f"[VectorStore] Loading existing index from: {persist_path}")
    # LangChain refuses to unpickle local index files unless we explicitly
    # opt in — acceptable here because we created these files ourselves.
    return FAISS.load_local(
        persist_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )


def get_or_create_vectorstore(
    chunks: List[Document],
    embeddings: HuggingFaceEmbeddings,
    persist_path: str,
) -> FAISS:
    """
    Return a vectorstore, reusing a persisted index when one exists.

    Embedding is the expensive step, so this convenience wrapper pays that
    cost at most once per document set and reloads from disk afterwards.

    Args:
        chunks:       Document chunks (consulted only when building fresh).
        embeddings:   Embedding model.
        persist_path: Directory used to save/load the FAISS index.

    Returns:
        A ready-to-use FAISS vectorstore.
    """
    # save_local() is expected to produce an "index.faiss" file here, so its
    # presence serves as the marker that an index was already persisted.
    if not os.path.exists(os.path.join(persist_path, "index.faiss")):
        return create_vectorstore(chunks, embeddings, persist_path)
    return load_vectorstore(persist_path, embeddings)