from fastapi import FastAPI, File, Request, UploadFile
from fastapi.templating import Jinja2Templates
from fastapi.responses import FileResponse, Response

from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json


app = FastAPI()

model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

# Derive the vector size from the model (1024 for mxbai-embed-large-v1) so the
# FAISS index always matches the embeddings it stores.
embedding_dimension = model.get_sentence_embedding_dimension()

index = faiss.IndexFlatL2(embedding_dimension)
documents = []

templates = Jinja2Templates(directory=".")


class EmbedRequest(BaseModel):
    texts: list[str]


class SearchRequest(BaseModel):
    text: str
    n: int = 5


@app.get("/")
def read_root(request: Request):
    return templates.TemplateResponse("index.html", {"request": request})
| | @app.post("/embed") |
| | def embed_strings(request: EmbedRequest): |
| | new_documents = request.texts |
| | print(f"Start embedding of {len(new_documents)} docs") |
| | batch_size = 20 |
| |
|
| | |
| | batches = [new_documents[i:i+batch_size] for i in range(0, len(new_documents), batch_size)] |
| |
|
| | |
| | new_embeddings = [] |
| | for batch in batches: |
| | batch_embeddings = model.encode(batch) |
| | new_embeddings.extend(batch_embeddings) |
| | print(f"embeded {batch_size} docs") |
| |
|
| | |
| | remaining_docs = len(new_documents) % batch_size |
| | print(f"embedind remaining {remaining_docs} docs") |
| | |
| | if remaining_docs > 0: |
| | remaining_batch = new_documents[-remaining_docs:] |
| | remaining_embeddings = model.encode(remaining_batch) |
| | new_embeddings.extend(remaining_embeddings) |
| |
|
| | index.add(np.array(new_embeddings)) |
| | new_size = index.ntotal |
| | documents.extend(new_documents) |
| | print(f"End embedding {len(new_documents)} docs, new DB size: {new_size}") |
| | return { |
| | "message": f"{len(new_documents)} new strings embedded and added to FAISS database. New size of the database: {new_size}" |
| | } |
| | |
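
# Illustrative /embed request (assuming the API is served locally on port 8000,
# e.g. with `uvicorn main:app`; the module name and port are assumptions):
#
#   curl -X POST http://localhost:8000/embed \
#        -H "Content-Type: application/json" \
#        -d '{"texts": ["first document", "second document"]}'
#
# The JSON response reports how many strings were embedded and the new index size.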


# Earlier, unbatched version of the /embed handler, kept for reference
# (not registered as a route).
def embed_strings_v0(request: EmbedRequest):
    new_documents = request.texts
    new_embeddings = model.encode(new_documents)
    index.add(np.array(new_embeddings))
    new_size = index.ntotal
    documents.extend(new_documents)
    return {
        "message": f"{len(new_documents)} new strings embedded and added to FAISS database. New size of the database: {new_size}"
    }
| | @app.post("/search") |
| | def search_string(request: SearchRequest): |
| | embedding = model.encode([request.text]) |
| | distances, indices = index.search(np.array(embedding), request.n) |
| |
|
| | |
| | found_documents = [documents[i] for i in indices[0]] |
| |
|
| | return { |
| | "distances": distances[0].tolist(), |
| | "indices": indices[0].tolist(), |
| | "documents": found_documents |
| | } |
| |
|
| | |
| | |
| | |
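
# Illustrative /search request (same local-server assumption as above):
#
#   curl -X POST http://localhost:8000/search \
#        -H "Content-Type: application/json" \
#        -d '{"text": "example query", "n": 3}'
#
# The response holds parallel lists of distances, index positions, and the
# matching stored documents.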
| | @app.get("/admin/database/length") |
| | def get_database_length(): |
| | return {"length": index.ntotal} |
| |
|
| | @app.post("/admin/database/reset") |
| | def reset_database(): |
| | global index |
| | global documents |
| | index = faiss.IndexFlatL2(embedding_dimension) |
| | documents = [] |
| | return {"message": "Database reset"} |
| |
|
| | @app.get("/admin/documents/download") |
| | def download_documents(): |
| | |
| | documents_json = json.dumps({"texts": documents}) |
| |
|
| | |
| | response = Response(content=documents_json, media_type="application/json") |
| |
|
| | |
| | response.headers["Content-Disposition"] = "attachment; filename=documents.json" |
| |
|
| | return response |
| |
|
| | @app.post("/admin/documents/upload") |
| | def upload_documents(file: UploadFile = File(...)): |
| | |
| | contents = file.file.read() |
| |
|
| | |
| | data = json.loads(contents) |
| |
|
| | |
| | new_documents = data["texts"] |
| |
|
| | |
| | documents.extend(new_documents) |
| |
|
| | return {"message": f"{len(new_documents)} new documents uploaded"} |
| |
|
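
# Illustrative upload, where documents.json is assumed to contain
# {"texts": ["doc one", "doc two"]}:
#
#   curl -X POST http://localhost:8000/admin/documents/upload \
#        -F "file=@documents.json"
#
# The same multipart form works for /admin/documents/embed and
# /admin/database/upload.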
| | @app.post("/admin/documents/embed") |
| | def embed_documents(file: UploadFile = File(...)): |
| | |
| | contents = file.file.read() |
| |
|
| | |
| | data = json.loads(contents) |
| |
|
| | |
| | new_documents = data["texts"] |
| |
|
| | |
| | new_embeddings = model.encode(new_documents) |
| | index.add(np.array(new_embeddings)) |
| |
|
| | |
| | documents.extend(new_documents) |
| |
|
| | return {"message": f"{len(new_documents)} new documents uploaded and embedded"} |
| |
|
| |
|
| | @app.get("/admin/database/download") |
| | def download_database(): |
| | |
| | faiss.write_index(index, "database.index") |
| |
|
| | |
| | response = FileResponse("database.index", media_type="application/octet-stream") |
| |
|
| | |
| | response.headers["Content-Disposition"] = "attachment; filename=database.index" |
| |
|
| | return response |
| |
|
| |
|
| | @app.post("/admin/database/upload") |
| | def upload_database(file: UploadFile = File(...)): |
| | |
| | |
| |
|
| | |
| | with open(file.filename, "wb") as f: |
| | f.write(file.file.read()) |
| |
|
| | |
| | global index |
| | index = faiss.read_index(file.filename) |
| |
|
| | return {"message": f"Database uploaded with {index.ntotal} embeddings"} |
| |
|
| |
|
| |
|
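
# Illustrative round trip for the index file (curl -OJ saves it under the name
# from the Content-Disposition header, i.e. database.index):
#
#   curl -OJ http://localhost:8000/admin/database/download
#   curl -X POST http://localhost:8000/admin/database/upload \
#        -F "file=@database.index"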


# Earlier draft of the database upload, kept for reference (not registered as a
# route). A FAISS index stores only vectors: reconstruct_n() returns the raw
# embeddings, not the original texts, so it cannot rebuild the documents list.
def upload_database_1(file: UploadFile = File(...)):
    with open(file.filename, "wb") as f:
        f.write(file.file.read())

    global index
    index = faiss.read_index(file.filename)

    global documents
    # These are embedding vectors, not the uploaded document texts.
    documents = index.reconstruct_n(0, index.ntotal).tolist()

    return {"message": f"Database uploaded with {len(documents)} documents"}


# Earliest draft: loads the index directly from the uploaded bytes without
# writing them to disk (also not registered as a route).
def upload_database_0(file: UploadFile = File(...)):
    contents = file.file.read()

    global index
    # faiss.deserialize_index reads an index from an in-memory uint8 buffer.
    index = faiss.deserialize_index(np.frombuffer(contents, dtype=np.uint8))

    return {"message": f"Database uploaded with {index.ntotal} embeddings"}
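
# Minimal entry point sketch, assuming this module is saved as main.py; the app
# can equally be started with `uvicorn main:app --reload`.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)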