"""Gradio demo comparing Naive, Optimized, and No-Compromise RAG pipelines.

On first launch (no FAISS index on disk) it runs the one-time initialization
script, then lazily constructs each RAG system the first time its tab is used.
"""

import os
import sys
import time

# Make repo-local packages (app/, scripts/) importable before anything that
# needs them — including the initialization import just below.
sys.path.append(os.path.dirname(__file__))

# One-time setup: build the FAISS index and cache if they don't exist yet.
if not os.path.exists("/app/data/faiss_index.bin"):
    print("šŸ”§ Initializing FAISS index and cache...")
    from scripts.initialize_rag import main as init_main
    init_main()
    print("āœ… Initialization complete.")

import gradio as gr

# Lazily-created singletons, shared across all requests.
_naive_rag = None
_optimized_rag = None
_no_compromise_rag = None
_embedding_model = None  # shared sentence-transformers model


def get_embedding_model():
    """Load the embedding model once and reuse it across all RAG classes."""
    global _embedding_model
    if _embedding_model is None:
        from sentence_transformers import SentenceTransformer
        print("Loading embedding model...")
        _embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    return _embedding_model


def get_naive():
    """Return the singleton NaiveRAG, constructing it on first use."""
    global _naive_rag
    if _naive_rag is None:
        from app.rag_naive import NaiveRAG
        print("Initializing Naive RAG...")
        # NOTE(review): NaiveRAG loads its own embedding model; if it grows a
        # set_embedding_model() hook, pass get_embedding_model() here to share.
        _naive_rag = NaiveRAG()
    return _naive_rag


def get_optimized():
    """Return the singleton OptimizedRAG, constructing it on first use."""
    global _optimized_rag
    if _optimized_rag is None:
        from app.rag_optimized import OptimizedRAG
        print("Initializing Optimized RAG...")
        _optimized_rag = OptimizedRAG()
    return _optimized_rag


def get_no_compromise():
    """Return the singleton NoCompromiseRAG, constructing it on first use."""
    global _no_compromise_rag
    if _no_compromise_rag is None:
        from app.no_compromise_rag import NoCompromiseRAG
        print("Initializing No-Compromise RAG...")
        _no_compromise_rag = NoCompromiseRAG()
    return _no_compromise_rag


def _run_query(get_rag, question):
    """Time one RAG query and format the 4-tuple the Gradio outputs expect.

    Returns (answer, latency_text, chunks_text, cache_hit_text); on any
    failure returns an error message with zeroed metrics so the UI never
    crashes on a backend exception.
    """
    try:
        rag = get_rag()
        start = time.perf_counter()
        answer, chunks_used, cache_hit = rag.query(question)
        latency = (time.perf_counter() - start) * 1000
        return answer, f"{latency:.1f} ms", str(chunks_used), "Yes" if cache_hit else "No"
    except Exception as e:
        return f"Error: {e}", "0 ms", "0", "No"


def query_naive(question):
    """Gradio handler: run *question* through the Naive RAG baseline."""
    return _run_query(get_naive, question)


def query_optimized(question):
    """Gradio handler: run *question* through the Optimized RAG."""
    return _run_query(get_optimized, question)


def query_no_compromise(question):
    """Gradio handler: run *question* through the No-Compromise RAG."""
    return _run_query(get_no_compromise, question)


def _build_tab(handler):
    """Lay out one question/answer panel and wire its Ask button to *handler*.

    Must be called inside an open gr.TabItem context; all three tabs share
    this identical layout.
    """
    with gr.Row():
        question = gr.Textbox(label="Your Question", lines=2)
        submit = gr.Button("Ask", variant="primary")
    with gr.Row():
        answer = gr.Textbox(label="Answer", lines=4)
    with gr.Row():
        latency = gr.Textbox(label="Latency", interactive=False)
        chunks = gr.Textbox(label="Chunks Used", interactive=False)
        cache = gr.Textbox(label="Cache Hit", interactive=False)
    submit.click(
        handler,
        inputs=question,
        outputs=[answer, latency, chunks, cache]
    )


# -------------------------------------------------------------------
# Build the Gradio interface
# -------------------------------------------------------------------
with gr.Blocks(title="RAG Latency Optimization", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # ⚔ RAG Latency Optimization
    ### Compare Naive, Optimized, and No‑Compromise RAG on CPU‑only hardware
    **Proven 2.7Ɨ speedup (247ms → 92ms)** – now interactive!
    """)

    with gr.Tabs():
        with gr.TabItem("🐢 Naive RAG (Baseline)"):
            _build_tab(query_naive)
        with gr.TabItem("⚔ Optimized RAG (Production)"):
            _build_tab(query_optimized)
        with gr.TabItem("šŸš€ No‑Compromise RAG (Max Speed)"):
            _build_tab(query_no_compromise)

    gr.Markdown("""
    ---
    **Architecture**: CPU‑only | **Embeddings**: `all-MiniLM-L6-v2` | **Vector Store**: FAISS
    **Caching**: SQLite (Optimized) + LRU memory | **Generation**: Simulated (real LLM can be plugged in)
    """)
if __name__ == "__main__":
    # Bind to all interfaces so the demo is reachable from outside a container.
    demo.launch(server_name="0.0.0.0", server_port=7860)