"""Gradio demo comparing Naive, Optimized, and No-Compromise RAG pipelines.

On first launch (no FAISS index on disk) it runs the one-time initialization
script, then lazily constructs each RAG system the first time its tab is used.
"""

import os
import sys
import time

# Make repo-local packages (app/, scripts/) importable before anything that
# needs them — including the initialization import just below.
sys.path.append(os.path.dirname(__file__))

# One-time setup: build the FAISS index and cache if they don't exist yet.
if not os.path.exists("/app/data/faiss_index.bin"):
    print("šŸ”§ Initializing FAISS index and cache...")
    from scripts.initialize_rag import main as init_main
    init_main()
    print("āœ… Initialization complete.")

import gradio as gr

# Lazily-created singletons, shared across all requests.
_naive_rag = None
_optimized_rag = None
_no_compromise_rag = None
_embedding_model = None  # shared sentence-transformers model


def get_embedding_model():
    """Load the embedding model once and reuse it across all RAG classes."""
    global _embedding_model
    if _embedding_model is None:
        from sentence_transformers import SentenceTransformer
        print("Loading embedding model...")
        _embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    return _embedding_model


def get_naive():
    """Return the singleton NaiveRAG, constructing it on first use."""
    global _naive_rag
    if _naive_rag is None:
        from app.rag_naive import NaiveRAG
        print("Initializing Naive RAG...")
        # NOTE(review): NaiveRAG loads its own embedding model; if it grows a
        # set_embedding_model() hook, pass get_embedding_model() here to share.
        _naive_rag = NaiveRAG()
    return _naive_rag


def get_optimized():
    """Return the singleton OptimizedRAG, constructing it on first use."""
    global _optimized_rag
    if _optimized_rag is None:
        from app.rag_optimized import OptimizedRAG
        print("Initializing Optimized RAG...")
        _optimized_rag = OptimizedRAG()
    return _optimized_rag


def get_no_compromise():
    """Return the singleton NoCompromiseRAG, constructing it on first use."""
    global _no_compromise_rag
    if _no_compromise_rag is None:
        from app.no_compromise_rag import NoCompromiseRAG
        print("Initializing No-Compromise RAG...")
        _no_compromise_rag = NoCompromiseRAG()
    return _no_compromise_rag


def _run_query(get_rag, question):
    """Time one RAG query and format the 4-tuple the Gradio outputs expect.

    Returns (answer, latency_text, chunks_text, cache_hit_text); on any
    failure returns an error message with zeroed metrics so the UI never
    crashes on a backend exception.
    """
    try:
        rag = get_rag()
        start = time.perf_counter()
        answer, chunks_used, cache_hit = rag.query(question)
        latency = (time.perf_counter() - start) * 1000
        return answer, f"{latency:.1f} ms", str(chunks_used), "Yes" if cache_hit else "No"
    except Exception as e:
        return f"Error: {e}", "0 ms", "0", "No"


def query_naive(question):
    """Gradio handler: run *question* through the Naive RAG baseline."""
    return _run_query(get_naive, question)


def query_optimized(question):
    """Gradio handler: run *question* through the Optimized RAG."""
    return _run_query(get_optimized, question)


def query_no_compromise(question):
    """Gradio handler: run *question* through the No-Compromise RAG."""
    return _run_query(get_no_compromise, question)


def _build_tab(handler):
    """Lay out one question/answer panel and wire its Ask button to *handler*.

    Must be called inside an open gr.TabItem context; all three tabs share
    this identical layout.
    """
    with gr.Row():
        question = gr.Textbox(label="Your Question", lines=2)
        submit = gr.Button("Ask", variant="primary")
    with gr.Row():
        answer = gr.Textbox(label="Answer", lines=4)
    with gr.Row():
        latency = gr.Textbox(label="Latency", interactive=False)
        chunks = gr.Textbox(label="Chunks Used", interactive=False)
        cache = gr.Textbox(label="Cache Hit", interactive=False)
    submit.click(
        handler,
        inputs=question,
        outputs=[answer, latency, chunks, cache]
    )


# -------------------------------------------------------------------
# Build the Gradio interface
# -------------------------------------------------------------------
with gr.Blocks(title="RAG Latency Optimization", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # ⚔ RAG Latency Optimization
    ### Compare Naive, Optimized, and No‑Compromise RAG on CPU‑only hardware
    **Proven 2.7Ɨ speedup (247ms → 92ms)** – now interactive!
    """)

    with gr.Tabs():
        with gr.TabItem("🐢 Naive RAG (Baseline)"):
            _build_tab(query_naive)
        with gr.TabItem("⚔ Optimized RAG (Production)"):
            _build_tab(query_optimized)
        with gr.TabItem("šŸš€ No‑Compromise RAG (Max Speed)"):
            _build_tab(query_no_compromise)

    gr.Markdown("""
    ---
    **Architecture**: CPU‑only | **Embeddings**: `all-MiniLM-L6-v2` | **Vector Store**: FAISS
    **Caching**: SQLite (Optimized) + LRU memory | **Generation**: Simulated (real LLM can be plugged in)
    """)
if __name__ == "__main__":
    # Bind to all interfaces so the demo is reachable from outside a container.
    demo.launch(server_name="0.0.0.0", server_port=7860)