Spaces:

parthtamu
/

rag-code-assistant

Sleeping

File size: 8,660 Bytes

2eb3831
 
 
 
 
55bac67
 
 
2eb3831
 
 
 
 
dcac338
143bd7b
dcac338
 
 
 
2eb3831
 
 
 
 
 
 
dcac338
 
 
2eb3831
 
 
 
dcac338
 
 
a1934d7
 
143bd7b
a1934d7
143bd7b
 
 
 
 
 
 
 
a1934d7
 
143bd7b
 
 
 
 
 
a1934d7
143bd7b
 
a1934d7
143bd7b
2eb3831
dcac338
 
2eb3831
dcac338
 
 
 
 
 
2eb3831
dcac338
 
 
 
 
 
 
2eb3831
dcac338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2eb3831
dcac338
2eb3831
dcac338
55bac67
 
 
 
 
 
 
 
 
 
 
 
 
 
2eb3831
 
 
 
 
 
 
dcac338
2eb3831
 
 
 
 
 
 
 
 
 
dcac338
2eb3831
 
 
dcac338
55bac67
2eb3831
 
 
 
 
 
 
 
dcac338
2eb3831
 
 
 
dcac338
 
 
 
 
 
2eb3831
 
 
 
 
 
dcac338
2eb3831
 
 
 
 
 
55bac67
2eb3831
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dcac338
2eb3831
 
 
 
 
 
 
 
 
 
 
 
dcac338
2eb3831

import os
import uvicorn
from dotenv import load_dotenv

load_dotenv()

from fastapi.security import APIKeyHeader
from fastapi import FastAPI, UploadFile, File, Security, HTTPException, status, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel

from langchain_pinecone import PineconeVectorStore # Changed from Chroma
from langchain_ollama import ChatOllama
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.tools.retriever import create_retriever_tool
from langchain.agents import create_agent
from langchain_core.messages import HumanMessage, AIMessage, AIMessageChunk

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import fast_tokenizer
from pathlib import Path
from langchain_community.document_loaders import UnstructuredHTMLLoader, UnstructuredMarkdownLoader, TextLoader, BSHTMLLoader

def load_vectorstore():
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return PineconeVectorStore(
        index_name="rag-agent",
        embedding=embeddings,
        pinecone_api_key=os.environ["PINECONE_API_KEY"],
    )

def load_llm():
    # 1. Use .get() so it doesn't crash if the variable is missing
    ollama_url = os.environ.get("OLLAMA_BASE_URL")
    
    # 2. If the URL exists (like on your laptop), use Ollama
    if ollama_url:
        print("🔧 Using local Ollama LLM (Development Mode)")
        return ChatOllama(
            model="llama3.1",
            temperature=0.1,
            base_url=ollama_url,
        )
    
    # 3. If it doesn't exist (like on Hugging Face), fall back to Groq
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise ValueError(
            "Neither OLLAMA_BASE_URL nor GROQ_API_KEY found! "
            "Please set one in your environment variables."
        )
    
    print("☁️ Using Groq Cloud LLM (Production Mode)")
    return ChatGroq(
        api_key=groq_api_key,
        model_name="llama-3.3-70b-versatile",
        temperature=0.1
    )

def load_retriever(vectorstore):
    # Kept exactly as you wrote it
    return vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 4, "fetch_k": 20}
    )

def load_retriever_tool(retriever):
    # Kept exactly as you wrote it
    return create_retriever_tool(
        retriever, 
        "rag_retriever", 
        description="Retrieve relevant documents from the RAG database of programming languages documentations. Don't output raw JSON in your final answer."
    )

def load_agent(tools, llm):
    # Kept exactly as you wrote it
    system_prompt = (
        "You are an expert all in one assistant. Follow these rules strictly:\n\n"
        "1. PYTHON QUESTIONS: YOU MUST use tools to search for the answer.\n"
        "2. GREETINGS: If the user says 'Hi' or 'Hello', respond warmly and ask how you can help with Python. DO NOT use the tool.\n"
        "3. OFF-TOPIC QUESTIONS: If the user asks a non-coding question (e.g., trivia, history), answer it briefly using your own knowledge, then politely steer the conversation back to Python. DO NOT use the tool.\n\n"
        "STRICT CONSTRAINTS:\n"
        "- NEVER output raw JSON in your final answer.\n"
        "- NEVER explain your internal workings or mention the terms 'tool', 'database', or 'training data' to the user.\n"
        "- NEVER apologize or say 'I am just an AI' or 'I don't have direct access'."
    )
    
    llm_with_tools = llm.bind_tools(tools)
    
    return create_agent(
        model=llm_with_tools,
        tools=tools,
        system_prompt=system_prompt,
    )


# --- FASTAPI SETUP & GLOBAL INITIALIZATION ---

app = FastAPI(title="Python RAG Agent API")

# 1. Define the name of the header we expect
api_key_header = APIKeyHeader(name="X-API-Key")

# 2. Get your secret password from environment variables
SECRET_APP_KEY = os.environ["APP_API_KEY"]

# 3. Create the security function
def verify_api_key(api_key: str = Security(api_key_header)):
    if api_key != SECRET_APP_KEY:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid or missing API Key"
        )

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], 
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize your agent once when the server starts
vectorstore = load_vectorstore()
llm = load_llm()
retriever = load_retriever(vectorstore)
retriever_tool = load_retriever_tool(retriever)
tools = [retriever_tool]
agent = load_agent(tools, llm)


# --- API ENDPOINTS ---

class ChatRequest(BaseModel):
    message: str
    history: list[dict] = []  # Allows UI to send previous messages

@app.post("/chat", dependencies=[Depends(verify_api_key)])
async def chat_endpoint(request: ChatRequest):
    # 1. Build the chat history array from the UI's request
    chat_history = []
    for msg in request.history:
        if msg["role"] == "user":
            chat_history.append(HumanMessage(content=msg["content"]))
        else:
            chat_history.append(AIMessage(content=msg["content"]))
            
    chat_history.append(HumanMessage(content=request.message))

    # 2. Wrap your exact original streaming logic in a generator function
    async def generate_stream():
        try:
            for chunk, metadata in agent.stream(
                {"messages": chat_history},
                stream_mode="messages",
            ):
                if isinstance(chunk, AIMessageChunk) and chunk.content:
                    yield chunk.content
        except Exception as e:
            yield f"\n[Error]: {e}"

    # 3. Stream the output to the Vercel frontend
    return StreamingResponse(generate_stream(), media_type="text/event-stream")


# 1. Add your custom token length function back
def custom_token_length(text):
    tokens = fast_tokenizer.tokenize(text)
    return len(tokens)

@app.post("/upload", dependencies=[Depends(verify_api_key)])
async def upload_document(file: UploadFile = File(...)):
    """Accepts PDF, HTML, MD, and TXT files and uploads them to Pinecone using fast_tokenizer."""
    
    ext = Path(file.filename).suffix.lower()
    
    supported_extensions = [".pdf", ".html", ".htm", ".md", ".txt"]
    if ext not in supported_extensions:
        return {"error": f"Unsupported file type. Please upload one of: {', '.join(supported_extensions)}"}

    temp_file_path = f"temp_{file.filename}"
    with open(temp_file_path, "wb") as f:
        f.write(await file.read())

    try:
        if ext == ".pdf":
            loader = PyPDFLoader(temp_file_path)
            docs = loader.load()
            
        elif ext in [".html", ".htm"]:
            try:
                loader = UnstructuredHTMLLoader(temp_file_path)
                docs = loader.load()
            except Exception as e:
                print(f"Warning: UnstructuredHTMLLoader failed, trying BSHTMLLoader: {e}")
                loader = BSHTMLLoader(temp_file_path)
                docs = loader.load()
                
        elif ext == ".md":
            loader = UnstructuredMarkdownLoader(temp_file_path)
            docs = loader.load()
            
        elif ext == ".txt":
            loader = TextLoader(temp_file_path)
            docs = loader.load()

        # 2. Re-implement your exact RecursiveCharacterTextSplitter settings
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=350,       # Max 350 tokens per chunk
            chunk_overlap=50,     # Overlap of 50 tokens
            length_function=custom_token_length  # Tells LangChain to use your C++ tool
        )
        
        splits = text_splitter.split_documents(docs)

        # 3. Upload the perfectly tokenized chunks to Pinecone
        vectorstore.add_documents(splits)

        return {
            "status": "success", 
            "message": f"Successfully processed {file.filename} into {len(splits)} chunks and uploaded to Pinecone."
        }
        
    except Exception as e:
        return {"error": f"Failed to process file: {str(e)}"}
        
    finally:
        # Clean up temp file
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)


if __name__ == "__main__":
    # Runs the API server on port 7860 (Required for Hugging Face Spaces)
    print("\n" + "="*50)
    print("🐍 Python RAG API Initialized on Port 7860")
    print("="*50 + "\n")
    uvicorn.run("agent:app", host="0.0.0.0", port=7860, reload=True)