import os

import uvicorn
from dotenv import load_dotenv

# Load .env BEFORE any os.environ reads below (API keys, base URLs).
load_dotenv()

from fastapi import FastAPI, UploadFile, File, Security, HTTPException, status, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from fastapi.security import APIKeyHeader
from pydantic import BaseModel

from langchain_pinecone import PineconeVectorStore  # Changed from Chroma
from langchain_ollama import ChatOllama
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.tools.retriever import create_retriever_tool
from langchain.agents import create_agent
from langchain_core.messages import HumanMessage, AIMessage, AIMessageChunk
# FIX: the two separate document_loaders import statements are merged into one.
from langchain_community.document_loaders import (
    PyPDFLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    TextLoader,
    BSHTMLLoader,
)
from langchain_text_splitters import RecursiveCharacterTextSplitter

import fast_tokenizer
from pathlib import Path


def load_vectorstore():
    """Build the Pinecone-backed vector store used for retrieval and uploads.

    Returns:
        PineconeVectorStore over the "rag-agent" index with MiniLM embeddings.

    Raises:
        KeyError: if PINECONE_API_KEY is not set (intentional fail-fast at startup).
    """
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return PineconeVectorStore(
        index_name="rag-agent",
        embedding=embeddings,
        pinecone_api_key=os.environ["PINECONE_API_KEY"],
    )


def load_llm():
    """Pick an LLM backend from the environment.

    Prefers a local Ollama server (OLLAMA_BASE_URL set, e.g. on a dev laptop);
    otherwise falls back to Groq cloud (GROQ_API_KEY, e.g. on Hugging Face).

    Raises:
        ValueError: if neither environment variable is set.
    """
    # 1. Use .get() so a missing variable falls through instead of crashing.
    ollama_url = os.environ.get("OLLAMA_BASE_URL")

    # 2. If the URL exists (local development), use Ollama.
    if ollama_url:
        print("🔧 Using local Ollama LLM (Development Mode)")
        return ChatOllama(
            model="llama3.1",
            temperature=0.1,
            base_url=ollama_url,
        )

    # 3. Otherwise fall back to Groq cloud.
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise ValueError(
            "Neither OLLAMA_BASE_URL nor GROQ_API_KEY found! "
            "Please set one in your environment variables."
        )
    print("☁️ Using Groq Cloud LLM (Production Mode)")
    return ChatGroq(
        api_key=groq_api_key,
        model_name="llama-3.3-70b-versatile",
        temperature=0.1,
    )


def load_retriever(vectorstore):
    """Wrap the store in an MMR retriever: diverse top-4 picked from 20 candidates."""
    return vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 4, "fetch_k": 20},
    )


def load_retriever_tool(retriever):
    """Expose the retriever to the agent as a callable tool named "rag_retriever"."""
    return create_retriever_tool(
        retriever,
        "rag_retriever",
        description=(
            "Retrieve relevant documents from the RAG database of programming "
            "languages documentations. Don't output raw JSON in your final answer."
        ),
    )


def load_agent(tools, llm):
    """Build the tool-calling agent with strict behavioral rules in the system prompt."""
    system_prompt = (
        "You are an expert all in one assistant. Follow these rules strictly:\n\n"
        "1. PYTHON QUESTIONS: YOU MUST use tools to search for the answer.\n"
        "2. GREETINGS: If the user says 'Hi' or 'Hello', respond warmly and ask how you can help with Python. DO NOT use the tool.\n"
        "3. OFF-TOPIC QUESTIONS: If the user asks a non-coding question (e.g., trivia, history), answer it briefly using your own knowledge, then politely steer the conversation back to Python. DO NOT use the tool.\n\n"
        "STRICT CONSTRAINTS:\n"
        "- NEVER output raw JSON in your final answer.\n"
        "- NEVER explain your internal workings or mention the terms 'tool', 'database', or 'training data' to the user.\n"
        "- NEVER apologize or say 'I am just an AI' or 'I don't have direct access'."
    )
    # FIX: create_agent binds `tools` onto the model itself; the original also
    # called llm.bind_tools(tools) first, attaching the tool schemas twice.
    # Pass the bare model and let create_agent do the (single) binding.
    return create_agent(
        model=llm,
        tools=tools,
        system_prompt=system_prompt,
    )


# --- FASTAPI SETUP & GLOBAL INITIALIZATION ---
app = FastAPI(title="Python RAG Agent API")

# 1. Header the client must send on every protected request.
api_key_header = APIKeyHeader(name="X-API-Key")

# 2. Shared secret expected in X-API-Key (KeyError at startup if unset —
#    intentional fail-fast so the server never runs unauthenticated).
SECRET_APP_KEY = os.environ["APP_API_KEY"]
# 3. Security dependency checked on every protected route.
def verify_api_key(api_key: str = Security(api_key_header)):
    """FastAPI dependency: reject requests whose X-API-Key doesn't match the secret."""
    if api_key != SECRET_APP_KEY:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid or missing API Key"
        )


# NOTE(review): per the CORS spec, allow_origins=["*"] combined with
# allow_credentials=True is ineffective — browsers won't send credentials to a
# wildcard origin. Pin explicit origins if cookies/auth headers are needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize the agent stack once at server startup (module import time).
vectorstore = load_vectorstore()
llm = load_llm()
retriever = load_retriever(vectorstore)
retriever_tool = load_retriever_tool(retriever)
tools = [retriever_tool]
agent = load_agent(tools, llm)


# --- API ENDPOINTS ---

class ChatRequest(BaseModel):
    # message: the new user turn.
    # history: prior turns as [{"role": "user"|"assistant", "content": str}, ...]
    #          so the UI can replay the conversation.
    message: str
    history: list[dict] = []  # Allows UI to send previous messages


@app.post("/chat", dependencies=[Depends(verify_api_key)])
async def chat_endpoint(request: ChatRequest):
    """Stream the agent's reply token-by-token to the frontend."""
    # 1. Rebuild the LangChain message history from the UI's request.
    chat_history = []
    for msg in request.history:
        if msg["role"] == "user":
            chat_history.append(HumanMessage(content=msg["content"]))
        else:
            # Anything not flagged "user" is treated as an assistant turn.
            chat_history.append(AIMessage(content=msg["content"]))
    chat_history.append(HumanMessage(content=request.message))

    # 2. Async generator over the agent's token stream.
    async def generate_stream():
        try:
            # FIX: the original iterated the *synchronous* agent.stream()
            # inside this async generator, blocking the event loop for the
            # whole generation. Use astream() with `async for` instead.
            async for chunk, metadata in agent.astream(
                {"messages": chat_history},
                stream_mode="messages",
            ):
                if isinstance(chunk, AIMessageChunk) and chunk.content:
                    yield chunk.content
        except Exception as e:
            # Surface failures in-band so the UI shows something useful.
            yield f"\n[Error]: {e}"

    # 3. Stream the output to the Vercel frontend.
    return StreamingResponse(generate_stream(), media_type="text/event-stream")
# 1. Custom token-length function backing the text splitter.
def custom_token_length(text):
    """Token count of `text` via the fast_tokenizer extension (C++ tokenizer)."""
    tokens = fast_tokenizer.tokenize(text)
    return len(tokens)


@app.post("/upload", dependencies=[Depends(verify_api_key)])
async def upload_document(file: UploadFile = File(...)):
    """Accepts PDF, HTML, MD, and TXT files and uploads them to Pinecone using fast_tokenizer."""
    ext = Path(file.filename).suffix.lower()
    supported_extensions = [".pdf", ".html", ".htm", ".md", ".txt"]
    if ext not in supported_extensions:
        return {"error": f"Unsupported file type. Please upload one of: {', '.join(supported_extensions)}"}

    # SECURITY FIX: use only the basename of the client-supplied filename — a
    # crafted name like "../../x" could previously escape the working directory.
    temp_file_path = f"temp_{Path(file.filename).name}"
    with open(temp_file_path, "wb") as f:
        f.write(await file.read())

    try:
        # Pick a loader per extension; HTML falls back to BSHTMLLoader when
        # UnstructuredHTMLLoader cannot parse the file.
        if ext == ".pdf":
            loader = PyPDFLoader(temp_file_path)
            docs = loader.load()
        elif ext in [".html", ".htm"]:
            try:
                loader = UnstructuredHTMLLoader(temp_file_path)
                docs = loader.load()
            except Exception as e:
                print(f"Warning: UnstructuredHTMLLoader failed, trying BSHTMLLoader: {e}")
                loader = BSHTMLLoader(temp_file_path)
                docs = loader.load()
        elif ext == ".md":
            loader = UnstructuredMarkdownLoader(temp_file_path)
            docs = loader.load()
        else:
            # FIX: final branch is now an unconditional else (the whitelist
            # above guarantees ".txt" here) so `docs` can never be unbound.
            loader = TextLoader(temp_file_path)
            docs = loader.load()

        # 2. Chunking tuned to the custom tokenizer.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=350,        # Max 350 tokens per chunk
            chunk_overlap=50,      # Overlap of 50 tokens
            length_function=custom_token_length  # Measure length with the C++ tokenizer
        )
        splits = text_splitter.split_documents(docs)

        # 3. Upload the tokenized chunks to Pinecone.
        vectorstore.add_documents(splits)
        return {
            "status": "success",
            "message": f"Successfully processed {file.filename} into {len(splits)} chunks and uploaded to Pinecone."
        }
    except Exception as e:
        # Best-effort endpoint: report the failure to the caller as JSON.
        return {"error": f"Failed to process file: {str(e)}"}
    finally:
        # Always clean up the temp file, success or failure.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)


if __name__ == "__main__":
    # Runs the API server on port 7860 (Required for Hugging Face Spaces).
    print("\n" + "="*50)
    print("🐍 Python RAG API Initialized on Port 7860")
    print("="*50 + "\n")
    # NOTE(review): "agent:app" assumes this file is named agent.py — confirm.
    uvicorn.run("agent:app", host="0.0.0.0", port=7860, reload=True)