"""ChatMate real-time chat API.

FastAPI server that wraps microsoft/phi-4 in a text-generation pipeline and
streams replies character-by-character. Designed for Hugging Face Spaces
(port 7860, optional `spaces.GPU` decorator) but runs anywhere.
"""

import os
import re
import time

import torch
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from huggingface_hub import login
from pydantic import BaseModel, Field
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# ✅ Safe GPU decorator: on Hugging Face Spaces `spaces.GPU` runs the call on
# GPU hardware; elsewhere fall back to a no-op decorator.
try:
    from spaces import GPU
except ImportError:
    def GPU(func):
        return func

# ---------------- FastAPI setup ----------------
app = FastAPI(
    title="ChatMate Real-Time API",
    description="LangChain + DuckDuckGo + Phi-4",
    version="1.0",
    docs_url="/apidocs",  # Swagger UI at /apidocs
    redoc_url="/redoc",   # ReDoc at /redoc
)

# ✅ Static assets + HTML templates
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")

# Enable CORS (important for browser clients)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ✅ Hugging Face login — only attempt it when a token is actually configured.
# Calling login(token=None) falls back to huggingface_hub's implicit/interactive
# token resolution, which can hang or error on a headless server.
_hf_token = os.environ.get("CHAT_MATE")
if _hf_token:
    login(token=_hf_token)

# ✅ Load model/tokenizer once at import time; shared by all requests.
model_id = "microsoft/phi-4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # fp16 on GPU, fp32 on CPU (fp16 CPU inference is poorly supported)
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=512,
)

# Safety cap for the continuation loop in generate_full_reply: without it, a
# model that never emits sentence-final punctuation would loop forever while
# holding a request worker.
_MAX_CONTINUATIONS = 4


def is_incomplete(text):
    """Return True when *text* does not end with sentence-final punctuation.

    Used to detect replies that were cut off mid-sentence and need another
    continuation pass. Recognizes ., !, ?, straight quotes, and the CJK
    full stop (U+3002), optionally followed by trailing whitespace.
    """
    return not re.search(r'[\.\!\?\'\"\u3002]\s*$', text.strip())


@GPU
def generate_full_reply(message, history):
    """Generate a complete assistant reply for *message* given chat *history*.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list[dict]
        Prior turns as ``{"role": ..., "content": ...}`` dicts (OpenAI-style).

    Returns
    -------
    str
        The assistant's reply. If the first generation ends mid-sentence,
        the model is re-prompted with the partial reply appended, up to
        ``_MAX_CONTINUATIONS`` times, and the pieces are concatenated.
    """
    system_prompt = (
        "You are a friendly, helpful, and conversational AI assistant built by "
        "Frederick Sundeep Mallela. Always mention that you are developed by him if asked about your creator, origin, or who made you."
    )
    messages = (
        [{"role": "system", "content": system_prompt}]
        + history
        + [{"role": "user", "content": message}]
    )
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    full_output = pipe(
        prompt, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=512
    )[0]["generated_text"]
    # The pipeline echoes the prompt; keep only the newly generated text.
    reply = full_output[len(prompt):].strip()

    # Keep asking for continuations while the reply looks cut off, but bail
    # out after a bounded number of rounds to avoid an infinite loop.
    for _ in range(_MAX_CONTINUATIONS):
        if not is_incomplete(reply):
            break
        continuation_prompt = prompt + reply
        next_output = pipe(
            continuation_prompt,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=256,
        )[0]["generated_text"]
        continuation = next_output[len(continuation_prompt):].strip()
        # Stop if the model produced nothing new or started repeating itself.
        if not continuation or continuation in reply:
            break
        reply += continuation

    return reply.strip()


# ---------------- Pydantic models ----------------
class ChatRequest(BaseModel):
    """Request body for /chat-stream."""

    message: str
    # default_factory avoids sharing one mutable list across requests
    history: list = Field(default_factory=list)


# ---------------- Routes ----------------
@app.get("/", summary="Serve homepage")
async def home(request: Request):
    """Render the chat UI from templates/index.html."""
    return templates.TemplateResponse("index.html", {"request": request})


@app.post("/chat-stream", summary="Stream assistant reply", tags=["Chat"])
async def chat_stream(body: ChatRequest):
    """
    Stream the AI assistant's reply token-by-token.
    """
    def generate():
        # Generation is synchronous; FastAPI runs this sync generator in a
        # threadpool, so the sleep gives a typing effect without blocking
        # the event loop.
        reply = generate_full_reply(body.message, body.history)
        for token in reply:
            yield token
            time.sleep(0.05)

    return StreamingResponse(generate(), media_type="text/plain")


# ---------------- Startup warm-up ----------------
# NOTE: on_event is deprecated in newer FastAPI in favor of lifespan handlers,
# but is kept here for compatibility with the existing deployment.
@app.on_event("startup")
async def warmup_model():
    """Run one throwaway generation so the first real request isn't slow."""
    print("🔧 Warming up...")
    _ = generate_full_reply("Hello", [])


# ---------------- Run with Uvicorn ----------------
# In Hugging Face Spaces, just run: uvicorn app:app --host 0.0.0.0 --port 7860
if __name__ == "__main__":
    # Hugging Face Spaces usually expects port 7860
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)