import os
import time
import torch
import re
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
# Safe GPU decorator
# Prefer the real ZeroGPU decorator when running on HF Spaces; on plain
# hardware fall back to a no-op so decorated functions run unchanged.
try:
    from spaces import GPU
except ImportError:
    def GPU(func):
        return func
# ---------------- FastAPI setup ----------------
# Application object; interactive docs are served at /apidocs (Swagger UI)
# and /redoc (ReDoc) instead of FastAPI's default /docs.
# NOTE(review): the description mentions LangChain + DuckDuckGo, but this
# file only uses transformers — confirm whether the description is stale.
app = FastAPI(
    title="ChatMate Real-Time API",
    description="LangChain + DuckDuckGo + Phi-4",
    version="1.0",
    docs_url="/apidocs", # Swagger UI at /apidocs
    redoc_url="/redoc" # ReDoc at /redoc
)
# Static + templates
# Serve static assets and Jinja2 HTML templates from local directories
# (both directories must exist at import time or mounting raises).
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")
# Enable CORS (important for browser clients)
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# maximally permissive — tighten the origin list before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Hugging Face login
login(token=os.environ.get("CHAT_MATE"))
# Load model
model_id = "microsoft/phi-4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# fp16 on GPU to halve memory; fp32 on CPU where half precision is
# unsupported or slow.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# transformers pipeline device convention: 0 = first CUDA device, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=512  # default budget; per-call values below override this
)
def is_incomplete(text):
    """Return True if *text* does not end in sentence-final punctuation.

    A reply is considered complete when, after trimming surrounding
    whitespace, it ends with '.', '!', '?', a quote character, or the
    CJK full stop (U+3002). An empty/whitespace-only reply counts as
    incomplete.
    """
    terminators = ('.', '!', '?', "'", '"', '\u3002')
    return not text.strip().endswith(terminators)
@GPU
def generate_full_reply(message, history):
    """Generate a complete chat reply for *message* given prior *history*.

    *history* is a list of {"role": ..., "content": ...} dicts in the
    format expected by ``tokenizer.apply_chat_template``. If the first
    generation stops mid-sentence, the model is re-prompted with its
    partial output appended until the reply ends in terminal punctuation,
    the model stops producing new text, or the round limit is reached.

    Fix: the original ``while is_incomplete(reply)`` loop was unbounded —
    a model that keeps emitting novel, never-terminated text would hang
    the request forever. The loop is now capped at a fixed number of
    continuation rounds.
    """
    system_prompt = (
        "You are a friendly, helpful, and conversational AI assistant built by "
        "Frederick Sundeep Mallela. Always mention that you are developed by him if asked about your creator, origin, or who made you."
    )
    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": message}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    full_output = pipe(prompt, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=512)[0]["generated_text"]
    # The pipeline echoes the prompt; keep only the newly generated tail.
    reply = full_output[len(prompt):].strip()
    # Bounded continuation loop (was an unbounded `while`).
    for _ in range(4):
        if not is_incomplete(reply):
            break
        continuation_prompt = prompt + reply
        next_output = pipe(continuation_prompt, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=256)[0]["generated_text"]
        continuation = next_output[len(continuation_prompt):].strip()
        # Stop if the model produced nothing new or is repeating itself.
        if not continuation or continuation in reply:
            break
        reply += continuation
    return reply.strip()
# ---------------- Pydantic models ----------------
class ChatRequest(BaseModel):
    """Request body for POST /chat-stream."""

    # Latest user utterance.
    message: str
    # Prior conversation turns as chat-template dicts
    # ({"role": ..., "content": ...}). Unlike plain Python defaults,
    # pydantic copies field defaults per instance, so the mutable []
    # default is safe here.
    history: list = []
# ---------------- FastAPI route ----------------
# ---------------- Routes ----------------
@app.get("/", summary="Serve homepage")
async def home(request: Request):
    """Render the chat UI from templates/index.html."""
    context = {"request": request}
    return templates.TemplateResponse("index.html", context)
@app.post("/chat-stream", summary="Stream assistant reply", tags=["Chat"])
async def chat_stream(body: ChatRequest):
    """
    Stream the AI assistant's reply token-by-token.

    The full reply is generated up front, then drip-fed one character at
    a time with a short pause to simulate live token streaming.
    """
    def character_stream():
        full_reply = generate_full_reply(body.message, body.history)
        for ch in full_reply:
            yield ch
            time.sleep(0.05)

    return StreamingResponse(character_stream(), media_type="text/plain")
# ---------------- Startup warm-up ----------------
@app.on_event("startup")
async def warmup_model():
    """Run one dummy generation at startup so the first real request does
    not pay the model warm-up latency.

    NOTE(review): ``on_event`` is deprecated in recent FastAPI in favor of
    lifespan handlers — confirm the installed version before migrating.
    """
    # Fix: the original log line contained mojibake ("π§") from a
    # mis-encoded emoji; use plain ASCII so it renders everywhere.
    print("Warming up...")
    _ = generate_full_reply("Hello", [])
# ---------------- Run with Uvicorn ----------------
# In Hugging Face Spaces, just run: uvicorn app:app --host 0.0.0.0 --port 7860
if __name__ == "__main__":
    # Hugging Face Spaces conventionally serves on port 7860; honor a
    # PORT override from the environment when present.
    serve_port = int(os.environ.get("PORT", "7860"))
    # Launch the ASGI app via uvicorn (equivalent to:
    # uvicorn app:app --host 0.0.0.0 --port 7860).
    uvicorn.run("app:app", host="0.0.0.0", port=serve_port, reload=False)