File size: 4,129 Bytes
6e7cdea
 
 
5066083
7850c5c
 
5f4982e
 
5066083
6e7cdea
 
5066083
d0833a3
7655952
6e7cdea
 
 
 
 
 
5066083
 
 
 
 
7850c5c
 
5066083
 
5f4982e
 
 
 
7850c5c
5066083
 
 
 
 
 
 
6e7cdea
d2d7b0c
6e7cdea
 
d2d7b0c
6e7cdea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5066083
 
6e7cdea
 
 
 
 
 
d2d7b0c
 
6e7cdea
 
 
 
 
 
 
5066083
 
 
 
6e7cdea
7850c5c
5f4982e
 
 
 
 
7850c5c
 
5066083
7850c5c
5066083
7850c5c
 
 
 
 
5066083
7850c5c
5066083
 
 
 
5990aeb
 
5066083
 
7850c5c
5066083
7850c5c
5066083
7850c5c
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import time
import torch
import re
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login
from fastapi.middleware.cors import CORSMiddleware
import uvicorn

# Optional ZeroGPU decorator: on Hugging Face Spaces `spaces.GPU` schedules the
# wrapped call on a GPU worker; anywhere else, fall back to a no-op pass-through
# so the decorator can be applied unconditionally.
try:
    from spaces import GPU
except ImportError:
    def GPU(func):
        return func

# ---------------- FastAPI setup ----------------
# Application object; OpenAPI docs are remapped from the default /docs.
app = FastAPI(
    title="ChatMate Real-Time API",
    description="LangChain + DuckDuckGo + Phi-4",
    version="1.0",
    docs_url="/apidocs",  # Swagger UI at /apidocs
    redoc_url="/redoc"    # ReDoc at /redoc
)

# Static assets + Jinja2 templates for the browser UI.
# NOTE(review): assumes ./static and ./templates exist at process start --
# StaticFiles raises at mount time if the directory is missing; confirm the
# deploy layout ships both.
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")

# Enable CORS (important for browser clients)
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# disallowed by the CORS spec (browsers reject wildcard origins on
# credentialed requests) -- confirm whether credentials are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ---------------- Hugging Face auth + model ----------------
# Authenticate only when a token is actually configured. The original called
# login(token=None) when the CHAT_MATE secret was unset, which fails (or tries
# interactive auth) in a headless Space instead of simply skipping.
hf_token = os.environ.get("CHAT_MATE")
if hf_token:
    login(token=hf_token)

# Load tokenizer + model once at import time: fp16 when a CUDA device is
# available, fp32 on CPU.
model_id = "microsoft/phi-4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# transformers pipeline device convention: 0 = first GPU, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=512
)

def is_incomplete(text):
    """Return True when *text* (ignoring surrounding whitespace) does not end
    with sentence-final punctuation: . ! ? ' " or an ideographic full stop."""
    terminal_marks = (".", "!", "?", "'", '"', "\u3002")
    return not text.strip().endswith(terminal_marks)

@GPU
def generate_full_reply(message, history):
    """Generate a complete assistant reply for *message*.

    Builds a chat prompt from a fixed system prompt, the prior *history*
    (list of {"role", "content"} dicts), and the new user message, then runs
    the text-generation pipeline. If the reply does not end in sentence-final
    punctuation (per is_incomplete), generation is continued from the partial
    reply, up to a bounded number of rounds.

    Returns the stripped reply text.
    """
    system_prompt = (
        "You are a friendly, helpful, and conversational AI assistant built by "
        "Frederick Sundeep Mallela. Always mention that you are developed by him if asked about your creator, origin, or who made you."
    )
    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": message}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Pipeline returns prompt + generation; slice the prompt off the front.
    full_output = pipe(prompt, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=512)[0]["generated_text"]
    reply = full_output[len(prompt):].strip()

    # Cap continuation rounds: the original `while is_incomplete(reply)` had no
    # upper bound, so a model that keeps producing novel, unterminated text
    # could spin forever. Five rounds x 256 tokens is ample for a sentence end.
    for _ in range(5):
        if not is_incomplete(reply):
            break
        continuation_prompt = prompt + reply
        next_output = pipe(continuation_prompt, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=256)[0]["generated_text"]
        continuation = next_output[len(continuation_prompt):].strip()
        # Stop if the model produced nothing new or is repeating itself.
        if not continuation or continuation in reply:
            break
        reply += continuation
    return reply.strip()

# ---------------- Pydantic models ----------------
class ChatRequest(BaseModel):
    """Request body for the /chat-stream endpoint."""
    message: str  # latest user utterance
    history: list = []  # prior chat turns; NOTE(review): pydantic copies field defaults per instance, so the mutable default is safe here

# ---------------- FastAPI route ----------------
# ---------------- Routes ----------------
@app.get("/", summary="Serve homepage")
async def home(request: Request):
    # Render the chat UI; Jinja2Templates requires the request in the context.
    return templates.TemplateResponse("index.html", {"request": request})

@app.post("/chat-stream", summary="Stream assistant reply", tags=["Chat"])
async def chat_stream(body: ChatRequest):
    """Stream the assistant's reply back one character at a time.

    The full reply is generated up front; characters are then drip-fed with a
    50 ms pause each to give the client a typing effect. Starlette runs this
    sync generator in a threadpool, so the sleeps do not block the event loop.
    """
    def character_feed():
        full_reply = generate_full_reply(body.message, body.history)
        for ch in full_reply:
            yield ch
            time.sleep(0.05)

    return StreamingResponse(character_feed(), media_type="text/plain")

# ---------------- Startup warm-up ----------------
# Run one throwaway generation at startup so the first real request does not
# pay the model's cold-start cost.
# NOTE(review): @app.on_event is deprecated in current FastAPI in favor of
# lifespan handlers; it still works, but consider migrating.
@app.on_event("startup")
async def warmup_model():
    print("πŸ”§ Warming up...")
    _ = generate_full_reply("Hello", [])

# ---------------- Run with Uvicorn ----------------
# In Hugging Face Spaces, just run: uvicorn app:app --host 0.0.0.0 --port 7860
if __name__ == "__main__":
    # Spaces route traffic to 7860 by default; honour $PORT when it is set.
    listen_port = int(os.environ.get("PORT", 7860))
    # Serve the ASGI app on all interfaces; no auto-reload in production.
    uvicorn.run("app:app", host="0.0.0.0", port=listen_port, reload=False)