"""ChatMate real-time chat API.

FastAPI server that wraps microsoft/phi-4 in a text-generation pipeline and
streams replies character-by-character. Designed for Hugging Face Spaces
(port 7860, optional `spaces.GPU` decorator) but runs anywhere.
"""

import os
import re
import time

import torch
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from huggingface_hub import login
from pydantic import BaseModel, Field
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# ✅ Safe GPU decorator: on Hugging Face Spaces `spaces.GPU` runs the call on
# GPU hardware; elsewhere fall back to a no-op decorator.
try:
    from spaces import GPU
except ImportError:
    def GPU(func):
        return func

# ---------------- FastAPI setup ----------------
app = FastAPI(
    title="ChatMate Real-Time API",
    description="LangChain + DuckDuckGo + Phi-4",
    version="1.0",
    docs_url="/apidocs",  # Swagger UI at /apidocs
    redoc_url="/redoc",   # ReDoc at /redoc
)

# ✅ Static assets + HTML templates
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")

# Enable CORS (important for browser clients)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ✅ Hugging Face login — only attempt it when a token is actually configured.
# Calling login(token=None) falls back to huggingface_hub's implicit/interactive
# token resolution, which can hang or error on a headless server.
_hf_token = os.environ.get("CHAT_MATE")
if _hf_token:
    login(token=_hf_token)

# ✅ Load model/tokenizer once at import time; shared by all requests.
model_id = "microsoft/phi-4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # fp16 on GPU, fp32 on CPU (fp16 CPU inference is poorly supported)
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=512,
)

# Safety cap for the continuation loop in generate_full_reply: without it, a
# model that never emits sentence-final punctuation would loop forever while
# holding a request worker.
_MAX_CONTINUATIONS = 4


def is_incomplete(text):
    """Return True when *text* does not end with sentence-final punctuation.

    Used to detect replies that were cut off mid-sentence and need another
    continuation pass. Recognizes ., !, ?, straight quotes, and the CJK
    full stop (U+3002), optionally followed by trailing whitespace.
    """
    return not re.search(r'[\.\!\?\'\"\u3002]\s*$', text.strip())


@GPU
def generate_full_reply(message, history):
    """Generate a complete assistant reply for *message* given chat *history*.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list[dict]
        Prior turns as ``{"role": ..., "content": ...}`` dicts (OpenAI-style).

    Returns
    -------
    str
        The assistant's reply. If the first generation ends mid-sentence,
        the model is re-prompted with the partial reply appended, up to
        ``_MAX_CONTINUATIONS`` times, and the pieces are concatenated.
    """
    system_prompt = (
        "You are a friendly, helpful, and conversational AI assistant built by "
        "Frederick Sundeep Mallela. Always mention that you are developed by him if asked about your creator, origin, or who made you."
    )
    messages = (
        [{"role": "system", "content": system_prompt}]
        + history
        + [{"role": "user", "content": message}]
    )
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    full_output = pipe(
        prompt, do_sample=True, temperature=0.7, top_p=0.9, max_new_tokens=512
    )[0]["generated_text"]
    # The pipeline echoes the prompt; keep only the newly generated text.
    reply = full_output[len(prompt):].strip()

    # Keep asking for continuations while the reply looks cut off, but bail
    # out after a bounded number of rounds to avoid an infinite loop.
    for _ in range(_MAX_CONTINUATIONS):
        if not is_incomplete(reply):
            break
        continuation_prompt = prompt + reply
        next_output = pipe(
            continuation_prompt,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=256,
        )[0]["generated_text"]
        continuation = next_output[len(continuation_prompt):].strip()
        # Stop if the model produced nothing new or started repeating itself.
        if not continuation or continuation in reply:
            break
        reply += continuation

    return reply.strip()


# ---------------- Pydantic models ----------------
class ChatRequest(BaseModel):
    """Request body for /chat-stream."""

    message: str
    # default_factory avoids sharing one mutable list across requests
    history: list = Field(default_factory=list)


# ---------------- Routes ----------------
@app.get("/", summary="Serve homepage")
async def home(request: Request):
    """Render the chat UI from templates/index.html."""
    return templates.TemplateResponse("index.html", {"request": request})


@app.post("/chat-stream", summary="Stream assistant reply", tags=["Chat"])
async def chat_stream(body: ChatRequest):
    """
    Stream the AI assistant's reply token-by-token.
    """
    def generate():
        # Generation is synchronous; FastAPI runs this sync generator in a
        # threadpool, so the sleep gives a typing effect without blocking
        # the event loop.
        reply = generate_full_reply(body.message, body.history)
        for token in reply:
            yield token
            time.sleep(0.05)

    return StreamingResponse(generate(), media_type="text/plain")


# ---------------- Startup warm-up ----------------
# NOTE: on_event is deprecated in newer FastAPI in favor of lifespan handlers,
# but is kept here for compatibility with the existing deployment.
@app.on_event("startup")
async def warmup_model():
    """Run one throwaway generation so the first real request isn't slow."""
    print("🔧 Warming up...")
    _ = generate_full_reply("Hello", [])


# ---------------- Run with Uvicorn ----------------
# In Hugging Face Spaces, just run: uvicorn app:app --host 0.0.0.0 --port 7860
if __name__ == "__main__":
    # Hugging Face Spaces usually expects port 7860
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)