"""ChatMate: Flask chatbot API backed by LLaMA 3.1 Instruct.

Routes:
    GET  /            -> chat UI (templates/index.html)
    POST /chat        -> full assistant reply as JSON
    POST /chat-stream -> assistant reply streamed as text/plain chunks

Messages containing "real-time" keywords are answered via DuckDuckGo
search instead of the model.
"""

import os
import time

from flask import Flask, Response, jsonify, render_template, request
from flasgger import Swagger, swag_from
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login
from langchain_community.tools import DuckDuckGoSearchRun

# Safe import of the Hugging Face Spaces GPU decorator; fall back to a
# no-op decorator so the app also runs outside a Spaces environment.
try:
    from spaces import GPU
except ImportError:
    def GPU(func):
        return func

# Flask + Swagger setup
app = Flask(__name__, static_folder="static", template_folder="templates")
swagger = Swagger(
    app,
    template={
        "swagger": "2.0",
        "info": {
            "title": "ChatMate Real-Time API",
            "description": "LangChain + DuckDuckGo enabled AI chatbot",
            "version": "1.0",
        },
    },
    config={
        "headers": [],
        "specs": [
            {
                "endpoint": "apispec",
                "route": "/apispec.json",
                "rule_filter": lambda rule: True,
            }
        ],
        "static_url_path": "/flasgger_static",
        "swagger_ui": True,
        "specs_route": "/apidocs/",
    },
)

# Hugging Face login — only when a token is actually provided.
# Calling login(token=None) raises instead of silently skipping.
_hf_token = os.environ.get("CHAT_MATE")
if _hf_token:
    login(token=_hf_token)

# Load LLaMA 3.1 Instruct model (downloaded/loaded once at startup).
model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)

# Keywords that suggest the user wants up-to-the-minute information.
REAL_TIME_KEYWORDS = {
    "latest", "current", "news", "today", "price",
    "time", "live", "trending", "update", "happening",
}


def should_search(message):
    """Return True when *message* mentions any real-time keyword.

    NOTE(review): this is a substring match, so e.g. "sometimes"
    contains "time" and triggers a search; kept as-is to preserve
    existing behaviour.
    """
    if not message:  # guard: None/empty payloads must not crash on .lower()
        return False
    message = message.lower()
    return any(kw in message for kw in REAL_TIME_KEYWORDS)


# DuckDuckGo search tool for live queries
search_tool = DuckDuckGoSearchRun()


@GPU
def generate_full_reply(message, history):
    """Generate a full assistant reply using the chat template and history.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list[dict]
        Prior turns as {"role": ..., "content": ...} dicts.

    Returns
    -------
    str
        The assistant's reply with the echoed prompt stripped.
    """
    system_prompt = "You are a helpful and concise AI assistant."
    messages = (
        [{"role": "system", "content": system_prompt}]
        + history
        + [{"role": "user", "content": message}]
    )
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    output = pipe(prompt, do_sample=True, temperature=0.7)[0]["generated_text"]
    # The pipeline echoes the prompt; keep only the newly generated text.
    return output.split(prompt)[-1].strip()


@app.route("/")
def home():
    """Serve the chat UI."""
    return render_template("index.html")


@app.route("/chat", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['application/json'],
    'summary': 'Get assistant reply',
    'description': 'Send a message and chat history, and receive a full AI-generated response.',
    'parameters': [{
        'name': 'body',
        'in': 'body',
        'required': True,
        'schema': {
            'type': 'object',
            'properties': {
                'message': {'type': 'string', 'example': 'What is Python?'},
                'history': {
                    'type': 'array',
                    'items': {
                        'type': 'object',
                        'properties': {
                            'role': {'type': 'string', 'example': 'user'},
                            'content': {'type': 'string', 'example': 'Tell me about Python'}
                        }
                    }
                }
            },
            'required': ['message']
        }
    }],
    'responses': {
        200: {
            'description': 'Assistant reply',
            'schema': {
                'type': 'object',
                'properties': {
                    'reply': {'type': 'string'}
                }
            }
        }
    }
})
def chat():
    """Return a complete assistant reply as JSON: {"reply": ...}."""
    # silent=True: a malformed/non-JSON body yields None instead of a 500
    data = request.get_json(silent=True) or {}
    message = data.get("message")
    history = data.get("history", [])

    if not message:
        # 'message' is declared required in the Swagger schema; reject early
        # instead of crashing inside should_search/the model.
        return jsonify({"error": "'message' is required"}), 400

    # Route real-time questions to live search, everything else to the model.
    if should_search(message):
        result = f"(Live info) {search_tool.run(message)}"
    else:
        result = generate_full_reply(message, history)

    return jsonify({"reply": result})


@app.route("/chat-stream", methods=["POST"])
@swag_from({
    'tags': ['Chat'],
    'consumes': ['application/json'],
    'summary': 'Stream assistant reply',
    'description': 'Send a message and history, receive AI-generated text as a stream (token by token).',
    'parameters': [{
        'name': 'body',
        'in': 'body',
        'required': True,
        'schema': {
            'type': 'object',
            'properties': {
                'message': {'type': 'string', 'example': 'Explain quantum computing.'},
                'history': {
                    'type': 'array',
                    'items': {
                        'type': 'object',
                        'properties': {
                            'role': {'type': 'string', 'example': 'user'},
                            'content': {'type': 'string', 'example': 'What is entanglement?'}
                        }
                    }
                }
            },
            'required': ['message']
        }
    }],
    'responses': {
        200: {
            'description': 'Streamed reply',
            'content': {'text/plain': {}}
        }
    }
})
def chat_stream():
    """Stream the assistant reply as text/plain.

    The full reply is generated first, then emitted line-by-line with a
    small delay to simulate streaming (not true token streaming).
    """
    data = request.get_json(silent=True) or {}
    message = data.get("message")
    history = data.get("history", [])

    if not message:
        return jsonify({"error": "'message' is required"}), 400

    def generate():
        # NOTE: unlike /chat, this route intentionally skips the live-search
        # branch and always uses the model (search results don't stream).
        reply = generate_full_reply(message, history)
        for chunk in reply.splitlines(keepends=True):
            yield chunk
            time.sleep(0.05)

    return Response(generate(), mimetype='text/plain')


# Warm-up on startup so the first user request isn't slowed by lazy init.
if __name__ == "__main__":
    print("🔧 Warming up...")
    _ = generate_full_reply("Hello", [])
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))