"""Gradio app exposing Qwen2.5-7B-Instruct as a chat UI and a simple API.

Runs on Hugging Face ZeroGPU: the model is loaded lazily on first request,
and GPU time is reserved per-call via the ``@spaces.GPU`` decorator.
"""

import os

import torch
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# Lazily-initialized module-level singletons so the 7B model is loaded at most
# once per process (ZeroGPU workers are reused across requests).
tokenizer = None
model = None


def load_model():
    """Load tokenizer/model on first call; return the cached pair thereafter.

    Returns:
        tuple: ``(tokenizer, model)`` ready for inference.
    """
    global tokenizer, model
    if model is None:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        print("Model loaded!")
    return tokenizer, model


@spaces.GPU(duration=120)
def generate_response(
    message: str,
    history: list,
    system_prompt: str = "",
    temperature: float = 0.7,
    top_p: float = 0.8,
    top_k: int = 20,
    max_tokens: int = 1024,
) -> str:
    """Generate one assistant reply for *message*, given prior chat *history*.

    Args:
        message: The new user message.
        history: List of ``[user_msg, assistant_msg]`` pairs (Gradio tuples
            format); either element may be falsy and is then skipped.
        system_prompt: Optional system instruction (ignored if blank).
        temperature: Sampling temperature; ``<= 0`` selects greedy decoding.
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k sampling cutoff.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The decoded assistant reply (special tokens stripped).
    """
    tok, mdl = load_model()

    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tok([text], return_tensors="pt").to(mdl.device)

    # FIX: the UI sliders (and API callers) allow temperature == 0.0, but
    # transformers raises for do_sample=True with zero temperature. Fall back
    # to greedy decoding in that case and omit the sampling knobs entirely.
    do_sample = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_tokens,
        "do_sample": do_sample,
        "pad_token_id": tok.eos_token_id,
    }
    if do_sample:
        gen_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)

    outputs = mdl.generate(**inputs, **gen_kwargs)
    # Slice off the prompt tokens; decode only the newly generated suffix.
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return tok.decode(generated, skip_special_tokens=True)


@spaces.GPU(duration=120)
def api_generate(
    prompt: str,
    system_prompt: str = "",
    temperature: float = 0.7,
    top_p: float = 0.8,
    max_tokens: int = 1024,
) -> dict:
    """API endpoint for single-turn text generation.

    Args:
        prompt: The user prompt/question.
        system_prompt: Optional system instruction.
        temperature: Sampling temperature (0.0-2.0).
        top_p: Nucleus sampling parameter (0.0-1.0).
        max_tokens: Maximum tokens to generate.

    Returns:
        Dictionary with ``response`` (generated text or ``None``), ``status``
        (``"success"``/``"error"``), and ``error`` on failure.
    """
    try:
        response = generate_response(
            message=prompt,
            history=[],
            system_prompt=system_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )
        return {"response": response, "status": "success"}
    except Exception as e:  # boundary: surface any failure as a JSON error
        return {"response": None, "status": "error", "error": str(e)}


with gr.Blocks(title="Qwen API", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Qwen2.5-7B-Instruct API
        Powered by [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on ZeroGPU
        """
    )

    with gr.Tab("Chat"):
        chatbot = gr.Chatbot(height=450, label="Conversation")
        with gr.Row():
            msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4, lines=2)
            submit_btn = gr.Button("Send", variant="primary", scale=1)
        with gr.Accordion("Settings", open=False):
            system_prompt = gr.Textbox(label="System Prompt", placeholder="Optional", lines=2)
            with gr.Row():
                temperature = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(0.0, 1.0, 0.8, step=0.05, label="Top P")
            with gr.Row():
                top_k = gr.Slider(1, 100, 20, step=1, label="Top K")
                max_tokens = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
        clear_btn = gr.Button("Clear")

        def user_submit(message, history):
            """Append the user turn (assistant slot empty) and clear the box."""
            return "", history + [[message, None]]

        def bot_response(history, system_prompt, temperature, top_p, top_k, max_tokens):
            """Fill in the assistant reply for the last pending user turn."""
            if not history:
                return history
            message = history[-1][0]
            history_without_last = history[:-1]
            response = generate_response(
                message,
                history_without_last,
                system_prompt,
                temperature,
                top_p,
                top_k,
                max_tokens,
            )
            history[-1][1] = response
            return history

        msg.submit(user_submit, [msg, chatbot], [msg, chatbot]).then(
            bot_response,
            [chatbot, system_prompt, temperature, top_p, top_k, max_tokens],
            chatbot,
        )
        submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot]).then(
            bot_response,
            [chatbot, system_prompt, temperature, top_p, top_k, max_tokens],
            chatbot,
        )
        clear_btn.click(lambda: [], None, chatbot)

    with gr.Tab("API"):
        gr.Markdown(
            """
            ## API Usage
            ```python
            from gradio_client import Client

            client = Client("Ngixdev/qwen-api")
            result = client.predict(
                prompt="Hello!",
                system_prompt="You are helpful.",
                temperature=0.7,
                top_p=0.8,
                max_tokens=1024,
                api_name="/api_generate"
            )
            print(result)
            ```
            """
        )
        with gr.Row():
            with gr.Column():
                api_prompt = gr.Textbox(label="Prompt", lines=3)
                api_system = gr.Textbox(label="System Prompt", lines=2)
                with gr.Row():
                    api_temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
                    api_top_p = gr.Slider(0.0, 1.0, 0.8, step=0.05, label="Top P")
                api_max_tokens = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
                api_submit = gr.Button("Generate", variant="primary")
            with gr.Column():
                api_output = gr.JSON(label="Response")

        api_submit.click(
            api_generate,
            [api_prompt, api_system, api_temp, api_top_p, api_max_tokens],
            api_output,
            api_name="api_generate",
        )

demo.launch()