"""Gradio app exposing Qwen2.5-7B-Instruct as a chat UI and a simple API.

Runs on Hugging Face ZeroGPU: the model is loaded lazily on first request,
and GPU time is reserved per-call via the ``@spaces.GPU`` decorator.
"""

import os

import torch
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# Lazily-initialized module-level singletons so the 7B model is loaded at most
# once per process (ZeroGPU workers are reused across requests).
tokenizer = None
model = None


def load_model():
    """Load tokenizer/model on first call; return the cached pair thereafter.

    Returns:
        tuple: ``(tokenizer, model)`` ready for inference.
    """
    global tokenizer, model
    if model is None:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        print("Model loaded!")
    return tokenizer, model


@spaces.GPU(duration=120)
def generate_response(
    message: str,
    history: list,
    system_prompt: str = "",
    temperature: float = 0.7,
    top_p: float = 0.8,
    top_k: int = 20,
    max_tokens: int = 1024,
) -> str:
    """Generate one assistant reply for *message*, given prior chat *history*.

    Args:
        message: The new user message.
        history: List of ``[user_msg, assistant_msg]`` pairs (Gradio tuples
            format); either element may be falsy and is then skipped.
        system_prompt: Optional system instruction (ignored if blank).
        temperature: Sampling temperature; ``<= 0`` selects greedy decoding.
        top_p: Nucleus-sampling probability mass.
        top_k: Top-k sampling cutoff.
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        The decoded assistant reply (special tokens stripped).
    """
    tok, mdl = load_model()

    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tok([text], return_tensors="pt").to(mdl.device)

    # FIX: the UI sliders (and API callers) allow temperature == 0.0, but
    # transformers raises for do_sample=True with zero temperature. Fall back
    # to greedy decoding in that case and omit the sampling knobs entirely.
    do_sample = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_tokens,
        "do_sample": do_sample,
        "pad_token_id": tok.eos_token_id,
    }
    if do_sample:
        gen_kwargs.update(temperature=temperature, top_p=top_p, top_k=top_k)

    outputs = mdl.generate(**inputs, **gen_kwargs)
    # Slice off the prompt tokens; decode only the newly generated suffix.
    generated = outputs[0][inputs["input_ids"].shape[-1]:]
    return tok.decode(generated, skip_special_tokens=True)


@spaces.GPU(duration=120)
def api_generate(
    prompt: str,
    system_prompt: str = "",
    temperature: float = 0.7,
    top_p: float = 0.8,
    max_tokens: int = 1024,
) -> dict:
    """API endpoint for single-turn text generation.

    Args:
        prompt: The user prompt/question.
        system_prompt: Optional system instruction.
        temperature: Sampling temperature (0.0-2.0).
        top_p: Nucleus sampling parameter (0.0-1.0).
        max_tokens: Maximum tokens to generate.

    Returns:
        Dictionary with ``response`` (generated text or ``None``), ``status``
        (``"success"``/``"error"``), and ``error`` on failure.
    """
    try:
        response = generate_response(
            message=prompt,
            history=[],
            system_prompt=system_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )
        return {"response": response, "status": "success"}
    except Exception as e:  # boundary: surface any failure as a JSON error
        return {"response": None, "status": "error", "error": str(e)}


with gr.Blocks(title="Qwen API", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Qwen2.5-7B-Instruct API
        Powered by [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on ZeroGPU
        """
    )

    with gr.Tab("Chat"):
        chatbot = gr.Chatbot(height=450, label="Conversation")
        with gr.Row():
            msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4, lines=2)
            submit_btn = gr.Button("Send", variant="primary", scale=1)
        with gr.Accordion("Settings", open=False):
            system_prompt = gr.Textbox(label="System Prompt", placeholder="Optional", lines=2)
            with gr.Row():
                temperature = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(0.0, 1.0, 0.8, step=0.05, label="Top P")
            with gr.Row():
                top_k = gr.Slider(1, 100, 20, step=1, label="Top K")
                max_tokens = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
        clear_btn = gr.Button("Clear")

        def user_submit(message, history):
            """Append the user turn (assistant slot empty) and clear the box."""
            return "", history + [[message, None]]

        def bot_response(history, system_prompt, temperature, top_p, top_k, max_tokens):
            """Fill in the assistant reply for the last pending user turn."""
            if not history:
                return history
            message = history[-1][0]
            history_without_last = history[:-1]
            response = generate_response(
                message,
                history_without_last,
                system_prompt,
                temperature,
                top_p,
                top_k,
                max_tokens,
            )
            history[-1][1] = response
            return history

        msg.submit(user_submit, [msg, chatbot], [msg, chatbot]).then(
            bot_response,
            [chatbot, system_prompt, temperature, top_p, top_k, max_tokens],
            chatbot,
        )
        submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot]).then(
            bot_response,
            [chatbot, system_prompt, temperature, top_p, top_k, max_tokens],
            chatbot,
        )
        clear_btn.click(lambda: [], None, chatbot)

    with gr.Tab("API"):
        gr.Markdown(
            """
            ## API Usage
            ```python
            from gradio_client import Client

            client = Client("Ngixdev/qwen-api")
            result = client.predict(
                prompt="Hello!",
                system_prompt="You are helpful.",
                temperature=0.7,
                top_p=0.8,
                max_tokens=1024,
                api_name="/api_generate"
            )
            print(result)
            ```
            """
        )
        with gr.Row():
            with gr.Column():
                api_prompt = gr.Textbox(label="Prompt", lines=3)
                api_system = gr.Textbox(label="System Prompt", lines=2)
                with gr.Row():
                    api_temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
                    api_top_p = gr.Slider(0.0, 1.0, 0.8, step=0.05, label="Top P")
                api_max_tokens = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
                api_submit = gr.Button("Generate", variant="primary")
            with gr.Column():
                api_output = gr.JSON(label="Response")

        api_submit.click(
            api_generate,
            [api_prompt, api_system, api_temp, api_top_p, api_max_tokens],
            api_output,
            api_name="api_generate",
        )

demo.launch()