import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import os

# Set the environment explicitly so Gradio knows it is running on Spaces
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

repo_id = "unsloth/gemma-4-E2B-it-GGUF"
filename = "gemma-4-E2B-it-Q4_K_M.gguf"

print("Loading model...")
model_path = hf_hub_download(repo_id=repo_id, filename=filename)
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2, chat_format="gemma")


def chat_with_gemma(prompt, history):
    # Replay prior turns so the model sees the full conversation context
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": prompt})

    stream = llm.create_chat_completion(messages=messages, max_tokens=512, stream=True)
    response = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            yield response


# Use the simplest Blocks layout to sidestep the Jinja2 errors raised by the
# more complex ChatInterface template
with gr.Blocks() as demo:
    gr.Markdown("# Gemma 4 E2B (Docker CPU edition)")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()

    def respond(message, chat_history):
        # Stream partial replies by reusing the generator above; update the
        # last history entry in place so the UI refreshes on every chunk
        # (the original append/yield/pop pattern dropped the final exchange)
        chat_history.append((message, ""))
        for partial in chat_with_gemma(message, chat_history[:-1]):
            chat_history[-1] = (message, partial)
            yield "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

if __name__ == "__main__":
    # Skip all parameter validation: this single line is enough, and on a
    # Hugging Face Space it completes the configuration automatically
    demo.launch(debug=True)
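
# Optional smoke test (my own sketch, not part of the original app): exercise
# chat_with_gemma() directly, without the Gradio UI, to confirm the GGUF model
# loads and streams tokens. The prompt below is an arbitrary example; call
# _smoke_test() manually, e.g. from a REPL, instead of launching the demo.
def _smoke_test():
    reply = ""
    for partial in chat_with_gemma("Say hello in one short sentence.", []):
        reply = partial  # each yield carries the accumulated reply so far
    print(reply)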