"""Gradio test app for HuggingFaceTB/SmolLM2-135M-Instruct.

Loads the model once at startup (CPU or CUDA), then serves a small chat-style
UI with configurable system prompt, max tokens, and temperature.
"""

import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# ── Token Resolution (Open Source friendly) ──────────────────────────────────
# Try several common env-var names; unauthenticated access still works for
# public models, just with stricter rate limits.
token = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("TEST_TOKEN")
    or os.environ.get("HUGGINGFACE_TOKEN")
    or os.environ.get("HF_API_TOKEN")
    or None
)

if not token:
    print("⚠️ No HF token found — running unauthenticated (rate limits apply)")
else:
    print("✅ HF token loaded")

# ── Model ─────────────────────────────────────────────────────────────────────
MODEL = "HuggingFaceTB/SmolLM2-135M-Instruct"

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔧 Device: {device}")

tokenizer = AutoTokenizer.from_pretrained(MODEL, token=token)
model = AutoModelForCausalLM.from_pretrained(MODEL, token=token).to(device)
print(f"✅ Model loaded: {MODEL}")


# ── Inference ─────────────────────────────────────────────────────────────────
def generate(prompt: str, max_new_tokens: int, temperature: float, system_prompt: str):
    """Run one chat-formatted generation and return (text, stats).

    Args:
        prompt: User message; a blank/whitespace prompt short-circuits.
        max_new_tokens: Upper bound on generated tokens.
        temperature: 0 → greedy decoding; >0 → sampling with top_p=0.9.
        system_prompt: Optional system message prepended to the chat.

    Returns:
        Tuple of (generated text, one-line token/device stats string).
    """
    if not prompt.strip():
        return "⚠️ Empty prompt", ""

    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Use the full tokenizer call so we also get an attention_mask — without it,
    # transformers warns and may misbehave when pad_token_id == eos_token_id.
    encoded = tokenizer(text, return_tensors="pt").to(device)
    input_tokens = encoded["input_ids"].shape[-1]

    # Only pass sampling parameters when actually sampling; temperature=None /
    # top_p=None alongside do_sample=False triggers invalid-flag warnings.
    do_sample = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = 0.9

    with torch.no_grad():
        outputs = model.generate(
            encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            **gen_kwargs,
        )

    # Slice off the prompt tokens so only the newly generated text is decoded.
    new_tokens = outputs[0][input_tokens:]
    result = tokenizer.decode(new_tokens, skip_special_tokens=True)

    stats = (
        f"Input tokens: {input_tokens} | "
        f"Output tokens: {len(new_tokens)} | Device: {device}"
    )
    return result, stats


# ── UI ────────────────────────────────────────────────────────────────────────
with gr.Blocks(title="SmolLM2 Pipeline Test", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("""
    # 🧪 SmolLM2-135M Pipeline Test
    `HuggingFaceTB/SmolLM2-135M-Instruct` — CPU/ZeroGPU fallback
    """)

    with gr.Row():
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt (optional)",
                placeholder="You are a helpful assistant.",
                lines=2,
            )
            prompt = gr.Textbox(
                label="User Prompt",
                placeholder="Was ist die Hauptstadt von Deutschland?",
                lines=4,
            )
            with gr.Row():
                max_tokens = gr.Slider(10, 300, value=150, step=10, label="Max New Tokens")
                temperature = gr.Slider(
                    0.0, 1.5, value=0.2, step=0.05, label="Temperature (0 = greedy)"
                )
            btn = gr.Button("▶ Generate", variant="primary")
        with gr.Column(scale=2):
            output = gr.Textbox(label="Output", lines=10, interactive=False)
            stats = gr.Textbox(label="Stats", lines=1, interactive=False)

    # Quick test examples
    gr.Examples(
        examples=[
            ["You are a helpful assistant.", "What is 2+2? Answer in one sentence.", 50, 0.0],
            ["", "Summarize in one sentence: The Eiffel Tower is a wrought-iron lattice tower in Paris, built in 1889.", 80, 0.2],
            ["You are a JSON API. Respond only with valid JSON.", 'Extract name and age from: "I am Klaus, 34 years old."', 100, 0.0],
            ["", "Write a Python function that reverses a string.", 150, 0.3],
        ],
        inputs=[system_prompt, prompt, max_tokens, temperature],
        label="Quick Tests",
    )

    # Note: component order here must match generate()'s signature, not the
    # Examples order above (Examples only pre-fills the input widgets).
    btn.click(
        fn=generate,
        inputs=[prompt, max_tokens, temperature, system_prompt],
        outputs=[output, stats],
    )
    prompt.submit(
        fn=generate,
        inputs=[prompt, max_tokens, temperature, system_prompt],
        outputs=[output, stats],
    )

    gr.Markdown(f"""
    ---
    **Token:** `{'✅ loaded' if token else '⚠️ not set'}` | **Model:** `{MODEL}` | **Device:** `{device}`
    """)
    gr.Markdown("""
    ### 🔗 Links & Ressourcen
    [WoS](https://www.github.com/wall-of-shames) | [CodeyLab@HF](https://hf.co/codey-lab) | **BadTin & VolkanSah**
    """)

# Guarded entry point: HF Spaces executes this file directly, so launch still
# fires there, while importing the module (e.g. for tests) stays side-effect
# free beyond model loading.
if __name__ == "__main__":
    demo.launch()