# app.py — SmolLM2-135M Gradio test space (commit 834cd01)
import os
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
# ── Token Resolution (Open Source friendly) ──────────────────────────────────
# Check the conventional env-var names in priority order so the Space works
# whether the token was configured as HF_TOKEN, TEST_TOKEN, etc.
token = (
    os.environ.get("HF_TOKEN") or
    os.environ.get("TEST_TOKEN") or
    os.environ.get("HUGGINGFACE_TOKEN") or
    os.environ.get("HF_API_TOKEN") or
    None
)
if not token:
    # fixed mojibake em-dash ("β€”") in the original message
    print("⚠️ No HF token found — running unauthenticated (rate limits apply)")
else:
    print("✅ HF token loaded")  # fixed mojibake check mark ("βœ…")
# ── Model ─────────────────────────────────────────────────────────────────────
MODEL = "HuggingFaceTB/SmolLM2-135M-Instruct"
# Prefer GPU when available; the 135M model also runs acceptably on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔧 Device: {device}")  # fixed mojibake wrench ("πŸ”§")
# `token` may be None — from_pretrained then runs unauthenticated.
tokenizer = AutoTokenizer.from_pretrained(MODEL, token=token)
model = AutoModelForCausalLM.from_pretrained(MODEL, token=token).to(device)
print(f"✅ Model loaded: {MODEL}")  # fixed mojibake check mark ("βœ…")
# ── Inference ─────────────────────────────────────────────────────────────────
def generate(prompt: str, max_new_tokens: int, temperature: float, system_prompt: str):
    """Run one chat-style completion with the module-level model/tokenizer.

    Args:
        prompt: User message; a blank/whitespace prompt short-circuits.
        max_new_tokens: Generation budget. Gradio sliders can deliver floats,
            so the value is cast to int before reaching ``generate()``.
        temperature: 0 means greedy decoding; > 0 enables sampling (top_p=0.9).
        system_prompt: Optional system message prepended to the chat.

    Returns:
        Tuple ``(generated_text, stats_line)`` for the two Gradio outputs.
    """
    if not prompt.strip():
        return "⚠️ Empty prompt", ""
    messages = []
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Tokenize via __call__ (not .encode) so we also get an attention mask;
    # without one, generate() has to guess padding because pad_token_id is
    # set to eos_token_id below.
    enc = tokenizer(text, return_tensors="pt").to(device)
    input_tokens = enc["input_ids"].shape[-1]
    sampling = temperature > 0
    with torch.no_grad():
        outputs = model.generate(
            **enc,
            max_new_tokens=int(max_new_tokens),  # slider value may be a float
            temperature=temperature if sampling else None,
            do_sample=sampling,
            top_p=0.9 if sampling else None,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Strip the prompt tokens; decode only the newly generated continuation.
    new_tokens = outputs[0][input_tokens:]
    result = tokenizer.decode(new_tokens, skip_special_tokens=True)
    stats = f"Input tokens: {input_tokens} | Output tokens: {len(new_tokens)} | Device: {device}"
    return result, stats
# ── UI ────────────────────────────────────────────────────────────────────────
# ── UI ────────────────────────────────────────────────────────────────────────
# Mojibake in the visible strings below is repaired: "πŸ§ͺ"→"🧪", "β€”"→"—",
# "β–Ά"→"▶", "βœ…"→"✅", "πŸ”—"→"🔗".
with gr.Blocks(title="SmolLM2 Pipeline Test", theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("""
# 🧪 SmolLM2-135M Pipeline Test
`HuggingFaceTB/SmolLM2-135M-Instruct` — CPU/ZeroGPU fallback
""")
    with gr.Row():
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt (optional)",
                placeholder="You are a helpful assistant.",
                lines=2,
            )
            prompt = gr.Textbox(
                label="User Prompt",
                placeholder="Was ist die Hauptstadt von Deutschland?",
                lines=4,
            )
            with gr.Row():
                max_tokens = gr.Slider(10, 300, value=150, step=10, label="Max New Tokens")
                temperature = gr.Slider(0.0, 1.5, value=0.2, step=0.05, label="Temperature (0 = greedy)")
            btn = gr.Button("▶ Generate", variant="primary")
        with gr.Column(scale=2):
            output = gr.Textbox(label="Output", lines=10, interactive=False)
            stats = gr.Textbox(label="Stats", lines=1, interactive=False)
    # Quick test examples (columns match the `inputs` order below, which
    # intentionally differs from generate()'s parameter order).
    gr.Examples(
        examples=[
            ["You are a helpful assistant.", "What is 2+2? Answer in one sentence.", 50, 0.0],
            ["", "Summarize in one sentence: The Eiffel Tower is a wrought-iron lattice tower in Paris, built in 1889.", 80, 0.2],
            ["You are a JSON API. Respond only with valid JSON.", 'Extract name and age from: "I am Klaus, 34 years old."', 100, 0.0],
            ["", "Write a Python function that reverses a string.", 150, 0.3],
        ],
        inputs=[system_prompt, prompt, max_tokens, temperature],
        label="Quick Tests",
    )
    # Both click and Enter-submit route through the same handler/outputs.
    btn.click(fn=generate, inputs=[prompt, max_tokens, temperature, system_prompt], outputs=[output, stats])
    prompt.submit(fn=generate, inputs=[prompt, max_tokens, temperature, system_prompt], outputs=[output, stats])
    gr.Markdown(f"""
---
**Token:** `{'✅ loaded' if token else '⚠️ not set'}` |
**Model:** `{MODEL}` |
**Device:** `{device}`
""")
    gr.Markdown("""
### 🔗 Links & Ressourcen
[WoS](https://www.github.com/wall-of-shames) | [CodeyLab@HF](https://hf.co/codey-lab) | **BadTin & VolkanSah**
""")

demo.launch()