# app_gradio_llava_onevision.py
# Goal: frictionless Gradio UX for llava-onevision-qwen2-0.5b-ov-hf

from packaging import version
import transformers
from transformers import pipeline
import torch
import gradio as gr
from PIL import Image

# -------- Governance: ensure task support exists --------
MIN_TRANSFORMERS = "4.46.0"
if version.parse(transformers.__version__) < version.parse(MIN_TRANSFORMERS):
    raise RuntimeError(
        f"Transformers >= {MIN_TRANSFORMERS} is required for task 'image-text-to-text'. "
        f"Found {transformers.__version__}. Upgrade via:\n"
        f"  pip install -U 'transformers>={MIN_TRANSFORMERS},<5'"
    )

MODEL_ID = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

# -------- Device & dtype strategy --------
# fp16 on CUDA and Apple-silicon MPS; fp32 on CPU (fp16 CPU inference is
# slow and often unsupported by kernels).
if torch.cuda.is_available():
    torch_dtype = torch.float16
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

# -------- Bootstrap pipeline --------
# BUGFIX: pass the computed `torch_dtype` above. The original recomputed
# `torch.float16 if torch.cuda.is_available() else torch.float32` inline,
# silently dropping the MPS fp16 branch, and used the kwarg name `dtype`,
# whereas the documented pipeline kwarg in the pinned 4.46+ range is
# `torch_dtype`.
pipe = pipeline(
    "image-text-to-text",
    model=MODEL_ID,
    device_map="auto",
    torch_dtype=torch_dtype,
    # The processor will honor this if supported by the model's saved config:
    use_fast=True,
)


def _extract_text(obj) -> str:
    """Normalize a pipeline output of unknown nesting into a plain string.

    Handles:
      - {'generated_text': '...'}
      - {'generated_text': [{'role': 'assistant', 'content': '...'}, ...]}
      - [{'generated_text': '...'}], nested lists, or raw strings
    """
    if obj is None:
        return ""
    # Already a string
    if isinstance(obj, str):
        return obj
    if isinstance(obj, dict):
        gen = obj.get("generated_text")
        # Case A: direct string
        if isinstance(gen, str):
            return gen
        # Case B: list of chat turns -> find the last assistant message
        if isinstance(gen, (list, tuple)) and gen:
            for turn in reversed(gen):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    content = turn.get("content")
                    # Some models return list[str] or str here; normalize.
                    if isinstance(content, list):
                        return " ".join(map(str, content))
                    return str(content) if content is not None else ""
            # Fallback: stringify first element
            return _extract_text(gen[0])
        # Other keys sometimes used by models
        if "text" in obj and isinstance(obj["text"], str):
            return obj["text"]
        # Last resort for dicts
        return str(obj)
    # List/tuple: drill down into the first element
    if isinstance(obj, (list, tuple)) and obj:
        return _extract_text(obj[0])
    # Fallback
    return str(obj)


def infer(image: Image.Image, question: str) -> str:
    """Answer `question` about `image` with the LLaVA OneVision pipeline.

    Validation failures are returned as friendly strings (Gradio displays
    the return value), never raised.
    """
    if image is None:
        return "Please upload an image."
    q = (question or "").strip()
    if not q:
        return "Please enter a question."

    # Preferred: chat-style messages (ensures image tokens align).
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": q},
        ],
    }]
    try:
        out = pipe(text=messages, max_new_tokens=128)
    except Exception:
        # Deliberate best-effort fallback for older pipeline call
        # conventions: dict API -- images must be a LIST.
        out = pipe({"images": [image], "text": q}, max_new_tokens=128)

    return _extract_text(out).strip()


# -------- Gradio UX --------
with gr.Blocks(title="LLaVA OneVision Qwen2 0.5B — Image Q&A") as demo:
    gr.Markdown("# 🖼️🔎 LLaVA OneVision Qwen2 0.5B — Image Q&A")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            prompt = gr.Textbox(
                label="Question",
                placeholder="e.g., What animal is on the candy?",
                lines=2,
            )
            submit = gr.Button("Ask")
    output = gr.TextArea(label="Answer", lines=6)
    submit.click(infer, [img, prompt], output)
    prompt.submit(infer, [img, prompt], output)

if __name__ == "__main__":
    demo.queue().launch(debug=True)