# app_gradio_llava_onevision.py
# Goal: frictionless Gradio UX for llava-onevision-qwen2-0.5b-ov-hf

from packaging import version
import transformers
from transformers import pipeline
import torch
import gradio as gr
from PIL import Image

# -------- Governance: ensure task support exists --------
MIN_TRANSFORMERS = "4.46.0"
if version.parse(transformers.__version__) < version.parse(MIN_TRANSFORMERS):
    raise RuntimeError(
        f"Transformers >= {MIN_TRANSFORMERS} is required for task 'image-text-to-text'. "
        f"Found {transformers.__version__}. Upgrade via:\n"
        f"  pip install -U 'transformers>={MIN_TRANSFORMERS},<5'"
    )

MODEL_ID = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"

# -------- Device & dtype strategy --------
# fp16 on CUDA and Apple-silicon MPS; fp32 on CPU (fp16 CPU inference is
# slow and often unsupported by kernels).
if torch.cuda.is_available():
    torch_dtype = torch.float16
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    torch_dtype = torch.float16
else:
    torch_dtype = torch.float32

# -------- Bootstrap pipeline --------
# BUGFIX: pass the computed `torch_dtype` above. The original recomputed
# `torch.float16 if torch.cuda.is_available() else torch.float32` inline,
# silently dropping the MPS fp16 branch, and used the kwarg name `dtype`,
# whereas the documented pipeline kwarg in the pinned 4.46+ range is
# `torch_dtype`.
pipe = pipeline(
    "image-text-to-text",
    model=MODEL_ID,
    device_map="auto",
    torch_dtype=torch_dtype,
    # The processor will honor this if supported by the model's saved config:
    use_fast=True,
)


def _extract_text(obj) -> str:
    """Normalize a pipeline output of unknown nesting into a plain string.

    Handles:
      - {'generated_text': '...'}
      - {'generated_text': [{'role': 'assistant', 'content': '...'}, ...]}
      - [{'generated_text': '...'}], nested lists, or raw strings
    """
    if obj is None:
        return ""
    # Already a string
    if isinstance(obj, str):
        return obj
    if isinstance(obj, dict):
        gen = obj.get("generated_text")
        # Case A: direct string
        if isinstance(gen, str):
            return gen
        # Case B: list of chat turns -> find the last assistant message
        if isinstance(gen, (list, tuple)) and gen:
            for turn in reversed(gen):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    content = turn.get("content")
                    # Some models return list[str] or str here; normalize.
                    if isinstance(content, list):
                        return " ".join(map(str, content))
                    return str(content) if content is not None else ""
            # Fallback: stringify first element
            return _extract_text(gen[0])
        # Other keys sometimes used by models
        if "text" in obj and isinstance(obj["text"], str):
            return obj["text"]
        # Last resort for dicts
        return str(obj)
    # List/tuple: drill down into the first element
    if isinstance(obj, (list, tuple)) and obj:
        return _extract_text(obj[0])
    # Fallback
    return str(obj)


def infer(image: Image.Image, question: str) -> str:
    """Answer `question` about `image` with the LLaVA OneVision pipeline.

    Validation failures are returned as friendly strings (Gradio displays
    the return value), never raised.
    """
    if image is None:
        return "Please upload an image."
    q = (question or "").strip()
    if not q:
        return "Please enter a question."

    # Preferred: chat-style messages (ensures image tokens align).
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": q},
        ],
    }]
    try:
        out = pipe(text=messages, max_new_tokens=128)
    except Exception:
        # Deliberate best-effort fallback for older pipeline call
        # conventions: dict API -- images must be a LIST.
        out = pipe({"images": [image], "text": q}, max_new_tokens=128)

    return _extract_text(out).strip()


# -------- Gradio UX --------
with gr.Blocks(title="LLaVA OneVision Qwen2 0.5B — Image Q&A") as demo:
    gr.Markdown("# 🖼️🔎 LLaVA OneVision Qwen2 0.5B — Image Q&A")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            prompt = gr.Textbox(
                label="Question",
                placeholder="e.g., What animal is on the candy?",
                lines=2,
            )
            submit = gr.Button("Ask")
    output = gr.TextArea(label="Answer", lines=6)
    submit.click(infer, [img, prompt], output)
    prompt.submit(infer, [img, prompt], output)

if __name__ == "__main__":
    demo.queue().launch(debug=True)