import os
import re
import gc
import json
import time
import base64
from io import BytesIO
from threading import Thread
from typing import List, Dict, Any, Optional
import gradio as gr
import numpy as np
import torch
import spaces
from PIL import Image, ImageDraw, ImageFont
from transformers import (
Qwen2_5_VLForConditionalGeneration,
AutoProcessor,
AutoModelForImageTextToText,
AutoModelForVision2Seq,
)
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from qwen_vl_utils import process_vision_info
# Accent colour interpolated into the UI stylesheet.
ACCENT = "#FFFF00"

# Base limit for the instruction length; can be overridden via the environment.
MAX_INPUT_TEXT_LENGTH = int(os.environ.get("MAX_INPUT_TEXT_LENGTH", "2048"))

# Use the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Start-up diagnostics describing the torch / CUDA environment.
print(f"Running on device: {device}")
print(f"torch.__version__ = {torch.__version__}")
print(f"torch.version.cuda = {torch.version.cuda}")
print(f"cuda available: {torch.cuda.is_available()}")
print(f"cuda device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"current device: {torch.cuda.current_device()}")
    print(f"device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
def _load_model_pair(label, model_id, model_cls, gpu_dtype, processor_kwargs=None, model_kwargs=None):
    """Load the processor and model for ``model_id``, tolerating failure.

    Args:
        label: Short name used in the failure log line.
        model_id: Hub repository id handed to ``from_pretrained``.
        model_cls: Transformers class whose ``from_pretrained`` builds the model.
        gpu_dtype: dtype used when CUDA is available; ``torch.float32`` is used
            on the CPU.
        processor_kwargs: Extra keyword arguments for
            ``AutoProcessor.from_pretrained``.
        model_kwargs: Extra keyword arguments for ``model_cls.from_pretrained``.

    Returns:
        ``(processor, model)`` with the model moved to ``device`` and switched
        to eval mode, or ``(None, None)`` when any step raises.
    """
    try:
        processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
            **(processor_kwargs or {}),
        )
        model = model_cls.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=gpu_dtype if torch.cuda.is_available() else torch.float32,
            **(model_kwargs or {}),
        ).to(device).eval()
    except Exception as e:
        # Best effort: a model that cannot be loaded is reported and disabled
        # instead of aborting the whole application at import time.
        print(f"Failed to load {label}: {e}")
        return None, None
    return processor, model


print("🔄 Loading Fara-7B...")
MODEL_ID_V = "microsoft/Fara-7B"
processor_v, model_v = _load_model_pair(
    "Fara",
    MODEL_ID_V,
    Qwen2_5_VLForConditionalGeneration,
    torch.float16,
)

print("🔄 Loading UI-TARS-1.5-7B...")
MODEL_ID_X = "ByteDance-Seed/UI-TARS-1.5-7B"
processor_x, model_x = _load_model_pair(
    "UI-TARS",
    MODEL_ID_X,
    AutoModelForImageTextToText,
    torch.bfloat16,
    processor_kwargs={"use_fast": False},
)

print("🔄 Loading Holo2-4B...")
MODEL_ID_H = "Hcompany/Holo2-4B"
processor_h, model_h = _load_model_pair(
    "Holo2",
    MODEL_ID_H,
    AutoModelForImageTextToText,
    torch.bfloat16,
)

print("🔄 Loading ActIO-UI-7B...")
MODEL_ID_ACT = "Uniphore/actio-ui-7b-rlvr"
processor_act, model_act = _load_model_pair(
    "ActIO-UI",
    MODEL_ID_ACT,
    AutoModelForVision2Seq,
    torch.float16,
    model_kwargs={"device_map": None},
)
print("✅ Models loading sequence complete.")

# Display name -> (processor, model). Both members are None when loading failed.
MODEL_MAP = {
    "Fara-7B": (processor_v, model_v),
    "UI-TARS-1.5-7B": (processor_x, model_x),
    "Holo2-4B": (processor_h, model_h),
    "ActIO-UI-7B": (processor_act, model_act),
}
MODEL_CHOICES = list(MODEL_MAP)

# Example gallery entries: instruction, screenshot path and the model to run.
image_examples = [
    {"query": example_query, "image": example_path, "model": example_model}
    for example_query, example_path, example_model in (
        ("Click on the Fara-7B model.", "examples/1.png", "Fara-7B"),
        ("Click on the VLMs Collection", "examples/2.png", "UI-TARS-1.5-7B"),
        ("Click on the 'SAM3'.", "examples/3.png", "Holo2-4B"),
        ("Click on the Fara-7B model.", "examples/1.png", "ActIO-UI-7B"),
    )
]
def pil_to_data_url(img: Image.Image, fmt="PNG"):
    """Encode ``img`` in format ``fmt`` and return it as a base64 ``data:`` URL.

    The MIME type is ``image/png`` when ``fmt`` is PNG (case-insensitive) and
    ``image/jpeg`` for every other format name.
    """
    stream = BytesIO()
    img.save(stream, format=fmt)
    encoded = base64.b64encode(stream.getvalue()).decode()
    if fmt.upper() == "PNG":
        mime_type = "image/png"
    else:
        mime_type = "image/jpeg"
    return "data:" + mime_type + ";base64," + encoded
def file_to_data_url(path):
    """Read an image file and return its content as a base64 ``data:`` URL.

    Args:
        path: Filesystem path of the image (string or path-like).

    Returns:
        ``data:<mime>;base64,<payload>``, or ``""`` when ``path`` is empty, is
        not a regular file (missing, or a directory) or cannot be read. The
        MIME type is chosen from the extension (jpg, jpeg, png, webp) and
        defaults to ``image/jpeg``.
    """
    # isfile() rather than exists(): a directory exists but cannot be opened,
    # and callers treat "" as the "could not load" signal.
    if not path or not os.path.isfile(path):
        return ""
    ext = os.path.splitext(path)[1].lstrip(".").lower()
    mime = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "png": "image/png",
        "webp": "image/webp",
    }.get(ext, "image/jpeg")
    try:
        with open(path, "rb") as f:
            raw = f.read()
    except OSError as e:
        # The file vanished or is unreadable: report it and use the same
        # empty-string contract as for a missing file.
        print("Could not read image file:", e)
        return ""
    data = base64.b64encode(raw).decode()
    return f"data:{mime};base64,{data}"
def make_thumb_b64(path, max_dim=240):
    """Return a JPEG data URL of a thumbnail of the image at ``path``.

    The image is converted to RGB and shrunk in place to fit inside a
    ``max_dim`` x ``max_dim`` box. Any failure is printed and an empty string
    is returned instead.
    """
    bounding_box = (max_dim, max_dim)
    try:
        thumb = Image.open(path).convert("RGB")
        thumb.thumbnail(bounding_box)
        data_url = pil_to_data_url(thumb, "JPEG")
    except Exception as err:
        print("Thumbnail error:", err)
        data_url = ""
    return data_url
def b64_to_pil(b64_str):
    """Decode a base64 string (optionally a ``data:`` URL) into an RGB image.

    Returns ``None`` for empty input and for anything that cannot be decoded
    or opened as an image.
    """
    if not b64_str:
        return None
    try:
        # For data URLs only the part after the first comma is the payload.
        payload = b64_str.split(",", 1)[1] if b64_str.startswith("data:") else b64_str
        raw_bytes = base64.b64decode(payload)
        return Image.open(BytesIO(raw_bytes)).convert("RGB")
    except Exception:
        return None
def build_example_cards_html():
    """Build the example-gallery fragment, one entry per ``image_examples`` item.

    Each entry is formatted from the example's thumbnail, its model name and
    its query shortened to at most 72 characters.

    NOTE(review): the template below looks as if its HTML markup was lost
    (only the interpolated fields and the word "Preview" remain) -- confirm
    against the original file before relying on this function.
    """
    cards = ""
    for i, ex in enumerate(image_examples):
        # Thumbnail as a data URL; make_thumb_b64 returns "" on failure.
        thumb = make_thumb_b64(ex["image"])
        # Cut long queries at 72 characters and mark the cut with "...".
        prompt_short = ex["query"][:72] + ("..." if len(ex["query"]) > 72 else "")
        cards += f"""
{"
" if thumb else "
Preview
"}
{ex["model"]}
{prompt_short}
"""
    return cards
# Built once at import time.
EXAMPLE_CARDS_HTML = build_example_cards_html()
def load_example_data(idx_str):
    """Return a Gradio update carrying the JSON description of one example.

    ``idx_str`` is parsed as an index into ``image_examples``. The JSON has
    ``status == "ok"`` plus ``query``, ``image`` (data URL), ``model`` and
    ``name`` on success, or ``status == "error"`` with a ``message`` when the
    index is invalid, out of range, or the image cannot be loaded.
    """

    def _reply(payload):
        return gr.update(value=json.dumps(payload))

    def _failure(message):
        return _reply({"status": "error", "message": message})

    try:
        position = int(str(idx_str).strip())
    except Exception:
        return _failure("Invalid example index")

    if not 0 <= position < len(image_examples):
        return _failure("Example index out of range")

    example = image_examples[position]
    encoded_image = file_to_data_url(example["image"])
    if not encoded_image:
        return _failure("Could not load example image")

    return _reply({
        "status": "ok",
        "query": example["query"],
        "image": encoded_image,
        "model": example["model"],
        "name": os.path.basename(example["image"]),
    })
def get_image_proc_params(processor) -> Dict[str, int]:
    """Collect the resize parameters of ``processor.image_processor``.

    Missing attributes fall back to patch size 14, merge size 2 and a pixel
    budget of 256*256 .. 1280*1280. When the image processor exposes a ``size``
    dict, its ``shortest_edge`` / ``longest_edge`` entries override the
    minimum / maximum pixel counts. ``None`` pixel limits are replaced by the
    defaults.
    """
    fallback_min = 256 * 256
    fallback_max = 1280 * 1280
    image_proc = getattr(processor, "image_processor", None)

    params = {
        "patch_size": getattr(image_proc, "patch_size", 14),
        "merge_size": getattr(image_proc, "merge_size", 2),
        "min_pixels": getattr(image_proc, "min_pixels", fallback_min),
        "max_pixels": getattr(image_proc, "max_pixels", fallback_max),
    }

    size_cfg = getattr(image_proc, "size", {})
    if isinstance(size_cfg, dict):
        overrides = (("shortest_edge", "min_pixels"), ("longest_edge", "max_pixels"))
        for edge_key, param_key in overrides:
            if edge_key in size_cfg:
                params[param_key] = size_cfg[edge_key]

    # A configured-but-empty limit is treated like a missing one.
    for param_key, fallback in (("min_pixels", fallback_min), ("max_pixels", fallback_max)):
        if params[param_key] is None:
            params[param_key] = fallback
    return params
def apply_chat_template_compat(processor, messages: List[Dict[str, Any]], thinking: bool = True) -> str:
    """Render ``messages`` to prompt text with whichever chat template exists.

    The processor's own ``apply_chat_template`` is tried first with the
    ``thinking`` flag and, if that raises ``TypeError``, again without it.
    Processors lacking the method fall back to ``processor.tokenizer``.

    Raises:
        AttributeError: when neither object offers ``apply_chat_template``.
    """
    base_kwargs = {"tokenize": False, "add_generation_prompt": True}

    if hasattr(processor, "apply_chat_template"):
        try:
            return processor.apply_chat_template(messages, **base_kwargs, thinking=thinking)
        except TypeError:
            # Retry without the ``thinking`` argument.
            return processor.apply_chat_template(messages, **base_kwargs)

    tokenizer = getattr(processor, "tokenizer", None)
    if tokenizer is None or not hasattr(tokenizer, "apply_chat_template"):
        raise AttributeError("Could not apply chat template.")
    return tokenizer.apply_chat_template(messages, **base_kwargs)
def trim_generated(generated_ids, inputs):
    """Strip the prompt prefix from every generated sequence.

    ``inputs`` may expose ``input_ids`` as an attribute or, when it is a dict,
    as a key. Each output sequence loses as many leading items as its matching
    input sequence is long. Without ``input_ids`` the generated ids are
    returned unchanged.
    """
    prompt_ids = getattr(inputs, "input_ids", None)
    if prompt_ids is None and isinstance(inputs, dict):
        prompt_ids = inputs.get("input_ids", None)
    if prompt_ids is None:
        return generated_ids

    trimmed = []
    for prompt_seq, full_seq in zip(prompt_ids, generated_ids):
        trimmed.append(full_seq[len(prompt_seq):])
    return trimmed
def get_fara_prompt(task, image):
    """Build the system + user chat messages for the Fara model.

    The system message describes the JSON action format; the user message
    carries the screenshot followed by ``"Instruction: <task>"``.
    """
    # NOTE(review): "inside a block" names no delimiter -- check whether a tag
    # name was lost from this prompt; the text is reproduced as found.
    system_lines = (
        'You are a GUI agent. You are given a task and a screenshot of the current status.',
        'You need to generate the next action to complete the task.',
        'Output your action inside a block using JSON format.',
        'Include "coordinate": [x, y] in pixels for interactions.',
        'Examples:',
        '{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}',
        '{"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}',
        '',
    )
    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": "\n".join(system_lines)}],
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": f"Instruction: {task}"},
        ],
    }
    return [system_message, user_message]
def get_localization_prompt(task, image):
    """Build the single user message asking for a ``Click(x, y)`` position.

    The message holds the screenshot followed by the fixed guidelines and,
    on the next line, the task text.
    """
    guidelines = " ".join((
        "Localize an element on the GUI image according to my instructions and",
        "output a click position as Click(x, y) with x num pixels from the left edge",
        "and y num pixels from the top edge.",
    ))
    image_part = {"type": "image", "image": image}
    text_part = {"type": "text", "text": f"{guidelines}\n{task}"}
    return [{"role": "user", "content": [image_part, text_part]}]
def get_holo2_prompt(task, image):
    """Build the single user message for Holo2 click localization.

    The text asks for a JSON object matching the ``ClickCoordinates`` schema
    (integer ``x`` and ``y`` normalized between 0 and 1000) and ends with the
    task on its own line.
    """
    schema_str = (
        '{"properties": {'
        '"x": {"description": "The x coordinate, normalized between 0 and 1000.", "ge": 0, "le": 1000, "title": "X", "type": "integer"}, '
        '"y": {"description": "The y coordinate, normalized between 0 and 1000.", "ge": 0, "le": 1000, "title": "Y", "type": "integer"}}, '
        '"required": ["x", "y"], "title": "ClickCoordinates", "type": "object"}'
    )
    prompt = "\n".join((
        "Localize an element on the GUI image according to the provided target and output a click position.",
        f"* You must output a valid JSON following the format: {schema_str}",
        "Your target is:",
    ))
    user_content = [
        {"type": "image", "image": image},
        {"type": "text", "text": f"{prompt}\n{task}"},
    ]
    return [{"role": "user", "content": user_content}]
def get_actio_prompt(task, image):
    """Build the system + user chat messages for the ActIO-UI model.

    The system message is a plain string; the user message lists the
    instruction text first and the screenshot second.
    """
    system_prompt = " ".join((
        "You are a GUI agent. You are given a task and a screenshot of the screen.",
        "You need to perform a series of pyautogui actions to complete the task.",
    ))
    instruction_prefix = (
        "Please perform the following task by providing the action and the coordinates in the format of (x, y): "
    )
    instruction_text = instruction_prefix + task

    system_message = {"role": "system", "content": system_prompt}
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": instruction_text},
            {"type": "image", "image": image},
        ],
    }
    return [system_message, user_message]
def parse_click_response(text: str) -> List[Dict]:
    """Extract click coordinates from free-form model output.

    Three notations are searched case-insensitively, in this order:
    ``click(x, y)`` (also ``left_click`` / ``right_click`` / ``double_click``),
    ``point=[x, y]`` and ``start_box='(x, y)'``. Only when none of them
    matches is a bare ``(x, y)`` tuple accepted. Every hit becomes a ``click``
    action with integer pixel coordinates and ``norm`` set to ``False``.
    """

    def _as_click(pair):
        return {"type": "click", "x": int(pair[0]), "y": int(pair[1]), "text": "", "norm": False}

    cleaned = text.strip()
    explicit_patterns = (
        r"(?:click|left_click|right_click|double_click)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)",
        r"point=\[\s*(\d+)\s*,\s*(\d+)\s*\]",
        r"start_box=['\"]?\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]?",
    )

    found = []
    for pattern in explicit_patterns:
        found.extend(_as_click(pair) for pair in re.findall(pattern, cleaned, re.IGNORECASE))

    if found:
        return found

    # Last resort: a standalone "(x, y)" tuple surrounded by whitespace.
    bare_tuple = r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)"
    return [_as_click(pair) for pair in re.findall(bare_tuple, cleaned)]
def parse_fara_response(response: str) -> List[Dict]:
    """Extract Fara actions from the JSON tool-call objects in ``response``.

    Every JSON object found in the text is inspected, whether it stands alone
    or is wrapped in surrounding tags or prose. An object contributes an action
    when its ``"arguments"`` dict carries a two-element ``"coordinate"``.

    Objects are located with ``json.JSONDecoder.raw_decode`` instead of a
    delimiter regex: JSON payloads contain spaces and newlines, so splitting
    the text on a delimiter character cannot recover them.

    Args:
        response: Decoded model output.

    Returns:
        A list of ``{"type", "x", "y", "text", "norm"}`` dicts in order of
        appearance. ``type`` defaults to ``"unknown"``, ``text`` to ``""``,
        coordinates are floats and ``norm`` is always ``False``.
    """
    actions: List[Dict] = []
    if not response:
        return actions

    decoder = json.JSONDecoder()
    cursor = 0
    while True:
        start = response.find("{", cursor)
        if start == -1:
            break
        try:
            data, cursor = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            # Not a JSON object at this brace; resume just after it so a
            # well-formed object later in the text is still found.
            cursor = start + 1
            continue

        args = data.get("arguments")
        if not isinstance(args, dict):
            continue
        coords = args.get("coordinate")
        if not isinstance(coords, (list, tuple)) or len(coords) != 2:
            continue
        try:
            x, y = float(coords[0]), float(coords[1])
        except (TypeError, ValueError) as e:
            print(f"Error parsing Fara JSON: {e}")
            continue
        actions.append({
            "type": args.get("action", "unknown"),
            "x": x,
            "y": y,
            "text": args.get("text", ""),
            "norm": False,
        })
    return actions
def parse_holo2_response(response: str) -> List[Dict]:
    """Extract one normalized click from a Holo2 response.

    The whole response is first tried as JSON with ``x`` and ``y`` members
    (action text ``"*"``). If that does not yield a click, the first
    ``{"x": <int>, "y": <int>}`` fragment anywhere in the text is used (action
    text ``"Holo2"``). Clicks are flagged ``norm=True``; an empty list means
    nothing was found.
    """

    def _normalized_click(x, y, label):
        return {"type": "click", "x": int(x), "y": int(y), "text": label, "norm": True}

    try:
        payload = json.loads(response.strip())
        if "x" in payload and "y" in payload:
            return [_normalized_click(payload["x"], payload["y"], "*")]
    except Exception:
        # Not clean JSON (or not the expected shape): try the regex below.
        pass

    fragment = re.search(
        r"\{\s*['\"]x['\"]\s*:\s*(\d+)\s*,\s*['\"]y['\"]\s*:\s*(\d+)\s*\}",
        response,
    )
    if fragment is None:
        return []
    return [_normalized_click(fragment.group(1), fragment.group(2), "Holo2")]
def parse_actio_response(response: str) -> List[Dict]:
    """Extract ``name(x, y)`` style actions from an ActIO response.

    Every ``<identifier>(<int>, <int>)`` occurrence becomes one action whose
    ``type`` is the identifier; coordinates are integer pixels and ``norm`` is
    ``False``.
    """
    call_pattern = r"([a-zA-Z_]+)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)"
    return [
        {
            "type": name,
            "x": int(raw_x),
            "y": int(raw_y),
            "text": "",
            "norm": False,
        }
        for name, raw_x, raw_y in re.findall(call_pattern, response)
    ]
def create_localized_image(original_image: Image.Image, actions: List[Dict]) -> Optional[Image.Image]:
    """Return a copy of ``original_image`` with every action marked on it.

    Each action gets a crosshair and a circle at its integer ``(x, y)`` (red
    when the action type contains "click", blue otherwise) plus a label box
    holding the type and, when present, the action text. With no actions the
    original image object itself is returned.
    """
    if not actions:
        return original_image

    arm_length = 15
    arm_width = 4
    ring_radius = 20

    canvas = original_image.copy()
    pen = ImageDraw.Draw(canvas)
    try:
        label_font = ImageFont.load_default(size=18)
    except Exception:
        # Fall back to the argument-less call when ``size`` is not accepted.
        label_font = ImageFont.load_default()

    for action in actions:
        cx = int(action["x"])
        cy = int(action["y"])
        is_click = "click" in action["type"].lower()
        marker_color = "#ff3333" if is_click else "#3b82f6"

        # Crosshair, then the surrounding ring.
        pen.line((cx - arm_length, cy, cx + arm_length, cy), fill=marker_color, width=arm_width)
        pen.line((cx, cy - arm_length, cx, cy + arm_length), fill=marker_color, width=arm_width)
        pen.ellipse(
            [cx - ring_radius, cy - ring_radius, cx + ring_radius, cy + ring_radius],
            outline=marker_color,
            width=3,
        )

        caption = f"{action['type']}"
        if action.get("text"):
            caption += f': "{action["text"]}"'

        caption_origin = (cx + 25, cy - 15)
        try:
            left, top, right, bottom = pen.textbbox(caption_origin, caption, font=label_font)
            pen.rectangle((left - 4, top - 2, right + 4, bottom + 2), fill="yellow", outline=marker_color)
            pen.text(caption_origin, caption, fill="black", font=label_font)
        except Exception:
            # No measurable text box: draw the bare caption instead.
            pen.text(caption_origin, caption, fill="white", font=label_font)
    return canvas
def calc_timeout_process(*args, **kwargs):
    """Derive the GPU time budget, in whole seconds, from a call's arguments.

    The ``gpu_timeout`` keyword wins; otherwise the last positional argument
    is used. Anything that cannot be converted with ``int()`` yields 60.
    """
    requested = kwargs.get("gpu_timeout", None)
    if requested is None and args:
        requested = args[-1]
    try:
        seconds = int(requested)
    except Exception:
        seconds = 60
    return seconds
@spaces.GPU(duration=calc_timeout_process)
def process_screenshot_stream(model_choice: str, task: str, image: Image.Image, gpu_timeout: int = 60):
    """Run the selected GUI model on a screenshot and yield the result as JSON.

    Args:
        model_choice: Key of ``MODEL_MAP`` selecting the model branch.
        task: Natural-language instruction for the model.
        image: Screenshot to analyse; converted to RGB before use.
        gpu_timeout: Not read in the body; ``calc_timeout_process`` receives the
            call arguments through the decorator's ``duration`` hook.
            NOTE(review): presumably the GPU allocation time in seconds --
            confirm against the ``spaces`` documentation.

    Yields:
        One JSON string with the keys ``status`` ("done" or "error"), ``text``
        (raw model response or an "[ERROR] ..." message) and ``annotated``
        (JPEG data URL of the marked-up screenshot, "" on error).
    """
    try:
        # Input validation: each failure yields one error payload and stops.
        if image is None:
            yield json.dumps({"status": "error", "text": "[ERROR] Please upload an image.", "annotated": ""})
            return
        if not task or not task.strip():
            yield json.dumps({"status": "error", "text": "[ERROR] Please provide a task instruction.", "annotated": ""})
            return
        if len(str(task)) > MAX_INPUT_TEXT_LENGTH * 8:
            yield json.dumps({"status": "error", "text": "[ERROR] Task instruction is too long.", "annotated": ""})
            return
        if model_choice not in MODEL_MAP:
            yield json.dumps({"status": "error", "text": "[ERROR] Invalid model selected.", "annotated": ""})
            return
        input_pil_image = image.convert("RGB")
        orig_w, orig_h = input_pil_image.size
        raw_response = ""
        actions = []
        if model_choice == "Fara-7B":
            if model_v is None:
                yield json.dumps({"status": "error", "text": "[ERROR] Fara model failed to load.", "annotated": ""})
                return
            messages = get_fara_prompt(task, input_pil_image)
            text_prompt = processor_v.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            image_inputs, video_inputs = process_vision_info(messages)
            inputs = processor_v(
                text=[text_prompt],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt"
            ).to(device)
            with torch.no_grad():
                generated_ids = model_v.generate(**inputs, max_new_tokens=512)
            # Drop the prompt tokens so only the newly generated text is decoded.
            generated_ids = trim_generated(generated_ids, inputs)
            raw_response = processor_v.batch_decode(generated_ids, skip_special_tokens=True)[0]
            # Fara coordinates are used as returned; this branch does no rescaling.
            actions = parse_fara_response(raw_response)
        elif model_choice == "Holo2-4B":
            if model_h is None:
                yield json.dumps({"status": "error", "text": "[ERROR] Holo2 model failed to load.", "annotated": ""})
                return
            # Resize the screenshot up front using the processor's own limits.
            ip_params = get_image_proc_params(processor_h)
            resized_h, resized_w = smart_resize(
                input_pil_image.height,
                input_pil_image.width,
                factor=ip_params["patch_size"] * ip_params["merge_size"],
                min_pixels=ip_params["min_pixels"],
                max_pixels=ip_params["max_pixels"]
            )
            proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
            messages = get_holo2_prompt(task, proc_image)
            text_prompt = apply_chat_template_compat(processor_h, messages, thinking=False)
            inputs = processor_h(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                generated_ids = model_h.generate(**inputs, max_new_tokens=128)
            generated_ids = trim_generated(generated_ids, inputs)
            raw_response = processor_h.batch_decode(generated_ids, skip_special_tokens=True)[0]
            actions = parse_holo2_response(raw_response)
            # The Holo2 prompt requests 0-1000 normalized coordinates; map them
            # onto the original screenshot size.
            for a in actions:
                if a.get("norm", False):
                    a["x"] = (a["x"] / 1000.0) * orig_w
                    a["y"] = (a["y"] / 1000.0) * orig_h
        elif model_choice == "UI-TARS-1.5-7B":
            if model_x is None:
                yield json.dumps({"status": "error", "text": "[ERROR] UI-TARS model failed to load.", "annotated": ""})
                return
            ip_params = get_image_proc_params(processor_x)
            resized_h, resized_w = smart_resize(
                input_pil_image.height,
                input_pil_image.width,
                factor=ip_params["patch_size"] * ip_params["merge_size"],
                min_pixels=ip_params["min_pixels"],
                max_pixels=ip_params["max_pixels"]
            )
            proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
            messages = get_localization_prompt(task, proc_image)
            text_prompt = apply_chat_template_compat(processor_x, messages)
            inputs = processor_x(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                generated_ids = model_x.generate(**inputs, max_new_tokens=128)
            generated_ids = trim_generated(generated_ids, inputs)
            raw_response = processor_x.batch_decode(generated_ids, skip_special_tokens=True)[0]
            actions = parse_click_response(raw_response)
            # Parsed coordinates are treated as pixels of the resized image and
            # scaled back to the original screenshot.
            if resized_w > 0 and resized_h > 0:
                scale_x = orig_w / resized_w
                scale_y = orig_h / resized_h
                for a in actions:
                    a["x"] = int(a["x"] * scale_x)
                    a["y"] = int(a["y"] * scale_y)
        elif model_choice == "ActIO-UI-7B":
            if model_act is None:
                yield json.dumps({"status": "error", "text": "[ERROR] ActIO model failed to load.", "annotated": ""})
                return
            messages = get_actio_prompt(task, input_pil_image)
            text_prompt = processor_act.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            inputs = processor_act(
                text=[text_prompt],
                images=[input_pil_image],
                padding=True,
                return_tensors="pt"
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                generated_ids = model_act.generate(
                    **inputs,
                    max_new_tokens=1024,
                    do_sample=False,
                )
            generated_ids = trim_generated(generated_ids, inputs)
            raw_response = processor_act.batch_decode(
                generated_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )[0]
            # ActIO coordinates are used as returned; this branch does no rescaling.
            actions = parse_actio_response(raw_response)
        # Markers are always drawn on the original-size RGB screenshot.
        annotated_image = create_localized_image(input_pil_image, actions)
        annotated_b64 = pil_to_data_url(annotated_image, "JPEG") if annotated_image else pil_to_data_url(input_pil_image, "JPEG")
        yield json.dumps({
            "status": "done",
            "text": raw_response,
            "annotated": annotated_b64
        })
    except Exception as e:
        # Any failure is reported to the caller as an error payload.
        yield json.dumps({"status": "error", "text": f"[ERROR] {str(e)}", "annotated": ""})
    finally:
        # Runs on every exit path: collect garbage and empty the CUDA cache.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
def run_cua(model_name, text, image_b64, gpu_timeout_v):
    """Decode the base64 screenshot and stream the model's JSON results.

    Everything yielded by ``process_screenshot_stream`` is forwarded unchanged.
    An exception raised while decoding or streaming is turned into a single
    JSON error payload.
    """
    try:
        request = {
            "model_choice": model_name,
            "task": text,
            "image": b64_to_pil(image_b64),
            "gpu_timeout": gpu_timeout_v,
        }
        yield from process_screenshot_stream(**request)
    except Exception as exc:
        failure = {"status": "error", "text": f"[ERROR] {str(exc)}", "annotated": ""}
        yield json.dumps(failure)
def noop():
    """Do nothing and return ``None``."""
    return
# Inline markup fragments for the UI.
# NOTE(review): these string bodies are empty or whitespace-only in this view,
# which suggests the SVG/HTML markup was lost -- confirm against the original
# file before editing.
CUBE_SVG = """
"""
UPLOAD_PREVIEW_SVG = f"""
"""
ANNOTATION_PLACEHOLDER_SVG = f"""
"""
COPY_SVG = f""" """
SAVE_SVG = f""" """
# One fragment per selectable model, concatenated in MODEL_CHOICES order.
# NOTE(review): only the model name and a trailing space remain per entry;
# the surrounding tag markup appears to be missing -- confirm.
MODEL_TABS_HTML = "".join([
    f'{m} '
    for m in MODEL_CHOICES
])
css = f"""
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap');
*{{box-sizing:border-box;margin:0;padding:0}}
html,body{{height:100%;overflow-x:hidden}}
body,.gradio-container{{
background:#0f0f13!important;
font-family:'Inter',system-ui,-apple-system,sans-serif!important;
font-size:14px!important;color:#e4e4e7!important;min-height:100vh;overflow-x:hidden;
}}
.dark body,.dark .gradio-container{{background:#0f0f13!important;color:#e4e4e7!important}}
footer{{display:none!important}}
.hidden-input{{display:none!important;height:0!important;overflow:hidden!important;margin:0!important;padding:0!important}}
#gradio-run-btn,#example-load-btn{{
position:absolute!important;left:-9999px!important;top:-9999px!important;
width:1px!important;height:1px!important;opacity:0.01!important;
pointer-events:none!important;overflow:hidden!important;
}}
.app-shell{{
background:#18181b;border:1px solid #27272a;border-radius:16px;
margin:12px auto;max-width:1440px;overflow:hidden;
box-shadow:0 25px 50px -12px rgba(0,0,0,.6),0 0 0 1px rgba(255,255,255,.03);
}}
.app-header{{
background:linear-gradient(135deg,#18181b,#1e1e24);border-bottom:1px solid #27272a;
padding:14px 24px;display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:12px;
}}
.app-header-left{{display:flex;align-items:center;gap:12px}}
.app-logo{{
width:38px;height:38px;background:linear-gradient(135deg,{ACCENT},#fff06a,#fff7b2);
border-radius:10px;display:flex;align-items:center;justify-content:center;
box-shadow:0 4px 12px rgba(255,255,0,.30);
}}
.app-logo svg{{width:22px;height:22px;fill:#111;flex-shrink:0}}
.app-title{{
font-size:18px;font-weight:700;background:linear-gradient(135deg,#f5f5f5,#d9d9a7);
-webkit-background-clip:text;-webkit-text-fill-color:transparent;letter-spacing:-.3px;
}}
.app-badge{{
font-size:11px;font-weight:600;padding:3px 10px;border-radius:20px;
background:rgba(255,255,0,.10);color:#fff8a6;border:1px solid rgba(255,255,0,.24);letter-spacing:.3px;
}}
.app-badge.fast{{background:rgba(255,255,0,.08);color:#fff39a;border:1px solid rgba(255,255,0,.20)}}
.model-tabs-bar{{
background:#18181b;border-bottom:1px solid #27272a;padding:10px 16px;
display:flex;gap:8px;align-items:center;flex-wrap:wrap;
}}
.model-tab{{
display:inline-flex;align-items:center;justify-content:center;gap:6px;
min-width:32px;height:34px;background:transparent;border:1px solid #27272a;
border-radius:999px;cursor:pointer;font-size:12px;font-weight:600;padding:0 12px;
color:#ffffff!important;transition:all .15s ease;
}}
.model-tab:hover{{background:rgba(255,255,0,.10);border-color:rgba(255,255,0,.35)}}
.model-tab.active{{background:rgba(255,255,0,.16);border-color:{ACCENT};color:#fff!important;box-shadow:0 0 0 2px rgba(255,255,0,.08)}}
.model-tab-label{{font-size:12px;color:#ffffff!important;font-weight:600}}
.app-main-row{{display:flex;gap:0;flex:1;overflow:hidden}}
.app-main-left{{flex:1;display:flex;flex-direction:column;min-width:0;border-right:1px solid #27272a}}
.app-main-right{{width:520px;display:flex;flex-direction:column;flex-shrink:0;background:#18181b}}
#image-drop-zone{{
position:relative;background:#09090b;height:460px;min-height:460px;max-height:460px;
overflow:hidden;
}}
#image-drop-zone.drag-over{{outline:2px solid {ACCENT};outline-offset:-2px;background:rgba(255,255,0,.04)}}
.upload-prompt-modern{{
position:absolute;inset:0;display:flex;align-items:center;justify-content:center;
padding:20px;z-index:20;overflow:hidden;
}}
.upload-click-area{{
display:flex;flex-direction:column;align-items:center;justify-content:center;
cursor:pointer;padding:28px 36px;max-width:92%;max-height:92%;
border:2px dashed #3f3f46;border-radius:16px;
background:rgba(255,255,0,.03);transition:all .2s ease;gap:8px;text-align:center;
overflow:hidden;
}}
.upload-click-area:hover{{background:rgba(255,255,0,.08);border-color:{ACCENT};transform:scale(1.02)}}
.upload-click-area:active{{background:rgba(255,255,0,.12);transform:scale(.99)}}
.upload-click-area svg{{width:86px;height:86px;max-width:100%;flex-shrink:0}}
.upload-main-text{{color:#a1a1aa;font-size:14px;font-weight:600;margin-top:4px}}
.upload-sub-text{{color:#71717a;font-size:12px}}
.single-preview-wrap{{
width:100%;height:100%;display:none;align-items:center;justify-content:center;padding:16px;
overflow:hidden;
}}
.single-preview-card{{
width:100%;height:100%;max-width:100%;max-height:100%;border-radius:14px;
overflow:hidden;border:1px solid #27272a;background:#111114;
display:flex;align-items:center;justify-content:center;position:relative;
}}
.single-preview-card img{{
width:100%;height:100%;max-width:100%;max-height:100%;
object-fit:contain;display:block;
}}
.preview-overlay-actions{{
position:absolute;top:12px;right:12px;display:flex;gap:8px;z-index:5;
}}
.preview-action-btn{{
display:inline-flex;align-items:center;justify-content:center;
min-width:34px;height:34px;padding:0 12px;background:rgba(0,0,0,.65);
border:1px solid rgba(255,255,255,.14);border-radius:10px;cursor:pointer;
color:#fff!important;font-size:12px;font-weight:600;transition:all .15s ease;
}}
.preview-action-btn:hover{{background:{ACCENT};border-color:{ACCENT};color:#121200!important}}
.hint-bar{{
background:rgba(255,255,0,.05);border-top:1px solid #27272a;border-bottom:1px solid #27272a;
padding:10px 20px;font-size:13px;color:#a1a1aa;line-height:1.7;
}}
.hint-bar b{{color:#fff6a0;font-weight:600}}
.hint-bar kbd{{
display:inline-block;padding:1px 6px;background:#27272a;border:1px solid #3f3f46;
border-radius:4px;font-family:'JetBrains Mono',monospace;font-size:11px;color:#a1a1aa;
}}
.examples-section{{border-top:1px solid #27272a;padding:12px 16px}}
.examples-title{{
font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;
letter-spacing:.8px;margin-bottom:10px;
}}
.examples-scroll{{display:flex;gap:10px;overflow-x:auto;padding-bottom:8px}}
.examples-scroll::-webkit-scrollbar{{height:6px}}
.examples-scroll::-webkit-scrollbar-track{{background:#09090b;border-radius:3px}}
.examples-scroll::-webkit-scrollbar-thumb{{background:#27272a;border-radius:3px}}
.examples-scroll::-webkit-scrollbar-thumb:hover{{background:#3f3f46}}
.example-card{{
flex-shrink:0;width:220px;background:#09090b;border:1px solid #27272a;
border-radius:10px;overflow:hidden;cursor:pointer;transition:all .2s ease;
}}
.example-card:hover{{border-color:{ACCENT};transform:translateY(-2px);box-shadow:0 4px 12px rgba(255,255,0,.14)}}
.example-card.loading{{opacity:.5;pointer-events:none}}
.example-thumb-wrap{{height:120px;overflow:hidden;background:#18181b}}
.example-thumb-wrap img{{width:100%;height:100%;object-fit:cover}}
.example-thumb-placeholder{{
width:100%;height:100%;display:flex;align-items:center;justify-content:center;
background:#18181b;color:#3f3f46;font-size:11px;
}}
.example-meta-row{{padding:6px 10px;display:flex;align-items:center;gap:6px}}
.example-badge{{
display:inline-flex;padding:2px 7px;background:rgba(255,255,0,.12);border-radius:4px;
font-size:10px;font-weight:600;color:#fff6a0;font-family:'JetBrains Mono',monospace;white-space:nowrap;
}}
.example-prompt-text{{
padding:0 10px 8px;font-size:11px;color:#a1a1aa;line-height:1.4;
display:-webkit-box;-webkit-line-clamp:2;-webkit-box-orient:vertical;overflow:hidden;
}}
.panel-card{{border-bottom:1px solid #27272a}}
.panel-card-title{{
padding:12px 20px;font-size:12px;font-weight:600;color:#71717a;
text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);
}}
.panel-card-body{{padding:16px 20px;display:flex;flex-direction:column;gap:8px}}
.modern-label{{font-size:13px;font-weight:500;color:#a1a1aa;margin-bottom:4px;display:block}}
.modern-textarea{{
width:100%;background:#09090b;border:1px solid #27272a;border-radius:8px;
padding:10px 14px;font-family:'Inter',sans-serif;font-size:14px;color:#e4e4e7;
resize:none;outline:none;min-height:100px;transition:border-color .2s;
}}
.modern-textarea:focus{{border-color:{ACCENT};box-shadow:0 0 0 3px rgba(255,255,0,.14)}}
.modern-textarea::placeholder{{color:#3f3f46}}
.modern-textarea.error-flash{{
border-color:#ef4444!important;box-shadow:0 0 0 3px rgba(239,68,68,.2)!important;animation:shake .4s ease;
}}
@keyframes shake{{0%,100%{{transform:translateX(0)}}20%,60%{{transform:translateX(-4px)}}40%,80%{{transform:translateX(4px)}}}}
.toast-notification{{
position:fixed;top:24px;left:50%;transform:translateX(-50%) translateY(-120%);
z-index:9999;padding:10px 24px;border-radius:10px;font-family:'Inter',sans-serif;
font-size:14px;font-weight:600;display:flex;align-items:center;gap:8px;
box-shadow:0 8px 24px rgba(0,0,0,.5);
transition:transform .35s cubic-bezier(.34,1.56,.64,1),opacity .35s ease;opacity:0;pointer-events:none;
}}
.toast-notification.visible{{transform:translateX(-50%) translateY(0);opacity:1;pointer-events:auto}}
.toast-notification.error{{background:linear-gradient(135deg,#dc2626,#b91c1c);color:#fff;border:1px solid rgba(255,255,255,.15)}}
.toast-notification.warning{{background:linear-gradient(135deg,#b7b700,#8f8f00);color:#fff;border:1px solid rgba(255,255,255,.15)}}
.toast-notification.info{{background:linear-gradient(135deg,#d4d400,{ACCENT});color:#111;border:1px solid rgba(255,255,255,.15)}}
.toast-notification .toast-icon{{font-size:16px;line-height:1}}
.toast-notification .toast-text{{line-height:1.3}}
.btn-run{{
display:flex;align-items:center;justify-content:center;gap:8px;width:100%;
background:linear-gradient(135deg,{ACCENT},#d8d800);border:none;border-radius:10px;
padding:12px 24px;cursor:pointer;font-size:15px;font-weight:700;font-family:'Inter',sans-serif;
color:#ffffff!important;-webkit-text-fill-color:#ffffff!important;
transition:all .2s ease;letter-spacing:-.2px;
box-shadow:0 4px 16px rgba(255,255,0,.25),inset 0 1px 0 rgba(255,255,255,.18);
}}
.btn-run:hover{{
background:linear-gradient(135deg,#ffff7a,{ACCENT});transform:translateY(-1px);
box-shadow:0 6px 24px rgba(255,255,0,.35),inset 0 1px 0 rgba(255,255,255,.22);
}}
.btn-run:active{{transform:translateY(0);box-shadow:0 2px 8px rgba(255,255,0,.25)}}
.annot-frame{{border-bottom:1px solid #27272a;display:flex;flex-direction:column;position:relative}}
.annot-title{{
padding:10px 20px;font-size:13px;font-weight:700;text-transform:uppercase;
letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);color:#fff
}}
.annot-body{{
background:#09090b;height:340px;display:flex;align-items:center;justify-content:center;
padding:12px;position:relative;overflow:hidden;
}}
.annot-body img{{
max-width:100%;max-height:100%;object-fit:contain;border:1px solid #27272a;
border-radius:10px;background:#111114;display:none;position:relative;z-index:2;
}}
.annot-placeholder{{
position:absolute;inset:0;display:flex;flex-direction:column;align-items:center;justify-content:center;
gap:10px;color:#666;z-index:1;padding:16px;text-align:center;
}}
.annot-placeholder svg{{width:92px;height:92px;max-width:100%;opacity:.95}}
.annot-placeholder-title{{font-size:13px;font-weight:600;color:#fff6a0}}
.annot-placeholder-sub{{font-size:12px;color:#666;max-width:260px;line-height:1.5}}
.output-frame{{border-bottom:1px solid #27272a;display:flex;flex-direction:column;position:relative}}
.output-frame .out-title,
.output-frame .out-title *,
#output-title-label{{
color:#ffffff!important;
-webkit-text-fill-color:#ffffff!important;
}}
.output-frame .out-title{{
padding:10px 20px;font-size:13px;font-weight:700;
text-transform:uppercase;letter-spacing:.8px;border-bottom:1px solid rgba(39,39,42,.6);
display:flex;align-items:center;justify-content:space-between;gap:8px;flex-wrap:wrap;
}}
.out-title-right{{display:flex;gap:8px;align-items:center}}
.out-action-btn{{
display:inline-flex;align-items:center;justify-content:center;background:rgba(255,255,0,.10);
border:1px solid rgba(255,255,0,.2);border-radius:6px;cursor:pointer;padding:3px 10px;
font-size:11px;font-weight:500;color:#fff6a0!important;gap:4px;height:24px;transition:all .15s;
}}
.out-action-btn:hover{{background:rgba(255,255,0,.2);border-color:rgba(255,255,0,.35);color:#ffffff!important}}
.out-action-btn svg{{width:12px;height:12px;fill:{ACCENT}}}
.output-frame .out-body{{
flex:1;background:#09090b;display:flex;align-items:stretch;justify-content:stretch;
overflow:hidden;min-height:300px;position:relative;
}}
.output-scroll-wrap{{width:100%;height:100%;padding:0;overflow:hidden}}
.output-textarea{{
width:100%;height:300px;min-height:300px;max-height:300px;background:#09090b;color:#e4e4e7;
border:none;outline:none;padding:16px 18px;font-size:13px;line-height:1.6;
font-family:'JetBrains Mono',monospace;overflow:auto;resize:none;white-space:pre-wrap;
}}
.output-textarea::placeholder{{color:#52525b}}
.output-textarea.error-flash{{box-shadow:inset 0 0 0 2px rgba(239,68,68,.6)}}
.modern-loader{{
display:none;position:absolute;top:0;left:0;right:0;bottom:0;background:rgba(9,9,11,.92);
z-index:15;flex-direction:column;align-items:center;justify-content:center;gap:16px;backdrop-filter:blur(4px);
}}
.modern-loader.active{{display:flex}}
.modern-loader .loader-spinner{{
width:36px;height:36px;border:3px solid #27272a;border-top-color:{ACCENT};
border-radius:50%;animation:spin .8s linear infinite;
}}
@keyframes spin{{to{{transform:rotate(360deg)}}}}
.modern-loader .loader-text{{font-size:13px;color:#a1a1aa;font-weight:500}}
.loader-bar-track{{width:200px;height:4px;background:#27272a;border-radius:2px;overflow:hidden}}
.loader-bar-fill{{
height:100%;background:linear-gradient(90deg,{ACCENT},#ffff94,{ACCENT});
background-size:200% 100%;animation:shimmer 1.5s ease-in-out infinite;border-radius:2px;
}}
@keyframes shimmer{{0%{{background-position:200% 0}}100%{{background-position:-200% 0}}}}
.settings-group{{border:1px solid #27272a;border-radius:10px;margin:12px 16px;padding:0;overflow:hidden}}
.settings-group-title{{
font-size:12px;font-weight:600;color:#71717a;text-transform:uppercase;letter-spacing:.8px;
padding:10px 16px;border-bottom:1px solid #27272a;background:rgba(24,24,27,.5);
}}
.settings-group-body{{padding:14px 16px;display:flex;flex-direction:column;gap:12px}}
.slider-row{{display:flex;align-items:center;gap:10px;min-height:28px}}
.slider-row label{{font-size:13px;font-weight:500;color:#a1a1aa;min-width:118px;flex-shrink:0}}
.slider-row input[type="range"]{{
flex:1;-webkit-appearance:none;appearance:none;height:6px;background:#27272a;
border-radius:3px;outline:none;min-width:0;
}}
.slider-row input[type="range"]::-webkit-slider-thumb{{
-webkit-appearance:none;width:16px;height:16px;background:linear-gradient(135deg,{ACCENT},#d8d800);
border-radius:50%;cursor:pointer;box-shadow:0 2px 6px rgba(255,255,0,.35);transition:transform .15s;
}}
.slider-row input[type="range"]::-webkit-slider-thumb:hover{{transform:scale(1.2)}}
.slider-row input[type="range"]::-moz-range-thumb{{
width:16px;height:16px;background:linear-gradient(135deg,{ACCENT},#d8d800);
border-radius:50%;cursor:pointer;border:none;box-shadow:0 2px 6px rgba(255,255,0,.35);
}}
.slider-row .slider-val{{
min-width:58px;text-align:right;font-family:'JetBrains Mono',monospace;font-size:12px;
font-weight:500;padding:3px 8px;background:#09090b;border:1px solid #27272a;
border-radius:6px;color:#a1a1aa;flex-shrink:0;
}}
.app-statusbar{{
background:#18181b;border-top:1px solid #27272a;padding:6px 20px;
display:flex;gap:12px;height:34px;align-items:center;font-size:12px;
}}
.app-statusbar .sb-section{{
padding:0 12px;flex:1;display:flex;align-items:center;font-family:'JetBrains Mono',monospace;
font-size:12px;color:#52525b;overflow:hidden;white-space:nowrap;
}}
.app-statusbar .sb-section.sb-fixed{{
flex:0 0 auto;min-width:110px;text-align:center;justify-content:center;
padding:3px 12px;background:rgba(255,255,0,.08);border-radius:6px;color:#fff6a0;font-weight:500;
}}
.exp-note{{padding:10px 20px;font-size:12px;color:#52525b;border-top:1px solid #27272a;text-align:center}}
.exp-note a{{color:#fff6a0;text-decoration:none}}
.exp-note a:hover{{text-decoration:underline}}
::-webkit-scrollbar{{width:8px;height:8px}}
::-webkit-scrollbar-track{{background:#09090b}}
::-webkit-scrollbar-thumb{{background:#27272a;border-radius:4px}}
::-webkit-scrollbar-thumb:hover{{background:#3f3f46}}
@media(max-width:980px){{
.app-main-row{{flex-direction:column}}
.app-main-right{{width:100%}}
.app-main-left{{border-right:none;border-bottom:1px solid #27272a}}
}}
"""
gallery_js = r"""
() => {
function init() {
if (window.__cuaInitDone) return;
const dropZone = document.getElementById('image-drop-zone');
const uploadPrompt = document.getElementById('upload-prompt');
const uploadClick = document.getElementById('upload-click-area');
const fileInput = document.getElementById('custom-file-input');
const previewWrap = document.getElementById('single-preview-wrap');
const previewImg = document.getElementById('single-preview-img');
const btnUpload = document.getElementById('preview-upload-btn');
const btnClear = document.getElementById('preview-clear-btn');
const promptInput = document.getElementById('custom-query-input');
const runBtnEl = document.getElementById('custom-run-btn');
const outputArea = document.getElementById('custom-output-textarea');
const annotImg = document.getElementById('annotated-output-img');
const annotPlaceholder = document.getElementById('annotated-output-placeholder');
const imgStatus = document.getElementById('sb-image-status');
if (!dropZone || !fileInput || !promptInput || !previewWrap || !previewImg) {
setTimeout(init, 250);
return;
}
window.__cuaInitDone = true;
let imageState = null;
let toastTimer = null;
let examplePoller = null;
let lastSeenExamplePayload = null;
// Show a transient toast notification.
//   message: text to display.
//   type:    'error' (default), 'warning' or 'info'; selects the CSS modifier and icon glyph.
// The toast element is created lazily and auto-hides after 3.5 s.
function showToast(message, type) {
  let toast = document.getElementById('app-toast');
  if (!toast) {
    toast = document.createElement('div');
    toast.id = 'app-toast';
    toast.className = 'toast-notification';
    document.body.appendChild(toast);
  }
  let icon = toast.querySelector('.toast-icon');
  let text = toast.querySelector('.toast-text');
  if (!icon || !text) {
    // Build the icon/text children explicitly. Without them the lookups above
    // return null and the textContent assignments below would throw.
    toast.textContent = '';
    icon = document.createElement('span');
    icon.className = 'toast-icon';
    text = document.createElement('span');
    text.className = 'toast-text';
    toast.appendChild(icon);
    toast.appendChild(text);
  }
  toast.className = 'toast-notification ' + (type || 'error');
  if (type === 'warning') icon.textContent = '\u26A0';
  else if (type === 'info') icon.textContent = '\u2139';
  else icon.textContent = '\u2717';
  text.textContent = message;
  if (toastTimer) clearTimeout(toastTimer);
  // Force a reflow between resetting the class list and adding `visible`.
  void toast.offsetWidth;
  toast.classList.add('visible');
  toastTimer = setTimeout(() => toast.classList.remove('visible'), 3500);
}
// Toggle the output loader overlay and write `statusLabel` into the
// status-bar run-state cell. Either element may be absent.
function setLoaderState(isActive, statusLabel) {
  const loader = document.getElementById('output-loader');
  if (loader) loader.classList.toggle('active', isActive);
  const runState = document.getElementById('sb-run-state');
  if (runState) runState.textContent = statusLabel;
}
// Show the loader and mark the run as in progress.
function showLoader() {
  setLoaderState(true, 'Processing...');
}
// Hide the loader and mark the run as finished.
function hideLoader() {
  setLoaderState(false, 'Done');
}
// Hide the loader and mark the run as failed.
function setRunErrorState() {
  setLoaderState(false, 'Error');
}
// Briefly apply the `error-flash` class to the prompt input and focus it.
function flashPromptError() {
  const classes = promptInput.classList;
  classes.add('error-flash');
  promptInput.focus();
  setTimeout(() => classes.remove('error-flash'), 800);
}
// Briefly apply the `error-flash` class to the output textarea, if it exists.
function flashOutputError() {
  if (outputArea) {
    const classes = outputArea.classList;
    classes.add('error-flash');
    setTimeout(() => classes.remove('error-flash'), 800);
  }
}
// Return the value of the first textarea/input inside the element with id
// `containerId`, or '' when the container, the field or the value is missing.
function getValueFromContainer(containerId) {
  const field = document.getElementById(containerId)?.querySelector('textarea, input');
  return (field && field.value) || '';
}
// Write `value` into the first textarea/input inside `containerId` using the
// prototype's native `value` setter, then dispatch bubbling `input` and
// `change` events. Returns true when the write happened, false otherwise.
function setGradioValue(containerId, value) {
  const field = document.getElementById(containerId)?.querySelector('textarea, input');
  if (!field) return false;
  const proto = field.tagName === 'TEXTAREA'
    ? HTMLTextAreaElement.prototype
    : HTMLInputElement.prototype;
  const descriptor = Object.getOwnPropertyDescriptor(proto, 'value');
  if (!descriptor || !descriptor.set) return false;
  descriptor.set.call(field, value);
  for (const eventName of ['input', 'change']) {
    field.dispatchEvent(new Event(eventName, {bubbles:true, composed:true}));
  }
  return true;
}
// Mirror the current image (data URL or '') into the `hidden-image-b64`
// field and refresh the status-bar image label.
function syncImageToGradio() {
  const hasImage = Boolean(imageState);
  setGradioValue('hidden-image-b64', hasImage ? imageState.b64 : '');
  if (imgStatus) {
    imgStatus.textContent = hasImage ? '1 image uploaded' : 'No image uploaded';
  }
}
// Mirror the custom prompt box into the `prompt-gradio-input` field.
function syncPromptToGradio() {
  const promptText = promptInput.value;
  setGradioValue('prompt-gradio-input', promptText);
}
// Mirror the selected model name into the `hidden-model-name` field.
function syncModelToGradio(name) {
  const modelName = name;
  setGradioValue('hidden-model-name', modelName);
}
// Show the annotated image when `src` is truthy; otherwise clear it and show
// the placeholder. Does nothing if either element is missing.
function updateAnnotationState(src) {
  if (!annotImg || !annotPlaceholder) return;
  const hasImage = Boolean(src);
  annotImg.src = hasImage ? src : '';
  annotImg.style.display = hasImage ? 'block' : 'none';
  annotPlaceholder.style.display = hasImage ? 'none' : 'flex';
}
// Record `b64` as the current image (named `name`, default 'image'), show it
// in the preview pane, hide the upload prompt and sync the hidden field.
function setPreview(b64, name) {
  const label = name || 'image';
  imageState = {b64: b64, name: label};
  previewImg.src = b64;
  previewWrap.style.display = 'flex';
  if (uploadPrompt) {
    uploadPrompt.style.display = 'none';
  }
  syncImageToGradio();
}
// Drop the current image: hide the preview, restore the upload prompt, sync
// the (now empty) hidden field and reset the annotation pane.
function clearPreview() {
  imageState = null;
  previewWrap.style.display = 'none';
  previewImg.src = '';
  if (uploadPrompt) {
    uploadPrompt.style.display = 'flex';
  }
  syncImageToGradio();
  updateAnnotationState('');
}
// Expose selected helpers on `window` so code outside this closure can call
// them (the output-wiring script looks several of them up by these names).
Object.assign(window, {
  __setPreview: setPreview,
  __clearPreview: clearPreview,
  __updateAnnotationState: updateAnnotationState,
  __showToast: showToast,
  __showLoader: showLoader,
  __hideLoader: hideLoader,
  __setRunErrorState: setRunErrorState,
});
// Read an image File as a data URL and install it as the preview.
// Ignores a missing file; rejects non-image MIME types with an error toast.
function processFile(file) {
  if (!file) return;
  const isImage = file.type.startsWith('image/');
  if (!isImage) {
    showToast('Only image files are supported', 'error');
    return;
  }
  const reader = new FileReader();
  reader.onload = (loadEvent) => {
    setPreview(loadEvent.target.result, file.name);
  };
  reader.readAsDataURL(file);
}
// File picker: process the first selected file, then reset the input
// (presumably so picking the same file again still fires `change`).
fileInput.addEventListener('change', (e) => {
  const picked = (e.target.files && e.target.files[0]) || null;
  if (picked) processFile(picked);
  e.target.value = '';
});
// Both the empty-state click area and the preview "upload" button open the picker.
const openFilePicker = () => fileInput.click();
if (uploadClick) uploadClick.addEventListener('click', openFilePicker);
if (btnUpload) btnUpload.addEventListener('click', openFilePicker);
if (btnClear) btnClear.addEventListener('click', clearPreview);
// Drag-and-drop: highlight while dragging over, process the first dropped file.
dropZone.addEventListener('dragover', (e) => {
  e.preventDefault();
  dropZone.classList.add('drag-over');
});
dropZone.addEventListener('dragleave', (e) => {
  e.preventDefault();
  dropZone.classList.remove('drag-over');
});
dropZone.addEventListener('drop', (e) => {
  e.preventDefault();
  dropZone.classList.remove('drag-over');
  const dropped = e.dataTransfer.files;
  if (dropped && dropped.length) {
    processFile(dropped[0]);
  }
});
// Keep the hidden Gradio prompt field in step with the custom prompt box.
promptInput.addEventListener('input', syncPromptToGradio);
// Mark the tab whose `data-model` equals `name` as active (and all others
// inactive), then push the name into the hidden model field.
function activateModelTab(name) {
  for (const tab of document.querySelectorAll('.model-tab[data-model]')) {
    const isSelected = tab.getAttribute('data-model') === name;
    tab.classList.toggle('active', isSelected);
  }
  syncModelToGradio(name);
}
window.__activateModelTab = activateModelTab;
// Clicking a tab selects the model named in its `data-model` attribute.
for (const tab of document.querySelectorAll('.model-tab[data-model]')) {
  tab.addEventListener('click', () => {
    activateModelTab(tab.getAttribute('data-model'));
  });
}
// Initial state: Fara-7B selected, annotation pane showing its placeholder.
activateModelTab('Fara-7B');
updateAnnotationState('');
// Bind the custom range input `customId` to the Gradio component `gradioId`.
// On every `input` event, the optional `<customId>-val` readout is updated and
// the value is written into each range/number input inside the Gradio
// container via the native setter, followed by `input` and `change` events.
function syncSlider(customId, gradioId) {
  const slider = document.getElementById(customId);
  if (!slider) return;
  const readout = document.getElementById(customId + '-val');
  slider.addEventListener('input', () => {
    if (readout) readout.textContent = slider.value;
    const target = document.getElementById(gradioId);
    if (!target) return;
    const descriptor = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, 'value');
    if (!descriptor || !descriptor.set) return;
    const fields = target.querySelectorAll('input[type="range"],input[type="number"]');
    for (const field of fields) {
      descriptor.set.call(field, slider.value);
      field.dispatchEvent(new Event('input', {bubbles:true, composed:true}));
      field.dispatchEvent(new Event('change', {bubbles:true, composed:true}));
    }
  });
}
syncSlider('custom-gpu-duration', 'gradio-gpu-duration');
// Check that an image, a non-blank instruction and an active model tab are
// all present. Shows a toast (and flashes the prompt when the instruction is
// missing) for the first failing check. Returns true only when all pass.
function validateBeforeRun() {
  const hasImage = Boolean(imageState);
  const hasPrompt = promptInput.value.trim() !== '';
  if (!hasImage) {
    if (hasPrompt) {
      showToast('Please upload an image', 'error');
    } else {
      showToast('Please upload an image and enter your task instruction', 'error');
      flashPromptError();
    }
    return false;
  }
  if (!hasPrompt) {
    showToast('Please enter your task instruction', 'warning');
    flashPromptError();
    return false;
  }
  const activeModel = document.querySelector('.model-tab.active')?.dataset?.model;
  if (!activeModel) {
    showToast('Please select a model', 'error');
    return false;
  }
  return true;
}
// Entry point for the custom "run" button: validate, sync all inputs into the
// hidden Gradio fields, reset the output panes, show the loader, and click
// the real Gradio run button after a 180 ms delay.
window.__clickGradioRunBtn = function() {
  if (!validateBeforeRun()) return;
  syncPromptToGradio();
  syncImageToGradio();
  const activeTab = document.querySelector('.model-tab.active');
  if (activeTab) syncModelToGradio(activeTab.getAttribute('data-model'));
  if (outputArea) outputArea.value = '';
  updateAnnotationState('');
  showLoader();
  setTimeout(() => {
    const wrapper = document.getElementById('gradio-run-btn');
    if (wrapper) {
      // Prefer the inner <button>; fall back to the wrapper element itself.
      (wrapper.querySelector('button') || wrapper).click();
      return;
    }
    setRunErrorState();
    if (outputArea) outputArea.value = '[ERROR] Run button not found.';
    showToast('Run button not found', 'error');
  }, 180);
};
if (runBtnEl) {
  runBtnEl.addEventListener('click', () => window.__clickGradioRunBtn());
}
// "Copy" action: put the output text on the clipboard, warning when empty.
const copyBtn = document.getElementById('copy-output-btn');
const onCopyClick = async () => {
  try {
    const content = outputArea ? outputArea.value : '';
    if (content.trim()) {
      await navigator.clipboard.writeText(content);
      showToast('Output copied to clipboard', 'info');
    } else {
      showToast('No output to copy', 'warning');
      flashOutputError();
    }
  } catch(e) {
    showToast('Copy failed', 'error');
  }
};
if (copyBtn) copyBtn.addEventListener('click', onCopyClick);
// "Save File" action: download the output text as a .txt file via a
// temporary object URL and anchor element, warning when empty.
const saveBtn = document.getElementById('save-output-btn');
const onSaveClick = () => {
  const content = outputArea ? outputArea.value : '';
  if (!content.trim()) {
    showToast('No output to save', 'warning');
    flashOutputError();
    return;
  }
  const link = document.createElement('a');
  link.href = URL.createObjectURL(new Blob([content], {type: 'text/plain;charset=utf-8'}));
  link.download = 'cua_gui_operator_output.txt';
  document.body.appendChild(link);
  link.click();
  // Release the object URL and remove the temporary anchor shortly after the click.
  setTimeout(() => {
    URL.revokeObjectURL(link.href);
    document.body.removeChild(link);
  }, 200);
  showToast('Output saved', 'info');
};
if (saveBtn) saveBtn.addEventListener('click', onSaveClick);
// Apply an example payload (a JSON string) to the UI.
//   {status:'ok', image?, name?, query?, model?} -> install the image, prompt and model.
//   {status:'error', message?}                   -> show the message as an error toast.
// Any other payload (malformed JSON, non-object, unknown status) is reported
// as a generic failure. The `loading` marker on example cards is always
// cleared, so a card can never stay stuck in the loading state.
function applyExamplePayload(raw) {
  let toastMessage = 'Failed to load example';
  let toastType = 'error';
  try {
    const data = JSON.parse(raw);
    if (data && data.status === 'ok') {
      if (data.image) setPreview(data.image, data.name || 'example.png');
      if (data.query) {
        promptInput.value = data.query;
        syncPromptToGradio();
      }
      if (data.model) activateModelTab(data.model);
      toastMessage = 'Example loaded';
      toastType = 'info';
    } else if (data && data.status === 'error' && data.message) {
      toastMessage = data.message;
    }
  } catch (e) {
    // Malformed payload or a failure while applying it: fall through with the
    // generic error toast instead of failing silently.
  }
  document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
  showToast(toastMessage, toastType);
}
// Poll the `example-result-data` field every 120 ms for a payload that has
// not been applied yet. Applies the first new payload and stops; gives up
// after 100 ticks with a timeout toast. Restarting cancels any earlier poll.
function startExamplePolling() {
  if (examplePoller) clearInterval(examplePoller);
  const maxTicks = 100;
  let ticks = 0;
  const stopPolling = () => {
    clearInterval(examplePoller);
    examplePoller = null;
  };
  examplePoller = setInterval(() => {
    ticks += 1;
    const payload = getValueFromContainer('example-result-data');
    const isNew = Boolean(payload) && payload !== lastSeenExamplePayload;
    if (isNew) {
      lastSeenExamplePayload = payload;
      stopPolling();
      applyExamplePayload(payload);
      return;
    }
    if (ticks < maxTicks) return;
    stopPolling();
    for (const card of document.querySelectorAll('.example-card.loading')) {
      card.classList.remove('loading');
    }
    showToast('Example load timed out', 'error');
  }, 120);
}
// Ask the backend to load example `idx`: write the index into the hidden
// `example-idx-input` field, clear the previous result, click the hidden
// "load example" button and start polling for the response. The write is
// retried every 100 ms (up to 30 attempts) until the index is seen to stick.
function triggerExampleLoad(idx) {
  const clearLoadingCards = () => {
    document.querySelectorAll('.example-card.loading').forEach(c => c.classList.remove('loading'));
  };
  const btnWrap = document.getElementById('example-load-btn');
  const btn = btnWrap ? (btnWrap.querySelector('button') || btnWrap) : null;
  if (!btn) {
    // Without the trigger button nothing will ever arrive; report it rather
    // than leaving the clicked card in its loading state.
    clearLoadingCards();
    showToast('Failed to initialize example loader', 'error');
    return;
  }
  const wanted = String(idx);
  let attempts = 0;
  function writeIdxAndClick() {
    attempts += 1;
    const idxWritten = setGradioValue('example-idx-input', wanted);
    const resultCleared = setGradioValue('example-result-data', '');
    if (idxWritten && getValueFromContainer('example-idx-input') === wanted) {
      // Forget the previously applied payload once the result field is really
      // empty. Otherwise re-loading the same example returns an identical
      // payload that is treated as already seen, and polling times out.
      if (resultCleared && getValueFromContainer('example-result-data') === '') {
        lastSeenExamplePayload = null;
      }
      btn.click();
      startExamplePolling();
      return;
    }
    if (attempts < 30) {
      setTimeout(writeIdxAndClick, 100);
    } else {
      clearLoadingCards();
      showToast('Failed to initialize example loader', 'error');
    }
  }
  writeIdxAndClick();
}
// Clicking an example card marks it (and only it) as loading and requests
// the example identified by its `data-idx` attribute.
for (const card of document.querySelectorAll('.example-card[data-idx]')) {
  card.addEventListener('click', () => {
    const exampleIdx = card.getAttribute('data-idx');
    if (!exampleIdx) return;
    for (const busy of document.querySelectorAll('.example-card.loading')) {
      busy.classList.remove('loading');
    }
    card.classList.add('loading');
    showToast('Loading example...', 'info');
    triggerExampleLoad(exampleIdx);
  });
}
// Also watch the example-result container for DOM mutations. When a payload
// that has not been applied yet shows up, cancel any running poll and apply it.
const observerTarget = document.getElementById('example-result-data');
if (observerTarget) {
  const onResultMutation = () => {
    const payload = getValueFromContainer('example-result-data');
    const alreadyHandled = !payload || payload === lastSeenExamplePayload;
    if (alreadyHandled) return;
    lastSeenExamplePayload = payload;
    if (examplePoller) {
      clearInterval(examplePoller);
      examplePoller = null;
    }
    applyExamplePayload(payload);
  };
  const obs = new MutationObserver(onResultMutation);
  obs.observe(observerTarget, {
    childList: true,
    subtree: true,
    characterData: true,
    attributes: true,
  });
}
// Initial UI state: empty output, "Ready" run state, no image.
if (outputArea) {
  outputArea.value = '';
}
const runStateLabel = document.getElementById('sb-run-state');
if (runStateLabel) {
  runStateLabel.textContent = 'Ready';
}
if (imgStatus) {
  imgStatus.textContent = 'No image uploaded';
}
}
init();
}
"""
wire_outputs_js = r"""
() => {
// Mirror the hidden `gradio-result` field into the custom output textarea.
// The field is expected to hold a JSON object {text?, annotated?, status?}:
//   text      -> shown in the output textarea (scrolled to the end)
//   annotated -> passed to window.__updateAnnotationState
//   status    -> 'error' marks the run failed, 'done' hides the loader
// Anything that is not a JSON object (invalid JSON, or a JSON scalar/array/
// null) is shown verbatim. Retries every 500 ms until both elements exist,
// then reacts to DOM mutations and also polls every 500 ms.
function watchOutputs() {
  const resultContainer = document.getElementById('gradio-result');
  const outArea = document.getElementById('custom-output-textarea');
  if (!resultContainer || !outArea) { setTimeout(watchOutputs, 500); return; }
  let lastText = '';
  // Replace the visible output and keep the newest text in view.
  function showText(text) {
    outArea.value = text;
    outArea.scrollTop = outArea.scrollHeight;
  }
  function syncOutput() {
    const el = resultContainer.querySelector('textarea') || resultContainer.querySelector('input');
    if (!el) return;
    const val = el.value || '';
    if (val === lastText) return;
    lastText = val;
    let data;
    try {
      // Only the parse is guarded, so a failing UI hook below cannot cause
      // the raw JSON to overwrite text that was already rendered.
      data = JSON.parse(val);
    } catch (e) {
      showText(val);
      return;
    }
    if (data === null || typeof data !== 'object' || Array.isArray(data)) {
      // Valid JSON that is not an object (e.g. 42, "ok", true, []) has no
      // fields to read; show it as plain text instead of dropping it.
      showText(val);
      return;
    }
    if (data.text !== undefined) {
      showText(data.text || '');
    }
    if (data.annotated && window.__updateAnnotationState) {
      window.__updateAnnotationState(data.annotated);
    }
    if (data.status === 'error') {
      if (window.__setRunErrorState) window.__setRunErrorState();
      if (window.__showToast) window.__showToast('Inference failed', 'error');
    } else if (data.status === 'done') {
      if (window.__hideLoader) window.__hideLoader();
    }
  }
  const observer = new MutationObserver(syncOutput);
  observer.observe(resultContainer, {childList:true, subtree:true, characterData:true, attributes:true});
  setInterval(syncOutput, 500);
}
watchOutputs();
}
"""
with gr.Blocks() as demo:
    # Bridge components: the page scripts read and write these by elem_id.
    # NOTE(review): presumably hidden by a `.hidden-input` CSS rule — confirm.
    bridge_kwargs = {"elem_classes": "hidden-input", "container": False}
    hidden_image_b64 = gr.Textbox(value="", elem_id="hidden-image-b64", **bridge_kwargs)
    prompt = gr.Textbox(value="", elem_id="prompt-gradio-input", **bridge_kwargs)
    hidden_model_name = gr.Textbox(value="Fara-7B", elem_id="hidden-model-name", **bridge_kwargs)
    gpu_duration_state = gr.Number(value=60, elem_id="gradio-gpu-duration", **bridge_kwargs)
    result = gr.Textbox(value="", elem_id="gradio-result", **bridge_kwargs)
    example_idx = gr.Textbox(value="", elem_id="example-idx-input", **bridge_kwargs)
    example_result = gr.Textbox(value="", elem_id="example-result-data", **bridge_kwargs)
    example_load_btn = gr.Button("Load Example", elem_id="example-load-btn")

    # Custom page markup, rendered between the example-load and run buttons.
    page_markup = f"""
{MODEL_TABS_HTML}
Upload: Click or drag to add a UI image ·
Model: Switch model tabs from the header ·
Clear removes the current image
Quick Examples
{EXAMPLE_CARDS_HTML}
Task Instruction
Instruction Input
Call CUA Agent
Visualized Action Points
{ANNOTATION_PLACEHOLDER_SVG}
Annotated UI preview will appear here
Detected click points and grounded actions will be drawn on the uploaded screenshot after inference.
Agent Model Response
{COPY_SVG} Copy
{SAVE_SVG} Save File
Experimental GUI Operator Suite · Fara-7B, UI-TARS-1.5-7B, Holo2-4B, ActIO-UI-7B
"""
    gr.HTML(page_markup)
    run_btn = gr.Button("Run", elem_id="gradio-run-btn")

    # Install the two page scripts on load, in the same order as before.
    for startup_js in (gallery_js, wire_outputs_js):
        demo.load(fn=noop, inputs=None, outputs=None, js=startup_js)

    # Client-side pre-processor for the run event: takes model, prompt and
    # image from the live DOM when available, else the component values.
    collect_inputs_js = r"""(m, p, img, gd) => {
const modelEl = document.querySelector('.model-tab.active');
const model = modelEl ? modelEl.getAttribute('data-model') : m;
const promptEl = document.getElementById('custom-query-input');
const promptVal = promptEl ? promptEl.value : p;
const imgContainer = document.getElementById('hidden-image-b64');
let imgVal = img;
if (imgContainer) {
const inner = imgContainer.querySelector('textarea, input');
if (inner) imgVal = inner.value;
}
return [model, promptVal, imgVal, gd];
}"""
    run_inputs = [
        hidden_model_name,
        prompt,
        hidden_image_b64,
        gpu_duration_state,
    ]
    run_btn.click(fn=run_cua, inputs=run_inputs, outputs=[result], js=collect_inputs_js)

    # Example loading bypasses the queue.
    example_load_btn.click(
        fn=load_example_data,
        inputs=[example_idx],
        outputs=[example_result],
        queue=False,
    )
if __name__ == "__main__":
    # Enable the request queue (at most 50 pending jobs), then start the app.
    launch_options = {
        "css": css,
        "mcp_server": True,
        "ssr_mode": False,
        "show_error": True,
        "allowed_paths": ["examples"],
    }
    queued_demo = demo.queue(max_size=50)
    queued_demo.launch(**launch_options)