Spaces:
Paused
Paused
| import os | |
| os.environ["TOKENIZERS_PARALLELISM"] = "false" | |
| import sys | |
| import shutil | |
| import uuid | |
| import random | |
| import datetime | |
| import gradio as gr | |
| import spaces | |
| import torch | |
| from PIL import Image | |
| from huggingface_hub import hf_hub_download | |
| from transformers import AutoModel, AutoTokenizer, Qwen2VLProcessor, AutoModelForCausalLM | |
| from diffusers import FlowMatchEulerDiscreteScheduler, AutoencoderKLQwenImage | |
# ==========================================
# 1. Global Configuration & Styles
# ==========================================

# ================= 1.1 Setup Qwen-Image-MICo Configuration =================
# Hub repo hosting the fine-tuned weights AND the custom pipeline code
# (modeling_qwen_image.py), which is downloaded and imported in section 2.
REPO_ID = "kr-cen/Qwen-Image-MICo"
MODEL_FILENAME = "modeling_qwen_image.py"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16

# ================= 1.2 MICo Refine Prompt (CPU Qwen 0.5B) =================
# Small instruct model pinned to CPU so prompt refinement never consumes
# the CUDA budget reserved for image generation (ZeroGPU-friendly).
# NOTE(review): bfloat16 on CPU can be slow on older hardware — confirm.
QWEN_REFINE_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
print("🧠 Loading Qwen2.5-0.5B-Instruct on CPU for Refine Prompt...")
refine_tokenizer = AutoTokenizer.from_pretrained(QWEN_REFINE_MODEL)
refine_model = AutoModelForCausalLM.from_pretrained(
    QWEN_REFINE_MODEL,
    device_map="cpu",
    torch_dtype=torch.bfloat16
)
refine_model.eval()
print("✅ Refine Prompt model loaded (CPU).")
| # ================= 1.3 MICo Refine Prompt Logic ================= | |
def generate_refined_prompt_cpu(image_prompts):
    """Merge per-image captions into one structured editing prompt.

    Runs the CPU-resident Qwen2.5-0.5B-Instruct chat model over the given
    list of short descriptions and returns a single coherent prompt that
    refers to each subject by its image index.
    """
    system_prompt = (
        "You are an expert assistant for multi-image composition. "
        "You will be given short descriptions of multiple reference images. "
        "Each description corresponds to a DIFFERENT image and a DIFFERENT subject.\n"
        "Your task is to merge all descriptions into ONE coherent, detailed editing prompt.\n"
        "Important: You MUST explicitly refer to each subject using image indices, such as "
        "'image 1', 'image 2', 'image 3', etc., whenever describing or distinguishing subjects.\n"
        "Do NOT merge different images into a single subject. "
        "If multiple inputs describe the same phrase (e.g., 'a woman', 'a woman'), "
        "treat them as distinct individuals from different images.\n"
        "Keep the final prompt concise, natural, and visually descriptive.\n"
        "Example:\n"
        "Input: 'a man in a leather jacket', 'a woman'\n"
        "Output: "
        "Set the scene on a city rooftop during twilight, with the man from image 1 wearing a leather jacket "
        "standing near the edge of the rooftop, and the woman from image 2 in a chic trench coat beside him; "
        "shot from a slightly high angle to capture the panoramic urban backdrop."
    )
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "\n".join(image_prompts)},
    ]
    templated = refine_tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    model_inputs = refine_tokenizer(templated, return_tensors="pt")
    # Greedy decoding keeps output deterministic; the token budget grows
    # with the caption count but never drops below 100.
    token_budget = max(100, len(image_prompts) * 25)
    with torch.no_grad():
        generated = refine_model.generate(
            **model_inputs,
            max_new_tokens=token_budget,
            do_sample=False
        )
    # Drop the echoed prompt tokens and decode only the new tail.
    new_tokens = generated[0][model_inputs["input_ids"].shape[-1]:]
    refined = refine_tokenizer.decode(
        new_tokens, skip_special_tokens=True
    ).strip()
    print(f"🔑 [Prompt Refiner]: Plain Text {image_prompts}, Structured: {refined}.")
    return refined
# HTML blurb rendered under the page title (gr.Markdown accepts raw HTML).
description = r"""
🔥 <b>Official Gradio Demo</b> for
<a href='https://mico-150k.github.io/' target='_blank'>
<b>MICo-150K: A Comprehensive Dataset Advancing Multi-Image Composition</b>
</a>.<br>
🚀 <b>
<span style="color:#b91c1c;">Qwen-Image-MICo</span>
</b> is fine-tuned from <b>Qwen-Image-Edit</b>, supporting multi-image composition with
<b><span style="color:#b91c1c;">up to 6 reference images</span></b>.<br>
🖼️ Upload at least <b>two images</b>, provide a descriptive prompt, and generate high-quality composition results.<br>
🔑 <b>
<span style="color:#b91c1c;">New Feature: Structured Prompting</span></b>. You can now provide a short caption for <b>each reference image</b>. These captions will be automatically merged into a <b>coherent, structured editing prompt</b> for more precise multi-image composition.
"""

# Footer markdown: star request, BibTeX citation, license, and contacts.
article = r"""
---
<h3>🌟 <b>If you find Qwen-Image-MICo helpful, please consider starring this space and our <a href='https://github.com/A113N-W3I/MICo-150K' target='_blank'>Github Repo</a>. Thanks!</b></h3>
📑 <b>Citation</b><br>
If you find our work useful for your research, please consider citing:<br>
```bibtex
@article{wei2025mico,
title={MICo-150K: A Comprehensive Dataset Advancing Multi-Image Composition},
author={Wei, Xinyu and Cen, Kangrui and Wei, Hongyang and Guo, Zhen and Li, Bairui and Wang, Zeqing and Zhang, Jinrui and Zhang, Lei},
journal={arXiv preprint arXiv:2512.07348},
year={2025}
}
```
📝 <b>License</b><br>
This project is licensed under <b>Apache License 2.0</b>.<br>
📧 <b>Contact</b><br>
For questions or collaborations, feel free to reach out at <b>kangruicen@gmail.com</b> and <b>allen_wei@stu.pku.edu.cn</b>.<br>
"""
# Custom CSS injected into gr.Blocks: fixed-size input/output image boxes,
# example-gallery layout, typography, and the "Structure Your Prompt" panel.
custom_css = """
.gradio-container {width: 85% !important; margin: 0 auto;}
/* Output container: fixed height, fit image */
#output-image-container {
    height: 600px !important;
    width: 100% !important;
    display: flex;
    justify-content: center;
    align-items: center;
    background-color: var(--background-fill-secondary);
    border-radius: 8px;
    overflow: hidden;
}
#output-image-container img {
    max-height: 100% !important;
    max-width: 100% !important;
    object-fit: contain !important;
    width: auto !important;
    height: auto !important;
}
.page-description {
    margin-top: 12px !important;
}
/* INPUT IMAGES: 6 Fixed Boxes */
.input-image-box {
    height: 180px !important;
    overflow: hidden;
}
.input-image-box img {
    object-fit: contain !important;
    width: 100%;
    height: 100%;
}
/* Example Gallery 容器 */
#example-gallery {
    min-height: 400px; /* Slightly increase the margin */
    overflow-x: auto;
}
#example-gallery .grid {
    display: flex !important;
    justify-content: flex-start !important;
    gap: 16px !important;
    flex-wrap: wrap !important; /* Allow line breaks */
}
/* Force the size of each gallery item (usually a button element) to be fixed. */
#example-gallery button {
    width: 160px !important;
    height: 160px !important;
    flex: none !important;
    border: none !important;
    overflow: hidden !important;
}
/* image */
#example-gallery img {
    height: 100% !important;
    width: 100% !important;
    object-fit: cover !important;
}
#example-gallery .label {
    display: none !important; /* Hide possible label text */
}
/* Typography */
.title-container {
    text-align: center;
    padding: 0;
    margin: 0;
    height: 5vh;
    width: 80vw;
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
    font-weight: 60;
}
.footer-text {
    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
    font-size: 0.9rem;
    color: var(--body-text-color-subdued);
    text-align: center;
    margin-top: 20px;
}
/* Global text style scope */
.global-text-style {
    font-family: -apple-system, BlinkMacSystemFont,
    "Segoe UI", Roboto, Helvetica, Arial,
    "PingFang SC", "Hiragino Sans GB",
    "Microsoft YaHei", sans-serif;
}
.global-text-style .markdown {
    font-family: inherit;
}
/* Label / Button / Slider / Accordion */
.global-text-style label,
.global-text-style button,
.global-text-style input,
.global-text-style textarea,
.global-text-style select {
    font-family: inherit;
}
#refine-panel {
    border: 1px solid var(--border-color-primary);
    padding: 12px 16px;
    margin-top: 8px;
    margin-bottom: 8px;
}
#refine-panel > .wrap {
    padding: 0 !important;
}
.refine-helper-text {
    background: transparent !important;
    padding: 6px 2px !important;
    font-size: 0.95em;
    color: var(--body-text-color-subdued);
}
.structure-checkbox label {
    font-size: 1.05em;
    font-weight: 600;
}
.refine-generate-btn {
    background: #eef2ff !important; /* very light indigo */
    border: 1px solid #c7d2fe !important;
    color: #3730a3 !important;
    font-weight: 600;
}
.refine-generate-btn:hover {
    background: #e0e7ff !important;
}
"""
# ==========================================
# 2. Custom Model Architecture Loading
# ==========================================
# The custom transformer/pipeline classes live in modeling_qwen_image.py on
# the Hub; fetch it next to this script (once) so it can be imported below.
print("⬇️ Loading custom model architecture...")
try:
    if not os.path.exists(MODEL_FILENAME):
        print(f"Downloading {MODEL_FILENAME} from public repo...")
        hf_hub_download(
            repo_id=REPO_ID,
            filename=MODEL_FILENAME,
            repo_type="model",
            local_dir="."
        )
    from modeling_qwen_image import QwenImageTransformer2DModel, QwenImageEditPipeline
    print("✅ Custom code imported successfully.")
except Exception as e:
    print(f"❌ Failed to load custom code: {e}")
    # Chain the original exception so the real cause (network failure,
    # import error, ...) is shown as the explicit cause in the traceback.
    raise RuntimeError("Could not load model architecture. Check logs.") from e
# ==========================================
# 3. Model Loading (Qwen-Image-MICo)
# ==========================================
# Assemble the custom QwenImageEditPipeline from its individually hosted
# components (transformer, scheduler, text encoder, tokenizer, processor,
# VAE), all loaded from REPO_ID in bf16 and moved to the selected device.
print("🚀 Loading model weights ...")
try:
    transformer = QwenImageTransformer2DModel.from_pretrained(
        REPO_ID,
        subfolder="transformer",
        torch_dtype=dtype,
        use_safetensors=False
    ).to(device)
    scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
        REPO_ID, subfolder='scheduler'
    )
    text_encoder = AutoModel.from_pretrained(
        REPO_ID, subfolder='text_encoder',
        device_map=device, torch_dtype=dtype
    )
    tokenizer = AutoTokenizer.from_pretrained(
        REPO_ID, subfolder='tokenizer'
    )
    processor = Qwen2VLProcessor.from_pretrained(
        REPO_ID, subfolder='processor'
    )
    vae = AutoencoderKLQwenImage.from_pretrained(
        REPO_ID, subfolder='vae',
        torch_dtype=dtype
    ).to(device)
    pipe = QwenImageEditPipeline(
        scheduler=scheduler,
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        processor=processor,
        transformer=transformer
    ).to(device)
    print("✨ Model loaded successfully.")
except Exception as e:
    print(f"❌ Critical Model Loading Error: {e}")
    # Chain the original error so the actual failing component is visible.
    raise RuntimeError("Failed to load model weights.") from e
| # ========================================== | |
| # 4. Inference Logic (Streaming Logs) | |
| # ========================================== | |
def inference(img1, img2, img3, img4, img5, img6,
              prompt, negative_prompt,
              cfg_scale, seed, width, height, inference_steps,
              progress=gr.Progress(track_tqdm=True)
              ):
    """
    Streaming inference entry point wired to the "Generate Artwork" button.

    Yields ``(image_or_None, accumulated_log_text)`` pairs so the log
    textbox updates in real time; the final successful yield carries the
    generated image.

    Args (all bound to UI components):
        img1..img6: file paths of uploaded reference images, or None.
        prompt / negative_prompt: editing instructions.
        cfg_scale, seed, width, height, inference_steps: sampling settings.
        progress: Gradio progress tracker mirroring the pipeline's tqdm bars.

    NOTE(review): the ZeroGPU cleanup (moving the pipe back to CPU) that the
    old docstring advertised is currently commented out at the bottom.
    """
    # Accumulate timestamped log lines; add_log returns the joined text so
    # every yield can push the complete log-window content.
    log_history = []
    def add_log(msg):
        timestamp = datetime.datetime.now().strftime("%H:%M:%S")
        entry = f"[{timestamp}] {msg}"
        log_history.append(entry)
        return "\n".join(log_history)
    # Initial flush so the user gets immediate feedback.
    current_logs = add_log("Status: Request received. Initializing...")
    yield None, current_logs
    try:
        # --- A. Resource Allocation (disabled: pipe stays resident on GPU) ---
        # current_logs = add_log("Status: ZeroGPU allocated. Moving model to CUDA...")
        # yield None, current_logs
        # try:
        #     pipe.to("cuda")
        # except Exception as e:
        #     current_logs = add_log("Error: Failed to access GPU.")
        #     yield None, current_logs
        #     raise RuntimeError("GPU allocation failed.")
        # --- B. Input Validation: need >= 2 images and a non-empty prompt ---
        current_logs = add_log("Status: Check inputs ...")
        yield None, current_logs
        raw_inputs = [img1, img2, img3, img4, img5, img6]
        valid_input_paths = [img for img in raw_inputs if img is not None]
        img_count = len(valid_input_paths)
        if img_count < 2:
            current_logs = add_log(f"Error: Too few images ({img_count}). Minimum 2 required.")
            yield None, current_logs
            return
        if not prompt or not isinstance(prompt, str) or prompt.strip() == "":
            current_logs = add_log("Error: Prompt cannot be empty.")
            yield None, current_logs
            return
        # --- C. Data Processing: open every path as an RGB PIL image ---
        current_logs = add_log("Status: Processing images ...")
        yield None, current_logs
        pil_images = []
        for i, image_path in enumerate(valid_input_paths):
            try:
                img = Image.open(image_path).convert("RGB")
                pil_images.append(img)
            except Exception as e:
                current_logs = add_log(f"Error: Failed to load image #{i+1}.")
                yield None, current_logs
                return
        # --- D. Parameter Setup ---
        # Seed <= 0 (the UI default is 0), out-of-range, or non-numeric
        # input all fall back to a fresh random seed.
        MAX_SEED = 2**32 - 1
        try:
            seed = int(seed)
            if seed < 1 or seed > MAX_SEED:
                seed = random.randint(1, MAX_SEED)
                current_logs = add_log(f"Info: Invalid seed. Using random seed: {seed}")
            else:
                current_logs = add_log(f"Info: Using user-provided seed: {seed}")
        except (TypeError, ValueError):
            seed = random.randint(1, MAX_SEED)
            current_logs = add_log(f"Info: Using random seed: {seed}")
        yield None, current_logs
        # Clamp steps to the [25, 30] range the slider exposes.
        steps = max(25, min(30, int(inference_steps)))
        # Log Confirmation (server console + UI log window).
        print(f"🌟 Inference with {img_count} images, prompt: {prompt}, neg_prompt: {negative_prompt}, cfg: {cfg_scale}, steps: {steps}, res: {width}x{height}, seed: {seed}")
        current_logs = add_log("--- Parameter Confirmation ---")
        current_logs = add_log(f"Images: {len(pil_images)}")
        current_logs = add_log(f"Prompt: {prompt[:50]}..." if len(prompt) > 50 else f"Prompt: {prompt}")
        current_logs = add_log(f"Neg Prompt: {negative_prompt if negative_prompt else 'None'}")
        current_logs = add_log(f"CFG: {cfg_scale}, Steps: {steps}, Res: {width}x{height}")
        current_logs = add_log("----------------------------")
        yield None, current_logs
        # Seeds the global RNG and returns the default torch.Generator.
        generator = torch.manual_seed(seed)
        # --- E. Execution ---
        current_logs = add_log("Status: Running inference... (approx 30-60s)")
        yield None, current_logs
        # A single-space negative prompt is passed when the user left it empty.
        result_image = pipe(
            images=pil_images,
            height=int(height),
            width=int(width),
            prompt=prompt,
            negative_prompt=negative_prompt if negative_prompt else " ",
            num_inference_steps=steps,
            true_cfg_scale=float(cfg_scale),
            generator=generator
        ).images[0]
        current_logs = add_log("Status: Generation successful!")
        yield result_image, current_logs
    # NOTE(review): 'as re' shadows the stdlib 're' module name locally.
    except RuntimeError as re:
        # Surface OOM with a targeted hint; other runtime errors verbatim.
        msg = str(re).lower()
        if "out of memory" in msg:
            current_logs = add_log("Error: GPU Out of Memory. Reduce resolution/images.")
        else:
            current_logs = add_log(f"Error: Runtime error occurred: {msg}")
        yield None, current_logs
    except Exception as e:
        print(f"Server Error: {e}")
        current_logs = add_log("Error: Unexpected system error. Check console.")
        current_logs = add_log("Please check inputs and try again.")
        yield None, current_logs
    # finally:
    #     # --- F. Cleanup (CRITICAL FOR ZEROGPU) ---
    #     # Always move model back to CPU to release GPU memory properly
    #     # and prevent "device not found" errors on next run.
    #     try:
    #         print("Cleaning up: Moving model back to CPU...")
    #         pipe.to("cpu")
    #         torch.cuda.empty_cache()
    #     except Exception as e:
    #         print(f"Cleanup warning: {e}")
def refine_prompt_handler(
    img1, img2, img3, img4, img5, img6,
    p1, p2, p3, p4, p5, p6
):
    """Validate per-image captions and write a structured editing prompt.

    Returns a (prompt_update, status_markdown) pair for the two Gradio
    outputs: either the refined prompt plus a success message, or a
    no-op update (gr.update()) plus a warning.
    """
    slots = zip(
        [img1, img2, img3, img4, img5, img6],
        [p1, p2, p3, p4, p5, p6],
    )
    # Collect captions only for slots that actually hold an image; every
    # uploaded image must carry a non-blank description.
    captions = []
    for idx, (image, caption) in enumerate(slots):
        if image is None:
            continue
        if caption is None or caption.strip() == "":
            return (
                gr.update(),
                "⚠️ Please provide a description for every uploaded image."
            )
        captions.append(caption.strip() + f' from image {idx+1}')
    if not captions:
        return gr.update(), "⚠️ No images provided."
    # Run the CPU-resident Qwen refiner; fall back to a warning on failure.
    try:
        refined_prompt = generate_refined_prompt_cpu(captions)
    except Exception as e:
        print(f"Refine Prompt Error: {e}")
        return gr.update(), f"⚠️ Structure prompt failed."
    # Write the merged prompt back into the Editing Prompt textbox.
    return refined_prompt, "✅ Prompt structured successfully."
| # ========================================== | |
| # 5. UI Construction | |
| # ========================================== | |
| with gr.Blocks(title="Qwen-Image-MICo", theme=gr.themes.Soft(), css=custom_css) as demo: | |
| with gr.Column(elem_classes="global-text-style"): | |
| # Header | |
| with gr.Row(elem_classes="title-container"): | |
| gr.Markdown( | |
| """ | |
| <div style="display:flex; align-items:center; gap:12px; justify-content:center;"> | |
| <img src="https://huggingface.co/spaces/kr-cen/Qwen-Image-MICo/resolve/main/assets/mico-logo.png" width="48" style="margin-bottom: 0;"/> | |
| <div> | |
| <h1 style="margin-bottom: 0;">Qwen-Image-MICo</h1> | |
| </div> | |
| </div> | |
| <h3 style="margin-top: 5px;">Multi-Image Composition & Editing</h3> | |
| <p>🔥 Upload <b>2 to 6 reference images</b> and provide a prompt to seamlessly compose and edit them.</p> | |
| """, | |
| ) | |
| with gr.Row(): | |
| gr.Markdown(description, elem_classes="page-description") | |
| with gr.Row(): | |
| # --- Left Column: Inputs --- | |
| with gr.Column(scale=4): | |
| gr.Markdown("### Reference Images (2-6 Required)") | |
| # --- 6 Fixed Boxes Layout --- | |
| input_images = [] | |
| with gr.Row(): | |
| with gr.Column(scale=1, min_width=100): | |
| img1 = gr.Image(label="Img 1", type="filepath", elem_classes="input-image-box", show_label=True) | |
| input_images.append(img1) | |
| with gr.Column(scale=1, min_width=100): | |
| img2 = gr.Image(label="Img 2", type="filepath", elem_classes="input-image-box", show_label=True) | |
| input_images.append(img2) | |
| with gr.Column(scale=1, min_width=100): | |
| img3 = gr.Image(label="Img 3", type="filepath", elem_classes="input-image-box", show_label=True) | |
| input_images.append(img3) | |
| with gr.Row(): | |
| with gr.Column(scale=1, min_width=100): | |
| img4 = gr.Image(label="Img 4", type="filepath", elem_classes="input-image-box", show_label=True) | |
| input_images.append(img4) | |
| with gr.Column(scale=1, min_width=100): | |
| img5 = gr.Image(label="Img 5", type="filepath", elem_classes="input-image-box", show_label=True) | |
| input_images.append(img5) | |
| with gr.Column(scale=1, min_width=100): | |
| img6 = gr.Image(label="Img 6", type="filepath", elem_classes="input-image-box", show_label=True) | |
| input_images.append(img6) | |
| # Prompt Inputs | |
| prompt_input = gr.Textbox( | |
| label="Editing Prompt", | |
| placeholder="Describe how to combine these images. It's recommended to explicitly refer to image indices, e.g., 'image 1', 'image 2'. You can also try Structured Prompting below (click the checkbox).", | |
| lines=5 | |
| ) | |
| # ================= Structure Your Prompt (NEW) ================= | |
| structure_toggle = gr.Checkbox( | |
| label="🔑 Structure Your Prompt (Optional)", value=False, elem_classes="structure-checkbox" | |
| ) | |
| # with gr.Column(visible=False) as structured_prompt_panel: | |
| with gr.Group(visible=False, elem_id="refine-panel") as refine_panel: | |
| with gr.Column(scale=1): | |
| gr.Markdown( | |
| "📝 Provide **one short description per uploaded image**. " | |
| "The generated result will be written into **Editing Prompt**. (You should upload image first)", | |
| elem_classes="refine-helper-text" | |
| ) | |
| structured_boxes = [] | |
| structured_boxes.append( | |
| gr.Textbox( | |
| label=f"Prompt for Img 1", | |
| placeholder="Write a short description here (E.g., a red sports car)", | |
| lines=2, | |
| interactive=False, | |
| visible=False | |
| ) | |
| ) | |
| structured_boxes.append( | |
| gr.Textbox( | |
| label=f"Prompt for Img 2", | |
| placeholder="E.g., a lady in a fancy dress", | |
| lines=2, | |
| interactive=False, | |
| visible=False | |
| ) | |
| ) | |
| structured_boxes.append( | |
| gr.Textbox( | |
| label=f"Prompt for Img 3", | |
| placeholder="E.g., a mug with floral patterns", | |
| lines=2, | |
| interactive=False, | |
| visible=False | |
| ) | |
| ) | |
| for i in range(3, 6): | |
| structured_boxes.append( | |
| gr.Textbox( | |
| label=f"Prompt for Img {i+1}", | |
| placeholder="Write a short description here", | |
| lines=2, | |
| interactive=False, | |
| visible=False | |
| ) | |
| ) | |
| generate_structured_btn = gr.Button("🧠 Generate Prompt", elem_classes="refine-generate-btn") | |
| structured_status = gr.Markdown("⚡️ Powered by **Qwen2.5-0.5B-Instruct** on CPU (ZeroGPU-Free Inference)", elem_classes="refine-helper-text") | |
| # ---------- Toggle panel visibility ---------- | |
| structure_toggle.change( | |
| fn=lambda x: gr.update(visible=x), | |
| inputs=structure_toggle, | |
| outputs=refine_panel | |
| ) | |
| # ---------- Enable / disable boxes based on image count ---------- | |
def _toggle_structured_boxes(i1, i2, i3, i4, i5, i6):
    """Make each caption textbox editable only while its image slot is filled."""
    return [
        gr.update(interactive=slot is not None)
        for slot in (i1, i2, i3, i4, i5, i6)
    ]
def _update_prompt_visibility(
    img1, img2, img3, img4, img5, img6
):
    """Show each caption textbox only while its image slot is filled."""
    return [
        gr.update(visible=slot is not None)
        for slot in (img1, img2, img3, img4, img5, img6)
    ]
| for img in [img1, img2, img3, img4, img5, img6]: | |
| img.change( | |
| fn=_toggle_structured_boxes, | |
| inputs=[img1, img2, img3, img4, img5, img6], | |
| outputs=structured_boxes | |
| ) | |
| img.change( | |
| fn=_update_prompt_visibility, | |
| inputs=[img1, img2, img3, img4, img5, img6], | |
| outputs=structured_boxes | |
| ) | |
| # ---------- Generate & write back to Editing Prompt ---------- | |
| generate_structured_btn.click( | |
| fn=refine_prompt_handler, | |
| inputs=[img1, img2, img3, img4, img5, img6] + structured_boxes, | |
| outputs=[prompt_input, structured_status] | |
| ) | |
| # Advanced Settings | |
| with gr.Accordion("⚙️ Advanced Settings", open=False): | |
| negative_prompt_input = gr.Textbox(label="Negative Prompt", value="", lines=2) | |
| with gr.Row(): | |
| cfg_input = gr.Slider(minimum=1.0, maximum=10.0, value=4.0, step=0.1, label="CFG Scale") | |
| inference_steps_input = gr.Slider(minimum=25, maximum=30, value=25, step=1, label="Inference Steps") | |
| with gr.Row(): | |
| seed_input = gr.Number(value=0, label="Seed (Zero/Empty for Random)", precision=0) | |
| with gr.Row(): | |
| width_input = gr.Slider(minimum=512, maximum=1280, value=1024, step=32, label="Width") | |
| height_input = gr.Slider(minimum=512, maximum=1280, value=1024, step=32, label="Height") | |
| # Generate Button | |
| run_btn = gr.Button("🚀 Generate Artwork", variant="primary", size="lg") | |
| # --- Right Column: Outputs --- | |
| with gr.Column(scale=5): | |
| gr.Markdown("### Generation Result") | |
| # Output Image | |
| output_image = gr.Image( | |
| label="Result", | |
| elem_id="output-image-container", | |
| show_label=False, | |
| interactive=False | |
| ) | |
| # Real-time Log Window | |
| log_output = gr.Textbox( | |
| label="Process Log", | |
| placeholder="Waiting for start...", | |
| lines=10, | |
| max_lines=12, | |
| interactive=False, | |
| elem_id="log-window" | |
| ) | |
| # ========================================== | |
| # 6. Custom Examples | |
| # ========================================== | |
| example_data = [ | |
| [ | |
| "assets/ex4/ex4-output.png", | |
| ["assets/ex4/ex4-input1.png", "assets/ex4/ex4-input2.png", "assets/ex4/ex4-input3.png", "assets/ex4/ex4-input4.png", "assets/ex4/ex4-input5.png"], | |
| "A tranquil scene unfolds on a serene lake. A boat transporting passengers across water from image 1 gently glides by, while nearby, a ride-on mower for extensive lawn care from image 2 hums softly in a lush garden. On a rustic table, a tall, elegant cylindrical vessel from image 3 stands beside a classic metal flour sifter from image 4. A charming porcelain teacup with floral patterns from image 5 rests atop the table, completing the idyllic setting.", | |
| "", 4.0, 3126538730, 1024, 1024, 25 | |
| ], | |
| [ | |
| "assets/ex1/ex1-output.png", | |
| ["assets/ex1/ex1-input1.png", "assets/ex1/ex1-input2.png", "assets/ex1/ex1-input3.png", "assets/ex1/ex1-input4.png"], | |
| "Underneath the tall and iconic Parisian landmark from image 1, a fast and maneuverable aquatic vehicle from image 2 glides gracefully along the Seine. Nearby, a compact travel caravan for outdoor adventures from image 3 is parked, while a bike with extended handlebars and low seat from image 4 leans against it, creating a vibrant setting of exploration and leisure by the riverbank.", | |
| "", 4.0, 42, 1024, 1024, 30 | |
| ], | |
| [ | |
| "assets/ex5/ex5-output.png", | |
| ["assets/ex5/ex5-input1.png", "assets/ex5/ex5-input2.png", "assets/ex5/ex5-input3.png", "assets/ex5/ex5-input4.png", "assets/ex5/ex5-input5.png"], | |
| "A vibrant morning scene unfolds in a sunlit kitchen. A shiny stainless steel four-slice toaster from image 3 sits on the counter, ready for breakfast, as a beverage brewed from fermented grains from image 5 steams nearby. On the shelf rests a cute and cuddly plush toy from image 4, while a gentle and refreshing foaming face wash from image 1 awaits on the sink. Outside the window, a sleek and powerful racehorse from image 2 gallops gracefully across a lush field, completing the picturesque tableau.", | |
| "", 4.0, 3407, 1024, 1024, 25 | |
| ], | |
| [ | |
| "assets/ex2/ex2-output.png", | |
| ["assets/ex2/ex2-input1.png", "assets/ex2/ex2-input2.png", "assets/ex2/ex2-input3.png"], | |
| "Amidst the bustling city, a woman in image 1 wears the clothes in image 2, standing confidently in the foreground as the tall, iconic skyscraper with a spire crown in image 3 looms majestically behind her, reflecting the sunlight and vibrant energy of the streets.", | |
| "", 4.0, 42, 1024, 1024, 25 | |
| ], | |
| [ | |
| "assets/ex3/ex3-output.png", | |
| ["assets/ex3/ex3-input1.png", "assets/ex3/ex3-input2.png", "assets/ex3/ex3-input3.png", "assets/ex3/ex3-input4.png", "assets/ex3/ex3-input5.png"], | |
| "In a bright, cheerful living room, a girl in image 1 wears the striped pants with a nautical theme in image 2 and the patterned cotton scarf in image 3, playfully tapping on a small, colorful percussion toy drum in image 4. The patterned dining tablecloth in image 5 adorns the nearby table, adding to the vibrant atmosphere.", | |
| "", 4.0, 42, 1024, 1024, 25 | |
| ], | |
| [ | |
| "assets/ex6/ex6-output.png", | |
| ["assets/ex6/ex6-input1.png", "assets/ex6/ex6-input2.png", "assets/ex6/ex6-input3.png"], | |
| "In a rustic cafe, the woman sits at a table with the men leaning over her shoulders, all sipping coffee in casual clothes; their faces are turned towards the camera with warm, genuine smiles, captured from a slightly elevated angle to highlight the cozy ambiance.", | |
| "", 4.0, 0, 1120, 928, 25 | |
| ], | |
| [ | |
| "assets/ex7/ex7-output.png", | |
| ["assets/ex7/ex7-input1.png", "assets/ex7/ex7-input2.png", "assets/ex7/ex7-input3.png"], | |
| "Woman from image 1 appears in a chic, monochrome suit, exuding confidence and poise with a soft smile. Her posture is relaxed, hands elegantly on her hips. Woman from image 2, in a vibrant, flowing dress, exhibits gracefulness, with a gentle tilt of the head and a warm, inviting smile. Both stand facing the camera before the Lakmé Fashion Week backdrop. The even lighting highlights their facial features distinctly against the sponsor logos, capturing the dynamic essence of the event.", | |
| "", 4.0, 0, 864, 1184, 25 | |
| ], | |
| [ | |
| "assets/ex8/ex8-output.png", | |
| ["assets/ex8/ex8-input1.png", "assets/ex8/ex8-input2.png", "assets/ex8/ex8-input3.png" ], | |
| "In a rustic kitchen, the man is wearing an apron over casual clothes, standing between the women who are holding cups of tea, all looking at the camera with a shared, warm smile amidst cozy wooden shelves and soft lighting.", | |
| "", 4.0, 0, 864, 1184, 25 | |
| ], | |
| [ | |
| "assets/ex9/ex9-output.png", | |
| ["assets/ex9/ex9-input1.png", "assets/ex9/ex9-input2.png", "assets/ex9/ex9-input3.png", "assets/ex9/ex9-input4.png", "assets/ex9/ex9-input5.png", "assets/ex9/ex9-input6.png"], | |
| "A woman stands gracefully, exuding elegance against a soft beige backdrop. Her long, wavy hair, cascading beautifully down her shoulders, is the rich, natural shade from Image 5. She wears a snug, pure white sweater from Image 1, paired perfectly with dark blue skinny jeans from Image 2, giving a sleek and modern silhouette. Completing her outfit are the bold black over-the-knee boots from Image 3, adding a touch of daring sophistication. Around her neck, the striking gold tassel necklace from Image 4 glimmers subtly, complementing the ensemble with its chic flair. Her confident pose and the harmony of the outfit create a striking visual, blending warmth and style seamlessly.", | |
| "", 4.0, 895005600, 768, 960, 25 | |
| ], | |
| [ | |
| "assets/ex10/ex10-output.png", | |
| ["assets/ex10/ex10-input1.png", "assets/ex10/ex10-input2.png"], | |
| "Set the couple on a city rooftop during twilight, the man in a leather jacket and the woman in a chic trench coat, both staring confidently at the distant skyline; shoot from a slightly high angle to encompass the panoramic urban backdrop.", | |
| "", 4.0, 4047654811, 1184, 864, 25 | |
| ], | |
| [ | |
| "assets/ex11/ex11-output.png", | |
| ["assets/ex11/ex11-input1.png", "assets/ex11/ex11-input2.png", "assets/ex11/ex11-input3.png", "assets/ex11/ex11-input4.png", "assets/ex11/ex11-input5.png"], | |
| "In a serene park, an old man in image 1 wears a classic beige trench coat with belt in image 2, versatile athletic shorts for active lifestyles in image 3, a patterned cotton scarf for diverse fashion styles in image 5, and holds an umbrella cockatoo in image 4 on his arm. He leisurely strolls along the pathway, enjoying the afternoon sun.", | |
| "", 4.0, 0, 1024, 1024, 25 | |
| ], | |
| [ | |
| "assets/ex12/ex12-output.png", | |
| ["assets/ex12/ex12-input1.png", "assets/ex12/ex12-input2.png", "assets/ex12/ex12-input3.png", "assets/ex12/ex12-input4.png", "assets/ex12/ex12-input5.png"], | |
| "On a sunlit terrace overlooking the ocean, a woman in image 1 wears clothes in image 2, jeans in image 3, a pearl multilayered necklace in image 4, and a bracelet adorned with beads and charms in image 5. She sips iced tea, enjoying the gentle breeze, while the waves break rhythmically against the shore, creating a serene atmosphere.", | |
| "", 4.0, 0, 1024, 1024, 25 | |
| ], | |
| [ | |
| "assets/ex13/ex13-output.png", | |
| ["assets/ex13/ex13-input1.png", "assets/ex13/ex13-input2.png"], | |
| "Capture two men in a cozy library, wearing tweed blazers with subtle patterns, seated side by side in leather armchairs, facing the camera with thoughtful expressions; use soft, warm lighting to highlight their features and shoot from a slightly elevated angle to frame them with books.", | |
| "", 4.0, 0, 1280, 736, 25 | |
| ], | |
| [ | |
| "assets/ex16/ex16-output.png", | |
| ["assets/ex16/ex16-input1.png", "assets/ex16/ex16-input2.png", "assets/ex16/ex16-input3.png", "assets/ex16/ex16-input4.png", "assets/ex16/ex16-input5.png"], | |
| "In a sunlit garden corner, an ideal tool for effective soil cultivation from image 1 leans against a wooden fence, while a yellow, ruled pad with perforated pages from image 2 lies on a nearby table, covered with sketches of flowers. A vintage porcelain collector's item from image 3 is displayed prominently. Above, a versatile cargo and troop transport aircraft from image 4 flies past, casting a shadow over a small, practical kitchen utensil scoop from image 5, resting in a flowerpot.", | |
| "", 4.0, 0, 1024, 1024, 25 | |
| ], | |
| [ | |
| "assets/ex14/ex14-output.png", | |
| ["assets/ex14/ex14-input1.png", "assets/ex14/ex14-input2.png", "assets/ex14/ex14-input3.png", "assets/ex14/ex14-input4.png", "assets/ex14/ex14-input5.png", "assets/ex14/ex14-input6.png"], | |
| "Dressed in a sporty ensemble, the person stands confidently amidst the rugged beauty of a desert landscape. They wear a white and navy blue raglan sleeve shirt with a red collar from Image 1, paired with stylish blue shorts from Image 2. Completing the look are sleek black and white cycling shoes from Image 3, which match perfectly with a vibrant cap featuring dark blue, light blue, and orange panels from Image 4. This dynamic attire is set against the backdrop of towering Joshua trees and sprawling shrubs from Image 6, highlighting the blend of athleticism and nature in this unique portrait.", | |
| "", 4.0, 2128720792, 768, 1024, 25 | |
| ], | |
| [ | |
| "assets/ex15/ex15-output.png", | |
| ["assets/ex15/ex15-input1.png", "assets/ex15/ex15-input2.png", "assets/ex15/ex15-input3.png", "assets/ex15/ex15-input4.png", "assets/ex15/ex15-input5.png"], | |
| "A woman stands elegantly in front of a bright, festively lit magenta backdrop labeled 'BVLGARI FESTA'. Her shiny blonde hair cascades in soft waves from a middle part. She exudes sophistication in a form-fitting black dress, accentuating her structured silhouette. The outfit is perfectly complemented by a pair of glossy black heels. Around her neck, a sparkling diamond necklace with a striking emerald pendant adds a touch of luxury. A matching emerald ring graces her finger, paired with a chic black clutch adorned with a metal clasp, and a dazzling cuff bracelet. Her poised stance and radiant accessories create a captivating presence amidst an atmosphere filled with glamour and allure.", | |
| "", 4.0, 3946676478, 768, 1152, 25 | |
| ], | |
| ] | |
if os.path.exists("assets"):
    # Example browser: a strip of clickable thumbnails, one per entry in
    # example_data. Each tile shows that example's expected output image;
    # clicking it populates the form (see apply_example below).
    gr.Markdown("### 🧩 Examples (Click to visualize)")
    thumbnails = [example[0] for example in example_data]
    example_gallery = gr.Gallery(
        value=thumbnails,
        columns=8,
        height=360,
        show_label=False,
        allow_preview=False,  # click selects the example instead of zooming it
        elem_id="example-gallery"
    )
def apply_example(evt: gr.SelectData):
    """Populate the UI from the clicked example-gallery tile.

    Looks up the selected row of ``example_data`` and returns, in order:
    six reference-image slots, prompt, negative prompt, CFG scale, seed,
    width, height, inference steps, the example's output image as a
    preview, and a status message. This order MUST match the ``outputs``
    list passed to ``example_gallery.select``.
    """
    row = example_data[evt.index]
    output_path, input_paths, prompt, neg_prompt, cfg, seed, width, height, steps = row
    # Pad with None up to 6 slots, then slice to EXACTLY 6. The slice also
    # guards against an example ever listing more than 6 reference images,
    # which would otherwise shift every later output out of alignment.
    filled_inputs = (list(input_paths) + [None] * 6)[:6]
    return (
        *filled_inputs,          # unpack the 6 image slots
        prompt,
        neg_prompt,
        cfg,
        seed,
        width,
        height,
        steps,
        output_path,             # show the expected result as a preview
        "Status: Example loaded. Click Generate."
    )
# Wire gallery clicks to the form. The order of `outputs` must match the
# tuple returned by apply_example exactly, element for element.
example_gallery.select(
    fn=apply_example,
    inputs=None,  # the selected index arrives via the gr.SelectData event payload
    outputs=[
        img1, img2, img3, img4, img5, img6,  # Map to 6 boxes
        prompt_input, negative_prompt_input,
        cfg_input, seed_input, width_input, height_input, inference_steps_input,
        output_image, log_output
    ]
)
# Event Binding for Run Button: forwards all form values to `inference`.
# The `inputs` order here must match the parameter order of `inference`
# (defined elsewhere in this file).
run_btn.click(
    fn=inference,
    inputs=[
        img1, img2, img3, img4, img5, img6,  # 6 separate inputs
        prompt_input,
        negative_prompt_input,
        cfg_input,
        seed_input,
        width_input,
        height_input,
        inference_steps_input
    ],
    outputs=[output_image, log_output]  # Updates output and logs dynamically
)
# Footer: static attribution banner plus the `article` markdown
# (defined elsewhere in this file).
gr.Markdown(
    """
    <div class="footer-text">
        Model finetuned on Qwen-Image-Edit. Powered by Hugging Face ZeroGPU.
    </div>
    """
)
gr.Markdown(article)
if __name__ == "__main__":
    # demo.queue()  # NOTE(review): left disabled — presumably queueing is handled by the Space runtime; confirm before re-enabling
    demo.launch()