import os
import subprocess
import sys
import httpx
import json
import base64
from io import BytesIO

# Fix for Python 3.13 audioop removal
try:
    import audioop
except ImportError:
    import audioop_lts as audioop
    sys.modules["audioop"] = audioop

from fastapi import Request,FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
import uvicorn
import gradio as gr
from openai import OpenAI

# --- CONFIGURATION ---
MODEL_ID = "numind/NuMarkdown-8B-Thinking"
GPU_UTILIZATION = 0.95
MAX_MODEL_LEN = 16384 
VLLM_PORT = 8000
HF_PORT = 7860

# --- STEP 1: START vLLM ---
def start_vllm():
    if "VLLM_PID" in os.environ:
        return
    print("🚀 Starting vLLM engine...")
    command = [
        "python3", "-m", "vllm.entrypoints.openai.api_server",
        "--model", MODEL_ID,
        "--host", "127.0.0.1",
        "--port", str(VLLM_PORT),
        "--trust-remote-code",
        "--gpu-memory-utilization", str(GPU_UTILIZATION),
        "--max-model-len", str(MAX_MODEL_LEN),
        "--dtype", "bfloat16",
        "--limit-mm-per-prompt", '{"image": 1}',
        # === Fix for Qwen2-VL Image Processor Warning ===
        "--mm-processor-kwargs", '{"use_fast": true}',
        # Optional but helpful
        "--enforce-eager"        
    ]
    # Connect vLLM logs to the HF console logs
    subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr)
    os.environ["VLLM_PID"] = "running"

start_vllm()
# --- STEP 2: FASTAPI PROXY (API) ---
app = FastAPI()

# We add the external API proxy directly to this app
@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def gatekeeper_proxy(path: str, request: Request):
    target_url = f"http://127.0.0.1:{VLLM_PORT}/v1/{path}"
    
    # Strip Host and Content-Length to prevent routing loops on HF
    headers = {k: v for k, v in request.headers.items() if k.lower() not in ["host", "content-length"]}
    
    async with httpx.AsyncClient(timeout=300.0) as client:
        try:
            if path == "chat/completions" and request.method == "POST":
                body = await request.json()
                if not body.get("stream", False):
                    resp = await client.post(target_url, headers=headers, json=body)
                    if resp.status_code == 200:
                        data = resp.json()
                        content = data["choices"][0]["message"].get("content", "")
                        # STRIP THINKING FROM EXTERNAL DOCLING API
                        if "</think>" in content:
                            data["choices"][0]["message"]["content"] = content.split("</think>")[-1].strip()
                        return JSONResponse(content=data)
                    return JSONResponse(status_code=resp.status_code, content=resp.json())

            # Fallback for models list, etc.
            proxy_req = client.build_request(request.method, target_url, headers=headers, content=await request.body())
            r = await client.send(proxy_req, stream=True)
            return StreamingResponse(r.aiter_raw(), status_code=r.status_code, headers=dict(r.headers))
        except Exception as e:
            return JSONResponse(status_code=503, content={"error": f"API Proxy Error: {str(e)}"})
# --- STEP 2: UI LOGIC ---
def run_ui_test(image, prompt):
    if image is None: return "⚠️ Please upload an image."
    
    # Internal check for vLLM
    try:
        with httpx.Client() as check:
            check.get(f"http://127.0.0.1:{VLLM_PORT}/v1/models", timeout=2.0)
    except:
        return "⏳ Model is still loading... please wait 3-5 minutes."

    client = OpenAI(base_url=f"http://127.0.0.1:{VLLM_PORT}/v1", api_key="EMPTY")
    try:
        image = image.convert("RGB")
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        b64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
        
        completion = client.chat.completions.create(
            model=MODEL_ID,
            messages=[{"role": "user", "content": [
                {"type": "text", "text": prompt or "Convert to markdown."},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
            ]}],
            timeout=300.0
        )
        content = completion.choices[0].message.content
        # Suppress reasoning for UI
        return content.split("</think>")[-1].strip() if "</think>" in content else content
    except Exception as e:
        return f"❌ Error: {str(e)}"

with gr.Blocks(title="NuMarkdown API") as demo:
    gr.Markdown("# NuMarkdown L40S API Server")
    gr.Markdown("The API is live at `/v1/chat/completions` (Reasoning stripped automatically).")
    with gr.Row():
        with gr.Column():
            img_input = gr.Image(type="pil", label="Input Document")
            txt_input = gr.Textbox(value="Convert to markdown.", label="Prompt")
            btn = gr.Button("Extract Markdown", variant="primary")
        with gr.Column():
            out = gr.Textbox(label="Output", lines=20, show_copy_button=True)
    
    btn.click(run_ui_test, inputs=[img_input, txt_input], outputs=[out])

# --- STEP 3: ATTACH PROXY TO GRADIO'S APP ---
# We enable the queue for long tasks
# 1. FIX ATTRIBUTE ERROR: Patch missing attributes onto the demo object
demo.max_file_size = 100 * 1024 * 1024 # 100MB
demo.proxy_url = None
demo.root_path = ""
demo.queue()

# We get the FastAPI instance from Gradio
# app = demo.app
# 3. Mount Gradio to FastAPI
# Using path="" and assigning to the app ensures assets are at the root
app = gr.mount_gradio_app(app, demo, path="/")


# --- STEP 4: RUN ---
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=HF_PORT, workers=1)