import gradio as gr
from transformers import pipeline
from PIL import Image
import io


# ---------- optional: PDF -> PIL first page ----------
def pdf_first_page_to_pil(file_bytes: bytes) -> Image.Image:
    import fitz  # PyMuPDF
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        page = doc[0]
        pix = page.get_pixmap(dpi=200)
        return Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")


# ---------- init model ----------
pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-2B-Instruct")


# ---------- robust extractor: returns ONLY the model text ----------
def _only_model_text(out) -> str:
    # Already a plain string
    if isinstance(out, str):
        return out
    # Case 1: pipelines often return {"generated_text": ...}; for chat inputs
    # the value can itself be a list of role/content messages, so recurse
    if isinstance(out, dict) and "generated_text" in out:
        return _only_model_text(out["generated_text"])
    # Case 2: list of dicts (mixed roles)
    if isinstance(out, list):
        # Prefer any dict with generated_text first
        for item in out:
            if isinstance(item, dict) and "generated_text" in item:
                return _only_model_text(item["generated_text"])
        # Otherwise find the last assistant message (the model's reply)
        for item in reversed(out):
            if isinstance(item, dict) and item.get("role") == "assistant":
                content = item.get("content")
                if isinstance(content, str):
                    return content
                if isinstance(content, list):
                    # collect text pieces within the assistant content
                    chunks = [
                        c["text"]
                        for c in content
                        if isinstance(c, dict)
                        and c.get("type") == "text"
                        and isinstance(c.get("text"), str)
                    ]
                    if chunks:
                        return "\n".join(chunks)
    # Fallback
    return str(out)


def infer(file_obj, prompt):
    if file_obj is None:
        return "Please upload an image or PDF."
    if not prompt or not prompt.strip():
        return "Please enter a prompt."

    # gr.File may hand back a filepath string (Gradio 4 default) or a
    # tempfile-like object with a .name attribute (older Gradio versions)
    path = file_obj if isinstance(file_obj, str) else file_obj.name

    # read file
    with open(path, "rb") as f:
        raw = f.read()

    # load PIL image (PDFs are rendered via their first page)
    name = (path or "").lower()
    try:
        if name.endswith(".pdf") or raw[:4] == b"%PDF":
            pil_img = pdf_first_page_to_pil(raw)
        else:
            pil_img = Image.open(io.BytesIO(raw)).convert("RGB")
    except Exception as e:
        return f"Failed to read the file: {e}"

    # build messages in Qwen2-VL chat format (image embedded in the content)
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": pil_img},
            {"type": "text", "text": prompt.strip()},
        ],
    }]

    # run model
    out = pipe(text=messages, max_new_tokens=256)

    # return ONLY the assistant text
    return _only_model_text(out)


# ---------- Gradio UI ----------
with gr.Blocks(
    title="Qwen2-VL-2B — File + Prompt",
    css="""
    /* make the output box grow nicely */
    #resp_out textarea {min-height: 220px;}
    """,
) as demo:
    gr.Markdown("### Qwen2-VL-2B — Upload an image (or PDF first page) and ask a question.")
    with gr.Row():
        file_in = gr.File(label="Upload image or PDF", file_types=["image", ".pdf"])
        prompt_in = gr.Textbox(label="Prompt", placeholder="Ask anything…", lines=3)
    run_btn = gr.Button("Run")

    # output textbox that expands (via the CSS above)
    resp_out = gr.Textbox(
        label="Model Response",
        lines=8,
        show_copy_button=True,
        elem_id="resp_out",
    )

    run_btn.click(fn=infer, inputs=[file_in, prompt_in], outputs=[resp_out])


if __name__ == "__main__":
    demo.launch()
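
# ---------- optional: dependencies and a programmatic client (a sketch) ----------
# Suggested install (package names only; exact versions are assumptions,
# pin as needed for your environment):
#
#   pip install gradio transformers torch pillow pymupdf
#
# Once the app is running, it can also be called from Python via gradio_client.
# This is a minimal sketch: the local URL and "sample.png" are placeholders,
# and "/infer" assumes Gradio's default api_name derived from the function
# name -- adjust all three for your deployment.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   answer = client.predict(
#       handle_file("sample.png"),        # hypothetical local test image
#       "Describe this image.",
#       api_name="/infer",
#   )
#   print(answer)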