import gradio as gr
from transformers import pipeline
from PIL import Image
import io


# ---------- optional: PDF -> PIL first page ----------
def pdf_first_page_to_pil(file_bytes: bytes) -> Image.Image:
    import fitz  # PyMuPDF
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        page = doc[0]
        pix = page.get_pixmap(dpi=200)
        return Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")


# ---------- init model ----------
pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-2B-Instruct")


# ---------- robust extractor: returns ONLY the model text ----------
def _only_model_text(out) -> str:
    # Already a plain string
    if isinstance(out, str):
        return out
    # Case 1: pipelines often return {"generated_text": ...}; for chat inputs
    # the value can itself be a list of role/content messages, so recurse
    if isinstance(out, dict) and "generated_text" in out:
        return _only_model_text(out["generated_text"])
    # Case 2: list of dicts (mixed roles)
    if isinstance(out, list):
        # Prefer any dict with generated_text first
        for item in out:
            if isinstance(item, dict) and "generated_text" in item:
                return _only_model_text(item["generated_text"])
        # Otherwise find the last assistant message (the model's reply)
        for item in reversed(out):
            if isinstance(item, dict) and item.get("role") == "assistant":
                content = item.get("content")
                if isinstance(content, str):
                    return content
                if isinstance(content, list):
                    # collect text pieces within the assistant content
                    chunks = [
                        c["text"]
                        for c in content
                        if isinstance(c, dict)
                        and c.get("type") == "text"
                        and isinstance(c.get("text"), str)
                    ]
                    if chunks:
                        return "\n".join(chunks)
    # Fallback
    return str(out)


def infer(file_obj, prompt):
    if file_obj is None:
        return "Please upload an image or PDF."
    if not prompt or not prompt.strip():
        return "Please enter a prompt."

    # gr.File may hand back a filepath string (Gradio 4 default) or a
    # tempfile-like object with a .name attribute (older Gradio versions)
    path = file_obj if isinstance(file_obj, str) else file_obj.name

    # read file
    with open(path, "rb") as f:
        raw = f.read()

    # load PIL image (PDFs are rendered via their first page)
    name = (path or "").lower()
    try:
        if name.endswith(".pdf") or raw[:4] == b"%PDF":
            pil_img = pdf_first_page_to_pil(raw)
        else:
            pil_img = Image.open(io.BytesIO(raw)).convert("RGB")
    except Exception as e:
        return f"Failed to read the file: {e}"

    # build messages in Qwen2-VL chat format (image embedded in the content)
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": pil_img},
            {"type": "text", "text": prompt.strip()},
        ],
    }]

    # run model
    out = pipe(text=messages, max_new_tokens=256)

    # return ONLY the assistant text
    return _only_model_text(out)


# ---------- Gradio UI ----------
with gr.Blocks(
    title="Qwen2-VL-2B — File + Prompt",
    css="""
    /* make the output box grow nicely */
    #resp_out textarea {min-height: 220px;}
    """,
) as demo:
    gr.Markdown("### Qwen2-VL-2B — Upload an image (or PDF first page) and ask a question.")
    with gr.Row():
        file_in = gr.File(label="Upload image or PDF", file_types=["image", ".pdf"])
        prompt_in = gr.Textbox(label="Prompt", placeholder="Ask anything…", lines=3)
    run_btn = gr.Button("Run")

    # output textbox that expands (via the CSS above)
    resp_out = gr.Textbox(
        label="Model Response",
        lines=8,
        show_copy_button=True,
        elem_id="resp_out",
    )

    run_btn.click(fn=infer, inputs=[file_in, prompt_in], outputs=[resp_out])


if __name__ == "__main__":
    demo.launch()
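
# ---------- optional: dependencies and a programmatic client (a sketch) ----------
# Suggested install (package names only; exact versions are assumptions,
# pin as needed for your environment):
#
#   pip install gradio transformers torch pillow pymupdf
#
# Once the app is running, it can also be called from Python via gradio_client.
# This is a minimal sketch: the local URL and "sample.png" are placeholders,
# and "/infer" assumes Gradio's default api_name derived from the function
# name -- adjust all three for your deployment.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860/")
#   answer = client.predict(
#       handle_file("sample.png"),        # hypothetical local test image
#       "Describe this image.",
#       api_name="/infer",
#   )
#   print(answer)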