Blip / app.py
WaysAheadGlobal's picture
Update app.py
ef81d40 verified
# app.py
import gradio as gr
from tinyllava.model.builder import load_pretrained_model
from tinyllava.utils import disable_torch_init
from tinyllava.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path
import torch
from PIL import Image
# --- Disable unnecessary torch init ---
disable_torch_init()

# --- Load TinyLLaVA 3.1B ---
model_path = "bczhou/TinyLLaVA-3.1B"  # official HF ID
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_base=None,  # If you have a base model, point it here; else leave as is
    # Derive the name from the path (instead of repeating it as a literal) so
    # changing `model_path` cannot leave a stale, mismatched model name behind.
    model_name=get_model_name_from_path(model_path),
)

# All inference in this app runs on CPU.
device = torch.device("cpu")
model.to(device)
# --- Gradio handler ---
def describe_image(image, prompt):
    """Answer a free-form question about an uploaded image with TinyLLaVA.

    Args:
        image: Image as a NumPy array (what ``gr.Image(type="numpy")`` hands
            over), or ``None`` when the user submitted without uploading.
        prompt: The user's question. Blank input falls back to a generic
            "describe this image" request.

    Returns:
        The generated answer as a string, or a short hint when no image was
        supplied.
    """
    # Submitting the form without an upload passes None; bail out with a
    # readable message instead of crashing inside Image.fromarray.
    if image is None:
        return "Please upload an image first."

    # Imported here because the module-level imports do not provide these names.
    from tinyllava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX

    question = (prompt or "").strip() or "Describe this image."

    # TinyLLaVA wants PIL
    pil_image = Image.fromarray(image).convert("RGB")
    image_tensor = process_images([pil_image], image_processor, model.config)
    # Match the dtype of the loaded weights so the vision tower does not hit
    # a float32/float16 mismatch.
    image_tensor = image_tensor.to(device, dtype=model.dtype)

    # The image features are spliced in where the <image> placeholder sits,
    # so the prompt must contain it.
    if DEFAULT_IMAGE_TOKEN not in question:
        question = f"{DEFAULT_IMAGE_TOKEN}\n{question}"
    # NOTE(review): upstream inference scripts additionally wrap the question
    # in a conversation template before tokenizing -- confirm which template
    # this checkpoint expects and add it here if answer quality is poor.

    # tokenizer_image_token already returns token ids (with the image
    # placeholder mapped to IMAGE_TOKEN_INDEX); it must not be re-tokenized.
    input_ids = tokenizer_image_token(
        question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
    )
    input_ids = input_ids.unsqueeze(0).to(device)  # -> (1, seq_len)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            do_sample=True,
            temperature=0.2,
            max_new_tokens=200,
        )

    # Some versions of generate() echo the prompt ids in front of the answer.
    # Strip them when present so only newly generated tokens are decoded
    # (the echoed prompt also contains the non-vocabulary image placeholder).
    prompt_len = input_ids.shape[1]
    if output_ids.shape[1] >= prompt_len and torch.equal(
        output_ids[:, :prompt_len], input_ids
    ):
        output_ids = output_ids[:, prompt_len:]

    return tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
# --- Gradio UI: one image + one question in, one text answer out ---
iface = gr.Interface(
    fn=describe_image,
    inputs=[
        gr.Image(type="numpy", label="Image"),
        gr.Textbox(label="Your question", placeholder="What's happening in this image?"),
    ],
    outputs=gr.Textbox(label="TinyLLaVA Answer"),
    # The title had been stored as mis-decoded UTF-8 (mojibake); restored to
    # the intended llama emoji and em dash.
    title="🦙 TinyLLaVA-3.1B — Vision-Language Q&A",
    description="A lightweight LLaVA variant that runs on CPU Spaces. Upload an image, ask a question.",
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()