"""Gradio app that describes an uploaded image with the Kosmos-2 model.

A Hugging Face ``image-text-to-text`` pipeline is created once at import
time and exposed through a single-input Gradio interface.
"""

import gradio as gr
from transformers import pipeline
from PIL import Image

# Hugging Face model id loaded by the pipeline.
MODEL_ID = "microsoft/kosmos-2-patch14-224"

# Text prompt handed to the model together with the image.
DEFAULT_PROMPT = "The person is"

# Upper bound on generated tokens: limits detail in exchange for speed.
DEFAULT_MAX_NEW_TOKENS = 32

# High-level pipeline helper, built once so every request reuses the model.
# device=-1 asks transformers to run on the CPU.
pipe = pipeline(
    "image-text-to-text",
    model=MODEL_ID,
    device=-1,
)


def get_image_caption(
    image,
    *,
    prompt=DEFAULT_PROMPT,
    max_new_tokens=DEFAULT_MAX_NEW_TOKENS,
):
    """Generate text for ``image`` using the module-level pipeline.

    Args:
        image: Image supplied by the Gradio input component (configured
            with ``type="pil"``); it must provide ``convert``.
        prompt: Text passed to the pipeline alongside the image. Keyword
            only, so the Gradio interface keeps calling this function with
            the image alone.
        max_new_tokens: Maximum number of tokens the model may generate.

    Returns:
        The ``"generated_text"`` entry of the first pipeline result.

    Raises:
        gr.Error: If ``image`` is missing (falsy), so the UI shows a
            readable message instead of a traceback.
    """
    if not image:
        raise gr.Error("No image provided.")

    # Normalise the colour mode before handing the image to the model.
    rgb_image = image.convert("RGB")

    outputs = pipe(rgb_image, text=prompt, max_new_tokens=max_new_tokens)
    return outputs[0]["generated_text"]


# Web UI / API: one image in, generated text out.
api = gr.Interface(
    fn=get_image_caption,
    inputs=gr.Image(type="pil", label="Input Image"),
    outputs="text",
)

api.launch(show_api=True)