# Hugging Face Spaces status shown when this file was captured: Runtime error
# app.py for a Hugging Face Space connecting Meta Llama 3.2 Vision, Segment Anything 2, and a diffusion model
import os

import gradio as gr
import spaces  # GPU-specific decorators; keep this ahead of the torch-based imports below
import torch
from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline
from transformers import pipeline
# Hugging Face access token for gated/private checkpoints. os.getenv returns
# None when HF_TOKEN is not set, in which case the loaders receive token=None.
hf_token = os.getenv("HF_TOKEN")

# Captioning model (Meta Llama 3.2 Vision).
# NOTE(review): confirm this repo id exists on the Hub and that the checkpoint
# is loadable through the "image-to-text" pipeline.
llama_vision_model_id = "meta-llama/Llama-3.2-1B-Vision"
vision_pipe = pipeline(
    "image-to-text",  # "image-captioning" is not a registered pipeline task
    model=llama_vision_model_id,
    torch_dtype=torch.bfloat16,
    device=0,  # Force usage of the first GPU
    token=hf_token,  # `token` replaces the deprecated `use_auth_token`
)

# Segmentation model (Meta Segment Anything 2).
# NOTE(review): confirm this repo id exists on the Hub; SAM-family checkpoints
# are normally served by the "mask-generation" task rather than
# "image-segmentation" - verify before relying on the output format.
segment_model_id = "meta/segment-anything-2"
segment_pipe = pipeline(
    "image-segmentation",
    model=segment_model_id,
    device=0,  # Force usage of the first GPU
    token=hf_token,
)

# Stable Diffusion v1.5, loaded as an image-to-image pipeline: process_image
# passes the uploaded picture as `image=`, which the text-to-image
# StableDiffusionPipeline has no input for.
stable_diffusion_model_id = "runwayml/stable-diffusion-v1-5"
diffusion_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    stable_diffusion_model_id,
    torch_dtype=torch.float16,
    token=hf_token,
)
diffusion_pipe = diffusion_pipe.to("cuda")  # Force usage of GPU
# Use the GPU decorator for the function that needs GPU access.
# Allocates a GPU for a maximum of 120 seconds per call.
@spaces.GPU(duration=120)
def process_image(image):
    """Caption, segment, and modify an uploaded image.

    Args:
        image: The image supplied by the Gradio input component, or ``None``
            when no image is present.

    Returns:
        The first image produced by the diffusion pipeline, or ``None`` when
        no input image was supplied.
    """
    # Gradio passes None when the input is empty (for example after the
    # upload is cleared); there is nothing to run the models on.
    if image is None:
        return None

    # Step 1: Use the vision model for initial image understanding (captioning).
    # The input is passed positionally: the pipeline's __call__ does not take
    # an `image=` keyword.
    caption_result = vision_pipe(image)
    caption = caption_result[0]["generated_text"]

    # Step 2: Segment important parts of the image. The image-segmentation
    # pipeline returns a list of dicts (score / label / mask), so there is no
    # "segments" key to index.
    # TODO: the segments are not consumed by the diffusion step yet.
    _segments = segment_pipe(image)

    # Step 3: Modify the image with the diffusion model, prompted by the caption.
    output_image = diffusion_pipe(prompt=f"Modify the {caption}", image=image).images[0]
    return output_image
# Wire process_image into a single-input, single-output Gradio UI.
interface = gr.Interface(
    title="Image Processor: Vision, Segmentation, and Modification",
    description="Upload an image to generate a caption, segment important parts, and modify the image using Stable Diffusion.",
    fn=process_image,
    inputs=gr.Image(type="pil"),  # the callback receives the upload as a PIL image
    outputs="image",
    allow_flagging="never",  # no flagging, to keep interactions light
    live=True,  # re-run the callback when the input changes
)

# Start serving the app.
interface.launch()