Spaces:
Runtime error
Runtime error
File size: 2,908 Bytes
93dc5ee 89e7803 93dc5ee a2c060f 89e7803 93dc5ee a2c060f b75198d a2c060f bb85f4f 89e7803 93dc5ee 89e7803 93dc5ee 89e7803 93dc5ee a2c060f 93dc5ee a2c060f 93dc5ee a2c060f 93dc5ee a2c060f 93dc5ee a2c060f 89e7803 93dc5ee b75198d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
# app.py for Hugging Face Space: Connecting Meta Llama 3.2 Vision, Segment Anything 2, and Diffusion Model
import os

import gradio as gr
import spaces  # Import the spaces module to use GPU-specific decorators
import torch
from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration, pipeline
# Hugging Face access token, read from the Space's repository secrets.
# os.getenv returns None when HF_TOKEN is unset; that None is passed through
# unchanged to every loader below.
hf_token = os.getenv("HF_TOKEN")

# Meta Llama 3.2 Vision (a gated repository, hence the token): used to caption
# the uploaded image.
llama_vision_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
vision_model = MllamaForConditionalGeneration.from_pretrained(
    llama_vision_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token,  # `token` replaces the deprecated `use_auth_token`
)
processor = AutoProcessor.from_pretrained(llama_vision_model_id, token=hf_token)

# Segmentation model.
# NOTE(review): "meta/segment-anything-2" does not look like a published Hub
# repository id, and SAM-style checkpoints are normally served through the
# "mask-generation" task rather than "image-segmentation" -- confirm both the
# id and the task before deploying.
segment_model_id = "meta/segment-anything-2"
segment_pipe = pipeline(
    "image-segmentation",
    model=segment_model_id,
    device=0,  # Force usage of GPU (CUDA device 0)
    token=hf_token,
)

# Stable Diffusion v1.5, loaded as an image-to-image pipeline so that the
# `image` argument supplied by process_image actually conditions the result.
# The text-to-image StableDiffusionPipeline accepts and discards that keyword.
stable_diffusion_model_id = "runwayml/stable-diffusion-v1-5"
diffusion_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    stable_diffusion_model_id,
    torch_dtype=torch.float16,
    token=hf_token,
)
diffusion_pipe = diffusion_pipe.to("cuda")  # Force usage of GPU
# Use the GPU decorator for the function that needs GPU access
@spaces.GPU(duration=120)  # Allocates GPU for a maximum of 120 seconds
def process_image(image):
    """Caption an image, run segmentation on it, and generate a modified image.

    Args:
        image: The uploaded picture. The Gradio input in this file is declared
            with ``type="pil"``, so this is a PIL image, or ``None`` when the
            input component has been cleared.

    Returns:
        The first image produced by ``diffusion_pipe`` for the prompt
        ``"Modify the <caption>"``, or ``None`` when no image was supplied.
    """
    # The interface runs in live mode, so clearing the input re-invokes this
    # function with None; there is nothing to process in that case.
    if image is None:
        return None

    # Step 1: Use Vision model for initial image understanding (captioning)
    prompt = "<|image|><|begin_of_text|>Describe the image."
    inputs = processor(image, prompt, return_tensors="pt").to(vision_model.device)
    output = vision_model.generate(**inputs, max_new_tokens=50)
    # generate() returns the prompt tokens followed by the continuation.
    # Decode only the continuation so the caption does not start with the
    # instruction text "Describe the image.".
    prompt_length = inputs["input_ids"].shape[-1]
    caption = processor.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Step 2: Segment important parts of the image.
    # The image is passed positionally, and the result is kept as returned:
    # the image-segmentation pipeline yields a list of per-segment dicts, so
    # there is no "segments" key to look up.
    # TODO: the segments are not consumed by step 3 yet.
    segments = segment_pipe(image)

    # Step 3: Modify the image using the Diffusion model, prompted with the
    # caption from step 1.
    result = diffusion_pipe(prompt=f"Modify the {caption}", image=image)
    return result.images[0]
# Create Gradio interface: one PIL image in, one image out.
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="image",
    live=True,  # Re-run automatically whenever the input changes
    allow_flagging="never",  # Disallow flagging to keep interactions light
    title="Image Processor: Vision, Segmentation, and Modification",
    description="Upload an image to generate a caption, segment important parts, and modify the image using Stable Diffusion.",
)

# Launch the app
interface.launch()