# app.py for Hugging Face Space: Connecting Meta Llama 3.2 Vision, Segment Anything 2, and Diffusion Model
import gradio as gr
import spaces # Import the spaces module to use GPU-specific decorators
from transformers import MllamaForConditionalGeneration, AutoProcessor, pipeline
from diffusers import StableDiffusionImg2ImgPipeline  # img2img variant accepts an input image
import torch
import os
from PIL import Image
# Set up Hugging Face token for private model access
hf_token = os.getenv("HF_TOKEN") # Fetch token from repository secrets
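# Fail fast if the secret is missing (assumption: the models below are gated or
# private, so an absent token would otherwise surface later as a confusing 401).
if hf_token is None:
    raise ValueError("HF_TOKEN is not set in the Space secrets; gated models cannot be loaded.")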
# Set up Meta Llama 3.2 Vision model (using private model with token)
llama_vision_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
vision_model = MllamaForConditionalGeneration.from_pretrained(
    llama_vision_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token,  # 'token' replaces the deprecated 'use_auth_token'
)
processor = AutoProcessor.from_pretrained(llama_vision_model_id, token=hf_token)
# Set up Meta Segment Anything 2 model (using private model with token)
# Note: SAM checkpoints are mask generators, not semantic segmenters, so they are
# served through the "mask-generation" pipeline task rather than "image-segmentation".
# The model id below is kept from the original; if transformers cannot load it,
# a SAM checkpoint such as "facebook/sam-vit-huge" is a known-working substitute.
segment_model_id = "meta/segment-anything-2"
segment_pipe = pipeline(
    "mask-generation",
    model=segment_model_id,
    device=0,  # force GPU usage
    token=hf_token,  # 'token' replaces the deprecated 'use_auth_token'
)
# Set up Stable Diffusion model for image-to-image modification
# (the plain text-to-image StableDiffusionPipeline does not accept an input image)
stable_diffusion_model_id = "runwayml/stable-diffusion-v1-5"
diffusion_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    stable_diffusion_model_id,
    torch_dtype=torch.float16,
    token=hf_token,  # 'token' replaces the deprecated 'use_auth_token'
)
diffusion_pipe = diffusion_pipe.to("cuda")  # force GPU usage
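# Note: on a ZeroGPU Space, module-level .to("cuda") works because importing
# `spaces` patches CUDA initialization; the GPU is only actually attached inside
# functions decorated with @spaces.GPU (assumption: this Space runs on ZeroGPU).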
# Use the GPU decorator for the function that needs GPU access
@spaces.GPU(duration=120) # Allocates GPU for a maximum of 120 seconds
def process_image(image):
    # Step 1: Use the vision model for initial image understanding (captioning)
    prompt = "<|image|><|begin_of_text|>Describe the image."
    inputs = processor(image, prompt, return_tensors="pt").to(vision_model.device)
    output = vision_model.generate(**inputs, max_new_tokens=50)
    # Slice off the prompt tokens so only the newly generated caption is decoded
    caption = processor.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)

    # Step 2: Segment important parts of the image
    # The mask-generation pipeline returns a dict with "masks" and "scores"
    segmented_result = segment_pipe(image)
    masks = segmented_result["masks"]  # currently unused; see note below

    # Step 3: Modify the image with the diffusion model, guided by the caption.
    # img2img conditions on the whole input image; to restrict edits to the
    # segmented region only, an inpainting pipeline taking mask_image=masks[0]
    # could be swapped in instead.
    output_image = diffusion_pipe(prompt=f"Modify the {caption}", image=image).images[0]
    return output_image
# Create Gradio interface
interface = gr.Interface(
fn=process_image,
inputs=gr.Image(type="pil"),
outputs="image",
    live=True,  # re-run automatically whenever the input image changes
    allow_flagging="never",  # disable flagging to keep the UI simple
title="Image Processor: Vision, Segmentation, and Modification",
description="Upload an image to generate a caption, segment important parts, and modify the image using Stable Diffusion."
)
# Launch the app
interface.launch()