Spaces:
Runtime error
Runtime error
File size: 2,969 Bytes
c0583a3 93dc5ee c0583a3 93dc5ee a2c060f 89e7803 93dc5ee a2c060f b75198d a2c060f c0583a3 89e7803 93dc5ee 89e7803 1a6a2a3 93dc5ee c0583a3 93dc5ee c0583a3 5ecdeb3 93dc5ee a2c060f 93dc5ee 1a6a2a3 93dc5ee a2c060f c0583a3 93dc5ee c0583a3 5ecdeb3 25ac4af 93dc5ee b75198d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
# app.py for Hugging Face Space: chains an image-captioning model (ViT-GPT2, standing in for Meta Llama 3.2 Vision), DETR segmentation, and Stable Diffusion
import os

import gradio as gr
# Import the spaces module to use GPU-specific decorators.
# NOTE(review): kept ahead of the torch-based imports, as in the original order;
# ZeroGPU is understood to require `spaces` before CUDA is initialised - confirm.
import spaces
import torch
from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline
from PIL import Image
from transformers import (
    AutoFeatureExtractor,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    pipeline,
)
# Hugging Face access token, read from the Space's repository secrets.
# os.getenv returns None when HF_TOKEN is unset; that None is passed to the
# loaders below unchanged.
hf_token = os.getenv("HF_TOKEN")

# Captioning model.
# NOTE: despite the variable name, this checkpoint is the public ViT-GPT2
# image captioner, not Meta Llama 3.2 Vision. The name is kept so that any
# other code referring to it keeps working.
llama_vision_model_id = "nlpconnect/vit-gpt2-image-captioning"
vision_model = VisionEncoderDecoderModel.from_pretrained(
    llama_vision_model_id,
    # Weights are loaded as bfloat16, so image tensors must be cast to
    # vision_model.dtype before they are fed to the model.
    torch_dtype=torch.bfloat16,
    # NOTE(review): device_map="auto" relies on accelerate and on the model
    # class supporting automatic placement - confirm it loads on the target
    # hardware, or switch to an explicit .to("cuda") like the diffusion pipe.
    device_map="auto",
    token=hf_token,  # 'token' replaces the deprecated 'use_auth_token'
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
    llama_vision_model_id,
    token=hf_token,
)

# Segmentation model.
# The "-panoptic" DETR checkpoint is used because the plain
# "facebook/detr-resnet-50" checkpoint is published for object detection and
# ships no trained mask head for the "image-segmentation" task.
segment_model_id = "facebook/detr-resnet-50-panoptic"
segment_pipe = pipeline(
    "image-segmentation",
    model=segment_model_id,
    device=0,  # CUDA device index 0 (force usage of GPU)
    token=hf_token,
)

# Stable Diffusion v1.5, loaded as an image-to-image pipeline.
# The text-to-image StableDiffusionPipeline has no `image` input, so the
# uploaded picture passed as `image=` would never influence the result.
# NOTE(review): "runwayml/stable-diffusion-v1-5" is reported as no longer
# hosted on the Hub; this id is the community mirror of the same weights -
# confirm before deploying.
stable_diffusion_model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
diffusion_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    stable_diffusion_model_id,
    torch_dtype=torch.float16,
    token=hf_token,
)
diffusion_pipe = diffusion_pipe.to("cuda")  # Force usage of GPU
# Lazily-created tokenizer for the captioning checkpoint; filled in by
# _get_caption_tokenizer() on first use.
_caption_tokenizer = None


def _get_caption_tokenizer():
    """Return the tokenizer for the captioning checkpoint, loading it once."""
    global _caption_tokenizer
    if _caption_tokenizer is None:
        _caption_tokenizer = AutoTokenizer.from_pretrained(
            llama_vision_model_id,
            token=hf_token,
        )
    return _caption_tokenizer


# Use the GPU decorator for the function that needs GPU access
@spaces.GPU(duration=120)  # Allocates GPU for a maximum of 120 seconds
def process_image(image):
    """Caption an image, segment it, and generate a modified image.

    Args:
        image: The input picture (the Gradio interface supplies a PIL image),
            or None when no picture is present.

    Returns:
        The first image produced by ``diffusion_pipe`` for the prompt
        ``"Modify the <caption>"``, or None when ``image`` is None.
    """
    # Nothing to process (e.g. the input component was cleared).
    if image is None:
        return None

    # Step 1: Use Vision model for initial image understanding (captioning).
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    # Match both the device and the dtype of the model weights; the extractor
    # returns float32 tensors while the model may hold lower-precision weights.
    pixel_values = pixel_values.to(device=vision_model.device, dtype=vision_model.dtype)
    output_ids = vision_model.generate(pixel_values, max_length=50)
    # The model config carries no tokenizer, so decode with the checkpoint's
    # own tokenizer instead of vision_model.config.decoder.tokenizer.
    caption = _get_caption_tokenizer().decode(output_ids[0], skip_special_tokens=True).strip()

    # Step 2: Segment important parts of the image using DETR.
    # TODO: the segmentation output is not consumed by the diffusion step yet.
    segments = segment_pipe(image=image)

    # Step 3: Modify the image using the Diffusion model, guided by the caption.
    output_image = diffusion_pipe(prompt=f"Modify the {caption}", image=image).images[0]
    return output_image
# Create Gradio interface: one PIL image in, one image out, handled by
# process_image.
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="image",
    live=True,  # Allow for dynamic updates if necessary
    allow_flagging="never",  # Disallow flagging to keep interactions light
    title="Image Processor: Vision, Segmentation, and Modification",
    description="Upload an image to generate a caption, segment important parts, and modify the image using Stable Diffusion.",
)

# Launch the app
interface.launch()