# app.py for a Hugging Face Space: connecting Meta Llama 3.2 Vision (captioning), MaskFormer (segmentation), and Stable Diffusion (image modification)
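# Assumed dependencies (a rough sketch of this Space's requirements.txt):
#   transformers, diffusers, accelerate, torch, gradio, spaces, Pillow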
import gradio as gr
import spaces  # Import the spaces module to use GPU-specific decorators
from transformers import MllamaForConditionalGeneration, AutoProcessor, pipeline
from diffusers import StableDiffusionImg2ImgPipeline
import torch
import os
from PIL import Image

# Set up Hugging Face token for private model access
hf_token = os.getenv("HF_TOKEN")  # Fetch token from repository secrets
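# The Llama 3.2 checkpoints are gated on the Hub: request access to the model first,
# then add HF_TOKEN as a secret in the Space settings so os.getenv can pick it up.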

# Set up Meta Llama 3.2 Vision model (using private model with token)
llama_vision_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
vision_model = MllamaForConditionalGeneration.from_pretrained(
    llama_vision_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token  # 'token' replaces the deprecated 'use_auth_token' argument
)
processor = AutoProcessor.from_pretrained(llama_vision_model_id, token=hf_token)

# Set up an image-segmentation pipeline (MaskFormer Swin-Large; assumed COCO panoptic checkpoint,
# swap in "facebook/maskformer-swin-large-ade" for the ADE20K variant)
segment_model_id = "facebook/maskformer-swin-large-coco"
segment_pipe = pipeline(
    "image-segmentation",
    model=segment_model_id,
    device=0,  # run on the GPU
    token=hf_token
)

# Set up Stable Diffusion v1.5 as an img2img pipeline so the uploaded image can actually be modified
stable_diffusion_model_id = "runwayml/stable-diffusion-v1-5"
diffusion_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    stable_diffusion_model_id, torch_dtype=torch.float16, token=hf_token
)
diffusion_pipe = diffusion_pipe.to("cuda")  # Force usage of GPU

# Use the GPU decorator for the function that needs GPU access
@spaces.GPU(duration=120)  # Allocates GPU for a maximum of 120 seconds
def process_image(image):
    # Step 1: Use Vision model for initial image understanding (captioning)
    prompt = "<|image|><|begin_of_text|>Describe the image."
    inputs = processor(image, prompt, return_tensors="pt").to(vision_model.device)
    output = vision_model.generate(**inputs, max_new_tokens=50)
    caption = processor.decode(output[0], skip_special_tokens=True)

    # Step 2: Segment the image with MaskFormer; the pipeline returns a list of dicts,
    # each with a "label", a "score", and a PIL "mask" for one detected segment
    segments = segment_pipe(image)

    # Step 3: Modify the image with the img2img diffusion pipeline, guided by the caption
    # (in this simple demo the segmentation result is computed but not fed into the diffusion step)
    output_image = diffusion_pipe(prompt=f"Modify the {caption}", image=image).images[0]
    
    return output_image

# Create Gradio interface
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="image",
    live=True,  # Run automatically as soon as a new image is uploaded
    allow_flagging="never",  # Disable flagging to keep the interface simple
    title="Image Processor: Vision, Segmentation, and Modification",
    description="Upload an image to generate a caption, segment important parts, and modify the image using Stable Diffusion."
)

# Launch the app
interface.launch()
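# Note: for long-running GPU jobs it is usually worth enabling Gradio's request queue,
# e.g. interface.queue().launch(), so concurrent requests wait in line instead of timing out.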