# app.py for Hugging Face Space: connecting an image-captioning model (a ViT-GPT2 stand-in for Meta Llama 3.2 Vision), efficient segmentation, and a diffusion model
import gradio as gr
import spaces  # Import the spaces module to use GPU-specific decorators
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, AutoTokenizer, pipeline
from diffusers import StableDiffusionImg2ImgPipeline
import torch
import os
from PIL import Image
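# Assumed dependencies (the Space's requirements.txt is not shown here): gradio,
# spaces, transformers, diffusers, torch, accelerate (for device_map="auto"), Pillow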

# Set up Hugging Face token for private model access
hf_token = os.getenv("HF_TOKEN")  # Fetch token from repository secrets
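# Note: the secret must be defined under the Space's settings ("Variables and
# secrets"); os.getenv returns None when it is missing, which from_pretrained
# accepts for public models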

# Set up the captioning model: a ViT-GPT2 vision encoder-decoder
# (a public stand-in for Meta Llama 3.2 Vision)
llama_vision_model_id = "nlpconnect/vit-gpt2-image-captioning"
vision_model = VisionEncoderDecoderModel.from_pretrained(
    llama_vision_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token  # 'token' supersedes the deprecated 'use_auth_token'
)
feature_extractor = AutoFeatureExtractor.from_pretrained(llama_vision_model_id, token=hf_token)
tokenizer = AutoTokenizer.from_pretrained(llama_vision_model_id, token=hf_token)  # Decodes the generated caption ids

# Set up the segmentation model; the panoptic DETR checkpoint is used here,
# since the plain detection checkpoint does not support the image-segmentation task
segment_model_id = "facebook/detr-resnet-50-panoptic"
segment_pipe = pipeline(
    "image-segmentation",
    model=segment_model_id,
    device=0,  # Run on the GPU
    token=hf_token
)

# Set up Stable Diffusion v1.5 as an img2img pipeline, since process_image
# passes an init image rather than generating from scratch
stable_diffusion_model_id = "runwayml/stable-diffusion-v1-5"
diffusion_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    stable_diffusion_model_id, torch_dtype=torch.float16, token=hf_token
)
diffusion_pipe = diffusion_pipe.to("cuda")  # Run on the GPU

# Use the GPU decorator for the function that needs GPU access
@spaces.GPU(duration=120)  # Allocates the GPU for at most 120 seconds per call
def process_image(image):
    # Step 1: Caption the image with the vision encoder-decoder model
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(vision_model.device, vision_model.dtype)  # Match the model's device and dtype
    output_ids = vision_model.generate(pixel_values, max_length=50)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Step 2: Segment important parts of the image using DETR; the result is a
    # list of {"score", "label", "mask"} dicts. The masks are not fed into the
    # diffusion step in this version of the app.
    segments = segment_pipe(image)

    # Step 3: Modify the image with the img2img pipeline, prompting with the caption
    output_image = diffusion_pipe(prompt=f"Modify the {caption}", image=image).images[0]

    return output_image
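
# Illustrative sketch only (not wired into the app): to restrict the edit to a
# segmented region, the highest-scoring DETR mask could drive an inpainting
# pipeline instead of img2img, along these lines:
#
#   from diffusers import StableDiffusionInpaintPipeline
#   inpaint_pipe = StableDiffusionInpaintPipeline.from_pretrained(
#       "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
#   ).to("cuda")
#   mask = max(segments, key=lambda s: s["score"])["mask"]  # PIL "L" mask
#   edited = inpaint_pipe(prompt=caption, image=image, mask_image=mask).images[0]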

# Create the Gradio interface
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="image",
    live=True,  # Re-run automatically whenever the input changes; consider live=False for such an expensive pipeline
    allow_flagging="never",  # Disable flagging to keep interactions light
    title="Image Processor: Vision, Segmentation, and Modification",
    description="Upload an image to generate a caption, segment important parts, and modify the image using Stable Diffusion."
)

# Launch the app
interface.launch()
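
# On a Hugging Face Space this file is executed automatically; to run locally
# (assuming the dependencies above and a CUDA GPU), use: python app.py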