Ryukijano committed
Commit 93dc5ee · verified · 1 Parent(s): bb5cb85

create app.py

Files changed (1)
  1. app.py +75 -0
app.py ADDED
@@ -0,0 +1,75 @@
# app.py for a Hugging Face Space: connecting Meta Llama 3.2 Vision,
# Segment Anything, and Stable Diffusion
import gradio as gr
import spaces  # Hugging Face ZeroGPU helper: provides the @spaces.GPU decorator
import torch
from transformers import AutoProcessor, MllamaForConditionalGeneration, pipeline
from diffusers import StableDiffusionImg2ImgPipeline

# Set up Meta Llama 3.2 Vision (gated repo: requires accepting Meta's license)
llama_vision_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
llama_model = MllamaForConditionalGeneration.from_pretrained(
    llama_vision_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # place the model on the GPU
)
llama_processor = AutoProcessor.from_pretrained(llama_vision_model_id)

# Set up Segment Anything via the transformers mask-generation pipeline.
# SAM 2 ships in the separate `sam2` package rather than transformers,
# so SAM v1 stands in for it here.
segment_pipe = pipeline(
    "mask-generation",
    model="facebook/sam-vit-base",
    device=0,  # run on GPU
)

# Set up the Stable Diffusion image-to-image pipeline (a text-to-image
# pipeline does not accept an input image, so Img2Img is the right class
# for modifying an existing picture)
stable_diffusion_model_id = "runwayml/stable-diffusion-v1-5"
diffusion_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    stable_diffusion_model_id, torch_dtype=torch.float16
)
diffusion_pipe = diffusion_pipe.to("cuda")  # run on GPU

# Use the GPU decorator for the function that needs GPU access
@spaces.GPU(duration=120)  # Allocates GPU for a maximum of 120 seconds
def process_image(image):
    # Step 1: Use Llama 3.2 Vision for initial image understanding (captioning)
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image in one sentence."},
        ]}
    ]
    prompt_text = llama_processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = llama_processor(image, prompt_text, return_tensors="pt").to(llama_model.device)
    output_ids = llama_model.generate(**inputs, max_new_tokens=60)
    caption = llama_processor.decode(
        output_ids[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
    )

    # Step 2: Segment important parts of the image. The masks are not used
    # downstream yet; they are the natural input to a masked-editing
    # (inpainting) step; see the sketch after this listing.
    segmented_result = segment_pipe(image, points_per_batch=64)
    masks = segmented_result["masks"]  # list of boolean numpy arrays

    # Step 3: Modify the image with the diffusion model, guided by the caption
    output_image = diffusion_pipe(prompt=caption, image=image, strength=0.75).images[0]

    return output_image

# Create Gradio interface
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="image",
    allow_flagging="never",  # Disallow flagging to keep interactions light
    title="Image Processor: Vision, Segmentation, and Modification",
    description="Upload an image to generate a caption, segment it, and modify it with Stable Diffusion.",
)

# Launch the app
interface.launch()
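
Note: Step 2's masks are computed but never consumed. A natural extension is to feed one mask into a Stable Diffusion inpainting pipeline so that only the selected region is repainted. Below is a minimal sketch, not part of the committed app.py: it assumes the runwayml/stable-diffusion-inpainting checkpoint and a boolean numpy mask of the kind the mask-generation pipeline returns, and edit_masked_region is a hypothetical helper name.

import numpy as np
import torch
from PIL import Image
from diffusers import StableDiffusionInpaintPipeline

# Hypothetical extension, not part of app.py above: repaint only a masked region.
inpaint_pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")

def edit_masked_region(image, mask, prompt):
    # The mask-generation pipeline yields boolean numpy arrays; inpainting
    # expects a PIL image whose white pixels mark the region to repaint.
    mask_image = Image.fromarray(mask.astype(np.uint8) * 255)
    return inpaint_pipe(prompt=prompt, image=image, mask_image=mask_image).images[0]

# Usage inside process_image, e.g.:
#     output_image = edit_masked_region(image, masks[0], caption)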