Ryukijano committed on
Commit 89e7803 · verified · 1 Parent(s): bb85f4f

Update app.py

Files changed (1)
  1. app.py +11 -8
app.py CHANGED
@@ -1,23 +1,24 @@
 # app.py for Hugging Face Space: Connecting Meta Llama 3.2 Vision, Segment Anything 2, and Diffusion Model
 import gradio as gr
 import spaces # Import the spaces module to use GPU-specific decorators
-from transformers import pipeline
+from transformers import MllamaForConditionalGeneration, AutoProcessor
 from diffusers import StableDiffusionPipeline
 import torch
 import os
+from PIL import Image

 # Set up Hugging Face token for private model access
 hf_token = os.getenv("HF_TOKEN") # Fetch token from repository secrets

 # Set up Meta Llama 3.2 Vision model (using private model with token)
 llama_vision_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-vision_pipe = pipeline(
-    "image-captioning", # Supports image captioning and image Q&A
-    model=llama_vision_model_id,
+vision_model = MllamaForConditionalGeneration.from_pretrained(
+    llama_vision_model_id,
     torch_dtype=torch.bfloat16,
-    device=0, # Force usage of GPU
-    use_auth_token=hf_token, # Use Hugging Face token for authentication
+    device_map="auto",
+    use_auth_token=hf_token
 )
+processor = AutoProcessor.from_pretrained(llama_vision_model_id, use_auth_token=hf_token)

 # Set up Meta Segment Anything 2 model (using private model with token)
 segment_model_id = "meta/segment-anything-2"
@@ -39,8 +40,10 @@ diffusion_pipe = diffusion_pipe.to("cuda") # Force usage of GPU
 @spaces.GPU(duration=120) # Allocates GPU for a maximum of 120 seconds
 def process_image(image):
     # Step 1: Use Vision model for initial image understanding (captioning)
-    caption_result = vision_pipe(image=image)
-    caption = caption_result[0]['generated_text']
+    prompt = "<|image|><|begin_of_text|>Describe the image."
+    inputs = processor(image, prompt, return_tensors="pt").to(vision_model.device)
+    output = vision_model.generate(**inputs, max_new_tokens=50)
+    caption = processor.decode(output[0], skip_special_tokens=True)

     # Step 2: Segment important parts of the image
     segmented_result = segment_pipe(image=image)
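
Note on the new captioning path: the snippet below is a minimal, standalone sketch of the Step 1 flow introduced in this commit, for trying the Llama 3.2 Vision caption call outside the Space. It assumes a transformers release that ships MllamaForConditionalGeneration (4.45 or newer), an HF_TOKEN with access to the gated model, and a placeholder image path "example.jpg"; it is illustrative rather than the exact app code.

# Minimal sketch of the captioning step (assumptions: transformers >= 4.45,
# a bfloat16-capable GPU, HF_TOKEN set, and "example.jpg" as a placeholder image).
import os
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
hf_token = os.getenv("HF_TOKEN")

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # let accelerate place the 11B weights on the available GPU(s)
    token=hf_token,     # current name of the auth argument; the diff keeps the older use_auth_token
)
processor = AutoProcessor.from_pretrained(model_id, token=hf_token)

image = Image.open("example.jpg")  # placeholder path for illustration
prompt = "<|image|><|begin_of_text|>Describe the image."
inputs = processor(image, prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(output[0], skip_special_tokens=True))

Replacing the removed pipeline("image-captioning", ...) call (a task name transformers does not register, so the old code would fail at load time) with an explicit model-plus-processor pair is what enables device_map="auto" placement and the prompt-based caption generation shown in process_image.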