Ryukijano committed on
Commit 89e7803 · verified · 1 Parent(s): bb85f4f

Update app.py

Files changed (1)
  1. app.py +11 -8
app.py CHANGED
@@ -1,23 +1,24 @@
 # app.py for Hugging Face Space: Connecting Meta Llama 3.2 Vision, Segment Anything 2, and Diffusion Model
 import gradio as gr
 import spaces # Import the spaces module to use GPU-specific decorators
-from transformers import pipeline
+from transformers import MllamaForConditionalGeneration, AutoProcessor
 from diffusers import StableDiffusionPipeline
 import torch
 import os
+from PIL import Image

 # Set up Hugging Face token for private model access
 hf_token = os.getenv("HF_TOKEN") # Fetch token from repository secrets

 # Set up Meta Llama 3.2 Vision model (using private model with token)
 llama_vision_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-vision_pipe = pipeline(
-    "image-captioning", # Supports image captioning and image Q&A
-    model=llama_vision_model_id,
+vision_model = MllamaForConditionalGeneration.from_pretrained(
+    llama_vision_model_id,
     torch_dtype=torch.bfloat16,
-    device=0, # Force usage of GPU
-    use_auth_token=hf_token, # Use Hugging Face token for authentication
+    device_map="auto",
+    use_auth_token=hf_token
 )
+processor = AutoProcessor.from_pretrained(llama_vision_model_id, use_auth_token=hf_token)

 # Set up Meta Segment Anything 2 model (using private model with token)
 segment_model_id = "meta/segment-anything-2"
@@ -39,8 +40,10 @@ diffusion_pipe = diffusion_pipe.to("cuda") # Force usage of GPU
 @spaces.GPU(duration=120) # Allocates GPU for a maximum of 120 seconds
 def process_image(image):
     # Step 1: Use Vision model for initial image understanding (captioning)
-    caption_result = vision_pipe(image=image)
-    caption = caption_result[0]['generated_text']
+    prompt = "<|image|><|begin_of_text|>Describe the image."
+    inputs = processor(image, prompt, return_tensors="pt").to(vision_model.device)
+    output = vision_model.generate(**inputs, max_new_tokens=50)
+    caption = processor.decode(output[0], skip_special_tokens=True)

     # Step 2: Segment important parts of the image
     segmented_result = segment_pipe(image=image)
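
Note on the new captioning path: the snippet below is a minimal, standalone sketch of the Step 1 flow introduced in this commit, for trying the Llama 3.2 Vision caption call outside the Space. It assumes a transformers release that ships MllamaForConditionalGeneration (4.45 or newer), an HF_TOKEN with access to the gated model, and a placeholder image path "example.jpg"; it is illustrative rather than the exact app code.

# Minimal sketch of the captioning step (assumptions: transformers >= 4.45,
# a bfloat16-capable GPU, HF_TOKEN set, and "example.jpg" as a placeholder image).
import os
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
hf_token = os.getenv("HF_TOKEN")

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # let accelerate place the 11B weights on the available GPU(s)
    token=hf_token,     # current name of the auth argument; the diff keeps the older use_auth_token
)
processor = AutoProcessor.from_pretrained(model_id, token=hf_token)

image = Image.open("example.jpg")  # placeholder path for illustration
prompt = "<|image|><|begin_of_text|>Describe the image."
inputs = processor(image, prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(output[0], skip_special_tokens=True))

Replacing the removed pipeline("image-captioning", ...) call (a task name transformers does not register, so the old code would fail at load time) with an explicit model-plus-processor pair is what enables device_map="auto" placement and the prompt-based caption generation shown in process_image.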