Aria-UI committed on
Commit
a4c53e8
·
verified ·
1 Parent(s): 07eef99

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/example_web.png filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,12 +1,18 @@
1
  import gradio as gr
2
- import cv2
3
  import numpy as np
4
- from PIL import Image
5
  import base64
6
  from io import BytesIO
7
  import re
8
  import os
9
 
 
 
 
 
 
 
 
10
  # Code from user
11
  openai_api_key = os.environ["aria_ui_api_key"]
12
  openai_api_base = os.environ["aria_ui_api_base"]
@@ -21,16 +27,15 @@ client = OpenAI(
21
  models = client.models.list()
22
  model = models.data[0].id
23
 
24
- def encode_numpy_image_to_base64(image: np.ndarray) -> str:
25
- success, buffer = cv2.imencode('.jpg', image)
26
- if not success:
27
- raise ValueError("Failed to encode image to jpg format")
28
- image_bytes = buffer.tobytes()
29
- base64_string = base64.b64encode(image_bytes).decode('utf-8')
30
- return base64_string
31
 
32
- def request_aria_ui(image: np.ndarray, prompt: str) -> str:
33
- image_base64 = encode_numpy_image_to_base64(image)
34
  chat_completion_from_url = client.chat.completions.create(
35
  messages=[{
36
  "role": "user",
@@ -50,7 +55,7 @@ def request_aria_ui(image: np.ndarray, prompt: str) -> str:
50
  model=model,
51
  max_tokens=512,
52
  stop=["<|im_end|>"],
53
- extra_body={"split_image": True, "image_max_size": 980}
54
  )
55
 
56
  result = chat_completion_from_url.choices[0].message.content
@@ -63,7 +68,7 @@ def _extract_coords_from_response(response: str) -> tuple[int, int]:
63
  raise ValueError(f"Expected exactly 2 coordinates, found {len(numbers)} numbers in response: {response}")
64
  return int(numbers[0]), int(numbers[1])
65
 
66
- def process_image(image: np.ndarray, prompt: str) -> np.ndarray:
67
  try:
68
  # Request processing from API
69
  response = request_aria_ui(image, prompt)
@@ -72,37 +77,102 @@ def process_image(image: np.ndarray, prompt: str) -> np.ndarray:
72
  norm_coords = _extract_coords_from_response(response)
73
 
74
  # Convert normalized coordinates to absolute coordinates
75
- height, width, _ = image.shape
 
76
  abs_coords = (
77
  int(norm_coords[0] * width / 1000), # Scale x-coordinate
78
  int(norm_coords[1] * height / 1000) # Scale y-coordinate
79
  )
80
 
81
- # Draw circle on image
82
- output_image = image.copy()
83
- cv2.circle(output_image, abs_coords, radius=10, color=(0, 255, 0), thickness=-1)
 
 
 
 
 
 
 
 
 
 
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  return output_image
 
86
  except Exception as e:
87
  raise ValueError(f"An error occurred: {e}")
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  # Gradio app
90
  def gradio_interface(input_image, prompt):
91
- input_image = np.array(input_image) # Convert PIL image to numpy
92
- output_image = process_image(input_image, prompt)
93
- return Image.fromarray(output_image)
 
 
94
 
95
  with gr.Blocks() as demo:
96
- gr.Markdown("# GUI Image Processor")
97
- gr.Markdown("Upload a GUI image and enter a prompt. The app will process the image and mark a location based on the response.")
 
 
 
98
 
99
  with gr.Row():
100
- with gr.Column():
101
  image_input = gr.Image(type="pil", label="Upload GUI Image")
102
  prompt_input = gr.Textbox(label="Enter Prompt")
103
  submit_button = gr.Button("Process")
104
- with gr.Column():
105
- output_image = gr.Image(label="Processed Image")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  submit_button.click(
108
  fn=gradio_interface,
 
1
  import gradio as gr
 
2
  import numpy as np
3
+ from PIL import Image, ImageDraw
4
  import base64
5
  from io import BytesIO
6
  import re
7
  import os
8
 
9
+ examples = [
10
+ {"image": "assets/example_desktop.png", "prompt": "switch off the wired connection"},
11
+ {"image": "assets/example_web.png", "prompt": "view all branches"},
12
+ {"image": "assets/example_mobile.jpg", "prompt": "share the screenshot"},
13
+ ]
14
+
15
+
16
  # Code from user
17
  openai_api_key = os.environ["aria_ui_api_key"]
18
  openai_api_base = os.environ["aria_ui_api_base"]
 
27
  models = client.models.list()
28
  model = models.data[0].id
29
 
30
def encode_pil_image_to_base64(image: Image.Image) -> str:
    """Serialize a PIL image to a base64-encoded JPEG string.

    The image is converted to RGB first so that modes JPEG cannot
    store (e.g. RGBA, P) still encode cleanly.
    """
    rgb = image.convert("RGB")
    buffer = BytesIO()
    rgb.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
 
36
 
37
+ def request_aria_ui(image: Image.Image, prompt: str) -> str:
38
+ image_base64 = encode_pil_image_to_base64(image)
39
  chat_completion_from_url = client.chat.completions.create(
40
  messages=[{
41
  "role": "user",
 
55
  model=model,
56
  max_tokens=512,
57
  stop=["<|im_end|>"],
58
+ extra_body={"split_image": True, "image_max_size": 980, "temperature": 0, "top_k": 1}
59
  )
60
 
61
  result = chat_completion_from_url.choices[0].message.content
 
68
  raise ValueError(f"Expected exactly 2 coordinates, found {len(numbers)} numbers in response: {response}")
69
  return int(numbers[0]), int(numbers[1])
70
 
71
+ def image_grounding(image: Image.Image, prompt: str) -> Image.Image:
72
  try:
73
  # Request processing from API
74
  response = request_aria_ui(image, prompt)
 
77
  norm_coords = _extract_coords_from_response(response)
78
 
79
  # Convert normalized coordinates to absolute coordinates
80
+ width, height = image.size
81
+ long_side = max(width, height)
82
  abs_coords = (
83
  int(norm_coords[0] * width / 1000), # Scale x-coordinate
84
  int(norm_coords[1] * height / 1000) # Scale y-coordinate
85
  )
86
 
87
+ # Load and prepare the click indicator image
88
+ click_image = Image.open("assets/click.png")
89
+ # Calculate adaptive size for click indicator
90
+ # Make it proportional to the image width (e.g., 3% of image width)
91
+ target_width = int(long_side * 0.03) # 3% of image width
92
+ aspect_ratio = click_image.width / click_image.height
93
+ target_height = int(target_width / aspect_ratio)
94
+ click_image = click_image.resize((target_width, target_height))
95
+
96
+ # Calculate position to center the click image on the coordinates
97
+ # Add a small offset downward (20% of click image height)
98
+ # Calculate position to align the 30% point of the click image with the coordinates
99
+ click_x = abs_coords[0] - int(click_image.width * 0.3) # Align 30% from left
100
+ click_y = abs_coords[1] - int(click_image.height * 0.3) # Align 30% from top
101
 
102
+ # Create output image and paste the click indicator
103
+ output_image = image.copy()
104
+ # Draw bounding box
105
+ draw = ImageDraw.Draw(output_image)
106
+ bbox = [
107
+ click_x, # left
108
+ click_y, # top
109
+ click_x + click_image.width, # right
110
+ click_y + click_image.height # bottom
111
+ ]
112
+ draw.rectangle(bbox, outline='red', width=int(click_image.width * 0.1))
113
+ output_image.paste(click_image, (click_x, click_y), click_image)
114
  return output_image
115
+
116
  except Exception as e:
117
  raise ValueError(f"An error occurred: {e}")
118
 
119
def resize_image_with_max_size(image: Image.Image, max_size: int = 1920) -> Image.Image:
    """Return *image* scaled down so neither side exceeds *max_size*.

    Aspect ratio is preserved. An image already within the limit is
    returned unchanged (same object, no copy).
    """
    width, height = image.size

    # Fast path: already small enough on both axes.
    if width <= max_size and height <= max_size:
        return image

    # Pin the longer side to exactly max_size; scale the other proportionally.
    if width > height:
        new_size = (max_size, int(height * (max_size / width)))
    else:
        new_size = (int(width * (max_size / height)), max_size)

    return image.resize(new_size, Image.Resampling.LANCZOS)
134
+
135
# Gradio app
def gradio_interface(input_image, prompt):
    """Gradio callback: downscale oversized uploads, then run grounding.

    Args:
        input_image: PIL image uploaded by the user.
        prompt: instruction text to ground in the image.

    Returns:
        The input image annotated with the grounded click location.
    """
    # Cap the longest side (default 1920 px) so very large screenshots
    # don't inflate request size or model input cost.
    # Removed leftover debug print(input_image.size) calls.
    input_image = resize_image_with_max_size(input_image)
    return image_grounding(input_image, prompt)
142
 
143
  with gr.Blocks() as demo:
144
+ with gr.Row(elem_classes="container"):
145
+ gr.Image("assets/logo_long.png", show_label=False, container=False, scale=1, elem_classes="logo", height=76)
146
+
147
+ gr.Markdown("# Aria-UI: Visual Grounding for GUI Instructions")
148
+ gr.Markdown("🚀🚀 Upload a GUI image and enter a instruction. Aria-UI will try its best to ground the instruction to specific element in the image. 🎯🎯")
149
 
150
  with gr.Row():
151
+ with gr.Column(scale=2): # Make this column smaller
152
  image_input = gr.Image(type="pil", label="Upload GUI Image")
153
  prompt_input = gr.Textbox(label="Enter Prompt")
154
  submit_button = gr.Button("Process")
155
+
156
+ with gr.Column(scale=3): # Make this column larger
157
+ output_image = gr.Image(label="Grounding Result", height=600) # Set specific height for larger display
158
+
159
+ with gr.Column(scale=2):
160
+ # Move examples here and make them vertical
161
+ gr.Examples(
162
+ examples=[
163
+ [
164
+ example["image"],
165
+ example["prompt"]
166
+ ]
167
+ for example in examples
168
+ ],
169
+ inputs=[image_input, prompt_input],
170
+ outputs=[output_image],
171
+ fn=gradio_interface,
172
+ cache_examples=False,
173
+ label="Example Tasks", # Add label for better organization
174
+ examples_per_page=5 # Control number of examples shown at once
175
+ )
176
 
177
  submit_button.click(
178
  fn=gradio_interface,
assets/aria_ui_logo.png ADDED
assets/click.png ADDED
assets/example_desktop.png ADDED
assets/example_mobile.jpg ADDED
assets/example_web.png ADDED

Git LFS Details

  • SHA256: 9f3add458689b6e3c11ca1ede9032aa65f623bd2c8a6a63976bf77cd4dc865b5
  • Pointer size: 132 Bytes
  • Size of remote file: 2.58 MB
assets/logo_long.png ADDED