salso committed
Commit 545e508 · verified · 1 parent: 952e6da

Upload 28 files

florence_sam/.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
florence_sam/.gitignore ADDED
@@ -0,0 +1,3 @@
+ /venv
+ /.idea
+ /tmp
florence_sam/.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
florence_sam/Florence.ipynb ADDED
The diff for this file is too large to render.
 
florence_sam/README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Florence2 + SAM2
+ emoji: 🔥
+ colorFrom: purple
+ colorTo: green
+ sdk: gradio
+ sdk_version: 4.40.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
florence_sam/__init__.py ADDED
File without changes
florence_sam/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (147 Bytes).
 
florence_sam/__pycache__/detect_and_segment.cpython-310.pyc ADDED
Binary file (4.43 kB).
 
florence_sam/app.py ADDED
@@ -0,0 +1,397 @@
1
+ import os
2
+ from typing import Tuple, Optional
3
+
4
+ import cv2
5
+ import gradio as gr
6
+ import numpy as np
7
+ #import spaces
8
+ import supervision as sv
9
+ import torch
10
+ from PIL import Image
11
+ from tqdm import tqdm
12
+ from utils.video import generate_unique_name, create_directory, delete_directory
13
+
14
+ from utils.florence import load_florence_model, run_florence_inference, \
15
+ FLORENCE_DETAILED_CAPTION_TASK, \
16
+ FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
17
+ from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE, \
18
+ IMAGE_CAPTION_GROUNDING_MASKS_MODE, VIDEO_INFERENCE_MODES
19
+ from utils.sam import load_sam_image_model, run_sam_inference, load_sam_video_model
20
+
21
+ MARKDOWN = """
22
+ # Florence2 + SAM2 🔥
23
+
24
+ <div>
25
+ <a href="https://github.com/facebookresearch/segment-anything-2">
26
+ <img src="https://badges.aleen42.com/src/github.svg" alt="GitHub" style="display:inline-block;">
27
+ </a>
28
+ <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-segment-images-with-sam-2.ipynb">
29
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
30
+ </a>
31
+ <a href="https://blog.roboflow.com/what-is-segment-anything-2/">
32
+ <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
33
+ </a>
34
+ <a href="https://www.youtube.com/watch?v=Dv003fTyO-Y">
35
+ <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
36
+ </a>
37
+ </div>
38
+
39
+ This demo integrates Florence2 and SAM2 by creating a two-stage inference pipeline. In
40
+ the first stage, Florence2 performs tasks such as object detection, open-vocabulary
41
+ object detection, image captioning, or phrase grounding. In the second stage, SAM2
42
+ performs object segmentation on the image.
43
+ """
44
+
45
+ IMAGE_PROCESSING_EXAMPLES = [
46
+ [IMAGE_OPEN_VOCABULARY_DETECTION_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", 'straw, white napkin, black napkin, hair'],
47
+ [IMAGE_OPEN_VOCABULARY_DETECTION_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", 'tail'],
48
+ [IMAGE_CAPTION_GROUNDING_MASKS_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
49
+ [IMAGE_CAPTION_GROUNDING_MASKS_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
50
+ ]
51
+ VIDEO_PROCESSING_EXAMPLES = [
52
+ ["videos/clip-07-camera-1.mp4", "player in white outfit, player in black outfit, ball, rim"],
53
+ ["videos/clip-07-camera-2.mp4", "player in white outfit, player in black outfit, ball, rim"],
54
+ ["videos/clip-07-camera-3.mp4", "player in white outfit, player in black outfit, ball, rim"]
55
+ ]
56
+
57
+ VIDEO_SCALE_FACTOR = 0.5
58
+ VIDEO_TARGET_DIRECTORY = "tmp"
59
+ create_directory(directory_path=VIDEO_TARGET_DIRECTORY)
60
+
61
+ DEVICE = torch.device("cuda")
62
+ # DEVICE = torch.device("cpu")
63
+
64
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
65
+ if torch.cuda.get_device_properties(0).major >= 8:
66
+ torch.backends.cuda.matmul.allow_tf32 = True
67
+ torch.backends.cudnn.allow_tf32 = True
68
+
69
+
70
+ FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
71
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
72
+ SAM_VIDEO_MODEL = load_sam_video_model(device=DEVICE)
73
+ COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700', '#32CD32', '#8A2BE2']
74
+ COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
75
+ BOX_ANNOTATOR = sv.BoxAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
76
+ LABEL_ANNOTATOR = sv.LabelAnnotator(
77
+ color=COLOR_PALETTE,
78
+ color_lookup=sv.ColorLookup.INDEX,
79
+ text_position=sv.Position.CENTER_OF_MASS,
80
+ text_color=sv.Color.from_hex("#000000"),
81
+ border_radius=5
82
+ )
83
+ MASK_ANNOTATOR = sv.MaskAnnotator(
84
+ color=COLOR_PALETTE,
85
+ color_lookup=sv.ColorLookup.INDEX
86
+ )
87
+
88
+
89
+ def annotate_image(image, detections):
90
+ output_image = image.copy()
91
+ output_image = MASK_ANNOTATOR.annotate(output_image, detections)
92
+ output_image = BOX_ANNOTATOR.annotate(output_image, detections)
93
+ output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
94
+ return output_image
95
+
96
+
97
+ def on_mode_dropdown_change(text):
98
+ return [
99
+ gr.Textbox(visible=text == IMAGE_OPEN_VOCABULARY_DETECTION_MODE),
100
+ gr.Textbox(visible=text == IMAGE_CAPTION_GROUNDING_MASKS_MODE),
101
+ ]
102
+
103
+
104
+ #@spaces.GPU
105
+ @torch.inference_mode()
106
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
107
+ def process_image(
108
+ mode_dropdown, image_input, text_input
109
+ ) -> Tuple[Optional[Image.Image], Optional[str]]:
110
+ if not image_input:
111
+ gr.Info("Please upload an image.")
112
+ return None, None
113
+
114
+ if mode_dropdown == IMAGE_OPEN_VOCABULARY_DETECTION_MODE:
115
+ if not text_input:
116
+ gr.Info("Please enter a text prompt.")
117
+ return None, None
118
+
119
+ texts = [prompt.strip() for prompt in text_input.split(",")]
120
+ detections_list = []
121
+ for text in texts:
122
+ _, result = run_florence_inference(
123
+ model=FLORENCE_MODEL,
124
+ processor=FLORENCE_PROCESSOR,
125
+ device=DEVICE,
126
+ image=image_input,
127
+ task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
128
+ text=text
129
+ )
130
+ detections = sv.Detections.from_lmm(
131
+ lmm=sv.LMM.FLORENCE_2,
132
+ result=result,
133
+ resolution_wh=image_input.size
134
+ )
135
+ detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
136
+ detections_list.append(detections)
137
+
138
+ detections = sv.Detections.merge(detections_list)
139
+ detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
140
+ return annotate_image(image_input, detections), None
141
+
142
+ if mode_dropdown == IMAGE_CAPTION_GROUNDING_MASKS_MODE:
143
+ _, result = run_florence_inference(
144
+ model=FLORENCE_MODEL,
145
+ processor=FLORENCE_PROCESSOR,
146
+ device=DEVICE,
147
+ image=image_input,
148
+ task=FLORENCE_DETAILED_CAPTION_TASK
149
+ )
150
+ caption = result[FLORENCE_DETAILED_CAPTION_TASK]
151
+ _, result = run_florence_inference(
152
+ model=FLORENCE_MODEL,
153
+ processor=FLORENCE_PROCESSOR,
154
+ device=DEVICE,
155
+ image=image_input,
156
+ task=FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
157
+ text=caption
158
+ )
159
+ detections = sv.Detections.from_lmm(
160
+ lmm=sv.LMM.FLORENCE_2,
161
+ result=result,
162
+ resolution_wh=image_input.size
163
+ )
164
+ detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
165
+ return annotate_image(image_input, detections), caption
166
+
167
+
168
+ #@spaces.GPU(duration=300)
169
+ @torch.inference_mode()
170
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
171
+ def process_video(
172
+ video_input, text_input, progress=gr.Progress(track_tqdm=True)
173
+ ) -> Optional[str]:
174
+ if not video_input:
175
+ gr.Info("Please upload a video.")
176
+ return None
177
+
178
+ if not text_input:
179
+ gr.Info("Please enter a text prompt.")
180
+ return None
181
+
182
+ frame_generator = sv.get_video_frames_generator(video_input)
183
+ frame = next(frame_generator)
184
+ frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
185
+
186
+ texts = [prompt.strip() for prompt in text_input.split(",")]
187
+ detections_list = []
188
+ for text in texts:
189
+ _, result = run_florence_inference(
190
+ model=FLORENCE_MODEL,
191
+ processor=FLORENCE_PROCESSOR,
192
+ device=DEVICE,
193
+ image=frame,
194
+ task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
195
+ text=text
196
+ )
197
+ detections = sv.Detections.from_lmm(
198
+ lmm=sv.LMM.FLORENCE_2,
199
+ result=result,
200
+ resolution_wh=frame.size
201
+ )
202
+ detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)
203
+ detections_list.append(detections)
204
+
205
+ detections = sv.Detections.merge(detections_list)
206
+ detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)
207
+
208
+ if len(detections.mask) == 0:
209
+ gr.Info(
210
+ "No objects of class {text_input} found in the first frame of the video. "
211
+ "Trim the video to make the object appear in the first frame or try a "
212
+ "different text prompt."
213
+ )
214
+ return None
215
+
216
+ name = generate_unique_name()
217
+ frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
218
+ frames_sink = sv.ImageSink(
219
+ target_dir_path=frame_directory_path,
220
+ image_name_pattern="{:05d}.jpeg"
221
+ )
222
+
223
+ video_info = sv.VideoInfo.from_video_path(video_input)
224
+ video_info.width = int(video_info.width * VIDEO_SCALE_FACTOR)
225
+ video_info.height = int(video_info.height * VIDEO_SCALE_FACTOR)
226
+
227
+ frames_generator = sv.get_video_frames_generator(video_input)
228
+ with frames_sink:
229
+ for frame in tqdm(
230
+ frames_generator,
231
+ total=video_info.total_frames,
232
+ desc="splitting video into frames"
233
+ ):
234
+ frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
235
+ frames_sink.save_image(frame)
236
+
237
+ inference_state = SAM_VIDEO_MODEL.init_state(
238
+ video_path=frame_directory_path,
239
+ device=DEVICE
240
+ )
241
+
242
+ for mask_index, mask in enumerate(detections.mask):
243
+ _, object_ids, mask_logits = SAM_VIDEO_MODEL.add_new_mask(
244
+ inference_state=inference_state,
245
+ frame_idx=0,
246
+ obj_id=mask_index,
247
+ mask=mask
248
+ )
249
+
250
+ video_path = os.path.join(VIDEO_TARGET_DIRECTORY, f"{name}.mp4")
251
+ frames_generator = sv.get_video_frames_generator(video_input)
252
+ masks_generator = SAM_VIDEO_MODEL.propagate_in_video(inference_state)
253
+ with sv.VideoSink(video_path, video_info=video_info) as sink:
254
+ for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
255
+ frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
256
+ masks = (mask_logits > 0.0).cpu().numpy().astype(bool)
257
+ if len(masks.shape) == 4:
258
+ masks = np.squeeze(masks, axis=1)
259
+
260
+ detections = sv.Detections(
261
+ xyxy=sv.mask_to_xyxy(masks=masks),
262
+ mask=masks,
263
+ class_id=np.array(tracker_ids)
264
+ )
265
+ annotated_frame = frame.copy()
266
+ annotated_frame = MASK_ANNOTATOR.annotate(
267
+ scene=annotated_frame, detections=detections)
268
+ annotated_frame = BOX_ANNOTATOR.annotate(
269
+ scene=annotated_frame, detections=detections)
270
+ sink.write_frame(annotated_frame)
271
+
272
+ delete_directory(frame_directory_path)
273
+ return video_path
274
+
275
+
276
+ with gr.Blocks() as demo:
277
+ gr.Markdown(MARKDOWN)
278
+ with gr.Tab("Image"):
279
+ image_processing_mode_dropdown_component = gr.Dropdown(
280
+ choices=IMAGE_INFERENCE_MODES,
281
+ value=IMAGE_INFERENCE_MODES[0],
282
+ label="Mode",
283
+ info="Select a mode to use.",
284
+ interactive=True
285
+ )
286
+ with gr.Row():
287
+ with gr.Column():
288
+ image_processing_image_input_component = gr.Image(
289
+ type='pil', label='Upload image')
290
+ image_processing_text_input_component = gr.Textbox(
291
+ label='Text prompt',
292
+ placeholder='Enter comma separated text prompts')
293
+ image_processing_submit_button_component = gr.Button(
294
+ value='Submit', variant='primary')
295
+ with gr.Column():
296
+ image_processing_image_output_component = gr.Image(
297
+ type='pil', label='Image output')
298
+ image_processing_text_output_component = gr.Textbox(
299
+ label='Caption output', visible=False)
300
+
301
+ with gr.Row():
302
+ gr.Examples(
303
+ fn=process_image,
304
+ examples=IMAGE_PROCESSING_EXAMPLES,
305
+ inputs=[
306
+ image_processing_mode_dropdown_component,
307
+ image_processing_image_input_component,
308
+ image_processing_text_input_component
309
+ ],
310
+ outputs=[
311
+ image_processing_image_output_component,
312
+ image_processing_text_output_component
313
+ ],
314
+ run_on_click=True
315
+ )
316
+ with gr.Tab("Video"):
317
+ video_processing_mode_dropdown_component = gr.Dropdown(
318
+ choices=VIDEO_INFERENCE_MODES,
319
+ value=VIDEO_INFERENCE_MODES[0],
320
+ label="Mode",
321
+ info="Select a mode to use.",
322
+ interactive=True
323
+ )
324
+ with gr.Row():
325
+ with gr.Column():
326
+ video_processing_video_input_component = gr.Video(
327
+ label='Upload video')
328
+ video_processing_text_input_component = gr.Textbox(
329
+ label='Text prompt',
330
+ placeholder='Enter comma separated text prompts')
331
+ video_processing_submit_button_component = gr.Button(
332
+ value='Submit', variant='primary')
333
+ with gr.Column():
334
+ video_processing_video_output_component = gr.Video(
335
+ label='Video output')
336
+ with gr.Row():
337
+ gr.Examples(
338
+ fn=process_video,
339
+ examples=VIDEO_PROCESSING_EXAMPLES,
340
+ inputs=[
341
+ video_processing_video_input_component,
342
+ video_processing_text_input_component
343
+ ],
344
+ outputs=video_processing_video_output_component,
345
+ run_on_click=True
346
+ )
347
+
348
+ image_processing_submit_button_component.click(
349
+ fn=process_image,
350
+ inputs=[
351
+ image_processing_mode_dropdown_component,
352
+ image_processing_image_input_component,
353
+ image_processing_text_input_component
354
+ ],
355
+ outputs=[
356
+ image_processing_image_output_component,
357
+ image_processing_text_output_component
358
+ ]
359
+ )
360
+ image_processing_text_input_component.submit(
361
+ fn=process_image,
362
+ inputs=[
363
+ image_processing_mode_dropdown_component,
364
+ image_processing_image_input_component,
365
+ image_processing_text_input_component
366
+ ],
367
+ outputs=[
368
+ image_processing_image_output_component,
369
+ image_processing_text_output_component
370
+ ]
371
+ )
372
+ image_processing_mode_dropdown_component.change(
373
+ on_mode_dropdown_change,
374
+ inputs=[image_processing_mode_dropdown_component],
375
+ outputs=[
376
+ image_processing_text_input_component,
377
+ image_processing_text_output_component
378
+ ]
379
+ )
380
+ video_processing_submit_button_component.click(
381
+ fn=process_video,
382
+ inputs=[
383
+ video_processing_video_input_component,
384
+ video_processing_text_input_component
385
+ ],
386
+ outputs=video_processing_video_output_component
387
+ )
388
+ video_processing_text_input_component.submit(
389
+ fn=process_video,
390
+ inputs=[
391
+ video_processing_video_input_component,
392
+ video_processing_text_input_component
393
+ ],
394
+ outputs=video_processing_video_output_component
395
+ )
396
+
397
+ demo.launch(debug=False, show_error=True, share=True)
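
Note: the two-stage flow described in the header markdown can also be run outside the Gradio UI. The sketch below is a minimal, hedged example (hypothetical image path and prompt; it assumes the same utils package, checkpoints, and a CUDA device as app.py):

import torch
import supervision as sv
from PIL import Image

from utils.florence import (
    load_florence_model,
    run_florence_inference,
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
)
from utils.sam import load_sam_image_model, run_sam_inference

DEVICE = torch.device("cuda")
florence_model, florence_processor = load_florence_model(device=DEVICE)
sam_model = load_sam_image_model(device=DEVICE)

image = Image.open("example.jpg")  # hypothetical input image

# Stage 1: Florence-2 open-vocabulary detection for a single prompt.
_, result = run_florence_inference(
    model=florence_model,
    processor=florence_processor,
    device=DEVICE,
    image=image,
    task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
    text="dog",
)
detections = sv.Detections.from_lmm(
    lmm=sv.LMM.FLORENCE_2,
    result=result,
    resolution_wh=image.size,
)

# Stage 2: SAM2 refines the detected boxes into segmentation masks.
detections = run_sam_inference(sam_model, image, detections)
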
florence_sam/configs/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
florence_sam/configs/sam2_hiera_b+.yaml ADDED
@@ -0,0 +1,113 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 112
12
+ num_heads: 2
13
+ neck:
14
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
15
+ position_encoding:
16
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
17
+ num_pos_feats: 256
18
+ normalize: true
19
+ scale: null
20
+ temperature: 10000
21
+ d_model: 256
22
+ backbone_channel_list: [896, 448, 224, 112]
23
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
24
+ fpn_interp_model: nearest
25
+
26
+ memory_attention:
27
+ _target_: sam2.modeling.memory_attention.MemoryAttention
28
+ d_model: 256
29
+ pos_enc_at_input: true
30
+ layer:
31
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
32
+ activation: relu
33
+ dim_feedforward: 2048
34
+ dropout: 0.1
35
+ pos_enc_at_attn: false
36
+ self_attention:
37
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
38
+ rope_theta: 10000.0
39
+ feat_sizes: [32, 32]
40
+ embedding_dim: 256
41
+ num_heads: 1
42
+ downsample_rate: 1
43
+ dropout: 0.1
44
+ d_model: 256
45
+ pos_enc_at_cross_attn_keys: true
46
+ pos_enc_at_cross_attn_queries: false
47
+ cross_attention:
48
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
49
+ rope_theta: 10000.0
50
+ feat_sizes: [32, 32]
51
+ rope_k_repeat: True
52
+ embedding_dim: 256
53
+ num_heads: 1
54
+ downsample_rate: 1
55
+ dropout: 0.1
56
+ kv_in_dim: 64
57
+ num_layers: 4
58
+
59
+ memory_encoder:
60
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
61
+ out_dim: 64
62
+ position_encoding:
63
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
64
+ num_pos_feats: 64
65
+ normalize: true
66
+ scale: null
67
+ temperature: 10000
68
+ mask_downsampler:
69
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
70
+ kernel_size: 3
71
+ stride: 2
72
+ padding: 1
73
+ fuser:
74
+ _target_: sam2.modeling.memory_encoder.Fuser
75
+ layer:
76
+ _target_: sam2.modeling.memory_encoder.CXBlock
77
+ dim: 256
78
+ kernel_size: 7
79
+ padding: 3
80
+ layer_scale_init_value: 1e-6
81
+ use_dwconv: True # depth-wise convs
82
+ num_layers: 2
83
+
84
+ num_maskmem: 7
85
+ image_size: 1024
86
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
87
+ sigmoid_scale_for_mem_enc: 20.0
88
+ sigmoid_bias_for_mem_enc: -10.0
89
+ use_mask_input_as_output_without_sam: true
90
+ # Memory
91
+ directly_add_no_mem_embed: true
92
+ # use high-resolution feature map in the SAM mask decoder
93
+ use_high_res_features_in_sam: true
94
+ # output 3 masks on the first click on initial conditioning frames
95
+ multimask_output_in_sam: true
96
+ # SAM heads
97
+ iou_prediction_use_sigmoid: True
98
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
99
+ use_obj_ptrs_in_encoder: true
100
+ add_tpos_enc_to_obj_ptrs: false
101
+ only_obj_ptrs_in_the_past_for_eval: true
102
+ # object occlusion prediction
103
+ pred_obj_scores: true
104
+ pred_obj_scores_mlp: true
105
+ fixed_no_obj_ptr: true
106
+ # multimask tracking settings
107
+ multimask_output_for_tracking: true
108
+ use_multimask_token_for_obj_ptr: true
109
+ multimask_min_pt_num: 0
110
+ multimask_max_pt_num: 1
111
+ use_mlp_for_obj_ptr_proj: true
112
+ # Compilation flag
113
+ compile_image_encoder: False
florence_sam/configs/sam2_hiera_l.yaml ADDED
@@ -0,0 +1,117 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 144
12
+ num_heads: 2
13
+ stages: [2, 6, 36, 4]
14
+ global_att_blocks: [23, 33, 43]
15
+ window_pos_embed_bkg_spatial_size: [7, 7]
16
+ window_spec: [8, 4, 16, 8]
17
+ neck:
18
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
19
+ position_encoding:
20
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
21
+ num_pos_feats: 256
22
+ normalize: true
23
+ scale: null
24
+ temperature: 10000
25
+ d_model: 256
26
+ backbone_channel_list: [1152, 576, 288, 144]
27
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
28
+ fpn_interp_model: nearest
29
+
30
+ memory_attention:
31
+ _target_: sam2.modeling.memory_attention.MemoryAttention
32
+ d_model: 256
33
+ pos_enc_at_input: true
34
+ layer:
35
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
36
+ activation: relu
37
+ dim_feedforward: 2048
38
+ dropout: 0.1
39
+ pos_enc_at_attn: false
40
+ self_attention:
41
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
42
+ rope_theta: 10000.0
43
+ feat_sizes: [32, 32]
44
+ embedding_dim: 256
45
+ num_heads: 1
46
+ downsample_rate: 1
47
+ dropout: 0.1
48
+ d_model: 256
49
+ pos_enc_at_cross_attn_keys: true
50
+ pos_enc_at_cross_attn_queries: false
51
+ cross_attention:
52
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
53
+ rope_theta: 10000.0
54
+ feat_sizes: [32, 32]
55
+ rope_k_repeat: True
56
+ embedding_dim: 256
57
+ num_heads: 1
58
+ downsample_rate: 1
59
+ dropout: 0.1
60
+ kv_in_dim: 64
61
+ num_layers: 4
62
+
63
+ memory_encoder:
64
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
65
+ out_dim: 64
66
+ position_encoding:
67
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
68
+ num_pos_feats: 64
69
+ normalize: true
70
+ scale: null
71
+ temperature: 10000
72
+ mask_downsampler:
73
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
74
+ kernel_size: 3
75
+ stride: 2
76
+ padding: 1
77
+ fuser:
78
+ _target_: sam2.modeling.memory_encoder.Fuser
79
+ layer:
80
+ _target_: sam2.modeling.memory_encoder.CXBlock
81
+ dim: 256
82
+ kernel_size: 7
83
+ padding: 3
84
+ layer_scale_init_value: 1e-6
85
+ use_dwconv: True # depth-wise convs
86
+ num_layers: 2
87
+
88
+ num_maskmem: 7
89
+ image_size: 1024
90
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
91
+ sigmoid_scale_for_mem_enc: 20.0
92
+ sigmoid_bias_for_mem_enc: -10.0
93
+ use_mask_input_as_output_without_sam: true
94
+ # Memory
95
+ directly_add_no_mem_embed: true
96
+ # use high-resolution feature map in the SAM mask decoder
97
+ use_high_res_features_in_sam: true
98
+ # output 3 masks on the first click on initial conditioning frames
99
+ multimask_output_in_sam: true
100
+ # SAM heads
101
+ iou_prediction_use_sigmoid: True
102
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103
+ use_obj_ptrs_in_encoder: true
104
+ add_tpos_enc_to_obj_ptrs: false
105
+ only_obj_ptrs_in_the_past_for_eval: true
106
+ # object occlusion prediction
107
+ pred_obj_scores: true
108
+ pred_obj_scores_mlp: true
109
+ fixed_no_obj_ptr: true
110
+ # multimask tracking settings
111
+ multimask_output_for_tracking: true
112
+ use_multimask_token_for_obj_ptr: true
113
+ multimask_min_pt_num: 0
114
+ multimask_max_pt_num: 1
115
+ use_mlp_for_obj_ptr_proj: true
116
+ # Compilation flag
117
+ compile_image_encoder: False
florence_sam/configs/sam2_hiera_s.yaml ADDED
@@ -0,0 +1,116 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 96
12
+ num_heads: 1
13
+ stages: [1, 2, 11, 2]
14
+ global_att_blocks: [7, 10, 13]
15
+ window_pos_embed_bkg_spatial_size: [7, 7]
16
+ neck:
17
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
18
+ position_encoding:
19
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
20
+ num_pos_feats: 256
21
+ normalize: true
22
+ scale: null
23
+ temperature: 10000
24
+ d_model: 256
25
+ backbone_channel_list: [768, 384, 192, 96]
26
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
27
+ fpn_interp_model: nearest
28
+
29
+ memory_attention:
30
+ _target_: sam2.modeling.memory_attention.MemoryAttention
31
+ d_model: 256
32
+ pos_enc_at_input: true
33
+ layer:
34
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
35
+ activation: relu
36
+ dim_feedforward: 2048
37
+ dropout: 0.1
38
+ pos_enc_at_attn: false
39
+ self_attention:
40
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
41
+ rope_theta: 10000.0
42
+ feat_sizes: [32, 32]
43
+ embedding_dim: 256
44
+ num_heads: 1
45
+ downsample_rate: 1
46
+ dropout: 0.1
47
+ d_model: 256
48
+ pos_enc_at_cross_attn_keys: true
49
+ pos_enc_at_cross_attn_queries: false
50
+ cross_attention:
51
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
52
+ rope_theta: 10000.0
53
+ feat_sizes: [32, 32]
54
+ rope_k_repeat: True
55
+ embedding_dim: 256
56
+ num_heads: 1
57
+ downsample_rate: 1
58
+ dropout: 0.1
59
+ kv_in_dim: 64
60
+ num_layers: 4
61
+
62
+ memory_encoder:
63
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
64
+ out_dim: 64
65
+ position_encoding:
66
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
67
+ num_pos_feats: 64
68
+ normalize: true
69
+ scale: null
70
+ temperature: 10000
71
+ mask_downsampler:
72
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
73
+ kernel_size: 3
74
+ stride: 2
75
+ padding: 1
76
+ fuser:
77
+ _target_: sam2.modeling.memory_encoder.Fuser
78
+ layer:
79
+ _target_: sam2.modeling.memory_encoder.CXBlock
80
+ dim: 256
81
+ kernel_size: 7
82
+ padding: 3
83
+ layer_scale_init_value: 1e-6
84
+ use_dwconv: True # depth-wise convs
85
+ num_layers: 2
86
+
87
+ num_maskmem: 7
88
+ image_size: 1024
89
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
90
+ sigmoid_scale_for_mem_enc: 20.0
91
+ sigmoid_bias_for_mem_enc: -10.0
92
+ use_mask_input_as_output_without_sam: true
93
+ # Memory
94
+ directly_add_no_mem_embed: true
95
+ # use high-resolution feature map in the SAM mask decoder
96
+ use_high_res_features_in_sam: true
97
+ # output 3 masks on the first click on initial conditioning frames
98
+ multimask_output_in_sam: true
99
+ # SAM heads
100
+ iou_prediction_use_sigmoid: True
101
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
102
+ use_obj_ptrs_in_encoder: true
103
+ add_tpos_enc_to_obj_ptrs: false
104
+ only_obj_ptrs_in_the_past_for_eval: true
105
+ # object occlusion prediction
106
+ pred_obj_scores: true
107
+ pred_obj_scores_mlp: true
108
+ fixed_no_obj_ptr: true
109
+ # multimask tracking settings
110
+ multimask_output_for_tracking: true
111
+ use_multimask_token_for_obj_ptr: true
112
+ multimask_min_pt_num: 0
113
+ multimask_max_pt_num: 1
114
+ use_mlp_for_obj_ptr_proj: true
115
+ # Compilation flag
116
+ compile_image_encoder: False
florence_sam/configs/sam2_hiera_t.yaml ADDED
@@ -0,0 +1,118 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 96
12
+ num_heads: 1
13
+ stages: [1, 2, 7, 2]
14
+ global_att_blocks: [5, 7, 9]
15
+ window_pos_embed_bkg_spatial_size: [7, 7]
16
+ neck:
17
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
18
+ position_encoding:
19
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
20
+ num_pos_feats: 256
21
+ normalize: true
22
+ scale: null
23
+ temperature: 10000
24
+ d_model: 256
25
+ backbone_channel_list: [768, 384, 192, 96]
26
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
27
+ fpn_interp_model: nearest
28
+
29
+ memory_attention:
30
+ _target_: sam2.modeling.memory_attention.MemoryAttention
31
+ d_model: 256
32
+ pos_enc_at_input: true
33
+ layer:
34
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
35
+ activation: relu
36
+ dim_feedforward: 2048
37
+ dropout: 0.1
38
+ pos_enc_at_attn: false
39
+ self_attention:
40
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
41
+ rope_theta: 10000.0
42
+ feat_sizes: [32, 32]
43
+ embedding_dim: 256
44
+ num_heads: 1
45
+ downsample_rate: 1
46
+ dropout: 0.1
47
+ d_model: 256
48
+ pos_enc_at_cross_attn_keys: true
49
+ pos_enc_at_cross_attn_queries: false
50
+ cross_attention:
51
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
52
+ rope_theta: 10000.0
53
+ feat_sizes: [32, 32]
54
+ rope_k_repeat: True
55
+ embedding_dim: 256
56
+ num_heads: 1
57
+ downsample_rate: 1
58
+ dropout: 0.1
59
+ kv_in_dim: 64
60
+ num_layers: 4
61
+
62
+ memory_encoder:
63
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
64
+ out_dim: 64
65
+ position_encoding:
66
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
67
+ num_pos_feats: 64
68
+ normalize: true
69
+ scale: null
70
+ temperature: 10000
71
+ mask_downsampler:
72
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
73
+ kernel_size: 3
74
+ stride: 2
75
+ padding: 1
76
+ fuser:
77
+ _target_: sam2.modeling.memory_encoder.Fuser
78
+ layer:
79
+ _target_: sam2.modeling.memory_encoder.CXBlock
80
+ dim: 256
81
+ kernel_size: 7
82
+ padding: 3
83
+ layer_scale_init_value: 1e-6
84
+ use_dwconv: True # depth-wise convs
85
+ num_layers: 2
86
+
87
+ num_maskmem: 7
88
+ image_size: 1024
89
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
90
+ # SAM decoder
91
+ sigmoid_scale_for_mem_enc: 20.0
92
+ sigmoid_bias_for_mem_enc: -10.0
93
+ use_mask_input_as_output_without_sam: true
94
+ # Memory
95
+ directly_add_no_mem_embed: true
96
+ # use high-resolution feature map in the SAM mask decoder
97
+ use_high_res_features_in_sam: true
98
+ # output 3 masks on the first click on initial conditioning frames
99
+ multimask_output_in_sam: true
100
+ # SAM heads
101
+ iou_prediction_use_sigmoid: True
102
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103
+ use_obj_ptrs_in_encoder: true
104
+ add_tpos_enc_to_obj_ptrs: false
105
+ only_obj_ptrs_in_the_past_for_eval: true
106
+ # object occlusion prediction
107
+ pred_obj_scores: true
108
+ pred_obj_scores_mlp: true
109
+ fixed_no_obj_ptr: true
110
+ # multimask tracking settings
111
+ multimask_output_for_tracking: true
112
+ use_multimask_token_for_obj_ptr: true
113
+ multimask_min_pt_num: 0
114
+ multimask_max_pt_num: 1
115
+ use_mlp_for_obj_ptr_proj: true
116
+ # Compilation flag
117
+ # HieraT does not currently support compilation, should always be set to False
118
+ compile_image_encoder: False
florence_sam/detect_and_segment.py ADDED
@@ -0,0 +1,151 @@
1
+ # detect_and_segment.py
2
+ import torch
3
+ import supervision as sv
4
+ from typing import List, Tuple, Optional
5
+
6
+ # ==== 1. One-time global model loading =====================================
7
+ from .utils.florence import (
8
+ load_florence_model,
9
+ run_florence_inference,
10
+ FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
11
+ )
12
+ from .utils.sam import load_sam_image_model, run_sam_inference
13
+
14
+ from PIL import Image, ImageDraw, ImageColor
15
+ import numpy as np
16
+
17
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
+
19
+ # load models once – they stay in memory for repeated calls
20
+ FLORENCE_MODEL, FLORENCE_PROC = load_florence_model(device=DEVICE)
21
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
22
+
23
+ # quick annotators
24
+ COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700', '#32CD32', '#8A2BE2']
25
+ COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
26
+ BOX_ANNOTATOR = sv.BoxAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
27
+ LABEL_ANNOTATOR = sv.LabelAnnotator(
28
+ color=COLOR_PALETTE,
29
+ color_lookup=sv.ColorLookup.INDEX,
30
+ text_position=sv.Position.CENTER_OF_MASS,
31
+ text_color=sv.Color.from_hex("#000000"),
32
+ border_radius=5,
33
+ )
34
+ MASK_ANNOTATOR = sv.MaskAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
35
+
36
+ # ==== 2. Inference function ===============================================
37
+
38
+ @torch.inference_mode()
39
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
40
+ def detect_and_segment(
41
+ image : Image.Image,
42
+ text_prompts : str | List[str],
43
+ return_image : bool = True,
44
+ ) -> Tuple[sv.Detections, Optional[Image.Image]]:
45
+ """
46
+ Run Florence-2 open-vocabulary detection + SAM2 mask refinement on a PIL image.
47
+
48
+ Parameters
49
+ ----------
50
+ image : PIL.Image
51
+ Input image in RGB.
52
+ text_prompts : str | List[str]
53
+ Single prompt or comma-separated list (e.g. "dog, tail, leash").
54
+ return_image : bool
55
+ If True, also returns an annotated PIL image.
56
+
57
+ Returns
58
+ -------
59
+ detections : sv.Detections
60
+ Supervision object with xyxy, mask, class_id, etc.
61
+ annotated : PIL.Image | None
62
+ Annotated image (None if return_image=False)
63
+ """
64
+ # Normalize prompt list
65
+ if isinstance(text_prompts, str):
66
+ prompts = [p.strip() for p in text_prompts.split(",") if p.strip()]
67
+ else:
68
+ prompts = [p.strip() for p in text_prompts]
69
+
70
+ if len(prompts) == 0:
71
+ raise ValueError("Empty prompt list given.")
72
+
73
+ # Collect detections from each prompt
74
+ det_list: list[sv.Detections] = []
75
+ for p in prompts:
76
+ _, result = run_florence_inference(
77
+ model = FLORENCE_MODEL,
78
+ processor = FLORENCE_PROC,
79
+ device = DEVICE,
80
+ image = image,
81
+ task = FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
82
+ text = p,
83
+ )
84
+ det = sv.Detections.from_lmm(
85
+ lmm = sv.LMM.FLORENCE_2,
86
+ result = result,
87
+ resolution_wh = image.size,
88
+ )
89
+ det = run_sam_inference(SAM_IMAGE_MODEL, image, det) # SAM2 refinement
90
+ det_list.append(det)
91
+
92
+ detections = sv.Detections.merge(det_list)
93
+
94
+ annotated_img = None
95
+ if return_image:
96
+ annotated_img = image.copy()
97
+ annotated_img = MASK_ANNOTATOR.annotate(annotated_img, detections)
98
+ annotated_img = BOX_ANNOTATOR.annotate(annotated_img, detections)
99
+ annotated_img = LABEL_ANNOTATOR.annotate(annotated_img, detections)
100
+
101
+ return detections, annotated_img
102
+
103
+
104
+
105
+ def fill_detected_bboxes(
106
+ image: Image.Image,
107
+ text: str,
108
+ inflate_pct: float = 0.10,
109
+ fill_color: str | tuple[int, int, int] = "#00FF00",
110
+ ):
111
+ """
112
+ Detect objects matching `text`, inflate each bounding-box by `inflate_pct`,
113
+ fill the area with `fill_color`, and return the resulting image.
114
+
115
+ Parameters
116
+ ----------
117
+ image : PIL.Image
118
+ Input image (RGB).
119
+ text : str
120
+ Comma-separated prompt(s) for open-vocabulary detection.
121
+ inflate_pct : float, default 0.10
122
+ Extra margin per side (0.10 = +10 % width & height).
123
+ fill_color : str | tuple, default "#00FF00"
124
+ Solid color used to fill each inflated bbox (hex or RGB tuple).
125
+
126
+ Returns
127
+ -------
128
+ filled_img : PIL.Image
129
+ Image with each detected (inflated) box filled.
130
+ detections : sv.Detections
131
+ Original detection object from `detect_and_segment`.
132
+ """
133
+ # run Florence2 + SAM2 pipeline (your helper from earlier)
134
+ detections, _ = detect_and_segment(image, text)
135
+
136
+ w, h = image.size
137
+ filled_img = image.copy()
138
+ draw = ImageDraw.Draw(filled_img)
139
+ fill_rgb = ImageColor.getrgb(fill_color) if isinstance(fill_color, str) else fill_color
140
+
141
+ for box in detections.xyxy:
142
+ # xyxy is numpy array → cast to float for math
143
+ x1, y1, x2, y2 = box.astype(float)
144
+ dw, dh = (x2 - x1) * inflate_pct, (y2 - y1) * inflate_pct
145
+ x1_i = max(0, x1 - dw)
146
+ y1_i = max(0, y1 - dh)
147
+ x2_i = min(w, x2 + dw)
148
+ y2_i = min(h, y2 + dh)
149
+ draw.rectangle([x1_i, y1_i, x2_i, y2_i], fill=fill_rgb)
150
+
151
+ return filled_img, detections
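
A short usage sketch for the two helpers above (hypothetical file names and prompts; it assumes the package is importable as florence_sam, matching the relative imports at the top of the file):

from PIL import Image

from florence_sam.detect_and_segment import detect_and_segment, fill_detected_bboxes

image = Image.open("room.jpg").convert("RGB")  # hypothetical input image

# Detect + segment, and get an annotated preview back.
detections, annotated = detect_and_segment(image, "sofa, coffee table")
print(len(detections.xyxy), "objects detected")

# Fill each (slightly inflated) detected box with a solid color.
filled, _ = fill_detected_bboxes(image, "sofa", inflate_pct=0.05)
filled.save("sofa_filled.png")
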
florence_sam/process_batch.py ADDED
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env python
2
+ # furniture_bbox_to_files.py ────────────────────────────────────────
3
+ # Florence-2 + SAM-2 batch processor with retries *and* file-based images
4
+ # --------------------------------------------------------------------
5
+ import os, json, random, time
6
+ from pathlib import Path
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
+ from typing import List
9
+
10
+ import torch, supervision as sv
11
+ from PIL import Image, ImageDraw, ImageColor, ImageOps
12
+ from tqdm.auto import tqdm
13
+ from datasets import load_dataset, Image as HFImage, disable_progress_bar
14
+
15
+ # ───── global models ────────────────────────────────────────────────
16
+ from utils.florence import (
17
+ load_florence_model, run_florence_inference,
18
+ FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
19
+ )
20
+ from utils.sam import load_sam_image_model, run_sam_inference
21
+
22
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
+ FLORENCE_MODEL, FLORENCE_PROC = load_florence_model(device=DEVICE)
24
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
25
+
26
+ # annotators
27
+ _PALETTE = sv.ColorPalette.from_hex(
28
+ ['#FF1493','#00BFFF','#FF6347','#FFD700','#32CD32','#8A2BE2'])
29
+ BOX_ANN = sv.BoxAnnotator(color=_PALETTE, color_lookup=sv.ColorLookup.INDEX)
30
+ MASK_ANN = sv.MaskAnnotator(color=_PALETTE, color_lookup=sv.ColorLookup.INDEX)
31
+ LBL_ANN = sv.LabelAnnotator(
32
+ color=_PALETTE, color_lookup=sv.ColorLookup.INDEX,
33
+ text_position=sv.Position.CENTER_OF_MASS,
34
+ text_color=sv.Color.from_hex("#000"), border_radius=5)
35
+
36
+ # ───── config ───────────────────────────────────────────────────────
37
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
38
+ disable_progress_bar()
39
+
40
+ DATASET_NAME = "fotographerai/furniture_captioned_segment_prompt"
41
+ SPLIT = "train"
42
+ IMAGE_COL = "img2"
43
+ PROMPT_COL = "segmenting_prompt"
44
+
45
+ INFLATE_RANGE = (0.01, 0.05)
46
+ FILL_COLOR = "#00FF00"
47
+ TARGET_SIDE = 1500
48
+
49
+ QA_DIR = Path("bbox_review_recaptioned")
50
+ GREEN_DIR = QA_DIR / "green"; GREEN_DIR.mkdir(parents=True, exist_ok=True)
51
+ ANNO_DIR = QA_DIR / "anno"; ANNO_DIR.mkdir(parents=True, exist_ok=True)
52
+ JSON_DIR = QA_DIR / "json"; JSON_DIR.mkdir(parents=True, exist_ok=True)
53
+
54
+ MAX_WORKERS = 100
55
+ MAX_RETRIES = 5
56
+ RETRY_SLEEP = .3
57
+ FAILED_LOG = QA_DIR / "failed_rows.jsonl"
58
+
59
+ PROMPT_MAP: dict[str,str] = {} # optional overrides
60
+
61
+ # ───── helpers ──────────────────────────────────────────────────────
62
+ def make_square(img: Image.Image, side: int = TARGET_SIDE) -> Image.Image:
63
+ img = ImageOps.contain(img, (side, side))
64
+ pad_w, pad_h = side - img.width, side - img.height
65
+ return ImageOps.expand(img, border=(pad_w//2, pad_h//2,
66
+ pad_w - pad_w//2, pad_h - pad_h//2),
67
+ fill=img.getpixel((0,0)))
68
+
69
+ def img_to_file(img: Image.Image, fname: str, folder: Path) -> dict:
70
+ path = folder / f"{fname}.png"
71
+ if not path.exists():
72
+ img.save(path)
73
+ return {"path": str(path), "bytes": None}
74
+
75
+ # ───── core functions ───────────────────────────────────────────────
76
+ @torch.inference_mode()
77
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
78
+ def detect_and_segment(img: Image.Image, prompts: str|List[str]) -> sv.Detections:
79
+ if isinstance(prompts, str):
80
+ prompts = [p.strip() for p in prompts.split(",") if p.strip()]
81
+ all_dets = []
82
+ for p in prompts:
83
+ _, res = run_florence_inference(
84
+ model=FLORENCE_MODEL, processor=FLORENCE_PROC, device=DEVICE,
85
+ image=img, task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK, text=p)
86
+ d = sv.Detections.from_lmm(sv.LMM.FLORENCE_2, res, img.size)
87
+ all_dets.append(run_sam_inference(SAM_IMAGE_MODEL, img, d))
88
+ return sv.Detections.merge(all_dets)
89
+
90
+ def fill_detected_bboxes(img: Image.Image, prompt: str,
91
+ inflate_pct: float) -> tuple[Image.Image, sv.Detections]:
92
+ dets = detect_and_segment(img, prompt)
93
+ filled = img.copy()
94
+ draw = ImageDraw.Draw(filled)
95
+ rgb = ImageColor.getrgb(FILL_COLOR)
96
+ w,h = img.size
97
+ for box in dets.xyxy:
98
+ x1,y1,x2,y2 = box.astype(float)
99
+ dw,dh = (x2-x1)*inflate_pct, (y2-y1)*inflate_pct
100
+ draw.rectangle([max(0,x1-dw), max(0,y1-dh),
101
+ min(w,x2+dw), min(h,y2+dh)], fill=rgb)
102
+ return filled, dets
103
+
104
+ # ───── threaded worker ──────────────────────────────────────────────
105
+ def process_row(idx: int, sample):
106
+ prompt = PROMPT_MAP.get(sample[PROMPT_COL],
107
+ sample[PROMPT_COL].split(",",1)[0].strip())
108
+ img_sq = make_square(sample[IMAGE_COL].convert("RGB"))
109
+ for attempt in range(1, MAX_RETRIES+1):
110
+ try:
111
+ filled, dets = fill_detected_bboxes(
112
+ img_sq, prompt, inflate_pct=random.uniform(*INFLATE_RANGE))
113
+ if len(dets.xyxy) == 0:
114
+ raise ValueError("no detections")
115
+
116
+ sid = f"{idx:06d}"
117
+ json_p = JSON_DIR / f"{sid}_bbox.json"
118
+ json_p.write_text(json.dumps({"xyxy": dets.xyxy.tolist()}))
119
+
120
+ anno = img_sq.copy()
121
+ for ann in (MASK_ANN, BOX_ANN, LBL_ANN):
122
+ anno = ann.annotate(anno, dets)
123
+
124
+ return ("ok",
125
+ img_to_file(filled, sid, GREEN_DIR),
126
+ img_to_file(anno, sid, ANNO_DIR),
127
+ json_p.read_text())
128
+ except Exception as e:
129
+ if attempt < MAX_RETRIES:
130
+ time.sleep(RETRY_SLEEP)
131
+ else:
132
+ return ("fail", str(e))
133
+
134
+ # ───── run batch ────────────────────────────────────────────────────
135
+ ds = load_dataset(DATASET_NAME, split=SPLIT, streaming=False)
136
+ N = len(ds)
137
+ print("Rows:", N)
138
+
139
+ filled_col, anno_col, json_col = [None]*N, [None]*N, [None]*N
140
+ fails = 0
141
+
142
+ with ThreadPoolExecutor(MAX_WORKERS) as pool:
143
+ fut2idx = {pool.submit(process_row, i, ds[i]): i for i in range(N)}
144
+ for fut in tqdm(as_completed(fut2idx), total=N, desc="Florence+SAM"):
145
+ idx = fut2idx[fut]
146
+ status, *data = fut.result()
147
+ if status == "ok":
148
+ filled_col[idx], anno_col[idx], json_col[idx] = data
149
+ else:
150
+ fails += 1
151
+ with FAILED_LOG.open("a") as fh:
+     fh.write(json.dumps({"idx": idx, "reason": data[0]}) + "\n")
152
+
153
+ print(f"❌ permanently failed rows: {fails}")
154
+
155
+ keep = [i for i,x in enumerate(filled_col) if x]
156
+ new_ds = ds.select(keep)
157
+ new_ds = new_ds.add_column("bbox_filled", [filled_col[i] for i in keep])
158
+ new_ds = new_ds.add_column("annotated", [anno_col[i] for i in keep])
159
+ new_ds = new_ds.add_column("bbox_json", [json_col[i] for i in keep])
160
+ new_ds = new_ds.cast_column("bbox_filled", HFImage())
161
+ new_ds = new_ds.cast_column("annotated", HFImage())
162
+
163
+ print(f"✅ successes: {len(new_ds)} / {N}")
164
+ print("Columns:", new_ds.column_names)
165
+ print("QA artefacts →", QA_DIR.resolve())
166
+
167
+ # optional push
168
+ new_ds.push_to_hub("fotographerai/surround_furniture_bboxfilled",
169
+ private=True, max_shard_size="500MB")
florence_sam/reassemble.py ADDED
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ reassemble_bbox_dataset_resume.py
4
+ ---------------------------------
5
+ Incrementally rebuilds `bbox_filled / annotated / bbox_json` columns from
6
+ QA artefacts and pushes the final dataset **privately** to HF Hub.
7
+
8
+ • Safe to ^C / rerun (uses on-disk Arrow cache)
9
+ • When NOTHING is left to process it *just* loads the cache and pushes.
10
+ • Uses path-only image columns (HFImage(decode=False)) to keep RAM tiny.
11
+ """
12
+
13
+ import os, json
14
+ from pathlib import Path
15
+ from tqdm.auto import tqdm
16
+ from datasets import (
17
+ load_dataset, load_from_disk, Dataset, disable_progress_bar, Features,
18
+ Value, Image as HFImage
19
+ )
20
+ from PIL import Image
21
+ from huggingface_hub.utils import HfHubHTTPError
22
+
23
+ disable_progress_bar()
24
+
25
+ # ══════ CONFIG ══════════════════════════════════════════════════════
26
+ DATASET_NAME = "fotographerai/furniture_captioned_segment_prompt"
27
+ SPLIT = "train"
28
+
29
+ QA_DIR = Path("bbox_review_recaptioned") # artefacts
30
+ CACHE_DIR = Path("rebuild_cache") # incremental Arrow cache
31
+ CACHE_DIR.mkdir(exist_ok=True)
32
+
33
+ TARGET_SIDE = 1500
34
+ GREEN_RGB = (0, 255, 0)
35
+
36
+ BATCH_SAVE = 500
37
+ HUB_REPO = "fotographerai/furniture_bboxfilled_rebuild"
38
+ HF_TOKEN = os.environ.get("HF_TOKEN", "").strip() # needs write+private
39
+
40
+ # ══════ HELPERS ═════════════════════════════════════════════════════
41
+ def img_ref(p: Path) -> dict: # path-only image dict
42
+ return {"path": str(p), "bytes": None}
43
+
44
+ def make_green_png(p: Path):
45
+ if not p.exists():
46
+ Image.new("RGB", (TARGET_SIDE, TARGET_SIDE), GREEN_RGB).save(p)
47
+
48
+ def ensure_full_bbox(p: Path):
49
+ if not p.exists():
50
+ p.write_text(json.dumps({"xyxy": [[0, 0, TARGET_SIDE, TARGET_SIDE]]}))
51
+
52
+ # ══════ LOAD SOURCE DATASET ═════════════════════════════════════════
53
+ base_ds = load_dataset(DATASET_NAME, split=SPLIT, streaming=False)
54
+ N_TOTAL = len(base_ds)
55
+ print("Original rows:", N_TOTAL)
56
+
57
+ # ══════ LOAD OR INIT CACHE ══════════════════════════════════════════
58
+ if (CACHE_DIR / "dataset_info.json").exists():
59
+ cache_ds = load_from_disk(CACHE_DIR)
60
+ done = set(cache_ds["__row_idx__"])
61
+ print(f"Cache found → {len(done)} rows already processed.")
62
+ records = {k: list(v) for k, v in cache_ds.to_dict().items()}
63
+ else:
64
+ done, records = set(), {"__row_idx__": [], "bbox_filled": [],
65
+ "annotated": [], "bbox_json": []}
66
+
67
+ missing = [i for i in range(N_TOTAL) if i not in done]
68
+ print("Rows still to process:", len(missing))
69
+
70
+ # ══════ NO WORK LEFT? push & exit ══════════════════════════════════
71
+ if not missing:
72
+ print("💤 nothing new to process – pushing cached dataset…")
73
+ try:
74
+ url = cache_ds.push_to_hub(
75
+ HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
76
+ )
77
+ print("🚀 dataset pushed to:", url)
78
+ except HfHubHTTPError as e:
79
+ print("❌ push failed:", e)
80
+ exit(0)
81
+
82
+ # ══════ PROCESS MISSING ROWS ═══════════════════════════════════════
83
+ for n, i in enumerate(tqdm(missing, desc="Re-assembling")):
84
+ g_png = QA_DIR / f"{i:06d}_green.png"
85
+ a_png = QA_DIR / f"{i:06d}_anno.png"
86
+ bbox_j = QA_DIR / f"{i:06d}_bbox.json"
87
+
88
+ if not (g_png.exists() and a_png.exists() and bbox_j.exists()):
89
+ mask_png = QA_DIR / f"{i:06d}_mask.png"
90
+ make_green_png(mask_png)
91
+ g_png = a_png = mask_png
92
+ ensure_full_bbox(bbox_j)
93
+
94
+ row = base_ds[i] # copy original cols once
95
+ records["__row_idx__"].append(i)
96
+ for k, v in row.items():
97
+ records.setdefault(k, []).append(v)
98
+
99
+ records["bbox_filled"].append(img_ref(g_png))
100
+ records["annotated"].append(img_ref(a_png))
101
+ records["bbox_json"].append(bbox_j.read_text())
102
+
103
+ if (n + 1) % BATCH_SAVE == 0:
104
+ Dataset.from_dict(records).save_to_disk(CACHE_DIR)
105
+ print(f"⏫ cached at {n+1}/{len(missing)}")
106
+
107
+ # ══════ FINAL DATASET FEATURES & SAVE ═══════════════════════════════
108
+ features = Features({
109
+ "__row_idx__" : Value("int32"),
110
+ "bbox_filled" : HFImage(decode=False),
111
+ "annotated" : HFImage(decode=False),
112
+ "bbox_json" : Value("string"),
113
+ # original columns inferred below
114
+ })
115
+ for k in base_ds.features:
116
+ if k not in features:
117
+ features[k] = base_ds.features[k]
118
+
119
+ final_ds = Dataset.from_dict(records, features=features)
120
+ final_ds.save_to_disk(CACHE_DIR)
121
+ print("✅ cached dataset saved to", CACHE_DIR.resolve())
122
+
123
+ # ══════ PUSH PRIVATE ═══════════════════════════════════════════════
124
+ if not HF_TOKEN:
125
+ print("⚠️ HF_TOKEN env-var not set – skipping push.")
126
+ else:
127
+ try:
128
+ url = final_ds.push_to_hub(
129
+ HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
130
+ )
131
+ print("🚀 dataset pushed to:", url)
132
+ except HfHubHTTPError as e:
133
+ print("❌ push failed:", e)
florence_sam/requirements.txt ADDED
@@ -0,0 +1,10 @@
+ tqdm
+ einops
+ spaces
+ timm
+ transformers
+ samv2
+ gradio
+ supervision
+ opencv-python
+ pytest
florence_sam/utils/__init__.py ADDED
File without changes
florence_sam/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (141 Bytes).
 
florence_sam/utils/__pycache__/florence.cpython-310.pyc ADDED
Binary file (2.29 kB).
 
florence_sam/utils/__pycache__/modes.cpython-310.pyc ADDED
Binary file (450 Bytes).
 
florence_sam/utils/__pycache__/sam.cpython-310.pyc ADDED
Binary file (1.46 kB).
 
florence_sam/utils/__pycache__/video.cpython-310.pyc ADDED
Binary file (984 Bytes).
 
florence_sam/utils/florence.py ADDED
@@ -0,0 +1,58 @@
+ import os
+ from typing import Union, Any, Tuple, Dict
+ from unittest.mock import patch
+
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from transformers.dynamic_module_utils import get_imports
+
+ FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
+ FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
+ FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
+ FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
+ FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
+ FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'
+
+
+ def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+     """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+     if not str(filename).endswith("/modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     #imports.remove("flash_attn")
+     return imports
+
+
+ def load_florence_model(
+     device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
+ ) -> Tuple[Any, Any]:
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+         model = AutoModelForCausalLM.from_pretrained(
+             checkpoint, trust_remote_code=True).to(device).eval()
+         processor = AutoProcessor.from_pretrained(
+             checkpoint, trust_remote_code=True)
+         return model, processor
+
+
+ def run_florence_inference(
+     model: Any,
+     processor: Any,
+     device: torch.device,
+     image: Image.Image,
+     task: str,
+     text: str = ""
+ ) -> Tuple[str, Dict]:
+     prompt = task + text
+     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = processor.batch_decode(
+         generated_ids, skip_special_tokens=False)[0]
+     response = processor.post_process_generation(
+         generated_text, task=task, image_size=image.size)
+     return generated_text, response
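
A minimal captioning sketch for run_florence_inference (hypothetical image path; the response dict is keyed by the task token, as used in app.py):

import torch
from PIL import Image

from utils.florence import (
    load_florence_model,
    run_florence_inference,
    FLORENCE_DETAILED_CAPTION_TASK,
)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, processor = load_florence_model(device=DEVICE)

image = Image.open("example.jpg")  # hypothetical input image
_, response = run_florence_inference(
    model=model,
    processor=processor,
    device=DEVICE,
    image=image,
    task=FLORENCE_DETAILED_CAPTION_TASK,
)
print(response[FLORENCE_DETAILED_CAPTION_TASK])  # detailed caption string
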
florence_sam/utils/modes.py ADDED
@@ -0,0 +1,13 @@
+ IMAGE_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + image masks"
+ IMAGE_CAPTION_GROUNDING_MASKS_MODE = "caption + grounding + image masks"
+
+ IMAGE_INFERENCE_MODES = [
+     IMAGE_OPEN_VOCABULARY_DETECTION_MODE,
+     IMAGE_CAPTION_GROUNDING_MASKS_MODE
+ ]
+
+ VIDEO_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + video masks"
+
+ VIDEO_INFERENCE_MODES = [
+     VIDEO_OPEN_VOCABULARY_DETECTION_MODE
+ ]
florence_sam/utils/sam.py ADDED
@@ -0,0 +1,45 @@
+ from typing import Any
+
+ import numpy as np
+ import supervision as sv
+ import torch
+ from PIL import Image
+ from sam2.build_sam import build_sam2, build_sam2_video_predictor
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+ SAM_CHECKPOINT = "/home/comdoleger1/ZenCtrl/app/florence_sam/checkpoints/sam2_hiera_small.pt" #"./checkpoints/sam2_hiera_small.pt"
+ SAM_CONFIG = "sam2_hiera_s.yaml"
+
+
+ def load_sam_image_model(
+     device: torch.device,
+     config: str = SAM_CONFIG,
+     checkpoint: str = SAM_CHECKPOINT
+ ) -> SAM2ImagePredictor:
+     model = build_sam2(config, checkpoint, device=device)
+     return SAM2ImagePredictor(sam_model=model)
+
+
+ def load_sam_video_model(
+     device: torch.device,
+     config: str = SAM_CONFIG,
+     checkpoint: str = SAM_CHECKPOINT
+ ) -> Any:
+     return build_sam2_video_predictor(config, checkpoint, device=device)
+
+
+ def run_sam_inference(
+     model: Any,
+     image: Image.Image,
+     detections: sv.Detections
+ ) -> sv.Detections:
+     image = np.array(image.convert("RGB"))
+     model.set_image(image)
+     mask, score, _ = model.predict(box=detections.xyxy, multimask_output=False)
+
+     # dirty fix; remove this later
+     if len(mask.shape) == 4:
+         mask = np.squeeze(mask)
+
+     detections.mask = mask.astype(bool)
+     return detections
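
run_sam_inference can also be driven by manually supplied boxes instead of Florence-2 output. A small sketch (hypothetical image path and coordinates; it assumes SAM_CHECKPOINT above resolves on your machine):

import numpy as np
import supervision as sv
import torch
from PIL import Image

from utils.sam import load_sam_image_model, run_sam_inference

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sam_model = load_sam_image_model(device=DEVICE)

image = Image.open("example.jpg")  # hypothetical input image
# One box in pixel xyxy coordinates; in the app these come from Florence-2.
detections = sv.Detections(xyxy=np.array([[50.0, 40.0, 300.0, 260.0]]))
detections = run_sam_inference(sam_model, image, detections)
print(detections.mask.shape)  # (num_boxes, H, W) boolean masks
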
florence_sam/utils/video.py ADDED
@@ -0,0 +1,26 @@
+ import datetime
+ import os
+ import shutil
+ import uuid
+
+
+ def create_directory(directory_path: str) -> None:
+     if not os.path.exists(directory_path):
+         os.makedirs(directory_path)
+
+
+ def delete_directory(directory_path: str) -> None:
+     if not os.path.exists(directory_path):
+         raise FileNotFoundError(f"Directory '{directory_path}' does not exist.")
+
+     try:
+         shutil.rmtree(directory_path)
+     except PermissionError:
+         raise PermissionError(
+             f"Permission denied: Unable to delete '{directory_path}'.")
+
+
+ def generate_unique_name():
+     current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+     unique_id = uuid.uuid4()
+     return f"{current_datetime}_{unique_id}"