Spaces:
Running
Upload 28 files
- florence_sam/.gitattributes +36 -0
- florence_sam/.gitignore +3 -0
- florence_sam/.gradio/certificate.pem +31 -0
- florence_sam/Florence.ipynb +0 -0
- florence_sam/README.md +13 -0
- florence_sam/__init__.py +0 -0
- florence_sam/__pycache__/__init__.cpython-310.pyc +0 -0
- florence_sam/__pycache__/detect_and_segment.cpython-310.pyc +0 -0
- florence_sam/app.py +397 -0
- florence_sam/configs/__init__.py +5 -0
- florence_sam/configs/sam2_hiera_b+.yaml +113 -0
- florence_sam/configs/sam2_hiera_l.yaml +117 -0
- florence_sam/configs/sam2_hiera_s.yaml +116 -0
- florence_sam/configs/sam2_hiera_t.yaml +118 -0
- florence_sam/detect_and_segment.py +151 -0
- florence_sam/process_batch.py +169 -0
- florence_sam/reassemble.py +133 -0
- florence_sam/requirements.txt +10 -0
- florence_sam/utils/__init__.py +0 -0
- florence_sam/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- florence_sam/utils/__pycache__/florence.cpython-310.pyc +0 -0
- florence_sam/utils/__pycache__/modes.cpython-310.pyc +0 -0
- florence_sam/utils/__pycache__/sam.cpython-310.pyc +0 -0
- florence_sam/utils/__pycache__/video.cpython-310.pyc +0 -0
- florence_sam/utils/florence.py +58 -0
- florence_sam/utils/modes.py +13 -0
- florence_sam/utils/sam.py +45 -0
- florence_sam/utils/video.py +26 -0
florence_sam/.gitattributes
ADDED
@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
florence_sam/.gitignore
ADDED
@@ -0,0 +1,3 @@
/venv
/.idea
/tmp
florence_sam/.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
florence_sam/Florence.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
florence_sam/README.md
ADDED
@@ -0,0 +1,13 @@
---
title: Florence2 + SAM2
emoji: 🔥
colorFrom: purple
colorTo: green
sdk: gradio
sdk_version: 4.40.0
app_file: app.py
pinned: false
license: apache-2.0
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
florence_sam/__init__.py
ADDED
File without changes
florence_sam/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (147 Bytes). View file
florence_sam/__pycache__/detect_and_segment.cpython-310.pyc
ADDED
Binary file (4.43 kB). View file
florence_sam/app.py
ADDED
@@ -0,0 +1,397 @@
import os
from typing import Tuple, Optional

import cv2
import gradio as gr
import numpy as np
#import spaces
import supervision as sv
import torch
from PIL import Image
from tqdm import tqdm
from utils.video import generate_unique_name, create_directory, delete_directory

from utils.florence import load_florence_model, run_florence_inference, \
    FLORENCE_DETAILED_CAPTION_TASK, \
    FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE, \
    IMAGE_CAPTION_GROUNDING_MASKS_MODE, VIDEO_INFERENCE_MODES
from utils.sam import load_sam_image_model, run_sam_inference, load_sam_video_model

MARKDOWN = """
# Florence2 + SAM2 🔥

<div>
    <a href="https://github.com/facebookresearch/segment-anything-2">
        <img src="https://badges.aleen42.com/src/github.svg" alt="GitHub" style="display:inline-block;">
    </a>
    <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-segment-images-with-sam-2.ipynb">
        <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
    </a>
    <a href="https://blog.roboflow.com/what-is-segment-anything-2/">
        <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
    </a>
    <a href="https://www.youtube.com/watch?v=Dv003fTyO-Y">
        <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
    </a>
</div>

This demo integrates Florence2 and SAM2 by creating a two-stage inference pipeline. In
the first stage, Florence2 performs tasks such as object detection, open-vocabulary
object detection, image captioning, or phrase grounding. In the second stage, SAM2
performs object segmentation on the image.
"""

IMAGE_PROCESSING_EXAMPLES = [
    [IMAGE_OPEN_VOCABULARY_DETECTION_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", 'straw, white napkin, black napkin, hair'],
    [IMAGE_OPEN_VOCABULARY_DETECTION_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", 'tail'],
    [IMAGE_CAPTION_GROUNDING_MASKS_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
    [IMAGE_CAPTION_GROUNDING_MASKS_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
]
VIDEO_PROCESSING_EXAMPLES = [
    ["videos/clip-07-camera-1.mp4", "player in white outfit, player in black outfit, ball, rim"],
    ["videos/clip-07-camera-2.mp4", "player in white outfit, player in black outfit, ball, rim"],
    ["videos/clip-07-camera-3.mp4", "player in white outfit, player in black outfit, ball, rim"]
]

VIDEO_SCALE_FACTOR = 0.5
VIDEO_TARGET_DIRECTORY = "tmp"
create_directory(directory_path=VIDEO_TARGET_DIRECTORY)

DEVICE = torch.device("cuda")
# DEVICE = torch.device("cpu")

torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
if torch.cuda.get_device_properties(0).major >= 8:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True


FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
SAM_VIDEO_MODEL = load_sam_video_model(device=DEVICE)
COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700', '#32CD32', '#8A2BE2']
COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
BOX_ANNOTATOR = sv.BoxAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
LABEL_ANNOTATOR = sv.LabelAnnotator(
    color=COLOR_PALETTE,
    color_lookup=sv.ColorLookup.INDEX,
    text_position=sv.Position.CENTER_OF_MASS,
    text_color=sv.Color.from_hex("#000000"),
    border_radius=5
)
MASK_ANNOTATOR = sv.MaskAnnotator(
    color=COLOR_PALETTE,
    color_lookup=sv.ColorLookup.INDEX
)


def annotate_image(image, detections):
    output_image = image.copy()
    output_image = MASK_ANNOTATOR.annotate(output_image, detections)
    output_image = BOX_ANNOTATOR.annotate(output_image, detections)
    output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
    return output_image


def on_mode_dropdown_change(text):
    return [
        gr.Textbox(visible=text == IMAGE_OPEN_VOCABULARY_DETECTION_MODE),
        gr.Textbox(visible=text == IMAGE_CAPTION_GROUNDING_MASKS_MODE),
    ]


#@spaces.GPU
@torch.inference_mode()
@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def process_image(
    mode_dropdown, image_input, text_input
) -> Tuple[Optional[Image.Image], Optional[str]]:
    if not image_input:
        gr.Info("Please upload an image.")
        return None, None

    if mode_dropdown == IMAGE_OPEN_VOCABULARY_DETECTION_MODE:
        if not text_input:
            gr.Info("Please enter a text prompt.")
            return None, None

        texts = [prompt.strip() for prompt in text_input.split(",")]
        detections_list = []
        for text in texts:
            _, result = run_florence_inference(
                model=FLORENCE_MODEL,
                processor=FLORENCE_PROCESSOR,
                device=DEVICE,
                image=image_input,
                task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
                text=text
            )
            detections = sv.Detections.from_lmm(
                lmm=sv.LMM.FLORENCE_2,
                result=result,
                resolution_wh=image_input.size
            )
            detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
            detections_list.append(detections)

        detections = sv.Detections.merge(detections_list)
        detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
        return annotate_image(image_input, detections), None

    if mode_dropdown == IMAGE_CAPTION_GROUNDING_MASKS_MODE:
        _, result = run_florence_inference(
            model=FLORENCE_MODEL,
            processor=FLORENCE_PROCESSOR,
            device=DEVICE,
            image=image_input,
            task=FLORENCE_DETAILED_CAPTION_TASK
        )
        caption = result[FLORENCE_DETAILED_CAPTION_TASK]
        _, result = run_florence_inference(
            model=FLORENCE_MODEL,
            processor=FLORENCE_PROCESSOR,
            device=DEVICE,
            image=image_input,
            task=FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
            text=caption
        )
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2,
            result=result,
            resolution_wh=image_input.size
        )
        detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
        return annotate_image(image_input, detections), caption


#@spaces.GPU(duration=300)
@torch.inference_mode()
@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def process_video(
    video_input, text_input, progress=gr.Progress(track_tqdm=True)
) -> Optional[str]:
    if not video_input:
        gr.Info("Please upload a video.")
        return None

    if not text_input:
        gr.Info("Please enter a text prompt.")
        return None

    frame_generator = sv.get_video_frames_generator(video_input)
    frame = next(frame_generator)
    frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    texts = [prompt.strip() for prompt in text_input.split(",")]
    detections_list = []
    for text in texts:
        _, result = run_florence_inference(
            model=FLORENCE_MODEL,
            processor=FLORENCE_PROCESSOR,
            device=DEVICE,
            image=frame,
            task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
            text=text
        )
        detections = sv.Detections.from_lmm(
            lmm=sv.LMM.FLORENCE_2,
            result=result,
            resolution_wh=frame.size
        )
        detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)
        detections_list.append(detections)

    detections = sv.Detections.merge(detections_list)
    detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)

    if len(detections.mask) == 0:
        gr.Info(
            f"No objects of class {text_input} found in the first frame of the video. "
            "Trim the video to make the object appear in the first frame or try a "
            "different text prompt."
        )
        return None

    name = generate_unique_name()
    frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
    frames_sink = sv.ImageSink(
        target_dir_path=frame_directory_path,
        image_name_pattern="{:05d}.jpeg"
    )

    video_info = sv.VideoInfo.from_video_path(video_input)
    video_info.width = int(video_info.width * VIDEO_SCALE_FACTOR)
    video_info.height = int(video_info.height * VIDEO_SCALE_FACTOR)

    frames_generator = sv.get_video_frames_generator(video_input)
    with frames_sink:
        for frame in tqdm(
            frames_generator,
            total=video_info.total_frames,
            desc="splitting video into frames"
        ):
            frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
            frames_sink.save_image(frame)

    inference_state = SAM_VIDEO_MODEL.init_state(
        video_path=frame_directory_path,
        device=DEVICE
    )

    for mask_index, mask in enumerate(detections.mask):
        _, object_ids, mask_logits = SAM_VIDEO_MODEL.add_new_mask(
            inference_state=inference_state,
            frame_idx=0,
            obj_id=mask_index,
            mask=mask
        )

    video_path = os.path.join(VIDEO_TARGET_DIRECTORY, f"{name}.mp4")
    frames_generator = sv.get_video_frames_generator(video_input)
    masks_generator = SAM_VIDEO_MODEL.propagate_in_video(inference_state)
    with sv.VideoSink(video_path, video_info=video_info) as sink:
        for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
            frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
            masks = (mask_logits > 0.0).cpu().numpy().astype(bool)
            if len(masks.shape) == 4:
                masks = np.squeeze(masks, axis=1)

            detections = sv.Detections(
                xyxy=sv.mask_to_xyxy(masks=masks),
                mask=masks,
                class_id=np.array(tracker_ids)
            )
            annotated_frame = frame.copy()
            annotated_frame = MASK_ANNOTATOR.annotate(
                scene=annotated_frame, detections=detections)
            annotated_frame = BOX_ANNOTATOR.annotate(
                scene=annotated_frame, detections=detections)
            sink.write_frame(annotated_frame)

    delete_directory(frame_directory_path)
    return video_path


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Tab("Image"):
        image_processing_mode_dropdown_component = gr.Dropdown(
            choices=IMAGE_INFERENCE_MODES,
            value=IMAGE_INFERENCE_MODES[0],
            label="Mode",
            info="Select a mode to use.",
            interactive=True
        )
        with gr.Row():
            with gr.Column():
                image_processing_image_input_component = gr.Image(
                    type='pil', label='Upload image')
                image_processing_text_input_component = gr.Textbox(
                    label='Text prompt',
                    placeholder='Enter comma separated text prompts')
                image_processing_submit_button_component = gr.Button(
                    value='Submit', variant='primary')
            with gr.Column():
                image_processing_image_output_component = gr.Image(
                    type='pil', label='Image output')
                image_processing_text_output_component = gr.Textbox(
                    label='Caption output', visible=False)

        with gr.Row():
            gr.Examples(
                fn=process_image,
                examples=IMAGE_PROCESSING_EXAMPLES,
                inputs=[
                    image_processing_mode_dropdown_component,
                    image_processing_image_input_component,
                    image_processing_text_input_component
                ],
                outputs=[
                    image_processing_image_output_component,
                    image_processing_text_output_component
                ],
                run_on_click=True
            )
    with gr.Tab("Video"):
        video_processing_mode_dropdown_component = gr.Dropdown(
            choices=VIDEO_INFERENCE_MODES,
            value=VIDEO_INFERENCE_MODES[0],
            label="Mode",
            info="Select a mode to use.",
            interactive=True
        )
        with gr.Row():
            with gr.Column():
                video_processing_video_input_component = gr.Video(
                    label='Upload video')
                video_processing_text_input_component = gr.Textbox(
                    label='Text prompt',
                    placeholder='Enter comma separated text prompts')
                video_processing_submit_button_component = gr.Button(
                    value='Submit', variant='primary')
            with gr.Column():
                video_processing_video_output_component = gr.Video(
                    label='Video output')
        with gr.Row():
            gr.Examples(
                fn=process_video,
                examples=VIDEO_PROCESSING_EXAMPLES,
                inputs=[
                    video_processing_video_input_component,
                    video_processing_text_input_component
                ],
                outputs=video_processing_video_output_component,
                run_on_click=True
            )

    image_processing_submit_button_component.click(
        fn=process_image,
        inputs=[
            image_processing_mode_dropdown_component,
            image_processing_image_input_component,
            image_processing_text_input_component
        ],
        outputs=[
            image_processing_image_output_component,
            image_processing_text_output_component
        ]
    )
    image_processing_text_input_component.submit(
        fn=process_image,
        inputs=[
            image_processing_mode_dropdown_component,
            image_processing_image_input_component,
            image_processing_text_input_component
        ],
        outputs=[
            image_processing_image_output_component,
            image_processing_text_output_component
        ]
    )
    image_processing_mode_dropdown_component.change(
        on_mode_dropdown_change,
        inputs=[image_processing_mode_dropdown_component],
        outputs=[
            image_processing_text_input_component,
            image_processing_text_output_component
        ]
    )
    video_processing_submit_button_component.click(
        fn=process_video,
        inputs=[
            video_processing_video_input_component,
            video_processing_text_input_component
        ],
        outputs=video_processing_video_output_component
    )
    video_processing_text_input_component.submit(
        fn=process_video,
        inputs=[
            video_processing_video_input_component,
            video_processing_text_input_component
        ],
        outputs=video_processing_video_output_component
    )

demo.launch(debug=False, show_error=True, share=True)
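
The MARKDOWN blurb in app.py describes the two-stage pipeline: Florence2 first grounds the text prompt as boxes, then SAM2 refines those boxes into masks. A minimal standalone sketch of that same open-vocabulary flow, not part of this commit, is shown below; it assumes a CUDA device and reuses the repo's own utils helpers exactly as the process_image branch above does. The local file name "example.jpg" and the "dog" prompt are illustrative assumptions.

# Sketch only: the open-vocabulary branch of process_image, outside Gradio.
import torch
import supervision as sv
from PIL import Image

from utils.florence import (
    load_florence_model, run_florence_inference,
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
)
from utils.sam import load_sam_image_model, run_sam_inference

DEVICE = torch.device("cuda")
florence_model, florence_processor = load_florence_model(device=DEVICE)
sam_image_model = load_sam_image_model(device=DEVICE)

image = Image.open("example.jpg").convert("RGB")   # hypothetical input file
_, result = run_florence_inference(
    model=florence_model, processor=florence_processor, device=DEVICE,
    image=image, task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK, text="dog",
)
detections = sv.Detections.from_lmm(               # stage 1: boxes from Florence2
    lmm=sv.LMM.FLORENCE_2, result=result, resolution_wh=image.size,
)
detections = run_sam_inference(sam_image_model, image, detections)  # stage 2: SAM2 masks
print(len(detections), "objects detected; masks present:", detections.mask is not None)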
florence_sam/configs/__init__.py
ADDED
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
florence_sam/configs/sam2_hiera_b+.yaml
ADDED
@@ -0,0 +1,113 @@
# @package _global_

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 112
      num_heads: 2
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [896, 448, 224, 112]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
florence_sam/configs/sam2_hiera_l.yaml
ADDED
@@ -0,0 +1,117 @@
# @package _global_

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 144
      num_heads: 2
      stages: [2, 6, 36, 4]
      global_att_blocks: [23, 33, 43]
      window_pos_embed_bkg_spatial_size: [7, 7]
      window_spec: [8, 4, 16, 8]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [1152, 576, 288, 144]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
florence_sam/configs/sam2_hiera_s.yaml
ADDED
@@ -0,0 +1,116 @@
# @package _global_

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 96
      num_heads: 1
      stages: [1, 2, 11, 2]
      global_att_blocks: [7, 10, 13]
      window_pos_embed_bkg_spatial_size: [7, 7]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [768, 384, 192, 96]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  compile_image_encoder: False
florence_sam/configs/sam2_hiera_t.yaml
ADDED
@@ -0,0 +1,118 @@
# @package _global_

# Model
model:
  _target_: sam2.modeling.sam2_base.SAM2Base
  image_encoder:
    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
    scalp: 1
    trunk:
      _target_: sam2.modeling.backbones.hieradet.Hiera
      embed_dim: 96
      num_heads: 1
      stages: [1, 2, 7, 2]
      global_att_blocks: [5, 7, 9]
      window_pos_embed_bkg_spatial_size: [7, 7]
    neck:
      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
      position_encoding:
        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
        num_pos_feats: 256
        normalize: true
        scale: null
        temperature: 10000
      d_model: 256
      backbone_channel_list: [768, 384, 192, 96]
      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
      fpn_interp_model: nearest

  memory_attention:
    _target_: sam2.modeling.memory_attention.MemoryAttention
    d_model: 256
    pos_enc_at_input: true
    layer:
      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
      activation: relu
      dim_feedforward: 2048
      dropout: 0.1
      pos_enc_at_attn: false
      self_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
      d_model: 256
      pos_enc_at_cross_attn_keys: true
      pos_enc_at_cross_attn_queries: false
      cross_attention:
        _target_: sam2.modeling.sam.transformer.RoPEAttention
        rope_theta: 10000.0
        feat_sizes: [32, 32]
        rope_k_repeat: True
        embedding_dim: 256
        num_heads: 1
        downsample_rate: 1
        dropout: 0.1
        kv_in_dim: 64
    num_layers: 4

  memory_encoder:
    _target_: sam2.modeling.memory_encoder.MemoryEncoder
    out_dim: 64
    position_encoding:
      _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
      num_pos_feats: 64
      normalize: true
      scale: null
      temperature: 10000
    mask_downsampler:
      _target_: sam2.modeling.memory_encoder.MaskDownSampler
      kernel_size: 3
      stride: 2
      padding: 1
    fuser:
      _target_: sam2.modeling.memory_encoder.Fuser
      layer:
        _target_: sam2.modeling.memory_encoder.CXBlock
        dim: 256
        kernel_size: 7
        padding: 3
        layer_scale_init_value: 1e-6
        use_dwconv: True  # depth-wise convs
      num_layers: 2

  num_maskmem: 7
  image_size: 1024
  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
  # SAM decoder
  sigmoid_scale_for_mem_enc: 20.0
  sigmoid_bias_for_mem_enc: -10.0
  use_mask_input_as_output_without_sam: true
  # Memory
  directly_add_no_mem_embed: true
  # use high-resolution feature map in the SAM mask decoder
  use_high_res_features_in_sam: true
  # output 3 masks on the first click on initial conditioning frames
  multimask_output_in_sam: true
  # SAM heads
  iou_prediction_use_sigmoid: True
  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
  use_obj_ptrs_in_encoder: true
  add_tpos_enc_to_obj_ptrs: false
  only_obj_ptrs_in_the_past_for_eval: true
  # object occlusion prediction
  pred_obj_scores: true
  pred_obj_scores_mlp: true
  fixed_no_obj_ptr: true
  # multimask tracking settings
  multimask_output_for_tracking: true
  use_multimask_token_for_obj_ptr: true
  multimask_min_pt_num: 0
  multimask_max_pt_num: 1
  use_mlp_for_obj_ptr_proj: true
  # Compilation flag
  # HieraT does not currently support compilation, should always be set to False
  compile_image_encoder: False
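
These four YAML files are the standard Hydra-style SAM2 configs (base-plus, large, small, tiny); they differ only in the Hiera trunk settings (embed_dim, stages, global attention blocks) and the matching backbone_channel_list. The sketch below shows how a config like sam2_hiera_s.yaml is typically consumed through the upstream sam2 package; this Space wraps the equivalent calls inside utils/sam.py, whose contents are not shown in this diff, and the checkpoint path is an assumption for illustration.

# Sketch only, assuming the upstream `sam2` package and a downloaded checkpoint.
import torch
from sam2.build_sam import build_sam2
from sam2.sam2_image_predictor import SAM2ImagePredictor

sam2_model = build_sam2(
    "sam2_hiera_s.yaml",                # Hydra config name, resolved from configs/
    "checkpoints/sam2_hiera_small.pt",  # assumed checkpoint location
    device="cuda",
)
predictor = SAM2ImagePredictor(sam2_model)  # image-level predictor used for mask refinement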
florence_sam/detect_and_segment.py
ADDED
@@ -0,0 +1,151 @@
# detect_and_segment.py
import torch
import supervision as sv
from typing import List, Tuple, Optional

# ==== 1. One-time global model loading =====================================
from .utils.florence import (
    load_florence_model,
    run_florence_inference,
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
)
from .utils.sam import load_sam_image_model, run_sam_inference

from PIL import Image, ImageDraw, ImageColor
import numpy as np

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load models once – they stay in memory for repeated calls
FLORENCE_MODEL, FLORENCE_PROC = load_florence_model(device=DEVICE)
SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)

# quick annotators
COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700', '#32CD32', '#8A2BE2']
COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
BOX_ANNOTATOR = sv.BoxAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
LABEL_ANNOTATOR = sv.LabelAnnotator(
    color=COLOR_PALETTE,
    color_lookup=sv.ColorLookup.INDEX,
    text_position=sv.Position.CENTER_OF_MASS,
    text_color=sv.Color.from_hex("#000000"),
    border_radius=5,
)
MASK_ANNOTATOR = sv.MaskAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)

# ==== 2. Inference function ===============================================

@torch.inference_mode()
@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def detect_and_segment(
    image        : Image.Image,
    text_prompts : str | List[str],
    return_image : bool = True,
) -> Tuple[sv.Detections, Optional[Image.Image]]:
    """
    Run Florence-2 open-vocabulary detection + SAM2 mask refinement on a PIL image.

    Parameters
    ----------
    image : PIL.Image
        Input image in RGB.
    text_prompts : str | List[str]
        Single prompt or comma-separated list (e.g. "dog, tail, leash").
    return_image : bool
        If True, also returns an annotated PIL image.

    Returns
    -------
    detections : sv.Detections
        Supervision object with xyxy, mask, class_id, etc.
    annotated : PIL.Image | None
        Annotated image (None if return_image=False)
    """
    # Normalize prompt list
    if isinstance(text_prompts, str):
        prompts = [p.strip() for p in text_prompts.split(",") if p.strip()]
    else:
        prompts = [p.strip() for p in text_prompts]

    if len(prompts) == 0:
        raise ValueError("Empty prompt list given.")

    # Collect detections from each prompt
    det_list: list[sv.Detections] = []
    for p in prompts:
        _, result = run_florence_inference(
            model     = FLORENCE_MODEL,
            processor = FLORENCE_PROC,
            device    = DEVICE,
            image     = image,
            task      = FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
            text      = p,
        )
        det = sv.Detections.from_lmm(
            lmm           = sv.LMM.FLORENCE_2,
            result        = result,
            resolution_wh = image.size,
        )
        det = run_sam_inference(SAM_IMAGE_MODEL, image, det)  # SAM2 refinement
        det_list.append(det)

    detections = sv.Detections.merge(det_list)

    annotated_img = None
    if return_image:
        annotated_img = image.copy()
        annotated_img = MASK_ANNOTATOR.annotate(annotated_img, detections)
        annotated_img = BOX_ANNOTATOR.annotate(annotated_img, detections)
        annotated_img = LABEL_ANNOTATOR.annotate(annotated_img, detections)

    return detections, annotated_img


def fill_detected_bboxes(
    image: Image.Image,
    text: str,
    inflate_pct: float = 0.10,
    fill_color: str | tuple[int, int, int] = "#00FF00",
):
    """
    Detect objects matching `text`, inflate each bounding-box by `inflate_pct`,
    fill the area with `fill_color`, and return the resulting image.

    Parameters
    ----------
    image : PIL.Image
        Input image (RGB).
    text : str
        Comma-separated prompt(s) for open-vocabulary detection.
    inflate_pct : float, default 0.10
        Extra margin per side (0.10 = +10 % width & height).
    fill_color : str | tuple, default "#00FF00"
        Solid color used to fill each inflated bbox (hex or RGB tuple).

    Returns
    -------
    filled_img : PIL.Image
        Image with each detected (inflated) box filled.
    detections : sv.Detections
        Original detection object from `detect_and_segment`.
    """
    # run Florence2 + SAM2 pipeline (your helper from earlier)
    detections, _ = detect_and_segment(image, text)

    w, h = image.size
    filled_img = image.copy()
    draw = ImageDraw.Draw(filled_img)
    fill_rgb = ImageColor.getrgb(fill_color) if isinstance(fill_color, str) else fill_color

    for box in detections.xyxy:
        # xyxy is numpy array → cast to float for math
        x1, y1, x2, y2 = box.astype(float)
        dw, dh = (x2 - x1) * inflate_pct, (y2 - y1) * inflate_pct
        x1_i = max(0, x1 - dw)
        y1_i = max(0, y1 - dh)
        x2_i = min(w, x2 + dw)
        y2_i = min(h, y2 + dh)
        draw.rectangle([x1_i, y1_i, x2_i, y2_i], fill=fill_rgb)

    return filled_img, detections
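
The docstrings above describe the two helpers' inputs and outputs. A usage sketch, not part of the commit, follows; because the module uses package-relative imports, it assumes the repo root is on PYTHONPATH so florence_sam is importable, and the file name "room.jpg" and prompts are illustrative.

# Sketch only: driving the helpers defined in florence_sam/detect_and_segment.py.
from PIL import Image
from florence_sam.detect_and_segment import detect_and_segment, fill_detected_bboxes

img = Image.open("room.jpg").convert("RGB")          # hypothetical input image

# 1) Florence2 detection + SAM2 masks, plus an annotated preview image
detections, annotated = detect_and_segment(img, "sofa, coffee table")
annotated.save("room_annotated.png")

# 2) fill inflated bounding boxes with solid green (e.g. as a crude inpainting mask)
filled, dets = fill_detected_bboxes(img, "sofa", inflate_pct=0.05)
filled.save("room_bbox_filled.png")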
florence_sam/process_batch.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# furniture_bbox_to_files.py ────────────────────────────────────────
|
3 |
+
# Florence-2 + SAM-2 batch processor with retries *and* file-based images
|
4 |
+
# --------------------------------------------------------------------
|
5 |
+
import os, json, random, time
|
6 |
+
from pathlib import Path
|
7 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
8 |
+
from typing import List
|
9 |
+
|
10 |
+
import torch, supervision as sv
|
11 |
+
from PIL import Image, ImageDraw, ImageColor, ImageOps
|
12 |
+
from tqdm.auto import tqdm
|
13 |
+
from datasets import load_dataset, Image as HFImage, disable_progress_bar
|
14 |
+
|
15 |
+
# ───── global models ────────────────────────────────────────────────
|
16 |
+
from utils.florence import (
|
17 |
+
load_florence_model, run_florence_inference,
|
18 |
+
FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
|
19 |
+
)
|
20 |
+
from utils.sam import load_sam_image_model, run_sam_inference
|
21 |
+
|
22 |
+
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
23 |
+
FLORENCE_MODEL, FLORENCE_PROC = load_florence_model(device=DEVICE)
|
24 |
+
SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
|
25 |
+
|
26 |
+
# annotators
|
27 |
+
_PALETTE = sv.ColorPalette.from_hex(
|
28 |
+
['#FF1493','#00BFFF','#FF6347','#FFD700','#32CD32','#8A2BE2'])
|
29 |
+
BOX_ANN = sv.BoxAnnotator(color=_PALETTE, color_lookup=sv.ColorLookup.INDEX)
|
30 |
+
MASK_ANN = sv.MaskAnnotator(color=_PALETTE, color_lookup=sv.ColorLookup.INDEX)
|
31 |
+
LBL_ANN = sv.LabelAnnotator(
|
32 |
+
color=_PALETTE, color_lookup=sv.ColorLookup.INDEX,
|
33 |
+
text_position=sv.Position.CENTER_OF_MASS,
|
34 |
+
text_color=sv.Color.from_hex("#000"), border_radius=5)
|
35 |
+
|
36 |
+
# ───── config ───────────────────────────────────────────────────────
|
37 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
38 |
+
disable_progress_bar()
|
39 |
+
|
40 |
+
DATASET_NAME = "fotographerai/furniture_captioned_segment_prompt"
|
41 |
+
SPLIT = "train"
|
42 |
+
IMAGE_COL = "img2"
|
43 |
+
PROMPT_COL = "segmenting_prompt"
|
44 |
+
|
45 |
+
INFLATE_RANGE = (0.01, 0.05)
|
46 |
+
FILL_COLOR = "#00FF00"
|
47 |
+
TARGET_SIDE = 1500
|
48 |
+
|
49 |
+
QA_DIR = Path("bbox_review_recaptioned")
|
50 |
+
GREEN_DIR = QA_DIR / "green"; GREEN_DIR.mkdir(parents=True, exist_ok=True)
|
51 |
+
ANNO_DIR = QA_DIR / "anno"; ANNO_DIR.mkdir(parents=True, exist_ok=True)
|
52 |
+
JSON_DIR = QA_DIR / "json"; JSON_DIR.mkdir(parents=True, exist_ok=True)
|
53 |
+
|
54 |
+
MAX_WORKERS = 100
|
55 |
+
MAX_RETRIES = 5
|
56 |
+
RETRY_SLEEP = .3
|
57 |
+
FAILED_LOG = QA_DIR / "failed_rows.jsonl"

PROMPT_MAP: dict[str, str] = {}          # optional overrides

# ───── helpers ──────────────────────────────────────────────────────
def make_square(img: Image.Image, side: int = TARGET_SIDE) -> Image.Image:
    img = ImageOps.contain(img, (side, side))
    pad_w, pad_h = side - img.width, side - img.height
    return ImageOps.expand(img, border=(pad_w//2, pad_h//2,
                                        pad_w - pad_w//2, pad_h - pad_h//2),
                           fill=img.getpixel((0, 0)))

def img_to_file(img: Image.Image, fname: str, folder: Path) -> dict:
    path = folder / f"{fname}.png"
    if not path.exists():
        img.save(path)
    return {"path": str(path), "bytes": None}

# ───── core functions ───────────────────────────────────────────────
@torch.inference_mode()
@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
def detect_and_segment(img: Image.Image, prompts: str | List[str]) -> sv.Detections:
    if isinstance(prompts, str):
        prompts = [p.strip() for p in prompts.split(",") if p.strip()]
    all_dets = []
    for p in prompts:
        _, res = run_florence_inference(
            model=FLORENCE_MODEL, processor=FLORENCE_PROC, device=DEVICE,
            image=img, task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK, text=p)
        d = sv.Detections.from_lmm(sv.LMM.FLORENCE_2, res, img.size)
        all_dets.append(run_sam_inference(SAM_IMAGE_MODEL, img, d))
    return sv.Detections.merge(all_dets)

def fill_detected_bboxes(img: Image.Image, prompt: str,
                         inflate_pct: float) -> tuple[Image.Image, sv.Detections]:
    dets = detect_and_segment(img, prompt)
    filled = img.copy()
    draw = ImageDraw.Draw(filled)
    rgb = ImageColor.getrgb(FILL_COLOR)
    w, h = img.size
    for box in dets.xyxy:
        x1, y1, x2, y2 = box.astype(float)
        dw, dh = (x2 - x1) * inflate_pct, (y2 - y1) * inflate_pct
        draw.rectangle([max(0, x1 - dw), max(0, y1 - dh),
                        min(w, x2 + dw), min(h, y2 + dh)], fill=rgb)
    return filled, dets

# ───── threaded worker ──────────────────────────────────────────────
def process_row(idx: int, sample):
    prompt = PROMPT_MAP.get(sample[PROMPT_COL],
                            sample[PROMPT_COL].split(",", 1)[0].strip())
    img_sq = make_square(sample[IMAGE_COL].convert("RGB"))
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            filled, dets = fill_detected_bboxes(
                img_sq, prompt, inflate_pct=random.uniform(*INFLATE_RANGE))
            if len(dets.xyxy) == 0:
                raise ValueError("no detections")

            sid = f"{idx:06d}"
            json_p = JSON_DIR / f"{sid}_bbox.json"
            json_p.write_text(json.dumps({"xyxy": dets.xyxy.tolist()}))

            anno = img_sq.copy()
            for ann in (MASK_ANN, BOX_ANN, LABEL_ANN):
                anno = ann.annotate(anno, dets)

            return ("ok",
                    img_to_file(filled, sid, GREEN_DIR),
                    img_to_file(anno, sid, ANNO_DIR),
                    json_p.read_text())
        except Exception as e:
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_SLEEP)
            else:
                return ("fail", str(e))

# ───── run batch ────────────────────────────────────────────────────
ds = load_dataset(DATASET_NAME, split=SPLIT, streaming=False)
N = len(ds)
print("Rows:", N)

filled_col, anno_col, json_col = [None]*N, [None]*N, [None]*N
fails = 0

with ThreadPoolExecutor(MAX_WORKERS) as pool:
    fut2idx = {pool.submit(process_row, i, ds[i]): i for i in range(N)}
    for fut in tqdm(as_completed(fut2idx), total=N, desc="Florence+SAM"):
        idx = fut2idx[fut]
        status, *data = fut.result()
        if status == "ok":
            filled_col[idx], anno_col[idx], json_col[idx] = data
        else:
            fails += 1
            # append (not write_text) so earlier failures are not overwritten
            with FAILED_LOG.open("a") as f:
                f.write(json.dumps({"idx": idx, "reason": data[0]}) + "\n")

print(f"❌ permanently failed rows: {fails}")

keep = [i for i, x in enumerate(filled_col) if x]
new_ds = ds.select(keep)
new_ds = new_ds.add_column("bbox_filled", [filled_col[i] for i in keep])
new_ds = new_ds.add_column("annotated", [anno_col[i] for i in keep])
new_ds = new_ds.add_column("bbox_json", [json_col[i] for i in keep])
new_ds = new_ds.cast_column("bbox_filled", HFImage())
new_ds = new_ds.cast_column("annotated", HFImage())

print(f"✅ successes: {len(new_ds)} / {N}")
print("Columns:", new_ds.column_names)
print("QA artefacts →", QA_DIR.resolve())

# optional push
new_ds.push_to_hub("fotographerai/surround_furniture_bboxfilled",
                   private=True, max_shard_size="500MB")
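For quick verification downstream, a minimal read-back sketch; it assumes the push above succeeded, that the reader has access to the private repo, and that the column names match this script:

import json
from datasets import load_dataset

# load the pushed dataset and inspect one row
check = load_dataset("fotographerai/surround_furniture_bboxfilled", split="train")
row = check[0]
boxes = json.loads(row["bbox_json"])["xyxy"]        # list of [x1, y1, x2, y2]
print(len(boxes), "boxes; filled image size:", row["bbox_filled"].size)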
florence_sam/reassemble.py
ADDED
@@ -0,0 +1,133 @@
#!/usr/bin/env python
"""
reassemble_bbox_dataset_resume.py
---------------------------------
Incrementally rebuilds the `bbox_filled / annotated / bbox_json` columns from
QA artefacts and pushes the final dataset **privately** to the HF Hub.

• Safe to ^C / rerun (uses an on-disk Arrow cache)
• When NOTHING is left to process it *just* loads the cache and pushes.
• Uses path-only image columns (HFImage(decode=False)) to keep RAM usage tiny.
"""

import os, json
from pathlib import Path
from tqdm.auto import tqdm
from datasets import (
    load_dataset, load_from_disk, Dataset, disable_progress_bar, Features,
    Value, Image as HFImage
)
from PIL import Image
from huggingface_hub.utils import HfHubHTTPError

disable_progress_bar()

# ══════ CONFIG ══════════════════════════════════════════════════════
DATASET_NAME = "fotographerai/furniture_captioned_segment_prompt"
SPLIT        = "train"

QA_DIR    = Path("bbox_review_recaptioned")   # artefacts
CACHE_DIR = Path("rebuild_cache")             # incremental Arrow cache
CACHE_DIR.mkdir(exist_ok=True)

TARGET_SIDE = 1500
GREEN_RGB   = (0, 255, 0)

BATCH_SAVE = 500
HUB_REPO   = "fotographerai/furniture_bboxfilled_rebuild"
HF_TOKEN   = os.environ.get("HF_TOKEN", "").strip()   # needs write + private scope

# ══════ HELPERS ═════════════════════════════════════════════════════
def img_ref(p: Path) -> dict:          # path-only image dict
    return {"path": str(p), "bytes": None}

def make_green_png(p: Path):
    if not p.exists():
        Image.new("RGB", (TARGET_SIDE, TARGET_SIDE), GREEN_RGB).save(p)

def ensure_full_bbox(p: Path):
    if not p.exists():
        p.write_text(json.dumps({"xyxy": [[0, 0, TARGET_SIDE, TARGET_SIDE]]}))

# ══════ LOAD SOURCE DATASET ═════════════════════════════════════════
base_ds = load_dataset(DATASET_NAME, split=SPLIT, streaming=False)
N_TOTAL = len(base_ds)
print("Original rows:", N_TOTAL)

# ══════ LOAD OR INIT CACHE ══════════════════════════════════════════
if (CACHE_DIR / "dataset_info.json").exists():
    cache_ds = load_from_disk(CACHE_DIR)
    done     = set(cache_ds["__row_idx__"])
    print(f"Cache found → {len(done)} rows already processed.")
    records  = {k: list(v) for k, v in cache_ds.to_dict().items()}
else:
    done, records = set(), {"__row_idx__": [], "bbox_filled": [],
                            "annotated": [], "bbox_json": []}

missing = [i for i in range(N_TOTAL) if i not in done]
print("Rows still to process:", len(missing))

# ══════ NO WORK LEFT? push & exit ══════════════════════════════════
if not missing:
    print("💤 nothing new to process – pushing cached dataset…")
    try:
        url = cache_ds.push_to_hub(
            HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
        )
        print("🚀 dataset pushed to:", url)
    except HfHubHTTPError as e:
        print("❌ push failed:", e)
    exit(0)

# ══════ PROCESS MISSING ROWS ═══════════════════════════════════════
for n, i in enumerate(tqdm(missing, desc="Re-assembling")):
    g_png  = QA_DIR / f"{i:06d}_green.png"
    a_png  = QA_DIR / f"{i:06d}_anno.png"
    bbox_j = QA_DIR / f"{i:06d}_bbox.json"

    if not (g_png.exists() and a_png.exists() and bbox_j.exists()):
        # artefacts missing → fall back to a solid green placeholder and a full-frame bbox
        mask_png = QA_DIR / f"{i:06d}_mask.png"
        make_green_png(mask_png)
        g_png = a_png = mask_png
        ensure_full_bbox(bbox_j)

    row = base_ds[i]                       # copy original cols once
    records["__row_idx__"].append(i)
    for k, v in row.items():
        records.setdefault(k, []).append(v)

    records["bbox_filled"].append(img_ref(g_png))
    records["annotated"].append(img_ref(a_png))
    records["bbox_json"].append(bbox_j.read_text())

    if (n + 1) % BATCH_SAVE == 0:
        Dataset.from_dict(records).save_to_disk(CACHE_DIR)
        print(f"⏫ cached at {n+1}/{len(missing)}")

# ══════ FINAL DATASET FEATURES & SAVE ═══════════════════════════════
features = Features({
    "__row_idx__" : Value("int32"),
    "bbox_filled" : HFImage(decode=False),
    "annotated"   : HFImage(decode=False),
    "bbox_json"   : Value("string"),
    # original columns inferred below
})
for k in base_ds.features:
    if k not in features:
        features[k] = base_ds.features[k]

final_ds = Dataset.from_dict(records, features=features)
final_ds.save_to_disk(CACHE_DIR)
print("✅ cached dataset saved to", CACHE_DIR.resolve())

# ══════ PUSH PRIVATE ═══════════════════════════════════════════════
if not HF_TOKEN:
    print("⚠️  HF_TOKEN env-var not set – skipping push.")
else:
    try:
        url = final_ds.push_to_hub(
            HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
        )
        print("🚀 dataset pushed to:", url)
    except HfHubHTTPError as e:
        print("❌ push failed:", e)
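A small sketch of reading the rebuilt rows back from the local Arrow cache. Because the image features use decode=False, each cell is a path/bytes dict rather than a decoded PIL image; whether `bytes` is populated depends on how `datasets` serialised the cache, so both cases are handled:

import io, json
from datasets import load_from_disk
from PIL import Image

ds = load_from_disk("rebuild_cache")
cell = ds[0]["bbox_filled"]                     # {"path": ..., "bytes": ...}
img = (Image.open(io.BytesIO(cell["bytes"]))
       if cell.get("bytes") else Image.open(cell["path"]))
print(ds[0]["__row_idx__"], img.size, json.loads(ds[0]["bbox_json"])["xyxy"][:1])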
florence_sam/requirements.txt
ADDED
@@ -0,0 +1,10 @@
tqdm
einops
spaces
timm
transformers
samv2
gradio
supervision
opencv-python
pytest
florence_sam/utils/__init__.py
ADDED
File without changes

florence_sam/utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (141 Bytes).

florence_sam/utils/__pycache__/florence.cpython-310.pyc
ADDED
Binary file (2.29 kB).

florence_sam/utils/__pycache__/modes.cpython-310.pyc
ADDED
Binary file (450 Bytes).

florence_sam/utils/__pycache__/sam.cpython-310.pyc
ADDED
Binary file (1.46 kB).

florence_sam/utils/__pycache__/video.cpython-310.pyc
ADDED
Binary file (984 Bytes).
florence_sam/utils/florence.py
ADDED
@@ -0,0 +1,58 @@
import os
from typing import Union, Any, Tuple, Dict
from unittest.mock import patch

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
from transformers.dynamic_module_utils import get_imports

FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'


def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
    """Workaround for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
    if not str(filename).endswith("/modeling_florence2.py"):
        return get_imports(filename)
    imports = get_imports(filename)
    # drop the optional flash_attn import so the model loads without it installed
    if "flash_attn" in imports:
        imports.remove("flash_attn")
    return imports


def load_florence_model(
    device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
) -> Tuple[Any, Any]:
    with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
        model = AutoModelForCausalLM.from_pretrained(
            checkpoint, trust_remote_code=True).to(device).eval()
        processor = AutoProcessor.from_pretrained(
            checkpoint, trust_remote_code=True)
        return model, processor


def run_florence_inference(
    model: Any,
    processor: Any,
    device: torch.device,
    image: Image.Image,
    task: str,
    text: str = ""
) -> Tuple[str, Dict]:
    prompt = task + text
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False)[0]
    response = processor.post_process_generation(
        generated_text, task=task, image_size=image.size)
    return generated_text, response
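For orientation, a minimal usage sketch of these helpers; the import path assumes the repo root is on `sys.path`, and `example.jpg` plus the "sofa" prompt are placeholders:

import torch
from PIL import Image
from utils.florence import (load_florence_model, run_florence_inference,
                            FLORENCE_OPEN_VOCABULARY_DETECTION_TASK)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, processor = load_florence_model(device=DEVICE)

image = Image.open("example.jpg")               # placeholder image
_, result = run_florence_inference(
    model=model, processor=processor, device=DEVICE,
    image=image, task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK, text="sofa")
print(result)                                   # dict keyed by the task tag, with bboxes and labels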
florence_sam/utils/modes.py
ADDED
@@ -0,0 +1,13 @@
IMAGE_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + image masks"
IMAGE_CAPTION_GROUNDING_MASKS_MODE = "caption + grounding + image masks"

IMAGE_INFERENCE_MODES = [
    IMAGE_OPEN_VOCABULARY_DETECTION_MODE,
    IMAGE_CAPTION_GROUNDING_MASKS_MODE
]

VIDEO_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + video masks"

VIDEO_INFERENCE_MODES = [
    VIDEO_OPEN_VOCABULARY_DETECTION_MODE
]
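These constants are just the label strings the UI exposes; a sketch of how they could populate a Gradio selector (the widget layout here is illustrative, not taken from app.py):

import gradio as gr
from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE

with gr.Blocks() as demo:
    # dropdown whose choices are the image-mode strings defined above
    mode_dropdown = gr.Dropdown(
        choices=IMAGE_INFERENCE_MODES,
        value=IMAGE_OPEN_VOCABULARY_DETECTION_MODE,
        label="Inference mode",
    )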
florence_sam/utils/sam.py
ADDED
@@ -0,0 +1,45 @@
from typing import Any

import numpy as np
import supervision as sv
import torch
from PIL import Image
from sam2.build_sam import build_sam2, build_sam2_video_predictor
from sam2.sam2_image_predictor import SAM2ImagePredictor

# absolute path used on the author's machine; use "./checkpoints/sam2_hiera_small.pt" elsewhere
SAM_CHECKPOINT = "/home/comdoleger1/ZenCtrl/app/florence_sam/checkpoints/sam2_hiera_small.pt"
SAM_CONFIG = "sam2_hiera_s.yaml"


def load_sam_image_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> SAM2ImagePredictor:
    model = build_sam2(config, checkpoint, device=device)
    return SAM2ImagePredictor(sam_model=model)


def load_sam_video_model(
    device: torch.device,
    config: str = SAM_CONFIG,
    checkpoint: str = SAM_CHECKPOINT
) -> Any:
    return build_sam2_video_predictor(config, checkpoint, device=device)


def run_sam_inference(
    model: Any,
    image: Image.Image,
    detections: sv.Detections
) -> sv.Detections:
    image = np.array(image.convert("RGB"))
    model.set_image(image)
    mask, score, _ = model.predict(box=detections.xyxy, multimask_output=False)

    # dirty fix; remove this later: predict() returns (N, 1, H, W) for multiple boxes,
    # so squeeze the extra dimension before assigning the masks
    if len(mask.shape) == 4:
        mask = np.squeeze(mask)

    detections.mask = mask.astype(bool)
    return detections
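A minimal sketch wiring the image predictor to a set of boxes; the image path and box coordinates are made up for illustration, and in this repo the boxes would normally come from Florence-2 detections:

import numpy as np
import supervision as sv
import torch
from PIL import Image
from utils.sam import load_sam_image_model, run_sam_inference

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sam_predictor = load_sam_image_model(device=DEVICE)

image = Image.open("example.jpg")                                     # placeholder image
dets = sv.Detections(xyxy=np.array([[100.0, 120.0, 480.0, 560.0]]))   # illustrative xyxy box
dets = run_sam_inference(sam_predictor, image, dets)
print(dets.mask.shape)                                                # (num_boxes, H, W) boolean masks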
florence_sam/utils/video.py
ADDED
@@ -0,0 +1,26 @@
import datetime
import os
import shutil
import uuid


def create_directory(directory_path: str) -> None:
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)


def delete_directory(directory_path: str) -> None:
    if not os.path.exists(directory_path):
        raise FileNotFoundError(f"Directory '{directory_path}' does not exist.")

    try:
        shutil.rmtree(directory_path)
    except PermissionError:
        raise PermissionError(
            f"Permission denied: Unable to delete '{directory_path}'.")


def generate_unique_name():
    current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    unique_id = uuid.uuid4()
    return f"{current_datetime}_{unique_id}"
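And a tiny usage sketch for these filesystem helpers; the `tmp` scratch directory is illustrative:

import os
from utils.video import create_directory, delete_directory, generate_unique_name

scratch = "tmp"                                 # illustrative scratch root
create_directory(scratch)
run_dir = os.path.join(scratch, generate_unique_name())
create_directory(run_dir)
print("frames will be written to", run_dir)
# ...write intermediate frames here, then clean up the whole tree
delete_directory(scratch)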
|