salso committed
Commit 545e508 · verified · 1 parent: 952e6da

Upload 28 files

florence_sam/.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
florence_sam/.gitignore ADDED
@@ -0,0 +1,3 @@
+ /venv
+ /.idea
+ /tmp
florence_sam/.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
florence_sam/Florence.ipynb ADDED
The diff for this file is too large to render.
 
florence_sam/README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Florence2 + SAM2
+ emoji: 🔥
+ colorFrom: purple
+ colorTo: green
+ sdk: gradio
+ sdk_version: 4.40.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
florence_sam/__init__.py ADDED
File without changes
florence_sam/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (147 Bytes).
 
florence_sam/__pycache__/detect_and_segment.cpython-310.pyc ADDED
Binary file (4.43 kB).
 
florence_sam/app.py ADDED
@@ -0,0 +1,397 @@
1
+ import os
2
+ from typing import Tuple, Optional
3
+
4
+ import cv2
5
+ import gradio as gr
6
+ import numpy as np
7
+ #import spaces
8
+ import supervision as sv
9
+ import torch
10
+ from PIL import Image
11
+ from tqdm import tqdm
12
+ from utils.video import generate_unique_name, create_directory, delete_directory
13
+
14
+ from utils.florence import load_florence_model, run_florence_inference, \
15
+ FLORENCE_DETAILED_CAPTION_TASK, \
16
+ FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK, FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
17
+ from utils.modes import IMAGE_INFERENCE_MODES, IMAGE_OPEN_VOCABULARY_DETECTION_MODE, \
18
+ IMAGE_CAPTION_GROUNDING_MASKS_MODE, VIDEO_INFERENCE_MODES
19
+ from utils.sam import load_sam_image_model, run_sam_inference, load_sam_video_model
20
+
21
+ MARKDOWN = """
22
+ # Florence2 + SAM2 🔥
23
+
24
+ <div>
25
+ <a href="https://github.com/facebookresearch/segment-anything-2">
26
+ <img src="https://badges.aleen42.com/src/github.svg" alt="GitHub" style="display:inline-block;">
27
+ </a>
28
+ <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-segment-images-with-sam-2.ipynb">
29
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
30
+ </a>
31
+ <a href="https://blog.roboflow.com/what-is-segment-anything-2/">
32
+ <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
33
+ </a>
34
+ <a href="https://www.youtube.com/watch?v=Dv003fTyO-Y">
35
+ <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
36
+ </a>
37
+ </div>
38
+
39
+ This demo integrates Florence2 and SAM2 by creating a two-stage inference pipeline. In
40
+ the first stage, Florence2 performs tasks such as object detection, open-vocabulary
41
+ object detection, image captioning, or phrase grounding. In the second stage, SAM2
42
+ performs object segmentation on the image.
43
+ """
44
+
45
+ IMAGE_PROCESSING_EXAMPLES = [
46
+ [IMAGE_OPEN_VOCABULARY_DETECTION_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", 'straw, white napkin, black napkin, hair'],
47
+ [IMAGE_OPEN_VOCABULARY_DETECTION_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", 'tail'],
48
+ [IMAGE_CAPTION_GROUNDING_MASKS_MODE, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
49
+ [IMAGE_CAPTION_GROUNDING_MASKS_MODE, "https://media.roboflow.com/notebooks/examples/dog-3.jpeg", None],
50
+ ]
51
+ VIDEO_PROCESSING_EXAMPLES = [
52
+ ["videos/clip-07-camera-1.mp4", "player in white outfit, player in black outfit, ball, rim"],
53
+ ["videos/clip-07-camera-2.mp4", "player in white outfit, player in black outfit, ball, rim"],
54
+ ["videos/clip-07-camera-3.mp4", "player in white outfit, player in black outfit, ball, rim"]
55
+ ]
56
+
57
+ VIDEO_SCALE_FACTOR = 0.5
58
+ VIDEO_TARGET_DIRECTORY = "tmp"
59
+ create_directory(directory_path=VIDEO_TARGET_DIRECTORY)
60
+
61
+ DEVICE = torch.device("cuda")
62
+ # DEVICE = torch.device("cpu")
63
+
64
+ torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
65
+ if torch.cuda.get_device_properties(0).major >= 8:
66
+ torch.backends.cuda.matmul.allow_tf32 = True
67
+ torch.backends.cudnn.allow_tf32 = True
68
+
69
+
70
+ FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
71
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
72
+ SAM_VIDEO_MODEL = load_sam_video_model(device=DEVICE)
73
+ COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700', '#32CD32', '#8A2BE2']
74
+ COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
75
+ BOX_ANNOTATOR = sv.BoxAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
76
+ LABEL_ANNOTATOR = sv.LabelAnnotator(
77
+ color=COLOR_PALETTE,
78
+ color_lookup=sv.ColorLookup.INDEX,
79
+ text_position=sv.Position.CENTER_OF_MASS,
80
+ text_color=sv.Color.from_hex("#000000"),
81
+ border_radius=5
82
+ )
83
+ MASK_ANNOTATOR = sv.MaskAnnotator(
84
+ color=COLOR_PALETTE,
85
+ color_lookup=sv.ColorLookup.INDEX
86
+ )
87
+
88
+
89
+ def annotate_image(image, detections):
90
+ output_image = image.copy()
91
+ output_image = MASK_ANNOTATOR.annotate(output_image, detections)
92
+ output_image = BOX_ANNOTATOR.annotate(output_image, detections)
93
+ output_image = LABEL_ANNOTATOR.annotate(output_image, detections)
94
+ return output_image
95
+
96
+
97
+ def on_mode_dropdown_change(text):
98
+ return [
99
+ gr.Textbox(visible=text == IMAGE_OPEN_VOCABULARY_DETECTION_MODE),
100
+ gr.Textbox(visible=text == IMAGE_CAPTION_GROUNDING_MASKS_MODE),
101
+ ]
102
+
103
+
104
+ #@spaces.GPU
105
+ @torch.inference_mode()
106
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
107
+ def process_image(
108
+ mode_dropdown, image_input, text_input
109
+ ) -> Tuple[Optional[Image.Image], Optional[str]]:
110
+ if not image_input:
111
+ gr.Info("Please upload an image.")
112
+ return None, None
113
+
114
+ if mode_dropdown == IMAGE_OPEN_VOCABULARY_DETECTION_MODE:
115
+ if not text_input:
116
+ gr.Info("Please enter a text prompt.")
117
+ return None, None
118
+
119
+ texts = [prompt.strip() for prompt in text_input.split(",")]
120
+ detections_list = []
121
+ for text in texts:
122
+ _, result = run_florence_inference(
123
+ model=FLORENCE_MODEL,
124
+ processor=FLORENCE_PROCESSOR,
125
+ device=DEVICE,
126
+ image=image_input,
127
+ task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
128
+ text=text
129
+ )
130
+ detections = sv.Detections.from_lmm(
131
+ lmm=sv.LMM.FLORENCE_2,
132
+ result=result,
133
+ resolution_wh=image_input.size
134
+ )
135
+ detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
136
+ detections_list.append(detections)
137
+
138
+ detections = sv.Detections.merge(detections_list)
139
+ detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
140
+ return annotate_image(image_input, detections), None
141
+
142
+ if mode_dropdown == IMAGE_CAPTION_GROUNDING_MASKS_MODE:
143
+ _, result = run_florence_inference(
144
+ model=FLORENCE_MODEL,
145
+ processor=FLORENCE_PROCESSOR,
146
+ device=DEVICE,
147
+ image=image_input,
148
+ task=FLORENCE_DETAILED_CAPTION_TASK
149
+ )
150
+ caption = result[FLORENCE_DETAILED_CAPTION_TASK]
151
+ _, result = run_florence_inference(
152
+ model=FLORENCE_MODEL,
153
+ processor=FLORENCE_PROCESSOR,
154
+ device=DEVICE,
155
+ image=image_input,
156
+ task=FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK,
157
+ text=caption
158
+ )
159
+ detections = sv.Detections.from_lmm(
160
+ lmm=sv.LMM.FLORENCE_2,
161
+ result=result,
162
+ resolution_wh=image_input.size
163
+ )
164
+ detections = run_sam_inference(SAM_IMAGE_MODEL, image_input, detections)
165
+ return annotate_image(image_input, detections), caption
166
+
167
+
168
+ #@spaces.GPU(duration=300)
169
+ @torch.inference_mode()
170
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
171
+ def process_video(
172
+ video_input, text_input, progress=gr.Progress(track_tqdm=True)
173
+ ) -> Optional[str]:
174
+ if not video_input:
175
+ gr.Info("Please upload a video.")
176
+ return None
177
+
178
+ if not text_input:
179
+ gr.Info("Please enter a text prompt.")
180
+ return None
181
+
182
+ frame_generator = sv.get_video_frames_generator(video_input)
183
+ frame = next(frame_generator)
184
+ frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
185
+
186
+ texts = [prompt.strip() for prompt in text_input.split(",")]
187
+ detections_list = []
188
+ for text in texts:
189
+ _, result = run_florence_inference(
190
+ model=FLORENCE_MODEL,
191
+ processor=FLORENCE_PROCESSOR,
192
+ device=DEVICE,
193
+ image=frame,
194
+ task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
195
+ text=text
196
+ )
197
+ detections = sv.Detections.from_lmm(
198
+ lmm=sv.LMM.FLORENCE_2,
199
+ result=result,
200
+ resolution_wh=frame.size
201
+ )
202
+ detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)
203
+ detections_list.append(detections)
204
+
205
+ detections = sv.Detections.merge(detections_list)
206
+ detections = run_sam_inference(SAM_IMAGE_MODEL, frame, detections)
207
+
208
+ if len(detections.mask) == 0:
209
+ gr.Info(
210
+ "No objects of class {text_input} found in the first frame of the video. "
211
+ "Trim the video to make the object appear in the first frame or try a "
212
+ "different text prompt."
213
+ )
214
+ return None
215
+
216
+ name = generate_unique_name()
217
+ frame_directory_path = os.path.join(VIDEO_TARGET_DIRECTORY, name)
218
+ frames_sink = sv.ImageSink(
219
+ target_dir_path=frame_directory_path,
220
+ image_name_pattern="{:05d}.jpeg"
221
+ )
222
+
223
+ video_info = sv.VideoInfo.from_video_path(video_input)
224
+ video_info.width = int(video_info.width * VIDEO_SCALE_FACTOR)
225
+ video_info.height = int(video_info.height * VIDEO_SCALE_FACTOR)
226
+
227
+ frames_generator = sv.get_video_frames_generator(video_input)
228
+ with frames_sink:
229
+ for frame in tqdm(
230
+ frames_generator,
231
+ total=video_info.total_frames,
232
+ desc="splitting video into frames"
233
+ ):
234
+ frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
235
+ frames_sink.save_image(frame)
236
+
237
+ inference_state = SAM_VIDEO_MODEL.init_state(
238
+ video_path=frame_directory_path,
239
+ device=DEVICE
240
+ )
241
+
242
+ for mask_index, mask in enumerate(detections.mask):
243
+ _, object_ids, mask_logits = SAM_VIDEO_MODEL.add_new_mask(
244
+ inference_state=inference_state,
245
+ frame_idx=0,
246
+ obj_id=mask_index,
247
+ mask=mask
248
+ )
249
+
250
+ video_path = os.path.join(VIDEO_TARGET_DIRECTORY, f"{name}.mp4")
251
+ frames_generator = sv.get_video_frames_generator(video_input)
252
+ masks_generator = SAM_VIDEO_MODEL.propagate_in_video(inference_state)
253
+ with sv.VideoSink(video_path, video_info=video_info) as sink:
254
+ for frame, (_, tracker_ids, mask_logits) in zip(frames_generator, masks_generator):
255
+ frame = sv.scale_image(frame, VIDEO_SCALE_FACTOR)
256
+ masks = (mask_logits > 0.0).cpu().numpy().astype(bool)
257
+ if len(masks.shape) == 4:
258
+ masks = np.squeeze(masks, axis=1)
259
+
260
+ detections = sv.Detections(
261
+ xyxy=sv.mask_to_xyxy(masks=masks),
262
+ mask=masks,
263
+ class_id=np.array(tracker_ids)
264
+ )
265
+ annotated_frame = frame.copy()
266
+ annotated_frame = MASK_ANNOTATOR.annotate(
267
+ scene=annotated_frame, detections=detections)
268
+ annotated_frame = BOX_ANNOTATOR.annotate(
269
+ scene=annotated_frame, detections=detections)
270
+ sink.write_frame(annotated_frame)
271
+
272
+ delete_directory(frame_directory_path)
273
+ return video_path
274
+
275
+
276
+ with gr.Blocks() as demo:
277
+ gr.Markdown(MARKDOWN)
278
+ with gr.Tab("Image"):
279
+ image_processing_mode_dropdown_component = gr.Dropdown(
280
+ choices=IMAGE_INFERENCE_MODES,
281
+ value=IMAGE_INFERENCE_MODES[0],
282
+ label="Mode",
283
+ info="Select a mode to use.",
284
+ interactive=True
285
+ )
286
+ with gr.Row():
287
+ with gr.Column():
288
+ image_processing_image_input_component = gr.Image(
289
+ type='pil', label='Upload image')
290
+ image_processing_text_input_component = gr.Textbox(
291
+ label='Text prompt',
292
+ placeholder='Enter comma separated text prompts')
293
+ image_processing_submit_button_component = gr.Button(
294
+ value='Submit', variant='primary')
295
+ with gr.Column():
296
+ image_processing_image_output_component = gr.Image(
297
+ type='pil', label='Image output')
298
+ image_processing_text_output_component = gr.Textbox(
299
+ label='Caption output', visible=False)
300
+
301
+ with gr.Row():
302
+ gr.Examples(
303
+ fn=process_image,
304
+ examples=IMAGE_PROCESSING_EXAMPLES,
305
+ inputs=[
306
+ image_processing_mode_dropdown_component,
307
+ image_processing_image_input_component,
308
+ image_processing_text_input_component
309
+ ],
310
+ outputs=[
311
+ image_processing_image_output_component,
312
+ image_processing_text_output_component
313
+ ],
314
+ run_on_click=True
315
+ )
316
+ with gr.Tab("Video"):
317
+ video_processing_mode_dropdown_component = gr.Dropdown(
318
+ choices=VIDEO_INFERENCE_MODES,
319
+ value=VIDEO_INFERENCE_MODES[0],
320
+ label="Mode",
321
+ info="Select a mode to use.",
322
+ interactive=True
323
+ )
324
+ with gr.Row():
325
+ with gr.Column():
326
+ video_processing_video_input_component = gr.Video(
327
+ label='Upload video')
328
+ video_processing_text_input_component = gr.Textbox(
329
+ label='Text prompt',
330
+ placeholder='Enter comma separated text prompts')
331
+ video_processing_submit_button_component = gr.Button(
332
+ value='Submit', variant='primary')
333
+ with gr.Column():
334
+ video_processing_video_output_component = gr.Video(
335
+ label='Video output')
336
+ with gr.Row():
337
+ gr.Examples(
338
+ fn=process_video,
339
+ examples=VIDEO_PROCESSING_EXAMPLES,
340
+ inputs=[
341
+ video_processing_video_input_component,
342
+ video_processing_text_input_component
343
+ ],
344
+ outputs=video_processing_video_output_component,
345
+ run_on_click=True
346
+ )
347
+
348
+ image_processing_submit_button_component.click(
349
+ fn=process_image,
350
+ inputs=[
351
+ image_processing_mode_dropdown_component,
352
+ image_processing_image_input_component,
353
+ image_processing_text_input_component
354
+ ],
355
+ outputs=[
356
+ image_processing_image_output_component,
357
+ image_processing_text_output_component
358
+ ]
359
+ )
360
+ image_processing_text_input_component.submit(
361
+ fn=process_image,
362
+ inputs=[
363
+ image_processing_mode_dropdown_component,
364
+ image_processing_image_input_component,
365
+ image_processing_text_input_component
366
+ ],
367
+ outputs=[
368
+ image_processing_image_output_component,
369
+ image_processing_text_output_component
370
+ ]
371
+ )
372
+ image_processing_mode_dropdown_component.change(
373
+ on_mode_dropdown_change,
374
+ inputs=[image_processing_mode_dropdown_component],
375
+ outputs=[
376
+ image_processing_text_input_component,
377
+ image_processing_text_output_component
378
+ ]
379
+ )
380
+ video_processing_submit_button_component.click(
381
+ fn=process_video,
382
+ inputs=[
383
+ video_processing_video_input_component,
384
+ video_processing_text_input_component
385
+ ],
386
+ outputs=video_processing_video_output_component
387
+ )
388
+ video_processing_text_input_component.submit(
389
+ fn=process_video,
390
+ inputs=[
391
+ video_processing_video_input_component,
392
+ video_processing_text_input_component
393
+ ],
394
+ outputs=video_processing_video_output_component
395
+ )
396
+
397
+ demo.launch(debug=False, show_error=True, share=True)
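
Note: the two-stage flow described in the header markdown can also be run outside the Gradio UI. The sketch below is a minimal, hedged example (hypothetical image path and prompt; it assumes the same utils package, checkpoints, and a CUDA device as app.py):

import torch
import supervision as sv
from PIL import Image

from utils.florence import (
    load_florence_model,
    run_florence_inference,
    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
)
from utils.sam import load_sam_image_model, run_sam_inference

DEVICE = torch.device("cuda")
florence_model, florence_processor = load_florence_model(device=DEVICE)
sam_model = load_sam_image_model(device=DEVICE)

image = Image.open("example.jpg")  # hypothetical input image

# Stage 1: Florence-2 open-vocabulary detection for a single prompt.
_, result = run_florence_inference(
    model=florence_model,
    processor=florence_processor,
    device=DEVICE,
    image=image,
    task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
    text="dog",
)
detections = sv.Detections.from_lmm(
    lmm=sv.LMM.FLORENCE_2,
    result=result,
    resolution_wh=image.size,
)

# Stage 2: SAM2 refines the detected boxes into segmentation masks.
detections = run_sam_inference(sam_model, image, detections)
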
florence_sam/configs/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
florence_sam/configs/sam2_hiera_b+.yaml ADDED
@@ -0,0 +1,113 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 112
12
+ num_heads: 2
13
+ neck:
14
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
15
+ position_encoding:
16
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
17
+ num_pos_feats: 256
18
+ normalize: true
19
+ scale: null
20
+ temperature: 10000
21
+ d_model: 256
22
+ backbone_channel_list: [896, 448, 224, 112]
23
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
24
+ fpn_interp_model: nearest
25
+
26
+ memory_attention:
27
+ _target_: sam2.modeling.memory_attention.MemoryAttention
28
+ d_model: 256
29
+ pos_enc_at_input: true
30
+ layer:
31
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
32
+ activation: relu
33
+ dim_feedforward: 2048
34
+ dropout: 0.1
35
+ pos_enc_at_attn: false
36
+ self_attention:
37
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
38
+ rope_theta: 10000.0
39
+ feat_sizes: [32, 32]
40
+ embedding_dim: 256
41
+ num_heads: 1
42
+ downsample_rate: 1
43
+ dropout: 0.1
44
+ d_model: 256
45
+ pos_enc_at_cross_attn_keys: true
46
+ pos_enc_at_cross_attn_queries: false
47
+ cross_attention:
48
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
49
+ rope_theta: 10000.0
50
+ feat_sizes: [32, 32]
51
+ rope_k_repeat: True
52
+ embedding_dim: 256
53
+ num_heads: 1
54
+ downsample_rate: 1
55
+ dropout: 0.1
56
+ kv_in_dim: 64
57
+ num_layers: 4
58
+
59
+ memory_encoder:
60
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
61
+ out_dim: 64
62
+ position_encoding:
63
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
64
+ num_pos_feats: 64
65
+ normalize: true
66
+ scale: null
67
+ temperature: 10000
68
+ mask_downsampler:
69
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
70
+ kernel_size: 3
71
+ stride: 2
72
+ padding: 1
73
+ fuser:
74
+ _target_: sam2.modeling.memory_encoder.Fuser
75
+ layer:
76
+ _target_: sam2.modeling.memory_encoder.CXBlock
77
+ dim: 256
78
+ kernel_size: 7
79
+ padding: 3
80
+ layer_scale_init_value: 1e-6
81
+ use_dwconv: True # depth-wise convs
82
+ num_layers: 2
83
+
84
+ num_maskmem: 7
85
+ image_size: 1024
86
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
87
+ sigmoid_scale_for_mem_enc: 20.0
88
+ sigmoid_bias_for_mem_enc: -10.0
89
+ use_mask_input_as_output_without_sam: true
90
+ # Memory
91
+ directly_add_no_mem_embed: true
92
+ # use high-resolution feature map in the SAM mask decoder
93
+ use_high_res_features_in_sam: true
94
+ # output 3 masks on the first click on initial conditioning frames
95
+ multimask_output_in_sam: true
96
+ # SAM heads
97
+ iou_prediction_use_sigmoid: True
98
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
99
+ use_obj_ptrs_in_encoder: true
100
+ add_tpos_enc_to_obj_ptrs: false
101
+ only_obj_ptrs_in_the_past_for_eval: true
102
+ # object occlusion prediction
103
+ pred_obj_scores: true
104
+ pred_obj_scores_mlp: true
105
+ fixed_no_obj_ptr: true
106
+ # multimask tracking settings
107
+ multimask_output_for_tracking: true
108
+ use_multimask_token_for_obj_ptr: true
109
+ multimask_min_pt_num: 0
110
+ multimask_max_pt_num: 1
111
+ use_mlp_for_obj_ptr_proj: true
112
+ # Compilation flag
113
+ compile_image_encoder: False
florence_sam/configs/sam2_hiera_l.yaml ADDED
@@ -0,0 +1,117 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 144
12
+ num_heads: 2
13
+ stages: [2, 6, 36, 4]
14
+ global_att_blocks: [23, 33, 43]
15
+ window_pos_embed_bkg_spatial_size: [7, 7]
16
+ window_spec: [8, 4, 16, 8]
17
+ neck:
18
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
19
+ position_encoding:
20
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
21
+ num_pos_feats: 256
22
+ normalize: true
23
+ scale: null
24
+ temperature: 10000
25
+ d_model: 256
26
+ backbone_channel_list: [1152, 576, 288, 144]
27
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
28
+ fpn_interp_model: nearest
29
+
30
+ memory_attention:
31
+ _target_: sam2.modeling.memory_attention.MemoryAttention
32
+ d_model: 256
33
+ pos_enc_at_input: true
34
+ layer:
35
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
36
+ activation: relu
37
+ dim_feedforward: 2048
38
+ dropout: 0.1
39
+ pos_enc_at_attn: false
40
+ self_attention:
41
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
42
+ rope_theta: 10000.0
43
+ feat_sizes: [32, 32]
44
+ embedding_dim: 256
45
+ num_heads: 1
46
+ downsample_rate: 1
47
+ dropout: 0.1
48
+ d_model: 256
49
+ pos_enc_at_cross_attn_keys: true
50
+ pos_enc_at_cross_attn_queries: false
51
+ cross_attention:
52
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
53
+ rope_theta: 10000.0
54
+ feat_sizes: [32, 32]
55
+ rope_k_repeat: True
56
+ embedding_dim: 256
57
+ num_heads: 1
58
+ downsample_rate: 1
59
+ dropout: 0.1
60
+ kv_in_dim: 64
61
+ num_layers: 4
62
+
63
+ memory_encoder:
64
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
65
+ out_dim: 64
66
+ position_encoding:
67
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
68
+ num_pos_feats: 64
69
+ normalize: true
70
+ scale: null
71
+ temperature: 10000
72
+ mask_downsampler:
73
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
74
+ kernel_size: 3
75
+ stride: 2
76
+ padding: 1
77
+ fuser:
78
+ _target_: sam2.modeling.memory_encoder.Fuser
79
+ layer:
80
+ _target_: sam2.modeling.memory_encoder.CXBlock
81
+ dim: 256
82
+ kernel_size: 7
83
+ padding: 3
84
+ layer_scale_init_value: 1e-6
85
+ use_dwconv: True # depth-wise convs
86
+ num_layers: 2
87
+
88
+ num_maskmem: 7
89
+ image_size: 1024
90
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
91
+ sigmoid_scale_for_mem_enc: 20.0
92
+ sigmoid_bias_for_mem_enc: -10.0
93
+ use_mask_input_as_output_without_sam: true
94
+ # Memory
95
+ directly_add_no_mem_embed: true
96
+ # use high-resolution feature map in the SAM mask decoder
97
+ use_high_res_features_in_sam: true
98
+ # output 3 masks on the first click on initial conditioning frames
99
+ multimask_output_in_sam: true
100
+ # SAM heads
101
+ iou_prediction_use_sigmoid: True
102
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103
+ use_obj_ptrs_in_encoder: true
104
+ add_tpos_enc_to_obj_ptrs: false
105
+ only_obj_ptrs_in_the_past_for_eval: true
106
+ # object occlusion prediction
107
+ pred_obj_scores: true
108
+ pred_obj_scores_mlp: true
109
+ fixed_no_obj_ptr: true
110
+ # multimask tracking settings
111
+ multimask_output_for_tracking: true
112
+ use_multimask_token_for_obj_ptr: true
113
+ multimask_min_pt_num: 0
114
+ multimask_max_pt_num: 1
115
+ use_mlp_for_obj_ptr_proj: true
116
+ # Compilation flag
117
+ compile_image_encoder: False
florence_sam/configs/sam2_hiera_s.yaml ADDED
@@ -0,0 +1,116 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 96
12
+ num_heads: 1
13
+ stages: [1, 2, 11, 2]
14
+ global_att_blocks: [7, 10, 13]
15
+ window_pos_embed_bkg_spatial_size: [7, 7]
16
+ neck:
17
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
18
+ position_encoding:
19
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
20
+ num_pos_feats: 256
21
+ normalize: true
22
+ scale: null
23
+ temperature: 10000
24
+ d_model: 256
25
+ backbone_channel_list: [768, 384, 192, 96]
26
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
27
+ fpn_interp_model: nearest
28
+
29
+ memory_attention:
30
+ _target_: sam2.modeling.memory_attention.MemoryAttention
31
+ d_model: 256
32
+ pos_enc_at_input: true
33
+ layer:
34
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
35
+ activation: relu
36
+ dim_feedforward: 2048
37
+ dropout: 0.1
38
+ pos_enc_at_attn: false
39
+ self_attention:
40
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
41
+ rope_theta: 10000.0
42
+ feat_sizes: [32, 32]
43
+ embedding_dim: 256
44
+ num_heads: 1
45
+ downsample_rate: 1
46
+ dropout: 0.1
47
+ d_model: 256
48
+ pos_enc_at_cross_attn_keys: true
49
+ pos_enc_at_cross_attn_queries: false
50
+ cross_attention:
51
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
52
+ rope_theta: 10000.0
53
+ feat_sizes: [32, 32]
54
+ rope_k_repeat: True
55
+ embedding_dim: 256
56
+ num_heads: 1
57
+ downsample_rate: 1
58
+ dropout: 0.1
59
+ kv_in_dim: 64
60
+ num_layers: 4
61
+
62
+ memory_encoder:
63
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
64
+ out_dim: 64
65
+ position_encoding:
66
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
67
+ num_pos_feats: 64
68
+ normalize: true
69
+ scale: null
70
+ temperature: 10000
71
+ mask_downsampler:
72
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
73
+ kernel_size: 3
74
+ stride: 2
75
+ padding: 1
76
+ fuser:
77
+ _target_: sam2.modeling.memory_encoder.Fuser
78
+ layer:
79
+ _target_: sam2.modeling.memory_encoder.CXBlock
80
+ dim: 256
81
+ kernel_size: 7
82
+ padding: 3
83
+ layer_scale_init_value: 1e-6
84
+ use_dwconv: True # depth-wise convs
85
+ num_layers: 2
86
+
87
+ num_maskmem: 7
88
+ image_size: 1024
89
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
90
+ sigmoid_scale_for_mem_enc: 20.0
91
+ sigmoid_bias_for_mem_enc: -10.0
92
+ use_mask_input_as_output_without_sam: true
93
+ # Memory
94
+ directly_add_no_mem_embed: true
95
+ # use high-resolution feature map in the SAM mask decoder
96
+ use_high_res_features_in_sam: true
97
+ # output 3 masks on the first click on initial conditioning frames
98
+ multimask_output_in_sam: true
99
+ # SAM heads
100
+ iou_prediction_use_sigmoid: True
101
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
102
+ use_obj_ptrs_in_encoder: true
103
+ add_tpos_enc_to_obj_ptrs: false
104
+ only_obj_ptrs_in_the_past_for_eval: true
105
+ # object occlusion prediction
106
+ pred_obj_scores: true
107
+ pred_obj_scores_mlp: true
108
+ fixed_no_obj_ptr: true
109
+ # multimask tracking settings
110
+ multimask_output_for_tracking: true
111
+ use_multimask_token_for_obj_ptr: true
112
+ multimask_min_pt_num: 0
113
+ multimask_max_pt_num: 1
114
+ use_mlp_for_obj_ptr_proj: true
115
+ # Compilation flag
116
+ compile_image_encoder: False
florence_sam/configs/sam2_hiera_t.yaml ADDED
@@ -0,0 +1,118 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 96
12
+ num_heads: 1
13
+ stages: [1, 2, 7, 2]
14
+ global_att_blocks: [5, 7, 9]
15
+ window_pos_embed_bkg_spatial_size: [7, 7]
16
+ neck:
17
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
18
+ position_encoding:
19
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
20
+ num_pos_feats: 256
21
+ normalize: true
22
+ scale: null
23
+ temperature: 10000
24
+ d_model: 256
25
+ backbone_channel_list: [768, 384, 192, 96]
26
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
27
+ fpn_interp_model: nearest
28
+
29
+ memory_attention:
30
+ _target_: sam2.modeling.memory_attention.MemoryAttention
31
+ d_model: 256
32
+ pos_enc_at_input: true
33
+ layer:
34
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
35
+ activation: relu
36
+ dim_feedforward: 2048
37
+ dropout: 0.1
38
+ pos_enc_at_attn: false
39
+ self_attention:
40
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
41
+ rope_theta: 10000.0
42
+ feat_sizes: [32, 32]
43
+ embedding_dim: 256
44
+ num_heads: 1
45
+ downsample_rate: 1
46
+ dropout: 0.1
47
+ d_model: 256
48
+ pos_enc_at_cross_attn_keys: true
49
+ pos_enc_at_cross_attn_queries: false
50
+ cross_attention:
51
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
52
+ rope_theta: 10000.0
53
+ feat_sizes: [32, 32]
54
+ rope_k_repeat: True
55
+ embedding_dim: 256
56
+ num_heads: 1
57
+ downsample_rate: 1
58
+ dropout: 0.1
59
+ kv_in_dim: 64
60
+ num_layers: 4
61
+
62
+ memory_encoder:
63
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
64
+ out_dim: 64
65
+ position_encoding:
66
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
67
+ num_pos_feats: 64
68
+ normalize: true
69
+ scale: null
70
+ temperature: 10000
71
+ mask_downsampler:
72
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
73
+ kernel_size: 3
74
+ stride: 2
75
+ padding: 1
76
+ fuser:
77
+ _target_: sam2.modeling.memory_encoder.Fuser
78
+ layer:
79
+ _target_: sam2.modeling.memory_encoder.CXBlock
80
+ dim: 256
81
+ kernel_size: 7
82
+ padding: 3
83
+ layer_scale_init_value: 1e-6
84
+ use_dwconv: True # depth-wise convs
85
+ num_layers: 2
86
+
87
+ num_maskmem: 7
88
+ image_size: 1024
89
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
90
+ # SAM decoder
91
+ sigmoid_scale_for_mem_enc: 20.0
92
+ sigmoid_bias_for_mem_enc: -10.0
93
+ use_mask_input_as_output_without_sam: true
94
+ # Memory
95
+ directly_add_no_mem_embed: true
96
+ # use high-resolution feature map in the SAM mask decoder
97
+ use_high_res_features_in_sam: true
98
+ # output 3 masks on the first click on initial conditioning frames
99
+ multimask_output_in_sam: true
100
+ # SAM heads
101
+ iou_prediction_use_sigmoid: True
102
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103
+ use_obj_ptrs_in_encoder: true
104
+ add_tpos_enc_to_obj_ptrs: false
105
+ only_obj_ptrs_in_the_past_for_eval: true
106
+ # object occlusion prediction
107
+ pred_obj_scores: true
108
+ pred_obj_scores_mlp: true
109
+ fixed_no_obj_ptr: true
110
+ # multimask tracking settings
111
+ multimask_output_for_tracking: true
112
+ use_multimask_token_for_obj_ptr: true
113
+ multimask_min_pt_num: 0
114
+ multimask_max_pt_num: 1
115
+ use_mlp_for_obj_ptr_proj: true
116
+ # Compilation flag
117
+ # HieraT does not currently support compilation, should always be set to False
118
+ compile_image_encoder: False
florence_sam/detect_and_segment.py ADDED
@@ -0,0 +1,151 @@
1
+ # detect_and_segment.py
2
+ import torch
3
+ import supervision as sv
4
+ from typing import List, Tuple, Optional
5
+
6
+ # ==== 1. One-time global model loading =====================================
7
+ from .utils.florence import (
8
+ load_florence_model,
9
+ run_florence_inference,
10
+ FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
11
+ )
12
+ from .utils.sam import load_sam_image_model, run_sam_inference
13
+
14
+ from PIL import Image, ImageDraw, ImageColor
15
+ import numpy as np
16
+
17
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
+
19
+ # load models once – they stay in memory for repeated calls
20
+ FLORENCE_MODEL, FLORENCE_PROC = load_florence_model(device=DEVICE)
21
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
22
+
23
+ # quick annotators
24
+ COLORS = ['#FF1493', '#00BFFF', '#FF6347', '#FFD700', '#32CD32', '#8A2BE2']
25
+ COLOR_PALETTE = sv.ColorPalette.from_hex(COLORS)
26
+ BOX_ANNOTATOR = sv.BoxAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
27
+ LABEL_ANNOTATOR = sv.LabelAnnotator(
28
+ color=COLOR_PALETTE,
29
+ color_lookup=sv.ColorLookup.INDEX,
30
+ text_position=sv.Position.CENTER_OF_MASS,
31
+ text_color=sv.Color.from_hex("#000000"),
32
+ border_radius=5,
33
+ )
34
+ MASK_ANNOTATOR = sv.MaskAnnotator(color=COLOR_PALETTE, color_lookup=sv.ColorLookup.INDEX)
35
+
36
+ # ==== 2. Inference function ===============================================
37
+
38
+ @torch.inference_mode()
39
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
40
+ def detect_and_segment(
41
+ image : Image.Image,
42
+ text_prompts : str | List[str],
43
+ return_image : bool = True,
44
+ ) -> Tuple[sv.Detections, Optional[Image.Image]]:
45
+ """
46
+ Run Florence-2 open-vocabulary detection + SAM2 mask refinement on a PIL image.
47
+
48
+ Parameters
49
+ ----------
50
+ image : PIL.Image
51
+ Input image in RGB.
52
+ text_prompts : str | List[str]
53
+ Single prompt or comma-separated list (e.g. "dog, tail, leash").
54
+ return_image : bool
55
+ If True, also returns an annotated PIL image.
56
+
57
+ Returns
58
+ -------
59
+ detections : sv.Detections
60
+ Supervision object with xyxy, mask, class_id, etc.
61
+ annotated : PIL.Image | None
62
+ Annotated image (None if return_image=False)
63
+ """
64
+ # Normalize prompt list
65
+ if isinstance(text_prompts, str):
66
+ prompts = [p.strip() for p in text_prompts.split(",") if p.strip()]
67
+ else:
68
+ prompts = [p.strip() for p in text_prompts]
69
+
70
+ if len(prompts) == 0:
71
+ raise ValueError("Empty prompt list given.")
72
+
73
+ # Collect detections from each prompt
74
+ det_list: list[sv.Detections] = []
75
+ for p in prompts:
76
+ _, result = run_florence_inference(
77
+ model = FLORENCE_MODEL,
78
+ processor = FLORENCE_PROC,
79
+ device = DEVICE,
80
+ image = image,
81
+ task = FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
82
+ text = p,
83
+ )
84
+ det = sv.Detections.from_lmm(
85
+ lmm = sv.LMM.FLORENCE_2,
86
+ result = result,
87
+ resolution_wh = image.size,
88
+ )
89
+ det = run_sam_inference(SAM_IMAGE_MODEL, image, det) # SAM2 refinement
90
+ det_list.append(det)
91
+
92
+ detections = sv.Detections.merge(det_list)
93
+
94
+ annotated_img = None
95
+ if return_image:
96
+ annotated_img = image.copy()
97
+ annotated_img = MASK_ANNOTATOR.annotate(annotated_img, detections)
98
+ annotated_img = BOX_ANNOTATOR.annotate(annotated_img, detections)
99
+ annotated_img = LABEL_ANNOTATOR.annotate(annotated_img, detections)
100
+
101
+ return detections, annotated_img
102
+
103
+
104
+
105
+ def fill_detected_bboxes(
106
+ image: Image.Image,
107
+ text: str,
108
+ inflate_pct: float = 0.10,
109
+ fill_color: str | tuple[int, int, int] = "#00FF00",
110
+ ):
111
+ """
112
+ Detect objects matching `text`, inflate each bounding-box by `inflate_pct`,
113
+ fill the area with `fill_color`, and return the resulting image.
114
+
115
+ Parameters
116
+ ----------
117
+ image : PIL.Image
118
+ Input image (RGB).
119
+ text : str
120
+ Comma-separated prompt(s) for open-vocabulary detection.
121
+ inflate_pct : float, default 0.10
122
+ Extra margin per side (0.10 = +10 % width & height).
123
+ fill_color : str | tuple, default "#00FF00"
124
+ Solid color used to fill each inflated bbox (hex or RGB tuple).
125
+
126
+ Returns
127
+ -------
128
+ filled_img : PIL.Image
129
+ Image with each detected (inflated) box filled.
130
+ detections : sv.Detections
131
+ Original detection object from `detect_and_segment`.
132
+ """
133
+ # run Florence2 + SAM2 pipeline (your helper from earlier)
134
+ detections, _ = detect_and_segment(image, text)
135
+
136
+ w, h = image.size
137
+ filled_img = image.copy()
138
+ draw = ImageDraw.Draw(filled_img)
139
+ fill_rgb = ImageColor.getrgb(fill_color) if isinstance(fill_color, str) else fill_color
140
+
141
+ for box in detections.xyxy:
142
+ # xyxy is numpy array → cast to float for math
143
+ x1, y1, x2, y2 = box.astype(float)
144
+ dw, dh = (x2 - x1) * inflate_pct, (y2 - y1) * inflate_pct
145
+ x1_i = max(0, x1 - dw)
146
+ y1_i = max(0, y1 - dh)
147
+ x2_i = min(w, x2 + dw)
148
+ y2_i = min(h, y2 + dh)
149
+ draw.rectangle([x1_i, y1_i, x2_i, y2_i], fill=fill_rgb)
150
+
151
+ return filled_img, detections
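
A short usage sketch for the two helpers above (hypothetical file names and prompts; it assumes the package is importable as florence_sam, matching the relative imports at the top of the file):

from PIL import Image

from florence_sam.detect_and_segment import detect_and_segment, fill_detected_bboxes

image = Image.open("room.jpg").convert("RGB")  # hypothetical input image

# Detect + segment, and get an annotated preview back.
detections, annotated = detect_and_segment(image, "sofa, coffee table")
print(len(detections.xyxy), "objects detected")

# Fill each (slightly inflated) detected box with a solid color.
filled, _ = fill_detected_bboxes(image, "sofa", inflate_pct=0.05)
filled.save("sofa_filled.png")
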
florence_sam/process_batch.py ADDED
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env python
2
+ # furniture_bbox_to_files.py ────────────────────────────────────────
3
+ # Florence-2 + SAM-2 batch processor with retries *and* file-based images
4
+ # --------------------------------------------------------------------
5
+ import os, json, random, time
6
+ from pathlib import Path
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
+ from typing import List
9
+
10
+ import torch, supervision as sv
11
+ from PIL import Image, ImageDraw, ImageColor, ImageOps
12
+ from tqdm.auto import tqdm
13
+ from datasets import load_dataset, Image as HFImage, disable_progress_bar
14
+
15
+ # ───── global models ────────────────────────────────────────────────
16
+ from utils.florence import (
17
+ load_florence_model, run_florence_inference,
18
+ FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
19
+ )
20
+ from utils.sam import load_sam_image_model, run_sam_inference
21
+
22
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
+ FLORENCE_MODEL, FLORENCE_PROC = load_florence_model(device=DEVICE)
24
+ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
25
+
26
+ # annotators
27
+ _PALETTE = sv.ColorPalette.from_hex(
28
+ ['#FF1493','#00BFFF','#FF6347','#FFD700','#32CD32','#8A2BE2'])
29
+ BOX_ANN = sv.BoxAnnotator(color=_PALETTE, color_lookup=sv.ColorLookup.INDEX)
30
+ MASK_ANN = sv.MaskAnnotator(color=_PALETTE, color_lookup=sv.ColorLookup.INDEX)
31
+ LBL_ANN = sv.LabelAnnotator(
32
+ color=_PALETTE, color_lookup=sv.ColorLookup.INDEX,
33
+ text_position=sv.Position.CENTER_OF_MASS,
34
+ text_color=sv.Color.from_hex("#000"), border_radius=5)
35
+
36
+ # ───── config ───────────────────────────────────────────────────────
37
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
38
+ disable_progress_bar()
39
+
40
+ DATASET_NAME = "fotographerai/furniture_captioned_segment_prompt"
41
+ SPLIT = "train"
42
+ IMAGE_COL = "img2"
43
+ PROMPT_COL = "segmenting_prompt"
44
+
45
+ INFLATE_RANGE = (0.01, 0.05)
46
+ FILL_COLOR = "#00FF00"
47
+ TARGET_SIDE = 1500
48
+
49
+ QA_DIR = Path("bbox_review_recaptioned")
50
+ GREEN_DIR = QA_DIR / "green"; GREEN_DIR.mkdir(parents=True, exist_ok=True)
51
+ ANNO_DIR = QA_DIR / "anno"; ANNO_DIR.mkdir(parents=True, exist_ok=True)
52
+ JSON_DIR = QA_DIR / "json"; JSON_DIR.mkdir(parents=True, exist_ok=True)
53
+
54
+ MAX_WORKERS = 100
55
+ MAX_RETRIES = 5
56
+ RETRY_SLEEP = .3
57
+ FAILED_LOG = QA_DIR / "failed_rows.jsonl"
58
+
59
+ PROMPT_MAP: dict[str,str] = {} # optional overrides
60
+
61
+ # ───── helpers ──────────────────────────────────────────────────────
62
+ def make_square(img: Image.Image, side: int = TARGET_SIDE) -> Image.Image:
63
+ img = ImageOps.contain(img, (side, side))
64
+ pad_w, pad_h = side - img.width, side - img.height
65
+ return ImageOps.expand(img, border=(pad_w//2, pad_h//2,
66
+ pad_w - pad_w//2, pad_h - pad_h//2),
67
+ fill=img.getpixel((0,0)))
68
+
69
+ def img_to_file(img: Image.Image, fname: str, folder: Path) -> dict:
70
+ path = folder / f"{fname}.png"
71
+ if not path.exists():
72
+ img.save(path)
73
+ return {"path": str(path), "bytes": None}
74
+
75
+ # ───── core functions ───────────────────────────────────────────────
76
+ @torch.inference_mode()
77
+ @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
78
+ def detect_and_segment(img: Image.Image, prompts: str|List[str]) -> sv.Detections:
79
+ if isinstance(prompts, str):
80
+ prompts = [p.strip() for p in prompts.split(",") if p.strip()]
81
+ all_dets = []
82
+ for p in prompts:
83
+ _, res = run_florence_inference(
84
+ model=FLORENCE_MODEL, processor=FLORENCE_PROC, device=DEVICE,
85
+ image=img, task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK, text=p)
86
+ d = sv.Detections.from_lmm(sv.LMM.FLORENCE_2, res, img.size)
87
+ all_dets.append(run_sam_inference(SAM_IMAGE_MODEL, img, d))
88
+ return sv.Detections.merge(all_dets)
89
+
90
+ def fill_detected_bboxes(img: Image.Image, prompt: str,
91
+ inflate_pct: float) -> tuple[Image.Image, sv.Detections]:
92
+ dets = detect_and_segment(img, prompt)
93
+ filled = img.copy()
94
+ draw = ImageDraw.Draw(filled)
95
+ rgb = ImageColor.getrgb(FILL_COLOR)
96
+ w,h = img.size
97
+ for box in dets.xyxy:
98
+ x1,y1,x2,y2 = box.astype(float)
99
+ dw,dh = (x2-x1)*inflate_pct, (y2-y1)*inflate_pct
100
+ draw.rectangle([max(0,x1-dw), max(0,y1-dh),
101
+ min(w,x2+dw), min(h,y2+dh)], fill=rgb)
102
+ return filled, dets
103
+
104
+ # ───── threaded worker ──────────────────────────────────────────────
105
+ def process_row(idx: int, sample):
106
+ prompt = PROMPT_MAP.get(sample[PROMPT_COL],
107
+ sample[PROMPT_COL].split(",",1)[0].strip())
108
+ img_sq = make_square(sample[IMAGE_COL].convert("RGB"))
109
+ for attempt in range(1, MAX_RETRIES+1):
110
+ try:
111
+ filled, dets = fill_detected_bboxes(
112
+ img_sq, prompt, inflate_pct=random.uniform(*INFLATE_RANGE))
113
+ if len(dets.xyxy) == 0:
114
+ raise ValueError("no detections")
115
+
116
+ sid = f"{idx:06d}"
117
+ json_p = JSON_DIR / f"{sid}_bbox.json"
118
+ json_p.write_text(json.dumps({"xyxy": dets.xyxy.tolist()}))
119
+
120
+ anno = img_sq.copy()
121
+ for ann in (MASK_ANN, BOX_ANN, LBL_ANN):
122
+ anno = ann.annotate(anno, dets)
123
+
124
+ return ("ok",
125
+ img_to_file(filled, sid, GREEN_DIR),
126
+ img_to_file(anno, sid, ANNO_DIR),
127
+ json_p.read_text())
128
+ except Exception as e:
129
+ if attempt < MAX_RETRIES:
130
+ time.sleep(RETRY_SLEEP)
131
+ else:
132
+ return ("fail", str(e))
133
+
134
+ # ───── run batch ────────────────────────────────────────────────────
135
+ ds = load_dataset(DATASET_NAME, split=SPLIT, streaming=False)
136
+ N = len(ds)
137
+ print("Rows:", N)
138
+
139
+ filled_col, anno_col, json_col = [None]*N, [None]*N, [None]*N
140
+ fails = 0
141
+
142
+ with ThreadPoolExecutor(MAX_WORKERS) as pool:
143
+ fut2idx = {pool.submit(process_row, i, ds[i]): i for i in range(N)}
144
+ for fut in tqdm(as_completed(fut2idx), total=N, desc="Florence+SAM"):
145
+ idx = fut2idx[fut]
146
+ status, *data = fut.result()
147
+ if status == "ok":
148
+ filled_col[idx], anno_col[idx], json_col[idx] = data
149
+ else:
150
+ fails += 1
151
+ with FAILED_LOG.open("a") as fh:
+     fh.write(json.dumps({"idx": idx, "reason": data[0]}) + "\n")
152
+
153
+ print(f"❌ permanently failed rows: {fails}")
154
+
155
+ keep = [i for i,x in enumerate(filled_col) if x]
156
+ new_ds = ds.select(keep)
157
+ new_ds = new_ds.add_column("bbox_filled", [filled_col[i] for i in keep])
158
+ new_ds = new_ds.add_column("annotated", [anno_col[i] for i in keep])
159
+ new_ds = new_ds.add_column("bbox_json", [json_col[i] for i in keep])
160
+ new_ds = new_ds.cast_column("bbox_filled", HFImage())
161
+ new_ds = new_ds.cast_column("annotated", HFImage())
162
+
163
+ print(f"✅ successes: {len(new_ds)} / {N}")
164
+ print("Columns:", new_ds.column_names)
165
+ print("QA artefacts →", QA_DIR.resolve())
166
+
167
+ # optional push
168
+ new_ds.push_to_hub("fotographerai/surround_furniture_bboxfilled",
169
+ private=True, max_shard_size="500MB")
florence_sam/reassemble.py ADDED
@@ -0,0 +1,133 @@
1
+ #!/usr/bin/env python
2
+ """
3
+ reassemble_bbox_dataset_resume.py
4
+ ---------------------------------
5
+ Incrementally rebuilds `bbox_filled / annotated / bbox_json` columns from
6
+ QA artefacts and pushes the final dataset **privately** to HF Hub.
7
+
8
+ • Safe to ^C / rerun (uses on-disk Arrow cache)
9
+ • When NOTHING is left to process it *just* loads the cache and pushes.
10
+ • Uses path-only image columns (HFImage(decode=False)) to keep RAM tiny.
11
+ """
12
+
13
+ import os, json
14
+ from pathlib import Path
15
+ from tqdm.auto import tqdm
16
+ from datasets import (
17
+ load_dataset, load_from_disk, Dataset, disable_progress_bar, Features,
18
+ Value, Image as HFImage
19
+ )
20
+ from PIL import Image
21
+ from huggingface_hub.utils import HfHubHTTPError
22
+
23
+ disable_progress_bar()
24
+
25
+ # ══════ CONFIG ══════════════════════════════════════════════════════
26
+ DATASET_NAME = "fotographerai/furniture_captioned_segment_prompt"
27
+ SPLIT = "train"
28
+
29
+ QA_DIR = Path("bbox_review_recaptioned") # artefacts
30
+ CACHE_DIR = Path("rebuild_cache") # incremental Arrow cache
31
+ CACHE_DIR.mkdir(exist_ok=True)
32
+
33
+ TARGET_SIDE = 1500
34
+ GREEN_RGB = (0, 255, 0)
35
+
36
+ BATCH_SAVE = 500
37
+ HUB_REPO = "fotographerai/furniture_bboxfilled_rebuild"
38
+ HF_TOKEN = os.environ.get("HF_TOKEN", "").strip() # needs write+private
39
+
40
+ # ══════ HELPERS ═════════════════════════════════════════════════════
41
+ def img_ref(p: Path) -> dict: # path-only image dict
42
+ return {"path": str(p), "bytes": None}
43
+
44
+ def make_green_png(p: Path):
45
+ if not p.exists():
46
+ Image.new("RGB", (TARGET_SIDE, TARGET_SIDE), GREEN_RGB).save(p)
47
+
48
+ def ensure_full_bbox(p: Path):
49
+ if not p.exists():
50
+ p.write_text(json.dumps({"xyxy": [[0, 0, TARGET_SIDE, TARGET_SIDE]]}))
51
+
52
+ # ══════ LOAD SOURCE DATASET ═════════════════════════════════════════
53
+ base_ds = load_dataset(DATASET_NAME, split=SPLIT, streaming=False)
54
+ N_TOTAL = len(base_ds)
55
+ print("Original rows:", N_TOTAL)
56
+
57
+ # ══════ LOAD OR INIT CACHE ══════════════════════════════════════════
58
+ if (CACHE_DIR / "dataset_info.json").exists():
59
+ cache_ds = load_from_disk(CACHE_DIR)
60
+ done = set(cache_ds["__row_idx__"])
61
+ print(f"Cache found → {len(done)} rows already processed.")
62
+ records = {k: list(v) for k, v in cache_ds.to_dict().items()}
63
+ else:
64
+ done, records = set(), {"__row_idx__": [], "bbox_filled": [],
65
+ "annotated": [], "bbox_json": []}
66
+
67
+ missing = [i for i in range(N_TOTAL) if i not in done]
68
+ print("Rows still to process:", len(missing))
69
+
70
+ # ══════ NO WORK LEFT? push & exit ══════════════════════════════════
71
+ if not missing:
72
+ print("💤 nothing new to process – pushing cached dataset…")
73
+ try:
74
+ url = cache_ds.push_to_hub(
75
+ HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
76
+ )
77
+ print("🚀 dataset pushed to:", url)
78
+ except HfHubHTTPError as e:
79
+ print("❌ push failed:", e)
80
+ exit(0)
81
+
82
+ # ══════ PROCESS MISSING ROWS ═══════════════════════════════════════
83
+ for n, i in enumerate(tqdm(missing, desc="Re-assembling")):
84
+ g_png = QA_DIR / f"{i:06d}_green.png"
85
+ a_png = QA_DIR / f"{i:06d}_anno.png"
86
+ bbox_j = QA_DIR / f"{i:06d}_bbox.json"
87
+
88
+ if not (g_png.exists() and a_png.exists() and bbox_j.exists()):
89
+ mask_png = QA_DIR / f"{i:06d}_mask.png"
90
+ make_green_png(mask_png)
91
+ g_png = a_png = mask_png
92
+ ensure_full_bbox(bbox_j)
93
+
94
+ row = base_ds[i] # copy original cols once
95
+ records["__row_idx__"].append(i)
96
+ for k, v in row.items():
97
+ records.setdefault(k, []).append(v)
98
+
99
+ records["bbox_filled"].append(img_ref(g_png))
100
+ records["annotated"].append(img_ref(a_png))
101
+ records["bbox_json"].append(bbox_j.read_text())
102
+
103
+ if (n + 1) % BATCH_SAVE == 0:
104
+ Dataset.from_dict(records).save_to_disk(CACHE_DIR)
105
+ print(f"⏫ cached at {n+1}/{len(missing)}")
106
+
107
+ # ══════ FINAL DATASET FEATURES & SAVE ═══════════════════════════════
108
+ features = Features({
109
+ "__row_idx__" : Value("int32"),
110
+ "bbox_filled" : HFImage(decode=False),
111
+ "annotated" : HFImage(decode=False),
112
+ "bbox_json" : Value("string"),
113
+ # original columns inferred below
114
+ })
115
+ for k in base_ds.features:
116
+ if k not in features:
117
+ features[k] = base_ds.features[k]
118
+
119
+ final_ds = Dataset.from_dict(records, features=features)
120
+ final_ds.save_to_disk(CACHE_DIR)
121
+ print("✅ cached dataset saved to", CACHE_DIR.resolve())
122
+
123
+ # ══════ PUSH PRIVATE ═══════════════════════════════════════════════
124
+ if not HF_TOKEN:
125
+ print("⚠️ HF_TOKEN env-var not set – skipping push.")
126
+ else:
127
+ try:
128
+ url = final_ds.push_to_hub(
129
+ HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
130
+ )
131
+ print("🚀 dataset pushed to:", url)
132
+ except HfHubHTTPError as e:
133
+ print("❌ push failed:", e)
florence_sam/requirements.txt ADDED
@@ -0,0 +1,10 @@
+ tqdm
+ einops
+ spaces
+ timm
+ transformers
+ samv2
+ gradio
+ supervision
+ opencv-python
+ pytest
florence_sam/utils/__init__.py ADDED
File without changes
florence_sam/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (141 Bytes).
 
florence_sam/utils/__pycache__/florence.cpython-310.pyc ADDED
Binary file (2.29 kB).
 
florence_sam/utils/__pycache__/modes.cpython-310.pyc ADDED
Binary file (450 Bytes).
 
florence_sam/utils/__pycache__/sam.cpython-310.pyc ADDED
Binary file (1.46 kB).
 
florence_sam/utils/__pycache__/video.cpython-310.pyc ADDED
Binary file (984 Bytes).
 
florence_sam/utils/florence.py ADDED
@@ -0,0 +1,58 @@
+ import os
+ from typing import Union, Any, Tuple, Dict
+ from unittest.mock import patch
+
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from transformers.dynamic_module_utils import get_imports
+
+ FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
+ FLORENCE_OBJECT_DETECTION_TASK = '<OD>'
+ FLORENCE_DETAILED_CAPTION_TASK = '<MORE_DETAILED_CAPTION>'
+ FLORENCE_CAPTION_TO_PHRASE_GROUNDING_TASK = '<CAPTION_TO_PHRASE_GROUNDING>'
+ FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
+ FLORENCE_DENSE_REGION_CAPTION_TASK = '<DENSE_REGION_CAPTION>'
+
+
+ def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+     """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+     if not str(filename).endswith("/modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     #imports.remove("flash_attn")
+     return imports
+
+
+ def load_florence_model(
+     device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
+ ) -> Tuple[Any, Any]:
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+         model = AutoModelForCausalLM.from_pretrained(
+             checkpoint, trust_remote_code=True).to(device).eval()
+         processor = AutoProcessor.from_pretrained(
+             checkpoint, trust_remote_code=True)
+         return model, processor
+
+
+ def run_florence_inference(
+     model: Any,
+     processor: Any,
+     device: torch.device,
+     image: Image.Image,
+     task: str,
+     text: str = ""
+ ) -> Tuple[str, Dict]:
+     prompt = task + text
+     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = processor.batch_decode(
+         generated_ids, skip_special_tokens=False)[0]
+     response = processor.post_process_generation(
+         generated_text, task=task, image_size=image.size)
+     return generated_text, response
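
A minimal captioning sketch for run_florence_inference (hypothetical image path; the response dict is keyed by the task token, as used in app.py):

import torch
from PIL import Image

from utils.florence import (
    load_florence_model,
    run_florence_inference,
    FLORENCE_DETAILED_CAPTION_TASK,
)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, processor = load_florence_model(device=DEVICE)

image = Image.open("example.jpg")  # hypothetical input image
_, response = run_florence_inference(
    model=model,
    processor=processor,
    device=DEVICE,
    image=image,
    task=FLORENCE_DETAILED_CAPTION_TASK,
)
print(response[FLORENCE_DETAILED_CAPTION_TASK])  # detailed caption string
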
florence_sam/utils/modes.py ADDED
@@ -0,0 +1,13 @@
+ IMAGE_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + image masks"
+ IMAGE_CAPTION_GROUNDING_MASKS_MODE = "caption + grounding + image masks"
+
+ IMAGE_INFERENCE_MODES = [
+     IMAGE_OPEN_VOCABULARY_DETECTION_MODE,
+     IMAGE_CAPTION_GROUNDING_MASKS_MODE
+ ]
+
+ VIDEO_OPEN_VOCABULARY_DETECTION_MODE = "open vocabulary detection + video masks"
+
+ VIDEO_INFERENCE_MODES = [
+     VIDEO_OPEN_VOCABULARY_DETECTION_MODE
+ ]
florence_sam/utils/sam.py ADDED
@@ -0,0 +1,45 @@
+ from typing import Any
+
+ import numpy as np
+ import supervision as sv
+ import torch
+ from PIL import Image
+ from sam2.build_sam import build_sam2, build_sam2_video_predictor
+ from sam2.sam2_image_predictor import SAM2ImagePredictor
+
+ SAM_CHECKPOINT = "/home/comdoleger1/ZenCtrl/app/florence_sam/checkpoints/sam2_hiera_small.pt" #"./checkpoints/sam2_hiera_small.pt"
+ SAM_CONFIG = "sam2_hiera_s.yaml"
+
+
+ def load_sam_image_model(
+     device: torch.device,
+     config: str = SAM_CONFIG,
+     checkpoint: str = SAM_CHECKPOINT
+ ) -> SAM2ImagePredictor:
+     model = build_sam2(config, checkpoint, device=device)
+     return SAM2ImagePredictor(sam_model=model)
+
+
+ def load_sam_video_model(
+     device: torch.device,
+     config: str = SAM_CONFIG,
+     checkpoint: str = SAM_CHECKPOINT
+ ) -> Any:
+     return build_sam2_video_predictor(config, checkpoint, device=device)
+
+
+ def run_sam_inference(
+     model: Any,
+     image: Image.Image,
+     detections: sv.Detections
+ ) -> sv.Detections:
+     image = np.array(image.convert("RGB"))
+     model.set_image(image)
+     mask, score, _ = model.predict(box=detections.xyxy, multimask_output=False)
+
+     # dirty fix; remove this later
+     if len(mask.shape) == 4:
+         mask = np.squeeze(mask)
+
+     detections.mask = mask.astype(bool)
+     return detections
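
run_sam_inference can also be driven by manually supplied boxes instead of Florence-2 output. A small sketch (hypothetical image path and coordinates; it assumes SAM_CHECKPOINT above resolves on your machine):

import numpy as np
import supervision as sv
import torch
from PIL import Image

from utils.sam import load_sam_image_model, run_sam_inference

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sam_model = load_sam_image_model(device=DEVICE)

image = Image.open("example.jpg")  # hypothetical input image
# One box in pixel xyxy coordinates; in the app these come from Florence-2.
detections = sv.Detections(xyxy=np.array([[50.0, 40.0, 300.0, 260.0]]))
detections = run_sam_inference(sam_model, image, detections)
print(detections.mask.shape)  # (num_boxes, H, W) boolean masks
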
florence_sam/utils/video.py ADDED
@@ -0,0 +1,26 @@
+ import datetime
+ import os
+ import shutil
+ import uuid
+
+
+ def create_directory(directory_path: str) -> None:
+     if not os.path.exists(directory_path):
+         os.makedirs(directory_path)
+
+
+ def delete_directory(directory_path: str) -> None:
+     if not os.path.exists(directory_path):
+         raise FileNotFoundError(f"Directory '{directory_path}' does not exist.")
+
+     try:
+         shutil.rmtree(directory_path)
+     except PermissionError:
+         raise PermissionError(
+             f"Permission denied: Unable to delete '{directory_path}'.")
+
+
+ def generate_unique_name():
+     current_datetime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
+     unique_id = uuid.uuid4()
+     return f"{current_datetime}_{unique_id}"