Update for new transformers release

#6
by ankke - opened
Files changed (4)
  1. README.md +3 -4
  2. config.json +2 -7
  3. modeling_lfm2_vl.py +0 -688
  4. processing_lfm2_vl.py +0 -645
README.md CHANGED
@@ -89,7 +89,7 @@ You can apply it using the dedicated [`.apply_chat_template()`](https://huggingf
 
 ## 🏃 How to run LFM2-VL
 
-You can run LFM2-VL with Hugging Face [`transformers`](https://github.com/huggingface/transformers) v4.55 or more recent as follows:
+You can run LFM2-VL with Hugging Face [`transformers`](https://github.com/huggingface/transformers) v4.57 or more recent as follows:
 
 ```bash
 pip install -U transformers pillow
@@ -106,10 +106,9 @@ model_id = "LiquidAI/LFM2-VL-1.6B"
 model = AutoModelForImageTextToText.from_pretrained(
     model_id,
     device_map="auto",
-    torch_dtype="bfloat16",
-    trust_remote_code=True
+    dtype="bfloat16"
 )
-processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+processor = AutoProcessor.from_pretrained(model_id)
 
 # Load image and create conversation
 url = "https://www.ilankelman.org/stopsigns/australia.jpg"
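
Taken together, the two README hunks above amount to the following usage. This is a minimal runnable sketch assuming `transformers` ≥ 4.57 is installed; the conversation structure and generation settings are borrowed from the model card example, and the image URL is the one shown in the diff.

```python
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.image_utils import load_image

model_id = "LiquidAI/LFM2-VL-1.6B"

# No trust_remote_code needed anymore; dtype replaces the deprecated torch_dtype kwarg
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    device_map="auto",
    dtype="bfloat16",
)
processor = AutoProcessor.from_pretrained(model_id)

# Load image and create conversation
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = load_image(url)
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "What is in this image?"},
        ],
    },
]

# Tokenize the chat template and generate
inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=45)
print(processor.batch_decode(outputs, skip_special_tokens=True)[0])
```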
config.json CHANGED
@@ -2,10 +2,6 @@
   "architectures": [
     "Lfm2VlForConditionalGeneration"
   ],
-  "auto_map": {
-    "AutoConfig": "modeling_lfm2_vl.Lfm2VlConfig",
-    "AutoModelForImageTextToText": "modeling_lfm2_vl.Lfm2VlForConditionalGeneration"
-  },
   "do_image_splitting": true,
   "downsample_factor": 2,
   "encoder_patch_size": 16,
@@ -16,7 +12,7 @@
   "max_tiles": 10,
   "min_image_tokens": 64,
   "min_tiles": 2,
-  "model_type": "lfm2-vl",
+  "model_type": "lfm2_vl",
   "projector_bias": true,
   "projector_hidden_act": "gelu",
   "projector_hidden_size": 2560,
@@ -94,6 +90,5 @@
     "patch_size": 16,
     "torch_dtype": "bfloat16",
     "vision_use_head": false
-  },
-  "vision_feature_layer": -2
+  }
 }
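
With `auto_map` removed and `model_type` renamed from `lfm2-vl` to `lfm2_vl`, the Auto classes should resolve to the implementation bundled with `transformers` rather than the deleted remote code. A small sanity-check sketch, assuming the native classes keep the `Lfm2Vl*` naming used by the removed files:

```python
from transformers import AutoConfig

# Loads without trust_remote_code now that auto_map is gone
config = AutoConfig.from_pretrained("LiquidAI/LFM2-VL-1.6B")

print(config.model_type)       # "lfm2_vl"
print(type(config).__name__)   # expected: Lfm2VlConfig from the transformers library
                               # (assumption: naming matches the removed remote code)
```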
modeling_lfm2_vl.py DELETED
@@ -1,688 +0,0 @@
(The entire 688-line remote-code modeling file is removed. It defined the standalone `Lfm2VlConfig`, the `Lfm2VlModelOutputWithPast` and `Lfm2VlCausalLMOutputWithPast` output dataclasses, the `Lfm2VlMultiModalProjector` and `PixelUnshuffleBlock` modules, and the `Lfm2VlPreTrainedModel`, `Lfm2VlModel`, and `Lfm2VlForConditionalGeneration` classes that pair a Siglip2 vision tower with an LFM2 language model. These classes ship natively with transformers v4.57+, which is why `auto_map` and `trust_remote_code` are dropped above.)
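
For background on what the deleted modeling code did between the vision tower and the projector: vision features were downsampled by a space-to-depth ("pixel unshuffle") step that trades spatial resolution for channel depth. A minimal sketch of that operation, assuming the same `(batch, width, height, channels)` layout as the removed `PixelUnshuffleBlock`; the class name and shapes below are illustrative.

```python
import torch
from torch import nn


class PixelUnshuffle2D(nn.Module):
    """Space-to-depth sketch: (n, w, h, c) -> (n, w/f, h/f, c*f**2),
    zero-padding the spatial dims up to multiples of the factor f."""

    def __init__(self, factor: int):
        super().__init__()
        self.factor = factor

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        f = self.factor
        n, w, h, c = x.shape
        pad_w, pad_h = (-w) % f, (-h) % f
        if pad_w or pad_h:
            # pad width/height so they divide evenly by the factor
            x = nn.functional.pad(x, (0, 0, 0, pad_h, 0, pad_w))
            n, w, h, c = x.shape
        x = x.view(n, w, h // f, c * f)             # fold factor columns into channels
        x = x.permute(0, 2, 1, 3).contiguous()      # (n, h/f, w, c*f)
        x = x.view(n, h // f, w // f, c * f * f)    # fold factor rows into channels
        return x.permute(0, 2, 1, 3).contiguous()   # (n, w/f, h/f, c*f**2)


# e.g. 2x downsampling of a (1, 32, 32, 768) feature map -> (1, 16, 16, 3072)
feats = torch.randn(1, 32, 32, 768)
print(PixelUnshuffle2D(2)(feats).shape)
```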
processing_lfm2_vl.py DELETED
@@ -1,645 +0,0 @@
(The entire 645-line remote-code processing file is removed. It defined the Qwen2.5-VL-style resize helpers (`round_by_factor`, `ceil_by_factor`, `floor_by_factor`, `find_closest_aspect_ratio`) and the `Lfm2VlProcessor`, which wraps the Siglip2 image processor and the LFM2 tokenizer, smart-resizes images, optionally splits large images into tiles plus a thumbnail, and expands each `<image>` placeholder into the matching number of image tokens. The processor now ships with transformers, so `AutoProcessor.from_pretrained(model_id)` works without `trust_remote_code`.)
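
The sizing rule that the deleted processor applied before encoding is worth keeping on record. Below is a standalone sketch of its `_smart_resize` logic, with defaults taken from config.json (`encoder_patch_size=16`, `downsample_factor=2`, `min_image_tokens=64`, `max_image_tokens=256`); the function name is illustrative.

```python
import math


def smart_resize_dims(width: int, height: int,
                      patch_size: int = 16, downsample: int = 2,
                      min_tokens: int = 64, max_tokens: int = 256) -> tuple[int, int]:
    """Return (new_width, new_height) such that both sides are multiples of
    patch_size * downsample, the implied image-token count stays within
    [min_tokens, max_tokens], and the aspect ratio is roughly preserved."""
    factor = patch_size * downsample
    min_pixels = min_tokens * patch_size**2 * downsample**2
    max_pixels = max_tokens * patch_size**2 * downsample**2

    # round each side to the nearest multiple of the combined factor
    w = max(factor, round(width / factor) * factor)
    h = max(factor, round(height / factor) * factor)

    if w * h > max_pixels:      # too many pixels: shrink, flooring to the factor
        beta = math.sqrt((width * height) / max_pixels)
        w = max(factor, math.floor(width / beta / factor) * factor)
        h = max(factor, math.floor(height / beta / factor) * factor)
    elif w * h < min_pixels:    # too few pixels: enlarge, ceiling to the factor
        beta = math.sqrt(min_pixels / (width * height))
        w = math.ceil(width * beta / factor) * factor
        h = math.ceil(height * beta / factor) * factor
    return w, h


# e.g. a 1024x768 photo -> dimensions that stay within the 256-token budget
print(smart_resize_dims(1024, 768))
```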