---
license: apache-2.0
---

Same as https://huggingface.co/HuggingFaceM4/siglip-so400m-14-384-flash-attn2 with two changes:
- increase the maximum resolution to 980 x 980 (instead of 384 x 384) by interpolating the position embeddings (see the sketch below)
- implement the strategy from [NaViT](https://arxiv.org/abs/2307.06304) to support (a) variable-resolution images and (b) aspect-ratio-preserved images, as demonstrated in the usage example below

These changes only apply to the vision tower; the text tower is unchanged.

The implementation is fully backward compatible with https://huggingface.co/HuggingFaceM4/siglip-so400m-14-384-flash-attn2: simply don't pass a `patch_attention_mask` (see the commented-out call at the end of the usage example).

Usage:
```python
import torch
from modeling_siglip import SiglipVisionModel

DEVICE = torch.device("cuda:0")
PATCH_SIZE = 14

# Two images packed into one padded 28 x 42 canvas (a 2 x 3 grid of 14 x 14 patches).
pixel_values = torch.randn(2, 3, 28, 42, dtype=torch.bfloat16, device=DEVICE)
# Pixel-level attention mask: 1 marks real pixels, 0 marks padding.
pixel_attention_mask = [
    # Image 1 is 14 x 42 pixels: the top 14 rows are real, the bottom 14 rows are padding.
    [[1] * 42] * 14 + [[0] * 42] * 14,
    # Image 2 is 28 x 28 pixels: the left 28 columns are real, the right 14 columns are padding.
    [[1] * 28 + [0] * 14] * 28,
]
pixel_attention_mask = torch.tensor(pixel_attention_mask, dtype=torch.bool, device=DEVICE)

# Reduce the pixel-level mask to a patch-level mask: a patch is kept if it
# contains at least one real pixel.
patches_subgrid = pixel_attention_mask.unfold(
    dimension=1, size=PATCH_SIZE, step=PATCH_SIZE
).unfold(dimension=2, size=PATCH_SIZE, step=PATCH_SIZE)
patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

model = SiglipVisionModel.from_pretrained(
    "HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit", _flash_attn_2_enabled=True
)
model.train()
model.vision_model.to(DEVICE, dtype=torch.bfloat16)

output = model.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
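
# Backward-compatible usage: omitting patch_attention_mask treats every patch as
# valid, matching the behavior of the original 384-resolution checkpoint.
# output = model.vision_model(pixel_values=pixel_values)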
```
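
The returned `output` should follow the standard Hugging Face vision-model format (e.g. `output.last_hidden_state` with one embedding per patch position of the padded grid, here 2 x 3 = 6 per image); embeddings at masked positions should be ignored downstream.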