Upload folder using huggingface_hub
- README.md +260 -0
- config.json +34 -0
- merges.txt +0 -0
- onnx/model.onnx +3 -0
- onnx/model_bnb4.onnx +3 -0
- onnx/model_fp16.onnx +3 -0
- onnx/model_int8.onnx +3 -0
- onnx/model_q4.onnx +3 -0
- onnx/model_q4f16.onnx +3 -0
- onnx/model_quantized.onnx +3 -0
- onnx/model_uint8.onnx +3 -0
- preprocessor_config.json +28 -0
- quantize_config.json +18 -0
- special_tokens_map.json +30 -0
- tokenizer.json +0 -0
- tokenizer_config.json +32 -0
- vocab.json +0 -0
README.md
ADDED
@@ -0,0 +1,260 @@
---
license: cc-by-nc-4.0
language:
- en
pipeline_tag: zero-shot-image-classification
widget:
- src: https://huggingface.co/geolocal/StreetCLIP/resolve/main/nagasaki.jpg
  candidate_labels: China, South Korea, Japan, Philippines, Taiwan, Vietnam, Cambodia
  example_title: Countries
- src: https://huggingface.co/geolocal/StreetCLIP/resolve/main/sanfrancisco.jpeg
  candidate_labels: San Jose, San Diego, Los Angeles, Las Vegas, San Francisco, Seattle
  example_title: Cities
library_name: transformers.js
tags:
- geolocalization
- geolocation
- geographic
- street
- climate
- clip
- urban
- rural
- multi-modal
- geoguessr
base_model:
- geolocal/StreetCLIP
---

# StreetCLIP (ONNX)

This is an ONNX version of [geolocal/StreetCLIP](https://huggingface.co/geolocal/StreetCLIP). It was automatically converted and uploaded using [this Hugging Face Space](https://huggingface.co/spaces/onnx-community/convert-to-onnx).

## Usage with Transformers.js

See the pipeline documentation for `zero-shot-image-classification`: https://huggingface.co/docs/transformers.js/api/pipelines#module_pipelines.ZeroShotImageClassificationPipeline
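
Below is a minimal usage sketch. It is an illustration rather than verified usage for this repository: it assumes the Transformers.js v3 package `@huggingface/transformers`, the model id is a placeholder to be replaced with this repository's actual id, and the `dtype` option (for selecting one of the quantized ONNX variants in `onnx/`) is optional.

```js
import { pipeline } from '@huggingface/transformers';

// Placeholder model id: replace with this repository's id.
const classifier = await pipeline(
  'zero-shot-image-classification',
  'onnx-community/StreetCLIP',
  { dtype: 'q8' }, // optional: pick one of the quantized ONNX variants
);

const url = 'https://huggingface.co/geolocal/StreetCLIP/resolve/main/sanfrancisco.jpeg';
const labels = ['San Jose', 'San Diego', 'Los Angeles', 'Las Vegas', 'San Francisco', 'Seattle'];

// Returns one { label, score } pair per candidate label.
const output = await classifier(url, labels);
console.log(output);
```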

---

# Model Card for StreetCLIP

StreetCLIP is a robust foundation model for open-domain image geolocalization and other geographic and climate-related tasks.

Trained on an original dataset of 1.1 million street-level urban and rural geo-tagged images, it achieves state-of-the-art zero-shot performance on multiple open-domain image geolocalization benchmarks, outperforming supervised models trained on millions of images.

# Model Description

StreetCLIP is a model pretrained by deriving image captions synthetically from image class labels using a domain-specific caption template. This allows StreetCLIP to transfer its generalized zero-shot learning capabilities to a specific domain (i.e., the domain of image geolocalization). StreetCLIP builds on OpenAI's pretrained large version of CLIP ViT, using 14x14 pixel patches and images with a 336 pixel side length.

## Model Details

- **Model type:** [CLIP](https://openai.com/blog/clip/)
- **Language:** English
- **License:** Creative Commons Attribution Non Commercial 4.0
- **Trained from model:** [openai/clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336)

## Model Sources

- **Paper:** [Preprint](https://arxiv.org/abs/2302.00275)
- **Cite preprint as:**
```bibtex
@misc{haas2023learning,
      title={Learning Generalized Zero-Shot Learners for Open-Domain Image Geolocalization},
      author={Lukas Haas and Silas Alberti and Michal Skreta},
      year={2023},
      eprint={2302.00275},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```

# Uses

StreetCLIP has a deep understanding of the visual features found in street-level urban and rural scenes and knows how to relate these concepts to specific countries, regions, and cities. Given its training setup, the following use cases are recommended for StreetCLIP.

## Direct Use

StreetCLIP can be used out of the box with zero-shot learning to infer the geolocation of images at the country, region, or city level. Given that StreetCLIP was pretrained on a dataset of street-level urban and rural images, the best performance can be expected on images from a similar distribution.

Broader direct use cases are any zero-shot image classification tasks that rely on urban and rural street-level understanding or on geographic information relating visual clues to their region of origin.

## Downstream Use

StreetCLIP can be fine-tuned for any downstream application that requires geographic or street-level urban or rural scene understanding. Examples of use cases are the following:

**Understanding the Built Environment**

- Analyzing building quality
- Building type classification
- Building energy efficiency classification

**Analyzing Infrastructure**

- Analyzing road quality
- Utility pole maintenance
- Identifying damage from natural disasters or armed conflicts

**Understanding the Natural Environment**

- Mapping vegetation
- Vegetation classification
- Soil type classification
- Tracking deforestation

**General Use Cases**

- Street-level image segmentation
- Urban and rural scene classification
- Object detection in urban or rural environments
- Improving navigation and self-driving car technology

## Out-of-Scope Use

Any use case attempting to geolocate users' private images is out of scope and discouraged.

# Bias, Risks, and Limitations

StreetCLIP was intentionally not trained on social media images or images of identifiable people. As such, any use case attempting to geolocalize users' private images is discouraged and considered out of scope.

## Recommendations

We encourage the community to apply StreetCLIP to applications with significant social impact, of which there are many. The first three categories of use cases listed under Downstream Use are examples with potential social impact to explore.

## How to Get Started with the Model

Use the code below to get started with the model.

```python
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

model = CLIPModel.from_pretrained("geolocal/StreetCLIP")
processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")

url = "https://huggingface.co/geolocal/StreetCLIP/resolve/main/sanfrancisco.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

choices = ["San Jose", "San Diego", "Los Angeles", "Las Vegas", "San Francisco"]
inputs = processor(text=choices, images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```
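
The highest-scoring entry of `probs` corresponds to the predicted label in `choices`.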

# Training Details

## Training Data

StreetCLIP was trained on an original, unreleased street-level dataset of 1.1 million real-world urban and rural images. The training data comes from 101 countries, is biased towards Western countries, and does not include India or China.

## Preprocessing

Same preprocessing as [openai/clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336).

## Training Procedure

StreetCLIP is initialized with OpenAI's pretrained large version of CLIP ViT and then pretrained using the synthetic caption domain-specific pretraining method described in the paper corresponding to this work. StreetCLIP was trained for 3 epochs using an AdamW optimizer with a learning rate of 1e-6 on 3 NVIDIA A100 80GB GPUs, with a batch size of 32 and gradient accumulation over 12 steps.

StreetCLIP was trained with the goal of matching each image in the batch with the caption corresponding to the correct city, region, and country of the image's origin.
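
To make the pretraining objective concrete, the sketch below shows one way such captions could be assembled from location labels. The template string is illustrative only and is not the exact template from the paper.

```js
// Illustrative caption construction for CLIP-style contrastive pretraining.
// The actual template used by StreetCLIP is described in the paper.
function toCaption({ city, region, country }) {
  return `A Street View photo in ${city}, ${region}, ${country}.`;
}

console.log(toCaption({ city: 'Nagasaki', region: 'Kyushu', country: 'Japan' }));
// "A Street View photo in Nagasaki, Kyushu, Japan."
```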

# Evaluation

StreetCLIP was evaluated zero-shot on two open-domain image geolocalization benchmarks using a technique called hierarchical linear probing. Hierarchical linear probing sequentially attempts to identify first the correct country and then the correct city of an image's geographic origin.
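
A hedged sketch of this coarse-to-fine idea with the Transformers.js pipeline is shown below; the model id, candidate lists, and helper are placeholders, not the paper's evaluation code.

```js
import { pipeline } from '@huggingface/transformers';

// Placeholder model id: replace with this repository's id.
const classifier = await pipeline('zero-shot-image-classification', 'onnx-community/StreetCLIP');

const image = 'https://huggingface.co/geolocal/StreetCLIP/resolve/main/nagasaki.jpg';
const countries = ['Japan', 'South Korea', 'China', 'Taiwan', 'Vietnam'];
const citiesByCountry = {
  Japan: ['Tokyo', 'Osaka', 'Nagasaki'],
  'South Korea': ['Seoul', 'Busan'],
  // ... one city list per candidate country
};

// Helper: pick the highest-scoring { label, score } entry.
const best = (results) => results.reduce((a, b) => (b.score > a.score ? b : a));

// Step 1: predict the country.
const country = best(await classifier(image, countries));

// Step 2: predict the city among cities of the predicted country.
const city = best(await classifier(image, citiesByCountry[country.label]));

console.log(country.label, city.label);
```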

## Testing Data and Metrics

### Testing Data

StreetCLIP was evaluated on the following two open-domain image geolocalization benchmarks:

* [IM2GPS](http://graphics.cs.cmu.edu/projects/im2gps/)
* [IM2GPS3K](https://github.com/lugiavn/revisiting-im2gps)

### Metrics

The objective of the listed benchmark datasets is to predict the images' coordinates of origin with as little deviation as possible. A common metric set forth in prior literature is Percentage at Kilometer (% @ KM): it first calculates the distance in kilometers between the predicted and the ground-truth coordinates, and then reports the percentage of error distances that fall below a given kilometer threshold.
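
As an illustration (not the benchmarks' reference implementation), the metric can be computed roughly as follows, assuming great-circle (haversine) distances between [latitude, longitude] pairs.

```js
// Great-circle distance in kilometers between two [lat, lon] points (haversine formula).
function haversineKm([lat1, lon1], [lat2, lon2]) {
  const toRad = (d) => (d * Math.PI) / 180;
  const R = 6371; // mean Earth radius in km
  const dLat = toRad(lat2 - lat1);
  const dLon = toRad(lon2 - lon1);
  const a =
    Math.sin(dLat / 2) ** 2 +
    Math.cos(toRad(lat1)) * Math.cos(toRad(lat2)) * Math.sin(dLon / 2) ** 2;
  return 2 * R * Math.asin(Math.sqrt(a));
}

// Percentage of predictions whose error distance falls below each threshold.
function percentageAtKm(predicted, groundTruth, thresholdsKm = [25, 200, 750, 2500]) {
  const errors = predicted.map((p, i) => haversineKm(p, groundTruth[i]));
  return thresholdsKm.map((t) => ({
    thresholdKm: t,
    percentage: (100 * errors.filter((e) => e <= t).length) / errors.length,
  }));
}
```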

## Results

**IM2GPS**

| Model | 25km | 200km | 750km | 2,500km |
|----------|:-------------:|:------:|:------:|:------:|
| PlaNet (2016) | 24.5 | 37.6 | 53.6 | 71.3 |
| ISNs (2018) | 43.0 | 51.9 | 66.7 | 80.2 |
| TransLocator (2022) | **48.1** | **64.6** | **75.6** | 86.7 |
| **Zero-Shot CLIP (ours)** | 27.0 | 42.2 | 71.7 | 86.9 |
| **Zero-Shot StreetCLIP (ours)** | 28.3 | 45.1 | 74.7 | **88.2** |

Metric: Percentage at Kilometer (% @ KM)

**IM2GPS3K**

| Model | 25km | 200km | 750km | 2,500km |
|----------|:-------------:|:------:|:------:|:------:|
| PlaNet (2016) | 24.8 | 34.3 | 48.4 | 64.6 |
| ISNs (2018) | 28.0 | 36.6 | 49.7 | 66.0 |
| TransLocator (2022) | **31.1** | **46.7** | 58.9 | 80.1 |
| **Zero-Shot CLIP (ours)** | 19.5 | 34.0 | 60.0 | 78.1 |
| **Zero-Shot StreetCLIP (ours)** | 22.4 | 37.4 | **61.3** | **80.4** |

Metric: Percentage at Kilometer (% @ KM)

### Summary

Our experiments demonstrate that our synthetic caption pretraining method is capable of significantly improving CLIP's generalized zero-shot capabilities applied to open-domain image geolocalization while achieving state-of-the-art performance on a selection of benchmark metrics.

# Environmental Impact

- **Hardware Type:** 4 NVIDIA A100 GPUs
- **Hours used:** 12

# Citation

Cite preprint as:

```bibtex
@misc{haas2023learning,
      title={Learning Generalized Zero-Shot Learners for Open-Domain Image Geolocalization},
      author={Lukas Haas and Silas Alberti and Michal Skreta},
      year={2023},
      eprint={2302.00275},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
```
config.json
ADDED
@@ -0,0 +1,34 @@
{
  "_attn_implementation_autoset": true,
  "_name_or_path": "geolocal/StreetCLIP",
  "architectures": [
    "CLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "clip",
  "projection_dim": 768,
  "text_config": {
    "dropout": 0.0,
    "hidden_size": 768,
    "intermediate_size": 3072,
    "model_type": "clip_text_model",
    "num_attention_heads": 12,
    "projection_dim": 768,
    "torch_dtype": "float32"
  },
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "vision_config": {
    "dropout": 0.0,
    "hidden_size": 1024,
    "image_size": 336,
    "intermediate_size": 4096,
    "model_type": "clip_vision_model",
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "patch_size": 14,
    "projection_dim": 768,
    "torch_dtype": "float32"
  }
}
merges.txt
ADDED
The diff for this file is too large to render.
onnx/model.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cb60bb52e9f4f54111d9c4db66c934970746ab770b2237ba3f5c99c961763c22
size 1712529037
onnx/model_bnb4.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d11444aaf34f3f070cf4a948ab4416cf677916112567876da43ce2e15ad02f1
size 377779159
onnx/model_fp16.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:60d47e9d9c61b55e5f8b66d24e27d1be13428eca33a78368db970e3ade36aada
size 856641119
onnx/model_int8.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:90f3c46c95caaec38e4e3f2ac2779643778468861bd24ccf2548e77a5103ea1f
size 433623188
onnx/model_q4.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e55198ff4108ac95c6781f5d6c61e546bdf4c04374ac1a34913a89865df34fb
size 402046241
onnx/model_q4f16.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3c00f511530563d49494655fa6c0356047b383a5cffdb2c88adda9a6b23845e8
size 298491223
onnx/model_quantized.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3dda43fc568cdc7157d18df1e6e0913abbd7aecca2fe8c02ee842f2c459384e
size 433623188
onnx/model_uint8.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3dda43fc568cdc7157d18df1e6e0913abbd7aecca2fe8c02ee842f2c459384e
size 433623188
preprocessor_config.json
ADDED
@@ -0,0 +1,28 @@
{
  "crop_size": {
    "height": 336,
    "width": 336
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPFeatureExtractor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "CLIPProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 336
  }
}
quantize_config.json
ADDED
@@ -0,0 +1,18 @@
{
  "modes": [
    "fp16",
    "q8",
    "int8",
    "uint8",
    "q4",
    "q4f16",
    "bnb4"
  ],
  "per_channel": true,
  "reduce_range": true,
  "block_size": null,
  "is_symmetric": true,
  "accuracy_level": null,
  "quant_type": 1,
  "op_block_list": null
}
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1,32 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": false,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 77,
  "pad_token": "<|endoftext|>",
  "processor_class": "CLIPProcessor",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": "<|endoftext|>"
}
vocab.json
ADDED
The diff for this file is too large to render.