Add ONNX sample code (#8)
Browse files- Add ONNX sample code (acc054744d60adea9d88214dab6e2219e6e512a7)
Co-authored-by: Joshua <[email protected]>
README.md
CHANGED
|
@@ -105,6 +105,147 @@ To summarize, the image captures a significant historical statue of liberty, sit
|
|
| 105 |
"""
|
| 106 |
```
|
| 107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
### Model optimizations
|
| 110 |
|
|
|
|
| 105 |
"""
|
| 106 |
```
|
| 107 |
|
| 108 |
+
We also provide ONNX weights for the model, which you can run with ONNX Runtime as follows:
|
| 109 |
+
<details>
|
| 110 |
+
|
| 111 |
+
<summary>Click here to see the sample code</summary>
|
| 112 |
+
|
| 113 |
+
```python
|
| 114 |
+
# Sample: run SmolVLM-256M-Instruct with ONNX Runtime using three separate
# sessions (vision encoder, token embedding, merged decoder) and a manual
# greedy decoding loop with a key/value cache.
from transformers import AutoConfig, AutoProcessor
from transformers.image_utils import load_image
import onnxruntime
import numpy as np

# 1. Load models
## Load config and processor
model_id = "HuggingFaceTB/SmolVLM-256M-Instruct"
config = AutoConfig.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

## Load sessions
## The three .onnx files must be present in the working directory, e.g.:
## !wget https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct/resolve/main/onnx/vision_encoder.onnx
## !wget https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct/resolve/main/onnx/embed_tokens.onnx
## !wget https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct/resolve/main/onnx/decoder_model_merged.onnx
vision_session = onnxruntime.InferenceSession("vision_encoder.onnx")
embed_session = onnxruntime.InferenceSession("embed_tokens.onnx")
decoder_session = onnxruntime.InferenceSession("decoder_model_merged.onnx")

## Set config values
num_key_value_heads = config.text_config.num_key_value_heads
head_dim = config.text_config.head_dim
num_hidden_layers = config.text_config.num_hidden_layers
eos_token_id = config.text_config.eos_token_id
image_token_id = config.image_token_id


# 2. Prepare inputs
## Create input messages
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Can you describe this image?"}
        ]
    },
]

## Load image and apply processor
image = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="np")

## Prepare decoder inputs
batch_size = inputs['input_ids'].shape[0]
## Start with an empty cache: one key and one value tensor per layer, with a
## zero-length sequence axis.
past_key_values = {
    f'past_key_values.{layer}.{kv}': np.zeros([batch_size, num_key_value_heads, 0, head_dim], dtype=np.float32)
    for layer in range(num_hidden_layers)
    for kv in ('key', 'value')
}
image_features = None
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
position_ids = np.cumsum(inputs['attention_mask'], axis=-1)


# 3. Generation loop
max_new_tokens = 1024
## One (initially empty) row of generated ids per batch item.
generated_tokens = np.zeros((batch_size, 0), dtype=np.int64)
for _ in range(max_new_tokens):
    inputs_embeds = embed_session.run(None, {'input_ids': input_ids})[0]

    if image_features is None:
        ## Only compute vision features if not already computed
        image_features = vision_session.run(
            ['image_features'],  # List of output names or indices
            {
                'pixel_values': inputs['pixel_values'],
                'pixel_attention_mask': inputs['pixel_attention_mask'].astype(np.bool_)
            }
        )[0]

        ## Merge text and vision embeddings: overwrite the embedding of every
        ## image placeholder token in the prompt with a vision feature row.
        inputs_embeds[inputs['input_ids'] == image_token_id] = image_features.reshape(-1, image_features.shape[-1])

    logits, *present_key_values = decoder_session.run(None, dict(
        inputs_embeds=inputs_embeds,
        attention_mask=attention_mask,
        position_ids=position_ids,
        **past_key_values,
    ))

    ## Update values for next generation loop
    ## Greedy decoding: take the highest-scoring token at the last position.
    input_ids = logits[:, -1].argmax(-1, keepdims=True)
    ## NOTE(review): this mask only covers the newly generated token, not the
    ## cached positions -- confirm that is what this decoder export expects.
    attention_mask = np.ones_like(input_ids)
    position_ids = position_ids[:, -1:] + 1
    ## Pairs cache inputs with decoder outputs by position; this relies on the
    ## decoder returning present key/values in the same order as the dict.
    for key, present in zip(past_key_values, present_key_values):
        past_key_values[key] = present

    generated_tokens = np.concatenate([generated_tokens, input_ids], axis=-1)
    if (input_ids == eos_token_id).all():
        break

    ## (Optional) Streaming
    ## flush=True so each token is shown immediately instead of being buffered.
    print(processor.decode(input_ids[0]), end='', flush=True)
print()

# 4. Output result
print(processor.batch_decode(generated_tokens))
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
Example output:
|
| 217 |
+
```
|
| 218 |
+
The image depicts a large, historic statue of Liberty situated on a small island in a body of water. The statue is a green, cylindrical structure with a human figure at the top, which is the actual statue of Liberty. The statue is mounted on a pedestal that is supported by a cylindrical tower. The pedestal is rectangular and appears to be made of stone or a similar material. The statue is surrounded by a large, flat, rectangular area that is likely a base for the statue.
|
| 219 |
+
|
| 220 |
+
In the background, there is a cityscape with a variety of buildings, including skyscrapers and high-rise buildings. The sky is clear with a gradient of colors, transitioning from a pale blue at the top to a deeper blue at the bottom. The buildings are mostly modern, with a mix of glass and concrete. The buildings are densely packed, with many skyscrapers and high-rise buildings visible.
|
| 221 |
+
|
| 222 |
+
There are trees and greenery visible on the left side of the image, indicating that the statue is located near a park or a park area. The water in the foreground is calm, with small ripples indicating that the statue is in the water.
|
| 223 |
+
|
| 224 |
+
The overall scene suggests a peaceful and serene environment, likely a public park or a park area in a city. The statue is likely a representation of liberty, representing the city's commitment to freedom and democracy.
|
| 225 |
+
|
| 226 |
+
### Analysis and Description:
|
| 227 |
+
|
| 228 |
+
#### Statue of Liberty:
|
| 229 |
+
- **Location**: The statue is located on a small island in a body of water.
|
| 230 |
+
- **Statue**: The statue is a green cylindrical structure with a human figure at the top, which is the actual statue of Liberty.
|
| 231 |
+
- **Pedestal**: The pedestal is rectangular and supports the statue.
|
| 232 |
+
- **Pedestrian**: The pedestal is surrounded by a flat rectangular area.
|
| 233 |
+
- **Water**: The water is calm, with small ripples indicating that the statue is in the water.
|
| 234 |
+
|
| 235 |
+
#### Cityscape:
|
| 236 |
+
- **Buildings**: The buildings are modern, with a mix of glass and concrete.
|
| 237 |
+
- **Sky**: The sky is clear with a gradient of colors, transitioning from a pale blue at the top to a deeper blue at the bottom.
|
| 238 |
+
- **Trees**: There are trees and greenery visible on the left side of the image, indicating that the statue is located near a park or a park area.
|
| 239 |
+
|
| 240 |
+
#### Environment:
|
| 241 |
+
- **Water**: The water is calm, with small ripples indicating that the statue is in the water.
|
| 242 |
+
- **Sky**: The sky is clear with a gradient of colors, transitioning from a pale blue at the top to a deeper blue at the bottom.
|
| 243 |
+
|
| 244 |
+
### Conclusion:
|
| 245 |
+
The image depicts a peaceful and serene public park or park area in a city, with the statue of Liberty prominently featured. The cityscape in the background includes modern buildings and a clear sky, suggesting a well-maintained public space.<end_of_utterance>
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
</details>
|
| 249 |
|
| 250 |
### Model optimizations
|
| 251 |
|