Update model card with paper link and library details
This PR improves the model card by:
- Adding a direct link to the official Hugging Face paper page ([Voxtral](https://huggingface.co/papers/2507.13264)), providing full academic context.
- Updating the `library_name` metadata tag to `transformers`. This correctly reflects the model's native support in the Hugging Face Transformers library and ensures the "how to use" widget is displayed on the model page (see the loading sketch after this list).
- Adding `vllm` to the `tags` metadata, highlighting the model's support for the vLLM inference engine used in the card's usage examples (see the client sketch after this list).

The rewrite additionally reflows the `extra_gated_description` metadata onto wrapped lines and expands the `\n` escapes inside the card's code snippets into literal newlines; both effects are visible in the code hunks of the diff below.
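For context on the `library_name` change: with `transformers` set, the model loads through the standard Transformers API that the card's own examples use. A minimal sketch, assuming the `mistralai/Voxtral-Mini-3B-2507` repo id and the dtype/device choices from those examples:

```python
# Minimal loading sketch; the repo id and device choice are assumptions
# taken from this card's examples, not part of the diff below.
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration

repo_id = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(repo_id)
model = VoxtralForConditionalGeneration.from_pretrained(
    repo_id, torch_dtype=torch.bfloat16, device_map="cuda"
)
```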
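Likewise for the `vllm` tag: the card's serving examples (visible in the hunks below) drive a vLLM server through an OpenAI-compatible client. A minimal sketch of that pattern; the base URL, API key, and prompt are placeholders, not card content:

```python
# Hypothetical client setup against a locally running vLLM server;
# base_url, api_key, and the prompt are illustrative assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
model = "mistralai/Voxtral-Mini-3B-2507"

response = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "Give a one-line summary of Voxtral."}],
)
print(response.choices[0].message.content)
```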
README.md CHANGED

```diff
@@ -8,20 +8,22 @@ language:
 - pt
 - nl
 - hi
+library_name: transformers
 license: apache-2.0
-library_name: vllm
-inference: false
-extra_gated_description: >-
-  If you want to learn more about how we process your personal data, please read
-  our <a href="https://mistral.ai/terms/">Privacy Policy</a>.
 pipeline_tag: audio-text-to-text
 tags:
 - transformers
+- vllm
+inference: false
+extra_gated_description: If you want to learn more about how we process your personal
+  data, please read our <a href="https://mistral.ai/terms/">Privacy Policy</a>.
 ---
+
 # Voxtral Mini 1.0 (3B) - 2507
 
 Voxtral Mini is an enhancement of [Ministral 3B](https://mistral.ai/news/ministraux), incorporating state-of-the-art audio input capabilities while retaining best-in-class text performance. It excels at speech transcription, translation and audio understanding.
 
+The model was presented in the paper [Voxtral](https://huggingface.co/papers/2507.13264).
 Learn more about Voxtral in our blog post [here](https://mistral.ai/news/voxtral).
 
 ## Key Features
@@ -151,7 +153,9 @@ user_msg = UserMessage(content=[file_to_chunk(obama_file), file_to_chunk(bcn_fil
 
 print(30 * "=" + "USER 1" + 30 * "=")
 print(text_chunk.text)
-print("\n\n")
+print("
+
+")
 
 response = client.chat.completions.create(
     model=model,
@@ -163,7 +167,9 @@ content = response.choices[0].message.content
 
 print(30 * "=" + "BOT 1" + 30 * "=")
 print(content)
-print("\n\n")
+print("
+
+")
 # The speaker who is more inspiring is the one who delivered the farewell address, as they express
 # gratitude, optimism, and a strong commitment to the nation and its citizens. They emphasize the importance of
 # self-government and active citizenship, encouraging everyone to participate in the democratic process. In contrast,
@@ -182,7 +188,9 @@ messages = [
 ]
 print(30 * "=" + "USER 2" + 30 * "=")
 print(messages[-1]["content"])
-print("\n\n")
+print("
+
+")
 
 response = client.chat.completions.create(
     model=model,
@@ -292,7 +300,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -350,7 +359,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -389,7 +399,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -428,7 +439,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated response:")
+print("
+Generated response:")
 print("=" * 80)
 print(decoded_outputs[0])
 print("=" * 80)
@@ -489,7 +501,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated responses:")
+print("
+Generated responses:")
 print("=" * 80)
 for decoded_output in decoded_outputs:
     print(decoded_output)
@@ -518,7 +531,8 @@ inputs = inputs.to(device, dtype=torch.bfloat16)
 outputs = model.generate(**inputs, max_new_tokens=500)
 decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
-print("\nGenerated responses:")
+print("
+Generated responses:")
 print("=" * 80)
 for decoded_output in decoded_outputs:
     print(decoded_output)
```