Update README.md

This model was evaluated on the well-known OpenLLM v1, OpenLLM v2, HumanEval, and …

<table>
  <thead>
    <tr>
      <th>Category</th>
      <th>Metric</th>
      <th>Llama-4-Scout-17B-16E-Instruct</th>
      <th>Llama-4-Scout-17B-16E-Instruct-NVFP4 (this model)</th>
      <th>Recovery (%)</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td rowspan="8"><b>OpenLLM V1</b></td>
      <td>mmlu_llama</td>
      <td>81.06</td>
      <td>79.11</td>
      <td>97.59</td>
    </tr>
    <tr>
      <td>mmlu_cot_llama (0-shot)</td>
      <td>85.86</td>
      <td>84.07</td>
      <td>97.92</td>
    </tr>
    <tr>
      <td>arc_challenge_llama (0-shot)</td>
      <td>93.39</td>
      <td>92.02</td>
      <td>98.53</td>
    </tr>
    <tr>
      <td>gsm8k_llama (8-shot, strict-match)</td>
      <td>93.78</td>
      <td>93.78</td>
      <td>100.00</td>
    </tr>
    <tr>
      <td>hellaswag (10-shot)</td>
      <td>79.06</td>
      <td>78.63</td>
      <td>99.46</td>
    </tr>
    <tr>
      <td>winogrande (5-shot)</td>
      <td>74.43</td>
      <td>73.48</td>
      <td>98.72</td>
    </tr>
    <tr>
      <td>truthfulQA (0-shot, mc2)</td>
      <td>62.15</td>
      <td>60.63</td>
      <td>97.55</td>
    </tr>
    <tr>
      <td><b>Average</b></td>
      <td><b>81.39</b></td>
      <td><b>80.25</b></td>
      <td><b>98.59</b></td>
    </tr>
    <tr>
      <td rowspan="7"><b>OpenLLM V2</b></td>
      <td>MMLU-Pro (5-shot)</td>
      <td>55.68</td>
      <td>53.05</td>
      <td>95.28</td>
    </tr>
    <tr>
      <td>IFEval (0-shot)</td>
      <td>89.09</td>
      <td>89.57</td>
      <td>100.54</td>
    </tr>
    <tr>
      <td>BBH (3-shot)</td>
      <td>65.11</td>
      <td>63.53</td>
      <td>97.57</td>
    </tr>
    <tr>
      <td>Math-lvl-5 (4-shot)</td>
      <td>57.70</td>
      <td>55.06</td>
      <td>95.42</td>
    </tr>
    <tr>
      <td>GPQA (0-shot)</td>
      <td>30.70</td>
      <td>31.04</td>
      <td>101.11</td>
    </tr>
    <tr>
      <td>MuSR (0-shot)</td>
      <td>42.59</td>
      <td>43.52</td>
      <td>102.18</td>
    </tr>
    <tr>
      <td><b>Average</b></td>
      <td><b>57.04</b></td>
      <td><b>56.54</b></td>
      <td><b>99.13</b></td>
    </tr>
    <tr>
      <td rowspan="1"><b>Coding</b></td>
      <td>HumanEval_64 pass@2</td>
      <td>83.83</td>
      <td>84.81</td>
      <td>101.17</td>
    </tr>
  </tbody>
</table>
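
The Recovery column is the NVFP4 model's score expressed as a percentage of the unquantized baseline's score, which reproduces the per-row values above. A short Python illustration using the mmlu_llama row (the two inputs are copied from the table; the snippet itself is not part of the evaluation code):

```python
# Recovery = 100 * quantized / baseline, computed per table row.
baseline = 81.06   # Llama-4-Scout-17B-16E-Instruct, mmlu_llama
quantized = 79.11  # Llama-4-Scout-17B-16E-Instruct-NVFP4, mmlu_llama

recovery = 100 * quantized / baseline
print(f"{recovery:.2f}")  # 97.59, matching the Recovery column
```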

### Reproduction
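
The metric names in the table above (mmlu_llama, arc_challenge_llama, gsm8k_llama, hellaswag, and so on) are lm-evaluation-harness task identifiers. As a minimal sketch of how one such task can be invoked through the harness's Python API (assuming the vLLM backend; the model path below is a placeholder, not necessarily the checkpoint's actual Hub id, and the commands listed below are the authoritative recipe):

```python
import lm_eval  # lm-evaluation-harness

# Sketch only: evaluate one OpenLLM v1 task (hellaswag, 10-shot) through
# the vLLM backend. "Llama-4-Scout-17B-16E-Instruct-NVFP4" is a
# placeholder; substitute the checkpoint's actual Hub id.
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args="pretrained=Llama-4-Scout-17B-16E-Instruct-NVFP4",
    tasks=["hellaswag"],
    num_fewshot=10,
    batch_size=8,
)
print(results["results"]["hellaswag"])
```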

The results were obtained using the following commands: