nm-research committed
Commit 258a4b0 · verified · 1 Parent(s): abf2f91

Update README.md

Files changed (1)
  1. README.md +66 -96
README.md CHANGED
@@ -268,146 +268,116 @@ This model was evaluated on the well-known OpenLLM v1, OpenLLM v2, HumanEval, an
  <tr>
  <th>Category</th>
  <th>Metric</th>
- <th>Llama-4-Scout-17B-16E-Instruct (A100)</th>
- <th>Llama-4-Scout-17B-16E-Instruct-NVFP4 (B200)</th>
- <th>Recovery (%)</th>
+ <th>Llama-4-Scout-17B-16E-Instruct</th>
+ <th>Llama-4-Scout-17B-16E-Instruct-NVFP4 (this model)</th>
+ <th>Recovery</th>
  </tr>
  </thead>
  <tbody>
  <tr>
  <td rowspan="8"><b>OpenLLM V1</b></td>
- <td>ARC Challenge (LLaMA)</td>
- <td>93.39</td>
- <td>92.10</td>
- <td>98.62%</td>
+ <td>mmlu_llama</td>
+ <td>81.06</td>
+ <td>79.11</td>
+ <td>97.59</td>
  </tr>
  <tr>
- <td>GSM8K (LLaMA)</td>
- <td>92.87</td>
- <td>94.31</td>
- <td>101.55%</td>
+ <td>mmlu_cot_llama (0-shot)</td>
+ <td>85.86</td>
+ <td>84.07</td>
+ <td>97.92</td>
  </tr>
  <tr>
- <td>MMLU (LLaMA)</td>
- <td>81.01</td>
- <td>79.37</td>
- <td>97.98%</td>
+ <td>arc_challenge_llama (0-shot)</td>
+ <td>93.39</td>
+ <td>92.02</td>
+ <td>98.53</td>
  </tr>
  <tr>
- <td>MMLU-CoT (LLaMA)</td>
- <td>85.99</td>
- <td>84.58</td>
- <td>98.36%</td>
+ <td>gsm8k_llama (8-shot, strict-match)</td>
+ <td>93.78</td>
+ <td>93.78</td>
+ <td>100.00</td>
  </tr>
  <tr>
- <td>Hellaswag</td>
- <td>79.13</td>
- <td>78.47</td>
- <td>99.17%</td>
+ <td>hellaswag (10-shot)</td>
+ <td>79.06</td>
+ <td>78.63</td>
+ <td>99.46</td>
  </tr>
  <tr>
- <td>TruthfulQA-mc2</td>
- <td>62.53</td>
- <td>60.83</td>
- <td>97.28%</td>
+ <td>winogrande (5-shot)</td>
+ <td>74.43</td>
+ <td>73.48</td>
+ <td>98.72</td>
  </tr>
  <tr>
- <td>Winogrande</td>
- <td>73.56</td>
- <td>73.01</td>
- <td>99.25%</td>
+ <td>truthfulQA (0-shot, mc2)</td>
+ <td>62.15</td>
+ <td>60.63</td>
+ <td>97.55</td>
  </tr>
  <tr>
  <td><b>Average</b></td>
- <td><b>81.21</b></td>
- <td><b>80.38</b></td>
- <td><b>98.89%</b></td>
+ <td><b>81.39</b></td>
+ <td><b>80.25</b></td>
+ <td><b>98.59</b></td>
  </tr>
  <tr>
  <td rowspan="7"><b>OpenLLM V2</b></td>
- <td>MMLU-Pro</td>
- <td>55.64</td>
- <td>53.84</td>
- <td>96.76%</td>
+ <td>MMLU-Pro (5-shot)</td>
+ <td>55.68</td>
+ <td>53.05</td>
+ <td>95.28</td>
  </tr>
  <tr>
- <td>IFEval</td>
+ <td>IFEval (0-shot)</td>
  <td>89.09</td>
- <td>89.93</td>
- <td>100.94%</td>
+ <td>89.57</td>
+ <td>100.54</td>
  </tr>
  <tr>
- <td>BBH</td>
- <td>65.14</td>
- <td>64.00</td>
- <td>98.25%</td>
+ <td>BBH (3-shot)</td>
+ <td>65.11</td>
+ <td>63.53</td>
+ <td>97.57</td>
  </tr>
  <tr>
- <td>Math-Hard</td>
- <td>52.64</td>
- <td>56.12</td>
- <td>106.61%</td>
+ <td>Math-lvl-5 (4-shot)</td>
+ <td>57.70</td>
+ <td>55.06</td>
+ <td>95.42</td>
  </tr>
  <tr>
- <td>GPQA</td>
- <td>32.21</td>
- <td>31.88</td>
- <td>98.98%</td>
+ <td>GPQA (0-shot)</td>
+ <td>30.70</td>
+ <td>31.04</td>
+ <td>101.11</td>
  </tr>
  <tr>
- <td>MuSR</td>
- <td>42.20</td>
- <td>42.99</td>
- <td>101.87%</td>
+ <td>MuSR (0-shot)</td>
+ <td>42.59</td>
+ <td>43.52</td>
+ <td>102.18</td>
  </tr>
  <tr>
  <td><b>Average</b></td>
- <td><b>56.15</b></td>
- <td><b>56.46</b></td>
- <td><b>100.55%</b></td>
- </tr>
- <tr>
- <td><b>Coding</b></td>
- <td>HumanEval Instruct pass@1</td>
- <td>81.71</td>
- <td>76.22</td>
- <td>93.29%</td>
+ <td><b>57.04</b></td>
+ <td><b>56.54</b></td>
+ <td><b>99.13</b></td>
  </tr>
  <tr>
- <td rowspan="5"></td>
- <td>HumanEval 64 Instruct pass@2</td>
- <td>83.49</td>
- <td>81.10</td>
- <td>97.14%</td>
- </tr>
- <tr>
- <td>HumanEval 64 Instruct pass@8</td>
- <td>87.71</td>
- <td>88.66</td>
- <td>101.08%</td>
- </tr>
- <tr>
- <td>HumanEval 64 Instruct pass@16</td>
- <td>88.71</td>
- <td>90.11</td>
- <td>101.58%</td>
- </tr>
- <tr>
- <td>HumanEval 64 Instruct pass@32</td>
- <td>89.38</td>
- <td>90.91</td>
- <td>101.71%</td>
- </tr>
- <tr>
- <td>HumanEval 64 Instruct pass@64</td>
- <td>89.63</td>
- <td>91.46</td>
- <td>102.04%</td>
+ <td rowspan="1"><b>Coding</b></td>
+ <td>HumanEval_64 pass@2</td>
+ <td>83.83</td>
+ <td>84.81</td>
+ <td>101.17</td>
  </tr>
  </tbody>
  </table>
 
 
+
  ### Reproduction
 
  The results were obtained using the following commands:
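
The commands themselves are unchanged by this commit and sit outside the diff hunk above. Purely as an illustration (not the card's actual commands), a single task from the updated table could be scored with lm-evaluation-harness on a vLLM backend roughly as follows; the repository id, tensor parallelism, and context length are assumptions, and the llama-variant task names such as mmlu_llama require a harness version that provides them.

```bash
# Illustrative sketch only; not the reproduction commands referenced above.
# Assumptions: repo id under RedHatAI, 2-way tensor parallelism, 4k context.
lm_eval \
  --model vllm \
  --model_args pretrained="RedHatAI/Llama-4-Scout-17B-16E-Instruct-NVFP4",dtype=auto,max_model_len=4096,tensor_parallel_size=2 \
  --tasks mmlu_llama \
  --apply_chat_template \
  --batch_size auto
```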
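
In both versions of the table, Recovery is the quantized model's score expressed as a percentage of the unquantized baseline; for example, the updated mmlu_llama row gives 79.11 / 81.06 × 100 ≈ 97.59.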