alexmarques committed on
Commit
3bceedd
·
verified ·
1 Parent(s): 2962787

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +21 -21
README.md CHANGED
@@ -255,71 +255,71 @@ The model was evaluated on the OpenLLM leaderboard tasks (versions 1 and 2), usi
255
  </td>
256
  <td>MMLU (5-shot)
257
  </td>
258
- <td>80.96
259
  </td>
260
- <td>80.36
261
  </td>
262
- <td>99.3%
263
  </td>
264
  </tr>
265
  <tr>
266
  <td>ARC Challenge (25-shot)
267
  </td>
268
- <td>69.03
269
  </td>
270
- <td>68.69
271
  </td>
272
- <td>99.5%
273
  </td>
274
  </tr>
275
  <tr>
276
  <td>GSM-8K (5-shot, strict-match)
277
  </td>
278
- <td>87.64
279
  </td>
280
- <td>85.97
281
  </td>
282
- <td>98.1%
283
  </td>
284
  </tr>
285
  <tr>
286
  <td>Hellaswag (10-shot)
287
  </td>
288
- <td>71.10
289
  </td>
290
- <td>71.18
291
  </td>
292
- <td>100.1%
293
  </td>
294
  </tr>
295
  <tr>
296
  <td>Winogrande (5-shot)
297
  </td>
298
- <td>69.77
299
  </td>
300
- <td>70.90
301
  </td>
302
- <td>100.5%
303
  </td>
304
  </tr>
305
  <tr>
306
  <td>TruthfulQA (0-shot, mc2)
307
  </td>
308
- <td>58.63
309
  </td>
310
- <td>58.86
311
  </td>
312
- <td>100.4%
313
  </td>
314
  </tr>
315
  <tr>
316
  <td><strong>Average</strong>
317
  </td>
318
- <td><strong>72.86</strong>
319
  </td>
320
- <td><strong>72.52</strong>
321
  </td>
322
- <td><strong>99.6%</strong>
323
  </td>
324
  </tr>
325
  <tr>
 
255
  </td>
256
  <td>MMLU (5-shot)
257
  </td>
258
+ <td>42.82
259
  </td>
260
+ <td>39.80
261
  </td>
262
+ <td>93.0%
263
  </td>
264
  </tr>
265
  <tr>
266
  <td>ARC Challenge (25-shot)
267
  </td>
268
+ <td>32.85
269
  </td>
270
+ <td>30.72
271
  </td>
272
+ <td>93.5%
273
  </td>
274
  </tr>
275
  <tr>
276
  <td>GSM-8K (5-shot, strict-match)
277
  </td>
278
+ <td>1.82
279
  </td>
280
+ <td>2.20
281
  </td>
282
+ <td>---
283
  </td>
284
  </tr>
285
  <tr>
286
  <td>Hellaswag (10-shot)
287
  </td>
288
+ <td>43.04
289
  </td>
290
+ <td>41.02
291
  </td>
292
+ <td>95.3%
293
  </td>
294
  </tr>
295
  <tr>
296
  <td>Winogrande (5-shot)
297
  </td>
298
+ <td>54.54
299
  </td>
300
+ <td>54.62
301
  </td>
302
+ <td>100.1%
303
  </td>
304
  </tr>
305
  <tr>
306
  <td>TruthfulQA (0-shot, mc2)
307
  </td>
308
+ <td>51.61
309
  </td>
310
+ <td>48.77
311
  </td>
312
+ <td>94.5%
313
  </td>
314
  </tr>
315
  <tr>
316
  <td><strong>Average</strong>
317
  </td>
318
+ <td><strong>37.78</strong>
319
  </td>
320
+ <td><strong>36.19</strong>
321
  </td>
322
+ <td><strong>95.8%</strong>
323
  </td>
324
  </tr>
325
  <tr>