Upload from GitHub Actions: Add math benchmarks
- datasets.json +44 -43
- evals/backend.py +1 -0
- evals/datasets_/mgsm.py +45 -0
- evals/main.py +7 -4
- evals/tasks.py +41 -0
- frontend/src/components/ModelTable.js +0 -1
- frontend/src/components/ScoreColumns.js +13 -2
- results.json +0 -0
datasets.json
CHANGED
@@ -249,6 +249,50 @@
     "implemented": false,
     "group": "Multitask Language Understanding"
   },
+  {
+    "name": "MGSM",
+    "author": "Google",
+    "author_url": "https://google.com",
+    "url": "https://huggingface.co/datasets/juletxara/mgsm",
+    "n_languages": 10,
+    "tasks": [
+      "math"
+    ],
+    "parallel": true,
+    "base": "MGSM",
+    "implemented": true,
+    "group": "Grade School Math"
+  },
+  {
+    "name": "AfriMGSM",
+    "author": "Masakhane",
+    "author_url": "https://www.masakhane.io",
+    "url": "https://huggingface.co/datasets/masakhane/afrimgsm",
+    "n_languages": 18,
+    "tasks": [
+      "math"
+    ],
+    "parallel": true,
+    "translation": "human",
+    "base": "MGSM",
+    "implemented": true,
+    "group": "Grade School Math"
+  },
+  {
+    "name": "GSM8K-X",
+    "author": "OpenGPT-X",
+    "author_url": "https://opengpt-x.de",
+    "url": "https://huggingface.co/datasets/openGPT-X/gsm8kx",
+    "n_languages": 20,
+    "tasks": [
+      "math"
+    ],
+    "parallel": true,
+    "translation": "machine",
+    "base": "MGSM",
+    "implemented": true,
+    "group": "Grade School Math"
+  },
   {
     "name": "FLEURS",
     "author": "Meta",
@@ -477,49 +521,6 @@
     "implemented": false,
     "group": "Adversarial Language Modelling"
   },
-  {
-    "name": "MGSM",
-    "author": "Google",
-    "author_url": "https://google.com",
-    "url": "https://huggingface.co/datasets/juletxara/mgsm",
-    "n_languages": 10,
-    "tasks": [
-      "math"
-    ],
-    "parallel": true,
-    "base": "MGSM",
-    "group": "Grade School Math"
-  },
-  {
-    "name": "AfriMGSM",
-    "author": "Masakhane",
-    "author_url": "https://www.masakhane.io",
-    "url": "https://huggingface.co/datasets/masakhane/afrimgsm",
-    "n_languages": 18,
-    "tasks": [
-      "math"
-    ],
-    "parallel": true,
-    "translation": "human",
-    "base": "MGSM",
-    "implemented": false,
-    "group": "Grade School Math"
-  },
-  {
-    "name": "GSM8K-X",
-    "author": "OpenGPT-X",
-    "author_url": "https://opengpt-x.de",
-    "url": "https://huggingface.co/datasets/openGPT-X/gsm8kx",
-    "n_languages": 20,
-    "tasks": [
-      "math"
-    ],
-    "parallel": true,
-    "translation": "machine",
-    "base": "MGSM",
-    "implemented": false,
-    "group": "Grade School Math"
-  },
   {
     "name": "WikiANN / PAN-X",
     "author": "Academic",
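In short, the three Grade School Math entries move up in the file and are flagged "implemented": true. A minimal sketch, assuming only the keys visible in this diff, of how a consumer could pick out the now-active math benchmarks:

import json

# Minimal sketch: list the math benchmarks that this commit marks as
# implemented. Assumes datasets.json is a top-level array of objects
# with the "tasks" and "implemented" keys shown above.
with open("datasets.json") as f:
    datasets = json.load(f)

math_datasets = [
    d for d in datasets
    if "math" in d.get("tasks", []) and d.get("implemented")
]
print([d["name"] for d in math_datasets])  # ['MGSM', 'AfriMGSM', 'GSM8K-X']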
evals/backend.py
CHANGED
@@ -25,6 +25,7 @@ task_metrics = [
     "translation_to_bleu",
     "classification_accuracy",
     "mmlu_accuracy",
+    "mgsm_accuracy",
 ]
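The new metric id follows the "<task>_<metric>" pattern of its neighbors. A hypothetical sketch of how such a column could be derived from results.json rows; the actual aggregation logic in backend.py is not part of this diff:

import pandas as pd

# Hypothetical sketch: build "<task>_<metric>" score columns from
# results.json rows ({"model", "bcp_47", "task", "metric", "score", ...}).
results = pd.read_json("results.json")
results["task_metric"] = results["task"] + "_" + results["metric"]
scores = results.pivot_table(
    index="model", columns="task_metric", values="score", aggfunc="mean"
)
print(scores["mgsm_accuracy"].sort_values(ascending=False).head())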
evals/datasets_/mgsm.py
ADDED
@@ -0,0 +1,45 @@
+from datasets_.util import _get_dataset_config_names, _load_dataset
+from langcodes import Language, standardize_tag
+
+slug_mgsm = "juletxara/mgsm"
+tags_mgsm = {
+    standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_mgsm)
+}
+slug_afrimgsm = "masakhane/afrimgsm"
+tags_afrimgsm = {
+    standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_afrimgsm)
+}
+slug_gsm8kx = "Eurolingua/gsm8kx"
+tags_gsm8kx = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names(slug_gsm8kx, trust_remote_code=True)
+}
+
+def parse_number(i):
+    if isinstance(i, int):
+        return i
+    try:
+        return int(i.replace(",", "").replace(".", ""))
+    except ValueError:
+        return None
+
+def load_mgsm(language_bcp_47, nr):
+    if language_bcp_47 in tags_mgsm.keys():
+        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
+        return slug_mgsm, ds[nr]
+    elif language_bcp_47 in tags_afrimgsm.keys():
+        ds = _load_dataset(
+            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test"
+        )
+        return slug_afrimgsm, ds[nr]
+    elif language_bcp_47 in tags_gsm8kx.keys():
+        row = _load_dataset(
+            slug_gsm8kx,
+            subset=tags_gsm8kx[language_bcp_47],
+            split="test",
+            trust_remote_code=True,
+        )[nr]
+        row["answer_number"] = row["answer"].split("####")[1].strip()
+        return slug_gsm8kx, row
+    else:
+        return None, None
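A usage sketch for the module above. Note that the three _get_dataset_config_names calls run at import time, so the tag tables are built once per process; the sample language "en" and the exact row contents are assumptions based on the code, not a documented API:

from datasets_.mgsm import load_mgsm, parse_number

# Lookup order is MGSM, then AfriMGSM, then GSM8K-X; languages covered by
# none of the three yield (None, None).
slug, row = load_mgsm("en", 0)  # e.g. ("juletxara/mgsm", {"question": ..., "answer_number": ...})
slug, row = load_mgsm("xx", 0)  # (None, None)

# parse_number strips "," and "." as digit separators, so "1,000" -> 1000;
# a decimal like "1.5" would become 15, and non-numeric text returns None.
assert parse_number("1,000") == 1000
assert parse_number("seven") is None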
evals/main.py
CHANGED
@@ -16,12 +16,9 @@ n_models = 35
 
 
 async def evaluate():
-    # save up-to-date info on models and languages
-    args = dict(orient="records", indent=2, force_ascii=False)
-    pd.DataFrame(models).to_json("models.json", **args)
-    pd.DataFrame(languages).to_json("languages.json", **args)
     print("running evaluations")
     old_results = pd.read_json("results.json")
+    old_models = pd.read_json("models.json")
     # get all combinations of model, language and task
     combis = [
         (model, lang.bcp_47, task_name)
@@ -41,6 +38,7 @@ async def evaluate():
     ]
     results = await tqdm_asyncio.gather(*results, miniters=1)
     results = [r for group in results for r in group]
+    args = dict(orient="records", indent=2, force_ascii=False)
     if results:
         # aggregate results
         results = pd.DataFrame(results)
@@ -53,6 +51,11 @@ async def evaluate():
         results = pd.concat([old_results, results])
         results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
         results.to_json("results.json", **args)
+        # save up-to-date info on models and languages
+        all_models = pd.concat([old_models, pd.DataFrame(models)])
+        all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
+        all_models.to_json("models.json", **args)
+        pd.DataFrame(languages).to_json("languages.json", **args)
 
 
 if __name__ == "__main__":
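Because old_models comes first in the pd.concat and drop_duplicates keeps the first occurrence, an entry already in models.json wins over a freshly generated one with the same id. A toy illustration of that pandas behavior, with made-up ids rather than the project's data:

import pandas as pd

# drop_duplicates keeps the first occurrence per id, so the old entry wins.
old_models = pd.DataFrame([{"id": "org/model-a", "name": "A (old)"}])
new_models = pd.DataFrame([
    {"id": "org/model-a", "name": "A (regenerated)"},
    {"id": "org/model-b", "name": "B"},
])
merged = pd.concat([old_models, new_models])
merged = merged.drop_duplicates(subset=["id"]).sort_values(by=["id"])
print(merged)  # keeps "A (old)" and appends "B"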
evals/tasks.py
CHANGED
@@ -1,10 +1,12 @@
 import random
 from functools import partial
+from textwrap import dedent
 
 import evaluate
 import pandas as pd
 import sentencepiece as spm
 from datasets_.flores import flores_sentences
+from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from languages import languages, script_name
 from models import complete, transcribe
@@ -247,6 +249,44 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
     ]
 
 
+async def mgsm_and_evaluate(model, language_bcp_47, nr):
+    system_prompt = """
+    Solve the math problem. Use reasoning, and finally give the answer as a number.
+    Response format: <reasoning> #### <number>
+    """
+    system_prompt = dedent(system_prompt).strip()
+    ds_slug, question = load_mgsm(language_bcp_47, nr)
+    if not question:
+        return []
+    response = await complete(
+        model=model,
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": question["question"]},
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
+    number = response.split("####")
+    if len(number) == 2:
+        accuracy = int(
+            parse_number(number[1].strip()) == parse_number(question["answer_number"])
+        )
+    else:
+        accuracy = 0
+
+    return [
+        {
+            "model": model,
+            "bcp_47": language_bcp_47,
+            "task": "mgsm",
+            "metric": "accuracy",
+            "score": accuracy,
+            "sentence_nr": nr,
+        }
+    ]
+
+
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
     fleurs = pd.read_csv(
@@ -284,5 +324,6 @@ tasks = {
     "classification": classify_and_evaluate,
     # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
+    "mgsm": mgsm_and_evaluate,
     # "asr": transcribe_and_evaluate,
 }
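The scoring rule in mgsm_and_evaluate is strict: the response must contain exactly one "####" separator, and the parsed number after it must equal the parsed reference; anything else scores 0. A sketch of that rule as a pure function, factored out here only for illustration (tasks.py keeps it inline):

from datasets_.mgsm import parse_number

# Sketch of the answer-checking rule used above.
def score(response: str, answer_number) -> int:
    parts = response.split("####")
    if len(parts) != 2:
        return 0
    return int(parse_number(parts[1].strip()) == parse_number(answer_number))

assert score("3 apples + 4 apples = 7 apples #### 7", "7") == 1
assert score("#### 7 #### 7", "7") == 0  # more than one separator scores 0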
frontend/src/components/ModelTable.js
CHANGED
@@ -5,7 +5,6 @@ import { MultiSelect } from 'primereact/multiselect'
 import { useState, useEffect } from 'react'
 import Medal from './Medal'
 import { Slider } from 'primereact/slider'
-import ScoreField from './ScoreField'
 import ScoreColumns from './ScoreColumns'
 const ModelTable = ({ data }) => {
   const [filters, setFilters] = useState({
frontend/src/components/ScoreColumns.js
CHANGED
@@ -64,7 +64,7 @@ const ScoreColumns = [
   // />,
   <Column
     field='mmlu_accuracy'
-    header='
+    header='Q&A'
     headerTooltip='Question Answering performance (accuracy on a sample of multilingual versions of the MMLU benchmark)'
     sortable
     body={scoreBodyTemplate('mmlu_accuracy', {
@@ -72,7 +72,18 @@ const ScoreColumns = [
       maxScore: 1
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
-  />
+  />,
+  <Column
+    field='mgsm_accuracy'
+    header='Math'
+    headerTooltip='Math Problem Solving performance (accuracy on a sample of the MGSM benchmark)'
+    sortable
+    body={scoreBodyTemplate('mgsm_accuracy', {
+      minScore: 0,
+      maxScore: 1
+    })}
+    style={{ minWidth: '5rem', maxWidth: '10rem' }}
+  />,
 ]
 
 export default ScoreColumns
results.json
CHANGED
The diff for this file is too large to render. See raw diff.