Commit d01d881
Parent(s): e1f29ca
update

Files changed:
- app.py +2 -2
- beta-cli.py +3 -3
- src/display/utils.py +7 -6
- src/leaderboard/read_evals.py +4 -0
app.py CHANGED
@@ -28,8 +28,8 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-from src.submission.check_validity import already_submitted_models
-from src.tools.collections import update_collections
+# from src.submission.check_validity import already_submitted_models
+# from src.tools.collections import update_collections
 from src.tools.plots import (
     create_metric_plot_obj,
     create_plot_df,
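The app.py hunk disables the validity-check and collection-update imports by commenting them out. If app.py still references already_submitted_models or update_collections further down, those names are now undefined and will raise NameError at runtime; one defensive alternative is to replace the imports with no-op stubs. A minimal, purely illustrative sketch (the stub signatures and return values are assumptions, not code from this commit):

# Illustrative only: define no-op stand-ins instead of commenting the
# imports out, so any leftover call sites in app.py keep working.
# from src.submission.check_validity import already_submitted_models
# from src.tools.collections import update_collections

def already_submitted_models(*args, **kwargs):
    return None  # placeholder return value; adjust to the real shape

def update_collections(*args, **kwargs):
    return None  # no-op: skip updating the hub collections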
beta-cli.py CHANGED
@@ -10,7 +10,7 @@ snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="
 raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
 
 for entry in raw_data:
-    if '125m' in entry.eval_name:
-        print(entry)
+    # if '125m' in entry.eval_name:
+    print(entry)
 
-# print(raw_data)
+# print(raw_data)
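The beta-cli.py change just widens the debug output: rather than printing only entries whose eval_name mentions '125m', the loop now prints every parsed entry. If both behaviours are useful, an optional substring filter keeps them side by side; the --filter flag below is an illustrative assumption, not part of this commit, and raw_data is the value returned by the get_raw_eval_results call shown above.

import argparse

# Illustrative only: optional filter on the debug print, defaulting to
# "print everything" (an empty substring matches every eval_name).
parser = argparse.ArgumentParser()
parser.add_argument("--filter", default="",
                    help="only print entries whose eval_name contains this substring")
args = parser.parse_args()

for entry in raw_data:
    if args.filter in entry.eval_name:
        print(entry)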
src/display/utils.py CHANGED
@@ -18,9 +18,10 @@ class Tasks(Enum):
     hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
     mmlu = Task("hendrycksTest", "acc", "MMLU")
     truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
-    winogrande = Task("winogrande", "acc", "Winogrande")
-    gsm8k = Task("gsm8k", "acc", "GSM8K")
+    # winogrande = Task("winogrande", "acc", "Winogrande")
+    # gsm8k = Task("gsm8k", "acc", "GSM8K")
     drop = Task("drop", "f1", "DROP")
+    nqopen = Task("nq_open", "em", "NQ Open")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -77,8 +78,8 @@ baseline_row = {
     AutoEvalColumn.hellaswag.name: 25.0,
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
-    AutoEvalColumn.winogrande.name: 50.0,
-    AutoEvalColumn.gsm8k.name: 0.21,
+    # AutoEvalColumn.winogrande.name: 50.0,
+    # AutoEvalColumn.gsm8k.name: 0.21,
     AutoEvalColumn.drop.name: 0.47,
     AutoEvalColumn.dummy.name: "baseline",
     AutoEvalColumn.model_type.name: "",
@@ -102,8 +103,8 @@ human_baseline_row = {
     AutoEvalColumn.hellaswag.name: 95.0,
     AutoEvalColumn.mmlu.name: 89.8,
     AutoEvalColumn.truthfulqa.name: 94.0,
-    AutoEvalColumn.winogrande.name: 94.0,
-    AutoEvalColumn.gsm8k.name: 100,
+    # AutoEvalColumn.winogrande.name: 94.0,
+    # AutoEvalColumn.gsm8k.name: 100,
     AutoEvalColumn.drop.name: 96.42,
     AutoEvalColumn.dummy.name: "human_baseline",
     AutoEvalColumn.model_type.name: "",
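The utils.py hunk is where the benchmark set itself changes: Winogrande and GSM8K are commented out of the Tasks enum and of both baseline rows, and NQ Open is added with the "nq_open" benchmark key and the "em" metric. As a rough sketch of how such an entry is consumed, assuming Task is a small record of (benchmark, metric, col_name) as the constructor calls above suggest; the dataclass below is illustrative, not this repo's actual definition.

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # key expected in the result files, e.g. "nq_open"
    metric: str      # metric read for that benchmark, e.g. "em"
    col_name: str    # user-facing column label, e.g. "NQ Open"

class Tasks(Enum):
    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
    mmlu = Task("hendrycksTest", "acc", "MMLU")
    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
    drop = Task("drop", "f1", "DROP")
    nqopen = Task("nq_open", "em", "NQ Open")

# Code that iterates `for task in Tasks:` (such as the scoring loop in
# src/leaderboard/read_evals.py) now picks up nq_open automatically and
# no longer expects winogrande or gsm8k scores.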
src/leaderboard/read_evals.py CHANGED
@@ -96,6 +96,10 @@ class EvalResult:
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
 
+        # XXX
+        if 'nq_open' not in results:
+            results['nq_open'] = 0.0
+
         return self(
             eval_name=result_key,
             full_model=full_model,
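The read_evals.py hunk guards against result files that were evaluated before NQ Open was added: if a parsed results dict has no 'nq_open' key, it is filled with 0.0 before the EvalResult is built, presumably so older files still produce a complete row. The same idea generalised to any missing benchmark, as a self-contained sketch (the helper name and the 0.0 default are assumptions, not code from this commit):

# Illustrative helper: backfill a default score for any benchmark that a
# (possibly older) results file does not contain.
def fill_missing_benchmarks(results, expected_benchmarks, default=0.0):
    for benchmark in expected_benchmarks:
        results.setdefault(benchmark, default)
    return results

# Example: a result file written before nq_open existed.
row = fill_missing_benchmarks(
    {"hellaswag": 81.2, "drop": 45.3},
    ["hellaswag", "drop", "nq_open"],
)
assert row["nq_open"] == 0.0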