.gitignore CHANGED
@@ -5,7 +5,6 @@ __pycache__/
5
  .ipynb_checkpoints
6
  *ipynb
7
  .vscode/
8
- .idea/
9
 
10
  eval-queue/
11
  eval-results/
 
5
  .ipynb_checkpoints
6
  *ipynb
7
  .vscode/
 
8
 
9
  eval-queue/
10
  eval-results/
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Eval Leaderboard
3
  emoji: 🥇
4
  colorFrom: green
5
  colorTo: indigo
 
1
  ---
2
+ title: Leaderboard Test
3
  emoji: 🥇
4
  colorFrom: green
5
  colorTo: indigo
app.py CHANGED
@@ -1,121 +1,103 @@
1
  import gradio as gr
 
2
  import pandas as pd
3
- import json
 
4
 
5
  from src.about import (
6
- REPRODUCIBILITY_TEXT,
 
 
7
  INTRODUCTION_TEXT,
8
- ABOUT_TEXT,
9
  TITLE,
10
  )
11
- from src.display.css_html_js import custom_css, custom_js
12
- from src.display.formatting import make_clickable_field
13
-
14
- def build_leaderboard(type):
15
- with open('data/results.json', 'r') as f:
16
- results = json.load(f)
17
-
18
- with open('data/tasks.json', 'r') as f:
19
- tasks = json.load(f)
20
-
21
- # Filter tasks based on type
22
- filtered_tasks = {k: v for k, v in tasks.items() if v['type'] == type}
23
-
24
- data = []
25
- for model_name, model_data in results.items():
26
- # For agentic type, skip models that have all null values for agentic tasks
27
- if type == "agentic":
28
- has_agentic_results = any(
29
- model_data['results'].get(task, {}).get(tasks[task]['metric']) is not None
30
- for task in filtered_tasks
31
- )
32
- if not has_agentic_results:
33
- continue
34
-
35
- model_sha = model_data["config"]["model_sha"]
36
- model_name = model_data["config"]["model_name"]
37
- row = {
38
- 'Model': make_clickable_field(model_name, model_sha)
39
- }
40
-
41
- for dataset, metrics in model_data['results'].items():
42
- # Only include metrics for tasks of the specified type
43
- if dataset in filtered_tasks:
44
- value = next(iter(metrics.values()))
45
- log_url = metrics.get('log_url')
46
- # Use display name from tasks.json instead of raw dataset name
47
- display_name = filtered_tasks[dataset]['display_name']
48
- # Round non-null values to 2 decimal places and make clickable if log_url exists
49
- if value is not None:
50
- value = round(value*100, 2)
51
- if log_url:
52
- value = make_clickable_field(value, log_url)
53
- row[display_name] = value
54
- data.append(row)
55
-
56
- results_df = pd.DataFrame(data)
57
-
58
- # Round all numeric columns to 2 decimal places
59
- numeric_cols = results_df.select_dtypes(include=['float64', 'float32']).columns
60
- results_df[numeric_cols] = results_df[numeric_cols].round(2)
61
 
62
- # Fill null values with "-"
63
- results_df = results_df.fillna("--")
64
 
65
- if type == "agentic":
66
- # Include agent column as second column after Model
67
- results_df.insert(1, 'Agent', make_clickable_field('Basic Agent', 'https://inspect.ai-safety-institute.org.uk/agents.html#sec-basic-agent'))
 
 
 
 
 
 
 
 
 
68
 
 
 
69
  return gr.components.Dataframe(
70
- value=results_df,
71
- datatype=["html" for _ in results_df.columns],
72
- column_widths=["250px" if c == "Model" else "150px" for c in results_df.columns],
73
  wrap=False,
74
  )
75
 
76
 
77
- black_logo_path = "src/assets/logo-icon-black.png"
78
- white_logo_path = "src/assets/logo-icon-white.png"
79
-
80
- demo = gr.Blocks(
81
- css=custom_css,
82
- js=custom_js,
83
- theme=gr.themes.Default(primary_hue=gr.themes.colors.pink),
84
- fill_height=True,
85
- fill_width=True,
86
- )
87
  with demo:
88
  gr.HTML(f"""
89
- <div id="page-header">
90
- <div id="header-container">
91
- <div id="left-container">
92
- <img id="black-logo" src="/gradio_api/file={black_logo_path}">
93
- <img id="white-logo" src="/gradio_api/file={white_logo_path}">
94
- </div>
95
- <div id="centre-container">
96
- <h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
97
- <p style="color:#eb088a; margin:0; font-size:1.2rem;">Explore Interactive Results &amp; Traces</p>
98
- </div>
99
- <div id="right-container">
100
- </div>
101
- </div>
102
  </div>
103
  """)
104
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="intro-text", sanitize_html=False)
105
-
106
- with gr.Tabs(elem_classes=["leaderboard-table", "tab-buttons"]) as tabs:
107
- with gr.TabItem("Base Benchmarks", elem_classes="llm-benchmark-tab-table", id=0):
108
- build_leaderboard("base")
109
 
110
- with gr.TabItem("Agentic Benchmarks", elem_classes="llm-benchmark-tab-table", id=1):
111
- build_leaderboard("agentic")
 
112
 
113
- with gr.TabItem("About", elem_classes="llm-benchmark-tab-table", id=2):
114
- gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text", sanitize_html=False)
115
 
116
- # with gr.TabItem("Reproducibility", elem_classes="llm-benchmark-tab-table", id=3):
117
- # gr.Markdown(REPRODUCIBILITY_TEXT, elem_classes="markdown-text", sanitize_html=False)
118
 
119
- assets = [black_logo_path, white_logo_path]
120
- demo.launch(allowed_paths=assets)
121
 
 
 
 
 
 
1
  import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
 
7
  from src.about import (
8
+ CITATION_BUTTON_LABEL,
9
+ CITATION_BUTTON_TEXT,
10
+ EVALUATION_QUEUE_TEXT,
11
  INTRODUCTION_TEXT,
12
+ LLM_BENCHMARKS_TEXT,
13
  TITLE,
14
  )
15
+ from src.display.css_html_js import custom_css
16
+ from src.display.utils import (
17
+ COLS,
18
+ ST_BENCHMARK_COLS,
19
+ AGENTIC_BENCHMARK_COLS,
20
+ EVAL_COLS,
21
+ EVAL_TYPES,
22
+ AutoEvalColumn,
23
+ ModelType,
24
+ fields,
25
+ WeightType,
26
+ Precision
27
+ )
28
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
29
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
30
+ from src.submission.submit import add_new_eval
31
+
32
+
33
+ def restart_space():
34
+ API.restart_space(repo_id=REPO_ID)
35
+
36
+ ### Space initialisation
37
+ try:
38
+ print(EVAL_REQUESTS_PATH)
39
+ snapshot_download(
40
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
41
+ )
42
+ except Exception:
43
+ restart_space()
44
+ try:
45
+ print(EVAL_RESULTS_PATH)
46
+ snapshot_download(
47
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
48
+ )
49
+ except Exception:
50
+ restart_space()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
 
 
52
 
53
+ ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS)
54
+ AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS)
55
+
56
+ (
57
+ finished_eval_queue_df,
58
+ running_eval_queue_df,
59
+ pending_eval_queue_df,
60
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
61
+
62
+ def init_leaderboard(dataframe, benchmark_type):
63
+ if dataframe is None or dataframe.empty:
64
+ raise ValueError("Leaderboard DataFrame is empty or None.")
65
 
66
+ AutoEvalColumnSubset = [c for c in fields(AutoEvalColumn) if ((c.name=="Model") or (TASK_NAME_INVERSE_MAP.get(c.name, dict()).get("type", "")==benchmark_type))]
67
+
68
  return gr.components.Dataframe(
69
+ value=dataframe,
70
+ datatype=[c.type for c in AutoEvalColumnSubset],
71
+ column_widths=["150px" if c.name != "Model" else "250px" for c in AutoEvalColumnSubset],
72
  wrap=False,
73
  )
74
 
75
 
76
+ demo = gr.Blocks(css=custom_css)
 
 
 
 
 
 
 
 
 
77
  with demo:
78
  gr.HTML(f"""
79
+ <div style="text-align:center; margin-bottom:1rem;">
80
+ <h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
81
+ <p style="color:#eb088a; margin:0; font-size:1.2rem;">Performance Insights &amp; Comparison</p>
 
 
 
 
 
 
 
 
 
 
82
  </div>
83
  """)
84
+ # gr.HTML(TITLE)
85
+ with gr.Group(elem_classes="intro-block"):
86
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
87
+ # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
88
 
89
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
90
+ with gr.TabItem("Base Benchmark", elem_id="llm-benchmark-tab-table", id=0):
91
+ leaderboard = init_leaderboard(ST_LEADERBOARD_DF, "base")
92
 
93
+ with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
94
+ leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF, "agentic")
95
 
96
+ with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
97
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
98
 
 
 
99
 
100
+ scheduler = BackgroundScheduler()
101
+ scheduler.add_job(restart_space, "interval", seconds=1800)
102
+ scheduler.start()
103
+ demo.queue(default_concurrency_limit=40).launch()
create_log_file_map.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ from collections import defaultdict
5
+
6
+
7
+ def main():
8
+
9
+ base_bm_input_path = "./base_benchmarking_logs"
10
+ agentic_bm_input_path = "./agentic_benchmarking_logs"
11
+
12
+ log_file_map = defaultdict()
13
+
14
+ for model_name in os.listdir(base_bm_input_path):
15
+ log_file_map[model_name] = defaultdict(str)
16
+ if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
17
+ for task_log_file in os.listdir(os.path.join(base_bm_input_path, model_name)):
18
+ with open(os.path.join(base_bm_input_path, model_name, task_log_file), "r") as f:
19
+ result = json.load(f)
20
+ task_name = result["eval"]["task"].split("/")[-1]
21
+ log_file_map[model_name][task_name] = task_log_file
22
+
23
+ for model_name in os.listdir(agentic_bm_input_path):
24
+ if os.path.isdir(os.path.join(agentic_bm_input_path, model_name)):
25
+ for task_log_file in os.listdir(os.path.join(agentic_bm_input_path, model_name)):
26
+ with open(os.path.join(agentic_bm_input_path, model_name, task_log_file), "r") as f:
27
+ result = json.load(f)
28
+ task_name = result["eval"]["task"].split("/")[-1]
29
+ log_file_map[model_name][task_name] = task_log_file
30
+
31
+ with open("./inspect_log_file_names.json", "w") as f:
32
+ json.dump(log_file_map, f, indent=4)
33
+
34
+
35
+ if __name__ == "__main__":
36
+ main()
data/results.json DELETED
@@ -1,948 +0,0 @@
1
- {
2
- "DeepSeek-R1": {
3
- "config": {
4
- "model_name": "DeepSeek-R1",
5
- "model_sha": "https://api-docs.deepseek.com/news/news250120",
6
- "model_dtype": "torch.float16"
7
- },
8
- "results": {
9
- "mmlu_pro": {
10
- "accuracy": 0.8382646276595744,
11
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-12T11-02-35-05-00_mmlu-pro_BhD89DYN9KM3k4weSDfaQK.eval"
12
- },
13
- "humaneval": {
14
- "mean": 0.9567901234567902,
15
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-03T11-45-22-05-00_humaneval_hnkHWYqrb5HxiBt2CWzCnq.eval"
16
- },
17
- "math": {
18
- "accuracy": 0.9272,
19
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-11T11-38-10-05-00_math_ZYFSqsWsmP5kLRLHEMWULU.eval"
20
- },
21
- "gsm8k": {
22
- "accuracy": 0.954510993176649,
23
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-02T16-28-05-05-00_gsm8k_YMw6WiZkgTBQ54z5UHtDDX.eval"
24
- },
25
- "arc_challenge": {
26
- "accuracy": 0.9667235494880546,
27
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-01-30T15-42-39-05-00_arc-challenge_CviW9ro6rKBbctkwJzQstp.eval"
28
- },
29
- "winogrande": {
30
- "accuracy": 0.9179163378058406,
31
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-04T00-25-12-05-00_winogrande_NPgTbtqom2QSPKxeThWrdZ.eval"
32
- },
33
- "arc_easy": {
34
- "accuracy": 0.9873737373737373,
35
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.eval"
36
- },
37
- "gpqa_diamond": {
38
- "accuracy": 0.7045454545454546,
39
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/DeepSeek-R1/index.html?log_file=logs/logs/2025-02-11T11-37-45-05-00_gpqa-diamond_MwnVeLwyuiEAALr3M5q3dn.eval"
40
- },
41
- "drop": {
42
- "mean": null,
43
- "log_url": null
44
- },
45
- "hellaswag": {
46
- "accuracy": null,
47
- "log_url": null
48
- },
49
- "ifeval": {
50
- "final_acc": null,
51
- "log_url": null
52
- },
53
- "mmlu": {
54
- "accuracy": null,
55
- "log_url": null
56
- },
57
- "mmmu_multiple_choice": {
58
- "accuracy": null,
59
- "log_url": null
60
- },
61
- "mmmu_open": {
62
- "accuracy": null,
63
- "log_url": null
64
- },
65
- "gaia": {
66
- "accuracy": null,
67
- "log_url": null
68
- },
69
- "gdm_intercode_ctf": {
70
- "accuracy": null,
71
- "log_url": null
72
- },
73
- "gdm_in_house_ctf": {
74
- "accuracy": null,
75
- "log_url": null
76
- },
77
- "agentharm": {
78
- "avg_score": null,
79
- "log_url": null
80
- },
81
- "agentharm_benign": {
82
- "avg_score": null,
83
- "log_url": null
84
- },
85
- "swe_bench": {
86
- "mean": null,
87
- "log_url": null
88
- }
89
- }
90
- },
91
- "Meta-Llama-3.1-70B-Instruct": {
92
- "config": {
93
- "model_name": "Meta-Llama-3.1-70B-Instruct",
94
- "model_sha": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
95
- "model_dtype": "torch.float16"
96
- },
97
- "results": {
98
- "hellaswag": {
99
- "accuracy": 0.869946225851424,
100
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-45-54-04-00_hellaswag_BKfQG9yGAr383MGnooMLBH.eval"
101
- },
102
- "drop": {
103
- "mean": 0.8811263765076035,
104
- "log_url": "https://storage.googleapis.com/inspect-evals/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T21-01-02-04-00_drop_LzAWvLWkNrNKu5qf56wXRo.eval"
105
- },
106
- "gpqa_diamond": {
107
- "accuracy": 0.4318181818181818,
108
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.eval"
109
- },
110
- "winogrande": {
111
- "accuracy": 0.8666140489344909,
112
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.eval"
113
- },
114
- "gsm8k": {
115
- "accuracy": 0.9469294920394238,
116
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.eval"
117
- },
118
- "math": {
119
- "accuracy": 0.6004,
120
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.eval"
121
- },
122
- "ifeval": {
123
- "final_acc": 0.8604907201780166,
124
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.eval"
125
- },
126
- "arc_challenge": {
127
- "accuracy": 0.9445392491467577,
128
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.eval"
129
- },
130
- "arc_easy": {
131
- "accuracy": 0.9823232323232324,
132
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.eval"
133
- },
134
- "mmlu_pro": {
135
- "accuracy": 0.6688829787234043,
136
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.eval"
137
- },
138
- "humaneval": {
139
- "mean": 0.7865853658536586,
140
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.eval"
141
- },
142
- "mmlu": {
143
- "accuracy": 0.8033755875231449,
144
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Meta-Llama-3.1-70B-Instruct/index.html?log_file=logs/logs/2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.eval"
145
- },
146
- "mmmu_multiple_choice": {
147
- "accuracy": null,
148
- "log_url": null
149
- },
150
- "mmmu_open": {
151
- "accuracy": null,
152
- "log_url": null
153
- },
154
- "gaia": {
155
- "accuracy": null,
156
- "log_url": null
157
- },
158
- "gdm_intercode_ctf": {
159
- "accuracy": null,
160
- "log_url": null
161
- },
162
- "gdm_in_house_ctf": {
163
- "accuracy": null,
164
- "log_url": null
165
- },
166
- "agentharm": {
167
- "avg_score": null,
168
- "log_url": null
169
- },
170
- "agentharm_benign": {
171
- "avg_score": null,
172
- "log_url": null
173
- },
174
- "swe_bench": {
175
- "mean": null,
176
- "log_url": null
177
- }
178
- }
179
- },
180
- "Mistral-Large-Instruct-2407": {
181
- "config": {
182
- "model_name": "Mistral-Large-Instruct-2407",
183
- "model_sha": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
184
- "model_dtype": "torch.float16"
185
- },
186
- "results": {
187
- "drop": {
188
- "mean": 0.7424257996853698,
189
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-56-12-04-00_drop_NtvuCoU2LoMbH8DztcCTen.eval"
190
- },
191
- "ifeval": {
192
- "final_acc": 0.8285172231900246,
193
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-30-16-04-00_ifeval_TLkvCSFEWo4PLv6hAha7YB.eval"
194
- },
195
- "mmlu": {
196
- "accuracy": 0.8035892323030908,
197
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T07-21-48-04-00_mmlu_YnUhmHoStr3WuJdchWmNPt.eval"
198
- },
199
- "gpqa_diamond": {
200
- "accuracy": 0.4734848484848485,
201
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-22-52-04-00_gpqa-diamond_SuZUZxGdqS2ZecbLRNkKd4.eval"
202
- },
203
- "gsm8k": {
204
- "accuracy": 0.9378316906747536,
205
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-28-49-04-00_gsm8k_5tQp9tbwUMj6NpjNKCAfVm.eval"
206
- },
207
- "math": {
208
- "accuracy": 0.6574,
209
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-33-09-04-00_math_2CmjBedAfUxqvmcHRdBgyB.eval"
210
- },
211
- "arc_easy": {
212
- "accuracy": 0.9852693602693603,
213
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-48-39-04-00_arc-easy_YbfuBT3usZXt2xgZkkR5dq.eval"
214
- },
215
- "mmlu_pro": {
216
- "accuracy": 0.6942320478723404,
217
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T09-41-25-04-00_mmlu-pro_fyYT4aabPesfY5TpzFMPnd.eval"
218
- },
219
- "humaneval": {
220
- "mean": 0.8658536585365854,
221
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T06-29-24-04-00_humaneval_nu8SUSGekKJWB8HLKDigYK.eval"
222
- },
223
- "hellaswag": {
224
- "accuracy": 0.9047998406691894,
225
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T04-50-00-04-00_hellaswag_ZzQoZ6gkRQsTzMhQr7GYNn.eval"
226
- },
227
- "arc_challenge": {
228
- "accuracy": 0.9436860068259386,
229
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T01-54-13-04-00_arc-challenge_WfQRhMkFcywefpU46isBVP.eval"
230
- },
231
- "winogrande": {
232
- "accuracy": 0.8547750591949487,
233
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/Mistral-Large-Instruct-2407/index.html?log_file=logs/logs/2024-10-31T11-57-58-04-00_winogrande_TP3UGwpp37Dyv6ks9Ty5Hk.eval"
234
- },
235
- "mmmu_multiple_choice": {
236
- "accuracy": null,
237
- "log_url": null
238
- },
239
- "mmmu_open": {
240
- "accuracy": null,
241
- "log_url": null
242
- },
243
- "gaia": {
244
- "accuracy": null,
245
- "log_url": null
246
- },
247
- "gdm_intercode_ctf": {
248
- "accuracy": null,
249
- "log_url": null
250
- },
251
- "gdm_in_house_ctf": {
252
- "accuracy": null,
253
- "log_url": null
254
- },
255
- "agentharm": {
256
- "avg_score": null,
257
- "log_url": null
258
- },
259
- "agentharm_benign": {
260
- "avg_score": null,
261
- "log_url": null
262
- },
263
- "swe_bench": {
264
- "mean": null,
265
- "log_url": null
266
- }
267
- }
268
- },
269
- "c4ai-command-r-plus": {
270
- "config": {
271
- "model_name": "Command R+",
272
- "model_sha": "https://huggingface.co/CohereForAI/c4ai-command-r-plus"
273
- },
274
- "results": {
275
- "ifeval": {
276
- "final_acc": 0.7779591483929307,
277
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.eval"
278
- },
279
- "winogrande": {
280
- "accuracy": 0.7490134175217048,
281
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T14-42-18-04-00_winogrande_bY8yg7aRR5dCCK7NDCZEcc.eval"
282
- },
283
- "arc_challenge": {
284
- "accuracy": 0.8506825938566553,
285
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T17-30-03-04-00_arc-challenge_XB7LURXEGaxskWuLtYwdnW.eval"
286
- },
287
- "drop": {
288
- "mean": 0.743557420031463,
289
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T12-06-30-04-00_drop_itY9cLiYAW2BF7NTeDceNd.eval"
290
- },
291
- "math": {
292
- "accuracy": 0.2626,
293
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-26-34-04-00_math_kohBUMpMFuMsR4jz4vUNWM.eval"
294
- },
295
- "gpqa_diamond": {
296
- "accuracy": 0.3194444444444444,
297
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T22-47-45-04-00_gpqa-diamond_JKpb6ya4pec9hh7uovPPCZ.eval"
298
- },
299
- "mmlu_pro": {
300
- "accuracy": 0.441156914893617,
301
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-31T01-11-38-04-00_mmlu-pro_gZVAuy3zMKR23BieM5PqAX.eval"
302
- },
303
- "humaneval": {
304
- "mean": 0.6219512195121951,
305
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T17-22-23-04-00_humaneval_5ByPqUhoofSbKgvsUQNFCX.eval"
306
- },
307
- "gsm8k": {
308
- "accuracy": 0.7816527672479151,
309
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T15-03-35-04-00_gsm8k_QxbfbriJsKGQAg96JyjkoT.eval"
310
- },
311
- "hellaswag": {
312
- "accuracy": 0.7954590718980283,
313
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T15-18-17-04-00_hellaswag_UYyBTR6N8VJnKRmnbCrB8N.eval"
314
- },
315
- "mmlu": {
316
- "accuracy": 0.695128899017234,
317
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-30T21-55-26-04-00_mmlu_JUPPLTzfe3Kme6UuorPTqg.eval"
318
- },
319
- "arc_easy": {
320
- "accuracy": 0.9377104377104377,
321
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/c4ai-command-r-plus/index.html?log_file=logs/logs/2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.eval"
322
- }
323
- }
324
- },
325
- "claude-3-5-sonnet-20241022": {
326
- "config": {
327
- "model_name": "Claude-3.5-Sonnet",
328
- "model_sha": "https://www.anthropic.com/claude/sonnet",
329
- "model_dtype": "torch.float16"
330
- },
331
- "results": {
332
- "mmmu_multiple_choice": {
333
- "accuracy": 0.6481700118063755,
334
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T11-20-03-05-00_mmmu-multiple-choice_CWhKvGdoFo6pdHhDyi9GNm.eval"
335
- },
336
- "mmlu_pro": {
337
- "accuracy": 0.7762632978723404,
338
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T19-01-05-05-00_mmlu-pro_3vi84or97gQupuj5sT6vgZ.eval"
339
- },
340
- "hellaswag": {
341
- "accuracy": 0.9228241386178052,
342
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T15-09-33-05-00_hellaswag_QXqFxojvSToMu8ckHEMLkB.eval"
343
- },
344
- "gpqa_diamond": {
345
- "accuracy": 0.6098484848484849,
346
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T13-56-36-05-00_gpqa-diamond_eg4gFaMRENjnnYvQNtSB59.eval"
347
- },
348
- "gsm8k": {
349
- "accuracy": 0.9620924943138741,
350
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T14-23-25-05-00_gsm8k_nHB8Z4uZAwRAZFYpKmTptA.eval"
351
- },
352
- "mmmu_open": {
353
- "accuracy": 0.41509433962264153,
354
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T11-24-21-05-00_mmmu-open_SSjv3Dq9gZkEEUnvJUd5xf.eval"
355
- },
356
- "arc_easy": {
357
- "accuracy": 0.9915824915824916,
358
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-06-24-05-00_arc-easy_oBReQZQM5SAwMMD2jFshPb.eval"
359
- },
360
- "arc_challenge": {
361
- "accuracy": 0.9692832764505119,
362
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-12-11-05-00_arc-challenge_X8i6caCzkcQo5AT5zXkXso.eval"
363
- },
364
- "mmlu": {
365
- "accuracy": 0.8665432274604757,
366
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T15-16-51-05-00_mmlu_NFDs2kxmh3kQEbpbd8sz3w.eval"
367
- },
368
- "math": {
369
- "accuracy": 0.7942,
370
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T12-29-54-05-00_math_NvNQU58M8r3fpiwPGnvq8h.eval"
371
- },
372
- "ifeval": {
373
- "final_acc": 0.8958114469607309,
374
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T11-28-44-05-00_ifeval_fmWxch4ZjbmYCST6yUZsdV.eval"
375
- },
376
- "humaneval": {
377
- "mean": 0.9451219512195121,
378
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T11-26-12-05-00_humaneval_kUASiaNd9uZfWvCwYHhdF5.eval"
379
- },
380
- "winogrande": {
381
- "accuracy": 0.9021310181531176,
382
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T22-09-41-05-00_winogrande_mSWGAKg75E5RP79KWizvb9.eval"
383
- },
384
- "drop": {
385
- "mean": 0.8977608809648663,
386
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.eval"
387
- },
388
- "gaia": {
389
- "accuracy": 0.3381818181818182,
390
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-12T23-57-37-05-00_claude-3-5-sonnet_gaia_merged.eval"
391
- },
392
- "gdm_intercode_ctf": {
393
- "accuracy": 0.8556962025316455,
394
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-11T02-47-45-05-00_claude-3-5-sonnet_gdm-intercode-ctf_merged.eval"
395
- },
396
- "gdm_in_house_ctf": {
397
- "accuracy": 0.6153846153846154,
398
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-11T07-41-14+00-00_claude-3-5-sonnet_gdm-in-house-ctf.eval"
399
- },
400
- "agentharm": {
401
- "avg_score": 0.14767992424242424,
402
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-15T08-05-14-08-00_agentharm_VJGhWKLrVLdQczBZVgCXHc.eval"
403
- },
404
- "agentharm_benign": {
405
- "avg_score": 0.800704570051161,
406
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-21T15-09-48-08-00_agentharm-benign_A3uBBWNvv88P5BsgqwFCfg.eval"
407
- },
408
- "swe_bench": {
409
- "mean": 0.0672,
410
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/claude-3-5-sonnet-20241022/index.html?log_file=logs/logs/2025-01-16T18-56-55+00-00_anthropic-claude-3-5-sonnet.eval"
411
- }
412
- }
413
- },
414
- "gemini-1.5-flash": {
415
- "config": {
416
- "model_name": "Gemini-1.5-Flash",
417
- "model_sha": "https://deepmind.google/technologies/gemini/flash",
418
- "model_dtype": "torch.float16"
419
- },
420
- "results": {
421
- "gpqa_diamond": {
422
- "accuracy": 0.40404040404040403,
423
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-31-26-04-00_gpqa-diamond_7aNe9wQiQKpNN96mfaWBPg.eval"
424
- },
425
- "arc_challenge": {
426
- "accuracy": 0.9308873720136519,
427
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-34-51-04-00_arc-challenge_FbGgLswBZbRE4EhWiMyRt6.eval"
428
- },
429
- "math": {
430
- "accuracy": 0.452,
431
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-15-49-04-00_math_YsWdRzpqMq2dqQ9SPKfack.eval"
432
- },
433
- "mmmu_open": {
434
- "accuracy": 0.16981132075471697,
435
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2025-01-20T23-13-27-05-00_mmmu-open_GWi6XNYUSLq99BdabtScGm.eval"
436
- },
437
- "drop": {
438
- "mean": 0.751044572627163,
439
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-36-25-04-00_drop_6TzJGqqEkpFUCxGD4QejV6.eval"
440
- },
441
- "mmlu_pro": {
442
- "accuracy": 0.5993184840425532,
443
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T22-43-30-04-00_mmlu-pro_Dc2uu3EV7MJtjg6gg5Y9qH.eval"
444
- },
445
- "ifeval": {
446
- "final_acc": 0.7681296737102001,
447
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-15-06-04-00_ifeval_nYs9KujQMQjcpbpbLtVx8G.eval"
448
- },
449
- "hellaswag": {
450
- "accuracy": 0.8557060346544513,
451
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-35-34-04-00_hellaswag_2SAz3cvMpDxFaApdHDR3s4.eval"
452
- },
453
- "winogrande": {
454
- "accuracy": 0.7884767166535123,
455
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-31T00-59-07-04-00_winogrande_Ci55vHvbGGW38zVpMCwtWa.eval"
456
- },
457
- "humaneval": {
458
- "mean": 0.7439024390243902,
459
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T21-14-41-04-00_humaneval_Z9aXdUERuwYxoTheZ5GANC.eval"
460
- },
461
- "arc_easy": {
462
- "accuracy": 0.984006734006734,
463
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T17-29-56-04-00_arc-easy_XcEzqqPqJsRV29NqYDfnNo.eval"
464
- },
465
- "gsm8k": {
466
- "accuracy": 0.8582259287338894,
467
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-30T19-32-39-04-00_gsm8k_nLSssETKDDWNktAFWnVwfv.eval"
468
- },
469
- "mmlu": {
470
- "accuracy": 0.7714713003845606,
471
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_oGb9mspeGbYS2gfbkknskN.eval"
472
- },
473
- "mmmu_multiple_choice": {
474
- "accuracy": 0.5702479338842975,
475
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-flash/index.html?log_file=logs/logs/2025-01-20T23-10-01-05-00_mmmu-multiple-choice_c5rLkrXkV83udX6DVJui5F.eval"
476
- },
477
- "gaia": {
478
- "accuracy": null,
479
- "log_url": null
480
- },
481
- "gdm_intercode_ctf": {
482
- "accuracy": null,
483
- "log_url": null
484
- },
485
- "gdm_in_house_ctf": {
486
- "accuracy": null,
487
- "log_url": null
488
- },
489
- "agentharm": {
490
- "avg_score": null,
491
- "log_url": null
492
- },
493
- "agentharm_benign": {
494
- "avg_score": null,
495
- "log_url": null
496
- },
497
- "swe_bench": {
498
- "mean": null,
499
- "log_url": null
500
- }
501
- }
502
- },
503
- "gemini-1.5-pro": {
504
- "config": {
505
- "model_name": "Gemini-1.5-Pro",
506
- "model_sha": "https://deepmind.google/technologies/gemini/pro",
507
- "model_dtype": "torch.float16"
508
- },
509
- "results": {
510
- "mmlu": {
511
- "accuracy": 0.8467454778521578,
512
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T16-56-26-05-00_mmlu_Z9KrcK7x4ZLAR5nJ9JaVUe.eval"
513
- },
514
- "humaneval": {
515
- "mean": 0.8719512195121951,
516
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-43-07-05-00_humaneval_5JBjtymGtK23qwVKxqidhV.eval"
517
- },
518
- "mmmu_multiple_choice": {
519
- "accuracy": 0.6304604486422668,
520
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2025-01-20T23-16-04-05-00_mmmu-multiple-choice_NLmxmHYt6CJymRVVa5UsbD.eval"
521
- },
522
- "mmlu_pro": {
523
- "accuracy": 0.7563996010638298,
524
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T20-13-09-05-00_mmlu-pro_Hv2ujvKLV6H7ZwQu2q8LNw.eval"
525
- },
526
- "math": {
527
- "accuracy": 0.852,
528
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T15-48-46-05-00_math_9DAZmGEfhpa3nUcmMAwqZe.eval"
529
- },
530
- "arc_easy": {
531
- "accuracy": 0.9877946127946128,
532
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-31-43-05-00_arc-easy_eGxYWywpLuREcaCKvHa8Uk.eval"
533
- },
534
- "mmmu_open": {
535
- "accuracy": 0.3584905660377358,
536
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2025-01-20T23-19-25-05-00_mmmu-open_CDbtEQ7tjs5zkj4ScBbzod.eval"
537
- },
538
- "gsm8k": {
539
- "accuracy": 0.9613343442001516,
540
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T15-15-26-05-00_gsm8k_cTebw3ugfrVz3dyPwxtdUZ.eval"
541
- },
542
- "gpqa_diamond": {
543
- "accuracy": 0.5782828282828283,
544
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-05T09-56-31-05-00_gpqa-diamond_FBq2bnoyGYQ3NF96xQw8iy.eval"
545
- },
546
- "ifeval": {
547
- "final_acc": 0.8982344623377084,
548
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-43-32-05-00_ifeval_mSwZ7AwA7akj5PjZbQMjgC.eval"
549
- },
550
- "winogrande": {
551
- "accuracy": 0.8768745067087609,
552
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-40-46-05-00_winogrande_5SmD6rx47zmZvHHkQSSfHK.eval"
553
- },
554
- "arc_challenge": {
555
- "accuracy": 0.9633105802047781,
556
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-37-36-05-00_arc-challenge_5VVApyQD22QpJoMm53EMdU.eval"
557
- },
558
- "drop": {
559
- "mean": 0.8800912427897221,
560
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-04T12-44-32-05-00_drop_9dzPKVJojSVsxmiBFnej2m.eval"
561
- },
562
- "hellaswag": {
563
- "accuracy": 0.9123680541724756,
564
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gemini-1.5-pro/index.html?log_file=logs/logs/2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.eval"
565
- },
566
- "gaia": {
567
- "accuracy": 0.13818181818181818,
568
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T15-33-29-05-00_gemini-1.5-pro_gaia_merged.eval"
569
- },
570
- "gdm_intercode_ctf": {
571
- "accuracy": 0.5291139240506328,
572
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T23-59-58+00-00_gemini-1.5-pro_gdm-intercode-ctf_merged.eval"
573
- },
574
- "gdm_in_house_ctf": {
575
- "accuracy": 0.23076923076923078,
576
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-22T03-42-16+00-00_gemini-1.5-pro_gdm-in-house-ctf.eval"
577
- },
578
- "agentharm": {
579
- "avg_score": 0.2898649645808737,
580
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T12-45-43-08-00_agentharm_VmD26soLwmRgWPo3hpRHBr.eval"
581
- },
582
- "agentharm_benign": {
583
- "avg_score": 0.5961489079102715,
584
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-21T13-18-51-08-00_agentharm-benign_gP3pQPxAuCtFLiHzt2Egt7.eval"
585
- },
586
- "swe_bench": {
587
- "mean": 0.004,
588
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gemini-1.5-pro-002/index.html?log_file=logs/logs/2025-01-22T03-00-08+00-00_google-gemini-1.5-pro_swe.eval"
589
- }
590
- }
591
- },
592
- "gpt-4o": {
593
- "config": {
594
- "model_name": "GPT-4o",
595
- "model_sha": "https://openai.com/index/hello-gpt-4o",
596
- "model_dtype": "torch.float16"
597
- },
598
- "results": {
599
- "gpqa_diamond": {
600
- "accuracy": 0.51010101010101,
601
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-29-33-04-00_gpqa-diamond_nFmRv5MJiYjHjezmq4V6Va.eval"
602
- },
603
- "arc_challenge": {
604
- "accuracy": 0.9633105802047781,
605
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-45-55-04-00_arc-challenge_nrsPPxh4DpzgLPQDFdcfVp.eval"
606
- },
607
- "gsm8k": {
608
- "accuracy": 0.9446550416982562,
609
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-31-16-04-00_gsm8k_jVXeSvHowbietZCFsFYCwB.eval"
610
- },
611
- "mmlu": {
612
- "accuracy": 0.8435408061529697,
613
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_GarLpfQFSpM3C22nbbGp54.eval"
614
- },
615
- "ifeval": {
616
- "final_acc": 0.8780386042367585,
617
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T05-00-11-04-00_ifeval_jxreUu8JqRdkrcHP4E3hLR.eval"
618
- },
619
- "mmlu_pro": {
620
- "accuracy": 0.7450964095744681,
621
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T06-59-42-04-00_mmlu-pro_EuAKDwAWSfNVpqyyqrf2Ba.eval"
622
- },
623
- "mmmu_open": {
624
- "accuracy": 0.3584905660377358,
625
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2025-01-20T23-07-46-05-00_mmmu-open_d3Q2HvuPZzEX6FAM4NBhnp.eval"
626
- },
627
- "winogrande": {
628
- "accuracy": 0.9013417521704814,
629
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T09-02-03-04-00_winogrande_44kKF7M9mKoqVC7ixZVXuq.eval"
630
- },
631
- "drop": {
632
- "mean": 0.7511693759832198,
633
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-47-20-04-00_drop_3gxDcn6vUoR3nvHX9BcSq4.eval"
634
- },
635
- "arc_easy": {
636
- "accuracy": 0.9915824915824916,
637
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T01-41-34-04-00_arc-easy_nUavRHdiRVfrxo6dmCPadh.eval"
638
- },
639
- "mmmu_multiple_choice": {
640
- "accuracy": 0.5903187721369539,
641
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2025-01-20T23-03-21-05-00_mmmu-multiple-choice_eoycAFLMirSqiURdXmBP2e.eval"
642
- },
643
- "humaneval": {
644
- "mean": 0.9085365853658537,
645
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T04-59-42-04-00_humaneval_nmJcd84CcNKjWS8fBfMbZM.eval"
646
- },
647
- "math": {
648
- "accuracy": 0.7054,
649
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T05-01-22-04-00_math_cDSpKPp3nLrFy8uYfYKEbM.eval"
650
- },
651
- "hellaswag": {
652
- "accuracy": 0.924317864967138,
653
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o/index.html?log_file=logs/logs/2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.eval"
654
- },
655
- "gaia": {
656
- "accuracy": 0.16606060606060608,
657
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-13T15-53-22+00-00_gpt-4o_gaia_merged.eval"
658
- },
659
- "gdm_intercode_ctf": {
660
- "accuracy": 0.6379746835443038,
661
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-08T10-06-29-05-00_gpt-4o_gdm-intercode-ctf_merged.eval"
662
- },
663
- "gdm_in_house_ctf": {
664
- "accuracy": 0.23076923076923078,
665
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-11T07-02-14+00-00_gpt-4o_gdm-in-house-ctf.eval"
666
- },
667
- "agentharm": {
668
- "avg_score": 0.49953844451003543,
669
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-07T16-34-15-08-00_agentharm_UfSoyHEAH2E5RVdrPVUemy.eval"
670
- },
671
- "agentharm_benign": {
672
- "avg_score": 0.8249433048012594,
673
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-21T13-45-18-08-00_agentharm-benign_8DhGJqEAvw6o8uCv4a4dVz.eval"
674
- },
675
- "swe_bench": {
676
- "mean": 0.012,
677
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/gpt-4o-2024-08-06/index.html?log_file=logs/logs/2025-01-14T23-09-10+00-00_openai-gpt-4o_swe.eval"
678
- }
679
- }
680
- },
681
- "gpt-4o-mini": {
682
- "config": {
683
- "model_name": "GPT-4o-mini",
684
- "model_sha": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
685
- "model_dtype": "torch.float16"
686
- },
687
- "results": {
688
- "drop": {
689
- "mean": 0.8065915049816466,
690
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-36-25-04-00_drop_6TzJGqqEkpFUCxGD4QejV6.eval"
691
- },
692
- "humaneval": {
693
- "mean": 0.8597560975609756,
694
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-14-41-04-00_humaneval_Z9aXdUERuwYxoTheZ5GANC.eval"
695
- },
696
- "gpqa_diamond": {
697
- "accuracy": 0.3838383838383838,
698
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-31-26-04-00_gpqa-diamond_7aNe9wQiQKpNN96mfaWBPg.eval"
699
- },
700
- "mmmu_open": {
701
- "accuracy": 0.18867924528301888,
702
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2025-01-20T23-13-27-05-00_mmmu-open_GWi6XNYUSLq99BdabtScGm.eval"
703
- },
704
- "arc_challenge": {
705
- "accuracy": 0.9249146757679181,
706
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-34-51-04-00_arc-challenge_FbGgLswBZbRE4EhWiMyRt6.eval"
707
- },
708
- "mmlu": {
709
- "accuracy": 0.7698333570716422,
710
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-31T10-49-43-04-00_mmlu_oGb9mspeGbYS2gfbkknskN.eval"
711
- },
712
- "hellaswag": {
713
- "accuracy": 0.8750248954391555,
714
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-35-34-04-00_hellaswag_2SAz3cvMpDxFaApdHDR3s4.eval"
715
- },
716
- "ifeval": {
717
- "final_acc": 0.8419061423689144,
718
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-15-06-04-00_ifeval_nYs9KujQMQjcpbpbLtVx8G.eval"
719
- },
720
- "mmmu_multiple_choice": {
721
- "accuracy": 0.5395513577331759,
722
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2025-01-20T23-10-01-05-00_mmmu-multiple-choice_c5rLkrXkV83udX6DVJui5F.eval"
723
- },
724
- "arc_easy": {
725
- "accuracy": 0.9793771043771043,
726
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T17-29-56-04-00_arc-easy_XcEzqqPqJsRV29NqYDfnNo.eval"
727
- },
728
- "winogrande": {
729
- "accuracy": 0.7529597474348856,
730
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-31T00-59-07-04-00_winogrande_Ci55vHvbGGW38zVpMCwtWa.eval"
731
- },
732
- "mmlu_pro": {
733
- "accuracy": 0.6396276595744681,
734
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T22-43-30-04-00_mmlu-pro_Dc2uu3EV7MJtjg6gg5Y9qH.eval"
735
- },
736
- "math": {
737
- "accuracy": 0.633,
738
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T21-15-49-04-00_math_YsWdRzpqMq2dqQ9SPKfack.eval"
739
- },
740
- "gsm8k": {
741
- "accuracy": 0.9181197877179682,
742
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/gpt-4o-mini/index.html?log_file=logs/logs/2024-10-30T19-32-39-04-00_gsm8k_nLSssETKDDWNktAFWnVwfv.eval"
743
- },
744
- "gaia": {
745
- "accuracy": null,
746
- "log_url": null
747
- },
748
- "gdm_intercode_ctf": {
749
- "accuracy": null,
750
- "log_url": null
751
- },
752
- "gdm_in_house_ctf": {
753
- "accuracy": null,
754
- "log_url": null
755
- },
756
- "agentharm": {
757
- "avg_score": null,
758
- "log_url": null
759
- },
760
- "agentharm_benign": {
761
- "avg_score": null,
762
- "log_url": null
763
- },
764
- "swe_bench": {
765
- "mean": null,
766
- "log_url": null
767
- }
768
- }
769
- },
770
- "o1": {
771
- "config": {
772
- "model_name": "o1",
773
- "model_sha": "https://openai.com/o1",
774
- "model_dtype": "torch.float16"
775
- },
776
- "results": {
777
- "winogrande": {
778
- "accuracy": 0.9392265193370166,
779
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.eval"
780
- },
781
- "humaneval": {
782
- "mean": 0.9695121951219512,
783
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.eval"
784
- },
785
- "mmmu_open": {
786
- "accuracy": 0.6981132075471698,
787
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T22-48-09-05-00_mmmu-open_oBzxJBYbvnktbbAwhoCrYK.eval"
788
- },
789
- "math": {
790
- "accuracy": 0.959,
791
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T15-03-22-05-00_math_6BbvHFF8hLMsVYozyNLbyQ.eval"
792
- },
793
- "arc_easy": {
794
- "accuracy": 0.9911616161616161,
795
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-29-26-05-00_arc-easy_DFbir4BdgQDbKd52r7tRKR.eval"
796
- },
797
- "arc_challenge": {
798
- "accuracy": 0.9786689419795221,
799
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-44-42-05-00_arc-challenge_PsWXaBqrgv3EcTZC55gRzJ.eval"
800
- },
801
- "gsm8k": {
802
- "accuracy": 0.9416224412433661,
803
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T12-56-38-05-00_gsm8k_iD8275qeyNTgX523pn45bF.eval"
804
- },
805
- "gpqa_diamond": {
806
- "accuracy": 0.7550505050505051,
807
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-17T11-53-53-05-00_gpqa-diamond_EJV7ULFSQLRoFTEqsv3t6q.eval"
808
- },
809
- "mmlu_pro": {
810
- "accuracy": 0.8447473404255319,
811
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.eval"
812
- },
813
- "mmmu_multiple_choice": {
814
- "accuracy": 0.8063754427390791,
815
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o1/index.html?log_file=logs/logs/2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.eval"
816
- },
817
- "drop": {
818
- "mean": null,
819
- "log_url": null
820
- },
821
- "hellaswag": {
822
- "accuracy": null,
823
- "log_url": null
824
- },
825
- "ifeval": {
826
- "final_acc": null,
827
- "log_url": null
828
- },
829
- "mmlu": {
830
- "accuracy": null,
831
- "log_url": null
832
- },
833
- "gaia": {
834
- "accuracy": 0.41090909090909084,
835
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T13-42-00-05-00_o1_gaia_merged.eval"
836
- },
837
- "gdm_intercode_ctf": {
838
- "accuracy": 0.8481012658227849,
839
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T20-46-35+00-00_o1_gdm-intercode-ctf_merged.eval"
840
- },
841
- "gdm_in_house_ctf": {
842
- "accuracy": 0.46153846153846156,
843
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-22T05-52-25+00-00_o1_gdm-in-house-ctf.eval"
844
- },
845
- "agentharm": {
846
- "avg_score": 0.08782061688311688,
847
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T09-05-42-08-00_agentharm_UGDq2yJeLAnPH6p7FgDgD8.eval"
848
- },
849
- "agentharm_benign": {
850
- "avg_score": 0.7235176849665487,
851
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T18-20-15-08-00_agentharm-benign_bkW2Bf5xLyDQdNtfLdjCpJ.eval"
852
- },
853
- "swe_bench": {
854
- "mean": 0.0036,
855
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-01-21T17-42-11+00-00_openai-o1_swe.eval "
856
- }
857
- }
858
- },
859
- "o3-mini": {
860
- "config": {
861
- "model_name": "o3-mini",
862
- "model_sha": "https://openai.com/index/openai-o3-mini",
863
- "model_dtype": "torch.float16"
864
- },
865
- "results": {
866
- "math": {
867
- "accuracy": 0.9691320905993185,
868
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T18-33-30-05-00_math_86Gx8n4BxhpyfaSHmRcCUm.eval"
869
- },
870
- "humaneval": {
871
- "mean": 0.9817073170731707,
872
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T20-58-48-05-00_humaneval_Dkod7CS9RmbbogYx9aEXtx.eval"
873
- },
874
- "mmlu_pro": {
875
- "accuracy": 0.7924606807023383,
876
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T19-49-27-05-00_mmlu-pro_jz9woKfdKt8VMzqNFsy7kY.eval"
877
- },
878
- "gpqa_diamond": {
879
- "accuracy": 0.7365319865319865,
880
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-57-54-05-00_gpqa-diamond_2znyMtdc7X4LJufxXeXA8Z.eval"
881
- },
882
- "winogrande": {
883
- "accuracy": 0.8492501973164956,
884
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T22-50-40-05-00_winogrande_VsTW2uU2Kj66YoNoFfRfUj.eval"
885
- },
886
- "gsm8k": {
887
- "accuracy": 0.9454131918119788,
888
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T18-23-05-05-00_gsm8k_d523pJzkcvobxamhhobCRb.eval"
889
- },
890
- "arc_challenge": {
891
- "accuracy": 0.9641638225255973,
892
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-53-30-05-00_arc-challenge_AYFHec7wmd4jELF2Rgzfya.eval"
893
- },
894
- "arc_easy": {
895
- "accuracy": 0.9755892255892256,
896
- "log_url": "https://storage.googleapis.com/inspect-evals/base/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-06T17-45-57-05-00_arc-easy_Nd8NP3K48tvwLVZb8kXDwg.eval"
897
- },
898
- "drop": {
899
- "mean": null,
900
- "log_url": null
901
- },
902
- "hellaswag": {
903
- "accuracy": null,
904
- "log_url": null
905
- },
906
- "ifeval": {
907
- "final_acc": null,
908
- "log_url": null
909
- },
910
- "mmlu": {
911
- "accuracy": null,
912
- "log_url": null
913
- },
914
- "mmmu_multiple_choice": {
915
- "accuracy": null,
916
- "log_url": null
917
- },
918
- "mmmu_open": {
919
- "accuracy": null,
920
- "log_url": null
921
- },
922
- "gaia": {
923
- "accuracy": 0.27030303030303043,
924
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o1-2024-12-17/index.html?log_file=logs/logs/2025-02-05T23-21-20+00-00_gaia_hyMq8MzMm6NgAeq3dNqZSU.eval"
925
- },
926
- "gdm_intercode_ctf": {
927
- "accuracy": 0.8278481012658225,
928
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-05T21-43-18+00-00_gdm-intercode-ctf_gdm29C6DuTEsX9qm9ymmrC.eval"
929
- },
930
- "gdm_in_house_ctf": {
931
- "accuracy": 0.38461538461538464,
932
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-05T23-59-08+00-00_gdm-in-house-ctf_2zkAX5nkJoxDnVKpJL9VgW.eval"
933
- },
934
- "agentharm": {
935
- "avg_score": 0.1241931080283353,
936
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T18-17-03-08-00_agentharm_DmN6i5HrgXHNARjsuSewjg.eval"
937
- },
938
- "agentharm_benign": {
939
- "avg_score": 0.5429306867375049,
940
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T18-49-08-08-00_agentharm-benign_Gv94YFpAXaaCJqe3Fc6yr3.eval"
941
- },
942
- "swe_bench": {
943
- "mean": 0.0024,
944
- "log_url": "https://storage.googleapis.com/inspect-evals/agentic/eval/o3-mini-2025-01-31/index.html?log_file=logs/logs/2025-02-03T06-49-09+00-00_openai-o3-mini_swe.eval"
945
- }
946
- }
947
- }
948
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/tasks.json DELETED
@@ -1,142 +0,0 @@
1
- {
2
- "arc_easy": {
3
- "benchmark": "arc_easy",
4
- "metric": "accuracy",
5
- "display_name": "ARC-E",
6
- "type": "base",
7
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
8
- },
9
- "arc_challenge": {
10
- "benchmark": "arc_challenge",
11
- "metric": "accuracy",
12
- "display_name": "ARC-C",
13
- "type": "base",
14
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
15
- },
16
- "drop": {
17
- "benchmark": "drop",
18
- "metric": "mean",
19
- "display_name": "DROP",
20
- "type": "base",
21
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop"
22
- },
23
- "winogrande": {
24
- "benchmark": "winogrande",
25
- "metric": "accuracy",
26
- "display_name": "WinoGrande",
27
- "type": "base",
28
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande"
29
- },
30
- "gsm8k": {
31
- "benchmark": "gsm8k",
32
- "metric": "accuracy",
33
- "display_name": "GSM8K",
34
- "type": "base",
35
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k"
36
- },
37
- "hellaswag": {
38
- "benchmark": "hellaswag",
39
- "metric": "accuracy",
40
- "display_name": "HellaSwag",
41
- "type": "base",
42
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag"
43
- },
44
- "humaneval": {
45
- "benchmark": "humaneval",
46
- "metric": "mean",
47
- "display_name": "HumanEval",
48
- "type": "base",
49
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval"
50
- },
51
- "ifeval": {
52
- "benchmark": "ifeval",
53
- "metric": "final_acc",
54
- "display_name": "IFEval",
55
- "type": "base",
56
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval"
57
- },
58
- "math": {
59
- "benchmark": "math",
60
- "metric": "accuracy",
61
- "display_name": "MATH",
62
- "type": "base",
63
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics"
64
- },
65
- "mmlu": {
66
- "benchmark": "mmlu",
67
- "metric": "accuracy",
68
- "display_name": "MMLU",
69
- "type": "base",
70
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu"
71
- },
72
- "mmlu_pro": {
73
- "benchmark": "mmlu_pro",
74
- "metric": "accuracy",
75
- "display_name": "MMLU-Pro",
76
- "type": "base",
77
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro"
78
- },
79
- "gpqa_diamond": {
80
- "benchmark": "gpqa_diamond",
81
- "metric": "accuracy",
82
- "display_name": "GPQA-D",
83
- "type": "base",
84
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
85
- },
86
- "mmmu_multiple_choice": {
87
- "benchmark": "mmmu_multiple_choice",
88
- "metric": "accuracy",
89
- "display_name": "MMMU-MC",
90
- "type": "base",
91
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
92
- },
93
- "mmmu_open": {
94
- "benchmark": "mmmu_open",
95
- "metric": "accuracy",
96
- "display_name": "MMMU-OE",
97
- "type": "base",
98
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
99
- },
100
- "gaia": {
101
- "benchmark": "gaia",
102
- "metric": "accuracy",
103
- "display_name": "GAIA",
104
- "type": "agentic",
105
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
106
- },
107
- "gdm_intercode_ctf": {
108
- "benchmark": "gdm_intercode_ctf",
109
- "metric": "accuracy",
110
- "display_name": "InterCode-CTF",
111
- "type": "agentic",
112
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf"
113
- },
114
- "gdm_in_house_ctf": {
115
- "benchmark": "gdm_in_house_ctf",
116
- "metric": "accuracy",
117
- "display_name": "In-House-CTF",
118
- "type": "agentic",
119
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf"
120
- },
121
- "agentharm": {
122
- "benchmark": "agentharm",
123
- "metric": "avg_score",
124
- "display_name": "AgentHarm",
125
- "type": "agentic",
126
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
127
- },
128
- "agentharm_benign": {
129
- "benchmark": "agentharm_benign",
130
- "metric": "avg_score",
131
- "display_name": "AgentHarm-Benign",
132
- "type": "agentic",
133
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
134
- },
135
- "swe_bench": {
136
- "benchmark": "swe_bench",
137
- "metric": "mean",
138
- "display_name": "SWE-Bench",
139
- "type": "agentic",
140
- "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench"
141
- }
142
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
inspect_log_file_names.json ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "gemini-1.5-pro": {
3
+ "mmlu": "2024-11-04T16-56-26-05-00_mmlu_Z9KrcK7x4ZLAR5nJ9JaVUe.json",
4
+ "humaneval": "2024-11-04T12-43-07-05-00_humaneval_5JBjtymGtK23qwVKxqidhV.json",
5
+ "mmlu_pro": "2024-11-04T20-13-09-05-00_mmlu-pro_Hv2ujvKLV6H7ZwQu2q8LNw.json",
6
+ "math": "2024-11-04T15-48-46-05-00_math_9DAZmGEfhpa3nUcmMAwqZe.json",
7
+ "arc_easy": "2024-11-04T12-31-43-05-00_arc-easy_eGxYWywpLuREcaCKvHa8Uk.json",
8
+ "gsm8k": "2024-11-04T15-15-26-05-00_gsm8k_cTebw3ugfrVz3dyPwxtdUZ.json",
9
+ "gpqa_diamond": "2024-11-05T09-56-31-05-00_gpqa-diamond_FBq2bnoyGYQ3NF96xQw8iy.json",
10
+ "ifeval": "2024-11-04T12-43-32-05-00_ifeval_mSwZ7AwA7akj5PjZbQMjgC.json",
11
+ "winogrande": "2024-11-04T12-40-46-05-00_winogrande_5SmD6rx47zmZvHHkQSSfHK.json",
12
+ "arc_challenge": "2024-11-04T12-37-36-05-00_arc-challenge_5VVApyQD22QpJoMm53EMdU.json",
13
+ "drop": "2024-11-04T12-44-32-05-00_drop_9dzPKVJojSVsxmiBFnej2m.json",
14
+ "hellaswag": "2024-11-05T13-14-31-05-00_hellaswag_N98eeftuY2pucRtgpUYk5m.json",
15
+ "gaia": "2024-11-15T12-53-32-05-00_gaia_NvyGRTXFrFskJfUvuLwvVr.json",
16
+ "gdm_intercode_ctf": "2024-11-15T16-23-23-05-00_gdm-intercode-ctf_3JrgtTMcijTUxHVaagPRYh.json"
17
+ },
18
+ "gemini-1.5-flash": {
19
+ "gpqa_diamond": "2024-11-04T12-47-34-05-00_gpqa-diamond_cL5kQj8DWbRfxz79piTSdy.json",
20
+ "arc_challenge": "2024-11-04T12-45-59-05-00_arc-challenge_YQLMHfEXqeYgGJY86EB9bp.json",
21
+ "math": "2024-11-04T15-25-38-05-00_math_eaYBRMFgo8p6VUUCYxnCWj.json",
22
+ "drop": "2024-11-04T12-52-08-05-00_drop_5i253AQzbENgHTYN4ATemV.json",
23
+ "mmlu_pro": "2024-11-04T19-44-13-05-00_mmlu-pro_8GrR6wUsYNkthiZNMmLa8y.json",
24
+ "ifeval": "2024-11-04T12-51-30-05-00_ifeval_ZATErMbLHoyxh4kDaSqy8j.json",
25
+ "hellaswag": "2024-11-05T23-19-25-05-00_hellaswag_MRffohuzgVjighGb8FoqSJ.json",
26
+ "winogrande": "2024-11-04T12-48-29-05-00_winogrande_Hmqo6Ydz3nfCnQAdUwgrbD.json",
27
+ "humaneval": "2024-11-04T12-50-47-05-00_humaneval_9j4rYguKeKmxEoD9VuddwX.json",
28
+ "arc_easy": "2024-11-04T12-39-50-05-00_arc-easy_NwmTEw6C8VSCXzzwZCFy48.json",
29
+ "gsm8k": "2024-11-04T15-22-21-05-00_gsm8k_hdJs3Z6XzpR5netTcWLXJT.json",
30
+ "mmlu": "2024-11-04T16-26-13-05-00_mmlu_QvfQ46qJen2bvxiktHu86H.json",
31
+ "gdm_intercode_ctf": "2024-11-15T20-52-53-05-00_gdm-intercode-ctf_oLYr3H6bFtrcmgM6EABmNt.json"
32
+ },
33
+ "o1": {
34
+ "winogrande": "2025-01-20T16-46-06-05-00_winogrande_YUtAdEsForRffqe4Sm3wtR.json",
35
+ "humaneval": "2025-01-17T14-59-12-05-00_humaneval_RRL8GMy9NakTxUHsDVWNng.json",
36
+ "mmmu_open": "2025-01-20T22-48-09-05-00_mmmu-open_oBzxJBYbvnktbbAwhoCrYK.json",
37
+ "mmlu_pro": "2025-01-20T14-02-37-05-00_mmlu-pro_EvDzvqaahQwhv6fJovN4BT.json",
38
+ "math": "2025-01-17T15-03-22-05-00_math_6BbvHFF8hLMsVYozyNLbyQ.json",
39
+ "arc_easy": "2025-01-17T11-29-26-05-00_arc-easy_DFbir4BdgQDbKd52r7tRKR.json",
40
+ "arc_challenge": "2025-01-17T11-44-42-05-00_arc-challenge_PsWXaBqrgv3EcTZC55gRzJ.json",
41
+ "gsm8k": "2025-01-17T12-56-38-05-00_gsm8k_iD8275qeyNTgX523pn45bF.json",
42
+ "gpqa_diamond": "2025-01-17T11-53-53-05-00_gpqa-diamond_EJV7ULFSQLRoFTEqsv3t6q.json",
43
+ "hellaswag": "2025-01-17T13-14-39-05-00_hellaswag_73sQJFnwpzWjTvEqKjUk4M.json",
44
+ "mmmu_multiple_choice": "2025-01-20T21-04-57-05-00_mmmu-multiple-choice_MctxjookaeTLCL8KpUeazT.json"
45
+ },
46
+ "claude-3-5-sonnet-20241022": {
47
+ "mmmu_multiple_choice": "2025-01-21T11-20-03-05-00_mmmu-multiple-choice_CWhKvGdoFo6pdHhDyi9GNm.json",
48
+ "mmlu_pro": "2025-01-16T19-01-05-05-00_mmlu-pro_3vi84or97gQupuj5sT6vgZ.json",
49
+ "hellaswag": "2025-01-15T15-09-33-05-00_hellaswag_QXqFxojvSToMu8ckHEMLkB.json",
50
+ "gpqa_diamond": "2025-01-15T13-56-36-05-00_gpqa-diamond_eg4gFaMRENjnnYvQNtSB59.json",
51
+ "gsm8k": "2025-01-15T14-23-25-05-00_gsm8k_nHB8Z4uZAwRAZFYpKmTptA.json",
52
+ "mmmu_open": "2025-01-21T11-24-21-05-00_mmmu-open_SSjv3Dq9gZkEEUnvJUd5xf.json",
53
+ "arc_easy": "2025-01-15T10-06-24-05-00_arc-easy_oBReQZQM5SAwMMD2jFshPb.json",
54
+ "arc_challenge": "2025-01-15T10-12-11-05-00_arc-challenge_X8i6caCzkcQo5AT5zXkXso.json",
55
+ "mmlu": "2025-01-16T15-16-51-05-00_mmlu_NFDs2kxmh3kQEbpbd8sz3w.json",
56
+ "math": "2025-01-16T12-29-54-05-00_math_NvNQU58M8r3fpiwPGnvq8h.json",
57
+ "ifeval": "2025-01-16T11-28-44-05-00_ifeval_fmWxch4ZjbmYCST6yUZsdV.json",
58
+ "humaneval": "2025-01-16T11-26-12-05-00_humaneval_kUASiaNd9uZfWvCwYHhdF5.json",
59
+ "winogrande": "2025-01-16T22-09-41-05-00_winogrande_mSWGAKg75E5RP79KWizvb9.json",
60
+ "drop": "2025-01-15T10-15-15-05-00_drop_Z9A2Y84HYponNxnzNT9TNq.json"
61
+ },
62
+ "c4ai-command-r-plus": {
63
+ "ifeval": "2024-10-30T17-23-04-04-00_ifeval_RGucUMwdGmUnRpqyMTZTzW.json",
64
+ "winogrande": "2024-10-30T14-42-18-04-00_winogrande_bY8yg7aRR5dCCK7NDCZEcc.json",
65
+ "arc_challenge": "2024-10-29T17-30-03-04-00_arc-challenge_XB7LURXEGaxskWuLtYwdnW.json",
66
+ "drop": "2024-10-30T12-06-30-04-00_drop_itY9cLiYAW2BF7NTeDceNd.json",
67
+ "math": "2024-10-30T17-26-34-04-00_math_kohBUMpMFuMsR4jz4vUNWM.json",
68
+ "gpqa_diamond": "2024-10-29T22-47-45-04-00_gpqa-diamond_JKpb6ya4pec9hh7uovPPCZ.json",
69
+ "mmlu_pro": "2024-10-31T01-11-38-04-00_mmlu-pro_gZVAuy3zMKR23BieM5PqAX.json",
70
+ "humaneval": "2024-10-30T17-22-23-04-00_humaneval_5ByPqUhoofSbKgvsUQNFCX.json",
71
+ "gsm8k": "2024-10-30T15-03-35-04-00_gsm8k_QxbfbriJsKGQAg96JyjkoT.json",
72
+ "hellaswag": "2024-10-30T15-18-17-04-00_hellaswag_UYyBTR6N8VJnKRmnbCrB8N.json",
73
+ "mmlu": "2024-10-30T21-55-26-04-00_mmlu_JUPPLTzfe3Kme6UuorPTqg.json",
74
+ "arc_easy": "2024-10-29T17-10-40-04-00_arc-easy_UvprihBMLXPF8JENVLRkdx.json"
75
+ },
76
+ "gpt-4o-mini": {
77
+ "drop": "2024-10-30T17-36-25-04-00_drop_6TzJGqqEkpFUCxGD4QejV6.json",
78
+ "humaneval": "2024-10-30T21-14-41-04-00_humaneval_Z9aXdUERuwYxoTheZ5GANC.json",
79
+ "gpqa_diamond": "2024-10-30T19-31-26-04-00_gpqa-diamond_7aNe9wQiQKpNN96mfaWBPg.json",
80
+ "mmmu_open": "2025-01-20T23-13-27-05-00_mmmu-open_GWi6XNYUSLq99BdabtScGm.json",
81
+ "arc_challenge": "2024-10-30T17-34-51-04-00_arc-challenge_FbGgLswBZbRE4EhWiMyRt6.json",
82
+ "mmlu": "2024-10-31T10-49-43-04-00_mmlu_oGb9mspeGbYS2gfbkknskN.json",
83
+ "hellaswag": "2024-10-30T19-35-34-04-00_hellaswag_2SAz3cvMpDxFaApdHDR3s4.json",
84
+ "ifeval": "2024-10-30T21-15-06-04-00_ifeval_nYs9KujQMQjcpbpbLtVx8G.json",
85
+ "mmmu_multiple_choice": "2025-01-20T23-10-01-05-00_mmmu-multiple-choice_c5rLkrXkV83udX6DVJui5F.json",
86
+ "arc_easy": "2024-10-30T17-29-56-04-00_arc-easy_XcEzqqPqJsRV29NqYDfnNo.json",
87
+ "winogrande": "2024-10-31T00-59-07-04-00_winogrande_Ci55vHvbGGW38zVpMCwtWa.json",
88
+ "mmlu_pro": "2024-10-30T22-43-30-04-00_mmlu-pro_Dc2uu3EV7MJtjg6gg5Y9qH.json",
89
+ "math": "2024-10-30T21-15-49-04-00_math_YsWdRzpqMq2dqQ9SPKfack.json",
90
+ "gsm8k": "2024-10-30T19-32-39-04-00_gsm8k_nLSssETKDDWNktAFWnVwfv.json"
91
+ },
92
+ "Meta-Llama-3.1-70B-Instruct": {
93
+ "hellaswag": "2024-10-30T00-45-54-04-00_hellaswag_BKfQG9yGAr383MGnooMLBH.json",
94
+ "drop": "2024-10-29T21-01-02-04-00_drop_LzAWvLWkNrNKu5qf56wXRo.json",
95
+ "gpqa_diamond": "2024-10-29T23-41-39-04-00_gpqa-diamond_TdLdYmVM6GCVMAECcXkuhj.json",
96
+ "winogrande": "2024-10-30T09-20-56-04-00_winogrande_WnUgkSRhSMvh3zUjnuJWQZ.json",
97
+ "gsm8k": "2024-10-30T00-03-31-04-00_gsm8k_bKsUfCAfcmBCeryboNaLoX.json",
98
+ "math": "2024-10-30T02-34-50-04-00_math_2xiNcrGih26uzJdG4q88bM.json",
99
+ "ifeval": "2024-10-30T02-29-32-04-00_ifeval_Dwh3CF2ZYFrvw7UcTwrsvK.json",
100
+ "arc_challenge": "2024-10-29T20-58-56-04-00_arc-challenge_oFL5wFjT7KwNFhMFfe72JN.json",
101
+ "arc_easy": "2024-10-29T20-53-12-04-00_arc-easy_UXzR7cDeNteP39NoXUYnhm.json",
102
+ "mmlu_pro": "2024-10-30T06-11-16-04-00_mmlu-pro_oQiEBJdeKtEEt4cm9KL7uy.json",
103
+ "humaneval": "2024-10-30T02-28-25-04-00_humaneval_KcJV2rHuHJ2JLxijihEkcW.json",
104
+ "mmlu": "2024-10-30T03-51-50-04-00_mmlu_6SNjs2QmPRvqGnvbnNtaqb.json"
105
+ },
106
+ "gpt-4o": {
107
+ "gpqa_diamond": "2024-10-31T03-29-33-04-00_gpqa-diamond_nFmRv5MJiYjHjezmq4V6Va.json",
108
+ "arc_challenge": "2024-10-31T01-45-55-04-00_arc-challenge_nrsPPxh4DpzgLPQDFdcfVp.json",
109
+ "gsm8k": "2024-10-31T03-31-16-04-00_gsm8k_jVXeSvHowbietZCFsFYCwB.json",
110
+ "mmlu": "2024-10-31T10-49-43-04-00_mmlu_GarLpfQFSpM3C22nbbGp54.json",
111
+ "ifeval": "2024-10-31T05-00-11-04-00_ifeval_jxreUu8JqRdkrcHP4E3hLR.json",
112
+ "mmlu_pro": "2024-10-31T06-59-42-04-00_mmlu-pro_EuAKDwAWSfNVpqyyqrf2Ba.json",
113
+ "mmmu_open": "2025-01-20T23-07-46-05-00_mmmu-open_d3Q2HvuPZzEX6FAM4NBhnp.json",
114
+ "winogrande": "2024-10-31T09-02-03-04-00_winogrande_44kKF7M9mKoqVC7ixZVXuq.json",
115
+ "drop": "2024-10-31T01-47-20-04-00_drop_3gxDcn6vUoR3nvHX9BcSq4.json",
116
+ "arc_easy": "2024-10-31T01-41-34-04-00_arc-easy_nUavRHdiRVfrxo6dmCPadh.json",
117
+ "mmmu_multiple_choice": "2025-01-20T23-03-21-05-00_mmmu-multiple-choice_eoycAFLMirSqiURdXmBP2e.json",
118
+ "humaneval": "2024-10-31T04-59-42-04-00_humaneval_nmJcd84CcNKjWS8fBfMbZM.json",
119
+ "math": "2024-10-31T05-01-22-04-00_math_cDSpKPp3nLrFy8uYfYKEbM.json",
120
+ "hellaswag": "2024-10-31T03-33-47-04-00_hellaswag_JNnnPuz3dhZRpyXzizMUBF.json"
121
+ },
122
+ "Mistral-Large-Instruct-2407": {
123
+ "drop": "2024-10-31T01-56-12-04-00_drop_NtvuCoU2LoMbH8DztcCTen.json",
124
+ "ifeval": "2024-10-31T06-30-16-04-00_ifeval_TLkvCSFEWo4PLv6hAha7YB.json",
125
+ "mmlu": "2024-10-31T07-21-48-04-00_mmlu_YnUhmHoStr3WuJdchWmNPt.json",
126
+ "gpqa_diamond": "2024-10-31T04-22-52-04-00_gpqa-diamond_SuZUZxGdqS2ZecbLRNkKd4.json",
127
+ "gsm8k": "2024-10-31T04-28-49-04-00_gsm8k_5tQp9tbwUMj6NpjNKCAfVm.json",
128
+ "math": "2024-10-31T06-33-09-04-00_math_2CmjBedAfUxqvmcHRdBgyB.json",
129
+ "arc_easy": "2024-10-31T01-48-39-04-00_arc-easy_YbfuBT3usZXt2xgZkkR5dq.json",
130
+ "mmlu_pro": "2024-10-31T09-41-25-04-00_mmlu-pro_fyYT4aabPesfY5TpzFMPnd.json",
131
+ "humaneval": "2024-10-31T06-29-24-04-00_humaneval_nu8SUSGekKJWB8HLKDigYK.json",
132
+ "hellaswag": "2024-10-31T04-50-00-04-00_hellaswag_ZzQoZ6gkRQsTzMhQr7GYNn.json",
133
+ "arc_challenge": "2024-10-31T01-54-13-04-00_arc-challenge_WfQRhMkFcywefpU46isBVP.json",
134
+ "winogrande": "2024-10-31T11-57-58-04-00_winogrande_TP3UGwpp37Dyv6ks9Ty5Hk.json"
135
+ },
136
+ "Qwen2.5-72B-Instruct": {
137
+ "arc_challenge": "2024-10-31T13-46-34-04-00_arc-challenge_FSybKYYwpXVLQag8VwpjKe.json",
138
+ "mmlu_pro": "2024-11-01T20-31-04-04-00_mmlu-pro_2TfSPmsVmKatntHy2CnR7A.json",
139
+ "gpqa_diamond": "2024-10-31T13-48-32-04-00_gpqa-diamond_8qSySicySUyNvRRYVFBKLU.json",
140
+ "winogrande": "2024-10-31T14-46-29-04-00_winogrande_CX692dYh53gJ6JigT9GMpa.json",
141
+ "mmlu": "2024-11-01T10-08-50-04-00_mmlu_AgK27yYvmAo2LxotBH7ZL9.json",
142
+ "hellaswag": "2024-11-01T02-55-55-04-00_hellaswag_RSk8rGcQWg3HRrLffTNoiM.json",
143
+ "gsm8k": "2024-11-01T01-15-16-04-00_gsm8k_3h4W6xZjXpz9oCwtgKNYzo.json",
144
+ "arc_easy": "2024-10-31T13-40-08-04-00_arc-easy_3JUyzfoEHxhSBUdCU2AaVC.json",
145
+ "math": "2024-11-01T10-06-46-04-00_math_UUpS2R9eQc9KxBxkanT2gE.json",
146
+ "ifeval": "2024-10-31T14-51-45-04-00_ifeval_VGxA7gTZLZSruceM9Ci37C.json",
147
+ "humaneval": "2024-10-31T14-49-39-04-00_humaneval_9u7khnxivCDroJoPNRFpjs.json",
148
+ "drop": "2024-10-31T15-03-20-04-00_drop_DDLi98VhiV2bLzuw7fx6H4.json"
149
+ }
150
+ }
refactor_eval_results.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+
5
+ METRIC_NAME = {
6
+ # single-turn
7
+ "arc_easy": "accuracy",
8
+ "arc_challenge": "accuracy",
9
+ "gpqa_diamond": "accuracy",
10
+ "drop": "mean",
11
+ "winogrande": "accuracy",
12
+ "gsm8k": "accuracy",
13
+ "hellaswag": "accuracy",
14
+ "humaneval": "mean",
15
+ "ifeval": "final_acc",
16
+ "math": "accuracy",
17
+ "mmlu": "accuracy",
18
+ "mmlu_pro": "accuracy",
19
+ "mmmu_multiple_choice": "accuracy",
20
+ "mmmu_open": "accuracy",
21
+
22
+ # agentic
23
+ "gaia": "mean",
24
+ "gdm_intercode_ctf": "accuracy",
25
+ }
26
+
27
+ MODEL_SHA_MAP = {
28
+ # open source models
29
+ "c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", # TODO: verify for the 08-2024 version
30
+ "Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
31
+ "Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
32
+ "Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
33
+
34
+ # closed source models
35
+ "claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
36
+ "gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash", # TODO: points to 2.0, can't find page for 1.5
37
+ "gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
38
+ "gpt-4o": "https://openai.com/index/hello-gpt-4o",
39
+ "gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
40
+ "o1": "https://openai.com/o1",
41
+ }
42
+
43
+
44
+ def combine_eval_results(results_path: str, model_name: str) -> dict:
45
+ results = dict(
46
+ {
47
+ "config": {
48
+ "model_name": model_name,
49
+ # dummy keys
50
+ "model_sha": MODEL_SHA_MAP[model_name],
51
+ "model_dtype": "torch.float16",
52
+ },
53
+ "results": {},
54
+ }
55
+ )
56
+ for file in os.listdir(os.path.join(results_path, model_name)):
57
+ if file.endswith(".json"):
58
+ with open(os.path.join(results_path, model_name, file), "r") as f:
59
+ try:
60
+ result = json.load(f)
61
+ task_name = result["eval"]["task"].split("/")[-1]
62
+ if task_name == "math":
63
+ metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"] # TODO: change scorer if required
64
+ else:
65
+ metrics = result["results"]["scores"][0]["metrics"]
66
+ metric_name = metrics[METRIC_NAME[task_name]]["name"]
67
+ metric_value = metrics[METRIC_NAME[task_name]]["value"]
68
+ results["results"].update(
69
+ {
70
+ task_name: {
71
+ metric_name: metric_value
72
+ }
73
+ }
74
+ )
75
+ except KeyError as e:
76
+ print(f"KeyError: {e}")
77
+ print(model_name)
78
+ print(file)
79
+ return results
80
+
81
+
82
+ def main():
83
+
84
+ CACHE_PATH=os.getenv("HF_HOME", ".")
85
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
86
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
87
+
88
+ base_bm_input_path = "./base_benchmarking_logs"
89
+ agentic_bm_input_path = "./agentic_benchmarking_logs"
90
+ os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
91
+ os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
92
+
93
+ for model_name in os.listdir(base_bm_input_path):
94
+ if os.path.isdir(os.path.join(base_bm_input_path, model_name)):
95
+ results = combine_eval_results(base_bm_input_path, model_name)
96
+ # TMP: Add dummy agentic benchmarks to the results
97
+ for metric in METRIC_NAME.items():
98
+ if metric[0] not in results["results"]:
99
+ results["results"].update({metric[0]: {metric[1]: None}})
100
+ if os.path.isdir(os.path.join(agentic_bm_input_path, model_name)):
101
+ agentic_bm_results = combine_eval_results(agentic_bm_input_path, model_name)
102
+ results["results"].update(agentic_bm_results["results"])
103
+ with open(os.path.join(EVAL_RESULTS_PATH, f"{model_name}.json"), "w") as f:
104
+ json.dump(results, f, indent=4)
105
+
106
+ # Create dummy requests file
107
+ requests = {
108
+ "model": model_name,
109
+ "model_sha": MODEL_SHA_MAP[model_name],
110
+ "base_model": "",
111
+ "revision": "main",
112
+ "private": False,
113
+ "precision": "float16",
114
+ "weight_type": "Original",
115
+ "status": "FINISHED",
116
+ "submitted_time": "",
117
+ "model_type": "pretrained",
118
+ "likes": 0,
119
+ "params": 0,
120
+ "license": "custom",
121
+ }
122
+ with open(os.path.join(EVAL_REQUESTS_PATH, f"{model_name}.json"), "w") as f:
123
+ json.dump(requests, f, indent=4)
124
+
125
+
126
+ if __name__ == "__main__":
127
+ main()
src/about.py CHANGED
@@ -1,96 +1,132 @@
1
- # Your leaderboard name
2
- TITLE = """<h1 align="center" id="space-title">Evaluation Leaderboard</h1>"""
3
 
4
- # SINGLE_TURN_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "base"])
5
- # AGENTIC_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "agentic"])
 
 
 
 
 
6
 
7
- # What does your leaderboard evaluate?
8
- INTRODUCTION_TEXT = f"""
9
- Powered by **Inspect** and **Inspect Evals**, the **Vector Evaluation Leaderboard** presents an evaluation of leading frontier models across a comprehensive suite of benchmarks. Go beyond the summary metrics: click through to interactive reporting for each model and benchmark to explore sample-level performance and detailed traces."""
10
 
11
- # Which evaluations are you running? how can people reproduce what you have?
12
- ABOUT_TEXT = f"""
 
 
13
 
14
- ## Vector Institute
15
- The **Vector Institute** is dedicated to advancing the field of artificial intelligence through cutting-edge research and application. Our mission is to drive excellence and innovation in AI, fostering a community of researchers, developers, and industry partners.
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- ## 🎯 Benchmarks
 
 
18
 
19
- This leaderboard showcases performance across a comprehensive suite of benchmarks, designed to rigorously evaluate different aspects of AI model capabilities. Let's explore the benchmarks we use:
20
 
21
- ### Inspect Evals
 
22
 
23
- This leaderboard leverages [Inspect Evals](https://ukgovernmentbeis.github.io/inspect_evals/) to power evaluation. Inspect Evals is an open-source repository built upon the Inspect AI framework. Developed in collaboration between the Vector Institute, Arcadia Impact and the UK AI Security Institute, Inspect Evals provides a comprehensive suite of high-quality benchmarks spanning diverse domains like coding, mathematics, cybersecurity, reasoning, and general knowledge.
24
 
25
- #### Transparent and Detailed Insights
26
 
27
- All evaluations presented on this leaderboard are run using Inspect Evals. To facilitate in-depth analysis and promote transparency, we provide [Inspect Logs](https://inspect.ai-safety-institute.org.uk/log-viewer.html) for every benchmark run. These logs offer sample and trace level reporting, allowing the community to explore the granular details of model performance.
28
-
29
- ### ⚙️ Base Benchmarks
30
 
31
- These benchmarks assess fundamental reasoning and knowledge capabilities of models.
 
32
 
33
- <div class="benchmark-table-container">
 
 
34
 
35
- | Benchmark | Description |
36
- |--------------------|----------------------------------------------------------------------------------|
37
- | **ARC-Easy** / **ARC-Challenge** | Multiple-choice science questions. |
38
- | **DROP** | Comprehension benchmark evaluating advanced reasoning capability. |
39
- | **WinoGrande** | Commonsense reasoning challenge. |
40
- | **GSM8K** | Grade-school math word problems testing math capability & multi-step reasoning. |
41
- | **HellaSwag** | Commonsense reasoning task. |
42
- | **HumanEval** | Evaluates code generation and reasoning in a programming context. |
43
- | **IFEval** | Specialized benchmark for instruction following. |
44
- | **MATH** | Challenging questions sourced from math competitions. |
45
- | **MMLU** / **MMLU-Pro**| Multi-subject multiple-choice tests of advanced knowledge. |
46
- | **GPQA-Diamond** | Question-answering benchmark assessing deeper reasoning. |
47
- | **MMMU** (Multi-Choice / Open-Ended) | Multi-modal tasks testing structured & open responses. |
48
- </div>
49
 
50
- ### 🚀 Agentic Benchmarks
 
51
 
52
- These benchmarks go beyond basic reasoning and evaluate more advanced, autonomous, or "agentic" capabilities of models, such as planning and interaction.
 
53
 
54
- <div class="benchmark-table-container">
55
 
56
- | Benchmark | Description |
57
- |-----------------------|----------------------------------------------------------------------------|
58
- | **GAIA** | Evaluates autonomous reasoning, planning, problem-solving for question answering. |
59
- | **InterCode-CTF** | Capture-the-flag challenge testing cyber-security skills. |
60
- | **In-House-CTF** | Capture-the-flag challenge testing cyber-security skills. |
61
- | **AgentHarm** / **AgentHarm-Benign** | Measures harmfulness of LLM agents (and benign behavior baseline). |
62
- | **SWE-Bench-Verified** | Tests AI agent ability to solve software engineering tasks. |
63
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  """
65
 
66
- REPRODUCIBILITY_TEXT = """
67
- ## 🛠️ Reproducibility
68
- The [Vector State of Evaluation Leaderboard Repository](https://github.com/VectorInstitute/evaluation) repository contains the evaluation script to reproduce results presented on the leaderboard.
69
-
70
- ### Install dependencies
71
 
72
- 1. Create a python virtual env. with ```python>=3.10``` and activate it
73
- ```bash
74
- python -m venv env
75
- source env/bin/activate
 
 
76
  ```
 
77
 
78
- 2. Install ```inspect_ai```, ```inspect_evals``` and other dependencies based on ```requirements.txt```
79
- ```bash
80
- python -m pip install -r requirements.txt
81
- ```
82
 
83
- 3. Install any packages required for models you'd like to evaluate and use as grader models
84
- ```bash
85
- python -m pip install <model_package>
86
- ```
87
- Note: ```openai``` package is already included in ```requirements.txt```
88
 
89
- ### Run Inspect evaluation
90
- 1. Update the ```src/evals_cfg/run_cfg.yaml``` file to select the evals (base/agentic) and include all models to be evaluated
91
- 2. Now run evaluation as follows:
92
- ```bash
93
- python src/run_evals.py
94
- ```
 
 
 
 
95
  """
96
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
 
4
+ @dataclass
5
+ class Task:
6
+ benchmark: str
7
+ metric: str
8
+ col_name: str
9
+ type: str
10
+ source: str
11
 
 
 
 
12
 
13
+ # Select your tasks here
14
+ # ---------------------------------------------------
15
+ class Tasks(Enum):
16
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
17
 
18
+ # base
19
+ task0 = Task("arc_easy", "accuracy", "ARC-Easy", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
20
+ task1 = Task("arc_challenge", "accuracy", "ARC-Challenge", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc")
21
+ task2 = Task("drop", "mean", "DROP", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop")
22
+ task3 = Task("winogrande", "accuracy", "WinoGrande", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande")
23
+ task4 = Task("gsm8k", "accuracy", "GSM8K", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k")
24
+ task5 = Task("hellaswag", "accuracy", "HellaSwag", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag")
25
+ task6 = Task("humaneval", "mean", "HumanEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval")
26
+ task7 = Task("ifeval", "final_acc", "IFEval", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval")
27
+ task8 = Task("math", "accuracy", "MATH", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics")
28
+ task9 = Task("mmlu", "accuracy", "MMLU", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu")
29
+ task10 = Task("mmlu_pro", "accuracy", "MMLU-Pro", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro")
30
+ task11 = Task("gpqa_diamond", "accuracy", "GPQA-Diamond", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa")
31
+ task12 = Task("mmmu_multiple_choice", "accuracy", "MMMU-Multiple-Choice", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
32
+ task13 = Task("mmmu_open", "accuracy", "MMMU-Open-Ended", "base", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu")
33
 
34
+ # agentic
35
+ task14 = Task("gaia", "mean", "GAIA", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia")
36
+ task15 = Task("gdm_intercode_ctf", "accuracy", "GDM-InterCode-CTF", "agentic", "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf")
37
 
 
38
 
39
+ NUM_FEWSHOT = 0 # Change with your few shot
40
+ # ---------------------------------------------------
41
 
 
42
 
 
43
 
44
+ # Your leaderboard name
45
+ TITLE = """<h1 align="center" id="space-title">Vector State of Evaluation Leaderboard</h1>"""
 
46
 
47
+ SINGLE_TURN_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "base"])
48
+ AGENTIC_TASK_NAMES = ", ".join([f"[{task.value.col_name}]({task.value.source})" for task in Tasks if task.value.type == "agentic"])
49
 
50
+ # What does your leaderboard evaluate?
51
+ INTRODUCTION_TEXT = f"""
52
+ This leaderboard presents the performance of selected LLM models on a set of tasks. The tasks are divided into two categories: base and agentic. The base tasks are: {SINGLE_TURN_TASK_NAMES}. The agentic tasks are: {AGENTIC_TASK_NAMES}."""
53
 
54
+ # Which evaluations are you running? how can people reproduce what you have?
55
+ LLM_BENCHMARKS_TEXT = f"""
56
+ # Vector State of Evaluation Leaderboard
 
 
 
 
 
 
 
 
 
 
 
57
 
58
+ ## Overview
59
+ The **Vector State of Evaluation Leaderboard** presents the performance of selected LLM models on a variety of tasks. These tasks are divided into two categories:
60
 
61
+ - **Base Tasks**: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond, MMMU-Multiple-Choice, MMMU-Open-Ended
62
+ - **Agentic Tasks**: GAIA, GDM-InterCode-CTF
63
 
64
+ Users can compare models side by side to see how they perform on both base-level understanding tasks and more advanced, “agentic” tasks.
65
 
66
+ ## Vector Institute
67
+ The **Vector Institute** is dedicated to advancing the fields of artificial intelligence and machine learning through cutting-edge research, collaborative projects, and open-source contributions. This leaderboard is part of Vector’s broader effort to promote transparency and progress in AI research.
68
+
69
+ ## Model
70
+ We evaluate a variety of **Large Language Models (LLMs)** across the included benchmarks. Each model:
71
+ - Is tested on the same set of tasks.
72
+ - Has standardized prompts or evaluation methodologies.
73
+ - Generates performance metrics (accuracy, F1, etc.) for comparison.
74
+
75
+ Our goal is to provide clear, reproducible metrics that shed light on how each model handles different task complexities and reasoning requirements.
76
+
77
+ ## Benchmarks
78
+ Here is a closer look at each benchmark included in the leaderboard:
79
+
80
+ ### Base Benchmarks
81
+ - **ARC-Easy / ARC-Challenge**: A set of multiple-choice science questions designed to measure a model’s scientific and commonsense reasoning.
82
+ - **DROP**: A reading comprehension benchmark emphasizing discrete reasoning steps.
83
+ - **WinoGrande**: A commonsense reasoning challenge focused on co-reference resolution.
84
+ - **GSM8K**: Grade-school math word problems testing arithmetic and multi-step reasoning.
85
+ - **HellaSwag**: A commonsense inference task centered on action completion.
86
+ - **HumanEval**: Evaluates code generation and reasoning in a programming context.
87
+ - **IFEval**: A specialized benchmark for incremental formal reasoning.
88
+ - **MATH**: High school-level math questions requiring detailed solutions.
89
+ - **MMLU / MMLU-Pro**: Multi-subject multiple-choice tests covering advanced high school and collegiate-level knowledge.
90
+ - **GPQA-Diamond**: A question-answering benchmark that assesses deeper reasoning and knowledge linking.
91
+ - **MMMU (Multiple-Choice / Open-Ended)**: A suite of multilingual and multi-domain tasks testing both structured and open-form responses.
92
+
93
+ ### Agentic Benchmarks
94
+ - **GAIA**: Evaluates more autonomous or “agentic” reasoning, including planning and problem-solving.
95
+ - **GDM-InterCode-CTF**: A capture-the-flag style challenge focusing on code interpretation and generative debugging strategies.
96
+
97
+ ---
98
  """
99
 
100
+ EVALUATION_QUEUE_TEXT = """
101
+ ## Some good practices before submitting a model
 
 
 
102
 
103
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
104
+ ```python
105
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
106
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
107
+ model = AutoModel.from_pretrained("your model name", revision=revision)
108
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
109
  ```
110
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
111
 
112
+ Note: make sure your model is public!
113
+ Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
 
 
114
 
115
+ ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
116
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
 
 
 
117
 
118
+ ### 3) Make sure your model has an open license!
119
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
120
+
121
+ ### 4) Fill up your model card
122
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card
123
+
124
+ ## In case of model failure
125
+ If your model is displayed in the `FAILED` category, its execution stopped.
126
+ Make sure you have followed the above steps first.
127
+ If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
128
  """
129
 
130
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
131
+ CITATION_BUTTON_TEXT = r"""
132
+ """
src/assets/logo-icon-black.png DELETED
Binary file (32.4 kB)
 
src/assets/logo-icon-white.png DELETED
Binary file (32.6 kB)
 
src/display/css_html_js.py CHANGED
@@ -1,192 +1,236 @@
1
- custom_js = """
2
- function tableLinkHack() {
3
- // This is a hack to make the table links work
4
- var allTableLinks = document.querySelectorAll(".llm-benchmark-tab-table .table-wrap table.table a")
5
- for (var link of allTableLinks) {
6
- link.addEventListener("click", e => {
7
- window.open(e.target.href, e.target.target);
8
- });
9
- }
10
- }
11
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  custom_css = """
14
- footer {visibility: hidden}
15
-
16
- #main {
17
- max-width: 1200px;
18
- margin: 0 auto;
19
- padding: 0 20px;
20
- }
21
-
22
- .gradio-container {
23
- max-width: 1200px !important;
24
- margin: 0 auto !important;
25
- }
26
-
27
- .intro-text {
28
- text-align: center; /* Center the text */
29
- font-size: 3rem; /* Slightly larger font size if desired */
30
- color: #555; /* A slightly softer color than black */
31
- margin-bottom: 5px; /* Add some space below the text before the tabs */
32
- padding: 0 10px; /* Add some horizontal padding for wider screens */
33
- line-height: 1.2; /* Improve readability with line height */
34
- max-width: 1200px; /* Limit width for better readability on large screens */
35
- margin-left: auto; /* Center the block horizontally */
36
- margin-right: auto; /* Center the block horizontally */
37
- }
38
 
39
  .tab-buttons button {
40
  font-size: 20px;
41
  }
42
 
43
- .intro-block {
44
- padding: 20px;
45
- }
46
-
47
- .header-row {
48
- height: 0;
49
- min-height: 0;
50
- }
51
-
52
- .tabitem {
53
- padding-top: 0;
54
- }
55
-
56
- .html-container {
57
- padding: 0;
58
- }
59
-
60
- #page-header {
61
- display: flex;
62
- justify-content: center;
63
- text-align: center;
64
- margin-bottom: 1rem;
65
- }
66
-
67
- #header-container{
68
- display: flex;
69
- width: 800px;
70
- }
71
-
72
- #left-container {
73
- flex: 1;
74
- }
75
-
76
- #left-container #black-logo, #left-container #white-logo {
77
- height: 150px;
78
- width: 150px;
79
- }
80
-
81
- #left-container #black-logo {
82
- display: block;
83
- }
84
-
85
- #left-container #white-logo {
86
- display: none;
87
- }
88
-
89
- #centre-container {
90
- align-self: center;
91
- }
92
-
93
- #right-container {
94
- flex: 1;
95
- align-self: center;
96
- }
97
-
98
- #right-container #black-logo, #right-container #white-logo {
99
- height: 150px;
100
- width: 150px;
101
- }
102
-
103
- #right-container #black-logo {
104
- display: block;
105
- }
106
-
107
- #right-container #white-logo {
108
- display: none;
109
- }
110
-
111
- .llm-benchmark-tab-table .table-wrap table.table {
112
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
113
- color: rgb(97, 97, 97);
114
- overflow-y: auto;
115
- overflow-x: auto;
116
- }
117
-
118
- .llm-benchmark-tab-table .table-wrap table.table a {
119
- font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
120
- color: #ec4899
121
- }
122
-
123
- .llm-benchmark-tab-table .table-wrap table.table tr td,
124
- .llm-benchmark-tab-table .table-wrap table.table tr th {
125
- border-bottom: 1px solid var(--border-color-primary);
126
- }
127
-
128
- .llm-benchmark-tab-table .table-wrap table.table a {
129
- color: rgb(2, 136, 209) !important;
130
- }
131
-
132
- .llm-benchmark-tab-table .table-wrap table.table th {
133
- font-size: 14px; /* Smaller font size for table headers */
134
- font-weight: 600; /* Semi-bold instead of full bold */
135
- }
136
-
137
- @media (prefers-color-scheme: dark) {
138
- #left-container #black-logo {
139
- display: none;
140
- }
141
- #left-container #white-logo {
142
- display: block;
143
- }
144
- #right-container #black-logo {
145
- display: none;
146
- }
147
- #right-container #white-logo {
148
- display: block;
149
- }
150
- }
151
-
152
- .benchmark-table-container table {
153
- width: 100%; /* Make table take full width of its container */
154
- border-collapse: collapse; /* Remove spacing between table cells */
155
- margin-bottom: 20px; /* Add some space below the table */
156
- }
157
-
158
- .benchmark-table-container th, .benchmark-table-container td {
159
- border: 1px solid #ddd; /* Light gray border for cells */
160
- padding: 8px; /* Padding within cells for better spacing */
161
- text-align: left; /* Align text to the left within cells */
162
- vertical-align: top; /* Align content to the top of cells */
163
- }
164
 
165
- .benchmark-table-container th {
166
- background-color: #f2f2f2; /* Light gray background for header row */
167
- font-weight: bold; /* Make header text bold */
168
- }
169
 
170
- .benchmark-table-container tbody tr:nth-child(even) {
171
- background-color: #f9f9f9; /* Very light gray background for even rows for zebra striping */
172
- }
173
 
174
- @media (prefers-color-scheme: dark) {
175
- #left-container #black-logo {
176
- display: none;
177
- }
178
- #left-container #white-logo {
179
- display: block;
180
- }
181
- .benchmark-table-container th {
182
- background-color: #3b3b3b;
183
- }
184
- .benchmark-table-container tbody tr:nth-child(even) {
185
- background-color: #2b2b2b;
186
- }
187
- }
188
 
189
- """
190
 
191
  get_window_url_params = """
192
  function(url_params) {
 
1
+ # custom_css = """
2
+
3
+ # .markdown-text {
4
+ # font-size: 16px !important;
5
+ # }
6
+
7
+ # #models-to-add-text {
8
+ # font-size: 18px !important;
9
+ # }
10
+
11
+ # #citation-button span {
12
+ # font-size: 16px !important;
13
+ # }
14
+
15
+ # #citation-button textarea {
16
+ # font-size: 16px !important;
17
+ # }
18
+
19
+ # #citation-button > label > button {
20
+ # margin: 6px;
21
+ # transform: scale(1.3);
22
+ # }
23
+
24
+ # #leaderboard-table {
25
+ # margin-top: 15px
26
+ # }
27
+
28
+ # #leaderboard-table-lite {
29
+ # margin-top: 15px
30
+ # }
31
+
32
+ # #search-bar-table-box > div:first-child {
33
+ # background: none;
34
+ # border: none;
35
+ # }
36
+
37
+ # #search-bar {
38
+ # padding: 0px;
39
+ # }
40
+
41
+ # /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
+ # #leaderboard-table td:nth-child(2),
43
+ # #leaderboard-table th:nth-child(2) {
44
+ # max-width: 400px;
45
+ # overflow: auto;
46
+ # white-space: nowrap;
47
+ # }
48
+
49
+ # .tab-buttons button {
50
+ # font-size: 20px;
51
+ # }
52
+
53
+ # #scale-logo {
54
+ # border-style: none !important;
55
+ # box-shadow: none;
56
+ # display: block;
57
+ # margin-left: auto;
58
+ # margin-right: auto;
59
+ # max-width: 600px;
60
+ # }
61
+
62
+ # #scale-logo .download {
63
+ # display: none;
64
+ # }
65
+ # #filter_type{
66
+ # border: 0;
67
+ # padding-left: 0;
68
+ # padding-top: 0;
69
+ # }
70
+ # #filter_type label {
71
+ # display: flex;
72
+ # }
73
+ # #filter_type label > span{
74
+ # margin-top: var(--spacing-lg);
75
+ # margin-right: 0.5em;
76
+ # }
77
+ # #filter_type label > .wrap{
78
+ # width: 103px;
79
+ # }
80
+ # #filter_type label > .wrap .wrap-inner{
81
+ # padding: 2px;
82
+ # }
83
+ # #filter_type label > .wrap .wrap-inner input{
84
+ # width: 1px
85
+ # }
86
+ # #filter-columns-type{
87
+ # border:0;
88
+ # padding:0.5;
89
+ # }
90
+ # #filter-columns-size{
91
+ # border:0;
92
+ # padding:0.5;
93
+ # }
94
+ # #box-filter > .form{
95
+ # border: 0
96
+ # }
97
+
98
+ # body, .gradio-container {
99
+ # font-family: Roboto, sans-serif;
100
+ # background-color: #ffffff;
101
+ # color: #000000; /* main text color */
102
+ # margin: 0;
103
+ # padding: 0;
104
+ # }
105
+
106
+ # h1, h2, h3, h4, h5, h6 {
107
+ # color: #eb088a; /* your brand color for headings */
108
+ # font-weight: 600;
109
+ # margin-bottom: 1rem;
110
+ # }
111
+
112
+ # /* Example ‘intro-block’ styling if you want extra flair */
113
+ # .intro-block {
114
+ # background-color: #eb088a10; /* light tinted background */
115
+ # padding: 1.5rem;
116
+ # border-radius: 10px;
117
+ # margin-bottom: 2rem;
118
+ # }
119
+
120
+ # """
121
+
122
+ # custom_css = """
123
+ # /* 1) Load Karbon Font: Make sure this points to your actual font files */
124
+ # @font-face {
125
+ # font-family: 'Karbon';
126
+ # src: url('path/to/Karbon.woff2') format('woff2'),
127
+ # url('path/to/Karbon.woff') format('woff');
128
+ # font-weight: normal;
129
+ # font-style: normal;
130
+ # }
131
+
132
+ # /* 2) Global Container */
133
+ # body, .gradio-container {
134
+ # font-family: 'Karbon', sans-serif;
135
+ # margin: 0;
136
+ # padding: 0;
137
+ # background-color: #fafafa; /* Light background */
138
+ # color: #000000;
139
+ # }
140
+
141
+ # .gradio-container {
142
+ # max-width: 1200px;
143
+ # margin: 0 auto;
144
+ # padding: 2rem 1rem;
145
+ # }
146
+
147
+ # /* 3) Headings, with brand color #eb088a */
148
+ # h1, h2, h3, h4, h5, h6 {
149
+ # color: #000000;
150
+ # margin-bottom: 1rem;
151
+ # font-weight: 600;
152
+ # }
153
+
154
+ # /* 4) Intro Block for a slight highlight */
155
+ # .intro-block {
156
+ # background-color: #ffe2f1; /* lighter tint of #eb088a */
157
+ # padding: 1.5rem;
158
+ # border-radius: 8px;
159
+ # border: 1px solid #f8badb;
160
+ # margin-bottom: 2rem;
161
+ # }
162
+
163
+ # /* 5) Tab styling - remove default orange styling */
164
+ # .tab-buttons {
165
+ # margin-top: 1rem;
166
+ # margin-bottom: 1rem;
167
+ # display: flex;
168
+ # }
169
+ # .tab-buttons > .tabitem {
170
+ # padding: 0.6rem 1.2rem;
171
+ # background-color: #ffffff;
172
+ # border: 1px solid #eb088a;
173
+ # border-radius: 6px;
174
+ # color: #eb088a;
175
+ # margin-right: 5px;
176
+ # cursor: pointer;
177
+ # transition: background-color 0.2s ease, color 0.2s ease;
178
+ # font-weight: 500;
179
+ # }
180
+ # .tab-buttons > .tabitem.selected {
181
+ # background-color: #eb088a;
182
+ # color: #ffffff;
183
+ # }
184
+ # .tab-buttons > .tabitem:hover {
185
+ # background-color: #eb088a;
186
+ # color: #ffffff;
187
+ # }
188
+
189
+ # /* 6) Dataframe Styling */
190
+ # .gr-dataframe table {
191
+ # width: 100%;
192
+ # border-collapse: collapse;
193
+ # border: 1px solid #cccccc;
194
+ # margin-bottom: 2rem;
195
+ # }
196
+ # .gr-dataframe th {
197
+ # background-color: #eb088a;
198
+ # color: #ffffff;
199
+ # padding: 0.6rem;
200
+ # text-align: left;
201
+ # font-weight: 600;
202
+ # }
203
+ # .gr-dataframe td {
204
+ # padding: 0.6rem;
205
+ # border-bottom: 1px solid #e0e0e0;
206
+ # }
207
+ # .gr-dataframe tr:nth-child(even) {
208
+ # background-color: #fdfdfd;
209
+ # }
210
+
211
+ # /* 7) Make default markdown text nice */
212
+ # .markdown-text p {
213
+ # margin-bottom: 1rem;
214
+ # line-height: 1.6;
215
+ # }
216
+ # """
217
 
218
  custom_css = """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
  .tab-buttons button {
221
  font-size: 20px;
222
  }
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
 
 
 
 
225
 
226
+ """
 
 
227
 
228
+ # .selected.svelte-1tcem6n.svelte-1tcem6n {
229
+ # background-color: #000000 !important; /* Desired background color */
230
+ # color: #eb088a !important; /* Desired text color */
231
+ # border-color: #eb088a !important; /* Desired border color */
232
+ # }
 
 
 
 
 
 
 
 
 
233
 
 
234
 
235
  get_window_url_params = """
236
  function(url_params) {
src/display/formatting.py CHANGED
@@ -5,8 +5,6 @@ def model_hyperlink(link, model_name):
5
  def make_clickable_model(model_name, model_sha):
6
  return model_hyperlink(model_sha, model_name)
7
 
8
- def make_clickable_field(name, url):
9
- return model_hyperlink(url, name)
10
 
11
  def styled_error(error):
12
  return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
 
5
  def make_clickable_model(model_name, model_sha):
6
  return model_hyperlink(model_sha, model_name)
7
 
 
 
8
 
9
def styled_error(error):
    """Wrap *error* in red, centered HTML for display in the UI."""
    style = "color: red; font-size: 20px; text-align: center;"
    return f"<p style='{style}'>{error}</p>"
src/display/utils.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ from src.about import Tasks
7
+
8
def fields(raw_class):
    """Return the values of all non-dunder class attributes, in definition order."""
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        collected.append(attr_value)
    return collected
10
+
11
+
12
+ # These classes are for user facing column names,
13
+ # to avoid having to change them all around the code
14
+ # when a modif is needed
15
@dataclass
class ColumnContent:
    # Describes one leaderboard column: its user-facing name, the gradio
    # dataframe column type (e.g. "markdown", "str"), and visibility flags.
    name: str  # user-facing column header
    type: str  # gradio dataframe column type
    displayed_by_default: bool  # shown without the user toggling it on
    hidden: bool = False  # excluded from the displayed column list (see COLS)
    never_hidden: bool = False  # cannot be toggled off (used for the model column)
22
+
23
## Leaderboard columns
# Each entry is [field_name, field_type, default_value] as expected by make_dataclass.
auto_eval_column_dict = []
# Init: the model column is always present and can never be hidden.
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# Scores: one column per benchmark declared in src.about.Tasks.
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "markdown", True)])

# We use make_dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
33
+
34
## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Columns shown for the pending/running/finished request queues.

    NOTE: the attributes are deliberately unannotated, so the dataclass
    generates no fields — they behave as shared class-level constants.
    """
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    # Bug fix: the third argument is displayed_by_default (a bool); the
    # previous value "Original" was a misplaced weight-type string. It was
    # truthy, so observable behaviour is unchanged.
    weight_type = ColumnContent("weight_type", "str", True)
    status = ColumnContent("status", "str", True)
43
+
44
## All the model information that we might need
@dataclass
class ModelDetails:
    """Payload stored on each ModelType/WeightType/Precision enum member."""
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class ModelType(Enum):
    """How a model was produced; the symbol is the emoji shown on the leaderboard."""
    PT = ModelDetails(name="pretrained", symbol="🟢")
    FT = ModelDetails(name="fine-tuned", symbol="🔶")
    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
    RL = ModelDetails(name="RL-tuned", symbol="🟦")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        """Return e.g. '🟢 pretrained' — symbol, separator, then name."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        """Parse a free-form type string (name or emoji) into a ModelType."""
        if "fine-tuned" in type or "🔶" in type:
            return ModelType.FT
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
        if "RL-tuned" in type or "🟦" in type:
            return ModelType.RL
        if "instruction-tuned" in type or "⭕" in type:
            return ModelType.IFT
        return ModelType.Unknown


class WeightType(Enum):
    """Whether submitted weights are full weights, an adapter, or a delta."""
    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")


class Precision(Enum):
    """Numeric precision the model was evaluated in."""
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(precision):
        """Map a dtype string (e.g. 'torch.float16') to a Precision member.

        Fix: decorated as @staticmethod — the previous plain function only
        worked because Enum treats callables as methods, and mirroring
        ModelType.from_str keeps the two helpers consistent.
        """
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        return Precision.Unknown
90
+
91
# Column selection
# Display names of every non-hidden leaderboard column.
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

# Names and gradio column types for the submission-queue tables.
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

# Benchmark columns split by task type ("base" vs "agentic") declared in src.about.Tasks.
ST_BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.type=="base"]
AGENTIC_BENCHMARK_COLS = [t.value.col_name for t in Tasks if t.value.type=="agentic"]
src/envs.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
+
9
+ OWNER = "vector-institute" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
+ # ----------------------------------
11
+
12
+ REPO_ID = f"{OWNER}/llm-eval-leaderboard"
13
+ QUEUE_REPO = f"{OWNER}/llm-eval-requests"
14
+ RESULTS_REPO = f"{OWNER}/llm-eval-results"
15
+
16
+ # If you setup a cache later, just change HF_HOME
17
+ CACHE_PATH=os.getenv("HF_HOME", ".")
18
+
19
+ # Local caches
20
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
+
25
+ API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+
10
+ from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
+ from src.submission.check_validity import is_model_on_hub
13
+
14
+
15
@dataclass
class EvalResult:
    """Represents one full evaluation. Built from a combination of the result
    and request file for a given run.
    """
    eval_name: str  # org_model_precision (uid)
    full_model: str  # org/model (path on hub)
    org: str
    model: str
    revision: str  # commit hash, "" if main
    results: dict  # benchmark name -> mean score on a 0-100 scale, or None when missing
    precision: Precision = Precision.Unknown
    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
    weight_type: WeightType = WeightType.Original  # Original or Adapter
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = False

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Inits the result from the specific model result file.

        Fix: the first parameter is now conventionally named ``cls`` — it
        receives the class (this is a classmethod), not an instance.
        """
        with open(json_filepath) as fp:
            data = json.load(fp)

        config = data.get("config")

        # Precision
        precision = Precision.from_str(config.get("model_dtype"))

        # Get model and org
        org_and_model = config.get("model_name", config.get("model_args", None))
        org_and_model = org_and_model.split("/", 1)

        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            result_key = f"{org}_{model}"
        full_model = "/".join(org_and_model)

        still_on_hub, _, model_config = is_model_on_hub(
            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
        )
        architecture = "?"
        if model_config is not None:
            architectures = getattr(model_config, "architectures", None)
            if architectures:
                architecture = ";".join(architectures)

        # Extract results available in this file (some results are split in several files)
        results = {}
        for task in Tasks:
            task = task.value

            # We average all scores of a given metric (not all metrics are present in all files)
            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
            if accs.size == 0:
                continue
            elif any([acc is None for acc in accs]):
                # Keep the benchmark key with None so the display layer can
                # render a placeholder instead of dropping the column.
                mean_acc = None
            else:
                mean_acc = np.mean(accs) * 100.0
            results[task.benchmark] = mean_acc

        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            precision=precision,
            revision=config.get("model_sha", ""),
            still_on_hub=still_on_hub,
            architecture=architecture,
        )

    def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it"""
        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)

        try:
            with open(request_file, "r") as f:
                request = json.load(f)
            self.model_type = ModelType.from_str(request.get("model_type", ""))
            self.weight_type = WeightType[request.get("weight_type", "Original")]
            self.license = request.get("license", "?")
            self.likes = request.get("likes", 0)
            self.num_params = request.get("params", 0)
            self.date = request.get("submitted_time", "")
        except Exception:
            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.revision),
        }

        # Raises KeyError when a benchmark is absent; get_raw_eval_results
        # relies on that to filter out incomplete evals.
        for task in Tasks:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        return data_dict
125
+
126
+
127
def get_request_file_for_model(requests_path, model_name, precision):
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
    pattern = os.path.join(
        requests_path,
        f"{model_name}.json",
    )
    candidates = sorted(glob.glob(pattern), reverse=True)

    # Select correct request file (precision); the last match wins, as before.
    wanted_precision = precision.split(".")[-1]
    selected = ""
    for candidate in candidates:
        with open(candidate, "r") as fp:
            content = json.load(fp)
        if content["status"] in ["FINISHED"] and content["precision"] == wanted_precision:
            selected = candidate
    return selected
147
+
148
+
149
def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results"""
    model_result_filepaths = []

    for root, _, files in os.walk(results_path):
        # We should only have json files in model results
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        # Sort the files by date
        try:
            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        except dateutil.parser._parser.ParserError:
            # NOTE(review): the sort key above is pure string slicing, so this
            # ParserError branch looks unreachable — confirm before removing.
            files = [files[-1]]

        for file in files:
            model_result_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
        eval_result.update_with_request_file(requests_path)

        # Store results of same eval together (later files override earlier
        # scores for the same benchmark via dict.update)
        eval_name = eval_result.eval_name
        if eval_name in eval_results.keys():
            eval_results[eval_name].results.update(eval_result.results)
        else:
            eval_results[eval_name] = eval_result

    results = []
    for v in eval_results.values():
        try:
            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError:  # not all eval values present
            continue

    return results
src/populate.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ from src.about import Tasks
8
+ from src.display.formatting import has_no_nan_values, make_clickable_model
9
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
10
+ from src.leaderboard.read_evals import get_raw_eval_results
11
+
12
+
13
# Maps a column display name back to its task metadata (benchmark id, task
# type, and source), used to look up log files from a column header.
TASK_NAME_INVERSE_MAP = dict()
for task in Tasks:
    TASK_NAME_INVERSE_MAP[task.value.col_name] = {
        "name": task.value.benchmark,
        "type": task.value.type,
        "source": task.value.source,
    }

# Placeholder rendered in table cells for missing scores.
EMPTY_SYMBOL = "--"
22
+
23
+
24
def get_inspect_log_url(model_name: str, benchmark_name: str) -> str:
    """Returns the URL to the log file for a given model and benchmark"""
    with open("./inspect_log_file_names.json", "r") as fp:
        log_index = json.load(fp)

    log_file_name = log_index[model_name].get(benchmark_name, None)
    if log_file_name is None:
        return ""

    # replace .json with .eval
    eval_file_name = log_file_name.replace(".json", ".eval")
    return f"https://storage.googleapis.com/inspect-evals/eval/{model_name}/index.html?log_file=logs/logs/{eval_file_name}"
35
+
36
+
37
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Creates a dataframe from all the individual experiment results"""
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]

    df = pd.DataFrame.from_records(all_data_json)

    df = df[cols].round(decimals=2)

    # subset for model and benchmark cols
    df = df[[AutoEvalColumn.model.name] + benchmark_cols]

    df = df.fillna(EMPTY_SYMBOL)

    # make values clickable and link to log files
    # NOTE(review): the model name is recovered by splitting the rendered HTML
    # anchor on '>' and '<'; this assumes make_clickable_model emits exactly
    # one anchor tag — confirm before changing that helper.
    for col in benchmark_cols:
        df[col] = df[[AutoEvalColumn.model.name, col]].apply(lambda x: f"[{x[col]}]({get_inspect_log_url(model_name=x[AutoEvalColumn.model.name].split('>')[1].split('<')[0], benchmark_name=TASK_NAME_INVERSE_MAP[col]['name'])})" if x[col] != EMPTY_SYMBOL else x[col], axis=1)

    return df
56
+
57
+
58
def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
    """Creates the different dataframes for the evaluation queue requests.

    Returns (finished, running, pending) dataframes, each restricted to ``cols``.
    """
    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
    all_evals = []

    for entry in entries:
        if ".json" in entry:
            file_path = os.path.join(save_path, entry)
            with open(file_path) as fp:
                data = json.load(fp)

            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"], data["model_sha"])
            data[EvalQueueColumn.revision.name] = data.get("revision", "main")

            all_evals.append(data)
        elif ".md" not in entry:
            # this is a folder
            # Bug fix: os.path.isfile must be given the full path; the bare
            # file name was resolved against the CWD, so every sub-entry was
            # silently skipped.
            sub_entries = [
                e
                for e in os.listdir(f"{save_path}/{entry}")
                if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")
            ]
            for sub_entry in sub_entries:
                file_path = os.path.join(save_path, entry, sub_entry)
                with open(file_path) as fp:
                    data = json.load(fp)

                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"], data["model_sha"])
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                all_evals.append(data)

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
src/submission/check_validity.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from datetime import datetime, timedelta, timezone
6
+
7
+ import huggingface_hub
8
+ from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo
10
+ from transformers import AutoConfig
11
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
12
+
13
def check_model_card(repo_id: str) -> tuple[bool, str]:
    """Checks if the model card and license exist and have been filled.

    Returns (ok, error_message); error_message is "" when the card passes.
    """
    try:
        card = ModelCard.load(repo_id)
    except huggingface_hub.utils.EntryNotFoundError:
        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."

    # Enforce license metadata
    if card.data.license is None:
        if not ("license_name" in card.data and "license_link" in card.data):
            return False, (
                "License not found. Please add a license to your model card using the `license` metadata or a"
                " `license_name`/`license_link` pair."
            )

    # Enforce card content
    # 200 characters is an arbitrary floor to reject empty/boilerplate cards.
    if len(card.text) < 200:
        return False, "Please add a description to your model card, it is too short."

    return True, ""
33
+
34
def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str, object]:
    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses.

    Returns (ok, error_message, config): error_message is None on success and
    config is the loaded AutoConfig (None on failure).
    Fix: the return annotation now reflects the 3-tuple actually returned.
    """
    try:
        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
        if test_tokenizer:
            try:
                # Loaded only to validate; the tokenizer object itself is unused.
                AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
            except ValueError as e:
                return (
                    False,
                    f"uses a tokenizer which is not in a transformers release: {e}",
                    None
                )
            except Exception:
                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
        return True, None, config

    except ValueError:
        # AutoConfig raises ValueError for models that require custom code.
        return (
            False,
            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
            None
        )

    except Exception:
        return False, "was not found on hub!", None
60
+
61
+
62
def get_model_size(model_info: "ModelInfo", precision: str):
    """Gets the model size from the configuration, or the model name if the configuration does not contain the information.

    Returns the parameter count in billions (0 when unknown); GPTQ-quantized
    models are scaled by 8 to approximate the unquantized size.
    """
    try:
        model_size = round(model_info.safetensors["total"] / 1e9, 3)
    except (AttributeError, TypeError, KeyError):
        # Fix: also catch KeyError, raised when safetensors metadata exists
        # but has no "total" entry.
        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py

    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
    model_size = size_factor * model_size
    return model_size
72
+
73
def get_model_arch(model_info: ModelInfo):
    """Gets the model architecture from the configuration"""
    # NOTE(review): assumes model_info.config is dict-like with an
    # "architectures" key — ModelInfo.config can be None for some repos; confirm.
    return model_info.config.get("architectures", "Unknown")
76
+
77
def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
    """Gather a list of already submitted models to avoid duplicates.

    Returns:
        A pair of (submission ids formatted as "model_revision_precision",
        mapping of organisation -> list of submission timestamps).
    Fix: the return annotation previously claimed ``set[str]`` although the
    function returns a (set, dict) pair; org parsing now tolerates model
    names containing more than one "/".
    """
    depth = 1  # request files live exactly one directory below the root (one folder per org)
    file_names = []
    users_to_submission_dates = defaultdict(list)

    for root, _, files in os.walk(requested_models_dir):
        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
        if current_depth == depth:
            for file in files:
                if not file.endswith(".json"):
                    continue
                with open(os.path.join(root, file), "r") as f:
                    info = json.load(f)
                file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")

                # Select organisation
                if info["model"].count("/") == 0 or "submitted_time" not in info:
                    continue
                organisation, _ = info["model"].split("/", 1)
                users_to_submission_dates[organisation].append(info["submitted_time"])

    return set(file_names), users_to_submission_dates
src/submission/submit.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime, timezone
4
+
5
+ from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
+ from src.submission.check_validity import (
8
+ already_submitted_models,
9
+ check_model_card,
10
+ get_model_size,
11
+ is_model_on_hub,
12
+ )
13
+
14
+ REQUESTED_MODELS = None
15
+ USERS_TO_SUBMISSION_DATES = None
16
+
17
def add_new_eval(
    model: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
):
    """Validate a leaderboard submission and push a PENDING request file to the queue repo.

    Performs a chain of checks (model type selected, model/base model loadable
    on the Hub, metadata retrievable, license and model card present, not a
    duplicate) and, on success, writes a request JSON locally, uploads it to
    QUEUE_REPO, then removes the local copy.

    Args:
        model: Hub repo id of the model ("org/name" or bare name).
        base_model: Base repo id, required for Delta/Adapter weights.
        revision: Git revision to evaluate; empty string defaults to "main".
        precision: Display precision string (e.g. "float16 (...)"); only the
            first token is kept.
        weight_type: One of "Original", "Delta", "Adapter".
        model_type: Leaderboard model-type category; must be non-empty.

    Returns:
        A styled HTML message (error, warning, or success) for the Gradio UI.
    """
    # Lazily load the duplicate/rate-limit caches once per process.
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    if not REQUESTED_MODELS:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

    user_name = ""
    model_path = model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]

    precision = precision.split(" ")[0]
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    if model_type is None or model_type == "":
        return styled_error("Please select a model type.")

    # Does the model actually exist?
    if revision == "":
        revision = "main"

    # Is the model on the hub?
    if weight_type in ["Delta", "Adapter"]:
        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
        if not base_model_on_hub:
            return styled_error(f'Base model "{base_model}" {error}')

    if not weight_type == "Adapter":
        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
        if not model_on_hub:
            return styled_error(f'Model "{model}" {error}')

    # Is the model info correctly filled?
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
    except Exception:
        return styled_error("Could not get your model information. Please fill it up properly.")

    model_size = get_model_size(model_info=model_info, precision=precision)

    # Were the model card and license filled?
    # (named model_license to avoid shadowing the `license` builtin)
    try:
        model_license = model_info.cardData["license"]
    except (KeyError, TypeError):
        # cardData missing entirely or lacking a license entry.
        return styled_error("Please select a license for your model")

    modelcard_OK, error_msg = check_model_card(model)
    if not modelcard_OK:
        return styled_error(error_msg)

    # Check for duplicate submission before doing any more work.
    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
        return styled_warning("This model has been already submitted.")

    # Seems good, creating the eval
    print("Adding new eval")

    eval_entry = {
        "model": model,
        "base_model": base_model,
        "revision": revision,
        "precision": precision,
        "weight_type": weight_type,
        "status": "PENDING",
        "submitted_time": current_time,
        "model_type": model_type,
        "likes": model_info.likes,
        "params": model_size,
        "license": model_license,
        "private": False,
    }

    print("Creating eval file")
    OUT_DIR = os.path.join(EVAL_REQUESTS_PATH, user_name)
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = os.path.join(OUT_DIR, f"{model_path}_eval_request_False_{precision}_{weight_type}.json")

    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))

    print("Uploading eval file")
    # NOTE(review): assumes EVAL_REQUESTS_PATH contains "eval-queue/" so the
    # split yields the repo-relative path — confirm against src.envs.
    API.upload_file(
        path_or_fileobj=out_path,
        path_in_repo=out_path.split("eval-queue/")[1],
        repo_id=QUEUE_REPO,
        repo_type="dataset",
        commit_message=f"Add {model} to eval queue",
    )

    # Remove the local file
    os.remove(out_path)

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
    )