deepmage121 commited on
Commit
d0ab546
·
1 Parent(s): 49c1354

initial commit, space + other info related to action

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .github/workflows/sync-to-hf.yml +55 -0
  2. .gitignore +8 -0
  3. .python-version +1 -0
  4. app.py +127 -531
  5. data_loader.py +317 -0
  6. eval.schema.json +282 -0
  7. hf_operations.py +202 -0
  8. json_to_parquet.py +228 -0
  9. leaderboard_data/HFOpenLLMv2/0-hero/0-hero_Matter-0.2-7B-DPO/40e80d5e-db72-46b7-bd14-b7d005df4be8.json +0 -107
  10. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-32K/0d91a153-1b6b-4891-8722-a5c7e372ba64.json +0 -107
  11. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-Chat-16K/2192007d-1f6e-4f74-b518-7448ef3a896e.json +0 -107
  12. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-Chat/e335874b-9b3e-4966-a7e0-22e9d16f8324.json +0 -107
  13. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B/8409c158-ef12-4e6c-8a1d-7be2084b3446.json +0 -107
  14. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-6B-Chat/3452e57f-3023-4e2e-ad84-b09e409fe334.json +0 -107
  15. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-6B/1a1f1263-96b6-4e32-a2c8-6c0d6b47dff9.json +0 -107
  16. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-32K/df9d9d44-daa1-4e61-9b46-192380043889.json +0 -107
  17. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-Chat-16K/090c9691-4b7e-4a98-b9a2-644e21797be4.json +0 -107
  18. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-Chat/9256c32b-d956-418f-97da-ea78e3ad9e48.json +0 -107
  19. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B/904d1f91-3153-49d5-afd3-9921bfc086f1.json +0 -107
  20. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B-200K/fb2ebd9a-f5b8-42a2-9b58-e6f0e7d9b98a.json +0 -107
  21. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B-Chat/5d9b9217-874b-426d-8af4-5105a3b1b3ad.json +0 -107
  22. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B/3ebcbf3d-cb2d-4332-bb8a-1db104033391.json +0 -107
  23. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B-200K/6b720e8b-aab8-4ba4-9bce-e7a1de3cfb86.json +0 -107
  24. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B-Chat/1120c801-7736-4d9d-b23d-08eeedb34186.json +0 -107
  25. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B/297419fa-855c-4eae-ad7c-3cf4a0262450.json +0 -107
  26. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-9B-200K/4299df04-495a-4687-b143-96b1b562d5e8.json +0 -107
  27. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-9B/0ec59add-f9a9-4dbd-8a83-c6aec0b8ad21.json +0 -107
  28. leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-Coder-9B-Chat/ef0cc3a5-0d62-4a45-b0c7-28a6f7dfdac4.json +0 -107
  29. leaderboard_data/HFOpenLLMv2/1-800-LLMs/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct/a48b0864-76b7-4860-a448-942a8d74f68e.json +0 -107
  30. leaderboard_data/HFOpenLLMv2/152334H/152334H_miqu-1-70b-sf/f57d7b8d-85d5-4e0b-8dec-31e2931487dd.json +0 -107
  31. leaderboard_data/HFOpenLLMv2/1TuanPham/1TuanPham_T-VisStar-7B-v0.1/1347cd1b-2ebc-4223-900f-7c2479e228a3.json +0 -107
  32. leaderboard_data/HFOpenLLMv2/1TuanPham/1TuanPham_T-VisStar-v0.1/b2926dd6-628c-4274-b0e8-1efc64269bb2.json +0 -107
  33. leaderboard_data/HFOpenLLMv2/3rd-Degree-Burn/3rd-Degree-Burn_L-3.1-Science-Writer-8B/0c4fd071-b5c9-4bf1-a1d5-d658be1a3258.json +0 -107
  34. leaderboard_data/HFOpenLLMv2/4season/4season_final_model_test_v2/74973e37-cd82-4e8a-816a-02b035fabff4.json +0 -107
  35. leaderboard_data/HFOpenLLMv2/AALF/AALF_FuseChat-Llama-3.1-8B-Instruct-preview/3766e8a0-99ad-4733-a01b-ced446b15eda.json +0 -107
  36. leaderboard_data/HFOpenLLMv2/AALF/AALF_FuseChat-Llama-3.1-8B-SFT-preview/342ac912-805f-4166-b8f4-10f0503fa892.json +0 -107
  37. leaderboard_data/HFOpenLLMv2/AGI-0/AGI-0_Art-v0-3B/162b6d5f-f983-4989-9603-f6baea26b633.json +0 -107
  38. leaderboard_data/HFOpenLLMv2/AI-MO/AI-MO_NuminaMath-7B-CoT/9ac2ba3c-9a21-46b2-a21c-4909cfae6315.json +0 -107
  39. leaderboard_data/HFOpenLLMv2/AI-MO/AI-MO_NuminaMath-7B-TIR/0ffa78d4-fe45-4639-bcd1-eb19ab168a35.json +0 -107
  40. leaderboard_data/HFOpenLLMv2/AI-Sweden-Models/AI-Sweden-Models_Llama-3-8B-instruct/1d68bd2e-de6e-4327-a8f1-33322eba537e.json +0 -107
  41. leaderboard_data/HFOpenLLMv2/AI4free/AI4free_Dhanishtha/a554a3eb-943c-4135-966b-929129ef025d.json +0 -107
  42. leaderboard_data/HFOpenLLMv2/AI4free/AI4free_t2/332ccdb5-faf5-47c6-afeb-a91d2148adf0.json +0 -107
  43. leaderboard_data/HFOpenLLMv2/AIDC-AI/AIDC-AI_Marco-o1/17f7398f-675d-4b38-b233-64fc106737c3.json +0 -107
  44. leaderboard_data/HFOpenLLMv2/Aashraf995/Aashraf995_Creative-7B-nerd/7ea9f4db-5b52-40a5-904e-785e43302934.json +0 -107
  45. leaderboard_data/HFOpenLLMv2/AbacusResearch/AbacusResearch_Jallabi-34B/76397277-901a-4ad0-9dae-0351ca875ec6.json +0 -107
  46. leaderboard_data/HFOpenLLMv2/Ahdoot/Ahdoot_StructuredThinker-v0.3-MoreStructure/81a5aafb-2cf7-490d-b619-ce638fcc8b38.json +0 -107
  47. leaderboard_data/HFOpenLLMv2/Ahdoot/Ahdoot_Test_StealthThinker/43c907eb-3e43-47ff-b38d-f912ba6ef46c.json +0 -107
  48. leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder/48732edf-8baf-438e-8a5c-763eee6c0c18.json +0 -107
  49. leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V0/38f169f0-e939-4b12-8f78-b2a27fb90de0.json +0 -107
  50. leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1/e8c63728-a1f5-432f-bf9f-204b0f4041aa.json +0 -107
.github/workflows/sync-to-hf.yml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Sync to HuggingFace Dataset
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ paths:
7
+ - 'data/**/*.json'
8
+ workflow_dispatch: # Allow manual trigger
9
+
10
+ jobs:
11
+ sync-to-huggingface:
12
+ runs-on: ubuntu-latest
13
+
14
+ steps:
15
+ - name: Checkout repository
16
+ uses: actions/checkout@v4
17
+ with:
18
+ fetch-depth: 2
19
+
20
+ - name: Set up Python
21
+ uses: actions/setup-python@v5
22
+ with:
23
+ python-version: '3.11'
24
+
25
+ - name: Install dependencies
26
+ run: |
27
+ pip install datasets huggingface_hub pandas pyarrow
28
+
29
+ - name: Convert Changed JSONs to Parquet (Optimized)
30
+ env:
31
+ HF_DATASET_REPO: deepmage121/eee_test
32
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
33
+ run: |
34
+ echo "Detecting changed leaderboards..."
35
+ python scripts/convert_to_parquet.py
36
+
37
+ - name: Upload Changed Parquets to HuggingFace
38
+ env:
39
+ HF_DATASET_REPO: deepmage121/eee_test
40
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
41
+ run: |
42
+ echo "Uploading changed parquets..."
43
+ python scripts/upload_to_hf.py
44
+
45
+ - name: Report status
46
+ if: success()
47
+ run: |
48
+ echo "Successfully synced to HuggingFace dataset"
49
+ echo "View at: https://huggingface.co/datasets/deepmage121/eee_test"
50
+ if [ -f parquet_output/changed_leaderboards.json ]; then
51
+ echo ""
52
+ echo "Changes processed:"
53
+ cat parquet_output/changed_leaderboards.json
54
+ fi
55
+
.gitignore CHANGED
@@ -1 +1,9 @@
1
  .DS_Store
 
 
 
 
 
 
 
 
 
1
  .DS_Store
2
+ .secrets
3
+ .actrc
4
+ __pycache__/
5
+ *.pyc
6
+ parquet_output/
7
+ *.venv*
8
+ *.md
9
+ *.ipynb_checkpoints
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.11
app.py CHANGED
@@ -1,479 +1,106 @@
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
- import json
4
  from pathlib import Path
5
 
6
- DATA_DIR = Path("leaderboard_data")
7
- LEADERBOARD_CACHE = {}
8
-
9
- def parse_eval_json(file_path):
10
- """Parses a single JSON file to extract model, provider, and results."""
11
- try:
12
- with open(file_path, 'r') as f:
13
- data = json.load(f)
14
-
15
- leaderboard_name = data.get("evaluation_source", {}).get("evaluation_source_name", "Unknown Leaderboard")
16
- provider_name = data.get("source_metadata", {}).get("source_organization_name", "Unknown Provider")
17
- model_id = data.get("model_info", {}).get("id", "Unknown Model")
18
- developer_name = data.get("model_info", {}).get("developer", "Unknown Developer")
19
-
20
- params = data.get("model_info", {}).get("params_billions", None)
21
- architecture = data.get("model_info", {}).get("architecture", "Unknown")
22
- precision = data.get("additional_details", {}).get("precision", "Unknown")
23
- if precision == "Unknown":
24
- precision = data.get("model_info", {}).get("precision", "Unknown")
25
-
26
- results = {}
27
- if "evaluation_results" in data:
28
- for res in data["evaluation_results"]:
29
- eval_name = res.get("evaluation_name", "Unknown Metric")
30
- score = res.get("score_details", {}).get("score", None)
31
- if score is not None:
32
- results[eval_name] = score
33
-
34
- return {
35
- "leaderboard": leaderboard_name,
36
- "provider": provider_name,
37
- "model": model_id,
38
- "developer": developer_name,
39
- "params": params,
40
- "architecture": architecture,
41
- "precision": precision,
42
- "results": results,
43
- "raw_data": data
44
- }
45
- except Exception as e:
46
- print(f"Error parsing {file_path}: {e}")
47
- return None
48
-
49
- def get_available_leaderboards():
50
- """Scans data directory for leaderboard folders."""
51
- if not DATA_DIR.exists():
52
- return []
53
- return [d.name for d in DATA_DIR.iterdir() if d.is_dir()]
54
-
55
- def normalize_leaderboard_name(name):
56
- """Normalizes leaderboard name to remove spaces."""
57
- return name.replace(" ", "")
58
-
59
- def sanitize_filename_component(name):
60
- """Sanitizes a name to be safe for use in directory names."""
61
- return name.replace("/", "_").replace("\\", "_").replace(":", "_").strip()
62
-
63
- def walk_eval_files(leaderboard_name):
64
- """Generator that walks through Leaderboard directory recursively."""
65
- lb_path = DATA_DIR / leaderboard_name
66
- if not lb_path.exists():
67
- return
68
-
69
- yield from lb_path.rglob("*.json")
70
 
71
- def get_eval_metadata(selected_leaderboard):
72
- """Extracts evaluation metadata from the leaderboard data."""
73
- if not selected_leaderboard:
74
- return {}
75
-
76
- eval_metadata = {"evals": {}, "source_info": {}}
77
-
78
- for json_file in walk_eval_files(selected_leaderboard):
79
- parsed = parse_eval_json(json_file)
80
- if parsed:
81
- if not eval_metadata["source_info"]:
82
- source_meta = parsed["raw_data"].get("source_metadata", {})
83
- source_data_list = parsed["raw_data"].get("source_data", [])
84
- url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
85
-
86
- eval_metadata["source_info"] = {
87
- "organization": source_meta.get("source_organization_name", "Unknown"),
88
- "relationship": source_meta.get("evaluator_relationship", "Unknown"),
89
- "url": url
90
- }
91
-
92
- if "evaluation_results" in parsed["raw_data"]:
93
- for res in parsed["raw_data"]["evaluation_results"]:
94
- eval_name = res.get("evaluation_name", "Unknown Metric")
95
- if eval_name not in eval_metadata["evals"]:
96
- metric_config = res.get("metric_config", {})
97
- eval_metadata["evals"][eval_name] = {
98
- "description": metric_config.get("evaluation_description", "No description available"),
99
- "score_type": metric_config.get("score_type", "unknown"),
100
- "lower_is_better": metric_config.get("lower_is_better", False),
101
- "min_score": metric_config.get("min_score"),
102
- "max_score": metric_config.get("max_score"),
103
- "level_names": metric_config.get("level_names", []),
104
- "level_metadata": metric_config.get("level_metadata", []),
105
- "has_unknown_level": metric_config.get("has_unknown_level", False)
106
- }
107
- break
108
-
109
- return eval_metadata
110
 
111
- def format_eval_info_html(selected_leaderboard):
112
- """Formats evaluation metadata into a responsive HTML grid."""
113
  if not selected_leaderboard:
114
- return """
115
- <div style="text-align: center; padding: 3rem; color: var(--body-text-color-subdued);">
116
- <h3>👋 Welcome to Eval Leaderboard</h3>
117
- <p>Select a leaderboard above to visualize results and metadata.</p>
118
- </div>
119
- """
120
-
121
- metadata = get_eval_metadata(selected_leaderboard)
122
- if not metadata or not metadata.get("evals"):
123
- return f"""<div style="padding: 1rem;">No metadata found for {selected_leaderboard}</div>"""
124
-
125
- source_info = metadata.get("source_info", {})
126
- evals = metadata.get("evals", {})
127
- unique_evals_count = len(evals)
128
-
129
- eval_badges = "".join([
130
- f'<span style="background: var(--background-fill-secondary); border: 1px solid var(--border-color-primary); padding: 2px 8px; border-radius: 4px; font-size: 0.85rem; white-space: nowrap;">{name}</span>'
131
- for name in sorted(evals.keys())
132
- ])
133
-
134
- source_url = source_info.get('url', '#')
135
- source_link = f'<a href="{source_url}" target="_blank" style="text-decoration: none; color: var(--link-text-color); hover: underline;">🔗 {source_info.get("organization", "Unknown")}</a>'
136
-
137
- html = f"""
138
- <div style="
139
- background: var(--block-background-fill);
140
- border: 1px solid var(--border-color-primary);
141
- border-radius: 8px;
142
- padding: 1.5rem;
143
- margin-bottom: 2rem;
144
- box-shadow: var(--shadow-sm);
145
- ">
146
- <h2 style="margin-top: 0; margin-bottom: 1rem;">📊 {selected_leaderboard}</h2>
147
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1.5rem;">
148
- <div>
149
- <div style="font-size: 0.85rem; color: var(--body-text-color-subdued); text-transform: uppercase; letter-spacing: 0.05em; font-weight: 600;">Source Organization</div>
150
- <div style="font-size: 1.1rem; font-weight: 500;">{source_link}</div>
151
- </div>
152
- <div>
153
- <div style="font-size: 0.85rem; color: var(--body-text-color-subdued); text-transform: uppercase; letter-spacing: 0.05em; font-weight: 600;">Evaluator Relationship</div>
154
- <div style="font-size: 1.1rem; font-weight: 500;">{source_info.get('relationship', 'Unknown').replace('_', ' ').title()}</div>
155
- </div>
156
- <div>
157
- <div style="font-size: 0.85rem; color: var(--body-text-color-subdued); text-transform: uppercase; letter-spacing: 0.05em; font-weight: 600; margin-bottom: 0.5rem;">Included Evaluations</div>
158
- <div style="display: flex; flex-wrap: wrap; gap: 0.5rem;">{eval_badges}</div>
159
- </div>
160
- </div>
161
- </div>
162
-
163
- <h3 style="margin-bottom: 1rem;">Metric Details</h3>
164
- """
165
 
166
- html += """
167
- <div style="
168
- display: grid;
169
- grid-template-columns: repeat(auto-fill, minmax(350px, 1fr));
170
- gap: 1rem;
171
- ">
172
- """
173
 
174
- for eval_name, info in evals.items():
175
- score_type = info['score_type'].upper() if info['score_type'] else "UNKNOWN"
176
- direction = "Lower is better" if info['lower_is_better'] else "Higher is better"
177
- direction_icon = "↓" if info['lower_is_better'] else "↑"
178
 
179
- details_content = ""
180
- if info['score_type'] == "continuous" and info.get('min_score') is not None:
181
- details_content += f"<div><span style='opacity: 0.7;'>Range:</span> <strong>[{info['min_score']} - {info['max_score']}]</strong></div>"
182
- elif info['score_type'] == "levels" and info.get('level_names'):
183
- levels = ", ".join(info['level_names'])
184
- details_content += f"<div><span style='opacity: 0.7;'>Levels:</span> <strong>{levels}</strong></div>"
185
 
186
- if info.get('has_unknown_level'):
187
- details_content += "<div style='margin-top: 0.25rem; font-size: 0.8rem; opacity: 0.7;'>* -1 indicates Unknown</div>"
188
-
189
- html += f"""
190
- <details style="
191
- background: var(--background-fill-secondary);
192
- border: 1px solid var(--border-color-primary);
193
- border-radius: 6px;
194
- overflow: hidden;
195
- height: fit-content;
196
- ">
197
- <summary style="
198
- padding: 0.75rem 1rem;
199
- cursor: pointer;
200
- font-weight: 600;
201
- display: flex;
202
- align-items: center;
203
- justify-content: space-between;
204
- list-style: none;
205
- font-size: 0.95rem;
206
- ">
207
- <div style="display: flex; align-items: center; gap: 0.5rem;">
208
- <span style="font-size: 1.1rem; opacity: 0.8;">🏷️</span>
209
- <span style="white-space: nowrap; overflow: hidden; text-overflow: ellipsis;">{eval_name}</span>
210
- </div>
211
- <div style="display: flex; align-items: center; gap: 0.5rem;">
212
- <span style="font-size: 0.8rem; font-weight: 400; color: var(--body-text-color-subdued); white-space: nowrap;">{direction_icon} {direction}</span>
213
- </div>
214
- </summary>
215
 
216
- <div style="
217
- padding: 0.75rem 1rem;
218
- border-top: 1px solid var(--border-color-primary);
219
- background: var(--block-background-fill);
220
- font-size: 0.9rem;
221
- ">
222
- <p style="margin: 0 0 0.5rem 0; color: var(--body-text-color-subdued); line-height: 1.4;">
223
- {info['description']}
224
- </p>
225
- <div style="display: flex; justify-content: space-between; align-items: flex-end; margin-top: 0.5rem;">
226
- <div style="font-size: 0.85rem;">
227
- {details_content}
228
- </div>
229
- <span style="
230
- font-size: 0.7rem;
231
- padding: 1px 6px;
232
- border-radius: 4px;
233
- background: var(--background-fill-primary);
234
- border: 1px solid var(--border-color-primary);
235
- color: var(--body-text-color-subdued);
236
- ">{score_type}</span>
237
- </div>
238
- </div>
239
- </details>
240
- """
241
-
242
- html += "</div>"
243
- return html
244
 
245
- def update_leaderboard_table(selected_leaderboard, search_query="", group_by_model=False, progress=gr.Progress()):
246
  """Loads and aggregates data for the selected leaderboard."""
247
  if not selected_leaderboard:
248
- return pd.DataFrame(), format_eval_info_html(None)
249
 
250
- # Check cache
251
- full_df = None
252
- if selected_leaderboard in LEADERBOARD_CACHE:
253
- # Cache stores (df, meta_html)
254
- full_df, meta_html = LEADERBOARD_CACHE[selected_leaderboard]
255
- else:
256
- progress(0, desc=f"Scanning {selected_leaderboard}...")
257
- all_files = list(walk_eval_files(selected_leaderboard))
258
- total_files = len(all_files)
259
-
260
- rows = []
261
- for i, json_file in enumerate(all_files):
262
- if i % 100 == 0:
263
- progress((i / total_files), desc=f"Loading {selected_leaderboard}...")
264
- parsed = parse_eval_json(json_file)
265
- if parsed:
266
- row = {
267
- "Model": parsed["model"],
268
- "Developer": parsed["developer"],
269
- "Params (B)": parsed["params"],
270
- "Arch": parsed["architecture"],
271
- "Precision": parsed["precision"]
272
- }
273
- row.update(parsed["results"])
274
- rows.append(row)
275
-
276
- meta_html = format_eval_info_html(selected_leaderboard)
277
-
278
- if not rows:
279
- full_df = pd.DataFrame(columns=["Model", "Developer", "Params (B)", "Arch", "Precision", "Score"])
280
- else:
281
- full_df = pd.DataFrame(rows)
282
- numeric_cols = full_df.select_dtypes(include=['float', 'int']).columns
283
- full_df[numeric_cols] = full_df[numeric_cols].round(3)
284
-
285
- LEADERBOARD_CACHE[selected_leaderboard] = (full_df, meta_html)
286
-
287
- # Filter by search query
288
- df = full_df.copy()
289
- if search_query:
290
- df = df[
291
- df["Model"].str.contains(search_query, case=False, na=False) |
292
- df["Developer"].str.contains(search_query, case=False, na=False)
293
- ]
294
-
295
- # Group by model and average scores if requested
296
- if group_by_model and not df.empty:
297
- # Identify grouping columns (non-numeric usually, or specific base cols)
298
- # We group by the base identifiers.
299
- base_cols_all = ["Model", "Developer", "Params (B)", "Arch", "Precision"]
300
- group_cols = [c for c in base_cols_all if c in df.columns]
301
-
302
- # Identify columns to average (numeric)
303
- numeric_cols = df.select_dtypes(include=['number']).columns
304
- # Exclude group_cols from numeric_cols if they happen to be numeric (like Params)
305
- # But groupby keys can be numeric.
306
- # We want to average the SCORES.
307
- # Any numeric column NOT in group_cols should be averaged.
308
- agg_cols = [c for c in numeric_cols if c not in group_cols]
309
-
310
- if group_cols and agg_cols:
311
- df = df.groupby(group_cols)[agg_cols].mean().reset_index()
312
- df = df.round(3)
313
-
314
- # Drop columns where all values are null
315
- df = df.dropna(axis=1, how='all')
316
-
317
- if df.empty:
318
- return df, meta_html
319
-
320
- # Filter base_cols to only include columns that exist in df (in case some were dropped)
321
- base_cols = [c for c in ["Model", "Developer", "Params (B)", "Arch", "Precision"] if c in df.columns]
322
- eval_cols = [c for c in df.columns if c not in base_cols]
323
-
324
- cols = base_cols + eval_cols
325
- return df[cols], meta_html
326
-
327
- def find_json_files(path):
328
- """Recursively finds all JSON files in a directory or returns the file if it's a JSON file."""
329
- json_files = []
330
- path_obj = Path(path)
331
-
332
- if path_obj.is_file() and path_obj.suffix == ".json":
333
- json_files.append(path_obj)
334
- elif path_obj.is_dir():
335
- json_files.extend(path_obj.rglob("*.json"))
336
-
337
- return json_files
338
-
339
- def check_is_duplicate(save_dir, new_eval_id):
340
- """Checks if a file with the same evaluation_id already exists in the directory."""
341
- if not new_eval_id or not save_dir.exists():
342
- return False
343
-
344
- for existing_file in save_dir.glob("*.json"):
345
- try:
346
- with open(existing_file, 'r') as f:
347
- data = json.load(f)
348
- if data.get("evaluation_id") == new_eval_id:
349
- return True
350
- except:
351
- continue
352
- return False
353
-
354
- def handle_file_upload(files, progress=gr.Progress()):
355
- """Processes uploaded files/folders and saves them to the correct structure.
356
 
357
- Structure: Leaderboard/Provider/Model/<uuid>.json
358
- Preserves original filename (which already contains the UUID).
359
- """
360
- if not files:
361
- return gr.update(), "No files uploaded."
362
 
363
- saved_count = 0
364
- all_json_files = []
365
- skipped_count = 0
366
- duplicate_count = 0
367
 
368
- progress(0, desc="Scanning files...")
369
- for file_obj in files:
370
- path = file_obj.name if hasattr(file_obj, "name") else file_obj
371
- json_files = find_json_files(path)
372
-
373
- if Path(path).is_file() and Path(path).suffix != ".json":
374
- skipped_count += 1
375
-
376
- all_json_files.extend(json_files)
377
 
378
- total_files = len(all_json_files)
379
- for i, json_file in enumerate(all_json_files):
380
- progress((i / total_files), desc=f"Processing {json_file.name}...")
381
- try:
382
- parsed = parse_eval_json(json_file)
383
- if not parsed:
384
- continue
385
-
386
- leaderboard = normalize_leaderboard_name(parsed["leaderboard"])
387
- provider = parsed["provider"]
388
- model_id = parsed["model"]
389
- developer = parsed["developer"]
390
- eval_id = parsed["raw_data"].get("evaluation_id")
391
-
392
- # Sanitize names for directory structure
393
- sanitized_provider = sanitize_filename_component(developer)
394
- sanitized_model = sanitize_filename_component(model_id)
395
-
396
- # Create structure: Leaderboard/Developer/Model
397
- save_dir = DATA_DIR / leaderboard / sanitized_provider / sanitized_model
398
- save_dir.mkdir(parents=True, exist_ok=True)
399
-
400
- # Check for duplicates based on evaluation_id
401
- if check_is_duplicate(save_dir, eval_id):
402
- duplicate_count += 1
403
- continue
404
-
405
- # Preserve original filename
406
- filename = json_file.name
407
- save_path = save_dir / filename
408
-
409
- # Avoid overwriting by appending counter
410
- counter = 1
411
- while save_path.exists():
412
- stem = save_path.stem.rsplit('_', 1)[0] if '_' in save_path.stem else save_path.stem
413
- save_path = save_dir / f"{stem}_{counter}.json"
414
- counter += 1
415
-
416
- with open(save_path, 'w') as f:
417
- json.dump(parsed["raw_data"], f, indent=2)
418
-
419
- saved_count += 1
420
-
421
- except Exception as e:
422
- print(f"Failed to save {json_file}: {e}")
423
-
424
- # Clear cache since data changed
425
- LEADERBOARD_CACHE.clear()
426
-
427
- # Refresh leaderboard choices
428
- choices = get_available_leaderboards()
429
 
430
- msg_parts = [f"Processed {saved_count} files."]
431
- if duplicate_count > 0:
432
- msg_parts.append(f"Skipped {duplicate_count} duplicates.")
433
- if skipped_count > 0:
434
- msg_parts.append(f"Skipped {skipped_count} non-JSON files.")
435
-
436
- return gr.Dropdown(choices=choices), " ".join(msg_parts), None, None
437
 
438
- # Professional, high-contrast theme
439
- theme = gr.themes.Soft(
440
- primary_hue="slate",
441
- neutral_hue="slate",
442
- font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"]
443
- ).set(
444
- body_background_fill="var(--neutral-50)",
445
- block_background_fill="white",
446
- block_border_width="1px",
447
- block_title_text_weight="600"
448
- )
449
-
450
- css = """
451
- /* Clean up the global container */
452
- .gradio-container {
453
- max-width: 100% !important;
454
- padding: 0 2rem !important;
455
- }
456
-
457
- /* Table Styles */
458
- .dataframe {
459
- border: 1px solid var(--border-color-primary) !important;
460
- border-radius: 8px;
461
- }
462
 
463
- /* Hide file list in uploaders */
464
- .file-preview {
465
- display: none !important;
466
- }
467
- """
468
 
469
- with gr.Blocks(title="Eval Leaderboard", theme=theme, css=css) as demo:
 
470
 
471
  with gr.Row(variant="compact", elem_classes="header-row"):
472
  with gr.Column(scale=1):
473
  gr.Markdown("# 🏆 Evaluation Leaderboard")
474
  gr.Markdown("Analyze and compare model performance metrics.", elem_classes="subtitle")
475
 
476
- with gr.Row(variant="panel", equal_height=True):
477
  initial_choices = get_available_leaderboards()
478
  initial_value = initial_choices[0] if initial_choices else None
479
 
@@ -482,56 +109,51 @@ with gr.Blocks(title="Eval Leaderboard", theme=theme, css=css) as demo:
482
  choices=initial_choices,
483
  value=initial_value,
484
  label="Current Leaderboard",
485
- interactive=True,
486
- container=False,
487
- scale=1
488
- )
489
- with gr.Column(scale=2):
490
- search_box = gr.Textbox(
491
- label="Search Model/Developer",
492
- placeholder="🔍 Search model or developer...",
493
- show_label=False,
494
- container=False,
495
- scale=1
496
- )
497
- with gr.Column(scale=1, min_width=100):
498
- group_by_model = gr.Checkbox(
499
- label="Average by Model",
500
- value=False,
501
- container=False
502
  )
503
- with gr.Column(scale=1, min_width=100):
 
 
 
 
 
 
504
  refresh_btn = gr.Button("🔄 Refresh", variant="secondary", size="sm")
505
 
506
- with gr.Accordion("📤 Upload New Data", open=False):
507
- upload_mode = gr.Radio(
508
- choices=["Files", "Folder"],
509
- value="Files",
510
- label="Upload Mode",
511
- info="Choose 'Files' for individual JSONs, or 'Folder' to upload a directory structure."
512
- )
513
-
514
- with gr.Group(visible=True) as file_upload_group:
515
- file_uploader_files = gr.File(
516
- file_count="multiple",
517
- file_types=[".json"],
518
- label="Select JSON Files"
519
- )
520
-
521
- with gr.Group(visible=False) as folder_upload_group:
522
- file_uploader_folder = gr.File(
523
- file_count="directory",
524
- label="Select Folder"
525
- )
526
-
527
- upload_status = gr.Textbox(
528
- label="Upload Status",
529
- interactive=False
530
- )
531
 
532
- init_df, init_meta = update_leaderboard_table(initial_value)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
533
 
534
- metadata_view = gr.HTML(value=init_meta)
535
 
536
  leaderboard_table = gr.Dataframe(
537
  value=init_df,
@@ -541,59 +163,33 @@ with gr.Blocks(title="Eval Leaderboard", theme=theme, css=css) as demo:
541
  elem_classes="dataframe"
542
  )
543
 
544
- def toggle_upload_input(mode):
545
- return {
546
- file_upload_group: gr.Group(visible=(mode == "Files")),
547
- folder_upload_group: gr.Group(visible=(mode == "Folder"))
548
- }
549
 
550
- upload_mode.change(
551
- fn=toggle_upload_input,
552
- inputs=[upload_mode],
553
- outputs=[file_upload_group, folder_upload_group]
554
- )
555
-
556
- file_uploader_files.upload(
557
- fn=handle_file_upload,
558
- inputs=[file_uploader_files],
559
- outputs=[leaderboard_selector, upload_status, file_uploader_files, file_uploader_folder]
560
- )
561
-
562
- file_uploader_folder.upload(
563
- fn=handle_file_upload,
564
- inputs=[file_uploader_folder],
565
- outputs=[leaderboard_selector, upload_status, file_uploader_files, file_uploader_folder]
566
- )
567
 
 
568
  leaderboard_selector.change(
569
  fn=update_leaderboard_table,
570
- inputs=[leaderboard_selector, search_box, group_by_model],
571
- outputs=[leaderboard_table, metadata_view]
572
- )
573
-
574
- search_box.change(
575
- fn=update_leaderboard_table,
576
- inputs=[leaderboard_selector, search_box, group_by_model],
577
- outputs=[leaderboard_table, metadata_view]
578
  )
579
 
580
- group_by_model.change(
581
- fn=update_leaderboard_table,
582
- inputs=[leaderboard_selector, search_box, group_by_model],
583
- outputs=[leaderboard_table, metadata_view]
584
  )
585
 
586
  refresh_btn.click(
587
- fn=lambda: (gr.Dropdown(choices=get_available_leaderboards()), "Refreshed."),
588
- outputs=[leaderboard_selector, upload_status]
589
  ).then(
590
- fn=lambda: LEADERBOARD_CACHE.clear()
591
  ).then(
592
  fn=update_leaderboard_table,
593
- inputs=[leaderboard_selector, search_box, group_by_model],
594
- outputs=[leaderboard_table, metadata_view]
595
  )
596
-
597
  DATA_DIR.mkdir(exist_ok=True)
598
 
599
  if __name__ == "__main__":
 
1
+ """
2
+ Evaluation Leaderboard - Gradio Interface
3
+ Displays model evaluation results from HuggingFace datasets.
4
+ """
5
  import gradio as gr
6
  import pandas as pd
 
7
  from pathlib import Path
8
 
9
+ # Import custom modules
10
+ from data_loader import (
11
+ load_hf_dataset_on_startup,
12
+ get_available_leaderboards,
13
+ get_eval_metadata,
14
+ build_leaderboard_table,
15
+ clear_cache,
16
+ DATA_DIR
17
+ )
18
+ from ui_components import get_theme, get_custom_css, format_leaderboard_header, format_metric_details
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ def export_leaderboard_to_json(selected_leaderboard):
22
+ """Export current leaderboard to JSON files in a zip using parquet_to_folder."""
23
  if not selected_leaderboard:
24
+ return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
+ import tempfile
27
+ import shutil
28
+ import zipfile
29
+ from json_to_parquet import parquet_to_folder
 
 
 
30
 
31
+ try:
32
+ # Find the parquet file in DATA_DIR
33
+ parquet_path = DATA_DIR / selected_leaderboard / f"{selected_leaderboard}.parquet"
 
34
 
35
+ if not parquet_path.exists():
36
+ print(f"Parquet file not found: {parquet_path}")
37
+ return None
 
 
 
38
 
39
+ # Create temp directory for export
40
+ with tempfile.TemporaryDirectory() as temp_dir:
41
+ temp_path = Path(temp_dir)
42
+ output_dir = temp_path / "json_export"
43
+ output_dir.mkdir()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
+ # Use the round-trip functionality from json_to_parquet
46
+ parquet_to_folder(str(parquet_path), str(output_dir))
47
+
48
+ # Create zip file
49
+ zip_path = temp_path / f"{selected_leaderboard}_export.zip"
50
+ with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
51
+ for json_file in output_dir.rglob("*.json"):
52
+ arcname = json_file.relative_to(output_dir)
53
+ zipf.write(json_file, arcname)
54
+
55
+ # Copy to a permanent location for download
56
+ final_zip = Path(tempfile.gettempdir()) / f"{selected_leaderboard}_export.zip"
57
+ shutil.copy(zip_path, final_zip)
58
+
59
+ return str(final_zip)
60
+ except Exception as e:
61
+ print(f"Export error: {e}")
62
+ return None
63
+
 
 
 
 
 
 
 
 
 
64
 
65
+ def update_leaderboard_table(selected_leaderboard, search_query="", progress=gr.Progress()):
66
  """Loads and aggregates data for the selected leaderboard."""
67
  if not selected_leaderboard:
68
+ return pd.DataFrame(), "", format_leaderboard_header(None, {}), format_metric_details(None, {})
69
 
70
+ metadata = get_eval_metadata(selected_leaderboard)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
+ def progress_callback(value, desc):
73
+ progress(value, desc=desc)
 
 
 
74
 
75
+ df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
76
+ total_count = len(df)
 
 
77
 
78
+ # Apply search filter (searches all columns)
79
+ if search_query and not df.empty:
80
+ mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
81
+ df = df[mask]
 
 
 
 
 
82
 
83
+ # Build search status message
84
+ if search_query:
85
+ search_msg = f"Showing {len(df)} of {total_count} results for '{search_query}'"
86
+ else:
87
+ search_msg = f"Showing {len(df)} results"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
+ return df, search_msg, format_leaderboard_header(selected_leaderboard, metadata), format_metric_details(selected_leaderboard, metadata)
 
 
 
 
 
 
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ # Load HF dataset BEFORE building the interface
93
+ load_hf_dataset_on_startup()
 
 
 
94
 
95
+ # Build Gradio interface
96
+ with gr.Blocks(title="Eval Leaderboard", theme=get_theme(), css=get_custom_css()) as demo:
97
 
98
  with gr.Row(variant="compact", elem_classes="header-row"):
99
  with gr.Column(scale=1):
100
  gr.Markdown("# 🏆 Evaluation Leaderboard")
101
  gr.Markdown("Analyze and compare model performance metrics.", elem_classes="subtitle")
102
 
103
+ with gr.Row(variant="panel"):
104
  initial_choices = get_available_leaderboards()
105
  initial_value = initial_choices[0] if initial_choices else None
106
 
 
109
  choices=initial_choices,
110
  value=initial_value,
111
  label="Current Leaderboard",
112
+ interactive=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  )
114
+ with gr.Column(scale=3):
115
+ search_box = gr.Textbox(
116
+ label="Search",
117
+ placeholder="Type to search across all columns...",
118
+ show_label=False
119
+ )
120
+ with gr.Column(scale=1):
121
  refresh_btn = gr.Button("🔄 Refresh", variant="secondary", size="sm")
122
 
123
+ with gr.Accordion("ℹ️ How to Submit Data", open=False):
124
+ gr.Markdown("""
125
+ ### Submitting Evaluation Data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
+ **Data submissions happen via GitHub Pull Requests:**
128
+
129
+ 1. **Fork** [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
130
+ 2. **Add your JSON files** to `data/<leaderboard>/<developer>/<model>/`
131
+ 3. **Create a Pull Request**
132
+ 4. **Automated validation** checks your data
133
+ 5. **After merge**: GitHub Actions automatically syncs to HuggingFace
134
+ 6. **Refresh this page** to see your data!
135
+
136
+ #### File Structure
137
+ ```
138
+ data/
139
+ └── YourBenchmark/
140
+ └── developer_name/
141
+ └── model_name/
142
+ └── {uuid}.json
143
+ ```
144
+
145
+ Each JSON file should follow the schema and be named with a unique UUID.
146
+
147
+ 📖 [**Full Submission Guide**](https://github.com/evaleval/every_eval_ever#contributor-guide) |
148
+ 📋 [**JSON Schema**](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json) |
149
+ 👀 [**See Examples**](https://github.com/evaleval/every_eval_ever/tree/main/data)
150
+ """)
151
+
152
+ init_df, init_search_msg, init_header, init_metrics = update_leaderboard_table(initial_value)
153
+
154
+ header_view = gr.HTML(value=init_header)
155
 
156
+ search_info = gr.Markdown(value=init_search_msg)
157
 
158
  leaderboard_table = gr.Dataframe(
159
  value=init_df,
 
163
  elem_classes="dataframe"
164
  )
165
 
166
+ metrics_view = gr.HTML(value=init_metrics)
 
 
 
 
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
+ # Event handlers
170
  leaderboard_selector.change(
171
  fn=update_leaderboard_table,
172
+ inputs=[leaderboard_selector, search_box],
173
+ outputs=[leaderboard_table, search_info, header_view, metrics_view]
 
 
 
 
 
 
174
  )
175
 
176
+ search_box.input(
177
+ fn=update_leaderboard_table,
178
+ inputs=[leaderboard_selector, search_box],
179
+ outputs=[leaderboard_table, search_info, header_view, metrics_view]
180
  )
181
 
182
  refresh_btn.click(
183
+ fn=lambda: gr.Dropdown(choices=get_available_leaderboards()),
184
+ outputs=[leaderboard_selector]
185
  ).then(
186
+ fn=lambda: clear_cache()
187
  ).then(
188
  fn=update_leaderboard_table,
189
+ inputs=[leaderboard_selector, search_box],
190
+ outputs=[leaderboard_table, search_info, header_view, metrics_view]
191
  )
192
+
193
  DATA_DIR.mkdir(exist_ok=True)
194
 
195
  if __name__ == "__main__":
data_loader.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Loader: Load from HuggingFace, parse JSON files, and build tables.
3
+ """
4
+ import json
5
+ import pandas as pd
6
+ from pathlib import Path
7
+ from datasets import load_dataset
8
+
9
+
10
+ # Global caches
11
+ HF_DATASET_CACHE = {}
12
+ LEADERBOARD_CACHE = {}
13
+ DATA_DIR = Path("leaderboard_data")
14
+
15
+
16
+ def load_hf_dataset_on_startup():
17
+ """Load all splits from HuggingFace dataset at startup."""
18
+ print("Loading dataset from HuggingFace...")
19
+ try:
20
+ dataset = load_dataset("deepmage121/eee_test")
21
+
22
+ for split_name, split_data in dataset.items():
23
+ print(f"Loading split: {split_name} ({len(split_data)} rows)")
24
+
25
+ df = split_data.to_pandas()
26
+ parsed_items = []
27
+
28
+ for _, row in df.iterrows():
29
+ evaluation_results = json.loads(row['evaluation_results'])
30
+
31
+ results = {}
32
+ for eval_result in evaluation_results:
33
+ eval_name = eval_result.get("evaluation_name")
34
+ score = eval_result.get("score_details", {}).get("score")
35
+ if eval_name and score is not None:
36
+ results[eval_name] = score
37
+
38
+ additional_details = {}
39
+ if pd.notna(row.get('additional_details')):
40
+ additional_details = json.loads(row['additional_details'])
41
+
42
+ parsed_item = {
43
+ "leaderboard": row['_leaderboard'],
44
+ "provider": row['source_organization_name'],
45
+ "model": row['model_id'],
46
+ "developer": row['model_developer'],
47
+ "params": additional_details.get('params_billions'),
48
+ "architecture": additional_details.get('architecture', 'Unknown'),
49
+ "precision": additional_details.get('precision', 'Unknown'),
50
+ "results": results,
51
+ "raw_data": {
52
+ "schema_version": row['schema_version'],
53
+ "evaluation_id": row['evaluation_id'],
54
+ "retrieved_timestamp": row['retrieved_timestamp'],
55
+ "source_data": json.loads(row['source_data']),
56
+ "evaluation_source": {
57
+ "evaluation_source_name": row['evaluation_source_name'],
58
+ "evaluation_source_type": row['evaluation_source_type']
59
+ },
60
+ "source_metadata": {
61
+ "source_organization_name": row['source_organization_name'],
62
+ "evaluator_relationship": row['evaluator_relationship'],
63
+ },
64
+ "model_info": {
65
+ "name": row['model_name'],
66
+ "id": row['model_id'],
67
+ "developer": row['model_developer'],
68
+ },
69
+ "evaluation_results": evaluation_results,
70
+ "additional_details": additional_details
71
+ }
72
+ }
73
+
74
+ if pd.notna(row.get('source_organization_url')):
75
+ parsed_item["raw_data"]["source_metadata"]["source_organization_url"] = row['source_organization_url']
76
+ if pd.notna(row.get('source_organization_logo_url')):
77
+ parsed_item["raw_data"]["source_metadata"]["source_organization_logo_url"] = row['source_organization_logo_url']
78
+ if pd.notna(row.get('model_inference_platform')):
79
+ parsed_item["raw_data"]["model_info"]["inference_platform"] = row['model_inference_platform']
80
+
81
+ parsed_items.append(parsed_item)
82
+
83
+ HF_DATASET_CACHE[split_name] = parsed_items
84
+
85
+ print(f"Loaded {len(HF_DATASET_CACHE)} leaderboard(s) from HuggingFace")
86
+ return True
87
+ except Exception as e:
88
+ print(f"Warning: Could not load HuggingFace dataset: {e}")
89
+ print("Falling back to local file system...")
90
+ return False
91
+
92
+
93
+ def parse_eval_json(file_path):
94
+ """Parses a single JSON file to extract model, provider, and results."""
95
+ try:
96
+ with open(file_path, 'r') as f:
97
+ data = json.load(f)
98
+
99
+ leaderboard_name = data.get("evaluation_source", {}).get("evaluation_source_name", "Unknown Leaderboard")
100
+ provider_name = data.get("source_metadata", {}).get("source_organization_name", "Unknown Provider")
101
+ model_id = data.get("model_info", {}).get("id", "Unknown Model")
102
+ developer_name = data.get("model_info", {}).get("developer", "Unknown Developer")
103
+
104
+ params = data.get("model_info", {}).get("params_billions", None)
105
+ architecture = data.get("model_info", {}).get("architecture", "Unknown")
106
+ precision = data.get("additional_details", {}).get("precision", "Unknown")
107
+ if precision == "Unknown":
108
+ precision = data.get("model_info", {}).get("precision", "Unknown")
109
+
110
+ results = {}
111
+ if "evaluation_results" in data:
112
+ for res in data["evaluation_results"]:
113
+ eval_name = res.get("evaluation_name", "Unknown Metric")
114
+ score = res.get("score_details", {}).get("score", None)
115
+ if score is not None:
116
+ results[eval_name] = score
117
+
118
+ return {
119
+ "leaderboard": leaderboard_name,
120
+ "provider": provider_name,
121
+ "model": model_id,
122
+ "developer": developer_name,
123
+ "params": params,
124
+ "architecture": architecture,
125
+ "precision": precision,
126
+ "results": results,
127
+ "raw_data": data
128
+ }
129
+ except Exception as e:
130
+ print(f"Error parsing {file_path}: {e}")
131
+ return None
132
+
133
+
134
+ def get_available_leaderboards():
135
+ """Returns available leaderboards from HF cache or local directory."""
136
+ if HF_DATASET_CACHE:
137
+ return list(HF_DATASET_CACHE.keys())
138
+
139
+ if not DATA_DIR.exists():
140
+ return []
141
+ return [d.name for d in DATA_DIR.iterdir() if d.is_dir()]
142
+
143
+
144
+ def walk_eval_files(leaderboard_name):
145
+ """Generator that walks through Leaderboard directory recursively."""
146
+ lb_path = DATA_DIR / leaderboard_name
147
+ if not lb_path.exists():
148
+ return
149
+ yield from lb_path.rglob("*.json")
150
+
151
+
152
+ def get_eval_metadata(selected_leaderboard):
153
+ """Extracts evaluation metadata from the leaderboard data."""
154
+ if not selected_leaderboard:
155
+ return {}
156
+
157
+ eval_metadata = {"evals": {}, "source_info": {}}
158
+
159
+ if selected_leaderboard in HF_DATASET_CACHE:
160
+ parsed_items = HF_DATASET_CACHE[selected_leaderboard]
161
+ if parsed_items:
162
+ parsed = parsed_items[0]
163
+
164
+ source_meta = parsed["raw_data"].get("source_metadata", {})
165
+ source_data_list = parsed["raw_data"].get("source_data", [])
166
+ url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
167
+
168
+ eval_metadata["source_info"] = {
169
+ "organization": source_meta.get("source_organization_name", "Unknown"),
170
+ "relationship": source_meta.get("evaluator_relationship", "Unknown"),
171
+ "url": url
172
+ }
173
+
174
+ if "evaluation_results" in parsed["raw_data"]:
175
+ for res in parsed["raw_data"]["evaluation_results"]:
176
+ eval_name = res.get("evaluation_name", "Unknown Metric")
177
+ if eval_name not in eval_metadata["evals"]:
178
+ metric_config = res.get("metric_config", {})
179
+ eval_metadata["evals"][eval_name] = {
180
+ "description": metric_config.get("evaluation_description", "No description available"),
181
+ "score_type": metric_config.get("score_type", "unknown"),
182
+ "lower_is_better": metric_config.get("lower_is_better", False),
183
+ "min_score": metric_config.get("min_score"),
184
+ "max_score": metric_config.get("max_score"),
185
+ "level_names": metric_config.get("level_names", []),
186
+ "level_metadata": metric_config.get("level_metadata", []),
187
+ "has_unknown_level": metric_config.get("has_unknown_level", False)
188
+ }
189
+ return eval_metadata
190
+
191
+ # Fall back to file system
192
+ for json_file in walk_eval_files(selected_leaderboard):
193
+ parsed = parse_eval_json(json_file)
194
+ if parsed:
195
+ if not eval_metadata["source_info"]:
196
+ source_meta = parsed["raw_data"].get("source_metadata", {})
197
+ source_data_list = parsed["raw_data"].get("source_data", [])
198
+ url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
199
+
200
+ eval_metadata["source_info"] = {
201
+ "organization": source_meta.get("source_organization_name", "Unknown"),
202
+ "relationship": source_meta.get("evaluator_relationship", "Unknown"),
203
+ "url": url
204
+ }
205
+
206
+ if "evaluation_results" in parsed["raw_data"]:
207
+ for res in parsed["raw_data"]["evaluation_results"]:
208
+ eval_name = res.get("evaluation_name", "Unknown Metric")
209
+ if eval_name not in eval_metadata["evals"]:
210
+ metric_config = res.get("metric_config", {})
211
+ eval_metadata["evals"][eval_name] = {
212
+ "description": metric_config.get("evaluation_description", "No description available"),
213
+ "score_type": metric_config.get("score_type", "unknown"),
214
+ "lower_is_better": metric_config.get("lower_is_better", False),
215
+ "min_score": metric_config.get("min_score"),
216
+ "max_score": metric_config.get("max_score"),
217
+ "level_names": metric_config.get("level_names", []),
218
+ "level_metadata": metric_config.get("level_metadata", []),
219
+ "has_unknown_level": metric_config.get("has_unknown_level", False)
220
+ }
221
+ break
222
+
223
+ return eval_metadata
224
+
225
+
226
+ def build_leaderboard_table(selected_leaderboard, search_query="", progress_callback=None):
227
+ """Builds the leaderboard DataFrame from cache or files."""
228
+ if not selected_leaderboard:
229
+ return pd.DataFrame()
230
+
231
+ if selected_leaderboard in LEADERBOARD_CACHE:
232
+ df, _ = LEADERBOARD_CACHE[selected_leaderboard]
233
+ else:
234
+ rows = []
235
+
236
+ if selected_leaderboard in HF_DATASET_CACHE:
237
+ if progress_callback:
238
+ progress_callback(0, desc=f"Loading {selected_leaderboard} from cache...")
239
+
240
+ parsed_items = HF_DATASET_CACHE[selected_leaderboard]
241
+
242
+ for i, parsed in enumerate(parsed_items):
243
+ if i % 100 == 0 and progress_callback:
244
+ progress_callback((i / len(parsed_items)), desc=f"Processing {selected_leaderboard}...")
245
+
246
+ row = {
247
+ "Model": parsed["model"],
248
+ "Developer": parsed["developer"],
249
+ "Params (B)": parsed["params"],
250
+ "Arch": parsed["architecture"],
251
+ "Precision": parsed["precision"]
252
+ }
253
+ row.update(parsed["results"])
254
+ rows.append(row)
255
+ else:
256
+ # Fall back to file system
257
+ if progress_callback:
258
+ progress_callback(0, desc=f"Scanning {selected_leaderboard}...")
259
+
260
+ all_files = list(walk_eval_files(selected_leaderboard))
261
+ total_files = len(all_files)
262
+
263
+ for i, json_file in enumerate(all_files):
264
+ if i % 100 == 0 and progress_callback:
265
+ progress_callback((i / total_files), desc=f"Loading {selected_leaderboard}...")
266
+
267
+ parsed = parse_eval_json(json_file)
268
+ if parsed:
269
+ row = {
270
+ "Model": parsed["model"],
271
+ "Developer": parsed["developer"],
272
+ "Params (B)": parsed["params"],
273
+ "Arch": parsed["architecture"],
274
+ "Precision": parsed["precision"]
275
+ }
276
+ row.update(parsed["results"])
277
+ rows.append(row)
278
+
279
+ if not rows:
280
+ df = pd.DataFrame(columns=["Model", "Developer", "Params (B)", "Arch", "Precision"])
281
+ LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
282
+ return df
283
+
284
+ df = pd.DataFrame(rows)
285
+ df = df.dropna(axis=1, how='all')
286
+
287
+ if df.empty:
288
+ LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
289
+ return df
290
+
291
+ numeric_cols = df.select_dtypes(include=['float', 'int']).columns
292
+ df[numeric_cols] = df[numeric_cols].round(3)
293
+
294
+ # Add Average Score
295
+ eval_only_cols = [c for c in numeric_cols if c not in ["Params (B)"]]
296
+ if len(eval_only_cols) > 0:
297
+ df["Average"] = df[eval_only_cols].mean(axis=1).round(3)
298
+
299
+ base_cols = ["Model", "Developer", "Params (B)", "Arch", "Precision", "Average"]
300
+ eval_cols = [c for c in df.columns if c not in base_cols]
301
+ base_cols = [c for c in base_cols if c in df.columns]
302
+
303
+ final_cols = base_cols + sorted(eval_cols)
304
+ df = df[final_cols]
305
+
306
+ if "Average" in df.columns:
307
+ df = df.sort_values("Average", ascending=False)
308
+
309
+ LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
310
+
311
+ return df
312
+
313
+
314
+ def clear_cache():
315
+ """Clears all caches."""
316
+ LEADERBOARD_CACHE.clear()
317
+
eval.schema.json ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "version": "0.0.1",
4
+ "type": "object",
5
+ "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
6
+ "required": [
7
+ "schema_version",
8
+ "evaluation_id",
9
+ "evaluation_source",
10
+ "retrieved_timestamp",
11
+ "source_data",
12
+ "source_metadata",
13
+ "model_info",
14
+ "evaluation_results"
15
+ ],
16
+ "properties": {
17
+ "schema_version": {
18
+ "type": "string",
19
+ "description": "Version of the schema used for this evaluation data"
20
+ },
21
+ "evaluation_id": {
22
+ "type": "string",
23
+ "description": "Unique identifier for this specific evaluation run. Use org_name/eval_name/retrieved_timestamp format"
24
+ },
25
+ "retrieved_timestamp": {
26
+ "type": "string",
27
+ "description": "Timestamp for when this record was created"
28
+ },
29
+ "source_data": {
30
+ "type": "array",
31
+ "description": "URLs for the source of the evaluation data",
32
+ "items": {
33
+ "type": "string"
34
+ }
35
+ },
36
+ "evaluation_source": {
37
+ "type": "object",
38
+ "description": "Details about evaluation origin. There are options that evaluations come from leaderboards (e.g. Live Code Bench Pro) or evaluation platforms (e.g. lm-eval, inspect ai, HELM...).",
39
+ "required": [
40
+ "evaluation_source_name",
41
+ "evaluation_source_type"
42
+ ],
43
+ "properties": {
44
+ "evaluation_source_name": {
45
+ "type": "string",
46
+ "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation."
47
+ },
48
+ "evaluation_source_type": {
49
+ "type": "string",
50
+ "enum": [
51
+ "leaderboard",
52
+ "evaluation_platform"
53
+ ],
54
+ "description": "Type of evaluation source, e.g., leaderboard or evaluation platform"
55
+ }
56
+ }
57
+ },
58
+ "source_metadata": {
59
+ "type": "object",
60
+ "description": "Metadata about the source of the leaderboard data",
61
+ "required": [
62
+ "source_organization_name",
63
+ "evaluator_relationship"
64
+ ],
65
+ "properties": {
66
+ "source_organization_name": {
67
+ "type": "string",
68
+ "description": "Name of the organization that provides the data"
69
+ },
70
+ "source_organization_url": {
71
+ "type": "string",
72
+ "description": "URL for the organization that provides the data"
73
+ },
74
+ "source_organization_logo_url": {
75
+ "type": "string",
76
+ "description": "URL for the Logo for the organization that provides the data"
77
+ },
78
+ "evaluator_relationship": {
79
+ "type": "string",
80
+ "description": "Relationship between the evaluator and the model",
81
+ "enum": [
82
+ "first_party",
83
+ "third_party",
84
+ "collaborative",
85
+ "other"
86
+ ]
87
+ }
88
+ }
89
+ },
90
+ "model_info": {
91
+ "type": "object",
92
+ "description": "Complete model specification including basic information, technical configuration and inference settings",
93
+ "required": [
94
+ "name",
95
+ "id"
96
+ ],
97
+ "properties": {
98
+ "name": {
99
+ "type": "string",
100
+ "description": "Model name provided by evaluation source"
101
+ },
102
+ "id": {
103
+ "type": "string",
104
+ "description": "Model name standarized to HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
105
+ },
106
+ "developer": {
107
+ "type": "string",
108
+ "description": "Name of organization that provides the model (e.g. 'OpenAI')"
109
+ },
110
+ "inference_platform": {
111
+ "type": "string",
112
+ "description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
113
+ }
114
+ }
115
+ },
116
+ "evaluation_results": {
117
+ "type": "array",
118
+ "description": "Array of evaluation results",
119
+ "items": {
120
+ "type": "object",
121
+ "required": [
122
+ "evaluation_name",
123
+ "metric_config",
124
+ "score_details"
125
+ ],
126
+ "properties": {
127
+ "evaluation_name": {
128
+ "type": "string",
129
+ "description": "Name of the evaluation"
130
+ },
131
+ "evaluation_timestamp": {
132
+ "type": "string",
133
+ "description": "Timestamp for when the evaluations were run"
134
+ },
135
+ "metric_config": {
136
+ "type": "object",
137
+ "description": "Details about the metric",
138
+ "required": [
139
+ "lower_is_better"
140
+ ],
141
+ "properties": {
142
+ "evaluation_description": {
143
+ "type": "string",
144
+ "description": "Description of the evaluation"
145
+ },
146
+ "lower_is_better": {
147
+ "type": "boolean",
148
+ "description": "Whether a lower score is better"
149
+ },
150
+ "score_type": {
151
+ "type": "string",
152
+ "description": "Type of score",
153
+ "enum": [
154
+ "binary",
155
+ "continuous",
156
+ "levels"
157
+ ]
158
+ },
159
+ "level_names": {
160
+ "type": "array",
161
+ "description": "Names of the score levels",
162
+ "items": {
163
+ "type": "string"
164
+ }
165
+ },
166
+ "level_metadata": {
167
+ "type": "array",
168
+ "description": "Additional Description for each Score Level",
169
+ "items": {
170
+ "type": "string"
171
+ }
172
+ },
173
+ "has_unknown_level": {
174
+ "type": "boolean",
175
+ "description": "Indicates whether there is an Unknown Level - if True, then a score of -1 will be treated as Unknown"
176
+ },
177
+ "min_score": {
178
+ "type": "number",
179
+ "description": "Minimum possible score for continuous metric"
180
+ },
181
+ "max_score": {
182
+ "type": "number",
183
+ "description": "Maximum possible score for continuous metric"
184
+ }
185
+ },
186
+ "if": {
187
+ "properties": {
188
+ "score_type": {
189
+ "const": "levels"
190
+ }
191
+ }
192
+ },
193
+ "then": {
194
+ "required": [
195
+ "level_names",
196
+ "has_unknown_level"
197
+ ]
198
+ },
199
+ "else": {
200
+ "if": {
201
+ "properties": {
202
+ "score_type": {
203
+ "const": "continuous"
204
+ }
205
+ }
206
+ },
207
+ "then": {
208
+ "required": [
209
+ "min_score",
210
+ "max_score"
211
+ ]
212
+ }
213
+ }
214
+ },
215
+ "score_details": {
216
+ "type": "object",
217
+ "description": "The score for the evaluation and related details",
218
+ "required": [
219
+ "score"
220
+ ],
221
+ "properties": {
222
+ "score": {
223
+ "type": "number",
224
+ "description": "The score for the evaluation"
225
+ },
226
+ "details": {
227
+ "type": "object",
228
+ "description": "Any additional details about the score",
229
+ "additionalProperties": true
230
+ }
231
+ }
232
+ },
233
+ "detailed_evaluation_results_url": {
234
+ "type": "string",
235
+ "description": "Link to detailed evaluation data"
236
+ },
237
+ "generation_config": {
238
+ "type": "object",
239
+ "generation_args": {
240
+ "type": "object",
241
+ "description": "Parameters used to generate results - properties may vary by model type",
242
+ "properties": {
243
+ "temperature": {
244
+ "type": [
245
+ "null",
246
+ "number"
247
+ ],
248
+ "description": "Sampling temperature"
249
+ },
250
+ "top_p": {
251
+ "type": [
252
+ "null",
253
+ "number"
254
+ ],
255
+ "description": "Nucleus sampling parameter"
256
+ },
257
+ "top_k": {
258
+ "type": [
259
+ "null",
260
+ "number"
261
+ ],
262
+ "description": "Top-k sampling parameter"
263
+ },
264
+ "max_tokens": {
265
+ "type": "integer",
266
+ "minimum": 1,
267
+ "description": "Maximum number of tokens to generate"
268
+ }
269
+ },
270
+ "additionalProperties": true
271
+ },
272
+ "additional_details": {
273
+ "type": "string",
274
+ "description": "Additional details about how the results for this metric were generated."
275
+ }
276
+ }
277
+ }
278
+ }
279
+
280
+ }
281
+ }
282
+ }
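
The if/then/else block above is the only non-obvious part of the schema, so here is a minimal sketch (not part of the commit) that exercises it with the jsonschema library. The two metric_config fragments are hypothetical, and the lookup path assumes evaluation_results is declared under the schema's top-level "properties" block, which falls outside this hunk.

import json
from pathlib import Path

from jsonschema import Draft7Validator

# Assumption: eval.schema.json (added above) sits in the working directory.
schema = json.loads(Path("eval.schema.json").read_text())
metric_config_schema = (
    schema["properties"]["evaluation_results"]["items"]["properties"]["metric_config"]
)
validator = Draft7Validator(metric_config_schema)

# A "continuous" metric must also carry min_score and max_score ...
continuous_cfg = {
    "evaluation_description": "Accuracy on a hypothetical benchmark",
    "lower_is_better": False,
    "score_type": "continuous",
    "min_score": 0,
    "max_score": 1,
}

# ... while a "levels" metric must name its levels and say whether -1 means Unknown.
levels_cfg = {
    "lower_is_better": False,
    "score_type": "levels",
    "level_names": ["fail", "partial", "pass"],
    "has_unknown_level": True,
}

for cfg in (continuous_cfg, levels_cfg):
    errors = list(validator.iter_errors(cfg))
    print("valid" if not errors else errors[0].message)
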
hf_operations.py ADDED
@@ -0,0 +1,202 @@
1
+ """
2
+ HuggingFace Operations: Upload data, create PRs, validate schemas.
3
+ """
4
+ from huggingface_hub import HfApi, login
5
+ import pandas as pd
6
+ import json
7
+ from pathlib import Path
8
+ from jsonschema import validate, ValidationError, Draft7Validator
9
+
10
+
11
+ # Load schema once at module level
12
+ SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
13
+ with open(SCHEMA_PATH, 'r') as f:
14
+ EVAL_SCHEMA = json.load(f)
15
+
16
+
17
+ def validate_json_against_schema(json_data):
18
+ """
19
+ Validate a JSON object against eval.schema.json.
20
+
21
+ Args:
22
+ json_data: Dict containing the evaluation data
23
+
24
+ Returns:
25
+ (bool, str): (is_valid, error_message)
26
+ """
27
+ try:
28
+ validate(instance=json_data, schema=EVAL_SCHEMA)
29
+ return True, "Schema validation passed"
30
+ except ValidationError as e:
31
+ # Extract the most relevant error message
32
+ error_path = " → ".join(str(p) for p in e.path) if e.path else "root"
33
+ return False, f"❌ Schema validation failed at '{error_path}': {e.message}"
34
+ except Exception as e:
35
+ return False, f"❌ Validation error: {str(e)}"
36
+
37
+
38
+ def upload_to_hf_dataset(parquet_file, split_name, repo_id="deepmage121/eee_test"):
39
+ """
40
+ Upload a parquet file as a new split to the HF dataset.
41
+
42
+ Args:
43
+ parquet_file: Path to parquet file
44
+ split_name: Name of the split (leaderboard name)
45
+ repo_id: HuggingFace dataset repository ID
46
+ """
47
+ # TODO: Implement upload logic
48
+ pass
49
+
50
+
51
+ def check_hf_authentication():
52
+ """
53
+ Check if user is authenticated with HuggingFace.
54
+
55
+ Returns:
56
+ (bool, str): (is_authenticated, username or error_message)
57
+ """
58
+ try:
59
+ api = HfApi()
60
+ user_info = api.whoami()
61
+ return True, user_info['name']
62
+ except Exception as e:
63
+ return False, "Not authenticated. Run: huggingface-cli login"
64
+
65
+
66
+ def check_duplicate_pr_exists(leaderboard_name, repo_id="deepmage121/eee_test"):
67
+ """
68
+ Check if a PR already exists for this leaderboard.
69
+
70
+ Args:
71
+ leaderboard_name: Name of the leaderboard
72
+ repo_id: HuggingFace dataset repository ID
73
+
74
+ Returns:
75
+ (bool, str or None): (exists, pr_url if exists)
76
+ """
77
+ try:
78
+ api = HfApi()
79
+ discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")
80
+
81
+ # Check for open PRs with matching title
82
+ pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
83
+ for discussion in discussions:
84
+ if discussion.is_pull_request and discussion.status == "open":
85
+ if pr_title_pattern in discussion.title.lower():
86
+ pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
87
+ return True, pr_url
88
+
89
+ return False, None
90
+ except Exception as e:
91
+ # If we can't check, assume no duplicate (fail open)
92
+ print(f"Warning: Could not check for duplicate PRs: {e}")
93
+ return False, None
94
+
95
+
96
+ def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id="deepmage121/eee_test"):
97
+ """
98
+ Create a pull request to add a new leaderboard split.
99
+
100
+ Args:
101
+ leaderboard_name: Name of the new leaderboard
102
+ parquet_file: Path to parquet file
103
+ repo_id: HuggingFace dataset repository ID
104
+
105
+ Returns:
106
+ (success, pr_url or error_message)
107
+ """
108
+ # 1. Check authentication
109
+ is_auth, auth_result = check_hf_authentication()
110
+ if not is_auth:
111
+ return False, f"❌ {auth_result}"
112
+
113
+ # 2. Check for duplicate PR
114
+ has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
115
+ if has_duplicate:
116
+ return False, f"⚠️ PR already exists: {duplicate_url}"
117
+
118
+ # 3. Validate parquet file exists and has data
119
+ parquet_path = Path(parquet_file)
120
+ if not parquet_path.exists():
121
+ return False, "❌ Parquet file not found"
122
+
123
+ df = pd.read_parquet(parquet_file)
124
+ if len(df) == 0:
125
+ return False, "❌ Parquet file is empty"
126
+
127
+ # 4. Create PR
128
+ try:
129
+ api = HfApi()
130
+
131
+ # Upload the parquet file to the branch
132
+ commit_message = f"Add new leaderboard: {leaderboard_name}"
133
+
134
+ # Upload file and create PR
135
+ commit_info = api.upload_file(
136
+ path_or_fileobj=parquet_file,
137
+ path_in_repo=f"data/{leaderboard_name}.parquet",
138
+ repo_id=repo_id,
139
+ repo_type="dataset",
140
+ commit_message=commit_message,
141
+ create_pr=True,
142
+ )
143
+
144
+ # Extract PR URL from commit info
145
+ pr_url = commit_info.pr_url if hasattr(commit_info, 'pr_url') else f"https://huggingface.co/datasets/{repo_id}/discussions"
146
+
147
+ return True, f"PR created ({len(df)} rows): {pr_url}"
148
+
149
+ except Exception as e:
150
+ return False, f"❌ Failed to create PR: {str(e)}"
151
+
152
+
153
+ def validate_schema(parquet_file):
154
+ """
155
+ Validate that a parquet file matches the expected schema.
156
+
157
+ Args:
158
+ parquet_file: Path to parquet file to validate
159
+
160
+ Returns:
161
+ (bool, str): (is_valid, error_message)
162
+ """
163
+ try:
164
+ df = pd.read_parquet(parquet_file)
165
+
166
+ # Required columns
167
+ required_cols = [
168
+ '_leaderboard', '_developer', '_model', '_uuid',
169
+ 'schema_version', 'evaluation_id', 'retrieved_timestamp',
170
+ 'source_data', 'evaluation_source_name', 'evaluation_source_type',
171
+ 'source_organization_name', 'evaluator_relationship',
172
+ 'model_name', 'model_id', 'model_developer',
173
+ 'evaluation_results'
174
+ ]
175
+
176
+ missing = [col for col in required_cols if col not in df.columns]
177
+ if missing:
178
+ return False, f"Missing required columns: {', '.join(missing)}"
179
+
180
+ # Check data types (all should be strings)
181
+ for col in df.columns:
182
+ if df[col].dtype not in ['object', 'string']:
183
+ return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"
184
+
185
+ return True, "Schema validation passed"
186
+
187
+ except Exception as e:
188
+ return False, f"Validation error: {str(e)}"
189
+
190
+
191
+ def export_to_json(parquet_file, output_dir):
192
+ """
193
+ Export parquet data back to JSON files.
194
+ Uses the parquet_to_folder function from json_to_parquet.py
195
+
196
+ Args:
197
+ parquet_file: Path to parquet file
198
+ output_dir: Directory to write JSON files to
199
+ """
200
+ from json_to_parquet import parquet_to_folder
201
+ parquet_to_folder(parquet_file, output_dir)
202
+
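
Taken together, the helpers above are meant to be chained: schema-check the raw JSON, sanity-check the packed parquet, then open a PR against the dataset repo. A minimal sketch of that flow follows; the leaderboard name and file paths are hypothetical, and the repo defaults to deepmage121/eee_test as in the function signatures above.

import json

from hf_operations import (
    validate_json_against_schema,
    validate_schema,
    create_pr_for_new_leaderboard,
)

# Hypothetical input: one evaluation record in the leaderboard_data layout.
with open("leaderboard_data/MyBoard/acme/acme_model/1234.json") as f:
    ok, msg = validate_json_against_schema(json.load(f))
print(msg)

# Hypothetical parquet built beforehand (see json_to_parquet.py below).
if ok:
    ok, msg = validate_schema("MyBoard.parquet")
if ok:
    ok, msg = create_pr_for_new_leaderboard("MyBoard", "MyBoard.parquet")
print(msg)
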
json_to_parquet.py ADDED
@@ -0,0 +1,228 @@
1
+
2
+
3
+ import json
4
+ from pathlib import Path
5
+ import pandas as pd
6
+
7
+
8
+ def json_to_row(json_path: Path) -> dict:
9
+ """Convert one JSON to a single row (1 JSON = 1 row, evaluations as columns)."""
10
+ with open(json_path, 'r') as f:
11
+ data = json.load(f)
12
+
13
+ required_fields = ["schema_version", "evaluation_id", "evaluation_source", "retrieved_timestamp",
14
+ "source_data", "source_metadata", "model_info", "evaluation_results"]
15
+ for field in required_fields:
16
+ if field not in data:
17
+ raise ValueError(f"{json_path}: Missing required field '{field}'")
18
+
19
+ if "evaluation_source_name" not in data["evaluation_source"]:
20
+ raise ValueError(f"{json_path}: Missing required field 'evaluation_source.evaluation_source_name'")
21
+ if "evaluation_source_type" not in data["evaluation_source"]:
22
+ raise ValueError(f"{json_path}: Missing required field 'evaluation_source.evaluation_source_type'")
23
+
24
+ if "source_organization_name" not in data["source_metadata"]:
25
+ raise ValueError(f"{json_path}: Missing required field 'source_metadata.source_organization_name'")
26
+ if "evaluator_relationship" not in data["source_metadata"]:
27
+ raise ValueError(f"{json_path}: Missing required field 'source_metadata.evaluator_relationship'")
28
+
29
+ if "name" not in data["model_info"]:
30
+ raise ValueError(f"{json_path}: Missing required field 'model_info.name'")
31
+ if "id" not in data["model_info"]:
32
+ raise ValueError(f"{json_path}: Missing required field 'model_info.id'")
33
+ if "developer" not in data["model_info"]:
34
+ raise ValueError(f"{json_path}: Missing required field 'model_info.developer'")
35
+
36
+ leaderboard = data["evaluation_source"]["evaluation_source_name"]
37
+ model = data["model_info"]["id"]
38
+ uuid = json_path.stem
39
+ developer = data["model_info"]["developer"]
40
+
41
+ # Validate evaluation results
42
+ for eval_result in data["evaluation_results"]:
43
+ if "evaluation_name" not in eval_result:
44
+ raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].evaluation_name'")
45
+ if "metric_config" not in eval_result:
46
+ raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].metric_config'")
47
+ if "score_details" not in eval_result:
48
+ raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].score_details'")
49
+
50
+ if "lower_is_better" not in eval_result["metric_config"]:
51
+ raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].metric_config.lower_is_better'")
52
+ if "score" not in eval_result["score_details"]:
53
+ raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].score_details.score'")
54
+
55
+ row = {
56
+ # Folder structure (for reconstruction)
57
+ "_leaderboard": leaderboard,
58
+ "_developer": developer,
59
+ "_model": model,
60
+ "_uuid": uuid,
61
+
62
+ # Required top-level fields
63
+ "schema_version": data["schema_version"],
64
+ "evaluation_id": data["evaluation_id"],
65
+ "retrieved_timestamp": data["retrieved_timestamp"],
66
+ "source_data": json.dumps(data["source_data"]),
67
+
68
+ # Required nested fields
69
+ "evaluation_source_name": data["evaluation_source"]["evaluation_source_name"],
70
+ "evaluation_source_type": data["evaluation_source"]["evaluation_source_type"],
71
+
72
+ "source_organization_name": data["source_metadata"]["source_organization_name"],
73
+ "source_organization_url": data["source_metadata"].get("source_organization_url"),
74
+ "source_organization_logo_url": data["source_metadata"].get("source_organization_logo_url"),
75
+ "evaluator_relationship": data["source_metadata"]["evaluator_relationship"],
76
+
77
+ "model_name": data["model_info"]["name"],
78
+ "model_id": data["model_info"]["id"],
79
+ "model_developer": data["model_info"]["developer"],
80
+ "model_inference_platform": data["model_info"].get("inference_platform"),
81
+
82
+ # Store full evaluation_results and additional_details as JSON
83
+ "evaluation_results": json.dumps(data["evaluation_results"]),
84
+ "additional_details": json.dumps(data["additional_details"]) if "additional_details" in data else None,
85
+ }
86
+
87
+ return row
88
+
89
+
90
+ def add_to_parquet(json_or_folder: str, parquet_file: str):
91
+ """
92
+ Add JSON(s) to Parquet file.
93
+ Creates new file if it doesn't exist, appends and deduplicates if it does.
94
+
95
+ Args:
96
+ json_or_folder: Path to single JSON file or folder containing JSONs
97
+ parquet_file: Output Parquet file path
98
+ """
99
+ input_path = Path(json_or_folder)
100
+
101
+ if input_path.is_file():
102
+ json_files = [input_path]
103
+ elif input_path.is_dir():
104
+ json_files = list(input_path.rglob("*.json"))
105
+ if not json_files:
106
+ raise ValueError(f"No JSON files found in directory: {json_or_folder}")
107
+ else:
108
+ raise ValueError(f"Invalid input: {json_or_folder}")
109
+
110
+ print(f"Processing {len(json_files)} JSON file(s)...")
111
+
112
+ parquet_path = Path(parquet_file)
113
+ if parquet_path.exists():
114
+ existing_df = pd.read_parquet(parquet_file)
115
+ existing_keys = set(
116
+ existing_df[["_leaderboard", "_developer", "_model", "_uuid"]]
117
+ .apply(tuple, axis=1)
118
+ )
119
+ print(f"Found {len(existing_df)} existing rows")
120
+ else:
121
+ existing_df = None
122
+ existing_keys = set()
123
+
124
+ all_rows = []
125
+ skipped = 0
126
+ for i, jf in enumerate(json_files, 1):
127
+ if i % 100 == 0:
128
+ print(f" {i}/{len(json_files)}")
129
+
130
+ row = json_to_row(jf)
131
+ key = (row["_leaderboard"], row["_developer"], row["_model"], row["_uuid"])
132
+ if key not in existing_keys:
133
+ all_rows.append(row)
134
+ existing_keys.add(key)
135
+ else:
136
+ skipped += 1
137
+
138
+ if skipped > 0:
139
+ print(f" Skipped {skipped} duplicate file(s)")
140
+
141
+ # Handle case where no new rows to add
142
+ if not all_rows:
143
+ if existing_df is not None:
144
+ print(f"No new files to add, keeping existing {len(existing_df)} file(s)")
145
+ return
146
+ else:
147
+ raise ValueError("No valid JSON files to process and no existing parquet file")
148
+
149
+ new_df = pd.DataFrame(all_rows)
150
+
151
+ if existing_df is not None:
152
+ df = pd.concat([existing_df, new_df], ignore_index=True)
153
+ print(f"Added {len(new_df)} new file(s) to existing {len(existing_df)} file(s)")
154
+ else:
155
+ df = new_df
156
+
157
+ df.to_parquet(parquet_file, index=False)
158
+ print(f"Saved {len(df)} total file(s) to {parquet_file} ({parquet_path.stat().st_size / 1024 / 1024:.1f} MB)")
159
+
160
+
161
+ def parquet_to_folder(parquet_file: str, output_dir: str):
162
+ """Reconstruct folder structure from Parquet."""
163
+ df = pd.read_parquet(parquet_file)
164
+ out = Path(output_dir)
165
+
166
+ for _, row in df.iterrows():
167
+ lb = row["_leaderboard"]
168
+ dev = row["_developer"]
169
+ model = row["_model"]
170
+ uuid = row["_uuid"]
171
+
172
+ json_data = {
173
+ "schema_version": row["schema_version"],
174
+ "evaluation_id": row["evaluation_id"],
175
+ "retrieved_timestamp": row["retrieved_timestamp"],
176
+ "source_data": json.loads(row["source_data"]),
177
+ "evaluation_source": {
178
+ "evaluation_source_name": row["evaluation_source_name"],
179
+ "evaluation_source_type": row["evaluation_source_type"]
180
+ },
181
+ "source_metadata": {
182
+ "source_organization_name": row["source_organization_name"],
183
+ "evaluator_relationship": row["evaluator_relationship"]
184
+ },
185
+ "model_info": {
186
+ "name": row["model_name"],
187
+ "id": row["model_id"],
188
+ "developer": row["model_developer"]
189
+ },
190
+ "evaluation_results": json.loads(row["evaluation_results"])
191
+ }
192
+
193
+ if pd.notna(row["source_organization_url"]):
194
+ json_data["source_metadata"]["source_organization_url"] = row["source_organization_url"]
195
+ if pd.notna(row["source_organization_logo_url"]):
196
+ json_data["source_metadata"]["source_organization_logo_url"] = row["source_organization_logo_url"]
197
+
198
+ if pd.notna(row["model_inference_platform"]):
199
+ json_data["model_info"]["inference_platform"] = row["model_inference_platform"]
200
+
201
+ if pd.notna(row["additional_details"]):
202
+ json_data["additional_details"] = json.loads(row["additional_details"])
203
+
204
+ file_path = out / lb / dev / model / f"{uuid}.json"
205
+ file_path.parent.mkdir(parents=True, exist_ok=True)
206
+ with open(file_path, 'w') as f:
207
+ json.dump(json_data, f, indent=2)
208
+
209
+ print(f"Reconstructed {len(df)} files to {output_dir}")
210
+
211
+
212
+ if __name__ == "__main__":
213
+ import sys
214
+
215
+ if len(sys.argv) < 2:
216
+ print("Usage:")
217
+ print(" python json_to_parquet.py add <json_or_folder> <output.parquet>")
218
+ print(" python json_to_parquet.py export <input.parquet> <output_dir>")
219
+ sys.exit(1)
220
+
221
+ cmd = sys.argv[1]
222
+
223
+ if cmd == "add":
224
+ add_to_parquet(sys.argv[2], sys.argv[3])
225
+ elif cmd == "export":
226
+ parquet_to_folder(sys.argv[2], sys.argv[3])
227
+ else:
228
+ print(f"Unknown command: {cmd}")
leaderboard_data/HFOpenLLMv2/0-hero/0-hero_Matter-0.2-7B-DPO/40e80d5e-db72-46b7-bd14-b7d005df4be8.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/0-hero_Matter-0.2-7B-DPO/1762652579.4626381",
4
- "retrieved_timestamp": "1762652579.462642",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "0-hero/Matter-0.2-7B-DPO",
18
- "developer": "0-hero",
19
- "inference_platform": "unknown",
20
- "id": "0-hero/Matter-0.2-7B-DPO"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.3302792147058693
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.3596254301656297
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.014350453172205438
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.25922818791946306
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.381375
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.1163563829787234
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "MistralForCausalLM",
105
- "params_billions": 7.242
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-32K/0d91a153-1b6b-4891-8722-a5c7e372ba64.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-32K/1762652579.463656",
4
- "retrieved_timestamp": "1762652579.463657",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-1.5-34B-32K",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-1.5-34B-32K"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.3118691737922047
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.6015685776542417
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.1540785498489426
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.36325503355704697
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.4398229166666667
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.4709109042553192
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 34.389
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-Chat-16K/2192007d-1f6e-4f74-b518-7448ef3a896e.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-Chat-16K/1762652579.464125",
4
- "retrieved_timestamp": "1762652579.4641259",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-1.5-34B-Chat-16K",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-1.5-34B-Chat-16K"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.456449997118756
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.6100218256499571
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.21374622356495468
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.33808724832214765
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.43976041666666665
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.45445478723404253
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 34.389
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-Chat/e335874b-9b3e-4966-a7e0-22e9d16f8324.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-Chat/1762652579.463886",
4
- "retrieved_timestamp": "1762652579.4638872",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-1.5-34B-Chat",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-1.5-34B-Chat"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.6066758423205982
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.6083748310271819
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.277190332326284
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.3649328859060403
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.4281979166666667
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.45204454787234044
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 34.389
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B/8409c158-ef12-4e6c-8a1d-7be2084b3446.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B/1762652579.4633532",
4
- "retrieved_timestamp": "1762652579.463354",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-1.5-34B",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-1.5-34B"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.2841172533322695
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.5976391706360018
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.15332326283987915
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.36577181208053694
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.4236041666666667
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.4665890957446808
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 34.389
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-6B-Chat/3452e57f-3023-4e2e-ad84-b09e409fe334.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-6B-Chat/1762652579.464571",
4
- "retrieved_timestamp": "1762652579.464572",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-1.5-6B-Chat",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-1.5-6B-Chat"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.5145270105542183
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.4571311331954389
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.1623867069486405
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.30201342281879195
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.43917708333333333
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.3193151595744681
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 6.061
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-6B/1a1f1263-96b6-4e32-a2c8-6c0d6b47dff9.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-6B/1762652579.464354",
4
- "retrieved_timestamp": "1762652579.464355",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-1.5-6B",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-1.5-6B"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.26166017278598563
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.44925820198929056
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.06646525679758308
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.313758389261745
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.43740625
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.31441156914893614
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 6.061
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-32K/df9d9d44-daa1-4e61-9b46-192380043889.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-32K/1762652579.4649951",
4
- "retrieved_timestamp": "1762652579.464996",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-1.5-9B-32K",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-1.5-9B-32K"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.23031113002389217
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.496332115988265
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.10800604229607251
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.35906040268456374
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.4186145833333333
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.37649601063829785
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 8.829
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-Chat-16K/090c9691-4b7e-4a98-b9a2-644e21797be4.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-Chat-16K/1762652579.465471",
4
- "retrieved_timestamp": "1762652579.465471",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-1.5-9B-Chat-16K",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-1.5-9B-Chat-16K"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.4214040966856829
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.5153383364651778
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.1782477341389728
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.3087248322147651
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.40990624999999997
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.39935172872340424
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 8.829
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-Chat/9256c32b-d956-418f-97da-ea78e3ad9e48.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-Chat/1762652579.465226",
4
- "retrieved_timestamp": "1762652579.465226",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-1.5-9B-Chat",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-1.5-9B-Chat"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.6045525871354672
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.555906430281685
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.2258308157099698
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.3347315436241611
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.42590625
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.39752327127659576
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 8.829
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B/904d1f91-3153-49d5-afd3-9921bfc086f1.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B/1762652579.464781",
4
- "retrieved_timestamp": "1762652579.464782",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-1.5-9B",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-1.5-9B"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.29358435617494916
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.514294179104191
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.11404833836858005
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.37919463087248323
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.43278124999999995
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.3916223404255319
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 8.829
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B-200K/fb2ebd9a-f5b8-42a2-9b58-e6f0e7d9b98a.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B-200K/1762652579.465893",
4
- "retrieved_timestamp": "1762652579.465894",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-34B-200K",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-34B-200K"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.15424850507763843
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.5441817925289527
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.05740181268882175
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.3565436241610738
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.38171874999999994
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.45345744680851063
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 34.389
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B-Chat/5d9b9217-874b-426d-8af4-5105a3b1b3ad.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B-Chat/1762652579.466115",
4
- "retrieved_timestamp": "1762652579.4661162",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-34B-Chat",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-34B-Chat"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.4698887839820565
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.5560872910766164
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.06268882175226587
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.33808724832214765
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.39784375
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.4093251329787234
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 34.389
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B/3ebcbf3d-cb2d-4332-bb8a-1db104033391.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B/1762652579.4656792",
4
- "retrieved_timestamp": "1762652579.46568",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-34B",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-34B"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.3045751938190667
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.5457099951794562
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.0513595166163142
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.36661073825503354
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.4118541666666667
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.441156914893617
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 34.389
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B-200K/6b720e8b-aab8-4ba4-9bce-e7a1de3cfb86.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B-200K/1762652579.4665558",
4
- "retrieved_timestamp": "1762652579.466557",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-6B-200K",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-6B-200K"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.08433068702154728
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.42892948109603307
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.01812688821752266
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.28187919463087246
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.45873958333333337
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.2844082446808511
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 6.061
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B-Chat/1120c801-7736-4d9d-b23d-08eeedb34186.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B-Chat/1762652579.466805",
4
- "retrieved_timestamp": "1762652579.466806",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-6B-Chat",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-6B-Chat"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.33952135888331847
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.41326019207548687
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.013595166163141994
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.29446308724832215
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.36879166666666663
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.3061003989361702
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 6.061
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B/297419fa-855c-4eae-ad7c-3cf4a0262450.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B/1762652579.4663382",
4
- "retrieved_timestamp": "1762652579.4663382",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-6B",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-6B"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.28933784580468713
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.4309230591000865
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.015861027190332326
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.26929530201342283
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.39368749999999997
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.29911901595744683
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 6.061
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-9B-200K/4299df04-495a-4687-b143-96b1b562d5e8.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-9B-200K/1762652579.467233",
4
- "retrieved_timestamp": "1762652579.467233",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-9B-200K",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-9B-200K"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.23270921155866434
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.4793302602023641
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.06646525679758308
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.31543624161073824
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.42940625
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.36220079787234044
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 8.829
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-9B/0ec59add-f9a9-4dbd-8a83-c6aec0b8ad21.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-9B/1762652579.46702",
4
- "retrieved_timestamp": "1762652579.4670231",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-9B",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-9B"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.2708779372066118
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.49396075125308075
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.055891238670694864
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.3179530201342282
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.40540624999999997
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.35738031914893614
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 8.829
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-Coder-9B-Chat/ef0cc3a5-0d62-4a45-b0c7-28a6f7dfdac4.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/01-ai_Yi-Coder-9B-Chat/1762652579.4674509",
4
- "retrieved_timestamp": "1762652579.4674518",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "01-ai/Yi-Coder-9B-Chat",
18
- "developer": "01-ai",
19
- "inference_platform": "unknown",
20
- "id": "01-ai/Yi-Coder-9B-Chat"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.4817041006750976
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.48142000339111674
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.04003021148036254
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.24748322147651006
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.3991770833333333
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.24251994680851063
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 8.829
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/1-800-LLMs/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct/a48b0864-76b7-4860-a448-942a8d74f68e.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct/1762652579.468073",
4
- "retrieved_timestamp": "1762652579.468074",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct",
18
- "developer": "1-800-LLMs",
19
- "inference_platform": "unknown",
20
- "id": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.30774677854758703
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.6284322714967584
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.311178247734139
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.3699664429530201
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.4490625
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.516373005319149
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "Qwen2ForCausalLM",
105
- "params_billions": 14.77
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/152334H/152334H_miqu-1-70b-sf/f57d7b8d-85d5-4e0b-8dec-31e2931487dd.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/152334H_miqu-1-70b-sf/1762652579.469194",
4
- "retrieved_timestamp": "1762652579.469195",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "152334H/miqu-1-70b-sf",
18
- "developer": "152334H",
19
- "inference_platform": "unknown",
20
- "id": "152334H/miqu-1-70b-sf"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.5181740005407873
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.6102361685099691
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.12462235649546828
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.35067114093959734
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.45820833333333333
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.42278922872340424
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "float16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 68.977
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/1TuanPham/1TuanPham_T-VisStar-7B-v0.1/1347cd1b-2ebc-4223-900f-7c2479e228a3.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/1TuanPham_T-VisStar-7B-v0.1/1762652579.469481",
4
- "retrieved_timestamp": "1762652579.469482",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "1TuanPham/T-VisStar-7B-v0.1",
18
- "developer": "1TuanPham",
19
- "inference_platform": "unknown",
20
- "id": "1TuanPham/T-VisStar-7B-v0.1"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.36070404305021786
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.5052203113352468
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.05740181268882175
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.28523489932885904
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.4375
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.3210605053191489
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "float16",
104
- "architecture": "MistralForCausalLM",
105
- "params_billions": 7.294
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/1TuanPham/1TuanPham_T-VisStar-v0.1/b2926dd6-628c-4274-b0e8-1efc64269bb2.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/1TuanPham_T-VisStar-v0.1/1762652579.469921",
4
- "retrieved_timestamp": "1762652579.469923",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "1TuanPham/T-VisStar-v0.1",
18
- "developer": "1TuanPham",
19
- "inference_platform": "unknown",
20
- "id": "1TuanPham/T-VisStar-v0.1"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.36070404305021786
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.5052203113352468
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.05740181268882175
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.28523489932885904
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.4375
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.3210605053191489
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "float16",
104
- "architecture": "MistralForCausalLM",
105
- "params_billions": 7.294
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/3rd-Degree-Burn/3rd-Degree-Burn_L-3.1-Science-Writer-8B/0c4fd071-b5c9-4bf1-a1d5-d658be1a3258.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/3rd-Degree-Burn_L-3.1-Science-Writer-8B/1762652579.470164",
4
- "retrieved_timestamp": "1762652579.470165",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "3rd-Degree-Burn/L-3.1-Science-Writer-8B",
18
- "developer": "3rd-Degree-Burn",
19
- "inference_platform": "unknown",
20
- "id": "3rd-Degree-Burn/L-3.1-Science-Writer-8B"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.42625012743963797
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.5041306326216103
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.10347432024169184
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.27432885906040266
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.3959479166666666
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.36494348404255317
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "float16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 8.03
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/4season/4season_final_model_test_v2/74973e37-cd82-4e8a-816a-02b035fabff4.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/4season_final_model_test_v2/1762652579.4714398",
4
- "retrieved_timestamp": "1762652579.4714408",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "4season/final_model_test_v2",
18
- "developer": "4season",
19
- "inference_platform": "unknown",
20
- "id": "4season/final_model_test_v2"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.3191132860809319
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.6342049783295018
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.08383685800604229
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.3271812080536913
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.4314479166666667
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.3528091755319149
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 21.421
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/AALF/AALF_FuseChat-Llama-3.1-8B-Instruct-preview/3766e8a0-99ad-4733-a01b-ced446b15eda.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/AALF_FuseChat-Llama-3.1-8B-Instruct-preview/1762652579.471838",
4
- "retrieved_timestamp": "1762652579.471839",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview",
18
- "developer": "AALF",
19
- "inference_platform": "unknown",
20
- "id": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.7189579205397235
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.5119887898349903
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.24773413897280966
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.3053691275167785
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.38200000000000006
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.3732546542553192
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 8.03
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/AALF/AALF_FuseChat-Llama-3.1-8B-SFT-preview/342ac912-805f-4166-b8f4-10f0503fa892.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/AALF_FuseChat-Llama-3.1-8B-SFT-preview/1762652579.472149",
4
- "retrieved_timestamp": "1762652579.47215",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "AALF/FuseChat-Llama-3.1-8B-SFT-preview",
18
- "developer": "AALF",
19
- "inference_platform": "unknown",
20
- "id": "AALF/FuseChat-Llama-3.1-8B-SFT-preview"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.7280504616639405
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.5240303130445233
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.22507552870090636
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.30453020134228187
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.40199999999999997
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.37433510638297873
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 8.03
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/AGI-0/AGI-0_Art-v0-3B/162b6d5f-f983-4989-9603-f6baea26b633.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/AGI-0_Art-v0-3B/1762652579.473539",
4
- "retrieved_timestamp": "1762652579.47354",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "AGI-0/Art-v0-3B",
18
- "developer": "AGI-0",
19
- "inference_platform": "unknown",
20
- "id": "AGI-0/Art-v0-3B"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.319238509377341
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.3400959483013824
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.24622356495468278
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.25922818791946306
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.3768229166666666
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.11785239361702128
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "Qwen2ForCausalLM",
105
- "params_billions": 3.086
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/AI-MO/AI-MO_NuminaMath-7B-CoT/9ac2ba3c-9a21-46b2-a21c-4909cfae6315.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/AI-MO_NuminaMath-7B-CoT/1762652579.474318",
4
- "retrieved_timestamp": "1762652579.4743192",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "AI-MO/NuminaMath-7B-CoT",
18
- "developer": "AI-MO",
19
- "inference_platform": "unknown",
20
- "id": "AI-MO/NuminaMath-7B-CoT"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.2688544173903022
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.4314193495860012
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.26963746223564955
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.26593959731543626
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.33034375
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.28681848404255317
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 6.91
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/AI-MO/AI-MO_NuminaMath-7B-TIR/0ffa78d4-fe45-4639-bcd1-eb19ab168a35.json DELETED
@@ -1,107 +0,0 @@
1
- {
2
- "schema_version": "0.0.1",
3
- "evaluation_id": "hfopenllm_v2/AI-MO_NuminaMath-7B-TIR/1762652579.474566",
4
- "retrieved_timestamp": "1762652579.474567",
5
- "source_data": [
6
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
7
- ],
8
- "evaluation_source": {
9
- "evaluation_source_name": "HF Open LLM v2",
10
- "evaluation_source_type": "leaderboard"
11
- },
12
- "source_metadata": {
13
- "source_organization_name": "Hugging Face",
14
- "evaluator_relationship": "third_party"
15
- },
16
- "model_info": {
17
- "name": "AI-MO/NuminaMath-7B-TIR",
18
- "developer": "AI-MO",
19
- "inference_platform": "unknown",
20
- "id": "AI-MO/NuminaMath-7B-TIR"
21
- },
22
- "evaluation_results": [
23
- {
24
- "evaluation_name": "IFEval",
25
- "metric_config": {
26
- "evaluation_description": "Accuracy on IFEval",
27
- "lower_is_better": false,
28
- "score_type": "continuous",
29
- "min_score": 0,
30
- "max_score": 1
31
- },
32
- "score_details": {
33
- "score": 0.27562423259174545
34
- }
35
- },
36
- {
37
- "evaluation_name": "BBH",
38
- "metric_config": {
39
- "evaluation_description": "Accuracy on BBH",
40
- "lower_is_better": false,
41
- "score_type": "continuous",
42
- "min_score": 0,
43
- "max_score": 1
44
- },
45
- "score_details": {
46
- "score": 0.41436913375897894
47
- }
48
- },
49
- {
50
- "evaluation_name": "MATH Level 5",
51
- "metric_config": {
52
- "evaluation_description": "Exact Match on MATH Level 5",
53
- "lower_is_better": false,
54
- "score_type": "continuous",
55
- "min_score": 0,
56
- "max_score": 1
57
- },
58
- "score_details": {
59
- "score": 0.1608761329305136
60
- }
61
- },
62
- {
63
- "evaluation_name": "GPQA",
64
- "metric_config": {
65
- "evaluation_description": "Accuracy on GPQA",
66
- "lower_is_better": false,
67
- "score_type": "continuous",
68
- "min_score": 0,
69
- "max_score": 1
70
- },
71
- "score_details": {
72
- "score": 0.25838926174496646
73
- }
74
- },
75
- {
76
- "evaluation_name": "MUSR",
77
- "metric_config": {
78
- "evaluation_description": "Accuracy on MUSR",
79
- "lower_is_better": false,
80
- "score_type": "continuous",
81
- "min_score": 0,
82
- "max_score": 1
83
- },
84
- "score_details": {
85
- "score": 0.35092708333333333
86
- }
87
- },
88
- {
89
- "evaluation_name": "MMLU-PRO",
90
- "metric_config": {
91
- "evaluation_description": "Accuracy on MMLU-PRO",
92
- "lower_is_better": false,
93
- "score_type": "continuous",
94
- "min_score": 0,
95
- "max_score": 1
96
- },
97
- "score_details": {
98
- "score": 0.2732712765957447
99
- }
100
- }
101
- ],
102
- "additional_details": {
103
- "precision": "bfloat16",
104
- "architecture": "LlamaForCausalLM",
105
- "params_billions": 6.91
106
- }
107
- }
leaderboard_data/HFOpenLLMv2/AI-Sweden-Models/AI-Sweden-Models_Llama-3-8B-instruct/1d68bd2e-de6e-4327-a8f1-33322eba537e.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/AI-Sweden-Models_Llama-3-8B-instruct/1762652579.474785",
- "retrieved_timestamp": "1762652579.474786",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "AI-Sweden-Models/Llama-3-8B-instruct",
- "developer": "AI-Sweden-Models",
- "inference_platform": "unknown",
- "id": "AI-Sweden-Models/Llama-3-8B-instruct"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.24012841482821137
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.4173460154515302
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.03851963746223565
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.26593959731543626
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.47709375000000004
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.25972406914893614
- }
- }
- ],
- "additional_details": {
- "precision": "bfloat16",
- "architecture": "LlamaForCausalLM",
- "params_billions": 8.03
- }
- }
leaderboard_data/HFOpenLLMv2/AI4free/AI4free_Dhanishtha/a554a3eb-943c-4135-966b-929129ef025d.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/AI4free_Dhanishtha/1762652579.475332",
- "retrieved_timestamp": "1762652579.475332",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "AI4free/Dhanishtha",
- "developer": "AI4free",
- "inference_platform": "unknown",
- "id": "AI4free/Dhanishtha"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.2451240486353985
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.34039444943326375
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.25604229607250756
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.2525167785234899
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.35694791666666664
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.16431183510638298
- }
- }
- ],
- "additional_details": {
- "precision": "float16",
- "architecture": "Qwen2ForCausalLM",
- "params_billions": 1.777
- }
- }
leaderboard_data/HFOpenLLMv2/AI4free/AI4free_t2/332ccdb5-faf5-47c6-afeb-a91d2148adf0.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/AI4free_t2/1762652579.475577",
- "retrieved_timestamp": "1762652579.475578",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "AI4free/t2",
- "developer": "AI4free",
- "inference_platform": "unknown",
- "id": "AI4free/t2"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.3866828902866616
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.2910111436321769
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.18957703927492447
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.2575503355704698
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.3846354166666666
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.11436170212765957
- }
- }
- ],
- "additional_details": {
- "precision": "bfloat16",
- "architecture": "Qwen2ForCausalLM",
- "params_billions": 7.613
- }
- }
leaderboard_data/HFOpenLLMv2/AIDC-AI/AIDC-AI_Marco-o1/17f7398f-675d-4b38-b233-64fc106737c3.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/AIDC-AI_Marco-o1/1762652579.47579",
- "retrieved_timestamp": "1762652579.4757912",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "AIDC-AI/Marco-o1",
- "developer": "AIDC-AI",
- "inference_platform": "unknown",
- "id": "AIDC-AI/Marco-o1"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.477083028586373
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.5364362696398749
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.37462235649546827
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.25922818791946306
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.41384375
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.41165226063829785
- }
- }
- ],
- "additional_details": {
- "precision": "float16",
- "architecture": "Qwen2ForCausalLM",
- "params_billions": 7.616
- }
- }
leaderboard_data/HFOpenLLMv2/Aashraf995/Aashraf995_Creative-7B-nerd/7ea9f4db-5b52-40a5-904e-785e43302934.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/Aashraf995_Creative-7B-nerd/1762652579.476046",
- "retrieved_timestamp": "1762652579.476046",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "Aashraf995/Creative-7B-nerd",
- "developer": "Aashraf995",
- "inference_platform": "unknown",
- "id": "Aashraf995/Creative-7B-nerd"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.4721871301480073
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.5606785565640195
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.3164652567975831
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.3263422818791946
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.4515416666666667
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.44921875
- }
- }
- ],
- "additional_details": {
- "precision": "bfloat16",
- "architecture": "Qwen2ForCausalLM",
- "params_billions": 7.616
- }
- }
leaderboard_data/HFOpenLLMv2/AbacusResearch/AbacusResearch_Jallabi-34B/76397277-901a-4ad0-9dae-0351ca875ec6.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/AbacusResearch_Jallabi-34B/1762652579.477037",
- "retrieved_timestamp": "1762652579.4770381",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "AbacusResearch/Jallabi-34B",
- "developer": "AbacusResearch",
- "inference_platform": "unknown",
- "id": "AbacusResearch/Jallabi-34B"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.3528604103777976
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.6023380603196266
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.05211480362537765
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.3389261744966443
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.48217708333333337
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.4681682180851064
- }
- }
- ],
- "additional_details": {
- "precision": "float16",
- "architecture": "LlamaForCausalLM",
- "params_billions": 34.389
- }
- }
leaderboard_data/HFOpenLLMv2/Ahdoot/Ahdoot_StructuredThinker-v0.3-MoreStructure/81a5aafb-2cf7-490d-b619-ce638fcc8b38.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/Ahdoot_StructuredThinker-v0.3-MoreStructure/1762652579.4772868",
- "retrieved_timestamp": "1762652579.477288",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "Ahdoot/StructuredThinker-v0.3-MoreStructure",
- "developer": "Ahdoot",
- "inference_platform": "unknown",
- "id": "Ahdoot/StructuredThinker-v0.3-MoreStructure"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.4192808415005519
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.48376906494893984
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.290785498489426
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.29697986577181207
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.41582291666666665
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.36103723404255317
- }
- }
- ],
- "additional_details": {
- "precision": "float16",
- "architecture": "Qwen2ForCausalLM",
- "params_billions": 3.397
- }
- }
leaderboard_data/HFOpenLLMv2/Ahdoot/Ahdoot_Test_StealthThinker/43c907eb-3e43-47ff-b38d-f912ba6ef46c.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/Ahdoot_Test_StealthThinker/1762652579.4775438",
- "retrieved_timestamp": "1762652579.4775438",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "Ahdoot/Test_StealthThinker",
- "developer": "Ahdoot",
- "inference_platform": "unknown",
- "id": "Ahdoot/Test_StealthThinker"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.42200361706937595
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.46466398134666304
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.17900302114803626
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.2961409395973154
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.42804166666666665
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.35970744680851063
- }
- }
- ],
- "additional_details": {
- "precision": "float16",
- "architecture": "Qwen2ForCausalLM",
- "params_billions": 3.086
- }
- }
leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder/48732edf-8baf-438e-8a5c-763eee6c0c18.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder/1762652579.478028",
- "retrieved_timestamp": "1762652579.478029",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder",
- "developer": "AicoresSecurity",
- "inference_platform": "unknown",
- "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.7097656440466851
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.4477501104993749
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.1487915407854985
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.27181208053691275
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.34079166666666666
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.3178191489361702
- }
- }
- ],
- "additional_details": {
- "precision": "bfloat16",
- "architecture": "LlamaForCausalLM",
- "params_billions": 3.213
- }
- }
leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V0/38f169f0-e939-4b12-8f78-b2a27fb90de0.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V0/1762652579.4777558",
- "retrieved_timestamp": "1762652579.477757",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "AicoresSecurity/Cybernet-Sec-3B-R1-V0",
- "developer": "AicoresSecurity",
- "inference_platform": "unknown",
- "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.6358018945287394
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.4497434194912941
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.11555891238670694
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.2634228187919463
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.33136458333333335
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.301030585106383
- }
- }
- ],
- "additional_details": {
- "precision": "bfloat16",
- "architecture": "LlamaForCausalLM",
- "params_billions": 3.213
- }
- }
leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1/e8c63728-a1f5-432f-bf9f-204b0f4041aa.json DELETED
@@ -1,107 +0,0 @@
- {
- "schema_version": "0.0.1",
- "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1/1762652579.478466",
- "retrieved_timestamp": "1762652579.478467",
- "source_data": [
- "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
- ],
- "evaluation_source": {
- "evaluation_source_name": "HF Open LLM v2",
- "evaluation_source_type": "leaderboard"
- },
- "source_metadata": {
- "source_organization_name": "Hugging Face",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1",
- "developer": "AicoresSecurity",
- "inference_platform": "unknown",
- "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1"
- },
- "evaluation_results": [
- {
- "evaluation_name": "IFEval",
- "metric_config": {
- "evaluation_description": "Accuracy on IFEval",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.6730209178313542
- }
- },
- {
- "evaluation_name": "BBH",
- "metric_config": {
- "evaluation_description": "Accuracy on BBH",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.4391775517124728
- }
- },
- {
- "evaluation_name": "MATH Level 5",
- "metric_config": {
- "evaluation_description": "Exact Match on MATH Level 5",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.17598187311178248
- }
- },
- {
- "evaluation_name": "GPQA",
- "metric_config": {
- "evaluation_description": "Accuracy on GPQA",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.2709731543624161
- }
- },
- {
- "evaluation_name": "MUSR",
- "metric_config": {
- "evaluation_description": "Accuracy on MUSR",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.35409375000000004
- }
- },
- {
- "evaluation_name": "MMLU-PRO",
- "metric_config": {
- "evaluation_description": "Accuracy on MMLU-PRO",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0,
- "max_score": 1
- },
- "score_details": {
- "score": 0.308843085106383
- }
- }
- ],
- "additional_details": {
- "precision": "float16",
- "architecture": "LlamaForCausalLM",
- "params_billions": 3.213
- }
- }