Commit d0ab546
Parent(s): 49c1354

initial commit, space + other info related to action

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
- .github/workflows/sync-to-hf.yml +55 -0
- .gitignore +8 -0
- .python-version +1 -0
- app.py +127 -531
- data_loader.py +317 -0
- eval.schema.json +282 -0
- hf_operations.py +202 -0
- json_to_parquet.py +228 -0
- leaderboard_data/HFOpenLLMv2/0-hero/0-hero_Matter-0.2-7B-DPO/40e80d5e-db72-46b7-bd14-b7d005df4be8.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-32K/0d91a153-1b6b-4891-8722-a5c7e372ba64.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-Chat-16K/2192007d-1f6e-4f74-b518-7448ef3a896e.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-Chat/e335874b-9b3e-4966-a7e0-22e9d16f8324.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B/8409c158-ef12-4e6c-8a1d-7be2084b3446.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-6B-Chat/3452e57f-3023-4e2e-ad84-b09e409fe334.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-6B/1a1f1263-96b6-4e32-a2c8-6c0d6b47dff9.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-32K/df9d9d44-daa1-4e61-9b46-192380043889.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-Chat-16K/090c9691-4b7e-4a98-b9a2-644e21797be4.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-Chat/9256c32b-d956-418f-97da-ea78e3ad9e48.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B/904d1f91-3153-49d5-afd3-9921bfc086f1.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B-200K/fb2ebd9a-f5b8-42a2-9b58-e6f0e7d9b98a.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B-Chat/5d9b9217-874b-426d-8af4-5105a3b1b3ad.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B/3ebcbf3d-cb2d-4332-bb8a-1db104033391.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B-200K/6b720e8b-aab8-4ba4-9bce-e7a1de3cfb86.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B-Chat/1120c801-7736-4d9d-b23d-08eeedb34186.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B/297419fa-855c-4eae-ad7c-3cf4a0262450.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-9B-200K/4299df04-495a-4687-b143-96b1b562d5e8.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-9B/0ec59add-f9a9-4dbd-8a83-c6aec0b8ad21.json +0 -107
- leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-Coder-9B-Chat/ef0cc3a5-0d62-4a45-b0c7-28a6f7dfdac4.json +0 -107
- leaderboard_data/HFOpenLLMv2/1-800-LLMs/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct/a48b0864-76b7-4860-a448-942a8d74f68e.json +0 -107
- leaderboard_data/HFOpenLLMv2/152334H/152334H_miqu-1-70b-sf/f57d7b8d-85d5-4e0b-8dec-31e2931487dd.json +0 -107
- leaderboard_data/HFOpenLLMv2/1TuanPham/1TuanPham_T-VisStar-7B-v0.1/1347cd1b-2ebc-4223-900f-7c2479e228a3.json +0 -107
- leaderboard_data/HFOpenLLMv2/1TuanPham/1TuanPham_T-VisStar-v0.1/b2926dd6-628c-4274-b0e8-1efc64269bb2.json +0 -107
- leaderboard_data/HFOpenLLMv2/3rd-Degree-Burn/3rd-Degree-Burn_L-3.1-Science-Writer-8B/0c4fd071-b5c9-4bf1-a1d5-d658be1a3258.json +0 -107
- leaderboard_data/HFOpenLLMv2/4season/4season_final_model_test_v2/74973e37-cd82-4e8a-816a-02b035fabff4.json +0 -107
- leaderboard_data/HFOpenLLMv2/AALF/AALF_FuseChat-Llama-3.1-8B-Instruct-preview/3766e8a0-99ad-4733-a01b-ced446b15eda.json +0 -107
- leaderboard_data/HFOpenLLMv2/AALF/AALF_FuseChat-Llama-3.1-8B-SFT-preview/342ac912-805f-4166-b8f4-10f0503fa892.json +0 -107
- leaderboard_data/HFOpenLLMv2/AGI-0/AGI-0_Art-v0-3B/162b6d5f-f983-4989-9603-f6baea26b633.json +0 -107
- leaderboard_data/HFOpenLLMv2/AI-MO/AI-MO_NuminaMath-7B-CoT/9ac2ba3c-9a21-46b2-a21c-4909cfae6315.json +0 -107
- leaderboard_data/HFOpenLLMv2/AI-MO/AI-MO_NuminaMath-7B-TIR/0ffa78d4-fe45-4639-bcd1-eb19ab168a35.json +0 -107
- leaderboard_data/HFOpenLLMv2/AI-Sweden-Models/AI-Sweden-Models_Llama-3-8B-instruct/1d68bd2e-de6e-4327-a8f1-33322eba537e.json +0 -107
- leaderboard_data/HFOpenLLMv2/AI4free/AI4free_Dhanishtha/a554a3eb-943c-4135-966b-929129ef025d.json +0 -107
- leaderboard_data/HFOpenLLMv2/AI4free/AI4free_t2/332ccdb5-faf5-47c6-afeb-a91d2148adf0.json +0 -107
- leaderboard_data/HFOpenLLMv2/AIDC-AI/AIDC-AI_Marco-o1/17f7398f-675d-4b38-b233-64fc106737c3.json +0 -107
- leaderboard_data/HFOpenLLMv2/Aashraf995/Aashraf995_Creative-7B-nerd/7ea9f4db-5b52-40a5-904e-785e43302934.json +0 -107
- leaderboard_data/HFOpenLLMv2/AbacusResearch/AbacusResearch_Jallabi-34B/76397277-901a-4ad0-9dae-0351ca875ec6.json +0 -107
- leaderboard_data/HFOpenLLMv2/Ahdoot/Ahdoot_StructuredThinker-v0.3-MoreStructure/81a5aafb-2cf7-490d-b619-ce638fcc8b38.json +0 -107
- leaderboard_data/HFOpenLLMv2/Ahdoot/Ahdoot_Test_StealthThinker/43c907eb-3e43-47ff-b38d-f912ba6ef46c.json +0 -107
- leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder/48732edf-8baf-438e-8a5c-763eee6c0c18.json +0 -107
- leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V0/38f169f0-e939-4b12-8f78-b2a27fb90de0.json +0 -107
- leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1/e8c63728-a1f5-432f-bf9f-204b0f4041aa.json +0 -107
.github/workflows/sync-to-hf.yml
ADDED
@@ -0,0 +1,55 @@
+name: Sync to HuggingFace Dataset
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'data/**/*.json'
+  workflow_dispatch: # Allow manual trigger
+
+jobs:
+  sync-to-huggingface:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: |
+          pip install datasets huggingface_hub pandas pyarrow
+
+      - name: Convert Changed JSONs to Parquet (Optimized)
+        env:
+          HF_DATASET_REPO: deepmage121/eee_test
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          echo "Detecting changed leaderboards..."
+          python scripts/convert_to_parquet.py
+
+      - name: Upload Changed Parquets to HuggingFace
+        env:
+          HF_DATASET_REPO: deepmage121/eee_test
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          echo "Uploading changed parquets..."
+          python scripts/upload_to_hf.py
+
+      - name: Report status
+        if: success()
+        run: |
+          echo "Successfully synced to HuggingFace dataset"
+          echo "View at: https://huggingface.co/datasets/deepmage121/eee_test"
+          if [ -f parquet_output/changed_leaderboards.json ]; then
+            echo ""
+            echo "Changes processed:"
+            cat parquet_output/changed_leaderboards.json
+          fi
+
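The two helper scripts the workflow invokes, scripts/convert_to_parquet.py and scripts/upload_to_hf.py, are not among the files shown in this commit view. A minimal sketch of what the change-detection half could look like, given the `fetch-depth: 2` checkout (the `data/<leaderboard>/<developer>/<model>/` layout is taken from the submission guide further down; everything else here is illustrative, not the real script):

```python
# Hypothetical sketch of the "detect changed leaderboards" step.
# Assumes fetch-depth: 2, i.e. HEAD~1 exists in the checkout.
import json
import subprocess
from pathlib import Path


def changed_leaderboards() -> set[str]:
    """Return leaderboard names whose JSON files changed in the last commit."""
    out = subprocess.run(
        ["git", "diff", "--name-only", "HEAD~1", "HEAD", "--", "data/"],
        capture_output=True, text=True, check=True,
    ).stdout
    # Paths look like data/<leaderboard>/<developer>/<model>/<uuid>.json
    return {
        Path(p).parts[1]
        for p in out.splitlines()
        if p.startswith("data/") and p.endswith(".json") and len(Path(p).parts) > 2
    }


if __name__ == "__main__":
    boards = sorted(changed_leaderboards())
    output = Path("parquet_output")
    output.mkdir(exist_ok=True)
    # The "Report status" step cats this file after the upload.
    (output / "changed_leaderboards.json").write_text(json.dumps(boards, indent=2))
    print(f"Changed leaderboards: {boards}")
```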
.gitignore
CHANGED
@@ -1 +1,9 @@
 .DS_Store
+.secrets
+.actrc
+__pycache__/
+*.pyc
+parquet_output/
+*.venv*
+*.md
+*.ipynb_checkpoints
.python-version
ADDED
@@ -0,0 +1 @@
+3.11
app.py
CHANGED

Removed from the old version (bare "-" marks deleted lines whose content was not captured by this view; some long lines are truncated):

@@ -1,479 +1,106 @@
 import gradio as gr
 import pandas as pd
-import json
 from pathlib import Path
 
-
-
-
-
-
-
-
-
-
-
-        provider_name = data.get("source_metadata", {}).get("source_organization_name", "Unknown Provider")
-        model_id = data.get("model_info", {}).get("id", "Unknown Model")
-        developer_name = data.get("model_info", {}).get("developer", "Unknown Developer")
-
-        params = data.get("model_info", {}).get("params_billions", None)
-        architecture = data.get("model_info", {}).get("architecture", "Unknown")
-        precision = data.get("additional_details", {}).get("precision", "Unknown")
-        if precision == "Unknown":
-            precision = data.get("model_info", {}).get("precision", "Unknown")
-
-        results = {}
-        if "evaluation_results" in data:
-            for res in data["evaluation_results"]:
-                eval_name = res.get("evaluation_name", "Unknown Metric")
-                score = res.get("score_details", {}).get("score", None)
-                if score is not None:
-                    results[eval_name] = score
-
-        return {
-            "leaderboard": leaderboard_name,
-            "provider": provider_name,
-            "model": model_id,
-            "developer": developer_name,
-            "params": params,
-            "architecture": architecture,
-            "precision": precision,
-            "results": results,
-            "raw_data": data
-        }
-    except Exception as e:
-        print(f"Error parsing {file_path}: {e}")
-        return None
-
-def get_available_leaderboards():
-    """Scans data directory for leaderboard folders."""
-    if not DATA_DIR.exists():
-        return []
-    return [d.name for d in DATA_DIR.iterdir() if d.is_dir()]
-
-def normalize_leaderboard_name(name):
-    """Normalizes leaderboard name to remove spaces."""
-    return name.replace(" ", "")
-
-def sanitize_filename_component(name):
-    """Sanitizes a name to be safe for use in directory names."""
-    return name.replace("/", "_").replace("\\", "_").replace(":", "_").strip()
-
-def walk_eval_files(leaderboard_name):
-    """Generator that walks through Leaderboard directory recursively."""
-    lb_path = DATA_DIR / leaderboard_name
-    if not lb_path.exists():
-        return
-
-    yield from lb_path.rglob("*.json")
 
-def get_eval_metadata(selected_leaderboard):
-    """Extracts evaluation metadata from the leaderboard data."""
-    if not selected_leaderboard:
-        return {}
-
-    eval_metadata = {"evals": {}, "source_info": {}}
-
-    for json_file in walk_eval_files(selected_leaderboard):
-        parsed = parse_eval_json(json_file)
-        if parsed:
-            if not eval_metadata["source_info"]:
-                source_meta = parsed["raw_data"].get("source_metadata", {})
-                source_data_list = parsed["raw_data"].get("source_data", [])
-                url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
-
-                eval_metadata["source_info"] = {
-                    "organization": source_meta.get("source_organization_name", "Unknown"),
-                    "relationship": source_meta.get("evaluator_relationship", "Unknown"),
-                    "url": url
-                }
-
-            if "evaluation_results" in parsed["raw_data"]:
-                for res in parsed["raw_data"]["evaluation_results"]:
-                    eval_name = res.get("evaluation_name", "Unknown Metric")
-                    if eval_name not in eval_metadata["evals"]:
-                        metric_config = res.get("metric_config", {})
-                        eval_metadata["evals"][eval_name] = {
-                            "description": metric_config.get("evaluation_description", "No description available"),
-                            "score_type": metric_config.get("score_type", "unknown"),
-                            "lower_is_better": metric_config.get("lower_is_better", False),
-                            "min_score": metric_config.get("min_score"),
-                            "max_score": metric_config.get("max_score"),
-                            "level_names": metric_config.get("level_names", []),
-                            "level_metadata": metric_config.get("level_metadata", []),
-                            "has_unknown_level": metric_config.get("has_unknown_level", False)
-                        }
-            break
-
-    return eval_metadata
 
-def
-    """
     if not selected_leaderboard:
-        return
-    <div style="text-align: center; padding: 3rem; color: var(--body-text-color-subdued);">
-        <h3>👋 Welcome to Eval Leaderboard</h3>
-        <p>Select a leaderboard above to visualize results and metadata.</p>
-    </div>
-    """
-
-    metadata = get_eval_metadata(selected_leaderboard)
-    if not metadata or not metadata.get("evals"):
-        return f"""<div style="padding: 1rem;">No metadata found for {selected_leaderboard}</div>"""
-
-    source_info = metadata.get("source_info", {})
-    evals = metadata.get("evals", {})
-    unique_evals_count = len(evals)
-
-    eval_badges = "".join([
-        f'<span style="background: var(--background-fill-secondary); border: 1px solid var(--border-color-primary); padding: 2px 8px; border-radius: 4px; font-size: 0.85rem; white-space: nowrap;">{name}</span>'
-        for name in sorted(evals.keys())
-    ])
-
-    source_url = source_info.get('url', '#')
-    source_link = f'<a href="{source_url}" target="_blank" style="text-decoration: none; color: var(--link-text-color); hover: underline;">🔗 {source_info.get("organization", "Unknown")}</a>'
-
-    html = f"""
-    <div style="
-        background: var(--block-background-fill);
-        border: 1px solid var(--border-color-primary);
-        border-radius: 8px;
-        padding: 1.5rem;
-        margin-bottom: 2rem;
-        box-shadow: var(--shadow-sm);
-    ">
-        <h2 style="margin-top: 0; margin-bottom: 1rem;">📊 {selected_leaderboard}</h2>
-        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1.5rem;">
-            <div>
-                <div style="font-size: 0.85rem; color: var(--body-text-color-subdued); text-transform: uppercase; letter-spacing: 0.05em; font-weight: 600;">Source Organization</div>
-                <div style="font-size: 1.1rem; font-weight: 500;">{source_link}</div>
-            </div>
-            <div>
-                <div style="font-size: 0.85rem; color: var(--body-text-color-subdued); text-transform: uppercase; letter-spacing: 0.05em; font-weight: 600;">Evaluator Relationship</div>
-                <div style="font-size: 1.1rem; font-weight: 500;">{source_info.get('relationship', 'Unknown').replace('_', ' ').title()}</div>
-            </div>
-            <div>
-                <div style="font-size: 0.85rem; color: var(--body-text-color-subdued); text-transform: uppercase; letter-spacing: 0.05em; font-weight: 600; margin-bottom: 0.5rem;">Included Evaluations</div>
-                <div style="display: flex; flex-wrap: wrap; gap: 0.5rem;">{eval_badges}</div>
-            </div>
-        </div>
-    </div>
-
-    <h3 style="margin-bottom: 1rem;">Metric Details</h3>
-    """
 
-
-
-
-        gap: 1rem;
-    ">
-    """
 
-
-
-
-        direction_icon = "↓" if info['lower_is_better'] else "↑"
 
-
-
-
-        elif info['score_type'] == "levels" and info.get('level_names'):
-            levels = ", ".join(info['level_names'])
-            details_content += f"<div><span style='opacity: 0.7;'>Levels:</span> <strong>{levels}</strong></div>"
 
-
-
-
-
-
-            background: var(--background-fill-secondary);
-            border: 1px solid var(--border-color-primary);
-            border-radius: 6px;
-            overflow: hidden;
-            height: fit-content;
-        ">
-            <summary style="
-                padding: 0.75rem 1rem;
-                cursor: pointer;
-                font-weight: 600;
-                display: flex;
-                align-items: center;
-                justify-content: space-between;
-                list-style: none;
-                font-size: 0.95rem;
-            ">
-                <div style="display: flex; align-items: center; gap: 0.5rem;">
-                    <span style="font-size: 1.1rem; opacity: 0.8;">🏷️</span>
-                    <span style="white-space: nowrap; overflow: hidden; text-overflow: ellipsis;">{eval_name}</span>
-                </div>
-                <div style="display: flex; align-items: center; gap: 0.5rem;">
-                    <span style="font-size: 0.8rem; font-weight: 400; color: var(--body-text-color-subdued); white-space: nowrap;">{direction_icon} {direction}</span>
-                </div>
-            </summary>
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        color: var(--body-text-color-subdued);
-                    ">{score_type}</span>
-                </div>
-            </div>
-        </details>
-        """
-
-    html += "</div>"
-    return html
 
-def update_leaderboard_table(selected_leaderboard, search_query="",
     """Loads and aggregates data for the selected leaderboard."""
     if not selected_leaderboard:
-        return pd.DataFrame(),
 
-
-    full_df = None
-    if selected_leaderboard in LEADERBOARD_CACHE:
-        # Cache stores (df, meta_html)
-        full_df, meta_html = LEADERBOARD_CACHE[selected_leaderboard]
-    else:
-        progress(0, desc=f"Scanning {selected_leaderboard}...")
-        all_files = list(walk_eval_files(selected_leaderboard))
-        total_files = len(all_files)
-
-        rows = []
-        for i, json_file in enumerate(all_files):
-            if i % 100 == 0:
-                progress((i / total_files), desc=f"Loading {selected_leaderboard}...")
-            parsed = parse_eval_json(json_file)
-            if parsed:
-                row = {
-                    "Model": parsed["model"],
-                    "Developer": parsed["developer"],
-                    "Params (B)": parsed["params"],
-                    "Arch": parsed["architecture"],
-                    "Precision": parsed["precision"]
-                }
-                row.update(parsed["results"])
-                rows.append(row)
-
-        meta_html = format_eval_info_html(selected_leaderboard)
-
-        if not rows:
-            full_df = pd.DataFrame(columns=["Model", "Developer", "Params (B)", "Arch", "Precision", "Score"])
-        else:
-            full_df = pd.DataFrame(rows)
-            numeric_cols = full_df.select_dtypes(include=['float', 'int']).columns
-            full_df[numeric_cols] = full_df[numeric_cols].round(3)
-
-        LEADERBOARD_CACHE[selected_leaderboard] = (full_df, meta_html)
-
-    # Filter by search query
-    df = full_df.copy()
-    if search_query:
-        df = df[
-            df["Model"].str.contains(search_query, case=False, na=False) |
-            df["Developer"].str.contains(search_query, case=False, na=False)
-        ]
-
-    # Group by model and average scores if requested
-    if group_by_model and not df.empty:
-        # Identify grouping columns (non-numeric usually, or specific base cols)
-        # We group by the base identifiers.
-        base_cols_all = ["Model", "Developer", "Params (B)", "Arch", "Precision"]
-        group_cols = [c for c in base_cols_all if c in df.columns]
-
-        # Identify columns to average (numeric)
-        numeric_cols = df.select_dtypes(include=['number']).columns
-        # Exclude group_cols from numeric_cols if they happen to be numeric (like Params)
-        # But groupby keys can be numeric.
-        # We want to average the SCORES.
-        # Any numeric column NOT in group_cols should be averaged.
-        agg_cols = [c for c in numeric_cols if c not in group_cols]
-
-        if group_cols and agg_cols:
-            df = df.groupby(group_cols)[agg_cols].mean().reset_index()
-            df = df.round(3)
-
-    # Drop columns where all values are null
-    df = df.dropna(axis=1, how='all')
-
-    if df.empty:
-        return df, meta_html
-
-    # Filter base_cols to only include columns that exist in df (in case some were dropped)
-    base_cols = [c for c in ["Model", "Developer", "Params (B)", "Arch", "Precision"] if c in df.columns]
-    eval_cols = [c for c in df.columns if c not in base_cols]
-
-    cols = base_cols + eval_cols
-    return df[cols], meta_html
-
-def find_json_files(path):
-    """Recursively finds all JSON files in a directory or returns the file if it's a JSON file."""
-    json_files = []
-    path_obj = Path(path)
-
-    if path_obj.is_file() and path_obj.suffix == ".json":
-        json_files.append(path_obj)
-    elif path_obj.is_dir():
-        json_files.extend(path_obj.rglob("*.json"))
-
-    return json_files
-
-def check_is_duplicate(save_dir, new_eval_id):
-    """Checks if a file with the same evaluation_id already exists in the directory."""
-    if not new_eval_id or not save_dir.exists():
-        return False
-
-    for existing_file in save_dir.glob("*.json"):
-        try:
-            with open(existing_file, 'r') as f:
-                data = json.load(f)
-            if data.get("evaluation_id") == new_eval_id:
-                return True
-        except:
-            continue
-    return False
-
-def handle_file_upload(files, progress=gr.Progress()):
-    """Processes uploaded files/folders and saves them to the correct structure.
 
-
-
-    """
-    if not files:
-        return gr.update(), "No files uploaded."
 
-
-
-    skipped_count = 0
-    duplicate_count = 0
 
-
-
-
-
-
-            if Path(path).is_file() and Path(path).suffix != ".json":
-                skipped_count += 1
-
-        all_json_files.extend(json_files)
 
-
-
-
-
-
-        if not parsed:
-            continue
-
-        leaderboard = normalize_leaderboard_name(parsed["leaderboard"])
-        provider = parsed["provider"]
-        model_id = parsed["model"]
-        developer = parsed["developer"]
-        eval_id = parsed["raw_data"].get("evaluation_id")
-
-        # Sanitize names for directory structure
-        sanitized_provider = sanitize_filename_component(developer)
-        sanitized_model = sanitize_filename_component(model_id)
-
-        # Create structure: Leaderboard/Developer/Model
-        save_dir = DATA_DIR / leaderboard / sanitized_provider / sanitized_model
-        save_dir.mkdir(parents=True, exist_ok=True)
-
-        # Check for duplicates based on evaluation_id
-        if check_is_duplicate(save_dir, eval_id):
-            duplicate_count += 1
-            continue
-
-        # Preserve original filename
-        filename = json_file.name
-        save_path = save_dir / filename
-
-        # Avoid overwriting by appending counter
-        counter = 1
-        while save_path.exists():
-            stem = save_path.stem.rsplit('_', 1)[0] if '_' in save_path.stem else save_path.stem
-            save_path = save_dir / f"{stem}_{counter}.json"
-            counter += 1
-
-        with open(save_path, 'w') as f:
-            json.dump(parsed["raw_data"], f, indent=2)
-
-        saved_count += 1
-
-    except Exception as e:
-        print(f"Failed to save {json_file}: {e}")
-
-    # Clear cache since data changed
-    LEADERBOARD_CACHE.clear()
-
-    # Refresh leaderboard choices
-    choices = get_available_leaderboards()
 
-
-    if duplicate_count > 0:
-        msg_parts.append(f"Skipped {duplicate_count} duplicates.")
-    if skipped_count > 0:
-        msg_parts.append(f"Skipped {skipped_count} non-JSON files.")
-
-    return gr.Dropdown(choices=choices), " ".join(msg_parts), None, None
 
-# Professional, high-contrast theme
-theme = gr.themes.Soft(
-    primary_hue="slate",
-    neutral_hue="slate",
-    font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"]
-).set(
-    body_background_fill="var(--neutral-50)",
-    block_background_fill="white",
-    block_border_width="1px",
-    block_title_text_weight="600"
-)
-
-css = """
-/* Clean up the global container */
-.gradio-container {
-    max-width: 100% !important;
-    padding: 0 2rem !important;
-}
-
-/* Table Styles */
-.dataframe {
-    border: 1px solid var(--border-color-primary) !important;
-    border-radius: 8px;
-}
 
-
-
-    display: none !important;
-}
-"""
 
-
 
     with gr.Row(variant="compact", elem_classes="header-row"):
         with gr.Column(scale=1):
             gr.Markdown("# 🏆 Evaluation Leaderboard")
             gr.Markdown("Analyze and compare model performance metrics.", elem_classes="subtitle")
 
-    with gr.Row(variant="panel"
         initial_choices = get_available_leaderboards()
         initial_value = initial_choices[0] if initial_choices else None
 
@@ -482,56 +109,51 @@ with gr.Blocks(title="Eval Leaderboard", theme=theme, css=css) as demo:
             choices=initial_choices,
             value=initial_value,
            label="Current Leaderboard",
-            interactive=True
-            container=False,
-            scale=1
-        )
-        with gr.Column(scale=2):
-            search_box = gr.Textbox(
-                label="Search Model/Developer",
-                placeholder="🔍 Search model or developer...",
-                show_label=False,
-                container=False,
-                scale=1
-            )
-        with gr.Column(scale=1, min_width=100):
-            group_by_model = gr.Checkbox(
-                label="Average by Model",
-                value=False,
-                container=False
        )
-        with gr.Column(scale=
            refresh_btn = gr.Button("🔄 Refresh", variant="secondary", size="sm")
 
-    with gr.Accordion("
-
-
-            value="Files",
-            label="Upload Mode",
-            info="Choose 'Files' for individual JSONs, or 'Folder' to upload a directory structure."
-        )
-
-        with gr.Group(visible=True) as file_upload_group:
-            file_uploader_files = gr.File(
-                file_count="multiple",
-                file_types=[".json"],
-                label="Select JSON Files"
-            )
-
-        with gr.Group(visible=False) as folder_upload_group:
-            file_uploader_folder = gr.File(
-                file_count="directory",
-                label="Select Folder"
-            )
-
-        upload_status = gr.Textbox(
-            label="Upload Status",
-            interactive=False
-        )
 
-
 
-
 
    leaderboard_table = gr.Dataframe(
        value=init_df,

@@ -541,59 +163,33 @@ with gr.Blocks(title="Eval Leaderboard", theme=theme, css=css) as demo:
        elem_classes="dataframe"
    )
 
-
-        return {
-            file_upload_group: gr.Group(visible=(mode == "Files")),
-            folder_upload_group: gr.Group(visible=(mode == "Folder"))
-        }
 
-    upload_mode.change(
-        fn=toggle_upload_input,
-        inputs=[upload_mode],
-        outputs=[file_upload_group, folder_upload_group]
-    )
-
-    file_uploader_files.upload(
-        fn=handle_file_upload,
-        inputs=[file_uploader_files],
-        outputs=[leaderboard_selector, upload_status, file_uploader_files, file_uploader_folder]
-    )
-
-    file_uploader_folder.upload(
-        fn=handle_file_upload,
-        inputs=[file_uploader_folder],
-        outputs=[leaderboard_selector, upload_status, file_uploader_files, file_uploader_folder]
-    )
 
    leaderboard_selector.change(
        fn=update_leaderboard_table,
-        inputs=[leaderboard_selector, search_box
-        outputs=[leaderboard_table,
-    )
-
-    search_box.change(
-        fn=update_leaderboard_table,
-        inputs=[leaderboard_selector, search_box, group_by_model],
-        outputs=[leaderboard_table, metadata_view]
    )
 
-
-        fn=update_leaderboard_table,
-        inputs=[leaderboard_selector, search_box
-        outputs=[leaderboard_table,
    )
 
    refresh_btn.click(
-        fn=lambda:
-        outputs=[leaderboard_selector
    ).then(
-        fn=lambda:
    ).then(
        fn=update_leaderboard_table,
-        inputs=[leaderboard_selector, search_box
-        outputs=[leaderboard_table,
    )
-
 DATA_DIR.mkdir(exist_ok=True)
 
 if __name__ == "__main__":

New version (hunk markers show where unchanged lines are elided):

+"""
+Evaluation Leaderboard - Gradio Interface
+Displays model evaluation results from HuggingFace datasets.
+"""
 import gradio as gr
 import pandas as pd
 from pathlib import Path
 
+# Import custom modules
+from data_loader import (
+    load_hf_dataset_on_startup,
+    get_available_leaderboards,
+    get_eval_metadata,
+    build_leaderboard_table,
+    clear_cache,
+    DATA_DIR
+)
+from ui_components import get_theme, get_custom_css, format_leaderboard_header, format_metric_details
 
 
+def export_leaderboard_to_json(selected_leaderboard):
+    """Export current leaderboard to JSON files in a zip using parquet_to_folder."""
     if not selected_leaderboard:
+        return None
 
+    import tempfile
+    import shutil
+    import zipfile
+    from json_to_parquet import parquet_to_folder
 
+    try:
+        # Find the parquet file in DATA_DIR
+        parquet_path = DATA_DIR / selected_leaderboard / f"{selected_leaderboard}.parquet"
 
+        if not parquet_path.exists():
+            print(f"Parquet file not found: {parquet_path}")
+            return None
 
+        # Create temp directory for export
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_path = Path(temp_dir)
+            output_dir = temp_path / "json_export"
+            output_dir.mkdir()
 
+            # Use the round-trip functionality from json_to_parquet
+            parquet_to_folder(str(parquet_path), str(output_dir))
+
+            # Create zip file
+            zip_path = temp_path / f"{selected_leaderboard}_export.zip"
+            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                for json_file in output_dir.rglob("*.json"):
+                    arcname = json_file.relative_to(output_dir)
+                    zipf.write(json_file, arcname)
+
+            # Copy to a permanent location for download
+            final_zip = Path(tempfile.gettempdir()) / f"{selected_leaderboard}_export.zip"
+            shutil.copy(zip_path, final_zip)
+
+            return str(final_zip)
+    except Exception as e:
+        print(f"Export error: {e}")
+        return None
+
 
+def update_leaderboard_table(selected_leaderboard, search_query="", progress=gr.Progress()):
     """Loads and aggregates data for the selected leaderboard."""
     if not selected_leaderboard:
+        return pd.DataFrame(), "", format_leaderboard_header(None, {}), format_metric_details(None, {})
 
+    metadata = get_eval_metadata(selected_leaderboard)
 
+    def progress_callback(value, desc):
+        progress(value, desc=desc)
 
+    df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
+    total_count = len(df)
 
+    # Apply search filter (searches all columns)
+    if search_query and not df.empty:
+        mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
+        df = df[mask]
 
+    # Build search status message
+    if search_query:
+        search_msg = f"Showing {len(df)} of {total_count} results for '{search_query}'"
+    else:
+        search_msg = f"Showing {len(df)} results"
 
+    return df, search_msg, format_leaderboard_header(selected_leaderboard, metadata), format_metric_details(selected_leaderboard, metadata)
 
 
+# Load HF dataset BEFORE building the interface
+load_hf_dataset_on_startup()
 
+# Build Gradio interface
+with gr.Blocks(title="Eval Leaderboard", theme=get_theme(), css=get_custom_css()) as demo:
 
     with gr.Row(variant="compact", elem_classes="header-row"):
         with gr.Column(scale=1):
             gr.Markdown("# 🏆 Evaluation Leaderboard")
             gr.Markdown("Analyze and compare model performance metrics.", elem_classes="subtitle")
 
+    with gr.Row(variant="panel"):
         initial_choices = get_available_leaderboards()
         initial_value = initial_choices[0] if initial_choices else None
 
@@ -482,56 +109,51 @@
             choices=initial_choices,
             value=initial_value,
            label="Current Leaderboard",
+            interactive=True
        )
+        with gr.Column(scale=3):
+            search_box = gr.Textbox(
+                label="Search",
+                placeholder="Type to search across all columns...",
+                show_label=False
+            )
+        with gr.Column(scale=1):
            refresh_btn = gr.Button("🔄 Refresh", variant="secondary", size="sm")
 
+    with gr.Accordion("ℹ️ How to Submit Data", open=False):
+        gr.Markdown("""
+        ### Submitting Evaluation Data
+
+        **Data submissions happen via GitHub Pull Requests:**
+
+        1. **Fork** [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
+        2. **Add your JSON files** to `data/<leaderboard>/<developer>/<model>/`
+        3. **Create a Pull Request**
+        4. **Automated validation** checks your data
+        5. **After merge**: GitHub Actions automatically syncs to HuggingFace
+        6. **Refresh this page** to see your data!
+
+        #### File Structure
+        ```
+        data/
+        └── YourBenchmark/
+            └── developer_name/
+                └── model_name/
+                    └── {uuid}.json
+        ```
+
+        Each JSON file should follow the schema and be named with a unique UUID.
+
+        📖 [**Full Submission Guide**](https://github.com/evaleval/every_eval_ever#contributor-guide) |
+        📋 [**JSON Schema**](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json) |
+        👀 [**See Examples**](https://github.com/evaleval/every_eval_ever/tree/main/data)
+        """)
+
+    init_df, init_search_msg, init_header, init_metrics = update_leaderboard_table(initial_value)
+
+    header_view = gr.HTML(value=init_header)
 
+    search_info = gr.Markdown(value=init_search_msg)
 
    leaderboard_table = gr.Dataframe(
        value=init_df,
@@ -541,59 +163,33 @@
        elem_classes="dataframe"
    )
 
+    metrics_view = gr.HTML(value=init_metrics)
 
 
+    # Event handlers
    leaderboard_selector.change(
        fn=update_leaderboard_table,
+        inputs=[leaderboard_selector, search_box],
+        outputs=[leaderboard_table, search_info, header_view, metrics_view]
    )
 
+    search_box.input(
+        fn=update_leaderboard_table,
+        inputs=[leaderboard_selector, search_box],
+        outputs=[leaderboard_table, search_info, header_view, metrics_view]
    )
 
    refresh_btn.click(
+        fn=lambda: gr.Dropdown(choices=get_available_leaderboards()),
+        outputs=[leaderboard_selector]
    ).then(
+        fn=lambda: clear_cache()
    ).then(
        fn=update_leaderboard_table,
+        inputs=[leaderboard_selector, search_box],
+        outputs=[leaderboard_table, search_info, header_view, metrics_view]
    )
+
 DATA_DIR.mkdir(exist_ok=True)
 
 if __name__ == "__main__":
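`export_leaderboard_to_json` is defined in the new app.py but is not wired to any component in the visible part of the diff. A hypothetical hookup inside the same `gr.Blocks` context (the component names here are invented; Gradio allows re-entering a Blocks context before launch):

```python
# Hypothetical wiring for the export helper; none of these component names
# appear in the visible diff. A click builds the zip and surfaces it for download.
with demo:
    export_btn = gr.Button("⬇️ Export leaderboard as JSON zip")
    export_file = gr.File(label="Export", interactive=False)

    export_btn.click(
        fn=export_leaderboard_to_json,  # returns a zip path, or None on failure
        inputs=[leaderboard_selector],
        outputs=[export_file],
    )
```

One thing worth noting: the export helper expects a local parquet at `DATA_DIR/<leaderboard>/<leaderboard>.parquet`, while the HuggingFace startup path in data_loader.py only populates an in-memory cache, so the export returns None unless that file exists on disk.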
data_loader.py
ADDED
@@ -0,0 +1,317 @@
+"""
+Data Loader: Load from HuggingFace, parse JSON files, and build tables.
+"""
+import json
+import pandas as pd
+from pathlib import Path
+from datasets import load_dataset
+
+
+# Global caches
+HF_DATASET_CACHE = {}
+LEADERBOARD_CACHE = {}
+DATA_DIR = Path("leaderboard_data")
+
+
+def load_hf_dataset_on_startup():
+    """Load all splits from HuggingFace dataset at startup."""
+    print("Loading dataset from HuggingFace...")
+    try:
+        dataset = load_dataset("deepmage121/eee_test")
+
+        for split_name, split_data in dataset.items():
+            print(f"Loading split: {split_name} ({len(split_data)} rows)")
+
+            df = split_data.to_pandas()
+            parsed_items = []
+
+            for _, row in df.iterrows():
+                evaluation_results = json.loads(row['evaluation_results'])
+
+                results = {}
+                for eval_result in evaluation_results:
+                    eval_name = eval_result.get("evaluation_name")
+                    score = eval_result.get("score_details", {}).get("score")
+                    if eval_name and score is not None:
+                        results[eval_name] = score
+
+                additional_details = {}
+                if pd.notna(row.get('additional_details')):
+                    additional_details = json.loads(row['additional_details'])
+
+                parsed_item = {
+                    "leaderboard": row['_leaderboard'],
+                    "provider": row['source_organization_name'],
+                    "model": row['model_id'],
+                    "developer": row['model_developer'],
+                    "params": additional_details.get('params_billions'),
+                    "architecture": additional_details.get('architecture', 'Unknown'),
+                    "precision": additional_details.get('precision', 'Unknown'),
+                    "results": results,
+                    "raw_data": {
+                        "schema_version": row['schema_version'],
+                        "evaluation_id": row['evaluation_id'],
+                        "retrieved_timestamp": row['retrieved_timestamp'],
+                        "source_data": json.loads(row['source_data']),
+                        "evaluation_source": {
+                            "evaluation_source_name": row['evaluation_source_name'],
+                            "evaluation_source_type": row['evaluation_source_type']
+                        },
+                        "source_metadata": {
+                            "source_organization_name": row['source_organization_name'],
+                            "evaluator_relationship": row['evaluator_relationship'],
+                        },
+                        "model_info": {
+                            "name": row['model_name'],
+                            "id": row['model_id'],
+                            "developer": row['model_developer'],
+                        },
+                        "evaluation_results": evaluation_results,
+                        "additional_details": additional_details
+                    }
+                }
+
+                if pd.notna(row.get('source_organization_url')):
+                    parsed_item["raw_data"]["source_metadata"]["source_organization_url"] = row['source_organization_url']
+                if pd.notna(row.get('source_organization_logo_url')):
+                    parsed_item["raw_data"]["source_metadata"]["source_organization_logo_url"] = row['source_organization_logo_url']
+                if pd.notna(row.get('model_inference_platform')):
+                    parsed_item["raw_data"]["model_info"]["inference_platform"] = row['model_inference_platform']
+
+                parsed_items.append(parsed_item)
+
+            HF_DATASET_CACHE[split_name] = parsed_items
+
+        print(f"Loaded {len(HF_DATASET_CACHE)} leaderboard(s) from HuggingFace")
+        return True
+    except Exception as e:
+        print(f"Warning: Could not load HuggingFace dataset: {e}")
+        print("Falling back to local file system...")
+        return False
+
+
+def parse_eval_json(file_path):
+    """Parses a single JSON file to extract model, provider, and results."""
+    try:
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+
+        leaderboard_name = data.get("evaluation_source", {}).get("evaluation_source_name", "Unknown Leaderboard")
+        provider_name = data.get("source_metadata", {}).get("source_organization_name", "Unknown Provider")
+        model_id = data.get("model_info", {}).get("id", "Unknown Model")
+        developer_name = data.get("model_info", {}).get("developer", "Unknown Developer")
+
+        params = data.get("model_info", {}).get("params_billions", None)
+        architecture = data.get("model_info", {}).get("architecture", "Unknown")
+        precision = data.get("additional_details", {}).get("precision", "Unknown")
+        if precision == "Unknown":
+            precision = data.get("model_info", {}).get("precision", "Unknown")
+
+        results = {}
+        if "evaluation_results" in data:
+            for res in data["evaluation_results"]:
+                eval_name = res.get("evaluation_name", "Unknown Metric")
+                score = res.get("score_details", {}).get("score", None)
+                if score is not None:
+                    results[eval_name] = score
+
+        return {
+            "leaderboard": leaderboard_name,
+            "provider": provider_name,
+            "model": model_id,
+            "developer": developer_name,
+            "params": params,
+            "architecture": architecture,
+            "precision": precision,
+            "results": results,
+            "raw_data": data
+        }
+    except Exception as e:
+        print(f"Error parsing {file_path}: {e}")
+        return None
+
+
+def get_available_leaderboards():
+    """Returns available leaderboards from HF cache or local directory."""
+    if HF_DATASET_CACHE:
+        return list(HF_DATASET_CACHE.keys())
+
+    if not DATA_DIR.exists():
+        return []
+    return [d.name for d in DATA_DIR.iterdir() if d.is_dir()]
+
+
+def walk_eval_files(leaderboard_name):
+    """Generator that walks through Leaderboard directory recursively."""
+    lb_path = DATA_DIR / leaderboard_name
+    if not lb_path.exists():
+        return
+    yield from lb_path.rglob("*.json")
+
+
+def get_eval_metadata(selected_leaderboard):
+    """Extracts evaluation metadata from the leaderboard data."""
+    if not selected_leaderboard:
+        return {}
+
+    eval_metadata = {"evals": {}, "source_info": {}}
+
+    if selected_leaderboard in HF_DATASET_CACHE:
+        parsed_items = HF_DATASET_CACHE[selected_leaderboard]
+        if parsed_items:
+            parsed = parsed_items[0]
+
+            source_meta = parsed["raw_data"].get("source_metadata", {})
+            source_data_list = parsed["raw_data"].get("source_data", [])
+            url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
+
+            eval_metadata["source_info"] = {
+                "organization": source_meta.get("source_organization_name", "Unknown"),
+                "relationship": source_meta.get("evaluator_relationship", "Unknown"),
+                "url": url
+            }
+
+            if "evaluation_results" in parsed["raw_data"]:
+                for res in parsed["raw_data"]["evaluation_results"]:
+                    eval_name = res.get("evaluation_name", "Unknown Metric")
+                    if eval_name not in eval_metadata["evals"]:
+                        metric_config = res.get("metric_config", {})
+                        eval_metadata["evals"][eval_name] = {
+                            "description": metric_config.get("evaluation_description", "No description available"),
+                            "score_type": metric_config.get("score_type", "unknown"),
+                            "lower_is_better": metric_config.get("lower_is_better", False),
+                            "min_score": metric_config.get("min_score"),
+                            "max_score": metric_config.get("max_score"),
+                            "level_names": metric_config.get("level_names", []),
+                            "level_metadata": metric_config.get("level_metadata", []),
+                            "has_unknown_level": metric_config.get("has_unknown_level", False)
+                        }
+        return eval_metadata
+
+    # Fall back to file system
+    for json_file in walk_eval_files(selected_leaderboard):
+        parsed = parse_eval_json(json_file)
+        if parsed:
+            if not eval_metadata["source_info"]:
+                source_meta = parsed["raw_data"].get("source_metadata", {})
+                source_data_list = parsed["raw_data"].get("source_data", [])
+                url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
+
+                eval_metadata["source_info"] = {
+                    "organization": source_meta.get("source_organization_name", "Unknown"),
+                    "relationship": source_meta.get("evaluator_relationship", "Unknown"),
+                    "url": url
+                }
+
+            if "evaluation_results" in parsed["raw_data"]:
+                for res in parsed["raw_data"]["evaluation_results"]:
+                    eval_name = res.get("evaluation_name", "Unknown Metric")
+                    if eval_name not in eval_metadata["evals"]:
+                        metric_config = res.get("metric_config", {})
+                        eval_metadata["evals"][eval_name] = {
+                            "description": metric_config.get("evaluation_description", "No description available"),
+                            "score_type": metric_config.get("score_type", "unknown"),
+                            "lower_is_better": metric_config.get("lower_is_better", False),
+                            "min_score": metric_config.get("min_score"),
+                            "max_score": metric_config.get("max_score"),
+                            "level_names": metric_config.get("level_names", []),
+                            "level_metadata": metric_config.get("level_metadata", []),
+                            "has_unknown_level": metric_config.get("has_unknown_level", False)
+                        }
+            break
+
+    return eval_metadata
+
+
+def build_leaderboard_table(selected_leaderboard, search_query="", progress_callback=None):
+    """Builds the leaderboard DataFrame from cache or files."""
+    if not selected_leaderboard:
+        return pd.DataFrame()
+
+    if selected_leaderboard in LEADERBOARD_CACHE:
+        df, _ = LEADERBOARD_CACHE[selected_leaderboard]
+    else:
+        rows = []
+
+        if selected_leaderboard in HF_DATASET_CACHE:
+            if progress_callback:
+                progress_callback(0, desc=f"Loading {selected_leaderboard} from cache...")
+
+            parsed_items = HF_DATASET_CACHE[selected_leaderboard]
+
+            for i, parsed in enumerate(parsed_items):
+                if i % 100 == 0 and progress_callback:
+                    progress_callback((i / len(parsed_items)), desc=f"Processing {selected_leaderboard}...")
+
+                row = {
+                    "Model": parsed["model"],
+                    "Developer": parsed["developer"],
+                    "Params (B)": parsed["params"],
+                    "Arch": parsed["architecture"],
+                    "Precision": parsed["precision"]
+                }
+                row.update(parsed["results"])
+                rows.append(row)
+        else:
+            # Fall back to file system
+            if progress_callback:
+                progress_callback(0, desc=f"Scanning {selected_leaderboard}...")
+
+            all_files = list(walk_eval_files(selected_leaderboard))
+            total_files = len(all_files)
+
+            for i, json_file in enumerate(all_files):
+                if i % 100 == 0 and progress_callback:
+                    progress_callback((i / total_files), desc=f"Loading {selected_leaderboard}...")
+
+                parsed = parse_eval_json(json_file)
+                if parsed:
+                    row = {
+                        "Model": parsed["model"],
+                        "Developer": parsed["developer"],
+                        "Params (B)": parsed["params"],
+                        "Arch": parsed["architecture"],
+                        "Precision": parsed["precision"]
+                    }
+                    row.update(parsed["results"])
+                    rows.append(row)
+
+        if not rows:
+            df = pd.DataFrame(columns=["Model", "Developer", "Params (B)", "Arch", "Precision"])
+            LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
+            return df
+
+        df = pd.DataFrame(rows)
+        df = df.dropna(axis=1, how='all')
+
+        if df.empty:
+            LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
+            return df
+
+        numeric_cols = df.select_dtypes(include=['float', 'int']).columns
+        df[numeric_cols] = df[numeric_cols].round(3)
+
+        # Add Average Score
+        eval_only_cols = [c for c in numeric_cols if c not in ["Params (B)"]]
+        if len(eval_only_cols) > 0:
+            df["Average"] = df[eval_only_cols].mean(axis=1).round(3)
+
+        base_cols = ["Model", "Developer", "Params (B)", "Arch", "Precision", "Average"]
+        eval_cols = [c for c in df.columns if c not in base_cols]
+        base_cols = [c for c in base_cols if c in df.columns]
+
+        final_cols = base_cols + sorted(eval_cols)
+        df = df[final_cols]
+
+        if "Average" in df.columns:
+            df = df.sort_values("Average", ascending=False)
+
+        LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
+
+    return df
+
+
+def clear_cache():
+    """Clears all caches."""
+    LEADERBOARD_CACHE.clear()
+
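data_loader.py exposes a small API surface. A quick smoke-test sketch using only the functions defined above (assumes network access to the deepmage121/eee_test dataset):

```python
# Minimal exercise of the data_loader API, run from the Space's root directory.
from data_loader import (
    load_hf_dataset_on_startup,
    get_available_leaderboards,
    build_leaderboard_table,
)

if load_hf_dataset_on_startup():
    boards = get_available_leaderboards()
    print("leaderboards:", boards)
    if boards:
        df = build_leaderboard_table(boards[0])
        # The table comes back sorted by the computed "Average" column, best first.
        print(df.head())
```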
eval.schema.json
ADDED
@@ -0,0 +1,282 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "0.0.1",
  "type": "object",
  "description": "Schema for storing and validating LLM evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
  "required": [
    "schema_version",
    "evaluation_id",
    "evaluation_source",
    "retrieved_timestamp",
    "source_data",
    "source_metadata",
    "model_info",
    "evaluation_results"
  ],
  "properties": {
    "schema_version": {
      "type": "string",
      "description": "Version of the schema used for this evaluation data"
    },
    "evaluation_id": {
      "type": "string",
      "description": "Unique identifier for this specific evaluation run. Use the org_name/eval_name/retrieved_timestamp format"
    },
    "retrieved_timestamp": {
      "type": "string",
      "description": "Timestamp for when this record was created"
    },
    "source_data": {
      "type": "array",
      "description": "URLs for the source of the evaluation data",
      "items": {"type": "string"}
    },
    "evaluation_source": {
      "type": "object",
      "description": "Details about the evaluation's origin. Evaluations may come from leaderboards (e.g. LiveCodeBench Pro) or from evaluation platforms (e.g. lm-eval, Inspect AI, HELM, ...).",
      "required": ["evaluation_source_name", "evaluation_source_type"],
      "properties": {
        "evaluation_source_name": {
          "type": "string",
          "description": "Name of the source (e.g. the title of the source leaderboard, or the name of the platform used for the evaluation)"
        },
        "evaluation_source_type": {
          "type": "string",
          "enum": ["leaderboard", "evaluation_platform"],
          "description": "Type of evaluation source, e.g. leaderboard or evaluation platform"
        }
      }
    },
    "source_metadata": {
      "type": "object",
      "description": "Metadata about the source of the leaderboard data",
      "required": ["source_organization_name", "evaluator_relationship"],
      "properties": {
        "source_organization_name": {
          "type": "string",
          "description": "Name of the organization that provides the data"
        },
        "source_organization_url": {
          "type": "string",
          "description": "URL for the organization that provides the data"
        },
        "source_organization_logo_url": {
          "type": "string",
          "description": "URL for the logo of the organization that provides the data"
        },
        "evaluator_relationship": {
          "type": "string",
          "description": "Relationship between the evaluator and the model",
          "enum": ["first_party", "third_party", "collaborative", "other"]
        }
      }
    },
    "model_info": {
      "type": "object",
      "description": "Complete model specification, including basic information, technical configuration, and inference settings",
      "required": ["name", "id"],
      "properties": {
        "name": {
          "type": "string",
          "description": "Model name provided by the evaluation source"
        },
        "id": {
          "type": "string",
          "description": "Model name standardized to the Hugging Face format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
        },
        "developer": {
          "type": "string",
          "description": "Name of the organization that provides the model (e.g. 'OpenAI')"
        },
        "inference_platform": {
          "type": "string",
          "description": "Description of the platform used to run the evaluations (e.g. local machine, Bedrock)"
        }
      }
    },
    "evaluation_results": {
      "type": "array",
      "description": "Array of evaluation results",
      "items": {
        "type": "object",
        "required": ["evaluation_name", "metric_config", "score_details"],
        "properties": {
          "evaluation_name": {
            "type": "string",
            "description": "Name of the evaluation"
          },
          "evaluation_timestamp": {
            "type": "string",
            "description": "Timestamp for when the evaluations were run"
          },
          "metric_config": {
            "type": "object",
            "description": "Details about the metric",
            "required": ["lower_is_better"],
            "properties": {
              "evaluation_description": {
                "type": "string",
                "description": "Description of the evaluation"
              },
              "lower_is_better": {
                "type": "boolean",
                "description": "Whether a lower score is better"
              },
              "score_type": {
                "type": "string",
                "description": "Type of score",
                "enum": ["binary", "continuous", "levels"]
              },
              "level_names": {
                "type": "array",
                "description": "Names of the score levels",
                "items": {"type": "string"}
              },
              "level_metadata": {
                "type": "array",
                "description": "Additional description for each score level",
                "items": {"type": "string"}
              },
              "has_unknown_level": {
                "type": "boolean",
                "description": "Indicates whether there is an unknown level - if true, a score of -1 is treated as unknown"
              },
              "min_score": {
                "type": "number",
                "description": "Minimum possible score for a continuous metric"
              },
              "max_score": {
                "type": "number",
                "description": "Maximum possible score for a continuous metric"
              }
            },
            "if": {
              "properties": {"score_type": {"const": "levels"}}
            },
            "then": {
              "required": ["level_names", "has_unknown_level"]
            },
            "else": {
              "if": {
                "properties": {"score_type": {"const": "continuous"}}
              },
              "then": {
                "required": ["min_score", "max_score"]
              }
            }
          },
          "score_details": {
            "type": "object",
            "description": "The score for the evaluation and related details",
            "required": ["score"],
            "properties": {
              "score": {
                "type": "number",
                "description": "The score for the evaluation"
              },
              "details": {
                "type": "object",
                "description": "Any additional details about the score",
                "additionalProperties": true
              }
            }
          },
          "detailed_evaluation_results_url": {
            "type": "string",
            "description": "Link to detailed evaluation data"
          },
          "generation_config": {
            "type": "object",
            "properties": {
              "generation_args": {
                "type": "object",
                "description": "Parameters used to generate results - properties may vary by model type",
                "properties": {
                  "temperature": {
                    "type": ["null", "number"],
                    "description": "Sampling temperature"
                  },
                  "top_p": {
                    "type": ["null", "number"],
                    "description": "Nucleus sampling parameter"
                  },
                  "top_k": {
                    "type": ["null", "number"],
                    "description": "Top-k sampling parameter"
                  },
                  "max_tokens": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "Maximum number of tokens to generate"
                  }
                },
                "additionalProperties": true
              },
              "additional_details": {
                "type": "string",
                "description": "Additional details about how the results for this metric were generated."
              }
            }
          }
        }
      }
    }
  }
}
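To make the required shape concrete, here is a minimal record that should pass this schema, built as a Python dict and checked with jsonschema; all values are illustrative, not real results.

import json
from jsonschema import validate

record = {
    "schema_version": "0.0.1",
    "evaluation_id": "example_org/example_eval/1762652579.0",  # illustrative
    "retrieved_timestamp": "1762652579.0",                     # illustrative
    "source_data": ["https://example.com/leaderboard"],        # illustrative URL
    "evaluation_source": {
        "evaluation_source_name": "Example Leaderboard",
        "evaluation_source_type": "leaderboard",
    },
    "source_metadata": {
        "source_organization_name": "Example Org",
        "evaluator_relationship": "third_party",
    },
    "model_info": {"name": "org/model", "id": "org/model"},
    "evaluation_results": [
        {
            "evaluation_name": "Example Benchmark",
            # score_type "continuous" triggers the conditional that requires min/max
            "metric_config": {"lower_is_better": False, "score_type": "continuous",
                              "min_score": 0, "max_score": 1},
            "score_details": {"score": 0.5},
        }
    ],
}

with open("eval.schema.json") as f:
    validate(instance=record, schema=json.load(f))  # raises ValidationError on failure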
hf_operations.py
ADDED
@@ -0,0 +1,202 @@
"""
HuggingFace Operations: Upload data, create PRs, validate schemas.
"""
from huggingface_hub import HfApi, login
import pandas as pd
import json
from pathlib import Path
from jsonschema import validate, ValidationError, Draft7Validator


# Load schema once at module level
SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
with open(SCHEMA_PATH, 'r') as f:
    EVAL_SCHEMA = json.load(f)


def validate_json_against_schema(json_data):
    """
    Validate a JSON object against eval.schema.json.

    Args:
        json_data: Dict containing the evaluation data

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        validate(instance=json_data, schema=EVAL_SCHEMA)
        return True, "Schema validation passed"
    except ValidationError as e:
        # Extract the most relevant error message
        error_path = " → ".join(str(p) for p in e.path) if e.path else "root"
        return False, f"❌ Schema validation failed at '{error_path}': {e.message}"
    except Exception as e:
        return False, f"❌ Validation error: {str(e)}"


def upload_to_hf_dataset(parquet_file, split_name, repo_id="deepmage121/eee_test"):
    """
    Upload a parquet file as a new split to the HF dataset.

    Args:
        parquet_file: Path to parquet file
        split_name: Name of the split (leaderboard name)
        repo_id: HuggingFace dataset repository ID
    """
    # TODO: Implement upload logic
    pass


def check_hf_authentication():
    """
    Check if the user is authenticated with HuggingFace.

    Returns:
        (bool, str): (is_authenticated, username or error_message)
    """
    try:
        api = HfApi()
        user_info = api.whoami()
        return True, user_info['name']
    except Exception:
        return False, "Not authenticated. Run: huggingface-cli login"


def check_duplicate_pr_exists(leaderboard_name, repo_id="deepmage121/eee_test"):
    """
    Check if a PR already exists for this leaderboard.

    Args:
        leaderboard_name: Name of the leaderboard
        repo_id: HuggingFace dataset repository ID

    Returns:
        (bool, str or None): (exists, pr_url if exists)
    """
    try:
        api = HfApi()
        discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")

        # Check for open PRs with a matching title
        pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
        for discussion in discussions:
            if discussion.is_pull_request and discussion.status == "open":
                if pr_title_pattern in discussion.title.lower():
                    pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
                    return True, pr_url

        return False, None
    except Exception as e:
        # If we can't check, assume no duplicate (fail open)
        print(f"Warning: Could not check for duplicate PRs: {e}")
        return False, None


def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id="deepmage121/eee_test"):
    """
    Create a pull request to add a new leaderboard split.

    Args:
        leaderboard_name: Name of the new leaderboard
        parquet_file: Path to parquet file
        repo_id: HuggingFace dataset repository ID

    Returns:
        (success, pr_url or error_message)
    """
    # 1. Check authentication
    is_auth, auth_result = check_hf_authentication()
    if not is_auth:
        return False, f"❌ {auth_result}"

    # 2. Check for a duplicate PR
    has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
    if has_duplicate:
        return False, f"⚠️ PR already exists: {duplicate_url}"

    # 3. Validate that the parquet file exists and has data
    parquet_path = Path(parquet_file)
    if not parquet_path.exists():
        return False, "❌ Parquet file not found"

    df = pd.read_parquet(parquet_file)
    if len(df) == 0:
        return False, "❌ Parquet file is empty"

    # 4. Create PR
    try:
        api = HfApi()

        commit_message = f"Add new leaderboard: {leaderboard_name}"

        # Upload the file and create the PR in one step
        commit_info = api.upload_file(
            path_or_fileobj=parquet_file,
            path_in_repo=f"data/{leaderboard_name}.parquet",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=commit_message,
            create_pr=True,
        )

        # Extract the PR URL from the commit info
        pr_url = commit_info.pr_url if hasattr(commit_info, 'pr_url') else f"https://huggingface.co/datasets/{repo_id}/discussions"

        return True, f"PR created ({len(df)} rows): {pr_url}"

    except Exception as e:
        return False, f"❌ Failed to create PR: {str(e)}"


def validate_schema(parquet_file):
    """
    Validate that a parquet file matches the expected schema.

    Args:
        parquet_file: Path to parquet file to validate

    Returns:
        (bool, str): (is_valid, error_message)
    """
    try:
        df = pd.read_parquet(parquet_file)

        # Required columns
        required_cols = [
            '_leaderboard', '_developer', '_model', '_uuid',
            'schema_version', 'evaluation_id', 'retrieved_timestamp',
            'source_data', 'evaluation_source_name', 'evaluation_source_type',
            'source_organization_name', 'evaluator_relationship',
            'model_name', 'model_id', 'model_developer',
            'evaluation_results'
        ]

        missing = [col for col in required_cols if col not in df.columns]
        if missing:
            return False, f"Missing required columns: {', '.join(missing)}"

        # Check data types (all should be strings)
        for col in df.columns:
            if df[col].dtype not in ['object', 'string']:
                return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"

        return True, "Schema validation passed"

    except Exception as e:
        return False, f"Validation error: {str(e)}"


def export_to_json(parquet_file, output_dir):
    """
    Export parquet data back to JSON files.
    Uses the parquet_to_folder function from json_to_parquet.py.

    Args:
        parquet_file: Path to parquet file
        output_dir: Directory to write JSON files to
    """
    from json_to_parquet import parquet_to_folder
    parquet_to_folder(parquet_file, output_dir)
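A short sketch of how these helpers are meant to compose; the parquet path and leaderboard name below are illustrative, not real files in the repo.

from hf_operations import validate_schema, create_pr_for_new_leaderboard

parquet_file = "MyLeaderboard.parquet"  # hypothetical output of json_to_parquet.add_to_parquet

ok, msg = validate_schema(parquet_file)
print(msg)
if ok:
    # Needs a valid HF token (huggingface-cli login); opens a PR against the dataset repo.
    success, result = create_pr_for_new_leaderboard("MyLeaderboard", parquet_file)
    print(result)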
json_to_parquet.py
ADDED
@@ -0,0 +1,228 @@
import json
from pathlib import Path
import pandas as pd


def json_to_row(json_path: Path) -> dict:
    """Convert one JSON to a single row (1 JSON = 1 row, evaluations as columns)."""
    with open(json_path, 'r') as f:
        data = json.load(f)

    required_fields = ["schema_version", "evaluation_id", "evaluation_source", "retrieved_timestamp",
                       "source_data", "source_metadata", "model_info", "evaluation_results"]
    for field in required_fields:
        if field not in data:
            raise ValueError(f"{json_path}: Missing required field '{field}'")

    if "evaluation_source_name" not in data["evaluation_source"]:
        raise ValueError(f"{json_path}: Missing required field 'evaluation_source.evaluation_source_name'")
    if "evaluation_source_type" not in data["evaluation_source"]:
        raise ValueError(f"{json_path}: Missing required field 'evaluation_source.evaluation_source_type'")

    if "source_organization_name" not in data["source_metadata"]:
        raise ValueError(f"{json_path}: Missing required field 'source_metadata.source_organization_name'")
    if "evaluator_relationship" not in data["source_metadata"]:
        raise ValueError(f"{json_path}: Missing required field 'source_metadata.evaluator_relationship'")

    if "name" not in data["model_info"]:
        raise ValueError(f"{json_path}: Missing required field 'model_info.name'")
    if "id" not in data["model_info"]:
        raise ValueError(f"{json_path}: Missing required field 'model_info.id'")
    if "developer" not in data["model_info"]:
        raise ValueError(f"{json_path}: Missing required field 'model_info.developer'")

    leaderboard = data["evaluation_source"]["evaluation_source_name"]
    model = data["model_info"]["id"]
    uuid = json_path.stem
    developer = data["model_info"]["developer"]

    # Validate evaluation results
    for eval_result in data["evaluation_results"]:
        if "evaluation_name" not in eval_result:
            raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].evaluation_name'")
        if "metric_config" not in eval_result:
            raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].metric_config'")
        if "score_details" not in eval_result:
            raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].score_details'")

        if "lower_is_better" not in eval_result["metric_config"]:
            raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].metric_config.lower_is_better'")
        if "score" not in eval_result["score_details"]:
            raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].score_details.score'")

    row = {
        # Folder structure (for reconstruction)
        "_leaderboard": leaderboard,
        "_developer": developer,
        "_model": model,
        "_uuid": uuid,

        # Required top-level fields
        "schema_version": data["schema_version"],
        "evaluation_id": data["evaluation_id"],
        "retrieved_timestamp": data["retrieved_timestamp"],
        "source_data": json.dumps(data["source_data"]),

        # Required nested fields
        "evaluation_source_name": data["evaluation_source"]["evaluation_source_name"],
        "evaluation_source_type": data["evaluation_source"]["evaluation_source_type"],

        "source_organization_name": data["source_metadata"]["source_organization_name"],
        "source_organization_url": data["source_metadata"].get("source_organization_url"),
        "source_organization_logo_url": data["source_metadata"].get("source_organization_logo_url"),
        "evaluator_relationship": data["source_metadata"]["evaluator_relationship"],

        "model_name": data["model_info"]["name"],
        "model_id": data["model_info"]["id"],
        "model_developer": data["model_info"]["developer"],
        "model_inference_platform": data["model_info"].get("inference_platform"),

        # Store full evaluation_results and additional_details as JSON
        "evaluation_results": json.dumps(data["evaluation_results"]),
        "additional_details": json.dumps(data["additional_details"]) if "additional_details" in data else None,
    }

    return row


def add_to_parquet(json_or_folder: str, parquet_file: str):
    """
    Add JSON(s) to a Parquet file.
    Creates a new file if it doesn't exist; appends and deduplicates if it does.

    Args:
        json_or_folder: Path to a single JSON file or a folder containing JSONs
        parquet_file: Output Parquet file path
    """
    input_path = Path(json_or_folder)

    if input_path.is_file():
        json_files = [input_path]
    elif input_path.is_dir():
        json_files = list(input_path.rglob("*.json"))
        if not json_files:
            raise ValueError(f"No JSON files found in directory: {json_or_folder}")
    else:
        raise ValueError(f"Invalid input: {json_or_folder}")

    print(f"Processing {len(json_files)} JSON file(s)...")

    parquet_path = Path(parquet_file)
    if parquet_path.exists():
        existing_df = pd.read_parquet(parquet_file)
        existing_keys = set(
            existing_df[["_leaderboard", "_developer", "_model", "_uuid"]]
            .apply(tuple, axis=1)
        )
        print(f"Found {len(existing_df)} existing rows")
    else:
        existing_df = None
        existing_keys = set()

    all_rows = []
    skipped = 0
    for i, jf in enumerate(json_files, 1):
        if i % 100 == 0:
            print(f"  {i}/{len(json_files)}")

        row = json_to_row(jf)
        key = (row["_leaderboard"], row["_developer"], row["_model"], row["_uuid"])
        if key not in existing_keys:
            all_rows.append(row)
            existing_keys.add(key)
        else:
            skipped += 1

    if skipped > 0:
        print(f"  Skipped {skipped} duplicate file(s)")

    # Handle the case where there are no new rows to add
    if not all_rows:
        if existing_df is not None:
            print(f"No new files to add, keeping existing {len(existing_df)} file(s)")
            return
        else:
            raise ValueError("No valid JSON files to process and no existing parquet file")

    new_df = pd.DataFrame(all_rows)

    if existing_df is not None:
        df = pd.concat([existing_df, new_df], ignore_index=True)
        print(f"Added {len(new_df)} new file(s) to existing {len(existing_df)} file(s)")
    else:
        df = new_df

    df.to_parquet(parquet_file, index=False)
    print(f"Saved {len(df)} total file(s) to {parquet_file} ({parquet_path.stat().st_size / 1024 / 1024:.1f} MB)")


def parquet_to_folder(parquet_file: str, output_dir: str):
    """Reconstruct the folder structure from Parquet."""
    df = pd.read_parquet(parquet_file)
    out = Path(output_dir)

    for _, row in df.iterrows():
        lb = row["_leaderboard"]
        dev = row["_developer"]
        model = row["_model"]
        uuid = row["_uuid"]

        json_data = {
            "schema_version": row["schema_version"],
            "evaluation_id": row["evaluation_id"],
            "retrieved_timestamp": row["retrieved_timestamp"],
            "source_data": json.loads(row["source_data"]),
            "evaluation_source": {
                "evaluation_source_name": row["evaluation_source_name"],
                "evaluation_source_type": row["evaluation_source_type"]
            },
            "source_metadata": {
                "source_organization_name": row["source_organization_name"],
                "evaluator_relationship": row["evaluator_relationship"]
            },
            "model_info": {
                "name": row["model_name"],
                "id": row["model_id"],
                "developer": row["model_developer"]
            },
            "evaluation_results": json.loads(row["evaluation_results"])
        }

        if pd.notna(row["source_organization_url"]):
            json_data["source_metadata"]["source_organization_url"] = row["source_organization_url"]
        if pd.notna(row["source_organization_logo_url"]):
            json_data["source_metadata"]["source_organization_logo_url"] = row["source_organization_logo_url"]

        if pd.notna(row["model_inference_platform"]):
            json_data["model_info"]["inference_platform"] = row["model_inference_platform"]

        if pd.notna(row["additional_details"]):
            json_data["additional_details"] = json.loads(row["additional_details"])

        file_path = out / lb / dev / model / f"{uuid}.json"
        file_path.parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, 'w') as f:
            json.dump(json_data, f, indent=2)

    print(f"Reconstructed {len(df)} files to {output_dir}")


if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage:")
        print("  python json_to_parquet.py add <json_or_folder> <output.parquet>")
        print("  python json_to_parquet.py export <input.parquet> <output_dir>")
        sys.exit(1)

    cmd = sys.argv[1]

    if cmd == "add":
        add_to_parquet(sys.argv[2], sys.argv[3])
    elif cmd == "export":
        parquet_to_folder(sys.argv[2], sys.argv[3])
    else:
        print(f"Unknown command: {cmd}")
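Because _leaderboard/_developer/_model/_uuid are kept as columns, the conversion is a round trip; a sketch using the two entry points above (paths are illustrative):

from json_to_parquet import add_to_parquet, parquet_to_folder

# Pack every eval JSON under leaderboard_data/ into one deduplicated parquet file...
add_to_parquet("leaderboard_data", "evals.parquet")

# ...then rebuild the <leaderboard>/<developer>/<model>/<uuid>.json tree elsewhere.
parquet_to_folder("evals.parquet", "reconstructed_data")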
leaderboard_data/HFOpenLLMv2/0-hero/0-hero_Matter-0.2-7B-DPO/40e80d5e-db72-46b7-bd14-b7d005df4be8.json
DELETED
@@ -1,107 +0,0 @@
{
  "schema_version": "0.0.1",
  "evaluation_id": "hfopenllm_v2/0-hero_Matter-0.2-7B-DPO/1762652579.4626381",
  "retrieved_timestamp": "1762652579.462642",
  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
  "model_info": {"name": "0-hero/Matter-0.2-7B-DPO", "developer": "0-hero", "inference_platform": "unknown", "id": "0-hero/Matter-0.2-7B-DPO"},
  "evaluation_results": [
    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3302792147058693}},
    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3596254301656297}},
    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.014350453172205438}},
    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.25922818791946306}},
    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.381375}},
    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.1163563829787234}}
  ],
  "additional_details": {"precision": "bfloat16", "architecture": "MistralForCausalLM", "params_billions": 7.242}
}
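Records like the one above all carry the same six continuous scores, so the per-model "Average" column computed in app.py reduces to a simple mean over evaluation_results. A small sketch of that reduction against this (now deleted) file; the path is only valid before the deletion:

import json

path = "leaderboard_data/HFOpenLLMv2/0-hero/0-hero_Matter-0.2-7B-DPO/40e80d5e-db72-46b7-bd14-b7d005df4be8.json"
with open(path) as f:
    record = json.load(f)

scores = [r["score_details"]["score"] for r in record["evaluation_results"]]
print(round(sum(scores) / len(scores), 3))  # mean over IFEval, BBH, MATH, GPQA, MUSR, MMLU-PRO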
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-32K/0d91a153-1b6b-4891-8722-a5c7e372ba64.json
DELETED
@@ -1,107 +0,0 @@
{
  "schema_version": "0.0.1",
  "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-32K/1762652579.463656",
  "retrieved_timestamp": "1762652579.463657",
  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
  "model_info": {"name": "01-ai/Yi-1.5-34B-32K", "developer": "01-ai", "inference_platform": "unknown", "id": "01-ai/Yi-1.5-34B-32K"},
  "evaluation_results": [
    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3118691737922047}},
    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.6015685776542417}},
    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.1540785498489426}},
    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.36325503355704697}},
    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4398229166666667}},
    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4709109042553192}}
  ],
  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 34.389}
}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-Chat-16K/2192007d-1f6e-4f74-b518-7448ef3a896e.json
DELETED
@@ -1,107 +0,0 @@
{
  "schema_version": "0.0.1",
  "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-Chat-16K/1762652579.464125",
  "retrieved_timestamp": "1762652579.4641259",
  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
  "model_info": {"name": "01-ai/Yi-1.5-34B-Chat-16K", "developer": "01-ai", "inference_platform": "unknown", "id": "01-ai/Yi-1.5-34B-Chat-16K"},
  "evaluation_results": [
    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.456449997118756}},
    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.6100218256499571}},
    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.21374622356495468}},
    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.33808724832214765}},
    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.43976041666666665}},
    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.45445478723404253}}
  ],
  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 34.389}
}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B-Chat/e335874b-9b3e-4966-a7e0-22e9d16f8324.json
DELETED
@@ -1,107 +0,0 @@
{
  "schema_version": "0.0.1",
  "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-Chat/1762652579.463886",
  "retrieved_timestamp": "1762652579.4638872",
  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
  "model_info": {"name": "01-ai/Yi-1.5-34B-Chat", "developer": "01-ai", "inference_platform": "unknown", "id": "01-ai/Yi-1.5-34B-Chat"},
  "evaluation_results": [
    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.6066758423205982}},
    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.6083748310271819}},
    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.277190332326284}},
    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3649328859060403}},
    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4281979166666667}},
    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.45204454787234044}}
  ],
  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 34.389}
}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-34B/8409c158-ef12-4e6c-8a1d-7be2084b3446.json
DELETED
@@ -1,107 +0,0 @@
{
  "schema_version": "0.0.1",
  "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B/1762652579.4633532",
  "retrieved_timestamp": "1762652579.463354",
  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
  "model_info": {"name": "01-ai/Yi-1.5-34B", "developer": "01-ai", "inference_platform": "unknown", "id": "01-ai/Yi-1.5-34B"},
  "evaluation_results": [
    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.2841172533322695}},
    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.5976391706360018}},
    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.15332326283987915}},
    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.36577181208053694}},
    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4236041666666667}},
    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4665890957446808}}
  ],
  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 34.389}
}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-6B-Chat/3452e57f-3023-4e2e-ad84-b09e409fe334.json
DELETED
@@ -1,107 +0,0 @@
{
  "schema_version": "0.0.1",
  "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-6B-Chat/1762652579.464571",
  "retrieved_timestamp": "1762652579.464572",
  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
  "model_info": {"name": "01-ai/Yi-1.5-6B-Chat", "developer": "01-ai", "inference_platform": "unknown", "id": "01-ai/Yi-1.5-6B-Chat"},
  "evaluation_results": [
    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.5145270105542183}},
    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4571311331954389}},
    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.1623867069486405}},
    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.30201342281879195}},
    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.43917708333333333}},
    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3193151595744681}}
  ],
  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 6.061}
}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-6B/1a1f1263-96b6-4e32-a2c8-6c0d6b47dff9.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-6B/1762652579.464354",
-  "retrieved_timestamp": "1762652579.464355",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-1.5-6B",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-1.5-6B"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.26166017278598563
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.44925820198929056
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.06646525679758308
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.313758389261745
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.43740625
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.31441156914893614
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 6.061
-  }
-}
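Every file removed in this commit follows the same evaluation-record layout: one model_info block plus one evaluation_results entry per benchmark. As a minimal illustrative sketch only (not the repository's actual converter; the helper name flatten_record is hypothetical), such a record can be collapsed into a single leaderboard row:

import json

def flatten_record(path):
    # Illustrative sketch: collapse one evaluation record into a flat dict
    # with model metadata plus one column per benchmark score.
    with open(path) as f:
        record = json.load(f)
    row = {
        "model_id": record["model_info"]["id"],
        "developer": record["model_info"]["developer"],
        "precision": record["additional_details"].get("precision"),
        "params_billions": record["additional_details"].get("params_billions"),
    }
    for result in record["evaluation_results"]:
        row[result["evaluation_name"]] = result["score_details"]["score"]
    return row

For the 01-ai/Yi-1.5-6B record above, this would yield one row with IFEval, BBH, MATH Level 5, GPQA, MUSR, and MMLU-PRO score columns alongside the model metadata.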
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-32K/df9d9d44-daa1-4e61-9b46-192380043889.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-32K/1762652579.4649951",
-  "retrieved_timestamp": "1762652579.464996",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-1.5-9B-32K",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-1.5-9B-32K"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.23031113002389217
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.496332115988265
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.10800604229607251
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.35906040268456374
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4186145833333333
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.37649601063829785
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 8.829
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-Chat-16K/090c9691-4b7e-4a98-b9a2-644e21797be4.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-Chat-16K/1762652579.465471",
-  "retrieved_timestamp": "1762652579.465471",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-1.5-9B-Chat-16K",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-1.5-9B-Chat-16K"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4214040966856829
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.5153383364651778
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.1782477341389728
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3087248322147651
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.40990624999999997
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.39935172872340424
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 8.829
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B-Chat/9256c32b-d956-418f-97da-ea78e3ad9e48.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-Chat/1762652579.465226",
-  "retrieved_timestamp": "1762652579.465226",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-1.5-9B-Chat",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-1.5-9B-Chat"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.6045525871354672
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.555906430281685
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.2258308157099698
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3347315436241611
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.42590625
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.39752327127659576
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 8.829
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-1.5-9B/904d1f91-3153-49d5-afd3-9921bfc086f1.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B/1762652579.464781",
-  "retrieved_timestamp": "1762652579.464782",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-1.5-9B",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-1.5-9B"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.29358435617494916
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.514294179104191
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.11404833836858005
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.37919463087248323
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.43278124999999995
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3916223404255319
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 8.829
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B-200K/fb2ebd9a-f5b8-42a2-9b58-e6f0e7d9b98a.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B-200K/1762652579.465893",
-  "retrieved_timestamp": "1762652579.465894",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-34B-200K",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-34B-200K"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.15424850507763843
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.5441817925289527
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.05740181268882175
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3565436241610738
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.38171874999999994
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.45345744680851063
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 34.389
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B-Chat/5d9b9217-874b-426d-8af4-5105a3b1b3ad.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B-Chat/1762652579.466115",
-  "retrieved_timestamp": "1762652579.4661162",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-34B-Chat",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-34B-Chat"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4698887839820565
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.5560872910766164
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.06268882175226587
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.33808724832214765
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.39784375
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4093251329787234
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 34.389
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-34B/3ebcbf3d-cb2d-4332-bb8a-1db104033391.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B/1762652579.4656792",
-  "retrieved_timestamp": "1762652579.46568",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-34B",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-34B"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3045751938190667
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.5457099951794562
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.0513595166163142
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.36661073825503354
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4118541666666667
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.441156914893617
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 34.389
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B-200K/6b720e8b-aab8-4ba4-9bce-e7a1de3cfb86.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B-200K/1762652579.4665558",
-  "retrieved_timestamp": "1762652579.466557",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-6B-200K",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-6B-200K"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.08433068702154728
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.42892948109603307
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.01812688821752266
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.28187919463087246
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.45873958333333337
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.2844082446808511
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 6.061
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B-Chat/1120c801-7736-4d9d-b23d-08eeedb34186.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B-Chat/1762652579.466805",
-  "retrieved_timestamp": "1762652579.466806",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-6B-Chat",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-6B-Chat"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.33952135888331847
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.41326019207548687
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.013595166163141994
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.29446308724832215
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.36879166666666663
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3061003989361702
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 6.061
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-6B/297419fa-855c-4eae-ad7c-3cf4a0262450.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B/1762652579.4663382",
-  "retrieved_timestamp": "1762652579.4663382",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-6B",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-6B"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.28933784580468713
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4309230591000865
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.015861027190332326
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.26929530201342283
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.39368749999999997
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.29911901595744683
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 6.061
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-9B-200K/4299df04-495a-4687-b143-96b1b562d5e8.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-9B-200K/1762652579.467233",
-  "retrieved_timestamp": "1762652579.467233",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-9B-200K",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-9B-200K"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.23270921155866434
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4793302602023641
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.06646525679758308
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.31543624161073824
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.42940625
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.36220079787234044
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 8.829
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-9B/0ec59add-f9a9-4dbd-8a83-c6aec0b8ad21.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-9B/1762652579.46702",
-  "retrieved_timestamp": "1762652579.4670231",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-9B",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-9B"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.2708779372066118
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.49396075125308075
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.055891238670694864
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3179530201342282
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.40540624999999997
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.35738031914893614
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 8.829
-  }
-}
leaderboard_data/HFOpenLLMv2/01-ai/01-ai_Yi-Coder-9B-Chat/ef0cc3a5-0d62-4a45-b0c7-28a6f7dfdac4.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/01-ai_Yi-Coder-9B-Chat/1762652579.4674509",
-  "retrieved_timestamp": "1762652579.4674518",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "01-ai/Yi-Coder-9B-Chat",
-    "developer": "01-ai",
-    "inference_platform": "unknown",
-    "id": "01-ai/Yi-Coder-9B-Chat"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4817041006750976
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.48142000339111674
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.04003021148036254
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.24748322147651006
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3991770833333333
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.24251994680851063
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 8.829
-  }
-}
leaderboard_data/HFOpenLLMv2/1-800-LLMs/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct/a48b0864-76b7-4860-a448-942a8d74f68e.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct/1762652579.468073",
-  "retrieved_timestamp": "1762652579.468074",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct", "developer": "1-800-LLMs", "inference_platform": "unknown", "id": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.30774677854758703}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.6284322714967584}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.311178247734139}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3699664429530201}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4490625}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.516373005319149}}
-  ],
-  "additional_details": {"precision": "bfloat16", "architecture": "Qwen2ForCausalLM", "params_billions": 14.77}
-}
leaderboard_data/HFOpenLLMv2/152334H/152334H_miqu-1-70b-sf/f57d7b8d-85d5-4e0b-8dec-31e2931487dd.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/152334H_miqu-1-70b-sf/1762652579.469194",
-  "retrieved_timestamp": "1762652579.469195",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "152334H/miqu-1-70b-sf", "developer": "152334H", "inference_platform": "unknown", "id": "152334H/miqu-1-70b-sf"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.5181740005407873}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.6102361685099691}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.12462235649546828}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.35067114093959734}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.45820833333333333}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.42278922872340424}}
-  ],
-  "additional_details": {"precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": 68.977}
-}
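
Every record also declares "schema_version": "0.0.1", so deletions like these can be checked against the declared schema before and after any migration. A minimal validation sketch, assuming the schema lives at eval.schema.json in the repository root (adjust the path to your checkout) and that the third-party jsonschema package is installed:

import json
from pathlib import Path

from jsonschema import ValidationError, validate  # pip install jsonschema

# Assumed schema location; not verified against this commit's layout.
schema = json.loads(Path("eval.schema.json").read_text())

record = json.loads(Path(
    "leaderboard_data/HFOpenLLMv2/152334H/152334H_miqu-1-70b-sf/"
    "f57d7b8d-85d5-4e0b-8dec-31e2931487dd.json").read_text())

try:
    validate(instance=record, schema=schema)
    print("valid record, schema_version", record["schema_version"])
except ValidationError as err:
    print("schema violation:", err.message)
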
leaderboard_data/HFOpenLLMv2/1TuanPham/1TuanPham_T-VisStar-7B-v0.1/1347cd1b-2ebc-4223-900f-7c2479e228a3.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/1TuanPham_T-VisStar-7B-v0.1/1762652579.469481",
-  "retrieved_timestamp": "1762652579.469482",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "1TuanPham/T-VisStar-7B-v0.1", "developer": "1TuanPham", "inference_platform": "unknown", "id": "1TuanPham/T-VisStar-7B-v0.1"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.36070404305021786}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.5052203113352468}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.05740181268882175}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.28523489932885904}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4375}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3210605053191489}}
-  ],
-  "additional_details": {"precision": "float16", "architecture": "MistralForCausalLM", "params_billions": 7.294}
-}
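
Note that the evaluation_id suffix and the retrieved_timestamp field are Unix epoch seconds stored as strings, which is easy to miss when sorting or comparing records. A one-liner to make them human-readable (value taken from the record above):

from datetime import datetime, timezone

ts = float("1762652579.469482")  # retrieved_timestamp from the record above
print(datetime.fromtimestamp(ts, tz=timezone.utc).isoformat())
# -> 2025-11-09T01:42:59.469482+00:00
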
leaderboard_data/HFOpenLLMv2/1TuanPham/1TuanPham_T-VisStar-v0.1/b2926dd6-628c-4274-b0e8-1efc64269bb2.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/1TuanPham_T-VisStar-v0.1/1762652579.469921",
-  "retrieved_timestamp": "1762652579.469923",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "1TuanPham/T-VisStar-v0.1", "developer": "1TuanPham", "inference_platform": "unknown", "id": "1TuanPham/T-VisStar-v0.1"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.36070404305021786}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.5052203113352468}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.05740181268882175}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.28523489932885904}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4375}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3210605053191489}}
-  ],
-  "additional_details": {"precision": "float16", "architecture": "MistralForCausalLM", "params_billions": 7.294}
-}
leaderboard_data/HFOpenLLMv2/3rd-Degree-Burn/3rd-Degree-Burn_L-3.1-Science-Writer-8B/0c4fd071-b5c9-4bf1-a1d5-d658be1a3258.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/3rd-Degree-Burn_L-3.1-Science-Writer-8B/1762652579.470164",
-  "retrieved_timestamp": "1762652579.470165",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "3rd-Degree-Burn/L-3.1-Science-Writer-8B", "developer": "3rd-Degree-Burn", "inference_platform": "unknown", "id": "3rd-Degree-Burn/L-3.1-Science-Writer-8B"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.42625012743963797}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.5041306326216103}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.10347432024169184}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.27432885906040266}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3959479166666666}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.36494348404255317}}
-  ],
-  "additional_details": {"precision": "float16", "architecture": "LlamaForCausalLM", "params_billions": 8.03}
-}
leaderboard_data/HFOpenLLMv2/4season/4season_final_model_test_v2/74973e37-cd82-4e8a-816a-02b035fabff4.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/4season_final_model_test_v2/1762652579.4714398",
-  "retrieved_timestamp": "1762652579.4714408",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "4season/final_model_test_v2", "developer": "4season", "inference_platform": "unknown", "id": "4season/final_model_test_v2"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3191132860809319}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.6342049783295018}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.08383685800604229}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3271812080536913}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4314479166666667}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3528091755319149}}
-  ],
-  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 21.421}
-}
leaderboard_data/HFOpenLLMv2/AALF/AALF_FuseChat-Llama-3.1-8B-Instruct-preview/3766e8a0-99ad-4733-a01b-ced446b15eda.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AALF_FuseChat-Llama-3.1-8B-Instruct-preview/1762652579.471838",
-  "retrieved_timestamp": "1762652579.471839",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview", "developer": "AALF", "inference_platform": "unknown", "id": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.7189579205397235}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.5119887898349903}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.24773413897280966}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3053691275167785}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.38200000000000006}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3732546542553192}}
-  ],
-  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 8.03}
-}
leaderboard_data/HFOpenLLMv2/AALF/AALF_FuseChat-Llama-3.1-8B-SFT-preview/342ac912-805f-4166-b8f4-10f0503fa892.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AALF_FuseChat-Llama-3.1-8B-SFT-preview/1762652579.472149",
-  "retrieved_timestamp": "1762652579.47215",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "AALF/FuseChat-Llama-3.1-8B-SFT-preview", "developer": "AALF", "inference_platform": "unknown", "id": "AALF/FuseChat-Llama-3.1-8B-SFT-preview"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.7280504616639405}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.5240303130445233}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.22507552870090636}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.30453020134228187}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.40199999999999997}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.37433510638297873}}
-  ],
-  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 8.03}
-}
leaderboard_data/HFOpenLLMv2/AGI-0/AGI-0_Art-v0-3B/162b6d5f-f983-4989-9603-f6baea26b633.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AGI-0_Art-v0-3B/1762652579.473539",
-  "retrieved_timestamp": "1762652579.47354",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "AGI-0/Art-v0-3B", "developer": "AGI-0", "inference_platform": "unknown", "id": "AGI-0/Art-v0-3B"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.319238509377341}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3400959483013824}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.24622356495468278}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.25922818791946306}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3768229166666666}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.11785239361702128}}
-  ],
-  "additional_details": {"precision": "bfloat16", "architecture": "Qwen2ForCausalLM", "params_billions": 3.086}
-}
leaderboard_data/HFOpenLLMv2/AI-MO/AI-MO_NuminaMath-7B-CoT/9ac2ba3c-9a21-46b2-a21c-4909cfae6315.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AI-MO_NuminaMath-7B-CoT/1762652579.474318",
-  "retrieved_timestamp": "1762652579.4743192",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "AI-MO/NuminaMath-7B-CoT", "developer": "AI-MO", "inference_platform": "unknown", "id": "AI-MO/NuminaMath-7B-CoT"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.2688544173903022}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4314193495860012}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.26963746223564955}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.26593959731543626}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.33034375}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.28681848404255317}}
-  ],
-  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 6.91}
-}
leaderboard_data/HFOpenLLMv2/AI-MO/AI-MO_NuminaMath-7B-TIR/0ffa78d4-fe45-4639-bcd1-eb19ab168a35.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AI-MO_NuminaMath-7B-TIR/1762652579.474566",
-  "retrieved_timestamp": "1762652579.474567",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "AI-MO/NuminaMath-7B-TIR", "developer": "AI-MO", "inference_platform": "unknown", "id": "AI-MO/NuminaMath-7B-TIR"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.27562423259174545}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.41436913375897894}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.1608761329305136}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.25838926174496646}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.35092708333333333}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.2732712765957447}}
-  ],
-  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 6.91}
-}
leaderboard_data/HFOpenLLMv2/AI-Sweden-Models/AI-Sweden-Models_Llama-3-8B-instruct/1d68bd2e-de6e-4327-a8f1-33322eba537e.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AI-Sweden-Models_Llama-3-8B-instruct/1762652579.474785",
-  "retrieved_timestamp": "1762652579.474786",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "AI-Sweden-Models/Llama-3-8B-instruct", "developer": "AI-Sweden-Models", "inference_platform": "unknown", "id": "AI-Sweden-Models/Llama-3-8B-instruct"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.24012841482821137}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.4173460154515302}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.03851963746223565}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.26593959731543626}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.47709375000000004}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.25972406914893614}}
-  ],
-  "additional_details": {"precision": "bfloat16", "architecture": "LlamaForCausalLM", "params_billions": 8.03}
-}
leaderboard_data/HFOpenLLMv2/AI4free/AI4free_Dhanishtha/a554a3eb-943c-4135-966b-929129ef025d.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AI4free_Dhanishtha/1762652579.475332",
-  "retrieved_timestamp": "1762652579.475332",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "AI4free/Dhanishtha", "developer": "AI4free", "inference_platform": "unknown", "id": "AI4free/Dhanishtha"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.2451240486353985}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.34039444943326375}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.25604229607250756}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.2525167785234899}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.35694791666666664}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.16431183510638298}}
-  ],
-  "additional_details": {"precision": "float16", "architecture": "Qwen2ForCausalLM", "params_billions": 1.777}
-}
leaderboard_data/HFOpenLLMv2/AI4free/AI4free_t2/332ccdb5-faf5-47c6-afeb-a91d2148adf0.json
DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AI4free_t2/1762652579.475577",
-  "retrieved_timestamp": "1762652579.475578",
-  "source_data": ["https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"],
-  "evaluation_source": {"evaluation_source_name": "HF Open LLM v2", "evaluation_source_type": "leaderboard"},
-  "source_metadata": {"source_organization_name": "Hugging Face", "evaluator_relationship": "third_party"},
-  "model_info": {"name": "AI4free/t2", "developer": "AI4free", "inference_platform": "unknown", "id": "AI4free/t2"},
-  "evaluation_results": [
-    {"evaluation_name": "IFEval", "metric_config": {"evaluation_description": "Accuracy on IFEval", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3866828902866616}},
-    {"evaluation_name": "BBH", "metric_config": {"evaluation_description": "Accuracy on BBH", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.2910111436321769}},
-    {"evaluation_name": "MATH Level 5", "metric_config": {"evaluation_description": "Exact Match on MATH Level 5", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.18957703927492447}},
-    {"evaluation_name": "GPQA", "metric_config": {"evaluation_description": "Accuracy on GPQA", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.2575503355704698}},
-    {"evaluation_name": "MUSR", "metric_config": {"evaluation_description": "Accuracy on MUSR", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.3846354166666666}},
-    {"evaluation_name": "MMLU-PRO", "metric_config": {"evaluation_description": "Accuracy on MMLU-PRO", "lower_is_better": false, "score_type": "continuous", "min_score": 0, "max_score": 1}, "score_details": {"score": 0.11436170212765957}}
-  ],
-  "additional_details": {"precision": "bfloat16", "architecture": "Qwen2ForCausalLM", "params_billions": 7.613}
-}
|
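For reference, every file removed in this commit is a standalone evaluation record with the shape shown above. Below is a minimal sketch of reading one back with the Python standard library, assuming only the schema visible in this diff; the path names one of the deleted files, and any other record under leaderboard_data/ would work the same way.

import json

# Path to one per-model record (illustrative; all files removed in this
# commit follow the same layout).
path = "leaderboard_data/HFOpenLLMv2/AI4free/AI4free_t2/332ccdb5-faf5-47c6-afeb-a91d2148adf0.json"

with open(path) as f:
    record = json.load(f)

print(record["model_info"]["id"], "-",
      record["additional_details"]["params_billions"], "B params")

# Each entry in evaluation_results pairs a metric_config with a score,
# so scores can be checked against their declared bounds before use.
for result in record["evaluation_results"]:
    cfg = result["metric_config"]
    score = result["score_details"]["score"]
    assert cfg["min_score"] <= score <= cfg["max_score"], result["evaluation_name"]
    print(f'{result["evaluation_name"]}: {score:.4f}')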
leaderboard_data/HFOpenLLMv2/AIDC-AI/AIDC-AI_Marco-o1/17f7398f-675d-4b38-b233-64fc106737c3.json DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AIDC-AI_Marco-o1/1762652579.47579",
-  "retrieved_timestamp": "1762652579.4757912",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "AIDC-AI/Marco-o1",
-    "developer": "AIDC-AI",
-    "inference_platform": "unknown",
-    "id": "AIDC-AI/Marco-o1"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.477083028586373
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.5364362696398749
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.37462235649546827
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.25922818791946306
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.41384375
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.41165226063829785
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "float16",
-    "architecture": "Qwen2ForCausalLM",
-    "params_billions": 7.616
-  }
-}
leaderboard_data/HFOpenLLMv2/Aashraf995/Aashraf995_Creative-7B-nerd/7ea9f4db-5b52-40a5-904e-785e43302934.json DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/Aashraf995_Creative-7B-nerd/1762652579.476046",
-  "retrieved_timestamp": "1762652579.476046",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Aashraf995/Creative-7B-nerd",
-    "developer": "Aashraf995",
-    "inference_platform": "unknown",
-    "id": "Aashraf995/Creative-7B-nerd"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4721871301480073
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.5606785565640195
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3164652567975831
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3263422818791946
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4515416666666667
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.44921875
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "Qwen2ForCausalLM",
-    "params_billions": 7.616
-  }
-}
leaderboard_data/HFOpenLLMv2/AbacusResearch/AbacusResearch_Jallabi-34B/76397277-901a-4ad0-9dae-0351ca875ec6.json DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AbacusResearch_Jallabi-34B/1762652579.477037",
-  "retrieved_timestamp": "1762652579.4770381",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "AbacusResearch/Jallabi-34B",
-    "developer": "AbacusResearch",
-    "inference_platform": "unknown",
-    "id": "AbacusResearch/Jallabi-34B"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3528604103777976
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.6023380603196266
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.05211480362537765
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3389261744966443
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.48217708333333337
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4681682180851064
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "float16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 34.389
-  }
-}
leaderboard_data/HFOpenLLMv2/Ahdoot/Ahdoot_StructuredThinker-v0.3-MoreStructure/81a5aafb-2cf7-490d-b619-ce638fcc8b38.json DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/Ahdoot_StructuredThinker-v0.3-MoreStructure/1762652579.4772868",
-  "retrieved_timestamp": "1762652579.477288",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Ahdoot/StructuredThinker-v0.3-MoreStructure",
-    "developer": "Ahdoot",
-    "inference_platform": "unknown",
-    "id": "Ahdoot/StructuredThinker-v0.3-MoreStructure"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4192808415005519
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.48376906494893984
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.290785498489426
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.29697986577181207
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.41582291666666665
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.36103723404255317
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "float16",
-    "architecture": "Qwen2ForCausalLM",
-    "params_billions": 3.397
-  }
-}
leaderboard_data/HFOpenLLMv2/Ahdoot/Ahdoot_Test_StealthThinker/43c907eb-3e43-47ff-b38d-f912ba6ef46c.json DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/Ahdoot_Test_StealthThinker/1762652579.4775438",
-  "retrieved_timestamp": "1762652579.4775438",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Ahdoot/Test_StealthThinker",
-    "developer": "Ahdoot",
-    "inference_platform": "unknown",
-    "id": "Ahdoot/Test_StealthThinker"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.42200361706937595
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.46466398134666304
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.17900302114803626
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.2961409395973154
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.42804166666666665
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.35970744680851063
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "float16",
-    "architecture": "Qwen2ForCausalLM",
-    "params_billions": 3.086
-  }
-}
leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder/48732edf-8baf-438e-8a5c-763eee6c0c18.json DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder/1762652579.478028",
-  "retrieved_timestamp": "1762652579.478029",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder",
-    "developer": "AicoresSecurity",
-    "inference_platform": "unknown",
-    "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.7097656440466851
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4477501104993749
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.1487915407854985
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.27181208053691275
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.34079166666666666
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.3178191489361702
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 3.213
-  }
-}
leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V0/38f169f0-e939-4b12-8f78-b2a27fb90de0.json DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V0/1762652579.4777558",
-  "retrieved_timestamp": "1762652579.477757",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "AicoresSecurity/Cybernet-Sec-3B-R1-V0",
-    "developer": "AicoresSecurity",
-    "inference_platform": "unknown",
-    "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.6358018945287394
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4497434194912941
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.11555891238670694
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.2634228187919463
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.33136458333333335
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.301030585106383
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "bfloat16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 3.213
-  }
-}
leaderboard_data/HFOpenLLMv2/AicoresSecurity/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1/e8c63728-a1f5-432f-bf9f-204b0f4041aa.json DELETED
@@ -1,107 +0,0 @@
-{
-  "schema_version": "0.0.1",
-  "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1/1762652579.478466",
-  "retrieved_timestamp": "1762652579.478467",
-  "source_data": [
-    "https://open-llm-leaderboard-open-llm-leaderboard.hf.space/api/leaderboard/formatted"
-  ],
-  "evaluation_source": {
-    "evaluation_source_name": "HF Open LLM v2",
-    "evaluation_source_type": "leaderboard"
-  },
-  "source_metadata": {
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1",
-    "developer": "AicoresSecurity",
-    "inference_platform": "unknown",
-    "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.6730209178313542
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.4391775517124728
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.17598187311178248
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.2709731543624161
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.35409375000000004
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0,
-        "max_score": 1
-      },
-      "score_details": {
-        "score": 0.308843085106383
-      }
-    }
-  ],
-  "additional_details": {
-    "precision": "float16",
-    "architecture": "LlamaForCausalLM",
-    "params_billions": 3.213
-  }
-}