deepmage121 commited on
Commit
b5c23a1
·
1 Parent(s): d0ab546

major changes to space

Browse files
.github/workflows/sync-to-hf.yml DELETED
@@ -1,55 +0,0 @@
1
- name: Sync to HuggingFace Dataset
2
-
3
- on:
4
- push:
5
- branches: [main]
6
- paths:
7
- - 'data/**/*.json'
8
- workflow_dispatch: # Allow manual trigger
9
-
10
- jobs:
11
- sync-to-huggingface:
12
- runs-on: ubuntu-latest
13
-
14
- steps:
15
- - name: Checkout repository
16
- uses: actions/checkout@v4
17
- with:
18
- fetch-depth: 2
19
-
20
- - name: Set up Python
21
- uses: actions/setup-python@v5
22
- with:
23
- python-version: '3.11'
24
-
25
- - name: Install dependencies
26
- run: |
27
- pip install datasets huggingface_hub pandas pyarrow
28
-
29
- - name: Convert Changed JSONs to Parquet (Optimized)
30
- env:
31
- HF_DATASET_REPO: deepmage121/eee_test
32
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
33
- run: |
34
- echo "Detecting changed leaderboards..."
35
- python scripts/convert_to_parquet.py
36
-
37
- - name: Upload Changed Parquets to HuggingFace
38
- env:
39
- HF_DATASET_REPO: deepmage121/eee_test
40
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
41
- run: |
42
- echo "Uploading changed parquets..."
43
- python scripts/upload_to_hf.py
44
-
45
- - name: Report status
46
- if: success()
47
- run: |
48
- echo "Successfully synced to HuggingFace dataset"
49
- echo "View at: https://huggingface.co/datasets/deepmage121/eee_test"
50
- if [ -f parquet_output/changed_leaderboards.json ]; then
51
- echo ""
52
- echo "Changes processed:"
53
- cat parquet_output/changed_leaderboards.json
54
- fi
55
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -6,66 +6,40 @@ import gradio as gr
6
  import pandas as pd
7
  from pathlib import Path
8
 
9
- # Import custom modules
10
  from data_loader import (
11
  load_hf_dataset_on_startup,
12
  get_available_leaderboards,
13
  get_eval_metadata,
14
  build_leaderboard_table,
15
  clear_cache,
 
 
16
  DATA_DIR
17
  )
18
- from ui_components import get_theme, get_custom_css, format_leaderboard_header, format_metric_details
 
 
 
 
 
 
19
 
 
20
 
21
- def export_leaderboard_to_json(selected_leaderboard):
22
- """Export current leaderboard to JSON files in a zip using parquet_to_folder."""
23
- if not selected_leaderboard:
24
- return None
25
-
26
- import tempfile
27
- import shutil
28
- import zipfile
29
- from json_to_parquet import parquet_to_folder
30
-
31
- try:
32
- # Find the parquet file in DATA_DIR
33
- parquet_path = DATA_DIR / selected_leaderboard / f"{selected_leaderboard}.parquet"
34
-
35
- if not parquet_path.exists():
36
- print(f"Parquet file not found: {parquet_path}")
37
- return None
38
-
39
- # Create temp directory for export
40
- with tempfile.TemporaryDirectory() as temp_dir:
41
- temp_path = Path(temp_dir)
42
- output_dir = temp_path / "json_export"
43
- output_dir.mkdir()
44
-
45
- # Use the round-trip functionality from json_to_parquet
46
- parquet_to_folder(str(parquet_path), str(output_dir))
47
-
48
- # Create zip file
49
- zip_path = temp_path / f"{selected_leaderboard}_export.zip"
50
- with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
51
- for json_file in output_dir.rglob("*.json"):
52
- arcname = json_file.relative_to(output_dir)
53
- zipf.write(json_file, arcname)
54
-
55
- # Copy to a permanent location for download
56
- final_zip = Path(tempfile.gettempdir()) / f"{selected_leaderboard}_export.zip"
57
- shutil.copy(zip_path, final_zip)
58
-
59
- return str(final_zip)
60
- except Exception as e:
61
- print(f"Export error: {e}")
62
- return None
63
 
64
-
65
- def update_leaderboard_table(selected_leaderboard, search_query="", progress=gr.Progress()):
66
  """Loads and aggregates data for the selected leaderboard."""
67
  if not selected_leaderboard:
68
- return pd.DataFrame(), "", format_leaderboard_header(None, {}), format_metric_details(None, {})
 
 
 
 
 
 
 
 
 
69
 
70
  metadata = get_eval_metadata(selected_leaderboard)
71
 
@@ -73,110 +47,261 @@ def update_leaderboard_table(selected_leaderboard, search_query="", progress=gr.
73
  progress(value, desc=desc)
74
 
75
  df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
76
- total_count = len(df)
77
 
78
- # Apply search filter (searches all columns)
79
  if search_query and not df.empty:
80
  mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
81
  df = df[mask]
82
 
83
- # Build search status message
84
- if search_query:
85
- search_msg = f"Showing {len(df)} of {total_count} results for '{search_query}'"
86
- else:
87
- search_msg = f"Showing {len(df)} results"
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- return df, search_msg, format_leaderboard_header(selected_leaderboard, metadata), format_metric_details(selected_leaderboard, metadata)
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
 
92
- # Load HF dataset BEFORE building the interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  load_hf_dataset_on_startup()
94
 
95
- # Build Gradio interface
96
- with gr.Blocks(title="Eval Leaderboard", theme=get_theme(), css=get_custom_css()) as demo:
97
 
98
- with gr.Row(variant="compact", elem_classes="header-row"):
99
- with gr.Column(scale=1):
100
- gr.Markdown("# 🏆 Evaluation Leaderboard")
101
- gr.Markdown("Analyze and compare model performance metrics.", elem_classes="subtitle")
 
 
 
 
 
 
 
 
 
102
 
103
- with gr.Row(variant="panel"):
104
- initial_choices = get_available_leaderboards()
105
- initial_value = initial_choices[0] if initial_choices else None
106
-
107
- with gr.Column(scale=3):
108
- leaderboard_selector = gr.Dropdown(
109
- choices=initial_choices,
110
- value=initial_value,
111
- label="Current Leaderboard",
112
- interactive=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  )
114
- with gr.Column(scale=3):
115
- search_box = gr.Textbox(
116
- label="Search",
117
- placeholder="Type to search across all columns...",
118
- show_label=False
119
- )
120
- with gr.Column(scale=1):
121
- refresh_btn = gr.Button("🔄 Refresh", variant="secondary", size="sm")
122
-
123
- with gr.Accordion("ℹ️ How to Submit Data", open=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  gr.Markdown("""
125
- ### Submitting Evaluation Data
126
-
127
- **Data submissions happen via GitHub Pull Requests:**
128
-
129
- 1. **Fork** [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
130
- 2. **Add your JSON files** to `data/<leaderboard>/<developer>/<model>/`
131
- 3. **Create a Pull Request**
132
- 4. **Automated validation** checks your data
133
- 5. **After merge**: GitHub Actions automatically syncs to HuggingFace
134
- 6. **Refresh this page** to see your data!
135
 
136
- #### File Structure
137
- ```
138
- data/
139
- └── YourBenchmark/
140
- └── developer_name/
141
- └── model_name/
142
- └── {uuid}.json
143
- ```
144
 
145
- Each JSON file should follow the schema and be named with a unique UUID.
146
-
147
- 📖 [**Full Submission Guide**](https://github.com/evaleval/every_eval_ever#contributor-guide) |
148
- 📋 [**JSON Schema**](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json) |
149
- 👀 [**See Examples**](https://github.com/evaleval/every_eval_ever/tree/main/data)
150
  """)
151
-
152
- init_df, init_search_msg, init_header, init_metrics = update_leaderboard_table(initial_value)
153
 
154
- header_view = gr.HTML(value=init_header)
 
 
155
 
156
- search_info = gr.Markdown(value=init_search_msg)
 
157
 
158
- leaderboard_table = gr.Dataframe(
159
- value=init_df,
160
- label=None,
161
- interactive=False,
162
- wrap=True,
163
- elem_classes="dataframe"
164
- )
165
-
166
- metrics_view = gr.HTML(value=init_metrics)
167
 
 
 
168
 
169
- # Event handlers
170
  leaderboard_selector.change(
 
 
 
 
171
  fn=update_leaderboard_table,
172
- inputs=[leaderboard_selector, search_box],
173
- outputs=[leaderboard_table, search_info, header_view, metrics_view]
174
  )
175
 
176
  search_box.input(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  fn=update_leaderboard_table,
178
- inputs=[leaderboard_selector, search_box],
179
- outputs=[leaderboard_table, search_info, header_view, metrics_view]
180
  )
181
 
182
  refresh_btn.click(
@@ -184,10 +309,40 @@ Each JSON file should follow the schema and be named with a unique UUID.
184
  outputs=[leaderboard_selector]
185
  ).then(
186
  fn=lambda: clear_cache()
 
 
 
 
187
  ).then(
188
  fn=update_leaderboard_table,
189
- inputs=[leaderboard_selector, search_box],
190
- outputs=[leaderboard_table, search_info, header_view, metrics_view]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  )
192
 
193
  DATA_DIR.mkdir(exist_ok=True)
 
6
  import pandas as pd
7
  from pathlib import Path
8
 
 
9
  from data_loader import (
10
  load_hf_dataset_on_startup,
11
  get_available_leaderboards,
12
  get_eval_metadata,
13
  build_leaderboard_table,
14
  clear_cache,
15
+ search_model_across_leaderboards,
16
+ get_all_model_names,
17
  DATA_DIR
18
  )
19
+ from ui_components import (
20
+ get_theme,
21
+ get_custom_css,
22
+ format_leaderboard_header,
23
+ format_metric_details,
24
+ format_model_card,
25
+ )
26
 
27
+ PAGE_SIZE = 50
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
+ def update_leaderboard_table(selected_leaderboard, search_query="", current_page=1, sort_column=None, progress=gr.Progress()):
 
31
  """Loads and aggregates data for the selected leaderboard."""
32
  if not selected_leaderboard:
33
+ return (
34
+ pd.DataFrame(),
35
+ format_leaderboard_header(None, {}),
36
+ format_metric_details(None, {}),
37
+ gr.update(choices=[], value=None),
38
+ gr.update(interactive=False),
39
+ gr.update(interactive=False),
40
+ gr.update(choices=[], value=None),
41
+ "0 / 0",
42
+ )
43
 
44
  metadata = get_eval_metadata(selected_leaderboard)
45
 
 
47
  progress(value, desc=desc)
48
 
49
  df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
 
50
 
 
51
  if search_query and not df.empty:
52
  mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
53
  df = df[mask]
54
 
55
+ filtered_count = len(df)
56
+
57
+ if sort_column and sort_column in df.columns and not df.empty:
58
+ df = df.sort_values(by=sort_column, ascending=False, na_position='last')
59
+
60
+ total_pages = max(1, (filtered_count + PAGE_SIZE - 1) // PAGE_SIZE) if filtered_count > 0 else 1
61
+ current_page = max(1, min(current_page, total_pages))
62
+
63
+ start_idx = (current_page - 1) * PAGE_SIZE
64
+ end_idx = start_idx + PAGE_SIZE
65
+ df_paginated = df.iloc[start_idx:end_idx] if not df.empty else df
66
+
67
+ page_choices = [str(i) for i in range(1, total_pages + 1)]
68
+ page_dropdown = gr.update(choices=page_choices, value=str(current_page))
69
+ prev_btn = gr.update(interactive=(current_page > 1))
70
+ next_btn = gr.update(interactive=(current_page < total_pages))
71
+ page_info = f"{current_page} / {total_pages}"
72
 
73
+ sort_choices = list(df.columns) if not df.empty else []
74
+ default_sort = sort_column if sort_column and sort_column in sort_choices else ("Average" if "Average" in sort_choices else (sort_choices[0] if sort_choices else None))
75
+ sort_column_update = gr.update(choices=sort_choices, value=default_sort)
76
+
77
+ return (
78
+ df_paginated,
79
+ format_leaderboard_header(selected_leaderboard, metadata),
80
+ format_metric_details(selected_leaderboard, metadata),
81
+ page_dropdown,
82
+ prev_btn,
83
+ next_btn,
84
+ sort_column_update,
85
+ page_info,
86
+ )
87
 
88
 
89
+ def search_model(model_query):
90
+ """Search for a model and return formatted card."""
91
+ if not model_query or len(model_query) < 2:
92
+ return """
93
+ <div class="no-results">
94
+ <h3>Search for a model</h3>
95
+ <p>Enter a model name to see its benchmarks across all leaderboards</p>
96
+ </div>
97
+ """
98
+
99
+ results, _ = search_model_across_leaderboards(model_query)
100
+
101
+ if not results:
102
+ return f"""
103
+ <div class="no-results">
104
+ <h3>No results for "{model_query}"</h3>
105
+ <p>Try a different model name or check the spelling</p>
106
+ </div>
107
+ """
108
+
109
+ # Use the first matching model
110
+ model_name = list(results.keys())[0]
111
+ model_data = results[model_name]
112
+
113
+ return format_model_card(model_name, model_data)
114
+
115
+
116
+ def get_model_suggestions(query):
117
+ """Get model name suggestions for autocomplete."""
118
+ if not query or len(query) < 2:
119
+ return gr.update(choices=[])
120
+
121
+ _, matches = search_model_across_leaderboards(query)
122
+ return gr.update(choices=matches[:15])
123
+
124
+
125
+ # Load data at startup
126
  load_hf_dataset_on_startup()
127
 
128
+ # Build interface
129
+ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css()) as demo:
130
 
131
+ # Header
132
+ gr.HTML("""
133
+ <div class="app-header">
134
+ <div class="logo-mark">E³</div>
135
+ <div class="brand">
136
+ <h1>Every Eval Ever</h1>
137
+ <span class="tagline">Browse and compare model benchmarks</span>
138
+ </div>
139
+ <div class="header-right">
140
+ <span class="version-badge">beta</span>
141
+ </div>
142
+ </div>
143
+ """)
144
 
145
+ with gr.Tabs():
146
+ # === TAB 1: Leaderboard View ===
147
+ with gr.TabItem("📊 Leaderboards"):
148
+ with gr.Row(elem_classes="controls-bar"):
149
+ initial_choices = get_available_leaderboards()
150
+ initial_value = initial_choices[0] if initial_choices else None
151
+
152
+ with gr.Column(scale=2, min_width=200):
153
+ leaderboard_selector = gr.Dropdown(
154
+ choices=initial_choices,
155
+ value=initial_value,
156
+ label="Leaderboard",
157
+ interactive=True
158
+ )
159
+ with gr.Column(scale=3, min_width=250):
160
+ search_box = gr.Textbox(
161
+ label="Filter",
162
+ placeholder="Filter models...",
163
+ show_label=True
164
+ )
165
+ with gr.Column(scale=1, min_width=100):
166
+ refresh_btn = gr.Button("↻ Refresh", variant="secondary", size="sm")
167
+
168
+ init_df, init_header, init_metrics, init_page_dropdown, init_prev, init_next, init_sort_cols, init_page_info = update_leaderboard_table(initial_value, "", 1, "Average")
169
+
170
+ header_view = gr.HTML(value=init_header)
171
+
172
+ # Hidden sort state (default to Average)
173
+ sort_column_dropdown = gr.Dropdown(
174
+ choices=init_sort_cols.get("choices", []) if hasattr(init_sort_cols, 'get') else [],
175
+ value=init_sort_cols.get("value") if hasattr(init_sort_cols, 'get') else None,
176
+ visible=False,
177
  )
178
+
179
+ leaderboard_table = gr.Dataframe(
180
+ value=init_df,
181
+ label=None,
182
+ interactive=False,
183
+ wrap=False,
184
+ elem_classes="dataframe",
185
+ column_widths=["28%", "12%", "7%", "7%"]
186
+ )
187
+
188
+ # Pagination below table - centered
189
+ with gr.Row(elem_classes="pagination-bar"):
190
+ prev_btn = gr.Button("←", variant="secondary", size="sm", min_width=60)
191
+ page_info = gr.Markdown(value=init_page_info, elem_classes="page-info")
192
+ next_btn = gr.Button("→", variant="secondary", size="sm", min_width=60)
193
+ page_dropdown = gr.Dropdown(
194
+ choices=[],
195
+ value="1",
196
+ visible=False,
197
+ )
198
+
199
+ metrics_view = gr.HTML(value=init_metrics)
200
+
201
+ # === TAB 2: Model View ===
202
+ with gr.TabItem("🔍 Model Lookup"):
203
+ gr.Markdown("### Find a model's benchmarks across all leaderboards")
204
+
205
+ with gr.Row(elem_classes="controls-bar"):
206
+ with gr.Column(scale=4):
207
+ model_search_dropdown = gr.Dropdown(
208
+ choices=[],
209
+ label="Model Name",
210
+ allow_custom_value=True,
211
+ interactive=True,
212
+ filterable=True,
213
+ )
214
+ with gr.Column(scale=1, min_width=100):
215
+ model_search_btn = gr.Button("Search", variant="primary", size="sm")
216
+
217
+ model_card_view = gr.HTML(value="""
218
+ <div class="no-results">
219
+ <h3>Search for a model</h3>
220
+ <p>Start typing to see suggestions, then select a model</p>
221
+ </div>
222
+ """)
223
+
224
+ # Submission guide
225
+ with gr.Accordion("📤 How to Submit Data", open=False):
226
  gr.Markdown("""
227
+ **Submit via GitHub Pull Request:**
 
 
 
 
 
 
 
 
 
228
 
229
+ 1. Fork [evaleval/every_eval_ever](https://github.com/evaleval/every_eval_ever)
230
+ 2. Add JSON files to `data/<leaderboard>/<developer>/<model>/`
231
+ 3. Open a PR — automated validation runs on submission
232
+ 4. After merge, data syncs to HuggingFace automatically
 
 
 
 
233
 
234
+ [Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) · [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
 
 
 
 
235
  """)
 
 
236
 
237
+ # === State ===
238
+ current_page_state = gr.State(value=1)
239
+ sort_column_state = gr.State(value="Average")
240
 
241
+ def go_prev(current):
242
+ return max(1, current - 1)
243
 
244
+ def go_next(current):
245
+ return current + 1
 
 
 
 
 
 
 
246
 
247
+ def reset_page():
248
+ return 1
249
 
250
+ # === Leaderboard Events ===
251
  leaderboard_selector.change(
252
+ fn=reset_page, outputs=[current_page_state]
253
+ ).then(
254
+ fn=lambda: "Average", outputs=[sort_column_state]
255
+ ).then(
256
  fn=update_leaderboard_table,
257
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
258
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
259
  )
260
 
261
  search_box.input(
262
+ fn=reset_page, outputs=[current_page_state]
263
+ ).then(
264
+ fn=update_leaderboard_table,
265
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
266
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
267
+ )
268
+
269
+ sort_column_dropdown.change(
270
+ fn=lambda col: col,
271
+ inputs=[sort_column_dropdown],
272
+ outputs=[sort_column_state]
273
+ ).then(
274
+ fn=reset_page, outputs=[current_page_state]
275
+ ).then(
276
+ fn=update_leaderboard_table,
277
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
278
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
279
+ )
280
+
281
+ page_dropdown.change(
282
+ fn=lambda p: int(p) if p else 1,
283
+ inputs=[page_dropdown],
284
+ outputs=[current_page_state]
285
+ ).then(
286
+ fn=update_leaderboard_table,
287
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
288
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
289
+ )
290
+
291
+ prev_btn.click(
292
+ fn=go_prev, inputs=[current_page_state], outputs=[current_page_state]
293
+ ).then(
294
+ fn=update_leaderboard_table,
295
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
296
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
297
+ )
298
+
299
+ next_btn.click(
300
+ fn=go_next, inputs=[current_page_state], outputs=[current_page_state]
301
+ ).then(
302
  fn=update_leaderboard_table,
303
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
304
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
305
  )
306
 
307
  refresh_btn.click(
 
309
  outputs=[leaderboard_selector]
310
  ).then(
311
  fn=lambda: clear_cache()
312
+ ).then(
313
+ fn=reset_page, outputs=[current_page_state]
314
+ ).then(
315
+ fn=lambda: "Average", outputs=[sort_column_state]
316
  ).then(
317
  fn=update_leaderboard_table,
318
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
319
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
320
+ )
321
+
322
+ # === Model Search Events ===
323
+ def update_model_suggestions(query):
324
+ """Update dropdown choices based on query."""
325
+ if not query or len(query) < 2:
326
+ return gr.update(choices=[])
327
+ _, matches = search_model_across_leaderboards(query)
328
+ return gr.update(choices=matches[:20])
329
+
330
+ model_search_dropdown.input(
331
+ fn=update_model_suggestions,
332
+ inputs=[model_search_dropdown],
333
+ outputs=[model_search_dropdown]
334
+ )
335
+
336
+ model_search_btn.click(
337
+ fn=search_model,
338
+ inputs=[model_search_dropdown],
339
+ outputs=[model_card_view]
340
+ )
341
+
342
+ model_search_dropdown.select(
343
+ fn=search_model,
344
+ inputs=[model_search_dropdown],
345
+ outputs=[model_card_view]
346
  )
347
 
348
  DATA_DIR.mkdir(exist_ok=True)
data_loader.py CHANGED
@@ -296,11 +296,16 @@ def build_leaderboard_table(selected_leaderboard, search_query="", progress_call
296
  if len(eval_only_cols) > 0:
297
  df["Average"] = df[eval_only_cols].mean(axis=1).round(3)
298
 
299
- base_cols = ["Model", "Developer", "Params (B)", "Arch", "Precision", "Average"]
300
- eval_cols = [c for c in df.columns if c not in base_cols]
 
 
 
 
301
  base_cols = [c for c in base_cols if c in df.columns]
 
302
 
303
- final_cols = base_cols + sorted(eval_cols)
304
  df = df[final_cols]
305
 
306
  if "Average" in df.columns:
@@ -315,3 +320,67 @@ def clear_cache():
315
  """Clears all caches."""
316
  LEADERBOARD_CACHE.clear()
317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  if len(eval_only_cols) > 0:
297
  df["Average"] = df[eval_only_cols].mean(axis=1).round(3)
298
 
299
+ # Base columns: Model, Developer, Params, Average
300
+ # Eval columns: all evaluation scores
301
+ # Model detail columns: Arch, Precision (moved to end)
302
+ base_cols = ["Model", "Developer", "Params (B)", "Average"]
303
+ model_detail_cols = ["Arch", "Precision"]
304
+ eval_cols = [c for c in df.columns if c not in base_cols and c not in model_detail_cols]
305
  base_cols = [c for c in base_cols if c in df.columns]
306
+ model_detail_cols = [c for c in model_detail_cols if c in df.columns]
307
 
308
+ final_cols = base_cols + sorted(eval_cols) + model_detail_cols
309
  df = df[final_cols]
310
 
311
  if "Average" in df.columns:
 
320
  """Clears all caches."""
321
  LEADERBOARD_CACHE.clear()
322
 
323
+
324
+ def search_model_across_leaderboards(model_query):
325
+ """Search for a model across all leaderboards and return aggregated results."""
326
+ if not model_query or not HF_DATASET_CACHE:
327
+ return {}, []
328
+
329
+ model_query_lower = model_query.lower().strip()
330
+ results = {}
331
+ all_matches = []
332
+
333
+ for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
334
+ for item in parsed_items:
335
+ model_id = item.get("model", "")
336
+ # Check if query matches model name (case insensitive, partial match)
337
+ if model_query_lower in model_id.lower():
338
+ all_matches.append(model_id)
339
+
340
+ # Exact match gets priority
341
+ if model_id.lower() == model_query_lower or model_id == model_query:
342
+ if model_id not in results:
343
+ results[model_id] = {}
344
+ results[model_id][leaderboard_name] = {
345
+ "developer": item.get("developer"),
346
+ "params": item.get("params"),
347
+ "architecture": item.get("architecture"),
348
+ "precision": item.get("precision"),
349
+ "results": item.get("results", {})
350
+ }
351
+
352
+ # If no exact match, use partial matches
353
+ if not results and all_matches:
354
+ # Get the first partial match
355
+ for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
356
+ for item in parsed_items:
357
+ model_id = item.get("model", "")
358
+ if model_query_lower in model_id.lower():
359
+ if model_id not in results:
360
+ results[model_id] = {}
361
+ results[model_id][leaderboard_name] = {
362
+ "developer": item.get("developer"),
363
+ "params": item.get("params"),
364
+ "architecture": item.get("architecture"),
365
+ "precision": item.get("precision"),
366
+ "results": item.get("results", {})
367
+ }
368
+
369
+ # Return unique matches for autocomplete
370
+ unique_matches = sorted(set(all_matches))[:20] # Limit to 20 suggestions
371
+
372
+ return results, unique_matches
373
+
374
+
375
+ def get_all_model_names():
376
+ """Get all unique model names across all leaderboards."""
377
+ if not HF_DATASET_CACHE:
378
+ return []
379
+
380
+ models = set()
381
+ for parsed_items in HF_DATASET_CACHE.values():
382
+ for item in parsed_items:
383
+ models.add(item.get("model", ""))
384
+
385
+ return sorted(models)
386
+
json_to_parquet.py DELETED
@@ -1,228 +0,0 @@
1
-
2
-
3
- import json
4
- from pathlib import Path
5
- import pandas as pd
6
-
7
-
8
- def json_to_row(json_path: Path) -> dict:
9
- """Convert one JSON to a single row (1 JSON = 1 row, evaluations as columns)."""
10
- with open(json_path, 'r') as f:
11
- data = json.load(f)
12
-
13
- required_fields = ["schema_version", "evaluation_id", "evaluation_source", "retrieved_timestamp",
14
- "source_data", "source_metadata", "model_info", "evaluation_results"]
15
- for field in required_fields:
16
- if field not in data:
17
- raise ValueError(f"{json_path}: Missing required field '{field}'")
18
-
19
- if "evaluation_source_name" not in data["evaluation_source"]:
20
- raise ValueError(f"{json_path}: Missing required field 'evaluation_source.evaluation_source_name'")
21
- if "evaluation_source_type" not in data["evaluation_source"]:
22
- raise ValueError(f"{json_path}: Missing required field 'evaluation_source.evaluation_source_type'")
23
-
24
- if "source_organization_name" not in data["source_metadata"]:
25
- raise ValueError(f"{json_path}: Missing required field 'source_metadata.source_organization_name'")
26
- if "evaluator_relationship" not in data["source_metadata"]:
27
- raise ValueError(f"{json_path}: Missing required field 'source_metadata.evaluator_relationship'")
28
-
29
- if "name" not in data["model_info"]:
30
- raise ValueError(f"{json_path}: Missing required field 'model_info.name'")
31
- if "id" not in data["model_info"]:
32
- raise ValueError(f"{json_path}: Missing required field 'model_info.id'")
33
- if "developer" not in data["model_info"]:
34
- raise ValueError(f"{json_path}: Missing required field 'model_info.developer'")
35
-
36
- leaderboard = data["evaluation_source"]["evaluation_source_name"]
37
- model = data["model_info"]["id"]
38
- uuid = json_path.stem
39
- developer = data["model_info"]["developer"]
40
-
41
- # Validate evaluation results
42
- for eval_result in data["evaluation_results"]:
43
- if "evaluation_name" not in eval_result:
44
- raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].evaluation_name'")
45
- if "metric_config" not in eval_result:
46
- raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].metric_config'")
47
- if "score_details" not in eval_result:
48
- raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].score_details'")
49
-
50
- if "lower_is_better" not in eval_result["metric_config"]:
51
- raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].metric_config.lower_is_better'")
52
- if "score" not in eval_result["score_details"]:
53
- raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].score_details.score'")
54
-
55
- row = {
56
- # Folder structure (for reconstruction)
57
- "_leaderboard": leaderboard,
58
- "_developer": developer,
59
- "_model": model,
60
- "_uuid": uuid,
61
-
62
- # Required top-level fields
63
- "schema_version": data["schema_version"],
64
- "evaluation_id": data["evaluation_id"],
65
- "retrieved_timestamp": data["retrieved_timestamp"],
66
- "source_data": json.dumps(data["source_data"]),
67
-
68
- # Required nested fields
69
- "evaluation_source_name": data["evaluation_source"]["evaluation_source_name"],
70
- "evaluation_source_type": data["evaluation_source"]["evaluation_source_type"],
71
-
72
- "source_organization_name": data["source_metadata"]["source_organization_name"],
73
- "source_organization_url": data["source_metadata"].get("source_organization_url"),
74
- "source_organization_logo_url": data["source_metadata"].get("source_organization_logo_url"),
75
- "evaluator_relationship": data["source_metadata"]["evaluator_relationship"],
76
-
77
- "model_name": data["model_info"]["name"],
78
- "model_id": data["model_info"]["id"],
79
- "model_developer": data["model_info"]["developer"],
80
- "model_inference_platform": data["model_info"].get("inference_platform"),
81
-
82
- # Store full evaluation_results and additional_details as JSON
83
- "evaluation_results": json.dumps(data["evaluation_results"]),
84
- "additional_details": json.dumps(data["additional_details"]) if "additional_details" in data else None,
85
- }
86
-
87
- return row
88
-
89
-
90
- def add_to_parquet(json_or_folder: str, parquet_file: str):
91
- """
92
- Add JSON(s) to Parquet file.
93
- Creates new file if it doesn't exist, appends and deduplicates if it does.
94
-
95
- Args:
96
- json_or_folder: Path to single JSON file or folder containing JSONs
97
- parquet_file: Output Parquet file path
98
- """
99
- input_path = Path(json_or_folder)
100
-
101
- if input_path.is_file():
102
- json_files = [input_path]
103
- elif input_path.is_dir():
104
- json_files = list(input_path.rglob("*.json"))
105
- if not json_files:
106
- raise ValueError(f"No JSON files found in directory: {json_or_folder}")
107
- else:
108
- raise ValueError(f"Invalid input: {json_or_folder}")
109
-
110
- print(f"Processing {len(json_files)} JSON file(s)...")
111
-
112
- parquet_path = Path(parquet_file)
113
- if parquet_path.exists():
114
- existing_df = pd.read_parquet(parquet_file)
115
- existing_keys = set(
116
- existing_df[["_leaderboard", "_developer", "_model", "_uuid"]]
117
- .apply(tuple, axis=1)
118
- )
119
- print(f"Found {len(existing_df)} existing rows")
120
- else:
121
- existing_df = None
122
- existing_keys = set()
123
-
124
- all_rows = []
125
- skipped = 0
126
- for i, jf in enumerate(json_files, 1):
127
- if i % 100 == 0:
128
- print(f" {i}/{len(json_files)}")
129
-
130
- row = json_to_row(jf)
131
- key = (row["_leaderboard"], row["_developer"], row["_model"], row["_uuid"])
132
- if key not in existing_keys:
133
- all_rows.append(row)
134
- existing_keys.add(key)
135
- else:
136
- skipped += 1
137
-
138
- if skipped > 0:
139
- print(f" Skipped {skipped} duplicate file(s)")
140
-
141
- # Handle case where no new rows to add
142
- if not all_rows:
143
- if existing_df is not None:
144
- print(f"No new files to add, keeping existing {len(existing_df)} file(s)")
145
- return
146
- else:
147
- raise ValueError("No valid JSON files to process and no existing parquet file")
148
-
149
- new_df = pd.DataFrame(all_rows)
150
-
151
- if existing_df is not None:
152
- df = pd.concat([existing_df, new_df], ignore_index=True)
153
- print(f"Added {len(new_df)} new file(s) to existing {len(existing_df)} file(s)")
154
- else:
155
- df = new_df
156
-
157
- df.to_parquet(parquet_file, index=False)
158
- print(f"Saved {len(df)} total file(s) to {parquet_file} ({parquet_path.stat().st_size / 1024 / 1024:.1f} MB)")
159
-
160
-
161
- def parquet_to_folder(parquet_file: str, output_dir: str):
162
- """Reconstruct folder structure from Parquet."""
163
- df = pd.read_parquet(parquet_file)
164
- out = Path(output_dir)
165
-
166
- for _, row in df.iterrows():
167
- lb = row["_leaderboard"]
168
- dev = row["_developer"]
169
- model = row["_model"]
170
- uuid = row["_uuid"]
171
-
172
- json_data = {
173
- "schema_version": row["schema_version"],
174
- "evaluation_id": row["evaluation_id"],
175
- "retrieved_timestamp": row["retrieved_timestamp"],
176
- "source_data": json.loads(row["source_data"]),
177
- "evaluation_source": {
178
- "evaluation_source_name": row["evaluation_source_name"],
179
- "evaluation_source_type": row["evaluation_source_type"]
180
- },
181
- "source_metadata": {
182
- "source_organization_name": row["source_organization_name"],
183
- "evaluator_relationship": row["evaluator_relationship"]
184
- },
185
- "model_info": {
186
- "name": row["model_name"],
187
- "id": row["model_id"],
188
- "developer": row["model_developer"]
189
- },
190
- "evaluation_results": json.loads(row["evaluation_results"])
191
- }
192
-
193
- if pd.notna(row["source_organization_url"]):
194
- json_data["source_metadata"]["source_organization_url"] = row["source_organization_url"]
195
- if pd.notna(row["source_organization_logo_url"]):
196
- json_data["source_metadata"]["source_organization_logo_url"] = row["source_organization_logo_url"]
197
-
198
- if pd.notna(row["model_inference_platform"]):
199
- json_data["model_info"]["inference_platform"] = row["model_inference_platform"]
200
-
201
- if pd.notna(row["additional_details"]):
202
- json_data["additional_details"] = json.loads(row["additional_details"])
203
-
204
- file_path = out / lb / dev / model / f"{uuid}.json"
205
- file_path.parent.mkdir(parents=True, exist_ok=True)
206
- with open(file_path, 'w') as f:
207
- json.dump(json_data, f, indent=2)
208
-
209
- print(f"Reconstructed {len(df)} files to {output_dir}")
210
-
211
-
212
- if __name__ == "__main__":
213
- import sys
214
-
215
- if len(sys.argv) < 2:
216
- print("Usage:")
217
- print(" python json_to_parquet.py add <json_or_folder> <output.parquet>")
218
- print(" python json_to_parquet.py export <input.parquet> <output_dir>")
219
- sys.exit(1)
220
-
221
- cmd = sys.argv[1]
222
-
223
- if cmd == "add":
224
- add_to_parquet(sys.argv[2], sys.argv[3])
225
- elif cmd == "export":
226
- parquet_to_folder(sys.argv[2], sys.argv[3])
227
- else:
228
- print(f"Unknown command: {cmd}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/convert_to_parquet.py DELETED
@@ -1,142 +0,0 @@
1
- """
2
- Incremental parquet conversion with HuggingFace sync.
3
-
4
- Optimized workflow:
5
- 1. Detect changed leaderboards via git diff (instant!)
6
- 2. Download ONLY changed parquets from HF (fast!)
7
- 3. Re-convert ONLY changed leaderboards
8
- 4. Ready for upload (handled by upload_to_hf.py)
9
-
10
- This avoids downloading and processing unchanged leaderboards.
11
- """
12
-
13
- from pathlib import Path
14
- import sys
15
- import subprocess
16
- import os
17
- import json
18
- from datasets import load_dataset
19
-
20
- sys.path.insert(0, str(Path(__file__).parent.resolve().parent))
21
-
22
- from json_to_parquet import add_to_parquet
23
-
24
- HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "deepmage121/eee_test")
25
-
26
- def download_leaderboards(output_dir: Path, leaderboard_names: set[str]) -> set[str]:
27
- """Download existing leaderboard parquets from HuggingFace."""
28
- try:
29
- dataset_dict = load_dataset(HF_DATASET_REPO)
30
- downloaded: set[str] = set()
31
-
32
- for lb in leaderboard_names:
33
- if lb in dataset_dict:
34
- print(f" Downloading {lb}")
35
- dataset_dict[lb].to_pandas().to_parquet(output_dir / f"{lb}.parquet", index=False)
36
- downloaded.add(lb)
37
- else:
38
- print(f" {lb} (new)")
39
-
40
- print(f"Downloaded {len(downloaded)}/{len(leaderboard_names)} parquet(s)")
41
- return downloaded
42
-
43
- except Exception as e:
44
- print(f"HF download failed: {e}")
45
- sys.exit(1)
46
-
47
-
48
- def detect_modified_leaderboards() -> set[str]:
49
- """Get leaderboards with changed JSONs via git diff (HEAD~1)."""
50
- try:
51
- result = subprocess.run(
52
- ["git", "diff", "--name-only", "HEAD~1", "HEAD", "data/"],
53
- capture_output=True, text=True, check=True
54
- )
55
-
56
- changed_files = result.stdout.strip().split('\n')
57
- if not changed_files or changed_files == ['']:
58
- print("No changes detected in data/")
59
- return set()
60
-
61
- leaderboards = {
62
- Path(f).parts[1]
63
- for f in changed_files
64
- if f.startswith('data/') and f.endswith('.json') and len(Path(f).parts) >= 2
65
- }
66
- return leaderboards
67
-
68
- except subprocess.CalledProcessError as e:
69
- print(f"ERROR: Git command failed: {e}")
70
- sys.exit(1)
71
-
72
-
73
- def convert_changed_leaderboards():
74
- """
75
- Optimized conversion: detect changes, download only changed, re-convert only changed.
76
- """
77
-
78
- data_dir = Path("data")
79
- output_dir = Path("parquet_output")
80
- output_dir.mkdir(exist_ok=True)
81
-
82
- if not data_dir.exists():
83
- print(f"Data directory not found: {data_dir}")
84
- sys.exit(1)
85
-
86
- changed_leaderboards: set[str] = detect_modified_leaderboards()
87
-
88
- if len(changed_leaderboards) == 0:
89
- print("No changes detected, nothing to upload")
90
- manifest = {"changed": [], "converted": []}
91
- (output_dir / "changed_leaderboards.json").write_text(json.dumps(manifest, indent=2))
92
- sys.exit(0)
93
-
94
- print(f"Detected {len(changed_leaderboards)} changed leaderboard(s):")
95
- for lb in changed_leaderboards:
96
- print(f" {lb}")
97
-
98
- downloaded = download_leaderboards(output_dir, changed_leaderboards)
99
-
100
- converted_count = 0
101
- error_count = 0
102
- converted_leaderboards = []
103
-
104
- for leaderboard_name in changed_leaderboards:
105
- leaderboard_dir = os.path.join(data_dir, leaderboard_name)
106
-
107
- parquet_path = os.path.join(output_dir, f"{leaderboard_name}.parquet")
108
-
109
- print(f"\nConverting: {leaderboard_name}")
110
-
111
- try:
112
- add_to_parquet(json_or_folder=str(leaderboard_dir), parquet_file=str(parquet_path))
113
-
114
- print(f" Converted to {parquet_path}")
115
- converted_count += 1
116
- converted_leaderboards.append(leaderboard_name)
117
-
118
- except Exception as e:
119
- print(f" Error: {e}")
120
- error_count += 1
121
-
122
- manifest = {
123
- "changed": list(changed_leaderboards),
124
- "converted": converted_leaderboards,
125
- "downloaded": list(downloaded),
126
- "errors": error_count
127
- }
128
- manifest_path = os.path.join(output_dir, "changed_leaderboards.json")
129
- with open(manifest_path, 'w') as f:
130
- json.dump(manifest, f, indent=2)
131
-
132
- if error_count > 0:
133
- sys.exit(1)
134
-
135
- if converted_count == 0:
136
- print("Warning: No parquet files successfully converted!")
137
- sys.exit(1)
138
-
139
-
140
- if __name__ == "__main__":
141
- convert_changed_leaderboards()
142
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/json_to_parquet.py DELETED
@@ -1,222 +0,0 @@
1
- """
2
- Convert evaluation JSONs to Parquet for HF Datasets.
3
- Input: single JSON or folder of JSONs (any structure)
4
- Output: Parquet with all data + reconstructable folder structure
5
- """
6
-
7
- import json
8
- from pathlib import Path
9
- import pandas as pd
10
-
11
-
12
- def json_to_row(json_path: Path) -> dict:
13
- """Convert one JSON to a single row (1 JSON = 1 row, evaluations as columns)."""
14
- with open(json_path, 'r') as f:
15
- data = json.load(f)
16
-
17
- required_fields = ["schema_version", "evaluation_id", "evaluation_source", "retrieved_timestamp",
18
- "source_data", "source_metadata", "model_info", "evaluation_results"]
19
- for field in required_fields:
20
- if field not in data:
21
- raise ValueError(f"{json_path}: Missing required field '{field}'")
22
-
23
- if "evaluation_source_name" not in data["evaluation_source"]:
24
- raise ValueError(f"{json_path}: Missing required field 'evaluation_source.evaluation_source_name'")
25
- if "evaluation_source_type" not in data["evaluation_source"]:
26
- raise ValueError(f"{json_path}: Missing required field 'evaluation_source.evaluation_source_type'")
27
-
28
- if "source_organization_name" not in data["source_metadata"]:
29
- raise ValueError(f"{json_path}: Missing required field 'source_metadata.source_organization_name'")
30
- if "evaluator_relationship" not in data["source_metadata"]:
31
- raise ValueError(f"{json_path}: Missing required field 'source_metadata.evaluator_relationship'")
32
-
33
- if "name" not in data["model_info"]:
34
- raise ValueError(f"{json_path}: Missing required field 'model_info.name'")
35
- if "id" not in data["model_info"]:
36
- raise ValueError(f"{json_path}: Missing required field 'model_info.id'")
37
- if "developer" not in data["model_info"]:
38
- raise ValueError(f"{json_path}: Missing required field 'model_info.developer'")
39
-
40
- leaderboard = data["evaluation_source"]["evaluation_source_name"]
41
- model = data["model_info"]["id"]
42
- uuid = json_path.stem
43
- developer = data["model_info"]["developer"]
44
-
45
- # Validate evaluation results
46
- for eval_result in data["evaluation_results"]:
47
- if "evaluation_name" not in eval_result:
48
- raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].evaluation_name'")
49
- if "metric_config" not in eval_result:
50
- raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].metric_config'")
51
- if "score_details" not in eval_result:
52
- raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].score_details'")
53
-
54
- if "lower_is_better" not in eval_result["metric_config"]:
55
- raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].metric_config.lower_is_better'")
56
- if "score" not in eval_result["score_details"]:
57
- raise ValueError(f"{json_path}: Missing required field 'evaluation_results[].score_details.score'")
58
-
59
- row = {
60
- # Folder structure (for reconstruction)
61
- "_leaderboard": leaderboard,
62
- "_developer": developer,
63
- "_model": model,
64
- "_uuid": uuid,
65
-
66
- # Required top-level fields
67
- "schema_version": data["schema_version"],
68
- "evaluation_id": data["evaluation_id"],
69
- "retrieved_timestamp": data["retrieved_timestamp"],
70
- "source_data": json.dumps(data["source_data"]),
71
-
72
- # Required nested fields
73
- "evaluation_source_name": data["evaluation_source"]["evaluation_source_name"],
74
- "evaluation_source_type": data["evaluation_source"]["evaluation_source_type"],
75
-
76
- "source_organization_name": data["source_metadata"]["source_organization_name"],
77
- "source_organization_url": data["source_metadata"].get("source_organization_url"),
78
- "source_organization_logo_url": data["source_metadata"].get("source_organization_logo_url"),
79
- "evaluator_relationship": data["source_metadata"]["evaluator_relationship"],
80
-
81
- "model_name": data["model_info"]["name"],
82
- "model_id": data["model_info"]["id"],
83
- "model_developer": data["model_info"]["developer"],
84
- "model_inference_platform": data["model_info"].get("inference_platform"),
85
-
86
- # Store full evaluation_results and additional_details as JSON
87
- "evaluation_results": json.dumps(data["evaluation_results"]),
88
- "additional_details": json.dumps(data["additional_details"]) if "additional_details" in data else None,
89
- }
90
-
91
- return row
92
-
93
-
94
- def add_to_parquet(json_input: str, parquet_file: str):
95
- """
96
- Add JSON(s) to Parquet file.
97
- Creates new file if it doesn't exist, appends and deduplicates if it does.
98
-
99
- Args:
100
- json_input: Path to single JSON file or folder containing JSONs
101
- parquet_file: Output Parquet file path
102
- """
103
- input_path = Path(json_input)
104
-
105
- if input_path.is_file():
106
- json_files = [input_path]
107
- elif input_path.is_dir():
108
- json_files = list(input_path.rglob("*.json"))
109
- else:
110
- raise ValueError(f"Invalid input: {json_input}")
111
-
112
- print(f"Processing {len(json_files)} JSON file(s)...")
113
-
114
- parquet_path = Path(parquet_file)
115
- if parquet_path.exists():
116
- existing_df = pd.read_parquet(parquet_file)
117
- existing_keys = set(
118
- existing_df[["_leaderboard", "_developer", "_model", "_uuid"]]
119
- .apply(tuple, axis=1)
120
- )
121
- print(f"Found {len(existing_df)} existing rows")
122
- else:
123
- existing_df = None
124
- existing_keys = set()
125
-
126
- all_rows = []
127
- skipped = 0
128
- for i, jf in enumerate(json_files, 1):
129
- if i % 100 == 0:
130
- print(f" {i}/{len(json_files)}")
131
-
132
- row = json_to_row(jf)
133
- key = (row["_leaderboard"], row["_developer"], row["_model"], row["_uuid"])
134
- if key not in existing_keys:
135
- all_rows.append(row)
136
- existing_keys.add(key)
137
- else:
138
- skipped += 1
139
-
140
- if skipped > 0:
141
- print(f" Skipped {skipped} duplicate file(s)")
142
-
143
- new_df = pd.DataFrame(all_rows)
144
-
145
- if existing_df is not None:
146
- df = pd.concat([existing_df, new_df], ignore_index=True)
147
- print(f"Added {len(new_df)} new file(s) to existing {len(existing_df)} file(s)")
148
- else:
149
- df = new_df
150
-
151
- df.to_parquet(parquet_file, index=False)
152
- print(f"Saved {len(df)} total file(s) to {parquet_file} ({parquet_path.stat().st_size / 1024 / 1024:.1f} MB)")
153
-
154
-
155
- def parquet_to_folder(parquet_file: str, output_dir: str):
156
- """Reconstruct folder structure from Parquet."""
157
- df = pd.read_parquet(parquet_file)
158
- out = Path(output_dir)
159
-
160
- for _, row in df.iterrows():
161
- lb = row["_leaderboard"]
162
- dev = row["_developer"]
163
- model = row["_model"]
164
- uuid = row["_uuid"]
165
-
166
- json_data = {
167
- "schema_version": row["schema_version"],
168
- "evaluation_id": row["evaluation_id"],
169
- "retrieved_timestamp": row["retrieved_timestamp"],
170
- "source_data": json.loads(row["source_data"]),
171
- "evaluation_source": {
172
- "evaluation_source_name": row["evaluation_source_name"],
173
- "evaluation_source_type": row["evaluation_source_type"]
174
- },
175
- "source_metadata": {
176
- "source_organization_name": row["source_organization_name"],
177
- "evaluator_relationship": row["evaluator_relationship"]
178
- },
179
- "model_info": {
180
- "name": row["model_name"],
181
- "id": row["model_id"],
182
- "developer": row["model_developer"]
183
- },
184
- "evaluation_results": json.loads(row["evaluation_results"])
185
- }
186
-
187
- if pd.notna(row["source_organization_url"]):
188
- json_data["source_metadata"]["source_organization_url"] = row["source_organization_url"]
189
- if pd.notna(row["source_organization_logo_url"]):
190
- json_data["source_metadata"]["source_organization_logo_url"] = row["source_organization_logo_url"]
191
-
192
- if pd.notna(row["model_inference_platform"]):
193
- json_data["model_info"]["inference_platform"] = row["model_inference_platform"]
194
-
195
- if pd.notna(row["additional_details"]):
196
- json_data["additional_details"] = json.loads(row["additional_details"])
197
-
198
- file_path = out / lb / dev / model / f"{uuid}.json"
199
- file_path.parent.mkdir(parents=True, exist_ok=True)
200
- with open(file_path, 'w') as f:
201
- json.dump(json_data, f, indent=2)
202
-
203
- print(f"Reconstructed {len(df)} files to {output_dir}")
204
-
205
-
206
- if __name__ == "__main__":
207
- import sys
208
-
209
- if len(sys.argv) < 2:
210
- print("Usage:")
211
- print(" python json_to_parquet.py add <json_or_folder> <output.parquet>")
212
- print(" python json_to_parquet.py export <input.parquet> <output_dir>")
213
- sys.exit(1)
214
-
215
- cmd = sys.argv[1]
216
-
217
- if cmd == "add":
218
- add_to_parquet(sys.argv[2], sys.argv[3])
219
- elif cmd == "export":
220
- parquet_to_folder(sys.argv[2], sys.argv[3])
221
- else:
222
- print(f"Unknown command: {cmd}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui_components.py CHANGED
@@ -1,211 +1,838 @@
1
  """
2
  UI Components: Themes, CSS, and HTML formatters for the Gradio interface.
 
3
  """
4
  import gradio as gr
5
 
6
 
7
  def get_theme():
8
- """Returns the custom Gradio theme."""
9
- return gr.themes.Soft(
10
- primary_hue="slate",
11
  neutral_hue="slate",
12
- font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"]
 
13
  ).set(
14
- body_background_fill="var(--neutral-50)",
15
- block_background_fill="white",
 
 
 
 
 
 
16
  block_border_width="1px",
17
- block_title_text_weight="600"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  )
19
 
20
 
21
  def get_custom_css():
22
- """Returns custom CSS for the interface."""
23
  return """
24
- /* Clean up the global container */
 
 
 
 
 
 
 
25
  .gradio-container {
26
  max-width: 100% !important;
27
- padding: 0 2rem !important;
 
 
 
 
 
28
  }
29
 
30
- /* Hide file list in uploaders */
31
- .file-preview {
32
- display: none !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  }
34
 
35
- /* Ensure details elements work independently */
36
- details {
37
- position: relative;
38
- isolation: isolate;
39
  }
40
 
41
- details summary {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  cursor: pointer;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  }
44
  """
45
 
46
 
47
  def format_leaderboard_header(selected_leaderboard, metadata):
48
- """Formats the leaderboard header info (goes at top)."""
49
  if not selected_leaderboard:
50
  return """
51
- <div style="text-align: center; padding: 3rem; color: var(--body-text-color-subdued);">
52
- <h3>👋 Welcome to Eval Leaderboard</h3>
53
- <p>Select a leaderboard above to visualize results and metadata.</p>
54
  </div>
55
  """
56
 
57
  if not metadata or not metadata.get("evals"):
58
- return f"""<div style="padding: 1rem;">No metadata found for {selected_leaderboard}</div>"""
 
 
 
 
59
 
60
  source_info = metadata.get("source_info", {})
61
  org = source_info.get("organization", "Unknown")
62
- relationship = source_info.get("relationship", "Unknown").replace("_", " ").title()
63
  url = source_info.get("url", "#")
64
  eval_names = list(metadata["evals"].keys())
65
 
66
- # Create badges for evaluations
67
- eval_badges = "".join([f"""
68
- <span style="
69
- display: inline-block;
70
- padding: 0.25rem 0.75rem;
71
- margin: 0.25rem 0.25rem 0.25rem 0;
72
- background: var(--background-fill-primary);
73
- border: 1px solid var(--border-color-primary);
74
- border-radius: 16px;
75
- font-size: 0.8rem;
76
- color: var(--body-text-color);
77
- font-weight: 500;
78
- ">{name}</span>
79
- """ for name in eval_names])
80
 
81
  return f"""
82
- <div style="
83
- padding: 1.25rem;
84
- background: var(--background-fill-secondary);
85
- border-radius: 8px;
86
- border-left: 4px solid #667eea;
87
- ">
88
- <div style="display: flex; justify-content: space-between; align-items: flex-start; margin-bottom: 1rem;">
89
- <div style="flex: 1;">
90
- <h3 style="margin: 0 0 0.5rem 0; font-size: 1.2rem; font-weight: 600; color: var(--body-text-color);">
91
- {selected_leaderboard}
92
- </h3>
93
- <div style="font-size: 0.9rem; color: var(--body-text-color-subdued); margin-bottom: 0.75rem;">
94
- <span><strong>Source Organization:</strong> {org}</span> •
95
- <span><strong>Evaluator Relationship:</strong> {relationship}</span>
96
- </div>
97
- <div style="margin-top: 0.75rem;">
98
- <div style="font-size: 0.85rem; font-weight: 600; color: var(--body-text-color); margin-bottom: 0.5rem;">
99
- Included Evaluations:
100
- </div>
101
- <div>{eval_badges}</div>
102
- </div>
103
  </div>
104
  <a href="{url}" target="_blank" style="
105
- font-size: 0.85rem;
106
- color: var(--link-text-color);
107
  text-decoration: none;
108
- padding: 0.5rem 1rem;
 
109
  border-radius: 6px;
110
- background: var(--background-fill-primary);
111
- border: 1px solid var(--border-color-primary);
112
- transition: all 0.2s;
113
  white-space: nowrap;
114
- ">
115
- Source
116
- </a>
117
  </div>
118
  </div>
119
  """
120
 
121
 
122
  def format_metric_details(selected_leaderboard, metadata):
123
- """Formats metric detail cards (goes below table)."""
124
  if not selected_leaderboard or not metadata or not metadata.get("evals"):
125
  return ""
126
 
127
  evals = metadata.get("evals", {})
128
 
129
  html = """
130
- <h3 style="margin: 2rem 0 1rem 0; font-size: 1.1rem; font-weight: 600; color: var(--body-text-color);">
131
- 📏 Metric Details
132
- </h3>
133
-
134
- <div style="
135
- display: grid;
136
- grid-template-columns: repeat(auto-fill, minmax(350px, 1fr));
137
- gap: 1rem;
138
- ">
139
  """
140
 
141
  for eval_name, info in evals.items():
142
- score_type = info['score_type'].upper() if info.get('score_type') else "UNKNOWN"
143
  direction = "Lower is better" if info.get('lower_is_better') else "Higher is better"
144
- direction_icon = "↓" if info.get('lower_is_better') else "↑"
145
 
146
- details_content = ""
147
  if info.get('score_type') == "continuous" and info.get('min_score') is not None:
148
- details_content += f"<div><span style='opacity: 0.7;'>Range:</span> <strong>[{info['min_score']} - {info['max_score']}]</strong></div>"
149
  elif info.get('score_type') == "levels" and info.get('level_names'):
150
- levels = ", ".join(str(l) for l in info['level_names'])
151
- details_content += f"<div><span style='opacity: 0.7;'>Levels:</span> <strong>{levels}</strong></div>"
152
 
153
- if info.get('has_unknown_level'):
154
- details_content += "<div style='margin-top: 0.25rem; font-size: 0.8rem; opacity: 0.7;'>* -1 indicates Unknown</div>"
155
-
156
  html += f"""
157
- <details style="
158
- background: var(--background-fill-secondary);
159
- border: 1px solid var(--border-color-primary);
160
- border-radius: 6px;
161
- overflow: hidden;
162
- height: fit-content;
163
- ">
164
- <summary style="
165
- padding: 0.75rem 1rem;
166
- cursor: pointer;
167
- font-weight: 600;
168
- display: flex;
169
- align-items: center;
170
- justify-content: space-between;
171
- list-style: none;
172
- font-size: 0.95rem;
173
- ">
174
- <div style="display: flex; align-items: center; gap: 0.5rem;">
175
- <span style="font-size: 1.1rem; opacity: 0.8;">🏷️</span>
176
- <span style="white-space: nowrap; overflow: hidden; text-overflow: ellipsis;">{eval_name}</span>
177
- </div>
178
- <div style="display: flex; align-items: center; gap: 0.5rem;">
179
- <span style="font-size: 0.8rem; font-weight: 400; color: var(--body-text-color-subdued); white-space: nowrap;">{direction_icon} {direction}</span>
180
- </div>
181
  </summary>
182
-
183
- <div style="
184
- padding: 0.75rem 1rem;
185
- border-top: 1px solid var(--border-color-primary);
186
- background: var(--block-background-fill);
187
- font-size: 0.9rem;
188
- ">
189
- <p style="margin: 0 0 0.5rem 0; color: var(--body-text-color-subdued); line-height: 1.4;">
190
- {info['description']}
191
- </p>
192
- <div style="display: flex; justify-content: space-between; align-items: flex-end; margin-top: 0.5rem;">
193
- <div style="font-size: 0.85rem;">
194
- {details_content}
195
- </div>
196
- <span style="
197
- font-size: 0.7rem;
198
- padding: 1px 6px;
199
- border-radius: 4px;
200
- background: var(--background-fill-primary);
201
- border: 1px solid var(--border-color-primary);
202
- color: var(--body-text-color-subdued);
203
- ">{score_type}</span>
204
  </div>
205
  </div>
206
  </details>
207
  """
208
 
209
- html += "</div>"
210
  return html
211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  UI Components: Themes, CSS, and HTML formatters for the Gradio interface.
3
+ Nord color theme with balanced contrast.
4
  """
5
  import gradio as gr
6
 
7
 
8
  def get_theme():
9
+ """Returns the Nord-themed Gradio theme."""
10
+ return gr.themes.Base(
11
+ primary_hue="blue",
12
  neutral_hue="slate",
13
+ font=[gr.themes.GoogleFont("DM Sans"), "system-ui", "sans-serif"],
14
+ font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
15
  ).set(
16
+ body_background_fill="#2E3440",
17
+ body_background_fill_dark="#2E3440",
18
+ body_text_color="#ECEFF4",
19
+ body_text_color_dark="#ECEFF4",
20
+ body_text_color_subdued="#4C566A",
21
+ body_text_color_subdued_dark="#4C566A",
22
+ block_background_fill="#3B4252",
23
+ block_background_fill_dark="#3B4252",
24
  block_border_width="1px",
25
+ block_border_color="#434C5E",
26
+ block_border_color_dark="#434C5E",
27
+ block_label_text_color="#D8DEE9",
28
+ block_label_text_color_dark="#D8DEE9",
29
+ block_title_text_color="#ECEFF4",
30
+ block_title_text_color_dark="#ECEFF4",
31
+ input_background_fill="#2E3440",
32
+ input_background_fill_dark="#2E3440",
33
+ input_border_color="#4C566A",
34
+ input_border_color_dark="#4C566A",
35
+ button_primary_background_fill="#88C0D0",
36
+ button_primary_background_fill_dark="#88C0D0",
37
+ button_primary_text_color="#2E3440",
38
+ button_primary_text_color_dark="#2E3440",
39
+ button_secondary_background_fill="#434C5E",
40
+ button_secondary_background_fill_dark="#434C5E",
41
+ button_secondary_text_color="#ECEFF4",
42
+ button_secondary_text_color_dark="#ECEFF4",
43
  )
44
 
45
 
46
  def get_custom_css():
47
+ """Returns custom CSS with Nord colors."""
48
  return """
49
+ /* === Nord Theme ===
50
+ Polar Night: #2E3440 (bg), #3B4252 (surface), #434C5E, #4C566A
51
+ Snow Storm: #D8DEE9, #E5E9F0, #ECEFF4
52
+ Frost: #8FBCBB, #88C0D0, #81A1C1, #5E81AC
53
+ Aurora: #BF616A, #D08770, #EBCB8B, #A3BE8C, #B48EAD
54
+ */
55
+
56
+ /* === Base === */
57
  .gradio-container {
58
  max-width: 100% !important;
59
+ margin: 0 !important;
60
+ padding: 1.25rem 2.5rem 2rem !important;
61
+ background: #2E3440 !important;
62
+ color: #ECEFF4 !important;
63
+ font-family: 'DM Sans', system-ui, sans-serif !important;
64
+ font-size: 16px !important;
65
  }
66
 
67
+ /* === Header === */
68
+ .app-header {
69
+ display: flex;
70
+ align-items: center;
71
+ gap: 1rem;
72
+ margin-bottom: 1.5rem;
73
+ padding: 1.25rem 1.5rem;
74
+ background: #3B4252;
75
+ border: 1px solid #434C5E;
76
+ border-radius: 12px;
77
+ }
78
+
79
+ .app-header .logo-mark {
80
+ width: 48px;
81
+ height: 48px;
82
+ background: linear-gradient(135deg, #88C0D0 0%, #81A1C1 100%);
83
+ border-radius: 12px;
84
+ display: flex;
85
+ align-items: center;
86
+ justify-content: center;
87
+ font-weight: 800;
88
+ font-size: 1.1rem;
89
+ color: #2E3440;
90
+ }
91
+
92
+ .app-header .brand {
93
+ display: flex;
94
+ flex-direction: column;
95
+ gap: 0.125rem;
96
+ }
97
+
98
+ .app-header h1 {
99
+ margin: 0;
100
+ font-size: 1.5rem;
101
+ font-weight: 700;
102
+ color: #ECEFF4;
103
+ letter-spacing: -0.02em;
104
+ }
105
+
106
+ .app-header .tagline {
107
+ color: #D8DEE9;
108
+ font-size: 0.85rem;
109
+ }
110
+
111
+ .app-header .header-right {
112
+ margin-left: auto;
113
+ display: flex;
114
+ align-items: center;
115
+ gap: 0.75rem;
116
+ }
117
+
118
+ .app-header .version-badge {
119
+ background: rgba(136, 192, 208, 0.2);
120
+ border: 1px solid rgba(136, 192, 208, 0.4);
121
+ border-radius: 6px;
122
+ padding: 0.25rem 0.625rem;
123
+ font-size: 0.7rem;
124
+ font-family: 'JetBrains Mono', monospace;
125
+ color: #88C0D0;
126
+ }
127
+
128
+ /* === Tabs === */
129
+ .tabs {
130
+ border: none !important;
131
+ background: transparent !important;
132
+ }
133
+
134
+ .tab-nav {
135
+ background: #3B4252 !important;
136
+ border: 1px solid #434C5E !important;
137
+ border-radius: 10px !important;
138
+ padding: 0.25rem !important;
139
+ gap: 0.25rem !important;
140
+ margin-bottom: 1.25rem !important;
141
+ display: inline-flex !important;
142
+ }
143
+
144
+ .tab-nav button {
145
+ background: transparent !important;
146
+ border: none !important;
147
+ color: #D8DEE9 !important;
148
+ padding: 0.75rem 1.5rem !important;
149
+ font-size: 0.95rem !important;
150
+ font-weight: 500 !important;
151
+ border-radius: 8px !important;
152
+ transition: all 0.15s ease !important;
153
+ }
154
+
155
+ .tab-nav button.selected {
156
+ color: #2E3440 !important;
157
+ background: #88C0D0 !important;
158
+ }
159
+
160
+ .tab-nav button:hover:not(.selected) {
161
+ background: #434C5E !important;
162
+ color: #ECEFF4 !important;
163
+ }
164
+
165
+ .tabitem {
166
+ background: transparent !important;
167
+ border: none !important;
168
+ padding: 0 !important;
169
+ }
170
+
171
+ /* === Controls bar === */
172
+ .controls-bar {
173
+ background: #3B4252 !important;
174
+ border: 1px solid #434C5E !important;
175
+ border-radius: 10px !important;
176
+ padding: 0.75rem 1.25rem !important;
177
+ margin-bottom: 1rem !important;
178
+ gap: 0.75rem !important;
179
+ }
180
+
181
+ .controls-bar label {
182
+ font-size: 0.75rem !important;
183
+ text-transform: uppercase !important;
184
+ letter-spacing: 0.04em !important;
185
+ color: #D8DEE9 !important;
186
+ font-weight: 500 !important;
187
+ }
188
+
189
+ /* === Info banner === */
190
+ .info-banner {
191
+ background: #3B4252 !important;
192
+ border: 1px solid #434C5E !important;
193
+ border-left: 3px solid #88C0D0 !important;
194
+ border-radius: 0 10px 10px 0 !important;
195
+ padding: 0.75rem 1rem !important;
196
+ margin-bottom: 1rem !important;
197
+ }
198
+
199
+ .info-banner h3 {
200
+ margin: 0;
201
+ font-size: 1.1rem;
202
+ font-weight: 600;
203
+ color: #ECEFF4;
204
+ }
205
+
206
+ .info-banner .eval-tags {
207
+ display: flex;
208
+ flex-wrap: wrap;
209
+ gap: 0.375rem;
210
+ }
211
+
212
+ .info-banner .eval-tag {
213
+ background: rgba(143, 188, 187, 0.15);
214
+ border: 1px solid rgba(143, 188, 187, 0.3);
215
+ border-radius: 4px;
216
+ padding: 0.3rem 0.6rem;
217
+ font-size: 0.8rem;
218
+ font-family: 'JetBrains Mono', monospace;
219
+ color: #8FBCBB;
220
+ }
221
+
222
+ /* === Dataframe - seamless styling === */
223
+ .dataframe,
224
+ .dataframe > div,
225
+ .dataframe > div > div,
226
+ .dataframe .table-wrap,
227
+ .dataframe .svelte-1gfkn6j {
228
+ background: #2E3440 !important;
229
+ border: none !important;
230
+ box-shadow: none !important;
231
+ border-radius: 0 !important;
232
+ }
233
+
234
+ .dataframe table {
235
+ width: 100% !important;
236
+ border-collapse: collapse !important;
237
+ font-size: 0.95rem !important;
238
+ table-layout: fixed !important;
239
+ background: #2E3440 !important;
240
+ }
241
+
242
+ .dataframe thead th:nth-child(1) { width: 28%; }
243
+ .dataframe thead th:nth-child(2) { width: 12%; }
244
+ .dataframe thead th:nth-child(3) { width: 7%; }
245
+ .dataframe thead th:nth-child(4) { width: 7%; }
246
+ .dataframe thead th:nth-child(n+5) { width: auto; }
247
+
248
+ .dataframe thead,
249
+ .dataframe thead tr {
250
+ background: #2E3440 !important;
251
+ position: sticky;
252
+ top: 0;
253
+ z-index: 10;
254
+ }
255
+
256
+ .dataframe thead th {
257
+ padding: 0.875rem 1rem !important;
258
+ font-weight: 600 !important;
259
+ font-size: 0.75rem !important;
260
+ text-transform: uppercase !important;
261
+ letter-spacing: 0.05em !important;
262
+ color: #81A1C1 !important;
263
+ border-bottom: 1px solid #434C5E !important;
264
+ border-top: none !important;
265
+ text-align: left !important;
266
+ background: #2E3440 !important;
267
+ }
268
+
269
+ .dataframe tbody,
270
+ .dataframe tbody tr {
271
+ background: #2E3440 !important;
272
+ }
273
+
274
+ .dataframe tbody tr {
275
+ border-bottom: 1px solid #3B4252 !important;
276
+ }
277
+
278
+ .dataframe tbody tr:hover {
279
+ background: rgba(136, 192, 208, 0.04) !important;
280
+ }
281
+
282
+ .dataframe tbody td {
283
+ padding: 0.75rem 1rem !important;
284
+ color: #E5E9F0 !important;
285
+ background: #2E3440 !important;
286
+ overflow: hidden !important;
287
+ text-overflow: ellipsis !important;
288
+ border: none !important;
289
+ }
290
+
291
+ /* === Pagination bar === */
292
+ .pagination-bar {
293
+ margin-top: 1rem !important;
294
+ padding: 1rem 0 !important;
295
+ border-top: 1px solid #3B4252 !important;
296
+ display: flex !important;
297
+ justify-content: center !important;
298
+ align-items: center !important;
299
+ gap: 1rem !important;
300
+ }
301
+
302
+ .page-info {
303
+ font-family: 'JetBrains Mono', monospace !important;
304
+ font-size: 1rem !important;
305
+ color: #D8DEE9 !important;
306
+ min-width: 80px !important;
307
+ text-align: center !important;
308
+ }
309
+
310
+ /* Model name - white, readable */
311
+ .dataframe tbody td:first-child {
312
+ font-weight: 500 !important;
313
+ color: #ECEFF4 !important;
314
+ white-space: nowrap !important;
315
+ }
316
+
317
+ /* Developer - frost blue */
318
+ .dataframe tbody td:nth-child(2) {
319
+ color: #88C0D0 !important;
320
+ white-space: nowrap !important;
321
+ }
322
+
323
+ /* Params - aurora orange */
324
+ .dataframe tbody td:nth-child(3) {
325
+ font-family: 'JetBrains Mono', monospace !important;
326
+ color: #D08770 !important;
327
+ text-align: right !important;
328
+ }
329
+
330
+ /* Average - aurora green */
331
+ .dataframe tbody td:nth-child(4) {
332
+ font-family: 'JetBrains Mono', monospace !important;
333
+ font-weight: 600 !important;
334
+ color: #A3BE8C !important;
335
+ text-align: right !important;
336
+ }
337
+
338
+ /* Metrics - frost teal */
339
+ .dataframe tbody td:nth-child(n+5) {
340
+ font-family: 'JetBrains Mono', monospace !important;
341
+ text-align: right !important;
342
+ color: #8FBCBB !important;
343
+ white-space: nowrap !important;
344
+ }
345
+
346
+ /* === Status text === */
347
+ .status-text {
348
+ font-size: 0.9rem !important;
349
+ color: #D8DEE9 !important;
350
+ padding: 0.5rem 0 !important;
351
+ font-family: 'JetBrains Mono', monospace !important;
352
+ }
353
+
354
+ /* === Model Card === */
355
+ .model-card-container {
356
+ display: flex;
357
+ flex-direction: column;
358
+ gap: 1.25rem;
359
+ }
360
+
361
+ .model-card-header {
362
+ background: #3B4252;
363
+ border: 1px solid #434C5E;
364
+ border-radius: 12px;
365
+ padding: 1.5rem 2rem;
366
+ }
367
+
368
+ .model-card-header h2 {
369
+ margin: 0 0 0.5rem 0;
370
+ font-size: 1.5rem;
371
+ font-weight: 600;
372
+ color: #ECEFF4;
373
+ }
374
+
375
+ .model-card-header .model-meta {
376
+ display: flex;
377
+ gap: 1.5rem;
378
+ color: #D8DEE9;
379
+ font-size: 0.95rem;
380
+ }
381
+
382
+ .model-card-header .model-meta strong {
383
+ color: #8FBCBB;
384
+ }
385
+
386
+ .leaderboard-section {
387
+ background: #3B4252;
388
+ border: 1px solid #434C5E;
389
+ border-radius: 10px;
390
+ overflow: hidden;
391
+ }
392
+
393
+ .leaderboard-section-header {
394
+ background: #434C5E;
395
+ padding: 1rem 1.25rem;
396
+ border-bottom: 1px solid #4C566A;
397
+ display: flex;
398
+ justify-content: space-between;
399
+ align-items: center;
400
+ }
401
+
402
+ .leaderboard-section-header h3 {
403
+ margin: 0;
404
+ font-size: 1rem;
405
+ font-weight: 600;
406
+ color: #88C0D0;
407
+ }
408
+
409
+ .leaderboard-section-header .lb-avg {
410
+ background: rgba(163, 190, 140, 0.15);
411
+ border: 1px solid rgba(163, 190, 140, 0.3);
412
+ border-radius: 8px;
413
+ padding: 0.5rem 1rem;
414
+ font-size: 0.85rem;
415
+ color: #D8DEE9;
416
+ }
417
+
418
+ .leaderboard-section-header .lb-avg strong {
419
+ color: #A3BE8C;
420
+ font-family: 'JetBrains Mono', monospace;
421
+ font-size: 1.1rem;
422
+ font-weight: 700;
423
+ }
424
+
425
+ .scores-grid {
426
+ display: grid;
427
+ grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
428
+ gap: 1px;
429
+ background: #434C5E;
430
+ }
431
+
432
+ .score-item {
433
+ background: #3B4252;
434
+ padding: 1rem 1.25rem;
435
+ }
436
+
437
+ .score-item .score-label {
438
+ font-size: 0.8rem;
439
+ text-transform: uppercase;
440
+ letter-spacing: 0.05em;
441
+ color: #D8DEE9;
442
+ margin-bottom: 0.375rem;
443
+ }
444
+
445
+ .score-item .score-value {
446
+ font-size: 1.5rem;
447
+ font-weight: 600;
448
+ font-family: 'JetBrains Mono', monospace;
449
+ color: #A3BE8C;
450
+ }
451
+
452
+ .score-item.highlight .score-value {
453
+ color: #88C0D0;
454
  }
455
 
456
+ .no-results {
457
+ text-align: center;
458
+ padding: 3rem 1rem;
459
+ color: #D8DEE9;
460
  }
461
 
462
+ .no-results h3 {
463
+ color: #ECEFF4;
464
+ margin-bottom: 0.5rem;
465
+ }
466
+
467
+ /* === Buttons === */
468
+ button {
469
+ border-radius: 8px !important;
470
+ font-weight: 500 !important;
471
+ font-size: 0.95rem !important;
472
+ transition: all 0.15s ease !important;
473
+ }
474
+
475
+ button.primary {
476
+ background: #88C0D0 !important;
477
+ color: #2E3440 !important;
478
+ border: none !important;
479
+ }
480
+
481
+ button.primary:hover:not(:disabled) {
482
+ background: #8FBCBB !important;
483
+ }
484
+
485
+ button.secondary,
486
+ button[variant="secondary"] {
487
+ background: #434C5E !important;
488
+ color: #ECEFF4 !important;
489
+ border: 1px solid #4C566A !important;
490
+ }
491
+
492
+ button.secondary:hover:not(:disabled),
493
+ button[variant="secondary"]:hover:not(:disabled) {
494
+ background: #4C566A !important;
495
+ }
496
+
497
+ button:disabled {
498
+ opacity: 0.35 !important;
499
+ }
500
+
501
+ /* === Inputs === */
502
+ input[type="text"],
503
+ select {
504
+ background: #2E3440 !important;
505
+ border: 1px solid #4C566A !important;
506
+ border-radius: 8px !important;
507
+ color: #ECEFF4 !important;
508
+ font-size: 1rem !important;
509
+ }
510
+
511
+ input[type="text"]:focus,
512
+ select:focus {
513
+ border-color: #88C0D0 !important;
514
+ box-shadow: 0 0 0 3px rgba(136, 192, 208, 0.15) !important;
515
+ outline: none !important;
516
+ }
517
+
518
+ input::placeholder {
519
+ color: #4C566A !important;
520
+ }
521
+
522
+ /* === Accordion === */
523
+ .accordion {
524
+ background: #3B4252 !important;
525
+ border: 1px solid #434C5E !important;
526
+ border-radius: 10px !important;
527
+ margin-top: 1.5rem !important;
528
+ }
529
+
530
+ .accordion > .label-wrap {
531
+ background: transparent !important;
532
+ padding: 1rem 1.25rem !important;
533
+ color: #D8DEE9 !important;
534
+ font-size: 0.95rem !important;
535
+ }
536
+
537
+ .accordion > .wrap {
538
+ padding: 0.5rem 1.25rem 1.25rem !important;
539
+ color: #D8DEE9 !important;
540
+ font-size: 0.95rem !important;
541
+ line-height: 1.6 !important;
542
+ }
543
+
544
+ .accordion code {
545
+ background: #434C5E !important;
546
+ padding: 0.125rem 0.375rem !important;
547
+ border-radius: 4px !important;
548
+ font-family: 'JetBrains Mono', monospace !important;
549
+ font-size: 0.8rem !important;
550
+ color: #8FBCBB !important;
551
+ }
552
+
553
+ /* === Metrics section === */
554
+ .metrics-section {
555
+ margin-top: 1.5rem;
556
+ padding-top: 1.5rem;
557
+ border-top: 1px solid #434C5E;
558
+ }
559
+
560
+ .metrics-section h3 {
561
+ font-size: 0.85rem;
562
+ font-weight: 600;
563
+ color: #D8DEE9;
564
+ margin: 0 0 1rem 0;
565
+ text-transform: uppercase;
566
+ letter-spacing: 0.05em;
567
+ }
568
+
569
+ .metrics-grid {
570
+ display: grid;
571
+ grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
572
+ gap: 0.75rem;
573
+ }
574
+
575
+ .metric-card {
576
+ background: #3B4252;
577
+ border: 1px solid #434C5E;
578
+ border-radius: 8px;
579
+ overflow: hidden;
580
+ }
581
+
582
+ .metric-card-header {
583
+ display: flex;
584
+ justify-content: space-between;
585
+ align-items: center;
586
+ padding: 0.75rem 1rem;
587
  cursor: pointer;
588
+ list-style: none;
589
+ }
590
+
591
+ .metric-card-header::-webkit-details-marker {
592
+ display: none;
593
+ }
594
+
595
+ .metric-card-name {
596
+ font-weight: 500;
597
+ font-size: 0.95rem;
598
+ color: #ECEFF4;
599
+ }
600
+
601
+ .metric-card-direction {
602
+ font-size: 0.8rem;
603
+ color: #D8DEE9;
604
+ }
605
+
606
+ .metric-card-direction .arrow {
607
+ color: #A3BE8C;
608
+ font-weight: 600;
609
+ }
610
+
611
+ .metric-card-body {
612
+ padding: 0.875rem 1.25rem;
613
+ border-top: 1px solid #434C5E;
614
+ font-size: 0.9rem;
615
+ color: #D8DEE9;
616
+ line-height: 1.5;
617
+ }
618
+
619
+ .metric-type-badge {
620
+ font-size: 0.65rem;
621
+ text-transform: uppercase;
622
+ letter-spacing: 0.05em;
623
+ padding: 0.15rem 0.4rem;
624
+ background: rgba(180, 142, 173, 0.2);
625
+ border: 1px solid rgba(180, 142, 173, 0.35);
626
+ border-radius: 4px;
627
+ color: #B48EAD;
628
+ font-family: 'JetBrains Mono', monospace;
629
+ }
630
+
631
+ /* === Scrollbar === */
632
+ ::-webkit-scrollbar {
633
+ width: 8px;
634
+ height: 8px;
635
+ }
636
+
637
+ ::-webkit-scrollbar-track {
638
+ background: #2E3440;
639
+ }
640
+
641
+ ::-webkit-scrollbar-thumb {
642
+ background: #4C566A;
643
+ border-radius: 4px;
644
+ }
645
+
646
+ ::-webkit-scrollbar-thumb:hover {
647
+ background: #5E81AC;
648
+ }
649
+
650
+ /* === Responsive === */
651
+ @media (max-width: 768px) {
652
+ .gradio-container {
653
+ padding: 1rem !important;
654
+ }
655
+
656
+ .scores-grid {
657
+ grid-template-columns: repeat(2, 1fr);
658
+ }
659
+ }
660
+
661
+ /* === Overrides === */
662
+ .gradio-container footer {
663
+ display: none !important;
664
+ }
665
+
666
+ .block {
667
+ background: #3B4252 !important;
668
+ }
669
+
670
+ .gradio-radio label {
671
+ background: #434C5E !important;
672
+ border: 1px solid #4C566A !important;
673
+ color: #ECEFF4 !important;
674
+ border-radius: 8px !important;
675
+ font-size: 0.85rem !important;
676
+ }
677
+
678
+ .gradio-radio label.selected {
679
+ background: #88C0D0 !important;
680
+ border-color: #88C0D0 !important;
681
+ color: #2E3440 !important;
682
  }
683
  """
684
 
685
 
686
def format_leaderboard_header(selected_leaderboard, metadata):
    """Render the HTML info banner for the currently selected leaderboard.

    Three cases: no selection yields a centered prompt; a selection without
    eval metadata yields a name-only banner; otherwise the full banner with
    organisation, one tag per eval, and a link to the original source.
    """
    # Guard: nothing selected yet — prompt the user instead of a banner.
    if not selected_leaderboard:
        return """
        <div style="text-align: center; padding: 2rem 1rem; color: #D8DEE9;">
            <div style="font-size: 1.1rem;">Select a leaderboard to explore</div>
        </div>
        """

    # Guard: metadata missing or has no evals — minimal name-only banner.
    if not metadata or not metadata.get("evals"):
        return f"""
        <div class="info-banner">
            <h3>{selected_leaderboard}</h3>
        </div>
        """

    src = metadata.get("source_info", {})
    organization = src.get("organization", "Unknown")
    link = src.get("url", "#")

    # One pill-style tag per eval name in the leaderboard.
    tags_html = "".join(
        f'<span class="eval-tag">{name}</span>' for name in metadata["evals"]
    )

    return f"""
    <div class="info-banner">
        <div style="display: flex; justify-content: space-between; align-items: center; gap: 1rem;">
            <div style="display: flex; align-items: center; gap: 1rem; flex-wrap: wrap;">
                <h3 style="margin: 0;">{selected_leaderboard}</h3>
                <span style="color: #D8DEE9; font-size: 0.8rem;">by {organization}</span>
                <div class="eval-tags" style="margin: 0;">{tags_html}</div>
            </div>
            <a href="{link}" target="_blank" style="
                font-size: 0.75rem;
                color: #88C0D0;
                text-decoration: none;
                padding: 0.375rem 0.75rem;
                border: 1px solid rgba(136, 192, 208, 0.4);
                border-radius: 6px;
                white-space: nowrap;
            ">Source →</a>
        </div>
    </div>
    """
731
def format_metric_details(selected_leaderboard, metadata):
    """Render collapsible metric-reference cards for a leaderboard.

    Produces one `<details>` card per eval in ``metadata["evals"]`` showing
    the metric's direction, description, score range (or level names), and
    score-type badge. Returns "" when there is no selection or no evals.
    """
    if not selected_leaderboard or not metadata or not metadata.get("evals"):
        return ""

    evals = metadata.get("evals", {})

    html = """
    <div class="metrics-section">
        <h3>Metric Reference</h3>
        <div class="metrics-grid">
    """

    for eval_name, info in evals.items():
        score_type = info['score_type'].upper() if info.get('score_type') else ""
        direction = "Lower is better" if info.get('lower_is_better') else "Higher is better"
        arrow = "↓" if info.get('lower_is_better') else "↑"

        details = ""
        # Bug fix: guard max_score too — previously a continuous metric with
        # min_score set but max_score absent raised KeyError (and a None
        # max_score rendered as the literal "None").
        if (
            info.get('score_type') == "continuous"
            and info.get('min_score') is not None
            and info.get('max_score') is not None
        ):
            details = f"Range: [{info['min_score']} {info['max_score']}]"
        elif info.get('score_type') == "levels" and info.get('level_names'):
            details = f"Levels: {', '.join(str(l) for l in info['level_names'])}"

        html += f"""
        <details class="metric-card">
            <summary class="metric-card-header">
                <span class="metric-card-name">{eval_name}</span>
                <span class="metric-card-direction"><span class="arrow">{arrow}</span> {direction}</span>
            </summary>
            <div class="metric-card-body">
                <div>{info.get('description', 'No description')}</div>
                <div style="display: flex; justify-content: space-between; align-items: center; margin-top: 0.5rem;">
                    <span style="font-size: 0.75rem; color: #D8DEE9;">{details}</span>
                    <span class="metric-type-badge">{score_type}</span>
                </div>
            </div>
        </details>
        """

    html += "</div></div>"
    return html
774
+
775
def format_model_card(model_name, model_data):
    """Render an HTML card summarising one model's results across leaderboards.

    ``model_data`` maps leaderboard name -> dict with optional keys
    ``developer``, ``params``, ``architecture`` and a ``results`` dict of
    metric name -> score (floats or None). Header metadata is taken from the
    first leaderboard entry. Returns a "no results" placeholder when
    ``model_data`` is empty.
    """
    if not model_data:
        return """
        <div class="no-results">
            <h3>No results found</h3>
            <p>Try searching for a different model name</p>
        </div>
        """

    # Header metadata comes from the first leaderboard entry
    # (assumed consistent across leaderboards for a given model).
    first = next(iter(model_data.values()))
    developer = first.get("developer", "Unknown")
    params = first.get("params")
    arch = first.get("architecture", "Unknown")

    params_str = f"{params}B" if params else "—"

    html = f"""
    <div class="model-card-container">
        <div class="model-card-header">
            <h2>{model_name}</h2>
            <div class="model-meta">
                <span><strong>Developer:</strong> {developer}</span>
                <span><strong>Parameters:</strong> {params_str}</span>
                <span><strong>Architecture:</strong> {arch}</span>
            </div>
        </div>
    """

    for leaderboard_name, data in model_data.items():
        results = data.get("results", {})
        if not results:
            continue

        scores = [v for v in results.values() if v is not None]
        avg = sum(scores) / len(scores) if scores else None
        # Bug fix: compare against None, not truthiness — a legitimate
        # average of 0.0 previously rendered as "—" instead of "0.000".
        avg_str = f"{avg:.3f}" if avg is not None else "—"

        html += f"""
        <div class="leaderboard-section">
            <div class="leaderboard-section-header">
                <h3>{leaderboard_name}</h3>
                <span class="lb-avg">Avg: <strong>{avg_str}</strong></span>
            </div>
            <div class="scores-grid">
        """

        # Highest score first; None scores sort as 0.
        sorted_results = sorted(
            results.items(),
            key=lambda kv: kv[1] if kv[1] is not None else 0,
            reverse=True,
        )

        for i, (metric_name, score) in enumerate(sorted_results):
            score_display = f"{score:.3f}" if score is not None else "—"
            # Only the top-sorted metric gets the highlight styling.
            highlight_class = "highlight" if i == 0 else ""

            html += f"""
            <div class="score-item {highlight_class}">
                <div class="score-label">{metric_name}</div>
                <div class="score-value">{score_display}</div>
            </div>
            """

        html += "</div></div>"

    html += "</div>"
    return html
upload_to_hf.py DELETED
@@ -1,122 +0,0 @@
1
- """
2
- Upload changed parquet files to HuggingFace dataset.
3
-
4
- This script:
5
- 1. Reads the manifest of changed leaderboards
6
- 2. Uploads ONLY the changed parquet files
7
- 3. Uses HfApi for efficient individual file uploads
8
-
9
- Usage:
10
- # With HF_TOKEN environment variable (GitHub Actions):
11
- python upload_to_hf.py
12
-
13
- # Interactive login (local):
14
- python upload_to_hf.py --login
15
- """
16
-
17
- from huggingface_hub import login, HfFolder, HfApi
18
- import pandas as pd
19
- from pathlib import Path
20
- import sys
21
- import os
22
- import json
23
-
24
# Target dataset repo; overridable via the HF_DATASET_REPO env var (the
# GitHub Actions workflow sets it explicitly).
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "deepmage121/eee_test")
# Directory where the conversion step writes its parquet files and manifest.
PARQUET_DIR = Path("parquet_output")
MANIFEST_PATH = PARQUET_DIR / "changed_leaderboards.json"

def upload_changed_parquets():
    """
    Upload only changed parquet files from manifest.

    Authenticates via the HF_TOKEN env var, an interactive ``--login`` flag,
    or a previously saved token; reads the ``converted`` list from the
    manifest written by the conversion step; uploads one parquet per changed
    leaderboard to ``data/<name>/data-00000-of-00001.parquet`` in the
    dataset repo, one commit per file.

    Exit codes: 1 on missing auth, missing manifest, no files on disk, or
    any upload error; 0 when the manifest lists no changes or all uploads
    succeed.
    """

    # --- Authentication: env token > interactive login > cached token ---
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        print("Using HF_TOKEN from environment")
        # Persist the token so HfApi picks it up implicitly below.
        HfFolder.save_token(hf_token)
    elif "--login" in sys.argv:
        print("Logging in to HuggingFace...")
        login()
    else:
        if not HfFolder.get_token():
            print("ERROR: Not logged in. Run with --login flag or set HF_TOKEN environment variable")
            sys.exit(1)
        print("Using existing HuggingFace token")

    api = HfApi()

    # --- Read the manifest of leaderboards changed by the convert step ---
    if not MANIFEST_PATH.exists():
        print(f"ERROR: No manifest found at {MANIFEST_PATH}")
        print("Run convert_changed_to_parquet.py first to generate the manifest")
        sys.exit(1)

    manifest = json.loads(MANIFEST_PATH.read_text())
    converted_leaderboards = manifest.get("converted", [])

    # An empty "converted" list is a normal no-op run, not an error.
    if not converted_leaderboards:
        print("\nNo changed leaderboards to upload (per manifest)")
        sys.exit(0)

    print(f"\nManifest found: {len(converted_leaderboards)} leaderboard(s) to upload")

    # Map each leaderboard name to its expected parquet path...
    files_to_upload = [
        PARQUET_DIR / f"{lb}.parquet"
        for lb in converted_leaderboards
    ]

    # ...and silently drop entries whose parquet was never written.
    files_to_upload = [f for f in files_to_upload if f.exists()]

    if not files_to_upload:
        print(f"ERROR: No parquet files to upload in {PARQUET_DIR}")
        sys.exit(1)

    print(f"\nUploading {len(files_to_upload)} parquet file(s):")
    for pf in files_to_upload:
        print(f" - {pf.stem}")

    uploaded_count = 0
    error_count = 0

    # --- Upload loop: one commit per changed leaderboard ---
    for parquet_file in files_to_upload:
        leaderboard_name = parquet_file.stem

        # Fixed shard name matches the layout the datasets library expects.
        path_in_repo = f"data/{leaderboard_name}/data-00000-of-00001.parquet"

        try:
            print(f"\nUploading: {leaderboard_name}")

            # Read back only to log shape; also acts as a sanity check that
            # the parquet is readable before pushing it.
            df = pd.read_parquet(parquet_file)
            print(f" {len(df)} rows, {len(df.columns)} columns")

            api.upload_file(
                path_or_fileobj=str(parquet_file),
                path_in_repo=path_in_repo,
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                commit_message=f"Update {leaderboard_name} leaderboard data"
            )

            print(f" SUCCESS: Uploaded → {path_in_repo}")
            uploaded_count += 1

        except Exception as e:
            # Keep going so one bad file doesn't block the rest; the
            # non-zero exit below still fails the CI job.
            print(f" ERROR: Error uploading {leaderboard_name}: {e}")
            error_count += 1

    print(f"\n{'='*70}")
    print(f"Upload Summary:")
    print(f"{'='*70}")
    print(f" Successfully uploaded: {uploaded_count} file(s)")
    print(f" Errors: {error_count} file(s)")
    print(f"{'='*70}")

    # Any failure makes the whole run fail so CI surfaces it.
    if error_count > 0:
        print(f"\nWARNING: {error_count} file(s) failed to upload")
        sys.exit(1)

    print(f"\nSuccessfully uploaded to HuggingFace!")
    print(f"View at: https://huggingface.co/datasets/{HF_DATASET_REPO}")


if __name__ == "__main__":
    upload_changed_parquets()