deepmage121 commited on
Commit
122f378
Β·
1 Parent(s): c4de40b

update to model lookup, table column selection and other QoL features

Browse files
Files changed (3) hide show
  1. app.py +184 -48
  2. data_loader.py +2 -2
  3. ui_components.py +545 -20
app.py CHANGED
@@ -22,12 +22,13 @@ from ui_components import (
22
  format_leaderboard_header,
23
  format_metric_details,
24
  format_model_card,
 
25
  )
26
 
27
  PAGE_SIZE = 50
28
 
29
 
30
- def update_leaderboard_table(selected_leaderboard, search_query="", current_page=1, sort_column=None, progress=gr.Progress()):
31
  """Loads and aggregates data for the selected leaderboard."""
32
  if not selected_leaderboard:
33
  return (
@@ -39,6 +40,7 @@ def update_leaderboard_table(selected_leaderboard, search_query="", current_page
39
  gr.update(interactive=False),
40
  gr.update(choices=[], value=None),
41
  "0 / 0",
 
42
  )
43
 
44
  metadata = get_eval_metadata(selected_leaderboard)
@@ -48,6 +50,20 @@ def update_leaderboard_table(selected_leaderboard, search_query="", current_page
48
 
49
  df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  if search_query and not df.empty:
52
  mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
53
  df = df[mask]
@@ -74,6 +90,17 @@ def update_leaderboard_table(selected_leaderboard, search_query="", current_page
74
  default_sort = sort_column if sort_column and sort_column in sort_choices else ("Average" if "Average" in sort_choices else (sort_choices[0] if sort_choices else None))
75
  sort_column_update = gr.update(choices=sort_choices, value=default_sort)
76
 
 
 
 
 
 
 
 
 
 
 
 
77
  return (
78
  df_paginated,
79
  format_leaderboard_header(selected_leaderboard, metadata),
@@ -83,6 +110,7 @@ def update_leaderboard_table(selected_leaderboard, search_query="", current_page
83
  next_btn,
84
  sort_column_update,
85
  page_info,
 
86
  )
87
 
88
 
@@ -113,6 +141,41 @@ def search_model(model_query):
113
  return format_model_card(model_name, model_data)
114
 
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  def get_model_suggestions(query):
117
  """Get model name suggestions for autocomplete."""
118
  if not query or len(query) < 2:
@@ -165,7 +228,7 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
165
  with gr.Column(scale=1, min_width=100):
166
  refresh_btn = gr.Button("↻ Refresh", variant="secondary", size="sm")
167
 
168
- init_df, init_header, init_metrics, init_page_dropdown, init_prev, init_next, init_sort_cols, init_page_info = update_leaderboard_table(initial_value, "", 1, "Average")
169
 
170
  header_view = gr.HTML(value=init_header)
171
 
@@ -176,13 +239,22 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
176
  visible=False,
177
  )
178
 
 
 
 
 
 
 
 
 
 
 
179
  leaderboard_table = gr.Dataframe(
180
  value=init_df,
181
  label=None,
182
  interactive=False,
183
  wrap=False,
184
  elem_classes="dataframe",
185
- column_widths=["28%", "12%", "7%", "7%"]
186
  )
187
 
188
  # Pagination below table - centered
@@ -212,26 +284,38 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
212
 
213
  # === TAB 2: Model View ===
214
  with gr.TabItem("πŸ” Model Lookup"):
215
- gr.Markdown("### Find a model's benchmarks across all leaderboards")
 
 
 
 
 
 
 
 
216
 
217
  with gr.Row(elem_classes="controls-bar"):
218
  with gr.Column(scale=4):
219
- model_search_dropdown = gr.Dropdown(
220
- choices=[],
221
- label="Model Name",
222
- allow_custom_value=True,
223
  interactive=True,
 
224
  filterable=True,
225
  )
226
  with gr.Column(scale=1, min_width=100):
227
- model_search_btn = gr.Button("Search", variant="primary", size="sm")
228
 
229
- model_card_view = gr.HTML(value="""
230
- <div class="no-results">
231
- <h3>Search for a model</h3>
232
- <p>Start typing to see suggestions, then select a model</p>
233
- </div>
234
- """)
 
 
 
235
 
236
  # Submission guide
237
  with gr.Accordion("πŸ“€ How to Submit Data", open=False):
@@ -259,22 +343,30 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
259
  def reset_page():
260
  return 1
261
 
 
 
 
 
 
 
262
  # === Leaderboard Events ===
263
  leaderboard_selector.change(
264
  fn=reset_page, outputs=[current_page_state]
265
  ).then(
266
  fn=lambda: "Average", outputs=[sort_column_state]
 
 
267
  ).then(
268
  fn=update_leaderboard_table,
269
- inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
270
- outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
271
  )
272
 
273
  search_box.input(
274
  fn=reset_page, outputs=[current_page_state]
275
  ).then(
276
- fn=update_leaderboard_table,
277
- inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
278
  outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
279
  )
280
 
@@ -285,8 +377,16 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
285
  ).then(
286
  fn=reset_page, outputs=[current_page_state]
287
  ).then(
288
- fn=update_leaderboard_table,
289
- inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
 
 
 
 
 
 
 
 
290
  outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
291
  )
292
 
@@ -295,24 +395,24 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
295
  inputs=[page_dropdown],
296
  outputs=[current_page_state]
297
  ).then(
298
- fn=update_leaderboard_table,
299
- inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
300
  outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
301
  )
302
 
303
  prev_btn.click(
304
  fn=go_prev, inputs=[current_page_state], outputs=[current_page_state]
305
  ).then(
306
- fn=update_leaderboard_table,
307
- inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
308
  outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
309
  )
310
 
311
  next_btn.click(
312
  fn=go_next, inputs=[current_page_state], outputs=[current_page_state]
313
  ).then(
314
- fn=update_leaderboard_table,
315
- inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
316
  outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
317
  )
318
 
@@ -325,36 +425,72 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
325
  fn=reset_page, outputs=[current_page_state]
326
  ).then(
327
  fn=lambda: "Average", outputs=[sort_column_state]
 
 
328
  ).then(
329
  fn=update_leaderboard_table,
330
- inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state],
331
- outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
332
  )
333
 
334
  # === Model Search Events ===
335
- def update_model_suggestions(query):
336
- """Update dropdown choices based on query."""
337
- if not query or len(query) < 2:
338
- return gr.update(choices=[])
339
- _, matches = search_model_across_leaderboards(query)
340
- return gr.update(choices=matches[:20])
341
-
342
- model_search_dropdown.input(
343
- fn=update_model_suggestions,
344
- inputs=[model_search_dropdown],
345
- outputs=[model_search_dropdown]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  )
347
 
348
- model_search_btn.click(
349
- fn=search_model,
350
- inputs=[model_search_dropdown],
351
- outputs=[model_card_view]
352
  )
353
 
354
- model_search_dropdown.select(
355
- fn=search_model,
356
- inputs=[model_search_dropdown],
357
- outputs=[model_card_view]
358
  )
359
 
360
  DATA_DIR.mkdir(exist_ok=True)
 
22
  format_leaderboard_header,
23
  format_metric_details,
24
  format_model_card,
25
+ format_model_comparison,
26
  )
27
 
28
  PAGE_SIZE = 50
29
 
30
 
31
+ def update_leaderboard_table(selected_leaderboard, search_query="", current_page=1, sort_column=None, selected_columns=None, progress=gr.Progress()):
32
  """Loads and aggregates data for the selected leaderboard."""
33
  if not selected_leaderboard:
34
  return (
 
40
  gr.update(interactive=False),
41
  gr.update(choices=[], value=None),
42
  "0 / 0",
43
+ gr.update(choices=[], value=[]),
44
  )
45
 
46
  metadata = get_eval_metadata(selected_leaderboard)
 
50
 
51
  df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
52
 
53
+ # Get all available columns BEFORE filtering (for column selector)
54
+ all_available_columns = list(df.columns) if not df.empty else []
55
+
56
+ # Filter columns if selected (if None or empty, show all columns)
57
+ if selected_columns is not None and len(selected_columns) > 0:
58
+ # Ensure Model column is always included
59
+ base_cols = ["Model"]
60
+ available_cols = list(df.columns)
61
+ cols_to_show = [col for col in base_cols if col in available_cols]
62
+ # Add Developer and other selected columns
63
+ cols_to_show.extend([col for col in selected_columns if col in available_cols and col not in cols_to_show])
64
+ if cols_to_show:
65
+ df = df[cols_to_show]
66
+
67
  if search_query and not df.empty:
68
  mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
69
  df = df[mask]
 
90
  default_sort = sort_column if sort_column and sort_column in sort_choices else ("Average" if "Average" in sort_choices else (sort_choices[0] if sort_choices else None))
91
  sort_column_update = gr.update(choices=sort_choices, value=default_sort)
92
 
93
+ # Get all available columns for column selector (use full list, not filtered)
94
+ # Include all columns except Model in the selector (Model is always shown)
95
+ column_choices = [col for col in all_available_columns if col != "Model"]
96
+ # Preserve current selection, or default to all columns if None or empty
97
+ if selected_columns is None or len(selected_columns) == 0:
98
+ column_value = column_choices
99
+ else:
100
+ # Preserve user's selection, filtering out any invalid choices
101
+ column_value = [col for col in selected_columns if col in column_choices]
102
+ column_selector_update = gr.update(choices=column_choices, value=column_value)
103
+
104
  return (
105
  df_paginated,
106
  format_leaderboard_header(selected_leaderboard, metadata),
 
110
  next_btn,
111
  sort_column_update,
112
  page_info,
113
+ column_selector_update,
114
  )
115
 
116
 
 
141
  return format_model_card(model_name, model_data)
142
 
143
 
144
+ def compare_models(selected_models):
145
+ """Compare multiple selected models."""
146
+ if not selected_models or len(selected_models) == 0:
147
+ return """
148
+ <div class="no-results">
149
+ <h3>Select models to compare</h3>
150
+ <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
151
+ </div>
152
+ """
153
+
154
+ # Get data for all selected models
155
+ all_results = {}
156
+ for model_name in selected_models:
157
+ results, _ = search_model_across_leaderboards(model_name)
158
+ if results:
159
+ # Use the first matching model (exact match preferred)
160
+ matched_model = list(results.keys())[0]
161
+ all_results[matched_model] = results[matched_model]
162
+
163
+ if len(all_results) == 1:
164
+ # Single model - show card view
165
+ model_name = list(all_results.keys())[0]
166
+ return format_model_card(model_name, all_results[model_name])
167
+ elif len(all_results) > 1:
168
+ # Multiple models - show comparison
169
+ return format_model_comparison(list(all_results.keys()), all_results)
170
+ else:
171
+ return """
172
+ <div class="no-results">
173
+ <h3>No results found</h3>
174
+ <p>Try selecting different models</p>
175
+ </div>
176
+ """
177
+
178
+
179
  def get_model_suggestions(query):
180
  """Get model name suggestions for autocomplete."""
181
  if not query or len(query) < 2:
 
228
  with gr.Column(scale=1, min_width=100):
229
  refresh_btn = gr.Button("↻ Refresh", variant="secondary", size="sm")
230
 
231
+ init_df, init_header, init_metrics, init_page_dropdown, init_prev, init_next, init_sort_cols, init_page_info, init_column_selector = update_leaderboard_table(initial_value, "", 1, "Average", None)
232
 
233
  header_view = gr.HTML(value=init_header)
234
 
 
239
  visible=False,
240
  )
241
 
242
+ # Column selector
243
+ with gr.Row(elem_classes="controls-bar"):
244
+ column_selector = gr.CheckboxGroup(
245
+ choices=init_column_selector.get("choices", []) if isinstance(init_column_selector, dict) else [],
246
+ value=init_column_selector.get("value", []) if isinstance(init_column_selector, dict) else [],
247
+ label="Columns to Display",
248
+ interactive=True,
249
+ show_label=True,
250
+ )
251
+
252
  leaderboard_table = gr.Dataframe(
253
  value=init_df,
254
  label=None,
255
  interactive=False,
256
  wrap=False,
257
  elem_classes="dataframe",
 
258
  )
259
 
260
  # Pagination below table - centered
 
284
 
285
  # === TAB 2: Model View ===
286
  with gr.TabItem("πŸ” Model Lookup"):
287
+ gr.Markdown("### Find and compare models across all leaderboards")
288
+
289
+ selected_models_state = gr.State(value=[])
290
+ default_compare_html = """
291
+ <div class="no-results">
292
+ <h3>Search for models to compare</h3>
293
+ <p>Type in the dropdown above, then click a model to add it</p>
294
+ </div>
295
+ """
296
 
297
  with gr.Row(elem_classes="controls-bar"):
298
  with gr.Column(scale=4):
299
+ all_models = get_all_model_names()
300
+ model_dropdown = gr.Dropdown(
301
+ choices=all_models,
302
+ label="Search models to add",
303
  interactive=True,
304
+ allow_custom_value=False,
305
  filterable=True,
306
  )
307
  with gr.Column(scale=1, min_width=100):
308
+ clear_models_btn = gr.Button("Clear All", variant="secondary", size="sm")
309
 
310
+ selected_models_group = gr.CheckboxGroup(
311
+ choices=[],
312
+ value=[],
313
+ label="Selected Models (click to remove)",
314
+ interactive=True,
315
+ elem_classes="selected-models-group"
316
+ )
317
+
318
+ model_card_view = gr.HTML(value=default_compare_html)
319
 
320
  # Submission guide
321
  with gr.Accordion("πŸ“€ How to Submit Data", open=False):
 
343
  def reset_page():
344
  return 1
345
 
346
+ def update_table_only(selected_leaderboard, search_query, current_page, sort_column, selected_columns):
347
+ """Update table without modifying column selector (for column changes)."""
348
+ result = update_leaderboard_table(selected_leaderboard, search_query, current_page, sort_column, selected_columns)
349
+ # Return all outputs except the last one (column_selector)
350
+ return result[:-1]
351
+
352
  # === Leaderboard Events ===
353
  leaderboard_selector.change(
354
  fn=reset_page, outputs=[current_page_state]
355
  ).then(
356
  fn=lambda: "Average", outputs=[sort_column_state]
357
+ ).then(
358
+ fn=lambda: None, outputs=[column_selector]
359
  ).then(
360
  fn=update_leaderboard_table,
361
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
362
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector]
363
  )
364
 
365
  search_box.input(
366
  fn=reset_page, outputs=[current_page_state]
367
  ).then(
368
+ fn=update_table_only,
369
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
370
  outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
371
  )
372
 
 
377
  ).then(
378
  fn=reset_page, outputs=[current_page_state]
379
  ).then(
380
+ fn=update_table_only,
381
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
382
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
383
+ )
384
+
385
+ column_selector.change(
386
+ fn=reset_page, outputs=[current_page_state]
387
+ ).then(
388
+ fn=update_table_only,
389
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
390
  outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
391
  )
392
 
 
395
  inputs=[page_dropdown],
396
  outputs=[current_page_state]
397
  ).then(
398
+ fn=update_table_only,
399
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
400
  outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
401
  )
402
 
403
  prev_btn.click(
404
  fn=go_prev, inputs=[current_page_state], outputs=[current_page_state]
405
  ).then(
406
+ fn=update_table_only,
407
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
408
  outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
409
  )
410
 
411
  next_btn.click(
412
  fn=go_next, inputs=[current_page_state], outputs=[current_page_state]
413
  ).then(
414
+ fn=update_table_only,
415
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
416
  outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info]
417
  )
418
 
 
425
  fn=reset_page, outputs=[current_page_state]
426
  ).then(
427
  fn=lambda: "Average", outputs=[sort_column_state]
428
+ ).then(
429
+ fn=lambda: None, outputs=[column_selector]
430
  ).then(
431
  fn=update_leaderboard_table,
432
+ inputs=[leaderboard_selector, search_box, current_page_state, sort_column_state, column_selector],
433
+ outputs=[leaderboard_table, header_view, metrics_view, page_dropdown, prev_btn, next_btn, sort_column_dropdown, page_info, column_selector]
434
  )
435
 
436
  # === Model Search Events ===
437
+ def add_model_and_compare(selected_model, current_selected):
438
+ """Add a model and auto-compare."""
439
+ if not selected_model:
440
+ comparison_html = compare_models(current_selected) if current_selected else default_compare_html
441
+ return (
442
+ current_selected,
443
+ gr.update(value=None),
444
+ gr.update(choices=current_selected, value=current_selected),
445
+ comparison_html
446
+ )
447
+
448
+ if current_selected is None:
449
+ current_selected = []
450
+
451
+ if selected_model not in current_selected:
452
+ current_selected = current_selected + [selected_model]
453
+
454
+ comparison_html = compare_models(current_selected)
455
+
456
+ return (
457
+ current_selected,
458
+ gr.update(value=None),
459
+ gr.update(choices=current_selected, value=current_selected),
460
+ comparison_html
461
+ )
462
+
463
+ def update_selection(selected_list):
464
+ """Update selection from checkbox changes."""
465
+ selected_list = selected_list or []
466
+ comparison_html = compare_models(selected_list) if selected_list else default_compare_html
467
+ return selected_list, comparison_html
468
+
469
+ def clear_all_models():
470
+ """Clear all selected models."""
471
+ return (
472
+ [],
473
+ gr.update(value=None),
474
+ gr.update(choices=[], value=[]),
475
+ default_compare_html
476
+ )
477
+
478
+ # Select from dropdown adds model and auto-compares
479
+ model_dropdown.select(
480
+ fn=add_model_and_compare,
481
+ inputs=[model_dropdown, selected_models_state],
482
+ outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view]
483
  )
484
 
485
+ selected_models_group.change(
486
+ fn=update_selection,
487
+ inputs=[selected_models_group],
488
+ outputs=[selected_models_state, model_card_view]
489
  )
490
 
491
+ clear_models_btn.click(
492
+ fn=clear_all_models,
493
+ outputs=[selected_models_state, model_dropdown, selected_models_group, model_card_view]
 
494
  )
495
 
496
  DATA_DIR.mkdir(exist_ok=True)
data_loader.py CHANGED
@@ -289,12 +289,12 @@ def build_leaderboard_table(selected_leaderboard, search_query="", progress_call
289
  return df
290
 
291
  numeric_cols = df.select_dtypes(include=['float', 'int']).columns
292
- df[numeric_cols] = df[numeric_cols].round(3)
293
 
294
  # Add Average Score
295
  eval_only_cols = [c for c in numeric_cols if c not in ["Params (B)"]]
296
  if len(eval_only_cols) > 0:
297
- df["Average"] = df[eval_only_cols].mean(axis=1).round(3)
298
 
299
  # Base columns: Model, Developer, Params, Average
300
  # Eval columns: all evaluation scores
 
289
  return df
290
 
291
  numeric_cols = df.select_dtypes(include=['float', 'int']).columns
292
+ df[numeric_cols] = df[numeric_cols].round(2)
293
 
294
  # Add Average Score
295
  eval_only_cols = [c for c in numeric_cols if c not in ["Params (B)"]]
296
  if len(eval_only_cols) > 0:
297
+ df["Average"] = df[eval_only_cols].mean(axis=1).round(2)
298
 
299
  # Base columns: Model, Developer, Params, Average
300
  # Eval columns: all evaluation scores
ui_components.py CHANGED
@@ -235,16 +235,10 @@ def get_custom_css():
235
  width: 100% !important;
236
  border-collapse: collapse !important;
237
  font-size: 0.95rem !important;
238
- table-layout: fixed !important;
239
  background: #2E3440 !important;
240
  }
241
 
242
- .dataframe thead th:nth-child(1) { width: 28%; }
243
- .dataframe thead th:nth-child(2) { width: 12%; }
244
- .dataframe thead th:nth-child(3) { width: 7%; }
245
- .dataframe thead th:nth-child(4) { width: 7%; }
246
- .dataframe thead th:nth-child(n+5) { width: auto; }
247
-
248
  .dataframe thead,
249
  .dataframe thead tr {
250
  background: #2E3440 !important;
@@ -314,32 +308,28 @@ def get_custom_css():
314
  white-space: nowrap !important;
315
  }
316
 
317
- /* Developer - frost blue */
 
 
 
 
 
 
318
  .dataframe tbody td:nth-child(2) {
319
  color: #88C0D0 !important;
320
  white-space: nowrap !important;
321
  }
322
 
323
- /* Params - aurora orange */
324
  .dataframe tbody td:nth-child(3) {
325
- font-family: 'JetBrains Mono', monospace !important;
326
  color: #D08770 !important;
327
- text-align: right !important;
328
  }
329
 
330
- /* Average - aurora green */
331
  .dataframe tbody td:nth-child(4) {
332
- font-family: 'JetBrains Mono', monospace !important;
333
  font-weight: 600 !important;
334
  color: #A3BE8C !important;
335
- text-align: right !important;
336
  }
337
 
338
- /* Metrics - frost teal */
339
  .dataframe tbody td:nth-child(n+5) {
340
- font-family: 'JetBrains Mono', monospace !important;
341
- text-align: right !important;
342
- color: #8FBCBB !important;
343
  white-space: nowrap !important;
344
  }
345
 
@@ -464,6 +454,350 @@ def get_custom_css():
464
  margin-bottom: 0.5rem;
465
  }
466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  /* === Buttons === */
468
  button {
469
  border-radius: 8px !important;
@@ -808,7 +1142,7 @@ def format_model_card(model_name, model_data):
808
 
809
  scores = [v for v in results.values() if v is not None]
810
  avg = sum(scores) / len(scores) if scores else None
811
- avg_str = f"{avg:.3f}" if avg else "β€”"
812
 
813
  html += f"""
814
  <div class="leaderboard-section">
@@ -822,7 +1156,7 @@ def format_model_card(model_name, model_data):
822
  sorted_results = sorted(results.items(), key=lambda x: x[1] if x[1] is not None else 0, reverse=True)
823
 
824
  for i, (metric_name, score) in enumerate(sorted_results):
825
- score_display = f"{score:.3f}" if score is not None else "β€”"
826
  highlight_class = "highlight" if i == 0 else ""
827
 
828
  html += f"""
@@ -836,3 +1170,194 @@ def format_model_card(model_name, model_data):
836
 
837
  html += "</div>"
838
  return html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  width: 100% !important;
236
  border-collapse: collapse !important;
237
  font-size: 0.95rem !important;
238
+ table-layout: auto !important;
239
  background: #2E3440 !important;
240
  }
241
 
 
 
 
 
 
 
242
  .dataframe thead,
243
  .dataframe thead tr {
244
  background: #2E3440 !important;
 
308
  white-space: nowrap !important;
309
  }
310
 
311
+ /* All other columns - use monospace for numbers */
312
+ .dataframe tbody td:not(:first-child) {
313
+ font-family: 'JetBrains Mono', monospace !important;
314
+ color: #8FBCBB !important;
315
+ text-align: left !important;
316
+ }
317
+
318
  .dataframe tbody td:nth-child(2) {
319
  color: #88C0D0 !important;
320
  white-space: nowrap !important;
321
  }
322
 
 
323
  .dataframe tbody td:nth-child(3) {
 
324
  color: #D08770 !important;
 
325
  }
326
 
 
327
  .dataframe tbody td:nth-child(4) {
 
328
  font-weight: 600 !important;
329
  color: #A3BE8C !important;
 
330
  }
331
 
 
332
  .dataframe tbody td:nth-child(n+5) {
 
 
 
333
  white-space: nowrap !important;
334
  }
335
 
 
454
  margin-bottom: 0.5rem;
455
  }
456
 
457
+
458
+ /* === New Comparison View === */
459
+ .comparison-container {
460
+ display: flex;
461
+ flex-direction: column;
462
+ gap: 1.5rem;
463
+ }
464
+
465
+ .comparison-summary {
466
+ background: #3B4252;
467
+ border: 1px solid #434C5E;
468
+ border-radius: 12px;
469
+ padding: 1.5rem;
470
+ }
471
+
472
+ .comparison-summary h2 {
473
+ margin: 0 0 1rem 0;
474
+ color: #ECEFF4;
475
+ font-size: 1.25rem;
476
+ }
477
+
478
+ .summary-cards {
479
+ display: flex;
480
+ gap: 1rem;
481
+ flex-wrap: wrap;
482
+ }
483
+
484
+ .summary-card {
485
+ flex: 1;
486
+ min-width: 200px;
487
+ background: #2E3440;
488
+ border-radius: 8px;
489
+ padding: 1rem;
490
+ }
491
+
492
+ .summary-card-header {
493
+ display: flex;
494
+ align-items: center;
495
+ gap: 0.5rem;
496
+ margin-bottom: 0.75rem;
497
+ }
498
+
499
+ .model-dot {
500
+ width: 10px;
501
+ height: 10px;
502
+ border-radius: 50%;
503
+ }
504
+
505
+ .model-name {
506
+ font-weight: 600;
507
+ color: #ECEFF4;
508
+ font-size: 0.9rem;
509
+ overflow: hidden;
510
+ text-overflow: ellipsis;
511
+ white-space: nowrap;
512
+ }
513
+
514
+ .summary-card-body {
515
+ display: flex;
516
+ flex-direction: column;
517
+ gap: 0.5rem;
518
+ }
519
+
520
+ .summary-stat {
521
+ display: flex;
522
+ justify-content: space-between;
523
+ align-items: center;
524
+ }
525
+
526
+ .summary-stat .stat-label {
527
+ font-size: 0.75rem;
528
+ color: #D8DEE9;
529
+ text-transform: uppercase;
530
+ letter-spacing: 0.05em;
531
+ }
532
+
533
+ .summary-stat .stat-value {
534
+ font-family: 'JetBrains Mono', monospace;
535
+ color: #8FBCBB;
536
+ }
537
+
538
+ .summary-stat.primary .stat-value.large {
539
+ font-size: 1.5rem;
540
+ font-weight: 700;
541
+ color: #A3BE8C;
542
+ }
543
+
544
+ .leaderboard-comparison-card {
545
+ background: #3B4252;
546
+ border: 1px solid #434C5E;
547
+ border-radius: 12px;
548
+ overflow: hidden;
549
+ }
550
+
551
+ .lb-card-header {
552
+ background: #434C5E;
553
+ padding: 0.875rem 1.25rem;
554
+ }
555
+
556
+ .lb-card-header h3 {
557
+ margin: 0;
558
+ color: #88C0D0;
559
+ font-size: 1rem;
560
+ font-weight: 600;
561
+ }
562
+
563
+ .lb-card-body {
564
+ padding: 1rem 1.25rem;
565
+ display: flex;
566
+ flex-direction: column;
567
+ gap: 0.75rem;
568
+ }
569
+
570
+ .metric-comparison {
571
+ display: flex;
572
+ flex-direction: column;
573
+ gap: 0.375rem;
574
+ }
575
+
576
+ .metric-name-row {
577
+ margin-bottom: 0.25rem;
578
+ }
579
+
580
+ .metric-title {
581
+ font-size: 0.85rem;
582
+ font-weight: 600;
583
+ color: #ECEFF4;
584
+ }
585
+
586
+ .metric-title.sub {
587
+ font-size: 0.75rem;
588
+ font-weight: 500;
589
+ color: #D8DEE9;
590
+ }
591
+
592
+ .model-score-row {
593
+ display: flex;
594
+ align-items: center;
595
+ gap: 0.5rem;
596
+ padding: 0.375rem 0;
597
+ }
598
+
599
+ .model-score-row.compact {
600
+ padding: 0.25rem 0;
601
+ }
602
+
603
+ .model-score-row.best-score {
604
+ background: rgba(163, 190, 140, 0.1);
605
+ border-radius: 4px;
606
+ padding-left: 0.5rem;
607
+ margin-left: -0.5rem;
608
+ }
609
+
610
+ .model-score-row.no-data {
611
+ opacity: 0.5;
612
+ }
613
+
614
+ .model-indicator {
615
+ width: 8px;
616
+ height: 8px;
617
+ border-radius: 2px;
618
+ flex-shrink: 0;
619
+ }
620
+
621
+ .model-indicator.small {
622
+ width: 6px;
623
+ height: 6px;
624
+ }
625
+
626
+ .score-bar-container {
627
+ flex: 1;
628
+ display: flex;
629
+ align-items: center;
630
+ gap: 0.75rem;
631
+ height: 24px;
632
+ background: #2E3440;
633
+ border-radius: 4px;
634
+ padding: 0 0.5rem;
635
+ position: relative;
636
+ }
637
+
638
+ .score-bar {
639
+ position: absolute;
640
+ left: 0;
641
+ top: 0;
642
+ bottom: 0;
643
+ border-radius: 4px;
644
+ opacity: 0.3;
645
+ }
646
+
647
+ .score-bar.thin {
648
+ opacity: 0.2;
649
+ }
650
+
651
+ .score-value {
652
+ position: relative;
653
+ font-family: 'JetBrains Mono', monospace;
654
+ font-size: 0.9rem;
655
+ font-weight: 600;
656
+ color: #ECEFF4;
657
+ z-index: 1;
658
+ }
659
+
660
/* Smaller variant of the score readout. */
.score-value.small {
    font-size: 0.8rem;
    font-weight: 500;
}

/* Muted / de-emphasized score readout. */
.score-value.dim {
    color: #4C566A;
}

/* === Selected Models Chips === */
/* Each checkbox label is styled as a removable pill/chip. */
.selected-models-group label {
    display: inline-flex !important;
    align-items: center !important;
    background: #434C5E;
    border: 1px solid #4C566A;
    border-radius: 16px;
    padding: 0.35rem 0.85rem;
    font-size: 0.85rem;
    color: #ECEFF4;
    gap: 0.4rem;
    cursor: pointer;
    margin: 0.15rem 0.3rem 0.15rem 0 !important;
}

/* "Γ—" remove hint; kept invisible until the chip is hovered (rule below). */
.selected-models-group label span::before {
    content: "Γ—";
    font-size: 0.75rem;
    color: #EBCB8B;
    opacity: 0;
    transition: opacity 0.15s ease;
}

.selected-models-group label:hover span::before {
    opacity: 1;
}

/* Hide the native checkbox; the chip itself acts as the control. */
.selected-models-group input[type="checkbox"] {
    display: none;
}

/* === Heat Map Table === */
/* Wrapper allows wide comparison tables to scroll horizontally. */
.heatmap-table-wrapper {
    overflow-x: auto;
    margin-top: 1rem;
}

.heatmap-table {
    width: 100%;
    border-collapse: collapse;
    font-size: 0.85rem;
}

/* Keep the header row pinned while the table body scrolls. */
.heatmap-table thead {
    position: sticky;
    top: 0;
    z-index: 10;
}

.heatmap-table th {
    background: #434C5E;
    padding: 0.625rem 0.75rem;
    font-weight: 600;
    font-size: 0.7rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    color: #81A1C1;
    text-align: left;
    border-bottom: 2px solid #4C566A;
    white-space: nowrap;
}

.heatmap-table th.metric-header {
    min-width: 120px;
}

/* Model column headers; overly long names are clipped with an ellipsis. */
.heatmap-table th.model-header {
    text-align: center;
    max-width: 150px;
    overflow: hidden;
    text-overflow: ellipsis;
}

.heatmap-table td {
    padding: 0.5rem 0.75rem;
    border-bottom: 1px solid #3B4252;
}

.heatmap-table td.metric-name {
    font-weight: 500;
    color: #D8DEE9;
    background: #2E3440;
}

.heatmap-table td.score-cell {
    text-align: center;
    font-family: 'JetBrains Mono', monospace;
    font-weight: 500;
    transition: all 0.15s ease;
}

/* Heat-map buckets, best -> worst (green -> red); `na` marks missing scores. */
.heatmap-table td.score-cell.best {
    background: rgba(163, 190, 140, 0.25);
    color: #A3BE8C;
    font-weight: 700;
}

.heatmap-table td.score-cell.good {
    background: rgba(163, 190, 140, 0.12);
    color: #A3BE8C;
}

.heatmap-table td.score-cell.mid {
    background: rgba(235, 203, 139, 0.12);
    color: #EBCB8B;
}

.heatmap-table td.score-cell.low {
    background: rgba(208, 135, 112, 0.12);
    color: #D08770;
}

.heatmap-table td.score-cell.worst {
    background: rgba(191, 97, 106, 0.15);
    color: #BF616A;
}

.heatmap-table td.score-cell.na {
    color: #4C566A;
    font-style: italic;
}

/* "Average" summary row, tinted to stand apart from the metric rows. */
.heatmap-table tr.avg-row {
    background: rgba(136, 192, 208, 0.08);
}

.heatmap-table tr.avg-row td.metric-name {
    font-weight: 700;
    color: #88C0D0;
    background: rgba(136, 192, 208, 0.08);
}
800
+
801
  /* === Buttons === */
802
  button {
803
  border-radius: 8px !important;
 
1142
 
1143
  scores = [v for v in results.values() if v is not None]
1144
  avg = sum(scores) / len(scores) if scores else None
1145
+ avg_str = f"{avg:.2f}" if avg else "β€”"
1146
 
1147
  html += f"""
1148
  <div class="leaderboard-section">
 
1156
  sorted_results = sorted(results.items(), key=lambda x: x[1] if x[1] is not None else 0, reverse=True)
1157
 
1158
  for i, (metric_name, score) in enumerate(sorted_results):
1159
+ score_display = f"{score:.2f}" if score is not None else "β€”"
1160
  highlight_class = "highlight" if i == 0 else ""
1161
 
1162
  html += f"""
 
1170
 
1171
  html += "</div>"
1172
  return html
1173
+
1174
+
1175
def format_model_comparison(selected_models, all_results):
    """Render a side-by-side HTML comparison of several models.

    Builds one summary card per selected model (developer + overall average
    score) followed by one heat-map table per leaderboard, in which every
    cell is colour-bucketed by its score relative to the other models.

    Args:
        selected_models: Ordered list of model names to compare; order is
            preserved in summary cards and table columns.
        all_results: Mapping of model name -> {leaderboard name ->
            {"developer": str, "results": {metric name -> float | None}}}.

    Returns:
        An HTML string. A "no results" placeholder is returned when nothing
        is selected or none of the selected models appear in ``all_results``.
    """
    # Local import: the module-level name ``html`` is shadowed below by the
    # string output buffer, so bring in the escaper under its own name.
    # Escaping fixes markup breakage / HTML injection when a model name,
    # developer, leaderboard, or metric contains <, &, or ".
    from html import escape

    if not selected_models or not all_results:
        return """
        <div class="no-results">
            <h3>Select models to compare</h3>
            <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
        </div>
        """

    # Collect data for each selected model and the union of their leaderboards.
    all_leaderboards = set()
    model_data_dict = {}
    for model_name in selected_models:
        if model_name in all_results:
            model_data_dict[model_name] = all_results[model_name]
            all_leaderboards.update(all_results[model_name].keys())

    if not model_data_dict:
        return """
        <div class="no-results">
            <h3>No data found for selected models</h3>
            <p>Try selecting different models</p>
        </div>
        """

    all_leaderboards = sorted(all_leaderboards)
    model_colors = ['#88C0D0', '#A3BE8C', '#EBCB8B', '#D08770', '#B48EAD', '#8FBCBB', '#81A1C1', '#BF616A']

    # Overall average per model across every leaderboard (None = no scores).
    overall_avgs = {}
    for model_name in selected_models:
        if model_name in model_data_dict:
            all_scores = []
            for lb_data in model_data_dict[model_name].values():
                all_scores.extend(v for v in lb_data.get("results", {}).values() if v is not None)
            overall_avgs[model_name] = sum(all_scores) / len(all_scores) if all_scores else None

    html = """
    <div class="comparison-container">
        <div class="comparison-summary">
            <h2>Model Comparison</h2>
            <div class="summary-cards">
    """

    # One summary card per model, colour-coded to match the heat-map columns.
    for i, model_name in enumerate(selected_models):
        color = model_colors[i % len(model_colors)]
        avg = overall_avgs.get(model_name)
        # NOTE: compare against None so a legitimate 0.0 average still renders.
        avg_str = f"{avg:.2f}" if avg is not None else "β€”"

        model_info = list(model_data_dict.get(model_name, {}).values())
        developer = model_info[0].get("developer", "Unknown") if model_info else "Unknown"

        safe_name = escape(str(model_name))
        safe_dev = escape(str(developer))

        html += f"""
            <div class="summary-card" style="border-left: 4px solid {color};">
                <div class="summary-card-header">
                    <span class="model-dot" style="background: {color};"></span>
                    <span class="model-name">{safe_name}</span>
                </div>
                <div class="summary-card-body">
                    <div class="summary-stat">
                        <span class="stat-label">Developer</span>
                        <span class="stat-value">{safe_dev}</span>
                    </div>
                    <div class="summary-stat primary">
                        <span class="stat-label">Overall Avg</span>
                        <span class="stat-value large">{avg_str}</span>
                    </div>
                </div>
            </div>
        """

    html += """
            </div>
        </div>
    """

    # One heat-map card per leaderboard.
    for leaderboard_name in all_leaderboards:
        # Union of metrics any selected model reports on this leaderboard.
        leaderboard_metrics = set()
        for model_data in model_data_dict.values():
            if leaderboard_name in model_data:
                leaderboard_metrics.update(model_data[leaderboard_name].get("results", {}).keys())

        leaderboard_metrics = sorted(leaderboard_metrics)
        if not leaderboard_metrics:
            continue

        # Per-model average on this leaderboard (drives the "Average" row).
        model_avgs = {}
        for model_name in selected_models:
            if model_name in model_data_dict and leaderboard_name in model_data_dict[model_name]:
                results = model_data_dict[model_name][leaderboard_name].get("results", {})
                scores = [v for v in results.values() if v is not None]
                model_avgs[model_name] = sum(scores) / len(scores) if scores else None

        html += f"""
        <div class="leaderboard-comparison-card">
            <div class="lb-card-header">
                <h3>{escape(str(leaderboard_name))}</h3>
            </div>
            <div class="lb-card-body">
        """

        html += '<div class="heatmap-table-wrapper">'
        html += '<table class="heatmap-table">'

        # Header row: one column per model; long names are truncated but the
        # full (escaped) name is kept in the tooltip.
        html += '<thead><tr><th class="metric-header">Metric</th>'
        for model_name in selected_models:
            short_name = model_name if len(model_name) <= 20 else model_name[:18] + "…"
            html += f'<th class="model-header" title="{escape(str(model_name))}">{escape(str(short_name))}</th>'
        html += '</tr></thead>'

        html += '<tbody>'

        # "Average" summary row first; the best average is highlighted only
        # when there is more than one model with a score to compare.
        html += '<tr class="avg-row"><td class="metric-name">Average</td>'
        valid_avgs_list = [model_avgs.get(m) for m in selected_models if model_avgs.get(m) is not None]
        max_avg_val = max(valid_avgs_list) if valid_avgs_list else None

        for model_name in selected_models:
            avg = model_avgs.get(model_name)
            if avg is not None:
                cell_class = "best" if avg == max_avg_val and len(valid_avgs_list) > 1 else ""
                html += f'<td class="score-cell {cell_class}">{avg:.2f}</td>'
            else:
                html += '<td class="score-cell na">β€”</td>'
        html += '</tr>'

        # One row per metric, colour-bucketed relative to the row's range.
        for metric_name in leaderboard_metrics:
            html += f'<tr><td class="metric-name">{escape(str(metric_name))}</td>'

            metric_scores = {}
            for model_name in selected_models:
                if model_name in model_data_dict and leaderboard_name in model_data_dict[model_name]:
                    results = model_data_dict[model_name][leaderboard_name].get("results", {})
                    metric_scores[model_name] = results.get(metric_name)

            valid_scores = [v for v in metric_scores.values() if v is not None]
            if valid_scores:
                max_score = max(valid_scores)
                min_score = min(valid_scores)
                # Guard against a zero range when all models are tied.
                score_range = max_score - min_score if max_score > min_score else 1
            else:
                max_score = min_score = score_range = None

            for model_name in selected_models:
                score = metric_scores.get(model_name)
                if score is not None and score_range is not None:
                    cell_class = _heat_bucket(score, valid_scores, min_score, max_score, score_range)
                    html += f'<td class="score-cell {cell_class}">{score:.2f}</td>'
                else:
                    html += '<td class="score-cell na">β€”</td>'

            html += '</tr>'

        html += '</tbody></table></div>'

        html += """
            </div>
        </div>
        """

    html += "</div>"
    return html


def _heat_bucket(score, valid_scores, min_score, max_score, score_range):
    """Return the heat-map CSS class for *score* relative to its metric row.

    Buckets by the score's relative position within [min_score, max_score]:
    best / good / mid / low / worst. Returns "" when there is at most one
    comparable score, so a lone value is rendered without colouring.
    """
    if len(valid_scores) <= 1:
        return ""
    if score == max_score:
        return "best"
    pct = (score - min_score) / score_range if score_range > 0 else 1
    if pct >= 0.75:
        return "good"
    if pct >= 0.5:
        return "mid"
    if pct >= 0.25:
        return "low"
    return "worst"