import re
import random

import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from huggingface_hub import snapshot_download

from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
from src.display.css_html_js import custom_css
from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval

def calculate_prompt_sensitivity(dataframe, tasks, prompt_ids):
    """
    Computes a simple Prompt Sensitivity Index (PSI) over the tasks
    using the distribution of 'Best Prompt Id' across the provided prompt_ids.
    """
    cv_per_task = []
    for task in tasks:
        prompt_col = f"{task} Best Prompt Id"
        task_accuracies = []
        for pid in prompt_ids:
            total = len(dataframe[prompt_col].dropna()) if prompt_col in dataframe.columns else 0
            count = (dataframe[prompt_col] == pid).sum() if prompt_col in dataframe.columns else 0
            acc = (count / total * 100) if total > 0 else 0
            task_accuracies.append(acc)
        if task_accuracies:
            mean_acc = np.mean(task_accuracies)
            std_acc = np.std(task_accuracies)
            cv_per_task.append((std_acc / mean_acc) if mean_acc > 0 else 0)
        else:
            cv_per_task.append(0)
    mean_cv = np.mean(cv_per_task) if cv_per_task else 0
    psi = 1.0 if mean_cv >= 0.5 else (mean_cv / 0.5)
    return psi, mean_cv, cv_per_task

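# Illustrative sketch (not called anywhere in the app): how calculate_prompt_sensitivity
# behaves on a tiny, made-up frame. Column names follow the "<task> Best Prompt Id"
# convention used above; the data values are hypothetical.
def _example_prompt_sensitivity():
    toy = pd.DataFrame({
        "NER Best Prompt Id": [1, 1, 2, 3],   # prompt 1 wins for half the models
        "REL Best Prompt Id": [2, 2, 2, 2],   # prompt 2 always wins -> high CV for REL
    })
    psi, mean_cv, cv_per_task = calculate_prompt_sensitivity(toy, ["NER", "REL"], [1, 2, 3])
    # psi is capped at 1.0 once the mean coefficient of variation reaches 0.5
    return psi, mean_cv, cv_per_task
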
def create_best_model_comparison_table(dataframe, lang: str | None = None, shot: str | None = None):
    """
    Table with the best overall model per task and the model with the best prompt score.
    Applies optional filters:
    - lang in {EN, IT, SL, SK, GR, PL} or None/"All"
    - shot in {"0", "10"} or None/"All" (mapped to IS_FS False/True)
    """
    tasks = ["NER", "REL", "RML-CRF", "HIS-CRF", "DIA-CRF", "NER-PHARMAER"]
    df = dataframe.copy()

    if lang and lang != "All" and "LANG" in df.columns:
        df = df[df["LANG"] == lang]
    if shot and shot != "All" and "IS_FS" in df.columns:
        df = df[df["IS_FS"] == (shot == "10")]

    table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []}

    for task in tasks:
        if task not in df.columns or df.empty:
            continue

        max_idx = pd.to_numeric(df[task], errors='coerce').idxmax()
        try:
            model_raw = df.loc[max_idx, 'Model']
        except Exception:
            # idxmax yields NaN when the column has no numeric values; skip this task.
            continue

        if isinstance(model_raw, str) and '<' in model_raw:
            match = re.search(r'>([^<]+)<', model_raw)
            model_name = match.group(1) if match else model_raw
        else:
            model_name = str(model_raw)
        comb_perf_value = df.loc[max_idx, task]

        best_prompt_column = f"{task} Best Prompt"
        if best_prompt_column in df.columns and df[best_prompt_column].notna().any():
            best_prompt_idx = pd.to_numeric(df[best_prompt_column], errors='coerce').idxmax()
            try:
                best_prompt_model_raw = df.loc[best_prompt_idx, 'Model']
            except Exception:
                continue
            if isinstance(best_prompt_model_raw, str) and '<' in best_prompt_model_raw:
                match = re.search(r'>([^<]+)<', best_prompt_model_raw)
                best_prompt_model = match.group(1) if match else best_prompt_model_raw
            else:
                best_prompt_model = str(best_prompt_model_raw)
            best_prompt_accuracy = df.loc[best_prompt_idx, best_prompt_column]
        else:
            best_prompt_model = "n/a"
            best_prompt_accuracy = float('nan')

        table_data['Task'].append(task)
        table_data['Best Overall Model'].append(model_name)
        table_data['CPS'].append(f"{comb_perf_value:.2f}")
        table_data['Best Prompt Model'].append(best_prompt_model)
        table_data['Acc.'].append(f"{best_prompt_accuracy:.2f}" if isinstance(best_prompt_accuracy, (int, float)) else "n/a")

    fig = go.Figure(data=[go.Table(
        columnwidth=[60, 220, 60, 220, 60],
        header=dict(
            values=[f'<b>{col}</b>' for col in table_data.keys()],
            fill_color=['#2171b5', '#2171b5', '#2171b5', '#4292c6', '#4292c6'],
            font=dict(color='white', size=12, family='Arial'),
            align='center', height=30
        ),
        cells=dict(
            values=list(table_data.values()),
            fill_color=[['#f0f0f0' if i % 2 == 0 else 'white' for i in range(len(table_data['Task']))]],
            font=dict(color='#2c3e50', size=11, family='Arial'),
            align=['center', 'left', 'center', 'left', 'center'],
            height=30
        )
    )])

    subtitle = []
    subtitle.append(lang if (lang and lang != "All") else "All languages")
    subtitle.append(f"{shot}-shot" if (shot and shot != "All") else "All shots")

    fig.update_layout(
        title={'text': f"Top Model per Task: CPS & Best Prompt - {', '.join(subtitle)}",
               'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
        font=dict(family="Arial", size=11),
        height=420, margin=dict(l=20, r=20, t=50, b=80)
    )
    return fig

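# Minimal sketch of the model-name extraction used above: the "Model" column stores an
# HTML anchor (as rendered in the leaderboard), and the regex pulls out the visible text.
# The anchor below is a made-up example, not a real leaderboard entry; the helper is not
# called anywhere in the app.
def _example_model_name_extraction():
    model_raw = '<a href="https://example.org/some-model" target="_blank">some-model-7B</a>'
    match = re.search(r'>([^<]+)<', model_raw)
    return match.group(1) if match else model_raw   # -> "some-model-7B"
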
def create_best_model_comparison_table_without_lang(dataframe):
    """
    Table with the best overall model per task and the model that
    achieves the best score with its own best prompt.
    """
    tasks = ["NER", "REL", "RML-CRF", "HIS-CRF", "DIA-CRF", "NER-PHARMAER"]
    table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []}

    for task in tasks:
        if task not in dataframe.columns:
            continue

        max_idx = dataframe[task].idxmax()
        model_raw = dataframe.loc[max_idx, 'Model']
        if isinstance(model_raw, str) and '<' in model_raw:
            match = re.search(r'>([^<]+)<', model_raw)
            model_name = match.group(1) if match else model_raw
        else:
            model_name = str(model_raw)
        comb_perf_value = dataframe.loc[max_idx, task]

        best_prompt_column = f"{task} Best Prompt"
        if best_prompt_column in dataframe.columns:
            best_prompt_idx = dataframe[best_prompt_column].idxmax()
            best_prompt_model_raw = dataframe.loc[best_prompt_idx, 'Model']
            if isinstance(best_prompt_model_raw, str) and '<' in best_prompt_model_raw:
                match = re.search(r'>([^<]+)<', best_prompt_model_raw)
                best_prompt_model = match.group(1) if match else best_prompt_model_raw
            else:
                best_prompt_model = str(best_prompt_model_raw)
            best_prompt_accuracy = dataframe.loc[best_prompt_idx, best_prompt_column]
        else:
            best_prompt_model = "n/a"
            best_prompt_accuracy = float('nan')

        table_data['Task'].append(task)
        table_data['Best Overall Model'].append(model_name)
        table_data['CPS'].append(f"{comb_perf_value:.2f}")
        table_data['Best Prompt Model'].append(best_prompt_model)
        table_data['Acc.'].append(f"{best_prompt_accuracy:.2f}" if isinstance(best_prompt_accuracy, (int, float)) else "n/a")

    fig = go.Figure(data=[go.Table(
        columnwidth=[60, 220, 60, 220, 60],
        header=dict(
            values=[f'<b>{col}</b>' for col in table_data.keys()],
            fill_color=['#2171b5', '#2171b5', '#2171b5', '#4292c6', '#4292c6'],
            font=dict(color='white', size=12, family='Arial'),
            align='center', height=30
        ),
        cells=dict(
            values=list(table_data.values()),
            fill_color=[['#f0f0f0' if i % 2 == 0 else 'white' for i in range(len(table_data['Task']))]],
            font=dict(color='#2c3e50', size=11, family='Arial'),
            align=['center', 'left', 'center', 'left', 'center'],
            height=30
        )
    )])
    fig.update_layout(
        title={'text': "Top Model per Task: CPS & Best Prompt",
               'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
        font=dict(family="Arial", size=11),
        height=420, margin=dict(l=20, r=20, t=50, b=80)
    )
    fig.add_annotation(
        text=("Best Overall Model uses the task's primary metric (CPS). "
              "Best Prompt Model is the one whose own best prompt yields the highest score."),
        xref="paper", yref="paper", x=0.5, y=-0.20, showarrow=False,
        font=dict(size=11, color="gray", family="Arial"), align="center", xanchor="center"
    )
    return fig

def create_prompt_heatmap(dataframe, lang: str | None = None, shot: str | None = None):
    """
    Heatmap of the share (%) of models whose BEST prompt is each prompt id, for every task,
    with prompts p1..p3.
    Optional filters:
    - lang: None or one of EN/IT/SL/SK/GR/PL (None means All)
    - shot: None or "0"/"10" (None means All), mapped to IS_FS False/True
    """
    tasks = ["NER", "REL", "RML-CRF", "HIS-CRF", "DIA-CRF", "NER-PHARMAER"]

    df = dataframe.copy()

    if lang and lang != "All" and "LANG" in df.columns:
        df = df[df["LANG"] == lang]

    if shot and shot != "All" and "IS_FS" in df.columns:
        df = df[df["IS_FS"] == (shot == "10")]

    def label_for(pid):
        if isinstance(pid, str): return pid
        try: return f"p{int(pid)}"
        except Exception: return str(pid)

    all_ids = set()
    for task in tasks:
        col = f"{task} Best Prompt Id"
        if col in df.columns:
            all_ids.update(df[col].dropna().unique())
    prompt_ids_raw = sorted(list(all_ids), key=lambda x: int(re.sub(r'[^0-9]', '', str(x)) or 0))
    prompt_ids_raw = [pid for pid in prompt_ids_raw if label_for(pid) in {"p1", "p2", "p3"}] or [1, 2, 3]
    y_tick_labels = [label_for(pid) for pid in prompt_ids_raw]

    matrix, hovers = [], []
    for pid in prompt_ids_raw:
        row, hover_row = [], []
        for task in tasks:
            col = f"{task} Best Prompt Id"
            if col in df.columns and len(df[col].dropna()) > 0:
                series = df[col].dropna()

                def same_pid(v):
                    a = re.sub(r'[^0-9]', '', str(v))
                    b = re.sub(r'[^0-9]', '', str(pid))
                    return a == b and a != ""

                total = len(series)
                count = sum(same_pid(v) for v in series)
                pct = (count / total * 100) if total > 0 else 0
                row.append(pct)
                hover_row.append(f"<b>{task} - {label_for(pid)}</b><br>Models: {count}/{total}<br>Percentage: {pct:.1f}%")
            else:
                row.append(0); hover_row.append(f"<b>{task} - {label_for(pid)}</b><br>No data")
        matrix.append(row); hovers.append(hover_row)

    fig = go.Figure(data=go.Heatmap(
        z=matrix, x=tasks, y=y_tick_labels,
        colorscale=[[0, '#f7fbff'], [0.2, '#deebf7'], [0.4, '#9ecae1'], [0.6, '#4292c6'], [0.8, '#2171b5'], [1, '#08519c']],
        text=[[f"{val:.0f}%" if val is not None else "" for val in row] for row in matrix],
        texttemplate="%{text}", textfont={"size": 11, "family": "Arial"},
        hovertemplate='%{customdata}<extra></extra>', customdata=hovers,
        colorbar=dict(title="% Models", ticksuffix="%"),
        zmin=0, zmax=100
    ))

    title_parts = []
    title_parts.append(lang if (lang and lang != "All") else "All languages")
    title_parts.append(f"{shot}-shot" if (shot and shot != "All") else "All shots")
    fig.update_layout(
        title={'text': f"Most Effective Prompts - {', '.join(title_parts)}",
               'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
        xaxis_title="Task", yaxis_title="Prompt",
        font=dict(family="Arial", size=11), margin=dict(b=100),
        template="plotly_white", dragmode=False, height=420
    )
    fig.update_xaxes(fixedrange=True); fig.update_yaxes(fixedrange=True)
    return fig

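# Minimal sketch of the prompt-id normalisation used in the heatmaps above: ids may be
# stored either as integers (1/2/3) or as strings ('p1'/'p2'/'p3'), so both sides are
# reduced to their digits before comparison. Purely illustrative, not called by the app.
def _example_same_pid():
    def same_pid(v, pid):
        a = re.sub(r'[^0-9]', '', str(v))
        b = re.sub(r'[^0-9]', '', str(pid))
        return a == b and a != ""
    return same_pid("p2", 2), same_pid(3, "p1")   # -> (True, False)
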
def create_prompt_heatmap_without_lang(dataframe):
    """
    Heatmap of the share of models (in %) whose BEST prompt for the task is each prompt id,
    for every task, with exactly 3 prompts (p1, p2, p3). It supports columns storing
    ids as integers (1/2/3) or strings ('p1'/'p2'/'p3').
    """
    tasks = ["NER", "REL", "RML-CRF", "HIS-CRF", "DIA-CRF", "NER-PHARMAER"]

    all_ids = set()
    for task in tasks:
        col = f"{task} Best Prompt Id"
        if col in dataframe.columns:
            all_ids.update(dataframe[col].dropna().unique())

    def label_for(pid):
        if isinstance(pid, str):
            return pid
        try:
            return f"p{int(pid)}"
        except Exception:
            return str(pid)

    prompt_ids_raw = sorted(list(all_ids), key=lambda x: int(re.sub(r'[^0-9]', '', str(x)) or 0))
    prompt_ids_raw = [pid for pid in prompt_ids_raw if label_for(pid) in {"p1", "p2", "p3"}]
    if not prompt_ids_raw:
        prompt_ids_raw = [1, 2, 3]

    y_tick_labels = [label_for(pid) for pid in prompt_ids_raw]

    matrix, hovers = [], []
    for pid in prompt_ids_raw:
        row, hover_row = [], []
        for task in tasks:
            col = f"{task} Best Prompt Id"
            if col in dataframe.columns:
                series = dataframe[col].dropna()

                def same_pid(v):
                    a = re.sub(r'[^0-9]', '', str(v))
                    b = re.sub(r'[^0-9]', '', str(pid))
                    return a == b and a != ""

                total = len(series)
                count = sum(same_pid(v) for v in series)
                pct = (count / total * 100) if total > 0 else 0
                row.append(pct)
                hover_row.append(
                    f"<b>{task} - {label_for(pid)}</b><br>Models: {count}/{total}<br>Percentage: {pct:.1f}%"
                )
            else:
                row.append(0); hover_row.append(f"<b>{task} - {label_for(pid)}</b><br>No data")
        matrix.append(row)
        hovers.append(hover_row)

    fig = go.Figure(data=go.Heatmap(
        z=matrix, x=tasks, y=y_tick_labels,
        colorscale=[[0, '#f7fbff'], [0.2, '#deebf7'], [0.4, '#9ecae1'], [0.6, '#4292c6'], [0.8, '#2171b5'], [1, '#08519c']],
        text=[[f"{val:.0f}%" if val is not None else "" for val in row] for row in matrix],
        texttemplate="%{text}",
        textfont={"size": 11, "family": "Arial"},
        hovertemplate='%{customdata}<extra></extra>',
        customdata=hovers,
        colorbar=dict(title="% Models", ticksuffix="%"),
        zmin=0, zmax=100
    ))
    fig.update_layout(
        title={'text': "Most Effective Prompts Across Models",
               'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}},
        xaxis_title="Task", yaxis_title="Prompt",
        font=dict(family="Arial", size=11),
        margin=dict(b=120), template="plotly_white", dragmode=False, height=420
    )

    psi, mean_cv, _ = calculate_prompt_sensitivity(
        dataframe, tasks, prompt_ids_raw
    )
    fig.add_annotation(
        text=f"Prompt Sensitivity (mean CV): {mean_cv:.2f}",
        xref="paper", yref="paper", x=0.3, y=1.12, showarrow=False,
        font=dict(size=11, color="#2c3e50", family="Arial")
    )

    fig.update_xaxes(fixedrange=True); fig.update_yaxes(fixedrange=True)
    return fig

def mean_of_max_per_field(df):
    """
    Compute the maximum score for each field and then the mean of those maxima.

    Args:
        df (pd.DataFrame): DataFrame with the columns NER, REL, RML-CRF, DIA-CRF, HIS-CRF, NER-PHARMAER

    Returns:
        float: mean of the per-field maximum values
    """
    fields = ["NER", "REL", "RML-CRF", "DIA-CRF", "HIS-CRF", "NER-PHARMAER"]

    missing = [f for f in fields if f not in df.columns]
    if missing:
        raise ValueError(f"The following columns are missing from the DataFrame: {missing}")

    max_values = df[fields].apply(pd.to_numeric, errors='coerce').max(skipna=True)
    mean_max = max_values.mean()

    return mean_max

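# Illustrative sketch of mean_of_max_per_field on made-up scores: the per-task maxima are
# 80 (NER) and 60 (REL), so the "theoretical best" combined performance is their mean, 70.0.
# The frame is hypothetical and only two tasks carry values; the remaining required columns
# are padded with NaN so the column check passes, and NaN maxima are skipped by .mean().
def _example_mean_of_max():
    toy = pd.DataFrame({
        "NER": [70, 80], "REL": [60, 50],
        "RML-CRF": [np.nan, np.nan], "DIA-CRF": [np.nan, np.nan],
        "HIS-CRF": [np.nan, np.nan], "NER-PHARMAER": [np.nan, np.nan],
    })
    return mean_of_max_per_field(toy)   # -> 70.0
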
def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
    if tasks is None:
        tasks = ["NER", "REL", "RML-CRF", "DIA-CRF", "HIS-CRF", "NER-PHARMAER"]

    task_means = {}

    for task in tasks:
        if task not in dataframe.columns:
            continue

        few_shot = dataframe[dataframe['IS_FS'] == True][["Model", task]]
        zero_shot = dataframe[dataframe['IS_FS'] == False][["Model", task]]

        merged = pd.merge(few_shot, zero_shot, on="Model", suffixes=("_few", "_zero"))
        merged = merged.dropna(subset=[f"{task}_few", f"{task}_zero"])

        if merged.empty:
            continue

        diff = merged[f"{task}_few"] - merged[f"{task}_zero"]
        task_means[task] = diff.mean()

    fig = go.Figure([go.Bar(
        x=list(task_means.keys()),
        y=list(task_means.values()),
        marker_color="#ff7f0e",
        text=[f"{v:.2f}" for v in task_means.values()],
        textposition="outside",
        hovertemplate="<b>%{x}</b><br>Mean Delta Accuracy: %{y:.2f}%<extra></extra>"
    )])

    # Optional dashed zero-reference line (currently disabled):
    # fig.add_shape(
    #     type="line",
    #     x0=-0.5, x1=len(task_means) - 0.5,
    #     y0=0, y1=0,
    #     line=dict(color="black", width=2, dash="dash"),
    #     xref="x", yref="y"
    # )

    fig.update_layout(
        title="Mean Accuracy Difference (Few-shot - Zero-shot) per Task",
        xaxis_title="",
        yaxis_title="Mean Delta Combined Performance",
        template="plotly_white",
        font=dict(family="Arial", size=13),
    )

    fig.add_annotation(
        text="10-shot learning generally outperforms zero-shot.",
        xref="paper", yref="paper",
        x=0, y=-0.2,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="left"
    )

    return fig

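# Minimal sketch of the few-shot-minus-zero-shot aggregation above, on hypothetical scores:
# each model's 10-shot and 0-shot rows are joined on "Model" and the per-model differences
# are averaged. Not called anywhere in the app.
def _example_few_minus_zero():
    toy = pd.DataFrame({
        "Model": ["m1", "m1", "m2", "m2"],
        "IS_FS": [True, False, True, False],
        "NER": [75.0, 70.0, 60.0, 58.0],
    })
    fig = barplot_mean_few_minus_zero_shot(toy, tasks=["NER"])
    return fig   # single bar at ((75-70) + (60-58)) / 2 = 3.5
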
def boxplot_per_task(dataframe=None, baselines=None, references=None):
    tasks = ["NER", "REL", "RML-CRF", "HIS-CRF", "DIA-CRF", "NER-PHARMAER"]
    if dataframe is None:
        np.random.seed(42)
        dataframe = pd.DataFrame({
            task: np.random.uniform(0.4, 0.9, 20) * 100
            for task in tasks
        })

    if baselines is None:
        baselines = {task: np.random.randint(50, 70) for task in tasks}

    colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
              "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]

    fig = go.Figure()

    for i, task in enumerate(tasks):
        if task in dataframe.columns:
            y_data = dataframe[task].dropna().tolist()

            fig.add_trace(go.Box(
                y=y_data,
                name=task,
                marker=dict(color=colors[i]),
                line=dict(color="black", width=2),
                fillcolor=colors[i],
                opacity=0.7,
                hovertemplate="<b>" + task + "</b><br>Accuracy: %{y:.2f}%<extra></extra>",
                width=0.6,
                whiskerwidth=0.2,
                quartilemethod="linear"
            ))

    fig.update_layout(
        title="Distribution of Model Accuracy by Task",
        xaxis_title="Task",
        yaxis_title="Combined Performance",
        template="plotly_white",
        boxmode="group",
        dragmode=False,
        font=dict(family="Arial", size=10),
        margin=dict(b=80),
    )

    fig.update_yaxes(range=[0, 100], fixedrange=True)

    return fig

BASELINES = {
    "TE": 71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
    "LS": 38.82, "SU": 38.91, "NER": 88.00, "REL": 62.99
}

REFERENCES = {
    "NER": 79.11,
    "REL": 63.32,
    "LS": 59.25,
    "SU": 33.04
}

def boxplot_prompts_per_task(dataframe, tasks=None):
    if tasks is None:
        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]

    # Remap prompt ids 1 and 2 to 7 and 8 for these columns; skip columns that are not present.
    cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
    for col in cols_to_update:
        if col in dataframe.columns:
            dataframe[col] = dataframe[col].replace({1: 7, 2: 8})

    fig = go.Figure()

    avg_x, avg_y = [], []
    best_x, best_y, best_text = [], [], []

    for task in tasks:
        avg_col = f"{task} Prompt Average"
        best_col = f"{task} Best Prompt"
        best_id_col = f"{task} Best Prompt Id"

        if all(col in dataframe.columns for col in [avg_col, best_col, best_id_col]):
            avg_value = dataframe[avg_col].mean()
            avg_x.append(task)
            avg_y.append(avg_value)

            best_value = dataframe[best_col].mean()
            best_x.append(task)
            best_y.append(best_value)
            best_id = dataframe[best_id_col].mode()[0]
            best_text.append(f"P:{best_id}")

    fig.add_trace(go.Bar(
        x=avg_x,
        y=avg_y,
        name="Avg. Accuracy",
        marker_color="#1f77b4",
    ))

    fig.add_trace(go.Bar(
        x=best_x,
        y=best_y,
        name="Best Prompt",
        marker_color="#d62728",
    ))

    for x, y, text in zip(best_x, best_y, best_text):
        fig.add_annotation(
            x=x,
            y=y + 3,
            text=text,
            showarrow=False,
            font=dict(size=12, color="black")
        )

    fig.update_layout(
        title="Prompt Accuracy: Avg vs Best",
        xaxis_title="Task",
        yaxis_title="Combined Performance",
        barmode='group',
        template="plotly_white",
        font=dict(family="Arial", size=10),
        yaxis=dict(range=[0, 100], fixedrange=True)
    )

    fig.add_annotation(
        text="There is no single prompt that performs best across all tasks.<br>"
             "Different prompts achieve the highest accuracy on different tasks.",
        xref="paper", yref="paper",
        x=0.5, y=-0.3,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="center",
        xanchor="center"
    )

    return fig

def line_chart(dataframe):

    def scale_sizes(values, min_size=8, max_size=30):
        vmin, vmax = min(values), max(values)
        return [
            min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin else (min_size + max_size) / 2
            for val in values
        ]

    def model_label(m):
        # The "Model" column stores an HTML anchor; fall back to the raw value otherwise.
        match = re.search(r'>([^<]+)<', m) if isinstance(m, str) else None
        return match.group(1) if match else str(m)

    df_true = dataframe[dataframe['IS_FS'] == True]
    df_false = dataframe[dataframe['IS_FS'] == False]

    x_true = df_true['#Params (B)'].tolist()
    y_true = df_true['Avg. Comb. Perf. ⬆️'].tolist()
    labels_true = [model_label(m) for m in df_true['Model'].tolist()]

    x_false = df_false['#Params (B)'].tolist()
    y_false = df_false['Avg. Comb. Perf. ⬆️'].tolist()
    labels_false = [model_label(m) for m in df_false['Model'].tolist()]

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=x_true,
        y=y_true,
        mode='markers',
        name='10-Shot',
        marker=dict(
            color='blue',
            size=scale_sizes(x_true)
        ),
        hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
        customdata=labels_true
    ))

    fig.add_trace(go.Scatter(
        x=x_false,
        y=y_false,
        mode='markers',
        name='0-Shot',
        marker=dict(
            color='red',
            size=scale_sizes(x_false)
        ),
        hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>',
        customdata=labels_false
    ))

    all_y = y_true + y_false
    all_x = x_true + x_false
    all_labels = labels_true + labels_false
    max_idx = all_y.index(max(all_y))
    max_x = all_x[max_idx]
    max_y = all_y[max_idx]
    max_label = all_labels[max_idx]

    fig.add_annotation(
        x=max_x,
        y=max_y,
        text=f"{max_label}",
        showarrow=True,
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="black",
        font=dict(size=11, color="black"),
        xshift=10,
        yshift=10,
        ax=-30, ay=-20,
        xanchor="right"
    )

    fig.update_layout(
        title="Avg. Combined Performance vs #Params",
        xaxis_title="#Params (B)",
        yaxis_title="Avg. Combined Performance",
        template="plotly_white",
        hovermode="closest",
        font=dict(family="Arial", size=10),
        dragmode=False,
        xaxis=dict(
            tickvals=[0, 25, 50, 75, 100, 125],
            ticktext=["0", "25", "50", "75", "100", "125"]
        ),
        yaxis=dict(
            tickvals=[0, 20, 40, 60, 80, 100],
            range=[0, 100]
        )
    )

    fig.add_annotation(
        text="Accuracy generally rises with #Params, but smaller models <br>"
             "with 10-shot can outperform larger zero-shot models.",
        xref="paper", yref="paper",
        x=0.5, y=-0.3,
        showarrow=False,
        font=dict(size=11, color="gray"),
        align="center",
        xanchor="center"
    )

    fig.update_xaxes(fixedrange=True, rangeslider_visible=False)
    fig.update_yaxes(fixedrange=True)

    return fig

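# Minimal sketch of the marker scaling used in line_chart: parameter counts are mapped
# linearly onto the [8, 30] pixel range, and a degenerate input (all values equal) falls
# back to the midpoint. The values below are hypothetical; the helper is never called.
def _example_scale_sizes():
    def scale_sizes(values, min_size=8, max_size=30):
        vmin, vmax = min(values), max(values)
        return [
            min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin else (min_size + max_size) / 2
            for val in values
        ]
    return scale_sizes([7, 13, 70])   # -> [8.0, ~10.1, 30.0]
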
TASK_METADATA_MULTIPLECHOICE = {}

TASK_METADATA_GENERATIVE = {
    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
    "RML-CRF": {"icon": "📋", "name": "CRF RML", "tooltip": "CRF RML"},
    "DIA-CRF": {"icon": "🏥", "name": "CRF Diagnosis", "tooltip": "CRF Diagnosis"},
    "HIS-CRF": {"icon": "📜", "name": "CRF History", "tooltip": "CRF History"},
    "NER-PHARMAER": {"icon": "🏷️", "name": "Named Entity Recognition over the PharmaER.It Dataset", "tooltip": ""},
}

def restart_space():
    """Restart the Hugging Face Space."""
    API.restart_space(repo_id=REPO_ID)

def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """
    Initialize and return the leaderboard when it is first loaded or when 'benchmark' is selected.
    The table is sorted based on the "Avg. Combined Performance" field.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    sorted_dataframe = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False)
    sorted_dataframe = sorted_dataframe.reset_index(drop=True)
    sorted_dataframe["Rank"] = sorted_dataframe.index + 1

    # Award one medal per size class, separately for few-shot and zero-shot runs.
    large_medal_fs_assigned = False
    medium_medal_fs_assigned = False
    small_medal_fs_assigned = False

    large_medal_0shot_assigned = False
    medium_medal_0shot_assigned = False
    small_medal_0shot_assigned = False

    new_model_column = []

    for _, row in sorted_dataframe.iterrows():
        if row['IS_FS']:
            if row["Size"] == "🔵🔵🔵" and not large_medal_fs_assigned:
                new_model_column.append(f"{row['Model']} 🔵🔵🔵🏆")
                large_medal_fs_assigned = True
            elif row["Size"] == "🔵🔵" and not medium_medal_fs_assigned:
                new_model_column.append(f"{row['Model']} 🔵🔵🏆")
                medium_medal_fs_assigned = True
            elif row["Size"] == "🔵" and not small_medal_fs_assigned:
                new_model_column.append(f"{row['Model']} 🔵🏆")
                small_medal_fs_assigned = True
            else:
                new_model_column.append(row["Model"])
        else:
            if row["Size"] == "🔵🔵🔵" and not large_medal_0shot_assigned:
                new_model_column.append(f"{row['Model']} 🔵🔵🔵🎖️")
                large_medal_0shot_assigned = True
            elif row["Size"] == "🔵🔵" and not medium_medal_0shot_assigned:
                new_model_column.append(f"{row['Model']} 🔵🔵🎖️")
                medium_medal_0shot_assigned = True
            elif row["Size"] == "🔵" and not small_medal_0shot_assigned:
                new_model_column.append(f"{row['Model']} 🔵🎖️")
                small_medal_0shot_assigned = True
            else:
                new_model_column.append(row["Model"])

    sorted_dataframe["Model"] = new_model_column

    field_list = fields(AutoEvalColumn)

    return Leaderboard(
        value=sorted_dataframe,
        datatype=[c.type for c in field_list],
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS): "),
            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages: "),
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100], label="Select the number of parameters (B)"),
        ],
        bool_checkboxgroup_label="Evaluation Mode",
        interactive=False,
    )

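# Illustrative sketch of the medal pass above, expressed more compactly than the loop the
# app actually uses: after sorting by score, only the first (best) row of each Size class
# gets a marker appended. The frame, scores and marker are hypothetical; not called anywhere.
def _example_medal_pass():
    toy = pd.DataFrame({
        "Model": ["a", "b", "c"],
        "Size": ["🔵", "🔵", "🔵🔵"],
        "Score": [90, 80, 70],
    }).sort_values("Score", ascending=False)
    first_in_class = ~toy["Size"].duplicated()
    toy.loc[first_in_class, "Model"] = toy.loc[first_in_class, "Model"] + " 🏆"
    return toy["Model"].tolist()   # -> ["a 🏆", "b", "c 🏆"]
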
def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """
    Update and return the leaderboard when a specific task is selected.
    The table is sorted based on the "Combined Performance" field.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Drop rows whose Combined Performance is missing or zero before ranking.
    clean_df = dataframe.assign(
        **{"Combined Performance": pd.to_numeric(dataframe["Combined Performance"], errors="coerce")}
    ).loc[lambda df: df["Combined Performance"].notna() & (df["Combined Performance"] != 0)]

    sorted_dataframe = clean_df.sort_values(by="Combined Performance", ascending=False)
    sorted_dataframe = sorted_dataframe.reset_index(drop=True)
    sorted_dataframe["Rank"] = sorted_dataframe.index + 1

    # Award one medal per size class, separately for few-shot and zero-shot runs.
    large_medal_fs_assigned = False
    medium_medal_fs_assigned = False
    small_medal_fs_assigned = False

    large_medal_0shot_assigned = False
    medium_medal_0shot_assigned = False
    small_medal_0shot_assigned = False

    new_model_column = []

    for _, row in sorted_dataframe.iterrows():
        if row['IS_FS']:
            if row["Size"] == "🔵🔵🔵" and not large_medal_fs_assigned:
                new_model_column.append(f"{row['Model']} 🔵🔵🔵🏆")
                large_medal_fs_assigned = True
            elif row["Size"] == "🔵🔵" and not medium_medal_fs_assigned:
                new_model_column.append(f"{row['Model']} 🔵🔵🏆")
                medium_medal_fs_assigned = True
            elif row["Size"] == "🔵" and not small_medal_fs_assigned:
                new_model_column.append(f"{row['Model']} 🔵🏆")
                small_medal_fs_assigned = True
            else:
                new_model_column.append(row["Model"])
        else:
            if row["Size"] == "🔵🔵🔵" and not large_medal_0shot_assigned:
                new_model_column.append(f"{row['Model']} 🔵🔵🔵🎖️")
                large_medal_0shot_assigned = True
            elif row["Size"] == "🔵🔵" and not medium_medal_0shot_assigned:
                new_model_column.append(f"{row['Model']} 🔵🔵🎖️")
                medium_medal_0shot_assigned = True
            elif row["Size"] == "🔵" and not small_medal_0shot_assigned:
                new_model_column.append(f"{row['Model']} 🔵🎖️")
                small_medal_0shot_assigned = True
            else:
                new_model_column.append(row["Model"])

    sorted_dataframe["Model"] = new_model_column

    pd.set_option('display.max_colwidth', None)

    field_list = fields(AutoEvalColumn)

    return Leaderboard(
        value=sorted_dataframe,
        datatype=[c.type for c in field_list] + [int],
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS): "),
            ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages: "),
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100],
                         label="Select the number of parameters (B)"),
        ],
        bool_checkboxgroup_label="Evaluation Mode",
        interactive=False
    )

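# Minimal sketch of the column renaming that the per-task tabs (defined further below)
# perform before calling update_task_leaderboard: task-prefixed columns are mapped onto the
# generic names the function expects. The frame and values are hypothetical; not called.
def _example_task_rename():
    toy = pd.DataFrame({"NER": [71.0], "NER Best Prompt": [74.0], "NER Best Prompt Id": [2]})
    renamed = toy.rename(columns={"NER": "Combined Performance",
                                  "NER Best Prompt": "Best Prompt",
                                  "NER Best Prompt Id": "Best Prompt Id"})
    return list(renamed.columns)   # -> ["Combined Performance", "Best Prompt", "Best Prompt Id"]
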
def download_snapshot(repo, local_dir):
    """Try to download a snapshot from the Hugging Face Hub; restart the Space on failure."""
    try:
        print(f"Downloading from {repo} to {local_dir}...")
        snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)
    except Exception as e:
        print(f"Error downloading {repo}: {e}")
        restart_space()

download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)

LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

theoretical_max_combined_perf = mean_of_max_per_field(LEADERBOARD_DF)

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(
        """
        <div style="display: flex; align-items: center; position: relative; width: 100%; height: 60px; padding: 10px 0;">
            <h1 style="
                margin: 0 auto;
                font-weight: 900;
                font-size: 5.5em;
                letter-spacing: 2px;
                text-transform: uppercase;
                color: red;
                background: linear-gradient(90deg, #1f77b4, #00c6ff);
                -webkit-background-clip: text;
                -webkit-text-fill-color: transparent;
                text-shadow: 2px 2px 8px rgba(0, 0, 0, 0.2);
            ">
                ECREAM-LLM Leaderboard
            </h1>
        </div>
        """
    )
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart")
        gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")

    with gr.Row(elem_id="filters-wrap"):
        lang_dd = gr.Dropdown(
            choices=["All", "EN", "IT", "SL", "SK", "GR", "PL"],
            value="All", label="Language: ", scale=1
        )
        shot_dd = gr.Dropdown(
            choices=["All", "0", "10"],
            value="All", label="N-Shot: ", scale=1
        )

    with gr.Row():
        heatmap_plot = gr.Plot(value=create_prompt_heatmap(LEADERBOARD_DF, None, None), elem_id="prompt-heatmap")
        table_plot = gr.Plot(value=create_best_model_comparison_table(LEADERBOARD_DF, None, None), elem_id="best-model-table")

    def _update_both(lang, shot):
        return (
            create_prompt_heatmap(LEADERBOARD_DF, None if lang == "All" else lang, None if shot == "All" else shot),
            create_best_model_comparison_table(LEADERBOARD_DF, None if lang == "All" else lang, None if shot == "All" else shot)
        )

    lang_dd.change(_update_both, inputs=[lang_dd, shot_dd], outputs=[heatmap_plot, table_plot])
    shot_dd.change(_update_both, inputs=[lang_dd, shot_dd], outputs=[heatmap_plot, table_plot])

    with gr.Tabs(elem_classes="tab-buttons") as tabs:

        with gr.TabItem("🏅 Benchmark"):
            leaderboard = init_leaderboard(
                LEADERBOARD_DF,
                default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML-CRF", "DIA-CRF", "HIS-CRF", "NER-PHARMAER"],
                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML-CRF", "DIA-CRF", "HIS-CRF", "NER-PHARMAER"]]
            )

        with gr.TabItem("📝 About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
            with gr.TabItem(f"{metadata['icon']}{task}"):
                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
                gr.Markdown(task_description, elem_classes="markdown-text")

                leaderboard = update_task_leaderboard(
                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Prompt Std": "Prompt Std", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
                    default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'],
                    hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id']]
                )

        with gr.TabItem("─", interactive=False):
            gr.Markdown("", elem_classes="markdown-text")

        for task, metadata in TASK_METADATA_GENERATIVE.items():
            with gr.TabItem(f"{metadata['icon']}{task}"):
                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
                gr.Markdown(task_description, elem_classes="markdown-text1")
                leaderboard = update_task_leaderboard(
                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
                                                   f"{task} Prompt Std": "Prompt Std",
                                                   f"{task} Best Prompt": "Best Prompt",
                                                   f"{task} Best Prompt Id": "Best Prompt Id",
                                                   task: "Combined Performance"}),
                    default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt',
                                       'Best Prompt Id'],
                    hidden_columns=[col for col in LEADERBOARD_DF.columns if
                                    col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std',
                                                'Best Prompt', 'Best Prompt Id']]
                )

    with gr.Accordion("📙 Citation", open=False):
        gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)

    with gr.Accordion("🙏 Credits", open=False):
        gr.Markdown(
            """
            This project has been funded by the European Union under:

            Horizon Europe eCREAM Project (Grant Agreement No. 101057726)
            """
        )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch(debug=True,
                                                show_error=True)