Sfarzi commited on
Commit
b9301cc
·
1 Parent(s): 024a2c7

Add my custom leaderboard files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .ipynb_checkpoints/README-checkpoint.md +12 -0
  2. Makefile +13 -0
  3. app.py +224 -0
  4. e3c_llm_requests/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct-checkpoint.json +8 -0
  5. e3c_llm_requests/meta-llama/Llama-3.2-1B-Instruct.json +8 -0
  6. e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3-1.2-1B-Instruct_5_it-checkpoint.json +39 -0
  7. e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3-1.2-1B-Instruct_5_sl-checkpoint.json +39 -0
  8. e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct_5-checkpoint.json +24 -0
  9. e3c_llm_results/meta-llama/Llama-3-1.2-1B-Instruct_5_it.json +39 -0
  10. e3c_llm_results/meta-llama/Llama-3-1.2-1B-Instruct_5_sl.json +39 -0
  11. e3c_llm_results/meta-llama/Llama-3.2-1B-Instruct_5.json +39 -0
  12. example_app.py +324 -0
  13. example_app2.py +216 -0
  14. get_model_info.py +129 -0
  15. lb_e3c.zip +0 -0
  16. preprocess_models_output.py +250 -0
  17. preprocess_models_output_old.py +201 -0
  18. pyproject.toml +13 -0
  19. requirements.txt +16 -0
  20. run_instructions.txt +46 -0
  21. src/.ipynb_checkpoints/about-checkpoint.py +188 -0
  22. src/.ipynb_checkpoints/envs-checkpoint.py +30 -0
  23. src/.ipynb_checkpoints/populate-checkpoint.py +58 -0
  24. src/.ipynb_checkpoints/tasks-checkpoint.py +183 -0
  25. src/__pycache__/about.cpython-310.pyc +0 -0
  26. src/__pycache__/about.cpython-312.pyc +0 -0
  27. src/__pycache__/envs.cpython-310.pyc +0 -0
  28. src/__pycache__/populate.cpython-310.pyc +0 -0
  29. src/__pycache__/tasks.cpython-310.pyc +0 -0
  30. src/__pycache__/tasks.cpython-312.pyc +0 -0
  31. src/about.py +198 -0
  32. src/display/.ipynb_checkpoints/utils-checkpoint.py +160 -0
  33. src/display/__pycache__/css_html_js.cpython-310.pyc +0 -0
  34. src/display/__pycache__/css_html_js.cpython-312.pyc +0 -0
  35. src/display/__pycache__/formatting.cpython-310.pyc +0 -0
  36. src/display/__pycache__/utils.cpython-310.pyc +0 -0
  37. src/display/__pycache__/utils.cpython-312.pyc +0 -0
  38. src/display/css_html_js.py +106 -0
  39. src/display/formatting.py +27 -0
  40. src/display/utils.py +166 -0
  41. src/envs.py +36 -0
  42. src/leaderboard/.ipynb_checkpoints/read_evals-checkpoint.py +214 -0
  43. src/leaderboard/__pycache__/read_evals.cpython-310.pyc +0 -0
  44. src/leaderboard/read_evals.py +257 -0
  45. src/populate.py +62 -0
  46. src/submission/__pycache__/check_validity.cpython-310.pyc +0 -0
  47. src/submission/__pycache__/submit.cpython-310.pyc +0 -0
  48. src/submission/check_validity.py +99 -0
  49. src/submission/submit.py +119 -0
  50. src/tasks.py +183 -0
.ipynb_checkpoints/README-checkpoint.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: MediLingua Leaderboard
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.45.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
Makefile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: style format
2
+
3
+
4
+ style:
5
+ python -m black --line-length 119 .
6
+ python -m isort .
7
+ ruff check --fix .
8
+
9
+
10
+ quality:
11
+ python -m black --check --line-length 119 .
12
+ python -m isort --check-only .
13
+ ruff check .
app.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+ from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
7
+ from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
8
+ from src.display.css_html_js import custom_css
9
+ from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision
10
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
11
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
12
+ from src.submission.submit import add_new_eval
13
+ import random
14
+
15
+ # Define task metadata (icons, names, descriptions)
16
+ TASK_METADATA_MULTIPLECHOICE = {
17
+ # "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
18
+ # "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
19
+ # "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
20
+ # "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
21
+ # "WIC": {"icon": "🔤", "name": "Word in Context", "tooltip": ""},
22
+ # "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""}
23
+ }
24
+
25
+ # Define task metadata (icons, names, descriptions)
26
+ TASK_METADATA_GENERATIVE = {
27
+ # "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
28
+ # "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
29
+ "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
30
+ "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
31
+ }
32
+
33
+ def restart_space():
34
+ """Restart the Hugging Face space."""
35
+ API.restart_space(repo_id=REPO_ID)
36
+
37
+
38
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
39
+ """
40
+ Initialize and return the leaderboard when it is first loaded or when 'benchmark' is selected.
41
+ The table is sorted based on the "Avg. Combined Performance" field.
42
+ """
43
+ if dataframe is None or dataframe.empty:
44
+ raise ValueError("Leaderboard DataFrame is empty or None.")
45
+
46
+ field_list = fields(AutoEvalColumn)
47
+
48
+ return Leaderboard(
49
+ value=dataframe,
50
+ datatype=[c.type for c in field_list],
51
+ #select_columns=SelectColumns(
52
+ # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
53
+ # cant_deselect=[c.name for c in field_list if c.never_hidden],
54
+ # label="Select Columns to Display:",
55
+ #),
56
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
57
+ hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
58
+ filter_columns=[
59
+ ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
60
+ ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languges "),
61
+ # ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
62
+ ],
63
+ #filter_columns=[
64
+ # ColumnFilter("IS_FS", type="checkbox", default=False, label="5-Few-Shot")
65
+ # #ColumnFilter("FS", type="dropdown", label="5-Few-Shot")
66
+ #],
67
+ bool_checkboxgroup_label="Evaluation Mode",
68
+ interactive=False,
69
+ )
70
+
71
+ def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
72
+ """
73
+ Update and return the leaderboard when a specific task is selected.
74
+ The table is sorted based on the "Combined Performance" field.
75
+ """
76
+ if dataframe is None or dataframe.empty:
77
+ raise ValueError("Leaderboard DataFrame is empty or None.")
78
+ print ("-----------")
79
+ print(dataframe)
80
+ print("columns : ", dataframe.columns)
81
+ print ("-----------")
82
+
83
+ #sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
84
+ sorted_dataframe = dataframe.sort_values(by="Avg. Combined Performance ⬆️", ascending=False)
85
+
86
+ #print(sorted_dataframe['Combined Performance'])
87
+
88
+ field_list = fields(AutoEvalColumn)
89
+
90
+ return Leaderboard(
91
+ value=sorted_dataframe,
92
+ datatype=[c.type for c in field_list],
93
+ #select_columns=SelectColumns(
94
+ # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
95
+ # cant_deselect=[c.name for c in field_list if c.never_hidden],
96
+ # label="Select Columns to Display:",
97
+ #),
98
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
99
+ hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
100
+ filter_columns=[
101
+ ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
102
+ ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languges "),
103
+ ],
104
+ bool_checkboxgroup_label="Evaluation Mode",
105
+ interactive=False
106
+ )
107
+
108
+ '''
109
+ # Helper function for leaderboard initialization
110
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
111
+ """Initialize and return a leaderboard."""
112
+ if dataframe is None or dataframe.empty:
113
+ raise ValueError("Leaderboard DataFrame is empty or None.")
114
+
115
+ return Leaderboard(
116
+ value=dataframe,
117
+ datatype=[c.type for c in fields(AutoEvalColumn)],
118
+ select_columns=SelectColumns(
119
+ default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
120
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
121
+ label="Select Columns to Display:",
122
+ ),
123
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
124
+ hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
125
+ filter_columns=[
126
+ ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
127
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
128
+ ],
129
+ bool_checkboxgroup_label="Hide models",
130
+ interactive=False,
131
+ )
132
+ '''
133
+
134
+ def download_snapshot(repo, local_dir):
135
+ """Try to download a snapshot from Hugging Face Hub."""
136
+ try:
137
+ print(f"Downloading from {repo} to {local_dir}...")
138
+ snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)
139
+ except Exception as e:
140
+ print(f"Error downloading {repo}: {e}")
141
+ restart_space()
142
+
143
+
144
+ # Initialize the app by downloading snapshots
145
+ #download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
146
+ #download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
147
+
148
+ # Load leaderboard data
149
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
150
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
151
+
152
+ # Prepare the main interface
153
+ demo = gr.Blocks(css=custom_css)
154
+ with demo:
155
+ gr.HTML(TITLE)
156
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
157
+
158
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
159
+
160
+ # Main leaderboard tab
161
+ with gr.TabItem("🏅 Benchmark"):
162
+
163
+ leaderboard = init_leaderboard(
164
+ LEADERBOARD_DF,
165
+ default_selection=['LANG','FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
166
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['LANG','FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
167
+ )
168
+
169
+ # About tab
170
+ with gr.TabItem("📝 About"):
171
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
172
+
173
+ # About tab
174
+ with gr.TabItem("║", interactive=False):
175
+ gr.Markdown("", elem_classes="markdown-text")
176
+
177
+ # Task-specific leaderboards
178
+ for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
179
+
180
+ with gr.TabItem(f"{metadata['icon']}{task}"):
181
+
182
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
183
+ gr.Markdown(task_description, elem_classes="markdown-text")
184
+
185
+ leaderboard = update_task_leaderboard(
186
+ LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
187
+ default_selection=['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
188
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
189
+ )
190
+
191
+ # About tab
192
+ with gr.TabItem("│", interactive=False):
193
+ gr.Markdown("", elem_classes="markdown-text")
194
+
195
+ # Task-specific leaderboards
196
+ for task, metadata in TASK_METADATA_GENERATIVE.items():
197
+ with gr.TabItem(f"{metadata['icon']}{task}"):
198
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
199
+ gr.Markdown(task_description, elem_classes="markdown-text")
200
+
201
+ leaderboard = update_task_leaderboard(
202
+ LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
203
+ f"{task} Best Prompt": "Best Prompt",
204
+ f"{task} Best Prompt Id": "Best Prompt Id",
205
+ task: "Combined Performance"}),
206
+ default_selection=['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
207
+ 'Best Prompt Id'],
208
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
209
+ col not in ['LANG','FS', 'Model', 'Combined Performance', 'Prompt Average',
210
+ 'Best Prompt', 'Best Prompt Id']]
211
+ )
212
+
213
+ # Citation section
214
+ with gr.Accordion("📙 Citation", open=False):
215
+ gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
216
+
217
+ # Background job to restart space
218
+ scheduler = BackgroundScheduler()
219
+ scheduler.add_job(restart_space, "interval", seconds=1800)
220
+ scheduler.start()
221
+
222
+ # Launch the app with concurrent queueing
223
+ demo.queue(default_concurrency_limit=40).launch(debug=True, # Enable Gradio debug mode
224
+ show_error=True)
e3c_llm_requests/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct-checkpoint.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "meta-llama/Llama-3.2-1B-Instruct",
3
+ "base_model": "LlamaForCausalLM",
4
+ "revision": "9213176726f574b556790deb65791e0c5aa438b6",
5
+ "submitted_time": "2024-09-18 15:12:47+00:00",
6
+ "num_params_billion": 1.2358144,
7
+ "language": "en_de_fr_it_pt_hi_es_th"
8
+ }
e3c_llm_requests/meta-llama/Llama-3.2-1B-Instruct.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "meta-llama/Llama-3.2-1B-Instruct",
3
+ "base_model": "LlamaForCausalLM",
4
+ "revision": "9213176726f574b556790deb65791e0c5aa438b6",
5
+ "submitted_time": "2024-09-18 15:12:47+00:00",
6
+ "num_params_billion": 1.2358144,
7
+ "language": "en_de_fr_it_pt_hi_es_th"
8
+ }
e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3-1.2-1B-Instruct_5_it-checkpoint.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 12.479999999999999,
3
+ "config": {
4
+ "model_name": "meta-llama/Llama-3.2-1B-Instruct",
5
+ "num_fewshot": "5",
6
+ "LANG":"EN",
7
+ "batch_size": 8
8
+ },
9
+ "tasks": {
10
+ "RE": {
11
+ "prompts": [
12
+ {
13
+ "prompt": "prom_1",
14
+ "metric": "f1",
15
+ "value": 12.479999999999999,
16
+ "stderr": null
17
+ }
18
+ ],
19
+ "average_accuracy": 12.479999999999999,
20
+ "best_prompt": 12.479999999999999,
21
+ "prompt_id": "prom_1",
22
+ "CPS": 12.479999999999999
23
+ },
24
+ "NER": {
25
+ "prompts": [
26
+ {
27
+ "prompt": "prom_1",
28
+ "metric": "f1",
29
+ "value": 20,
30
+ "stderr": null
31
+ }
32
+ ],
33
+ "average_accuracy": 20,
34
+ "best_prompt": 20,
35
+ "prompt_id": "prom_3",
36
+ "CPS": 20
37
+ }
38
+ }
39
+ }
e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3-1.2-1B-Instruct_5_sl-checkpoint.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 5,
3
+ "config": {
4
+ "model_name": "meta-llama/Llama-3.2-1B-Instruct",
5
+ "num_fewshot": "5",
6
+ "LANG":"IT",
7
+ "batch_size": 8
8
+ },
9
+ "tasks": {
10
+ "RE": {
11
+ "prompts": [
12
+ {
13
+ "prompt": "prom_1",
14
+ "metric": "f1",
15
+ "value": 5,
16
+ "stderr": null
17
+ }
18
+ ],
19
+ "average_accuracy": 5,
20
+ "best_prompt": 5,
21
+ "prompt_id": "prom_1",
22
+ "CPS": 5
23
+ },
24
+ "NER": {
25
+ "prompts": [
26
+ {
27
+ "prompt": "prom_1",
28
+ "metric": "f1",
29
+ "value": 25,
30
+ "stderr": null
31
+ }
32
+ ],
33
+ "average_accuracy": 25,
34
+ "best_prompt": 25,
35
+ "prompt_id": "prom_3",
36
+ "CPS": 25
37
+ }
38
+ }
39
+ }
e3c_llm_results/meta-llama/.ipynb_checkpoints/Llama-3.2-1B-Instruct_5-checkpoint.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 12.479999999999999,
3
+ "config": {
4
+ "model_name": "meta-llama/Llama-3.2-1B-Instruct",
5
+ "num_fewshot": "5",
6
+ "batch_size": 8
7
+ },
8
+ "tasks": {
9
+ "RE": {
10
+ "prompts": [
11
+ {
12
+ "prompt": "prom_1",
13
+ "metric": "f1",
14
+ "value": 12.479999999999999,
15
+ "stderr": null
16
+ }
17
+ ],
18
+ "average_accuracy": 12.479999999999999,
19
+ "best_prompt": 12.479999999999999,
20
+ "prompt_id": "prom_1",
21
+ "CPS": 12.479999999999999
22
+ }
23
+ }
24
+ }
e3c_llm_results/meta-llama/Llama-3-1.2-1B-Instruct_5_it.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 5,
3
+ "config": {
4
+ "model_name": "meta-llama/Llama-3.2-1B-Instruct",
5
+ "num_fewshot": "5",
6
+ "LANG":"IT",
7
+ "batch_size": 8
8
+ },
9
+ "tasks": {
10
+ "RE": {
11
+ "prompts": [
12
+ {
13
+ "prompt": "prom_1",
14
+ "metric": "f1",
15
+ "value": 5,
16
+ "stderr": null
17
+ }
18
+ ],
19
+ "average_accuracy": 5,
20
+ "best_prompt": 5,
21
+ "prompt_id": "prom_1",
22
+ "CPS": 5
23
+ },
24
+ "NER": {
25
+ "prompts": [
26
+ {
27
+ "prompt": "prom_1",
28
+ "metric": "f1",
29
+ "value": 25,
30
+ "stderr": null
31
+ }
32
+ ],
33
+ "average_accuracy": 25,
34
+ "best_prompt": 25,
35
+ "prompt_id": "prom_3",
36
+ "CPS": 25
37
+ }
38
+ }
39
+ }
e3c_llm_results/meta-llama/Llama-3-1.2-1B-Instruct_5_sl.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 5,
3
+ "config": {
4
+ "model_name": "meta-llama/Llama-3.2-1B-Instruct",
5
+ "num_fewshot": "5",
6
+ "LANG":"SL",
7
+ "batch_size": 8
8
+ },
9
+ "tasks": {
10
+ "RE": {
11
+ "prompts": [
12
+ {
13
+ "prompt": "prom_1",
14
+ "metric": "f1",
15
+ "value": 8,
16
+ "stderr": null
17
+ }
18
+ ],
19
+ "average_accuracy": 8,
20
+ "best_prompt": 8,
21
+ "prompt_id": "prom_1",
22
+ "CPS": 8
23
+ },
24
+ "NER": {
25
+ "prompts": [
26
+ {
27
+ "prompt": "prom_1",
28
+ "metric": "f1",
29
+ "value": 28,
30
+ "stderr": null
31
+ }
32
+ ],
33
+ "average_accuracy": 28,
34
+ "best_prompt": 28,
35
+ "prompt_id": "prom_3",
36
+ "CPS": 28
37
+ }
38
+ }
39
+ }
e3c_llm_results/meta-llama/Llama-3.2-1B-Instruct_5.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "average_CPS": 12.479999999999999,
3
+ "config": {
4
+ "model_name": "meta-llama/Llama-3.2-1B-Instruct",
5
+ "num_fewshot": "5",
6
+ "LANG":"EN",
7
+ "batch_size": 8
8
+ },
9
+ "tasks": {
10
+ "RE": {
11
+ "prompts": [
12
+ {
13
+ "prompt": "prom_1",
14
+ "metric": "f1",
15
+ "value": 12.479999999999999,
16
+ "stderr": null
17
+ }
18
+ ],
19
+ "average_accuracy": 12.479999999999999,
20
+ "best_prompt": 12.479999999999999,
21
+ "prompt_id": "prom_1",
22
+ "CPS": 12.479999999999999
23
+ },
24
+ "NER": {
25
+ "prompts": [
26
+ {
27
+ "prompt": "prom_1",
28
+ "metric": "f1",
29
+ "value": 20,
30
+ "stderr": null
31
+ }
32
+ ],
33
+ "average_accuracy": 20,
34
+ "best_prompt": 20,
35
+ "prompt_id": "prom_3",
36
+ "CPS": 20
37
+ }
38
+ }
39
+ }
example_app.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from src.about import (
8
+ CITATION_BUTTON_LABEL,
9
+ CITATION_BUTTON_TEXT,
10
+ EVALUATION_QUEUE_TEXT,
11
+ INTRODUCTION_TEXT,
12
+ LLM_BENCHMARKS_TEXT,
13
+ TITLE,
14
+ )
15
+
16
+ from src.tasks import (
17
+ TE_DESCRIPTION,
18
+ )
19
+
20
+ from src.display.css_html_js import custom_css
21
+ from src.display.utils import (
22
+ BENCHMARK_COLS,
23
+ COLS,
24
+ EVAL_COLS,
25
+ EVAL_TYPES,
26
+ AutoEvalColumn,
27
+ ModelType,
28
+ fields,
29
+ WeightType,
30
+ Precision
31
+ )
32
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
33
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
34
+ from src.submission.submit import add_new_eval
35
+
36
+
37
+ def restart_space():
38
+ API.restart_space(repo_id=REPO_ID)
39
+
40
+ ### Space initialisation
41
+ try:
42
+ print(EVAL_REQUESTS_PATH)
43
+ snapshot_download(
44
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
45
+ )
46
+ except Exception:
47
+ restart_space()
48
+ try:
49
+ print(EVAL_RESULTS_PATH)
50
+ snapshot_download(
51
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
52
+ )
53
+ except Exception:
54
+ restart_space()
55
+
56
+
57
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
58
+
59
+ (
60
+ finished_eval_queue_df,
61
+ running_eval_queue_df,
62
+ pending_eval_queue_df,
63
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
64
+
65
+ def init_leaderboard(dataframe):
66
+ print(dataframe)
67
+ if dataframe is None or dataframe.empty:
68
+ raise ValueError("Leaderboard DataFrame is empty or None.")
69
+ return Leaderboard(
70
+ value=dataframe,
71
+ datatype=[c.type for c in fields(AutoEvalColumn)],
72
+ select_columns=SelectColumns(
73
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
74
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
75
+ label="Select Columns to Display:",
76
+ ),
77
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
78
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
79
+ filter_columns=[
80
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
81
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
82
+ ColumnFilter(
83
+ AutoEvalColumn.params.name,
84
+ type="slider",
85
+ min=0.01,
86
+ max=150,
87
+ label="Select the number of parameters (B)",
88
+ ),
89
+ ColumnFilter(
90
+ AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
91
+ ),
92
+ ],
93
+ bool_checkboxgroup_label="Hide models",
94
+ interactive=False,
95
+ )
96
+
97
+
98
+ def init_leaderboard2(dataframe, default_selection=None, hidden_columns=None):
99
+
100
+ print("entrato===============================================")
101
+
102
+ if dataframe is None or dataframe.empty:
103
+ raise ValueError("Leaderboard DataFrame is empty or None.")
104
+ return Leaderboard(
105
+ value=dataframe,
106
+ datatype=[c.type for c in fields(AutoEvalColumn)],
107
+ select_columns=SelectColumns(
108
+ default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
109
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
110
+ label="Select Columns to Display:",
111
+ ),
112
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
113
+ hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
114
+ filter_columns=[
115
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
116
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
117
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
118
+ ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
119
+ ],
120
+ bool_checkboxgroup_label="Hide models",
121
+ interactive=False,
122
+ )
123
+
124
+
125
+ demo = gr.Blocks(css=custom_css)
126
+ with demo:
127
+ gr.HTML(TITLE)
128
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
129
+
130
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
131
+ with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
132
+ #leaderboard = init_leaderboard(LEADERBOARD_DF)
133
+
134
+ leaderboard = init_leaderboard2(
135
+ LEADERBOARD_DF,
136
+ default_selection=['T', 'Model', "Average ⬆��", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
137
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
138
+ col not in ['T', 'Model', "Average ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL" ]]
139
+ )
140
+
141
+
142
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
143
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
144
+
145
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
146
+ with gr.Column():
147
+ with gr.Row():
148
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
149
+
150
+ with gr.Column():
151
+ with gr.Accordion(
152
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
153
+ open=False,
154
+ ):
155
+ with gr.Row():
156
+ finished_eval_table = gr.components.Dataframe(
157
+ value=finished_eval_queue_df,
158
+ headers=EVAL_COLS,
159
+ datatype=EVAL_TYPES,
160
+ row_count=5,
161
+ )
162
+ with gr.Accordion(
163
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
164
+ open=False,
165
+ ):
166
+ with gr.Row():
167
+ running_eval_table = gr.components.Dataframe(
168
+ value=running_eval_queue_df,
169
+ headers=EVAL_COLS,
170
+ datatype=EVAL_TYPES,
171
+ row_count=5,
172
+ )
173
+
174
+ with gr.Accordion(
175
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
176
+ open=False,
177
+ ):
178
+ with gr.Row():
179
+ pending_eval_table = gr.components.Dataframe(
180
+ value=pending_eval_queue_df,
181
+ headers=EVAL_COLS,
182
+ datatype=EVAL_TYPES,
183
+ row_count=5,
184
+ )
185
+ with gr.Row():
186
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
187
+
188
+ with gr.Row():
189
+ with gr.Column():
190
+ model_name_textbox = gr.Textbox(label="Model name")
191
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
192
+ model_type = gr.Dropdown(
193
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
194
+ label="Model type",
195
+ multiselect=False,
196
+ value=None,
197
+ interactive=True,
198
+ )
199
+
200
+ with gr.Column():
201
+ precision = gr.Dropdown(
202
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
203
+ label="Precision",
204
+ multiselect=False,
205
+ value="float16",
206
+ interactive=True,
207
+ )
208
+ weight_type = gr.Dropdown(
209
+ choices=[i.value.name for i in WeightType],
210
+ label="Weights type",
211
+ multiselect=False,
212
+ value="Original",
213
+ interactive=True,
214
+ )
215
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
216
+
217
+ submit_button = gr.Button("Submit Eval")
218
+ submission_result = gr.Markdown()
219
+ submit_button.click(
220
+ add_new_eval,
221
+ [
222
+ model_name_textbox,
223
+ base_model_name_textbox,
224
+ revision_name_textbox,
225
+ precision,
226
+ weight_type,
227
+ model_type,
228
+ ],
229
+ submission_result,
230
+ )
231
+
232
+
233
+ with gr.TabItem("TE", elem_id="llm-benchmark-tab-table", id=4):
234
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
235
+ #leaderboard = init_leaderboard(LEADERBOARD_DF)
236
+
237
+ LEADERBOARD_DF_TE = LEADERBOARD_DF.rename(columns={"TE Prompt Average": "Prompt Average",
238
+ "TE Best Prompt": "Best Prompt",
239
+ "TE Best Prompt Id": "Best Prompt Id",
240
+ "TE": "Combined Performance"})
241
+
242
+ leaderboard = init_leaderboard2(
243
+ LEADERBOARD_DF_TE,
244
+ default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
245
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
246
+ col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
247
+ )
248
+
249
+
250
+ with gr.TabItem("SA", elem_id="llm-benchmark-tab-table", id=5):
251
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
252
+
253
+ LEADERBOARD_DF_SA = LEADERBOARD_DF.rename(columns={"SA Prompt Average": "Prompt Average",
254
+ "SA Best Prompt": "Best Prompt",
255
+ "SA Best Prompt Id": "Best Prompt Id",
256
+ "SA": "Combined Performance"})
257
+
258
+ leaderboard = init_leaderboard2(
259
+ LEADERBOARD_DF_SA,
260
+ default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
261
+ 'Best Prompt Id'],
262
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
263
+ col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
264
+ 'Best Prompt Id']]
265
+ )
266
+
267
+
268
+
269
+
270
+ with gr.TabItem("HS", elem_id="llm-benchmark-tab-table", id=6):
271
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
272
+
273
+ LEADERBOARD_DF_HS = LEADERBOARD_DF.rename(columns={"HS Prompt Average": "Prompt Average",
274
+ "HS Best Prompt": "Best Prompt",
275
+ "HS Best Prompt Id": "Best Prompt Id",
276
+ "HS": "Combined Performance"})
277
+
278
+ leaderboard = init_leaderboard2(
279
+ LEADERBOARD_DF_HS,
280
+ default_selection=['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
281
+ 'Best Prompt Id'],
282
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if
283
+ col not in ['T', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
284
+ 'Best Prompt Id']]
285
+ )
286
+
287
+
288
+
289
+ with gr.TabItem("AT", elem_id="llm-benchmark-tab-table", id=7):
290
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
291
+
292
+ with gr.TabItem("WIC", elem_id="llm-benchmark-tab-table", id=8):
293
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
294
+
295
+ with gr.TabItem("FAQ", elem_id="llm-benchmark-tab-table", id=9):
296
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
297
+
298
+ with gr.TabItem("LS", elem_id="llm-benchmark-tab-table", id=10):
299
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
300
+
301
+ with gr.TabItem("SU", elem_id="llm-benchmark-tab-table", id=11):
302
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
303
+
304
+ with gr.TabItem("NER", elem_id="llm-benchmark-tab-table", id=12):
305
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
306
+
307
+ with gr.TabItem("REL", elem_id="llm-benchmark-tab-table", id=13):
308
+ gr.Markdown(TE_DESCRIPTION, elem_classes="markdown-text")
309
+
310
+
311
+ with gr.Row():
312
+ with gr.Accordion("📙 Citation", open=False):
313
+ citation_button = gr.Textbox(
314
+ value=CITATION_BUTTON_TEXT,
315
+ label=CITATION_BUTTON_LABEL,
316
+ lines=20,
317
+ elem_id="citation-button",
318
+ show_copy_button=True,
319
+ )
320
+
321
+ scheduler = BackgroundScheduler()
322
+ scheduler.add_job(restart_space, "interval", seconds=1800)
323
+ scheduler.start()
324
+ demo.queue(default_concurrency_limit=40).launch()
example_app2.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from src.about import (
8
+ CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT,
9
+ INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
10
+ )
11
+ from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
12
+ from src.display.css_html_js import custom_css
13
+ from src.display.utils import (
14
+ BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn,
15
+ ModelType, fields, WeightType, Precision
16
+ )
17
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
18
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
19
+ from src.submission.submit import add_new_eval
20
+
21
+
22
+
23
+
24
+ # Define the task icons and names
25
+ TASK_ICONS = {
26
+ "TE": "📊", # Textual Entailment
27
+ "SA": "😃", # Sentiment Analysis
28
+ "HS": "⚠️", # Hate Speech
29
+ "AT": "🏥", # Admission Test
30
+ "WIC": "🔤", # Word in Context
31
+ "FAQ": "❓", # Frequently Asked Questions
32
+ "LS": "🔄", # Lexical Substitution
33
+ "SU": "📝", # Summarization
34
+ "NER": "🏷️", # Named Entity Recognition
35
+ "REL": "🔗", # Relation Extraction
36
+ }
37
+
38
+ TASK_NAMES = {
39
+ "TE": "Textual Entailment",
40
+ "SA": "Sentiment Analysis",
41
+ "HS": "Hate Speech",
42
+ "AT": "Admission Test",
43
+ "WIC": "Word in Context",
44
+ "FAQ": "Frequently Asked Questions",
45
+ "LS": "Lexical Substitution",
46
+ "SU": "Summarization",
47
+ "NER": "Named Entity Recognition",
48
+ "REL": "Relation Extraction",
49
+ }
50
+
51
+
52
+ # Tooltip descriptions for each task
53
+ TASK_TOOLTIPS = {
54
+ "TE": "Identify logical relationships between two text segments.",
55
+ "SA": "Classify the sentiment (positive, negative, neutral) of a text.",
56
+ "HS": "Detect hate speech in a text.",
57
+ "AT": "Classify whether a clinical statement pertains to an admission test.",
58
+ "WIC": "Identify words in context and their meaning.",
59
+ "FAQ": "Answer frequently asked questions based on given text.",
60
+ "LS": "Identify alternative words in a given context.",
61
+ "SU": "Summarize long text into a shorter version.",
62
+ "NER": "Identify named entities (e.g., persons, locations, organizations) in text.",
63
+ "REL": "Extract and link laboratory test results to the respective tests in clinical narratives.",
64
+ }
65
+
66
+
67
+
68
+
69
+ def restart_space():
70
+ """Restart the Hugging Face space."""
71
+ API.restart_space(repo_id=REPO_ID)
72
+
73
+
74
+ def download_snapshot(repo, local_dir):
75
+ """Try to download a snapshot from the Hugging Face Hub, restarting space on failure."""
76
+ try:
77
+ print(f"Downloading from {repo} to {local_dir}...")
78
+ snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)
79
+ except Exception as e:
80
+ print(f"Error downloading {repo}: {e}")
81
+ restart_space()
82
+
83
+
84
+ # Space initialization
85
+ download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
86
+ download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)
87
+
88
+ # Load leaderboard and evaluation queue data
89
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
90
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
91
+
92
+
93
+ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
94
+ """Initialize a leaderboard with specific columns."""
95
+ if dataframe is None or dataframe.empty:
96
+ raise ValueError("Leaderboard DataFrame is empty or None.")
97
+
98
+ return Leaderboard(
99
+ value=dataframe,
100
+ datatype=[c.type for c in fields(AutoEvalColumn)],
101
+ select_columns=SelectColumns(
102
+ default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
103
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
104
+ label="Select Columns to Display:",
105
+ ),
106
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
107
+ hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
108
+ filter_columns=[
109
+ #ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
110
+ ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="Few-Shot Learning (FS)"),
111
+ #ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
112
+ ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
113
+ #ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
114
+ ],
115
+ bool_checkboxgroup_label="Hide models",
116
+ interactive=False,
117
+ )
118
+
119
+
120
+ def prepare_leaderboard_df(df, task_prefix):
121
+ """Rename columns for a specific task to a standard format."""
122
+ return df.rename(columns={
123
+ f"{task_prefix} Prompt Average": "Prompt Average",
124
+ f"{task_prefix} Best Prompt": "Best Prompt",
125
+ f"{task_prefix} Best Prompt Id": "Best Prompt Id",
126
+ task_prefix: "Combined Performance"
127
+ })
128
+
129
+
130
+ demo = gr.Blocks(css=custom_css)
131
+ with demo:
132
+ gr.HTML(TITLE)
133
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
134
+
135
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
136
+ # Main leaderboard tab
137
+ with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table"):
138
+ leaderboard = init_leaderboard(
139
+ LEADERBOARD_DF,
140
+ default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
141
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in
142
+ ['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
143
+ )
144
+
145
+ # About tab
146
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table"):
147
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
148
+
149
+ '''
150
+ # Submission tab
151
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table"):
152
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
153
+
154
+ for queue_name, queue_df in [
155
+ ("✅ Finished Evaluations", finished_eval_queue_df),
156
+ ("🔄 Running Evaluation Queue", running_eval_queue_df),
157
+ ("⏳ Pending Evaluation Queue", pending_eval_queue_df)
158
+ ]:
159
+ with gr.Accordion(f"{queue_name} ({len(queue_df)})", open=False):
160
+ gr.components.Dataframe(value=queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)
161
+
162
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
163
+ with gr.Row():
164
+ model_name_textbox = gr.Textbox(label="Model name")
165
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
166
+ model_type = gr.Dropdown(choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
167
+ label="Model type", multiselect=False, interactive=True)
168
+ precision = gr.Dropdown(choices=[i.value.name for i in Precision if i != Precision.Unknown],
169
+ label="Precision", multiselect=False, value="float16", interactive=True)
170
+ weight_type = gr.Dropdown(choices=[i.value.name for i in WeightType],
171
+ label="Weights type", multiselect=False, value="Original", interactive=True)
172
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
173
+
174
+ submit_button = gr.Button("Submit Eval")
175
+ submission_result = gr.Markdown()
176
+ submit_button.click(
177
+ add_new_eval,
178
+ [model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type],
179
+ submission_result,
180
+ )
181
+ '''
182
+
183
+ # Task-specific leaderboards
184
+ for task in ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]:
185
+
186
+ with gr.TabItem(f"{TASK_ICONS[task]}{task}", elem_id="llm-benchmark-tab-table"):
187
+
188
+ task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
189
+
190
+
191
+
192
+
193
+ gr.Markdown(task_description, elem_classes="markdown-text")
194
+
195
+
196
+ gr.Markdown(MEASURE_DESCRIPTION, elem_classes="markdown-text")
197
+
198
+
199
+
200
+ leaderboard = init_leaderboard(
201
+ prepare_leaderboard_df(LEADERBOARD_DF, task),
202
+ default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
203
+ hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in
204
+ ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
205
+ )
206
+
207
+ # Citation section
208
+ with gr.Accordion("📙 Citation", open=False):
209
+ gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
210
+
211
+ # Background job to restart space
212
+ scheduler = BackgroundScheduler()
213
+ scheduler.add_job(restart_space, "interval", seconds=1800)
214
+ scheduler.start()
215
+
216
+ demo.queue(default_concurrency_limit=40).launch()
get_model_info.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MODEL METADATA EXTRACTOR
3
+
4
+ This script processes model evaluation output files (input_folder) from the lm-eval-harness library,
5
+ extracts model identifiers, retrieves detailed metadata from HuggingFace
6
+ and saves the information as structured JSON files (output_folder).
7
+
8
+ Input: Directory containing .out files from lm-eval-harness
9
+ Output: Directory with JSON files containing model metadata
10
+ """
11
+
12
+ # Example input file format (lm-eval-harness output):
13
+ '''
14
+ hf (pretrained=swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA,trust_remote_code=True), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: 1
15
+ | Tasks |Version|Filter|n-shot| Metric | |Value | |Stderr|
16
+ |------------------------|------:|------|-----:|--------|---|-----:|---|------|
17
+ |evalita-mp | 1|none | |acc |↑ |0.5605|± |0.0052|
18
+ ...
19
+ Job completed
20
+ '''
21
+
22
+ # Example output JSON format:
23
+ '''
24
+ {
25
+ "model": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA",
26
+ "base_model": "LlamaForCausalLM",
27
+ "revision": "2b6e46e4c9d341dc8bf8350a167492c880116b66",
28
+ "submitted_time": "2024-04-29 09:34:12+00:00",
29
+ "num_params_billion": 8.030261248,
30
+ "language": "en_it"
31
+ }
32
+ '''
33
+
34
+ import os
35
+ import re
36
+ import json
37
+ from huggingface_hub import HfApi
38
+
39
+ # Configures the Hugging Face token (if needed)
40
+ # TOKEN = "YOUR_HUGGINGFACE_API_TOKEN"
41
+ api = HfApi()
42
+
43
+ # Directory paths
44
+ # input_folder: Directory containing the output files of the lm-eval-harness library, including model accuracy metrics.
45
+ #input_folder = "../evalita_llm_models_output/"
46
+ input_folder = "/home/sfarzi/leaderboard/evalita_llm_leaderboard/task_result/"
47
+ # output_folder: Directory where JSON files with model characteristics will be saved.
48
+ output_folder = "/home/sfarzi/leaderboard/evalita_llm_leaderboard/e3c_llm_requests/"
49
+
50
+ # Creates the output folder if it doesn't exist
51
+ os.makedirs(output_folder, exist_ok=True)
52
+
53
+ # Regular expression to find the model name
54
+ model_pattern = re.compile(r"pretrained=([\w\-./]+)")
55
+
56
+ # Scans files in the input folder
57
+ for filename in os.listdir(input_folder):
58
+ if filename.endswith('.out'):
59
+ file_path = os.path.join(input_folder, filename)
60
+
61
+ # Reads the file content
62
+ with open(file_path, "r", encoding="utf-8") as f:
63
+ content = f.read()
64
+
65
+ # Extracts the model name
66
+ match = model_pattern.search(content)
67
+ if match:
68
+ model_name = match.group(1)
69
+ print(f"Processing model: {model_name}")
70
+
71
+ try:
72
+ # Retrieves model information from HuggingFace
73
+ model_info = api.model_info(model_name)
74
+
75
+ # Calculates the number of parameters in billions, if available
76
+ num_params = None
77
+ if model_info.safetensors and "BF16" in model_info.safetensors.parameters:
78
+ num_params = model_info.safetensors.parameters["BF16"] / 1e9 # Convert to billions
79
+
80
+ # Extracts and concatenates languages
81
+ language = "_".join(model_info.card_data.get("language", [])) if model_info.card_data else ""
82
+
83
+ #print(model_info)
84
+
85
+ # Builds the dictionary with required metadata
86
+ model_data = {
87
+ "model": model_name,
88
+ "base_model": model_info.config.get("architectures", [""])[0] if model_info.config else "",
89
+ "revision": model_info.sha,
90
+ # "precision": "bfloat16", # If available, replace with real value
91
+ # "weight_type": "Original",
92
+ # "status": "FINISHED",
93
+ "submitted_time": str(model_info.created_at),
94
+ # "model_type": "pretrained",
95
+ # "likes": model_info.likes,
96
+ # "params": model_info.safetensors_size_in_bytes / 1e9 if model_info.safetensors_size_in_bytes else None,
97
+ # "license": model_info.license,
98
+ # "private": model_info.private,
99
+ "num_params_billion": num_params, # Number of parameters in billions
100
+ "language": language, # Extracted language
101
+ }
102
+
103
+ # Separates the model_name into two parts: directory name and file name
104
+ if "/" in model_name:
105
+ dir_name, file_name = model_name.split("/", 1)
106
+ else:
107
+ dir_name, file_name = model_name, model_name # If no "/", use the same name
108
+
109
+ # Creates the folder for saving the produced json files
110
+ model_output_folder = os.path.join(output_folder, dir_name)
111
+ os.makedirs(model_output_folder, exist_ok=True)
112
+
113
+ # Saves the JSON file in the appropriate folder
114
+ output_file = os.path.join(model_output_folder, f"{file_name}.json")
115
+
116
+ # Check if the file already exists
117
+ if os.path.exists(output_file):
118
+ print(f"File {output_file} already exists. Skipping...")
119
+ continue
120
+
121
+ with open(output_file, "w", encoding="utf-8") as f:
122
+ json.dump(model_data, f, indent=4)
123
+
124
+ print(f"Saved metadata for {model_name} in {output_file}")
125
+
126
+ except Exception as e:
127
+ print(f"Error retrieving info for {model_name}: {e}")
128
+
129
+ print("Process finished!")
lb_e3c.zip ADDED
Binary file (1.04 kB). View file
 
preprocess_models_output.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ EVALITA LLM EVALUATION PROCESSOR
3
+
4
+ Transforms raw model evaluation outputs into structured performance reports for leaderboard integration.
5
+
6
+ DATA PIPELINE OVERVIEW:
7
+
8
+ 1. Inputs:
9
+ - Evaluation Results: Raw .out files from lm-eval-harness
10
+ - Model Metadata: Pre-collected .json files from HuggingFace
11
+
12
+ 2. Output:
13
+ - Comprehensive evaluation reports in JSON format
14
+ - Ready for ingestion into the evaluation leaderboard
15
+
16
+ --------------------------------------------------------------------
17
+ INPUT SPECIFICATION
18
+
19
+ Evaluation Results (.out format):
20
+ hf (pretrained=model-org/model-name), num_fewshot: 5, batch_size: 1
21
+ | Task | Metric | Value | Stderr |
22
+ |---------------|--------|--------|--------|
23
+ | main-task | acc | 0.5605 | 0.0052 |
24
+ | - sub-task | acc | 0.4640 | 0.0088 |
25
+ | - prompt-1 | acc | 0.3720 | 0.0216 |
26
+
27
+ Model Metadata (.json format):
28
+ {
29
+ "model": "model-org/model-name",
30
+ "base_model": "ModelArchitecture",
31
+ "revision": "git_commit_hash",
32
+ "parameters": 8.03,
33
+ "language": "en_it"
34
+ }
35
+
36
+ --------------------------------------------------------------------
37
+ OUTPUT SPECIFICATION
38
+
39
+ Evaluation Report (.json format):
40
+ {
41
+ "summary_metrics": {
42
+ "average_CPS": 41.74,
43
+ "num_tasks": 12
44
+ },
45
+ "model_config": {
46
+ "identifier": "model-org/model-name",
47
+ "architecture": "ModelArchitecture",
48
+ "parameters": 8.03,
49
+ "evaluation_settings": {
50
+ "fewshot": 5,
51
+ "batch_size": 1
52
+ }
53
+ },
54
+ "task_results": {
55
+ "task-name": {
56
+ "average_score": 52.60,
57
+ "best_prompt": {
58
+ "id": "prompt-6",
59
+ "score": 66.57
60
+ },
61
+ "prompt_analysis": [
62
+ {
63
+ "prompt_id": "prompt-1",
64
+ "score": 37.20,
65
+ "stderr": 0.0216
66
+ }
67
+ ]
68
+ }
69
+ }
70
+ }
71
+ """
72
+
73
+ import json
74
+ import os
75
+ import re
76
+
77
+ def safe_float(value):
78
+ """Safely converts a value to float, returning None if the conversion fails."""
79
+ try:
80
+ return float(value)
81
+ except ValueError:
82
+ return None
83
+
84
+
85
+ def calculate_task_metrics(task_info):
86
+ """Calculates average accuracy, best prompt accuracy, and CPS for a given task."""
87
+ accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]
88
+
89
+ if not accuracies:
90
+ return None
91
+
92
+ task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
93
+ best_prompt_data = max(task_info['prompts'], key=lambda x: x['value'])
94
+ task_info['best_prompt'] = best_prompt_data['value']
95
+ task_info['prompt_id'] = best_prompt_data['prompt']
96
+
97
+ # Calculate CPS
98
+ avg_acc = task_info['average_accuracy']
99
+ best_acc = task_info['best_prompt']
100
+ task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc
101
+
102
+
103
+ def extract_data_from_file(file_path):
104
+ """Extracts task and prompt data from a specified file."""
105
+ with open(file_path, 'r') as file:
106
+ lines = file.readlines()
107
+
108
+ tasks_data = {}
109
+ current_task = None
110
+
111
+ for line in lines:
112
+ line = line.strip()
113
+
114
+ # Skips empty lines
115
+ if not line:
116
+ continue
117
+
118
+ # Skips header lines
119
+ if line.startswith("| Tasks"):
120
+ continue
121
+
122
+ # Extracts model configuration details
123
+ if line.startswith("hf (pretrained="):
124
+ start = line.find("pretrained=") + len("pretrained=")
125
+ end = line.find(",", start)
126
+ pretrained_model = line[start:end]
127
+
128
+ num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
129
+ num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
130
+
131
+ batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
132
+ batch_size = int(batch_size_match.group(1)) if batch_size_match else None
133
+
134
+ continue
135
+
136
+ columns = line.split('|')
137
+ if len(columns) != 11:
138
+ continue
139
+
140
+ task_name = columns[1]
141
+ metric = columns[5].strip()
142
+ value = safe_float(columns[7])
143
+ stderr = safe_float(columns[9])
144
+ print (value)
145
+ # Skips normalized accuracy metrics
146
+ if metric == "acc_norm":
147
+ continue
148
+
149
+ # Identifies task and prompt sections in the file
150
+ if task_name.startswith(" - "):
151
+ task_name = task_name[3:].strip()
152
+ current_task = task_name
153
+ tasks_data.setdefault(current_task,
154
+ {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None,
155
+ 'CPS': None})
156
+
157
+ elif task_name.startswith(" - ") and current_task:
158
+ prompt_name = task_name[4:].strip()
159
+ prompt_data = {'prompt': prompt_name, 'metric': metric, 'value': value * 100,
160
+ 'stderr': stderr}
161
+ tasks_data[current_task]['prompts'].append(prompt_data)
162
+
163
+ # Special handling for evalita NER task to calculate weighted prompt averages
164
+ if "evalita NER" in tasks_data:
165
+ task_info = tasks_data["evalita NER"]
166
+ weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
167
+ "WN prompt-1": 2088, "WN prompt-2": 2088}
168
+
169
+ weighted_values = {"prompt-1": 0, "prompt-2": 0}
170
+ total_weights = sum(weight_map.values())
171
+
172
+ for prompt in task_info['prompts']:
173
+ if prompt['prompt'] in weight_map:
174
+ if "prompt-1" in prompt['prompt']:
175
+ weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
176
+ elif "prompt-2" in prompt['prompt']:
177
+ weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']
178
+
179
+ task_info['prompts'] = [
180
+ {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights,
181
+ 'stderr': None},
182
+ {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
183
+ 'stderr': None}]
184
+
185
+ # Calculates task metrics for each task
186
+ for task_info in tasks_data.values():
187
+ calculate_task_metrics(task_info)
188
+
189
+ # Calculates the average CPS across all tasks
190
+ tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
191
+ average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0
192
+
193
+ config = {
194
+ "model_name": pretrained_model,
195
+ "num_fewshot": num_fewshot,
196
+ "batch_size": batch_size
197
+ }
198
+
199
+ return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
200
+
201
+
202
+ """
203
+ MAIN PROCESSING PIPELINE
204
+
205
+ This script executes the complete evaluation data processing workflow:
206
+
207
+ 1. Input Sources:
208
+ - Raw evaluation results (.out files) from: ../evalita_llm_models_output/
209
+ - Model metadata JSON files from: ../evalita_llm_requests/
210
+
211
+ 2. Processing Steps:
212
+ - Parses evaluation metrics from .out files
213
+ - Combines with model metadata
214
+ - Calculates aggregated performance statistics
215
+
216
+ 3. Output:
217
+ - Structured JSON results saved to: ../evalita_llm_results/
218
+ - Organized by model organization/name
219
+ - Contains complete evaluation results with metadata
220
+ """
221
+ directory_in_path = '/home/sfarzi/leaderboard/evalita_llm_leaderboard/task_result/'
222
+ directory_in_requests_path = '/home/sfarzi/leaderboard/evalita_llm_leaderboard/evalita_llm_requests/'
223
+ directory_out_results_path = '/home/sfarzi/leaderboard/evalita_llm_leaderboard/evalita_llm_results/'
224
+
225
+ for filename in os.listdir(directory_in_path):
226
+ if filename.endswith('.out'):
227
+ file_path = os.path.join(directory_in_path, filename)
228
+ json_output = extract_data_from_file(file_path)
229
+
230
+ model_org_name, model_name = json_output['config']['model_name'].split('/')
231
+
232
+
233
+ config_file_path = os.path.join(directory_in_requests_path, model_org_name, f"{model_name}.json")
234
+
235
+ if os.path.exists(config_file_path):
236
+ with open(config_file_path, 'r', encoding='utf-8') as config_file:
237
+ additional_config = json.load(config_file)
238
+ json_output['config'].update(additional_config)
239
+
240
+
241
+ org_folder_path = os.path.join(directory_out_results_path, model_org_name)
242
+ os.makedirs(org_folder_path, exist_ok=True)
243
+
244
+ file_suffix = f"{json_output['config']['num_fewshot']}"
245
+ output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
246
+
247
+ with open(output_file_path, 'w', newline="\n") as outfile:
248
+ json.dump(json_output, outfile, indent=4)
249
+
250
+ print(f"File {filename} processed and saved to {output_file_path}")
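For reference, the per-model JSON that this script writes has roughly the following shape. This is an illustrative sketch only: the keys mirror the code above, all values are invented, and the extra `config` fields depend on what the matching request file in `evalita_llm_requests` contains.

```python
# Illustrative shape of the <model_name>_<num_fewshot>.json file produced above.
# Values are invented; only the key names follow the code in this script.
example_output = {
    "average_CPS": 43.54,  # mean CPS over all parsed tasks (equals the single task's CPS here)
    "config": {
        "model_name": "meta-llama/Llama-3.2-1B-Instruct",
        "num_fewshot": "5",
        "batch_size": 1,
        # ...plus any fields merged in from evalita_llm_requests/<org>/<model>.json
    },
    "tasks": {
        "evalita NER": {
            "prompts": [
                {"prompt": "prompt-1", "metric": "acc", "value": 38.2, "stderr": None},
                {"prompt": "prompt-2", "metric": "acc", "value": 45.1, "stderr": None},
            ],
            "average_accuracy": 41.65,
            "best_prompt": 45.1,
            "prompt_id": "prompt-2",
            "CPS": 43.54,
        },
    },
}
```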
preprocess_models_output_old.py ADDED
@@ -0,0 +1,201 @@
1
+ import json
2
+ import os
3
+ import re
4
+
5
+ def safe_float(value):
6
+ """Convert a value to float safely. Returns None if conversion fails."""
7
+ try:
8
+ return float(value)
9
+ except ValueError:
10
+ return None
11
+
12
+
13
+ def calculate_task_metrics(task_info):
14
+ """Calculate average accuracy, best prompt, and CPS for a task."""
15
+ accuracies = [prompt['value'] for prompt in task_info['prompts'] if prompt['value'] is not None]
16
+
17
+ if not accuracies:
18
+ return None
19
+
20
+ task_info['average_accuracy'] = sum(accuracies) / len(accuracies)
21
+ best_prompt_data = max(task_info['prompts'], key=lambda x: x['value'])
22
+ task_info['best_prompt'] = best_prompt_data['value']
23
+ task_info['prompt_id'] = best_prompt_data['prompt']
24
+
25
+ # Calculate CPS
26
+ avg_acc = task_info['average_accuracy']
27
+ best_acc = task_info['best_prompt']
28
+ task_info['CPS'] = (1 - (best_acc - avg_acc) / 100) * best_acc
29
+
30
+
31
+ def extract_data_from_file(file_path):
32
+ """Extract task and prompt data from the given file."""
33
+ with open(file_path, 'r') as file:
34
+ lines = file.readlines()
35
+
36
+ tasks_data = {}
37
+ current_task = None
38
+
39
+ for line in lines:
40
+ line = line.strip()
41
+
42
+ # Skip irrelevant lines
43
+ if not line:
44
+ continue
45
+
46
+
47
+ if line.startswith("| Tasks"):
48
+ continue
49
+
50
+ if line.startswith("hf (pretrained="):
51
+
52
+ # Extract the part after "pretrained="
53
+ start = line.find("pretrained=") + len("pretrained=")
54
+ end = line.find(",", start) # Find the next comma
55
+ # Extract the desired substring
56
+ pretrained_model = line[start:end]
57
+
58
+ # Extract num_fewshot
59
+ num_fewshot_match = re.search(r"num_fewshot:\s*([\w\d]+)", line)
60
+ num_fewshot = num_fewshot_match.group(1) if num_fewshot_match else None
61
+
62
+ # Extract batch_size
63
+ batch_size_match = re.search(r"batch_size:\s*(\d+)", line)
64
+ batch_size = int(batch_size_match.group(1)) if batch_size_match else None
65
+
66
+ continue
67
+
68
+ columns = line.split('|')
69
+ if len(columns) != 11:
70
+ continue
71
+
72
+ task_name = columns[1]
73
+ metric = columns[5].strip()
74
+ value = safe_float(columns[7])
75
+ stderr = safe_float(columns[9])
76
+
77
+ if metric == "acc_norm":
78
+ continue
79
+
80
+ # Identify task and prompts
81
+ if task_name.startswith(" - "):
82
+ task_name = task_name[3:].strip()
83
+ current_task = task_name
84
+ tasks_data.setdefault(current_task,
85
+ {'prompts': [], 'average_accuracy': 0, 'best_prompt': None, 'prompt_id': None,
86
+ 'CPS': None})
87
+
88
+ elif task_name.startswith("  - ") and current_task:
89
+ prompt_name = task_name[4:].strip()
90
+ prompt_data = {'prompt': prompt_name, 'metric': metric, 'value': value * 100,
91
+ 'stderr': stderr}
92
+ tasks_data[current_task]['prompts'].append(prompt_data)
93
+
94
+ # Special handling for evalita NER
95
+ if "evalita NER" in tasks_data:
96
+ task_info = tasks_data["evalita NER"]
97
+ weight_map = {"ADG prompt-1": 521, "ADG prompt-2": 521, "FIC prompt-1": 1517, "FIC prompt-2": 1517,
98
+ "WN prompt-1": 2088, "WN prompt-2": 2088}
99
+
100
+ weighted_values = {"prompt-1": 0, "prompt-2": 0}
101
+ total_weights = sum(weight_map.values())
102
+
103
+ for prompt in task_info['prompts']:
104
+ if prompt['prompt'] in weight_map:
105
+ if "prompt-1" in prompt['prompt']:
106
+ weighted_values["prompt-1"] += weight_map[prompt['prompt']] * prompt['value']
107
+ elif "prompt-2" in prompt['prompt']:
108
+ weighted_values["prompt-2"] += weight_map[prompt['prompt']] * prompt['value']
109
+
110
+ task_info['prompts'] = [
111
+ {"prompt": "prompt-1", "metric": "acc", "value": weighted_values["prompt-1"] / total_weights,
112
+ 'stderr': None},
113
+ {"prompt": "prompt-2", "metric": "acc", "value": weighted_values["prompt-2"] / total_weights,
114
+ 'stderr': None}]
115
+
116
+ # Calculate metrics for each task
117
+ for task_info in tasks_data.values():
118
+ calculate_task_metrics(task_info)
119
+
120
+ # Calculate average CPS
121
+ tasks_with_cps = [task['CPS'] for task in tasks_data.values() if task['CPS'] is not None]
122
+ average_CPS = sum(tasks_with_cps) / len(tasks_with_cps) if tasks_with_cps else 0
123
+
124
+ config = {
125
+ "model_name": pretrained_model,
126
+ "num_fewshot": num_fewshot,
127
+ "batch_size": batch_size
128
+ }
129
+
130
+ return {'average_CPS': average_CPS, 'config': config, 'tasks': tasks_data}
131
+
132
+
133
+ # Example usage
134
+ #file_path = '../evalita_llm_results/models_output/slurm-7769.out'
135
+ #json_output = extract_data_from_file(file_path)
136
+ #print(json_output)
137
+
138
+
139
+ # Directory from which to read the .out files
140
+ directory_in_path = '../evalita_llm_models_output/'
141
+ directory_out_results_path = '../evalita_llm_results/'
142
+ directory_out_requests_path = '../evalita_llm_requests/'
143
+
144
+ # Iterate over the files in the directory
145
+ for filename in os.listdir(directory_in_path):
146
+ if filename.endswith('.out'):
147
+ # Build the full path of the file
148
+ file_path = os.path.join(directory_in_path, filename)
149
+
150
+ # Run the extract_data_from_file function
151
+ json_output = extract_data_from_file(file_path)
152
+
153
+ # Extract model_org_name and model_name from model_name
154
+ model_org_name, model_name = json_output['config']['model_name'].split('/')
155
+
156
+
157
+
158
+
159
+
160
+
161
+ # Path of the configuration JSON file in ../evalita_llm_requests/
162
+ config_file_path = os.path.join(directory_out_requests_path, model_org_name, f"{model_name}.json")
163
+
164
+ # If the file exists, load it and update the config dictionary
165
+ if os.path.exists(config_file_path):
166
+ with open(config_file_path, 'r', encoding='utf-8') as config_file:
167
+ additional_config = json.load(config_file)
168
+
169
+ # Update the configuration with the new data
170
+ json_output['config'].update(additional_config)
171
+
172
+
173
+
174
+
175
+ # Create the folder path for model_org_name
176
+ org_folder_path = os.path.join(directory_out_results_path, model_org_name)
177
+ os.makedirs(org_folder_path, exist_ok=True) # Create the folder if it does not exist
178
+
179
+ # Build the full path of the output JSON file
180
+ file_suffix = f"{json_output['config']['num_fewshot']}"
181
+ output_file_path = os.path.join(org_folder_path, f"{model_name}_{file_suffix}.json")
182
+
183
+ # Save the JSON to a file with Linux-compatible line endings
184
+ with open(output_file_path, 'w', newline="\n") as outfile:
185
+ json.dump(json_output, outfile, indent=4)
186
+
187
+ # Print the result
188
+ print(f"File {filename} processed and saved to {output_file_path}")
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+
198
+
199
+
200
+
201
+
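To make the CPS formula in `calculate_task_metrics` concrete, here is a small worked example with invented per-prompt accuracies. It shows how CPS rewards a high best-prompt score while penalizing a large gap between the best prompt and the prompt average.

```python
# Worked example of the Combined Performance Score (CPS) computed above.
# The six accuracies are invented; the formula is the one used in calculate_task_metrics.
prompt_accuracies = [62.0, 58.0, 70.0, 66.0, 64.0, 60.0]  # per-prompt accuracy, in %

average_accuracy = sum(prompt_accuracies) / len(prompt_accuracies)  # 63.33
best_prompt = max(prompt_accuracies)                                # 70.0

# CPS = (1 - (best_prompt - average_accuracy) / 100) * best_prompt
cps = (1 - (best_prompt - average_accuracy) / 100) * best_prompt

print(round(average_accuracy, 2), best_prompt, round(cps, 2))  # 63.33 70.0 65.33
```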
pyproject.toml ADDED
@@ -0,0 +1,13 @@
1
+ [tool.ruff]
2
+ # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
+ select = ["E", "F"]
4
+ ignore = ["E501"] # line too long (black is taking care of this)
5
+ line-length = 119
6
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
+
8
+ [tool.isort]
9
+ profile = "black"
10
+ line_length = 119
11
+
12
+ [tool.black]
13
+ line-length = 119
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ APScheduler
2
+ black
3
+ datasets
4
+ gradio
5
+ gradio[oauth]
6
+ gradio_leaderboard==0.0.13
7
+ gradio_client
8
+ huggingface-hub>=0.18.0
9
+ matplotlib
10
+ numpy
11
+ pandas
12
+ python-dateutil
13
+ tqdm
14
+ transformers
15
+ tokenizers>=0.15.0
16
+ sentencepiece
run_instructions.txt ADDED
@@ -0,0 +1,46 @@
1
+ Model Evaluation and Leaderboard
2
+
3
+ 1) Model Evaluation
4
+ Before a model can be integrated into the leaderboard, it must first be evaluated with the lm-eval-harness library in both zero-shot and 5-shot configurations.
5
+
6
+ This can be done with the following command:
7
+
8
+ lm_eval --model hf --model_args pretrained=google/gemma-3-12b-it \
9
+ --tasks evalita-mp --device cuda:0 --batch_size 1 --trust_remote_code \
10
+ --output_path model_output --num_fewshot 5
11
+
12
+ The output generated by the library will include the model's accuracy scores on the benchmark tasks.
13
+ This output is written to standard output and should be saved to a .txt file (e.g., slurm-8368.out), which must then be placed in the
14
+ evalita_llm_models_output LOCAL directory for further processing. Examples of such files can be found in: https://huggingface.co/datasets/evalitahf/evalita_llm_models_output/
15
+
16
+ 2) Extracting Model Metadata
17
+ To display model details on the leaderboard (e.g., organization/group, model name, and parameter count), metadata must be retrieved from Hugging Face.
18
+
19
+ This can be done by running:
20
+
21
+ python get_model_info.py
22
+
23
+ This script processes the evaluation files from Step 1 and saves each model's metadata in a JSON file within the evalita_llm_requests LOCAL directory.
24
+
25
+ 3) Generating Leaderboard Submission File
26
+ The leaderboard requires a structured file containing each model’s metadata along with its benchmark accuracy scores.
27
+
28
+ To generate this file, run:
29
+
30
+ python preprocess_models_output.py
31
+
32
+ This script combines the accuracy results from Step 1 with the metadata from Step 2 and outputs one JSON file per evaluated model in the evalita_llm_results LOCAL directory.
33
+ Examples of these files are in https://huggingface.co/datasets/evalitahf/evalita_llm_results
34
+
35
+ 4) Updating the Hugging Face Repository
36
+ Commit and push the following three directories from the local disk to Hugging Face, so that the corresponding repositories (including evalita_llm_results) are updated with the newly generated files from Step 3 (a minimal upload sketch is shown below):
37
+ evalita_llm_models_output, evalita_llm_requests and evalita_llm_results
38
+
39
+ 5) Running the Leaderboard Application
40
+ To test the leaderboard locally, run the following command in your terminal and open your browser at the indicated address:
41
+
42
+ python app.py
43
+
44
+ On Hugging Face, the leaderboard can be started or stopped directly from the graphical interface, so running this command is only necessary when working locally.
45
+
46
+
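Step 4 above only says to commit and push the three directories. One way to do this programmatically is sketched below with `huggingface_hub`; the repo ids follow the dataset names used in these instructions and in `src/envs.py`, while the local paths and token handling are assumptions to adapt to your own setup.

```python
# Minimal sketch for Step 4: push the three LOCAL directories to their
# Hugging Face dataset repositories. Adjust repo ids and local paths as needed.
import os

from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])  # a token with write access

folders_to_repos = {
    "evalita_llm_models_output": "evalitahf/evalita_llm_models_output",
    "evalita_llm_requests": "evalitahf/evalita_llm_requests",
    "evalita_llm_results": "evalitahf/evalita_llm_results",
}

for local_dir, repo_id in folders_to_repos.items():
    api.upload_folder(
        folder_path=local_dir,
        repo_id=repo_id,
        repo_type="dataset",
        commit_message=f"Update {local_dir}",
    )
```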
src/.ipynb_checkpoints/about-checkpoint.py ADDED
@@ -0,0 +1,188 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+ @dataclass
5
+ class Task:
6
+ benchmark: str
7
+ metric: str
8
+ metric_type: str
9
+ col_name: str
10
+
11
+ # Select your tasks here
12
+ # ---------------------------------------------------
13
+ class Tasks(Enum):
14
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+
16
+ task1 = Task("text-entailment_1", "acc", "CPS", "TE")
17
+ task2 = Task("text-entailment_2", "acc", "average_accuracy", "TE Prompt Average")
18
+ task3 = Task("text-entailment_3", "acc", "best_prompt", "TE Best Prompt")
19
+ task4 = Task("text-entailment_4", "acc", "prompt_id", "TE Best Prompt Id")
20
+
21
+ task5 = Task("sentiment-analysis_1", "acc", "CPS", "SA")
22
+ task6 = Task("sentiment-analysis_2", "acc", "average_accuracy", "SA Prompt Average")
23
+ task7 = Task("sentiment-analysis_3", "acc", "best_prompt", "SA Best Prompt")
24
+ task8 = Task("sentiment-analysis_4", "acc", "prompt_id", "SA Best Prompt Id")
25
+
26
+ task9 = Task("hate-speech-detection_1", "acc", "CPS", "HS")
27
+ task10 = Task("hate-speech-detection_2", "acc", "average_accuracy", "HS Prompt Average")
28
+ task11 = Task("hate-speech-detection_3", "acc", "best_prompt", "HS Best Prompt")
29
+ task12 = Task("hate-speech-detection_4", "acc", "prompt_id", "HS Best Prompt Id")
30
+
31
+ task13 = Task("admission-test_1", "acc", "CPS", "AT")
32
+ task14 = Task("admission-test_2", "acc", "average_accuracy", "AT Prompt Average")
33
+ task15 = Task("admission-test_3", "acc", "best_prompt", "AT Best Prompt")
34
+ task16 = Task("admission-test_4", "acc", "prompt_id", "AT Best Prompt Id")
35
+
36
+ task17 = Task("word-in-context_1", "acc", "CPS", "WIC")
37
+ task18 = Task("word-in-context_2", "acc", "average_accuracy", "WIC Prompt Average")
38
+ task19 = Task("word-in-context_3", "acc", "best_prompt", "WIC Best Prompt")
39
+ task20 = Task("word-in-context_4", "acc", "prompt_id", "WIC Best Prompt Id")
40
+
41
+ task21 = Task("faq_1", "acc", "CPS", "FAQ")
42
+ task22 = Task("faq_2", "acc", "average_accuracy", "FAQ Prompt Average")
43
+ task23 = Task("faq_3", "acc", "best_prompt", "FAQ Best Prompt")
44
+ task24 = Task("faq_4", "acc", "prompt_id", "FAQ Best Prompt Id")
45
+
46
+ task25 = Task("lexical-substitution_1", "acc", "CPS", "LS")
47
+ task26 = Task("lexical-substitution_2", "acc", "average_accuracy", "LS Prompt Average")
48
+ task27 = Task("lexical-substitution_3", "acc", "best_prompt", "LS Best Prompt")
49
+ task28 = Task("lexical-substitution_4", "acc", "prompt_id", "LS Best Prompt Id")
50
+
51
+ task29 = Task("summarization-fanpage_1", "acc", "CPS", "SU")
52
+ task30 = Task("summarization-fanpage_2", "acc", "average_accuracy", "SU Prompt Average")
53
+ task31 = Task("summarization-fanpage_3", "acc", "best_prompt", "SU Best Prompt")
54
+ task32 = Task("summarization-fanpage_4", "acc", "prompt_id", "SU Best Prompt Id")
55
+
56
+ task33 = Task("evalita NER_1", "acc", "CPS", "NER")
57
+ task34 = Task("evalita NER_2", "acc", "average_accuracy", "NER Prompt Average")
58
+ task35 = Task("evalita NER_3", "acc", "best_prompt", "NER Best Prompt")
59
+ task36 = Task("evalita NER_4", "acc", "prompt_id", "NER Best Prompt Id")
60
+
61
+ task37 = Task("relation-extraction_1", "acc", "CPS", "REL")
62
+ task38 = Task("relation-extraction_2", "acc", "average_accuracy", "REL Prompt Average")
63
+ task39 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
64
+ task40 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")
65
+
66
+ '''
67
+ task0 = Task("TextualEntailment", "acc", "Textual Entailment")
68
+ task1 = Task("TextualEntailment_best", "acc", "TextualEntailment Best")
69
+ task2 = Task("Sentiment Analysis", "acc", "Sentiment Analysis")
70
+ task3 = Task("Sentiment Analysis_best", "acc", "Sentiment Analysis_best")
71
+ task4 = Task("Hate Speech", "acc", "Hate Speech")
72
+ task5 = Task("Hate Speech_best", "acc", "Hate Speech_best")
73
+ task6 = Task("Admission Test", "acc", "Admission Test")
74
+ task7 = Task("Admission Test_best", "acc", "Admission Test_best")
75
+ task8 = Task("Word in Context", "acc", "Word in Context")
76
+ task9 = Task("Word in Context_best", "acc", "Word in Context_best")
77
+ task10 = Task("FAQ", "acc", "FAQ")
78
+ task11 = Task("FAQ_best", "acc", "FAQ_best")
79
+ task12 = Task("Lexical Substitution", "acc", "Lexical Substitution")
80
+ task13 = Task("Lexical Substitution_best", "acc", "Lexical Substitution_best")
81
+ task14 = Task("Summarization", "acc", "Summarization")
82
+ task15 = Task("Summarization_best", "acc", "Summarization_best")
83
+ task16 = Task("NER", "acc", "NER")
84
+ task17 = Task("NER_best", "acc", "NER_best")
85
+ task18 = Task("REL", "acc", "REL")
86
+ task19 = Task("REL_best", "acc", "REL_best")
87
+ '''
88
+
89
+ # Your leaderboard name
90
+ TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀</h1>"""
91
+
92
+ # What does your leaderboard evaluate?
93
+ INTRODUCTION_TEXT = """
94
+ Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) **all tasks are native Italian**, avoiding translation issues and potential cultural biases; (ii) the benchmark includes **generative** tasks, enabling more natural interaction with LLMs; (iii) **all tasks are evaluated against multiple prompts**, thereby mitigating the model's sensitivity to specific prompts and allowing a fairer evaluation.
95
+
96
+ **<small>Multiple-choice tasks:</small>** <small> 📊TE (Textual Entailment), 😃SA (Sentiment Analysis), ⚠️HS (Hate Speech Detection), 🏥AT (Admission Test), 🔤WIC (Word in Context), ❓FAQ (Frequently Asked Questions) </small><br>
97
+ **<small>Generative tasks:</small>** <small>🔄LS (Lexical Substitution), 📝SU (Summarization), 🏷️NER (Named Entity Recognition), 🔗REL (Relation Extraction) </small>
98
+ """
99
+
100
+ # Which evaluations are you running? how can people reproduce what you have?
101
+ LLM_BENCHMARKS_TEXT = f"""
102
+ ### Groups
103
+
104
+ - `evalita-mp`: All tasks (perplexity and non-perplexity based).
105
+ - `evalita-mp_gen`: Only generative tasks.
106
+ - `evalita-mp_mc`: Only multiple-choice tasks.
107
+
108
+ #### Tasks
109
+
110
+ The following Evalita-LLM tasks can also be evaluated in isolation:
111
+ - `evalita-mp_te`: Textual Entailment (TE)
112
+ - `evalita-mp_sa`: Sentiment Analysis (SA)
113
+ - `evalita-mp_wic`: Word in Context (WIC)
114
+ - `evalita-mp_hs`: Hate Speech Detection (HS)
115
+ - `evalita-mp_at`: Admission Tests (AT)
116
+ - `evalita-mp_faq`: Frequently Asked Questions & Question Answering (FAQ)
117
+ - `evalita-mp_sum_fp`: Summarization (SU)
118
+ - `evalita-mp_ls`: Lexical Substitution (LS)
119
+ - `evalita-mp_ner_group`: Named Entity Recognition (NER)
120
+ - `evalita-mp_re`: Relation Extraction (REL)
121
+
122
+
123
+ ### Usage
124
+
125
+ ```bash
126
+
127
+ lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size 1
128
+ ```
129
+
130
+ <!--
131
+ ### Checklist
132
+
133
+ * [x] Is the task an existing benchmark in the literature?
134
+ * [x] Have you referenced the original paper that introduced the task?
135
+ * [x] If yes, does the original paper provide a reference implementation?
136
+ * [x] Yes, original implementation contributed by author of the benchmark
137
+
138
+ If other tasks on this dataset are already supported:
139
+ * [x] Is the "Main" variant of this task clearly denoted?
140
+ * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
141
+ * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
142
+ -->
143
+
144
+
145
+ """
146
+
147
+ EVALUATION_QUEUE_TEXT = """
148
+ ## Some good practices before submitting a model
149
+
150
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
151
+ ```python
152
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
153
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
154
+ model = AutoModel.from_pretrained("your model name", revision=revision)
155
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
156
+ ```
157
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
158
+
159
+ Note: make sure your model is public!
160
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it; stay posted!
161
+
162
+ ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
163
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
164
+
165
+ ### 3) Make sure your model has an open license!
166
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
167
+
168
+ ### 4) Fill up your model card
169
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card
170
+
171
+ ## In case of model failure
172
+ If your model is displayed in the `FAILED` category, its execution stopped.
173
+ Make sure you have followed the above steps first.
174
+ If everything is done, check that you can launch the EleutherAI harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
175
+ """
176
+
177
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
178
+ CITATION_BUTTON_TEXT = r"""
179
+ @misc{magnini2025evalitallmbenchmarkinglargelanguage,
180
+ title={Evalita-LLM: Benchmarking Large Language Models on Italian},
181
+ author={Bernardo Magnini and Roberto Zanoli and Michele Resta and Martin Cimmino and Paolo Albano and Marco Madeddu and Viviana Patti},
182
+ year={2025},
183
+ eprint={2502.02289},
184
+ archivePrefix={arXiv},
185
+ primaryClass={cs.CL},
186
+ url={https://arxiv.org/abs/2502.02289},
187
+ }
188
+ """
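Point 2 of the submission guidelines above asks for safetensors weights. Below is a minimal conversion sketch using `transformers`; the model id and output directory are placeholders, not names used by this leaderboard.

```python
# Minimal sketch: re-save an existing checkpoint in the safetensors format.
# "your-org/your-model" and "converted-model" are placeholders.
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("your-org/your-model")
tokenizer = AutoTokenizer.from_pretrained("your-org/your-model")

# safe_serialization=True writes model.safetensors instead of pytorch_model.bin
model.save_pretrained("converted-model", safe_serialization=True)
tokenizer.save_pretrained("converted-model")
```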
src/.ipynb_checkpoints/envs-checkpoint.py ADDED
@@ -0,0 +1,30 @@
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
+
9
+ #OWNER = "giux78" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
+ OWNER = "evalitahf"
11
+ # ----------------------------------
12
+
13
+ #REPO_ID = f"{OWNER}/leaderboard-evalita"
14
+ #QUEUE_REPO = f"{OWNER}/evalita-requests"
15
+ #RESULTS_REPO = f"{OWNER}/evalita-results"
16
+
17
+ REPO_ID = f"{OWNER}/evalita_llm_leaderboard"
18
+ QUEUE_REPO = f"{OWNER}/evalita_llm_requests"
19
+ RESULTS_REPO = f"{OWNER}/evalita_llm_results"
20
+
21
+ # If you setup a cache later, just change HF_HOME
22
+ CACHE_PATH=os.getenv("HF_HOME", ".")
23
+
24
+ # Local caches
25
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
26
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
27
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
28
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
29
+
30
+ API = HfApi(token=TOKEN)
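For context, the repo ids and local cache paths defined here are typically wired together at start-up by downloading a snapshot of each dataset repo into its local directory. The sketch below is an assumption about how `app.py` consumes these constants, not code taken from this commit.

```python
# Minimal sketch (assumption): populate the local caches defined in src/envs.py
# from their dataset repos before building the leaderboard tables.
from huggingface_hub import snapshot_download

from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN

snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH,
                  repo_type="dataset", token=TOKEN)
snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH,
                  repo_type="dataset", token=TOKEN)
```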
src/.ipynb_checkpoints/populate-checkpoint.py ADDED
@@ -0,0 +1,58 @@
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+
6
+ from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
+ from src.leaderboard.read_evals import get_raw_eval_results
9
+
10
+
11
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
+ """Creates a dataframe from all the individual experiment results"""
13
+ raw_data = get_raw_eval_results(results_path, requests_path)
14
+ all_data_json = [v.to_dict() for v in raw_data]
15
+
16
+ df = pd.DataFrame.from_records(all_data_json)
17
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
+ df = df[cols].round(decimals=2)
19
+
20
+ # filter out if any of the benchmarks have not been produced
21
+ df = df[has_no_nan_values(df, benchmark_cols)]
22
+ return df
23
+
24
+
25
+ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
+ """Creates the different dataframes for the evaluation queue requests"""
27
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
+ all_evals = []
29
+
30
+ for entry in entries:
31
+ if ".json" in entry:
32
+ file_path = os.path.join(save_path, entry)
33
+ with open(file_path) as fp:
34
+ data = json.load(fp)
35
+
36
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
+
39
+ all_evals.append(data)
40
+ elif ".md" not in entry:
41
+ # this is a folder
42
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
43
+ for sub_entry in sub_entries:
44
+ file_path = os.path.join(save_path, entry, sub_entry)
45
+ with open(file_path) as fp:
46
+ data = json.load(fp)
47
+
48
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
+ all_evals.append(data)
51
+
52
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
53
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
54
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
55
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
+ return df_finished[cols], df_running[cols], df_pending[cols]
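A minimal usage sketch for `get_leaderboard_df` follows. How `cols` and `benchmark_cols` are built here is an assumption derived from `AutoEvalColumn` and `Tasks`; the actual app may construct these lists elsewhere.

```python
# Minimal usage sketch (assumption): build the leaderboard DataFrame from the
# local caches populated from the results and requests repos.
from src.about import Tasks
from src.display.utils import AutoEvalColumn, fields
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_leaderboard_df

cols = [c.name for c in fields(AutoEvalColumn)]
benchmark_cols = [task.value.col_name for task in Tasks]

leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, cols, benchmark_cols)
print(leaderboard_df.head())
```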
src/.ipynb_checkpoints/tasks-checkpoint.py ADDED
@@ -0,0 +1,183 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+ @dataclass
5
+ class Task:
6
+ benchmark: str
7
+ # metric: str
8
+ accuracy: str
9
+ col_name: str
10
+
11
+ NUM_FEWSHOT = 0 # Change with your few shot
12
+ # ---------------------------------------------------
13
+
14
+ # Your leaderboard name
15
+ TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀</h1>"""
16
+
17
+ # What does your leaderboard evaluate?
18
+ INTRODUCTION_TEXT = """
19
+ Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, thereby mitigating the model's sensitivity to specific prompts and allowing a fairer evaluation.
20
+ """
21
+
22
+ #MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)"
23
+ MEASURE_DESCRIPTION = "<small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the assessed prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above).</small>"
24
+ #MEASURE_DESCRIPTION = "<small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = avg. accuracy over prompts. **Best Prompt** = accuracy of best prompt. **Prompt ID** = ID of the best prompt (see legend above).</small>"
25
+
26
+ # Tasks Descriptions
27
+ TE_DESCRIPTION = """### Textual Entailment (TE) --- *Multiple-choice task*
28
+ The input consists of two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.
29
+
30
+ | # | Prompt | Answer Choices |
31
+ |-----|------------|--------------|
32
+ | 1 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
33
+ | 2 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
34
+ | 3 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
35
+ | 4 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
36
+ | 5 | Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
37
+ | 6 | Devi risolvere un compito di inferenza semantica. Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
38
+
39
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
40
+
41
+ """
42
+
43
+ SA_DESCRIPTION = """### Sentiment Analysis (SA) --- *Multiple-choice task*
44
+ The input is a tweet. The model has to determine the sentiment polarity of the text, categorizing it into one of four classes: positive, negative, neutral, or mixed.
45
+
46
+ | # | Prompt | Answer Choices |
47
+ |-----|--------------------------------------------------------------------------------|-----------------------------|
48
+ | 1 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] |
49
+ | 2 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] |
50
+ | 3 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: Misto\\nRisposta: | ["A", "B", "C", "D"] |
51
+ | 4 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: \\nRisposta: | ["A", "B", "C", "D"] |
52
+ | 5 | Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] |
53
+ | 6 | Devi svolgere un compito di analisi del sentiment. Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] |
54
+
55
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-macro averaged over the 6 prompts. **Best Prompt** = F1-macro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
56
+
57
+ """
58
+
59
+ HS_DESCRIPTION = """### Hate Speech (HS) --- *Multiple-choice task*
60
+ The input is a tweet. The model has to determine whether the text contains hateful content directed towards marginalized or minority groups. The output is a binary classification: hateful or not hateful.
61
+
62
+ | # | Prompt | Answer Choices |
63
+ |-----|--------------------------------------------------------------------------------|-------------------------------------------------|
64
+ | 1 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
65
+ | 2 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
66
+ | 3 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: \\nB: Falso\\nRisposta: | ["B", "A"] |
67
+ | 4 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: \\nB: \\nRisposta: | ["B", "A"] |
68
+ | 5 | Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |
69
+ | 6 | Devi svolgere un compito di identificazione di incitamento all'odio. Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |
70
+
71
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-micro averaged over the 6 prompts. **Best Prompt** = F1-micro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
72
+
73
+ """
74
+
75
+ AT_DESCRIPTION = """### Admission Tests (AT) --- *Multiple-choice task*
76
+ The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer.
77
+
78
+ | # | Prompt | Answer Choices |
79
+ |-----|--------------------------------------------------------------------------------|-----------------------------|
80
+ | 1 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] |
81
+ | 2 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] |
82
+ | 3 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] |
83
+ | 4 | Devi risolvere un compito a scelta multipla. Dato il seguente caso clinico: '{{background}}', qual è la risposta corretta alla domanda: '{{domanda}}'?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta:Devi risolvere un compito a scelta multipla. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] |
84
+ | 5 | Dato il seguente caso clinico: '{{background}}'. La risposta corretta alla domanda: '{{domanda}}' èDato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] |
85
+ | 6 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] |
86
+
87
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
88
+
89
+ """
90
+
91
+ WIC_DESCRIPTION = """### Word in Context (WIC) --- *Multiple-choice task*
92
+ The input consists of a word (w) and two sentences. The model has to determine whether the word w has the same meaning in both sentences. The output is a binary classification: 1 (same meaning) or 0 (different meaning).
93
+
94
+ | # | Prompt | Answer Choices |
95
+ |-----|--------------------------------------------------------------------------------|-------------------------------------------------|
96
+ | 1 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] |
97
+ | 2 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] |
98
+ | 3 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] |
99
+ | 4 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: \\nB: No\\nRisposta: | ["B", "A"] |
100
+ | 5 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] |
101
+ | 6 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] |
102
+
103
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-macro averaged over the 6 prompts. **Best Prompt** = F1-macro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
104
+
105
+ """
106
+
107
+ FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ) --- *Multiple-choice task*
108
+ The input is a user query regarding the water supply service. The model must identify the correct answer from the 4 available options.
109
+
110
+ | # | Prompt | Answer Choices |
111
+ |-----|--------------------------------------------------------------------------------|-----------------------------|
112
+ | 1 | Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} |
113
+ | 2 | Devi risolvere un compito di risposte a domande. Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} |
114
+ | 3 | Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] |
115
+ | 4 | Devi risolvere un compito a scelta multipla. Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] |
116
+ | 5 | La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} |
117
+ | 6 | Devi risolvere un compito di risposte a domande. La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} |
118
+
119
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
120
+
121
+ """
122
+
123
+ LS_DESCRIPTION = """### Lexical Substitution (LS) --- *Generative task*
124
+ The input is a sentence containing a target word (w). The model has to replace the target word w with its most suitable synonyms that are contextually relevant.
125
+
126
+ | # | Prompt |
127
+ |-----|--------------------------------------------------------------------------------|
128
+ | 1 | Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
129
+ | 2 | Devi risolvere un compito di sostituzione lessicale. Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
130
+
131
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
132
+
133
+ """
134
+
135
+ SU_DESCRIPTION = """### Summarization (SUM) --- *Generative task*
136
+ The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points.
137
+
138
+ | # | Prompt |
139
+ |-----|--------------------------------------------------------------------------------|
140
+ | 1 | Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
141
+ | 2 | Devi risolvere un compito di sintesi automatica del testo. Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
142
+
143
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
144
+
145
+ """
146
+
147
+ NER_DESCRIPTION = """### Named Entity Recognition (NER) --- *Generative task*
148
+ The input is a sentence. The model has to identify and classify Named Entities into predefined categories such as person, organization, and location.
149
+
150
+ | # | Prompt |
151
+ |-----|--------------------------------------------------------------------------------|
152
+ | 1 | Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
153
+ | 2 | Devi svolgere un compito di riconoscimento delle entità nei testi. Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
154
+
155
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
156
+
157
+ """
158
+
159
+ REL_DESCRIPTION = """### Relation Extraction (REL) --- *Generative task*
160
+ The input is a sentence of a clinical text. The model must identify and extract relationships between laboratory test results (e.g., blood pressure) and the corresponding tests or procedures that generated them (e.g., blood pressure test).
161
+
162
+ | # | Prompt |
163
+ |-----|--------------------------------------------------------------------------------|
164
+ | 1 | Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
165
+ | 2 | Devi svolgere un compito di estrazione di relazioni da documenti medici. Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
166
+
167
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
168
+
169
+ """
170
+
171
+ # Create a dictionary to map task names to their descriptions
172
+ TASK_DESCRIPTIONS = {
173
+ "TE": TE_DESCRIPTION,
174
+ "SA": SA_DESCRIPTION,
175
+ "HS": HS_DESCRIPTION,
176
+ "AT": AT_DESCRIPTION,
177
+ "WIC": WIC_DESCRIPTION,
178
+ "FAQ": FAQ_DESCRIPTION,
179
+ "LS": LS_DESCRIPTION,
180
+ "SU": SU_DESCRIPTION,
181
+ "NER": NER_DESCRIPTION,
182
+ "REL": REL_DESCRIPTION
183
+ }
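The `TASK_DESCRIPTIONS` mapping above is presumably rendered somewhere in the UI. The sketch below shows one plausible way to display it with Gradio tabs; the layout is an assumption, not code from this commit, and it assumes `src/tasks.py` exposes the same mapping as this checkpoint.

```python
# Minimal sketch (assumption): render one Markdown block per task description.
import gradio as gr

from src.tasks import TASK_DESCRIPTIONS

with gr.Blocks() as demo:
    for task_name, description in TASK_DESCRIPTIONS.items():
        with gr.Tab(task_name):
            gr.Markdown(description)

if __name__ == "__main__":
    demo.launch()
```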
src/__pycache__/about.cpython-310.pyc ADDED
Binary file (5.58 kB). View file
 
src/__pycache__/about.cpython-312.pyc ADDED
Binary file (8.91 kB). View file
 
src/__pycache__/envs.cpython-310.pyc ADDED
Binary file (690 Bytes). View file
 
src/__pycache__/populate.cpython-310.pyc ADDED
Binary file (2.89 kB). View file
 
src/__pycache__/tasks.cpython-310.pyc ADDED
Binary file (18 kB). View file
 
src/__pycache__/tasks.cpython-312.pyc ADDED
Binary file (18.2 kB). View file
 
src/about.py ADDED
@@ -0,0 +1,198 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+ @dataclass
5
+ class Task:
6
+ benchmark: str
7
+ metric: str
8
+ metric_type: str
9
+ col_name: str
10
+
11
+ # Select your tasks here
12
+ # ---------------------------------------------------
13
+ class Tasks(Enum):
14
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+
16
+ #task1 = Task("text-entailment_1", "acc", "CPS", "TE")
17
+ #task2 = Task("text-entailment_2", "acc", "average_accuracy", "TE Prompt Average")
18
+ #task3 = Task("text-entailment_3", "acc", "best_prompt", "TE Best Prompt")
19
+ #task4 = Task("text-entailment_4", "acc", "prompt_id", "TE Best Prompt Id")
20
+
21
+ #task5 = Task("sentiment-analysis_1", "acc", "CPS", "SA")
22
+ #task6 = Task("sentiment-analysis_2", "acc", "average_accuracy", "SA Prompt Average")
23
+ #task7 = Task("sentiment-analysis_3", "acc", "best_prompt", "SA Best Prompt")
24
+ #task8 = Task("sentiment-analysis_4", "acc", "prompt_id", "SA Best Prompt Id")
25
+
26
+ #task9 = Task("hate-speech-detection_1", "acc", "CPS", "HS")
27
+ #task10 = Task("hate-speech-detection_2", "acc", "average_accuracy", "HS Prompt Average")
28
+ #task11 = Task("hate-speech-detection_3", "acc", "best_prompt", "HS Best Prompt")
29
+ #task12 = Task("hate-speech-detection_4", "acc", "prompt_id", "HS Best Prompt Id")
30
+
31
+ #task13 = Task("admission-test_1", "acc", "CPS", "AT")
32
+ #task14 = Task("admission-test_2", "acc", "average_accuracy", "AT Prompt Average")
33
+ #task15 = Task("admission-test_3", "acc", "best_prompt", "AT Best Prompt")
34
+ #task16 = Task("admission-test_4", "acc", "prompt_id", "AT Best Prompt Id")
35
+
36
+ #task17 = Task("word-in-context_1", "acc", "CPS", "WIC")
37
+ #task18 = Task("word-in-context_2", "acc", "average_accuracy", "WIC Prompt Average")
38
+ #task19 = Task("word-in-context_3", "acc", "best_prompt", "WIC Best Prompt")
39
+ #task20 = Task("word-in-context_4", "acc", "prompt_id", "WIC Best Prompt Id")
40
+
41
+ #task21 = Task("faq_1", "acc", "CPS", "FAQ")
42
+ #task22 = Task("faq_2", "acc", "average_accuracy", "FAQ Prompt Average")
43
+ #task23 = Task("faq_3", "acc", "best_prompt", "FAQ Best Prompt")
44
+ #task24 = Task("faq_4", "acc", "prompt_id", "FAQ Best Prompt Id")
45
+
46
+ #task25 = Task("lexical-substitution_1", "acc", "CPS", "LS")
47
+ #task26 = Task("lexical-substitution_2", "acc", "average_accuracy", "LS Prompt Average")
48
+ #task27 = Task("lexical-substitution_3", "acc", "best_prompt", "LS Best Prompt")
49
+ #task28 = Task("lexical-substitution_4", "acc", "prompt_id", "LS Best Prompt Id")
50
+
51
+ #task29 = Task("summarization-fanpage_1", "acc", "CPS", "SU")
52
+ #task30 = Task("summarization-fanpage_2", "acc", "average_accuracy", "SU Prompt Average")
53
+ #task31 = Task("summarization-fanpage_3", "acc", "best_prompt", "SU Best Prompt")
54
+ #task32 = Task("summarization-fanpage_4", "acc", "prompt_id", "SU Best Prompt Id")
55
+
56
+ #task33 = Task("NER_1", "acc", "CPS", "NER")
57
+ #task34 = Task("NER_2", "acc", "average_accuracy", "NER Prompt Average")
58
+ #task35 = Task("NER_3", "acc", "best_prompt", "NER Best Prompt")
59
+ #task36 = Task("NER_4", "acc", "prompt_id", "NER Best Prompt Id")
60
+
61
+ #task37 = Task("relation-extraction_1", "acc", "CPS", "REL")
62
+ #task38 = Task("relation-extraction_2", "acc", "average_accuracy", "REL Prompt Average")
63
+ #task39 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
64
+ #task40 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")
65
+ task1 = Task("RE_1", "acc", "CPS", "REL")
66
+ task2 = Task("RE_2", "acc", "average_accuracy", "REL Prompt Average")
67
+ task3 = Task("RE_3", "acc", "best_prompt", "REL Best Prompt")
68
+ task4 = Task("RE_4", "acc", "prompt_id", "REL Best Prompt Id")
69
+
70
+ task5 = Task("NER_1", "acc", "CPS", "NER")
71
+ task6 = Task("NER_2", "acc", "average_accuracy", "NER Prompt Average")
72
+ task7 = Task("NER_3", "acc", "best_prompt", "NER Best Prompt")
73
+ task8 = Task("NER_4", "acc", "prompt_id", "NER Best Prompt Id")
74
+ '''
75
+ task0 = Task("TextualEntailment", "acc", "Textual Entailment")
76
+ task1 = Task("TextualEntailment_best", "acc", "TextualEntailment Best")
77
+ task2 = Task("Sentiment Analysis", "acc", "Sentiment Analysis")
78
+ task3 = Task("Sentiment Analysis_best", "acc", "Sentiment Analysis_best")
79
+ task4 = Task("Hate Speech", "acc", "Hate Speech")
80
+ task5 = Task("Hate Speech_best", "acc", "Hate Speech_best")
81
+ task6 = Task("Admission Test", "acc", "Admission Test")
82
+ task7 = Task("Admission Test_best", "acc", "Admission Test_best")
83
+ task8 = Task("Word in Context", "acc", "Word in Context")
84
+ task9 = Task("Word in Context_best", "acc", "Word in Context_best")
85
+ task10 = Task("FAQ", "acc", "FAQ")
86
+ task11 = Task("FAQ_best", "acc", "FAQ_best")
87
+ task12 = Task("Lexical Substitution", "acc", "Lexical Substitution")
88
+ task13 = Task("Lexical Substitution_best", "acc", "Lexical Substitution_best")
89
+ task14 = Task("Summarization", "acc", "Summarization")
90
+ task15 = Task("Summarization_best", "acc", "Summarization_best")
91
+ task16 = Task("NER", "acc", "NER")
92
+ task17 = Task("NER_best", "acc", "NER_best")
93
+ task18 = Task("REL", "acc", "REL")
94
+ task19 = Task("REL_best", "acc", "REL_best")
95
+ '''
96
+
97
+ # Your leaderboard name
98
+ TITLE = """<h1 align="center" id="space-title">🚀 ECREAM-LLM Leaderboard 🚀</h1>"""
99
+
100
+ # What does your leaderboard evaluate?
101
+ INTRODUCTION_TEXT = """
102
+ ECREAM-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on 6 languages. ..........................................................
103
+ ..............................
104
+ ................................
105
+
106
+ **<small>Multiple-choice tasks:</small>** <small> 📊TE (Textual Entailment), 😃SA (Sentiment Analysis), ⚠️HS (Hate Speech Detection), 🏥AT (Admission Test), 🔤WIC (Word in Context), ❓FAQ (Frequently Asked Questions) </small><br>
107
+ **<small>Generative tasks:</small>** <small>🔄LS (Lexical Substitution), 📝SU (Summarization), 🏷️NER (Named Entity Recognition), 🔗REL (Relation Extraction) </small>
108
+ """
109
+
110
+ # Which evaluations are you running? how can people reproduce what you have?
111
+ LLM_BENCHMARKS_TEXT = f"""
112
+ ### Groups
113
+
114
+ - `evalita-mp`: All tasks (perplexity and non-perplexity based).
115
+ - `evalita-mp_gen`: Only generative tasks.
116
+ - `evalita-mp_mc`: Only multiple-choice tasks.
117
+
118
+ #### Tasks
119
+
120
+ The following Evalita-LLM tasks can also be evaluated in isolation:
121
+ - `evalita-mp_te`: Textual Entailment (TE)
122
+ - `evalita-mp_sa`: Sentiment Analysis (SA)
123
+ - `evalita-mp_wic`: Word in Context (WIC)
124
+ - `evalita-mp_hs`: Hate Speech Detection (HS)
125
+ - `evalita-mp_at`: Admission Tests (AT)
126
+ - `evalita-mp_faq`: Frequently Asked Questions & Question Answering (FAQ)
127
+ - `evalita-mp_sum_fp`: Summarization (SU)
128
+ - `evalita-mp_ls`: Lexical Substitution (LS)
129
+ - `evalita-mp_ner_group`: Named Entity Recognition (NER)
130
+ - `evalita-mp_re`: Relation Extraction (REL)
131
+
132
+
133
+ ### Usage
134
+
135
+ ```bash
136
+
137
+ lm_eval --model hf --model_args pretrained=meta-llama/Llama-2-7b-hf --tasks evalita-mp --device cuda:0 --batch_size 1
138
+ ```
139
+
140
+ <!--
141
+ ### Checklist
142
+
143
+ * [x] Is the task an existing benchmark in the literature?
144
+ * [x] Have you referenced the original paper that introduced the task?
145
+ * [x] If yes, does the original paper provide a reference implementation?
146
+ * [x] Yes, original implementation contributed by author of the benchmark
147
+
148
+ If other tasks on this dataset are already supported:
149
+ * [x] Is the "Main" variant of this task clearly denoted?
150
+ * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
151
+ * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
152
+ -->
153
+
154
+
155
+ """
156
+
157
+ EVALUATION_QUEUE_TEXT = """
158
+ ## Some good practices before submitting a model
159
+
160
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
161
+ ```python
162
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
163
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
164
+ model = AutoModel.from_pretrained("your model name", revision=revision)
165
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
166
+ ```
167
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
168
+
169
+ Note: make sure your model is public!
170
+ Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it; stay posted!
171
+
172
+ ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
173
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
174
+
175
+ ### 3) Make sure your model has an open license!
176
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
177
+
178
+ ### 4) Fill up your model card
179
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card
180
+
181
+ ## In case of model failure
182
+ If your model is displayed in the `FAILED` category, its execution stopped.
183
+ Make sure you have followed the above steps first.
184
+ If everything is done, check that you can launch the EleutherAI harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
185
+ """
186
+
187
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
188
+ CITATION_BUTTON_TEXT = r"""
189
+ @misc{magnini2025evalitallmbenchmarkinglargelanguage,
190
+ title={Evalita-LLM: Benchmarking Large Language Models on Italian},
191
+ author={Bernardo Magnini and Roberto Zanoli and Michele Resta and Martin Cimmino and Paolo Albano and Marco Madeddu and Viviana Patti},
192
+ year={2025},
193
+ eprint={2502.02289},
194
+ archivePrefix={arXiv},
195
+ primaryClass={cs.CL},
196
+ url={https://arxiv.org/abs/2502.02289},
197
+ }
198
+ """
src/display/.ipynb_checkpoints/utils-checkpoint.py ADDED
@@ -0,0 +1,160 @@
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ from src.about import Tasks
7
+
8
+ def fields(raw_class):
9
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
+
11
+
12
+ # These classes are for user facing column names,
13
+ # to avoid having to change them all around the code
14
+ # when a modif is needed
15
+ @dataclass
16
+ class ColumnContent:
17
+ name: str
18
+ type: str
19
+ displayed_by_default: bool
20
+ hidden: bool = False
21
+ never_hidden: bool = False
22
+
23
+ ## Leaderboard columns
24
+ auto_eval_column_dict = []
25
+ # Init
26
+ #auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
+
28
+ auto_eval_column_dict.append(["fewshot_symbol", ColumnContent, ColumnContent("FS", "str", True, never_hidden=True)])
29
+ auto_eval_column_dict.append(["is_5fewshot", ColumnContent, ColumnContent("IS_FS", "bool", True)])
30
+
31
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
32
+ #auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)])
33
+
34
+ #Scores
35
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg. Combined Performance ⬆️", "number", True)])
36
+ for task in Tasks:
37
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
38
+
39
+ # Model information
40
+ #auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
41
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
42
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
43
+ #auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
44
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
45
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
46
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
47
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
48
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
49
+
50
+ # We use make dataclass to dynamically fill the scores from Tasks
51
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
52
+
53
+ ## For the queue columns in the submission tab
54
+ @dataclass(frozen=True)
55
+ class EvalQueueColumn: # Queue column
56
+ model = ColumnContent("model", "markdown", True)
57
+ revision = ColumnContent("revision", "str", True)
58
+ private = ColumnContent("private", "bool", True)
59
+ #precision = ColumnContent("precision", "str", True)
60
+ weight_type = ColumnContent("weight_type", "str", "Original")
61
+ status = ColumnContent("status", "str", True)
62
+
63
+ ## All the model information that we might need
64
+ @dataclass
65
+ class ModelDetails:
66
+ name: str
67
+ display_name: str = ""
68
+ symbol: str = "" # emoji
69
+
70
+
71
+ class ModelType(Enum):
72
+ PT = ModelDetails(name="pretrained", symbol="🟢")
73
+ FT = ModelDetails(name="fine-tuned", symbol="🔶")
74
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
75
+ RL = ModelDetails(name="RL-tuned", symbol="🟦")
76
+ Unknown = ModelDetails(name="", symbol="?")
77
+
78
+ def to_str(self, separator=" "):
79
+ return f"{self.value.symbol}{separator}{self.value.name}"
80
+
81
+ @staticmethod
82
+ def from_str(type):
83
+ if "fine-tuned" in type or "🔶" in type:
84
+ return ModelType.FT
85
+ if "pretrained" in type or "🟢" in type:
86
+ return ModelType.PT
87
+ if "RL-tuned" in type or "🟦" in type:
88
+ return ModelType.RL
89
+ if "instruction-tuned" in type or "⭕" in type:
90
+ return ModelType.IFT
91
+ return ModelType.Unknown
92
+
93
+ @dataclass
94
+ class FewShotDetails:
95
+ name: str
96
+ symbol: str = "" # emoji
97
+
98
+ class FewShotType(Enum):
99
+ ZS = FewShotDetails(name="zero-shot", symbol="0️⃣")
100
+ FS = FewShotDetails(name="5-few-shot", symbol="5️⃣")
101
+ Unknown = FewShotDetails(name="unknown", symbol="❓")
102
+
103
+ def to_str(self, separator=" "):
104
+ return f"{self.value.symbol}{separator}{self.value.name}"
105
+
106
+ @staticmethod
107
+ def from_num_fewshot(is_5fewshot):
108
+ """Determines FewShotType based on num_fewshot."""
109
+ if is_5fewshot is False:
110
+ return FewShotType.ZS
111
+ elif is_5fewshot is True:
112
+ return FewShotType.FS
113
+ return FewShotType.Unknown
114
+
115
+ class WeightType(Enum):
116
+ Adapter = ModelDetails("Adapter")
117
+ Original = ModelDetails("Original")
118
+ Delta = ModelDetails("Delta")
119
+
120
+ class Precision(Enum):
121
+ float16 = ModelDetails("float16")
122
+ bfloat16 = ModelDetails("bfloat16")
123
+ Unknown = ModelDetails("?")
124
+
125
+ def from_str(precision):
126
+ if precision in ["torch.float16", "float16"]:
127
+ return Precision.float16
128
+ if precision in ["torch.bfloat16", "bfloat16"]:
129
+ return Precision.bfloat16
130
+ return Precision.Unknown
131
+
132
+ # Column selection
133
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
134
+
135
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
136
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
137
+
138
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
139
+
140
+ '''
141
+ # New values for CPS, AVERAGE, BEST, and ID in the table
142
+ @dataclass
143
+ class NewColumnContent:
144
+ name: str
145
+ type: str
146
+ displayed_by_default: bool
147
+ hidden: bool = False
148
+ never_hidden: bool = False
149
+ '''
150
+
151
+ '''
152
+ new_column_dict = []
153
+ # Add CPS, AVERAGE, BEST, ID
154
+ new_column_dict.append(["CPS", NewColumnContent, NewColumnContent("CPS", "number", True)])
155
+ new_column_dict.append(["AVERAGE", NewColumnContent, NewColumnContent("Average ⬆️", "number", True)])
156
+ new_column_dict.append(["BEST", NewColumnContent, NewColumnContent("Best Performance", "number", True)])
157
+ new_column_dict.append(["ID", NewColumnContent, NewColumnContent("ID", "str", True)])
158
+ NewColumn = make_dataclass("NewColumn", new_column_dict, frozen=True)
159
+ NEW_COLS = [c.name for c in fields(NewColumn) if not c.hidden]
160
+ '''
src/display/__pycache__/css_html_js.cpython-310.pyc ADDED
Binary file (1.94 kB). View file
 
src/display/__pycache__/css_html_js.cpython-312.pyc ADDED
Binary file (1.95 kB). View file
 
src/display/__pycache__/formatting.cpython-310.pyc ADDED
Binary file (1.42 kB). View file
 
src/display/__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.14 kB). View file
 
src/display/__pycache__/utils.cpython-312.pyc ADDED
Binary file (7.41 kB). View file
 
src/display/css_html_js.py ADDED
@@ -0,0 +1,106 @@
1
+ custom_css = """
2
+
3
+ .markdown-text {
4
+ font-size: 16px !important;
5
+ }
6
+
7
+ #models-to-add-text {
8
+ font-size: 18px !important;
9
+ }
10
+
11
+ #citation-button span {
12
+ font-size: 16px !important;
13
+ }
14
+
15
+ #citation-button textarea {
16
+ font-size: 16px !important;
17
+ }
18
+
19
+ #citation-button > label > button {
20
+ margin: 6px;
21
+ transform: scale(1.3);
22
+ }
23
+
24
+ #leaderboard-table {
25
+ margin-top: 15px
26
+ }
27
+
28
+ #leaderboard-table-lite {
29
+ margin-top: 15px
30
+ }
31
+
32
+ #search-bar-table-box > div:first-child {
33
+ background: none;
34
+ border: none;
35
+ }
36
+
37
+ #search-bar {
38
+ padding: 0px;
39
+ }
40
+
41
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
+ #leaderboard-table td:nth-child(2),
43
+ #leaderboard-table th:nth-child(2) {
44
+ max-width: 400px;
45
+ overflow: auto;
46
+ white-space: nowrap;
47
+ }
48
+
49
+ .tab-buttons button {
50
+ font-size: 20px;
51
+ }
52
+
53
+ #scale-logo {
54
+ border-style: none !important;
55
+ box-shadow: none;
56
+ display: block;
57
+ margin-left: auto;
58
+ margin-right: auto;
59
+ max-width: 600px;
60
+ }
61
+
62
+ #scale-logo .download {
63
+ display: none;
64
+ }
65
+ #filter_type{
66
+ border: 0;
67
+ padding-left: 0;
68
+ padding-top: 0;
69
+ }
70
+ #filter_type label {
71
+ display: flex;
72
+ }
73
+ #filter_type label > span{
74
+ margin-top: var(--spacing-lg);
75
+ margin-right: 0.5em;
76
+ }
77
+ #filter_type label > .wrap{
78
+ width: 103px;
79
+ }
80
+ #filter_type label > .wrap .wrap-inner{
81
+ padding: 2px;
82
+ }
83
+ #filter_type label > .wrap .wrap-inner input{
84
+ width: 1px
85
+ }
86
+ #filter-columns-type{
87
+ border:0;
88
+ padding:0.5;
89
+ }
90
+ #filter-columns-size{
91
+ border:0;
92
+ padding:0.5;
93
+ }
94
+ #box-filter > .form{
95
+ border: 0
96
+ }
97
+
98
+ """
99
+
100
+ get_window_url_params = """
101
+ function(url_params) {
102
+ const params = new URLSearchParams(window.location.search);
103
+ url_params = Object.fromEntries(params);
104
+ return url_params;
105
+ }
106
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,27 @@
1
+ def model_hyperlink(link, model_name):
2
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
+
4
+
5
+ def make_clickable_model(model_name):
6
+ link = f"https://huggingface.co/{model_name}"
7
+ return model_hyperlink(link, model_name)
8
+
9
+
10
+ def styled_error(error):
11
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
12
+
13
+
14
+ def styled_warning(warn):
15
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
16
+
17
+
18
+ def styled_message(message):
19
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
20
+
21
+
22
+ def has_no_nan_values(df, columns):
23
+ return df[columns].notna().all(axis=1)
24
+
25
+
26
+ def has_nan_values(df, columns):
27
+ return df[columns].isna().any(axis=1)
src/display/utils.py ADDED
@@ -0,0 +1,166 @@
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+
4
+ import pandas as pd
5
+
6
+ from src.about import Tasks
7
+
8
+ def fields(raw_class):
9
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
+
11
+
12
+ # These classes are for user facing column names,
13
+ # to avoid having to change them all around the code
14
+ # when a modif is needed
15
+ @dataclass
16
+ class ColumnContent:
17
+ name: str
18
+ type: str
19
+ displayed_by_default: bool
20
+ hidden: bool = False
21
+ never_hidden: bool = False
22
+
23
+ ## Leaderboard columns
24
+ auto_eval_column_dict = []
25
+ # Init
26
+ #auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
+
28
+ auto_eval_column_dict.append(["fewshot_symbol", ColumnContent, ColumnContent("FS", "str", True, never_hidden=True)])
29
+ auto_eval_column_dict.append(["is_5fewshot", ColumnContent, ColumnContent("IS_FS", "bool", True)])
30
+
31
+
32
+ ##### languages #############
33
+ auto_eval_column_dict.append(["LANG", ColumnContent, ColumnContent("LANG", "str", True, never_hidden=True)])
34
+
35
+
36
+
37
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
38
+ #auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)])
39
+
40
+ #Scores
41
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg. Combined Performance ⬆️", "number", True)])
42
+ for task in Tasks:
43
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
44
+
45
+ # Model information
46
+ #auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
47
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
48
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
49
+ #auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
50
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
51
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
52
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
53
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
54
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
55
+
56
+ # We use make dataclass to dynamically fill the scores from Tasks
57
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
58
+
59
+ ## For the queue columns in the submission tab
60
+ @dataclass(frozen=True)
61
+ class EvalQueueColumn: # Queue column
62
+ model = ColumnContent("model", "markdown", True)
63
+ revision = ColumnContent("revision", "str", True)
64
+ private = ColumnContent("private", "bool", True)
65
+ #precision = ColumnContent("precision", "str", True)
66
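+ # Note: ColumnContent's third positional argument is displayed_by_default; the string "Original" passed here is truthy, so this queue column is effectively displayed by default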
+ weight_type = ColumnContent("weight_type", "str", "Original")
67
+ status = ColumnContent("status", "str", True)
68
+
69
+ ## All the model information that we might need
70
+ @dataclass
71
+ class ModelDetails:
72
+ name: str
73
+ display_name: str = ""
74
+ symbol: str = "" # emoji
75
+
76
+
77
+ class ModelType(Enum):
78
+ PT = ModelDetails(name="pretrained", symbol="🟢")
79
+ FT = ModelDetails(name="fine-tuned", symbol="🔶")
80
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
81
+ RL = ModelDetails(name="RL-tuned", symbol="🟦")
82
+ Unknown = ModelDetails(name="", symbol="?")
83
+
84
+ def to_str(self, separator=" "):
85
+ return f"{self.value.symbol}{separator}{self.value.name}"
86
+
87
+ @staticmethod
88
+ def from_str(type):
89
+ if "fine-tuned" in type or "🔶" in type:
90
+ return ModelType.FT
91
+ if "pretrained" in type or "🟢" in type:
92
+ return ModelType.PT
93
+ if "RL-tuned" in type or "🟦" in type:
94
+ return ModelType.RL
95
+ if "instruction-tuned" in type or "⭕" in type:
96
+ return ModelType.IFT
97
+ return ModelType.Unknown
98
+
99
+ @dataclass
100
+ class FewShotDetails:
101
+ name: str
102
+ symbol: str = "" # emoji
103
+
104
+ class FewShotType(Enum):
105
+ ZS = FewShotDetails(name="zero-shot", symbol="0️⃣")
106
+ FS = FewShotDetails(name="5-few-shot", symbol="5️⃣")
107
+ Unknown = FewShotDetails(name="unknown", symbol="❓")
108
+
109
+ def to_str(self, separator=" "):
110
+ return f"{self.value.symbol}{separator}{self.value.name}"
111
+
112
+ @staticmethod
113
+ def from_num_fewshot(is_5fewshot):
114
+ """Determines FewShotType based on num_fewshot."""
115
+ if is_5fewshot is False:
116
+ return FewShotType.ZS
117
+ elif is_5fewshot is True:
118
+ return FewShotType.FS
119
+ return FewShotType.Unknown
120
+
121
+ class WeightType(Enum):
122
+ Adapter = ModelDetails("Adapter")
123
+ Original = ModelDetails("Original")
124
+ Delta = ModelDetails("Delta")
125
+
126
+ class Precision(Enum):
127
+ float16 = ModelDetails("float16")
128
+ bfloat16 = ModelDetails("bfloat16")
129
+ Unknown = ModelDetails("?")
130
+
131
+ def from_str(precision):
132
+ if precision in ["torch.float16", "float16"]:
133
+ return Precision.float16
134
+ if precision in ["torch.bfloat16", "bfloat16"]:
135
+ return Precision.bfloat16
136
+ return Precision.Unknown
137
+
138
+ # Column selection
139
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
140
+
141
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
142
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
143
+
144
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
145
+
146
+ '''
147
+ # New values for CPS, AVERAGE, BEST, and ID in the table
148
+ @dataclass
149
+ class NewColumnContent:
150
+ name: str
151
+ type: str
152
+ displayed_by_default: bool
153
+ hidden: bool = False
154
+ never_hidden: bool = False
155
+ '''
156
+
157
+ '''
158
+ new_column_dict = []
159
+ # Add CPS, AVERAGE, BEST, ID
160
+ new_column_dict.append(["CPS", NewColumnContent, NewColumnContent("CPS", "number", True)])
161
+ new_column_dict.append(["AVERAGE", NewColumnContent, NewColumnContent("Average ⬆️", "number", True)])
162
+ new_column_dict.append(["BEST", NewColumnContent, NewColumnContent("Best Performance", "number", True)])
163
+ new_column_dict.append(["ID", NewColumnContent, NewColumnContent("ID", "str", True)])
164
+ NewColumn = make_dataclass("NewColumn", new_column_dict, frozen=True)
165
+ NEW_COLS = [c.name for c in fields(NewColumn) if not c.hidden]
166
+ '''
src/envs.py ADDED
@@ -0,0 +1,36 @@
1
+ import os
2
+
3
+ from huggingface_hub import HfApi
4
+
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org; set the HF_TOKEN environment variable rather than hard-coding the secret here
8
+
9
+ #OWNER = "giux78" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
+ OWNER = "saeedfarzi"
11
+ # ----------------------------------
12
+
13
+ #REPO_ID = f"{OWNER}/leaderboard-evalita"
14
+ #QUEUE_REPO = f"{OWNER}/evalita-requests"
15
+ #RESULTS_REPO = f"{OWNER}/evalita-results"
16
+
17
+ REPO_ID = f"{OWNER}/llm_leaderboard"
18
+ QUEUE_REPO = f"{OWNER}/e3c_llm_requests"
19
+ RESULTS_REPO = f"{OWNER}/e3c_llm_results"
20
+
21
+ # If you setup a cache later, just change HF_HOME
22
+ #CACHE_PATH=os.getenv("HF_HOME", "/home/sfarzi/leaderboard/")
23
+
24
+ # Local caches
25
+ #EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
26
+ #EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
27
+ #EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
28
+ #EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
29
+
30
+ EVAL_REQUESTS_PATH ='/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue")
31
+ EVAL_RESULTS_PATH = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results'#os.path.join(CACHE_PATH, "eval-results")
32
+ EVAL_REQUESTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_requests' #os.path.join(CACHE_PATH, "eval-queue-bk")
33
+ EVAL_RESULTS_PATH_BACKEND = '/home/sfarzi/leaderboard/llm_leaderboard/e3c_llm_results' #os.path.join(CACHE_PATH, "eval-results-bk")
34
+
35
+
36
+ API = HfApi(token=TOKEN)
src/leaderboard/.ipynb_checkpoints/read_evals-checkpoint.py ADDED
@@ -0,0 +1,214 @@
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+ from typing import Dict, Union
10
+
11
+ #from get_model_info import num_params
12
+ from src.display.formatting import make_clickable_model
13
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType
14
+ from src.submission.check_validity import is_model_on_hub
15
+
16
+
17
+ @dataclass
18
+ class EvalResult:
19
+ """Represents one full evaluation. Built from a combination of the result and request file for a given run.
20
+ """
21
+ eval_name: str # org_model_precision (uid)
22
+ full_model: str # org/model (path on hub)
23
+ org: str
24
+ model: str
25
+ revision: str # commit hash, "" if main
26
+ results: Dict[str, Union[float, int]] # float or int
27
+ average_CPS: float
28
+ is_5fewshot: bool
29
+ fewshot_symbol: FewShotType = FewShotType.Unknown
30
+ weight_type: WeightType = WeightType.Original # Original or Adapter
31
+ architecture: str = "Unknown"
32
+ license: str = "?"
33
+ likes: int = 0
34
+ num_params: int = 0
35
+ date: str = "" # submission date of request file
36
+ still_on_hub: bool = False
37
+
38
+ @classmethod
39
+ def init_from_json_file(self, json_filepath):
40
+ """Inits the result from the specific model result file"""
41
+ with open(json_filepath) as fp:
42
+ data = json.load(fp)
43
+
44
+ config = data.get("config")
45
+
46
+ #average_CPS = f"{data.get('average_CPS'):.2f}"
47
+ # Get average_CPS
48
+ average_CPS = float(data.get('average_CPS', 0.0)) # default to 0.0
49
+ # Get number of fewshot
50
+ fewshot = config.get("num_fewshot", False)
51
+
52
+ try:
53
+ if fewshot == "5":
54
+ is_5fewshot = True
55
+ else:
56
+ is_5fewshot = False
57
+ except ValueError:
58
+ is_5fewshot = False
59
+ # Determine the few-shot type (ZS or FS) based on num_fewshot
60
+ fewshot_symbol = FewShotType.from_num_fewshot(is_5fewshot) # Use the new
61
+
62
+ # Determine the number of parameters of the models
63
+ num_params = int(0)
64
+ num_params_billion = config.get("num_params_billion")
65
+ if num_params_billion is not None:
66
+ num_params = math.ceil(num_params_billion)
67
+
68
+ # Get model and org
69
+ org_and_model = config.get("model_name", config.get("model_args", None))
70
+ org_and_model = org_and_model.split("/", 1)
71
+
72
+ if len(org_and_model) == 1:
73
+ org = None
74
+ model = org_and_model[0]
75
+ #result_key = f"{model}_{precision.value.name}"
76
+ result_key = f"{model}_{is_5fewshot}"
77
+ else:
78
+ org = org_and_model[0]
79
+ model = org_and_model[1]
80
+ #result_key = f"{org}_{model}_{precision.value.name}"
81
+ result_key = f"{org}_{model}_{is_5fewshot}"
82
+ full_model = "/".join(org_and_model)
83
+
84
+ still_on_hub, _, model_config = is_model_on_hub(
85
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
86
+ )
87
+ architecture = "?"
88
+ if model_config is not None:
89
+ architectures = getattr(model_config, "architectures", None)
90
+ if architectures:
91
+ architecture = ";".join(architectures)
92
+
93
+ # Extract the results of the models
94
+ results = {}
95
+ for task in Tasks:
96
+ task = task.value
97
+
98
+ for k, v in data["tasks"].items():
99
+ if task.benchmark[:-2] == k:
100
+ if "Best Prompt Id" in task.col_name:
101
+ results[task.benchmark] = int(v[task.metric_type][-1:])
102
+ else:
103
+ #results[task.benchmark] = f"{v[task.metric_type]:.2f}" # Ensure two decimals for display
104
+ results[task.benchmark] = float(v[task.metric_type])
105
+ #value = float(v[task.metric_type])
106
+ #results[task.benchmark] = round(value, 2) # round to 2 decimals
107
+
108
+ return self(
109
+ eval_name=result_key,
110
+ full_model=full_model,
111
+ org=org,
112
+ model=model,
113
+ results=results,
114
+ average_CPS=average_CPS,
115
+ fewshot_symbol=fewshot_symbol,
116
+ is_5fewshot=is_5fewshot,
117
+ revision= config.get("model_sha", ""),
118
+ still_on_hub=still_on_hub,
119
+ architecture=architecture,
120
+ num_params=num_params
121
+ )
122
+
123
+ '''
124
+ def update_with_request_file(self, requests_path):
125
+ """Finds the relevant request file for the current model and updates info with it"""
126
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
127
+
128
+ try:
129
+ with open(request_file, "r") as f:
130
+ request = json.load(f)
131
+ self.model_type = ModelType.from_str(request.get("model_type", ""))
132
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
133
+ self.license = request.get("license", "?")
134
+ self.likes = request.get("likes", 0)
135
+ self.num_params = request.get("params", 0)
136
+ self.date = request.get("submitted_time", "")
137
+ except Exception:
138
+ print(f"Could not find request file for {self.org}/{self.model} with precision
139
+ '''
140
+
141
+ def to_dict(self):
142
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
143
+ average = self.average_CPS
144
+
145
+ fewshot_symbol = (
146
+ self.fewshot_symbol.value.symbol if isinstance(self.fewshot_symbol, FewShotType) else "❓"
147
+ )
148
+
149
+ data_dict = {
150
+ "eval_name": self.eval_name, # not a column, just a save name,
151
+ #AutoEvalColumn.precision.name: self.precision.value.name,
152
+ #AutoEvalColumn.model_type.name: self.model_type.value.name,
153
+ #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
154
+ #AutoEvalColumn.model_type.name: self.model_type.value.name if self.model_type else "Unknown",
155
+ #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol if self.model_type else "Unknown",
156
+ AutoEvalColumn.fewshot_symbol.name: fewshot_symbol,
157
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
158
+ AutoEvalColumn.architecture.name: self.architecture,
159
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
160
+ AutoEvalColumn.revision.name: self.revision,
161
+ AutoEvalColumn.average.name: average,
162
+ AutoEvalColumn.is_5fewshot.name: self.is_5fewshot,
163
+ AutoEvalColumn.license.name: self.license,
164
+ AutoEvalColumn.likes.name: self.likes,
165
+ AutoEvalColumn.params.name: self.num_params,
166
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
167
+ }
168
+
169
+ for task in Tasks:
170
+ data_dict[task.value.col_name] = self.results[task.value.benchmark]
171
+
172
+ return data_dict
173
+
174
+
175
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
176
+ """From the path of the results folder root, extract all needed info for results"""
177
+ model_result_filepaths = []
178
+
179
+ for root, _, files in os.walk(results_path):
180
+ # We should only have json files in model results
181
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
182
+ continue
183
+
184
+ # Sort the files by date
185
+ try:
186
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
187
+ except dateutil.parser._parser.ParserError:
188
+ files = [files[-1]]
189
+
190
+ for file in files:
191
+ model_result_filepaths.append(os.path.join(root, file))
192
+
193
+ eval_results = {}
194
+ for model_result_filepath in model_result_filepaths:
195
+ # Creation of result
196
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
197
+ #eval_result.update_with_request_file(requests_path)
198
+
199
+ # Store results of same eval together
200
+ eval_name = eval_result.eval_name
201
+ if eval_name in eval_results.keys():
202
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
203
+ else:
204
+ eval_results[eval_name] = eval_result
205
+
206
+ results = []
207
+ for v in eval_results.values():
208
+ try:
209
+ v.to_dict() # we test if the dict version is complete
210
+ results.append(v)
211
+ except KeyError: # not all eval values present
212
+ continue
213
+
214
+ return results
src/leaderboard/__pycache__/read_evals.cpython-310.pyc ADDED
Binary file (5.47 kB). View file
 
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,257 @@
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+ from typing import Dict, Union
10
+
11
+ #from get_model_info import num_params
12
+ from src.display.formatting import make_clickable_model
13
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, FewShotType
14
+ from src.submission.check_validity import is_model_on_hub
15
+
16
+
17
+ @dataclass
18
+ class EvalResult:
19
+ """Represents one full evaluation. Built from a combination of the result and request file for a given run.
20
+ """
21
+ eval_name: str # org_model_precision (uid)
22
+ full_model: str # org/model (path on hub)
23
+ org: str
24
+ model: str
25
+ revision: str # commit hash, "" if main
26
+ results: Dict[str, Union[float, int]] # float or int
27
+ average_CPS: float
28
+ is_5fewshot: bool
29
+ Lang: str = "EN"
30
+ fewshot_symbol: FewShotType = FewShotType.Unknown
31
+ weight_type: WeightType = WeightType.Original # Original or Adapter
32
+ architecture: str = "Unknown"
33
+ license: str = "?"
34
+ likes: int = 0
35
+ num_params: int = 0
36
+ date: str = "" # submission date of request file
37
+ still_on_hub: bool = False
38
+
39
+ @classmethod
40
+ def init_from_json_file(self, json_filepath):
41
+ """Inits the result from the specific model result file"""
42
+ print ( "************ Reading file ****************")
43
+ print ("file name :" , json_filepath)
44
+ with open(json_filepath) as fp:
45
+ data = json.load(fp)
46
+ #print(json_filepath,data)
47
+ config = data.get("config")
48
+ print (config)
49
+ #print( data)
50
+ #average_CPS = f"{data.get('average_CPS'):.2f}"
51
+ # Get average_CPS
52
+ average_CPS = float(data.get('average_CPS', 0.0)) # default to 0.0
53
+ # Get number of fewshot
54
+ fewshot = config.get("num_fewshot", False)
55
+ Lang=config.get("LANG", "EN")
56
+ try:
57
+ if fewshot == "5":
58
+ is_5fewshot = True
59
+ else:
60
+ is_5fewshot = False
61
+ except ValueError:
62
+ is_5fewshot = False
63
+ # Determine the few-shot type (ZS or FS) based on num_fewshot
64
+ fewshot_symbol = FewShotType.from_num_fewshot(is_5fewshot) # Use the new
65
+
66
+ # Determine the number of parameters of the models
67
+ num_params = int(0)
68
+ num_params_billion = config.get("num_params_billion")
69
+ if num_params_billion is not None:
70
+ num_params = math.ceil(num_params_billion)
71
+
72
+ # Get model and org
73
+ org_and_model = config.get("model_name", config.get("model_args", None))
74
+ org_and_model = org_and_model.split("/", 1)
75
+
76
+ if len(org_and_model) == 1:
77
+ org = None
78
+ model = org_and_model[0]
79
+ #result_key = f"{model}_{precision.value.name}"
80
+ result_key = f"{model}_{is_5fewshot}"
81
+ else:
82
+ org = org_and_model[0]
83
+ model = org_and_model[1]
84
+ #result_key = f"{org}_{model}_{precision.value.name}"
85
+ result_key = f"{org}_{model}_{is_5fewshot}"
86
+ full_model = "/".join(org_and_model)
87
+
88
+ still_on_hub, _, model_config = is_model_on_hub(
89
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
90
+ )
91
+ architecture = "?"
92
+ if model_config is not None:
93
+ architectures = getattr(model_config, "architectures", None)
94
+ if architectures:
95
+ architecture = ";".join(architectures)
96
+
97
+ # Extract the results of the models
98
+ results = {}
99
+ for task in Tasks:
100
+ #print(task, "data:", data["tasks"].items())
101
+ task = task.value
102
+
103
+ for k, v in data["tasks"].items():
104
+
105
+ if task.benchmark[:-2] == k:
106
+ if "Best Prompt Id" in task.col_name:
107
+ #print ("k:", k,"v:", v)
108
+ #print (task.metric_type)
109
+ #print(v[task.metric_type])
110
+ results[task.benchmark] = int(v[task.metric_type][-1:])
111
+ else:
112
+ #results[task.benchmark] = f"{v[task.metric_type]:.2f}" # Ensure two decimals for display
113
+ results[task.benchmark] = float(v[task.metric_type])
114
+ #value = float(v[task.metric_type])
115
+ #results[task.benchmark] = round(value, 2) # Arrotonda a 2 decimali
116
+
117
+ print ("Generated Object: ",self(
118
+ eval_name=result_key+"_"+Lang,
119
+ full_model=full_model,
120
+ Lang=Lang,
121
+ org=org,
122
+ model=model,
123
+ results=results,
124
+ average_CPS=average_CPS,
125
+ fewshot_symbol=fewshot_symbol,
126
+ is_5fewshot=is_5fewshot,
127
+ revision= config.get("model_sha", ""),
128
+ still_on_hub=still_on_hub,
129
+ architecture=architecture,
130
+ num_params=num_params
131
+ ))
132
+ print ( "************ End of Reading file ****************")
133
+ return self(
134
+ eval_name=result_key+"_"+Lang,
135
+ full_model=full_model,
136
+ Lang=Lang,
137
+ org=org,
138
+ model=model,
139
+ results=results,
140
+ average_CPS=average_CPS,
141
+ fewshot_symbol=fewshot_symbol,
142
+ is_5fewshot=is_5fewshot,
143
+ revision= config.get("model_sha", ""),
144
+ still_on_hub=still_on_hub,
145
+ architecture=architecture,
146
+ num_params=num_params
147
+ )
148
+
149
+ '''
150
+ def update_with_request_file(self, requests_path):
151
+ """Finds the relevant request file for the current model and updates info with it"""
152
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
153
+
154
+ try:
155
+ with open(request_file, "r") as f:
156
+ request = json.load(f)
157
+ self.model_type = ModelType.from_str(request.get("model_type", ""))
158
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
159
+ self.license = request.get("license", "?")
160
+ self.likes = request.get("likes", 0)
161
+ self.num_params = request.get("params", 0)
162
+ self.date = request.get("submitted_time", "")
163
+ except Exception:
164
+ print(f"Could not find request file for {self.org}/{self.model} with precision
165
+ '''
166
+
167
+ def to_dict(self):
168
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
169
+ average = self.average_CPS
170
+
171
+
172
+ fewshot_symbol = (
173
+ self.fewshot_symbol.value.symbol if isinstance(self.fewshot_symbol, FewShotType) else "❓"
174
+ )
175
+
176
+ data_dict = {
177
+ "eval_name": self.eval_name, # not a column, just a save name,
178
+ #AutoEvalColumn.precision.name: self.precision.value.name,
179
+ #AutoEvalColumn.model_type.name: self.model_type.value.name,
180
+ #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
181
+ #AutoEvalColumn.model_type.name: self.model_type.value.name if self.model_type else "Unknown",
182
+ #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol if self.model_type else "Unknown",
183
+ AutoEvalColumn.fewshot_symbol.name: fewshot_symbol,
184
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
185
+ AutoEvalColumn.architecture.name: self.architecture,
186
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
187
+ AutoEvalColumn.revision.name: self.revision,
188
+ AutoEvalColumn.average.name: average,
189
+ AutoEvalColumn.is_5fewshot.name: self.is_5fewshot,
190
+ AutoEvalColumn.license.name: self.license,
191
+ AutoEvalColumn.likes.name: self.likes,
192
+ AutoEvalColumn.params.name: self.num_params,
193
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
194
+ AutoEvalColumn.LANG.name:self.Lang
195
+ }
196
+
197
+ for task in Tasks:
198
+ data_dict[task.value.col_name] = self.results[task.value.benchmark]
199
+
200
+ return data_dict
201
+
202
+
203
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
204
+ """From the path of the results folder root, extract all needed info for results"""
205
+ model_result_filepaths = []
206
+
207
+ for root, _, files in os.walk(results_path):
208
+ # We should only have json files in model results
209
+ #print(root,files)
210
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
211
+ continue
212
+ #if len(files) == 0 : continue
213
+ #json_files = [f for f in files if f.endswith(".json")]
214
+ #if not json_files:
215
+ #continue
216
+ # Sort the files by date
217
+ #print(root,files)
218
+ try:
219
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
220
+ except dateutil.parser._parser.ParserError:
221
+ files = [files[-1]]
222
+
223
+ for file in files:
224
+ if not file.endswith(".json"):continue
225
+ model_result_filepaths.append(os.path.join(root, file))
226
+
227
+ #print(model_result_filepaths)
228
+ eval_results = {}
229
+ for model_result_filepath in model_result_filepaths:
230
+ # Creation of result
231
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
232
+ #eval_result.update_with_request_file(requests_path)
233
+ #print ("************************")
234
+ #print("path: ", model_result_filepath)
235
+ #print('eval_result: ',eval_result)
236
+ # Store results of same eval together
237
+ eval_name = eval_result.eval_name
238
+ print('eval_name: ',eval_name)
239
+ print ("lang: ", eval_result.Lang)
240
+
241
+ if ( eval_name in eval_results.keys()) :
242
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
243
+ else:
244
+ eval_results[eval_name] = eval_result
245
+
246
+ results = []
247
+ #print("eval_results: ",eval_results)
248
+ for v in eval_results.values():
249
+ try:
250
+ v.to_dict() # we test if the dict version is complete
251
+ results.append(v)
252
+ except KeyError: # not all eval values present
253
+ print ("Except(error) : line 244 file read_evals.py")
254
+ continue
255
+ print("Final results: ",results)
256
+ print ("@@@@@@@@@@@@")
257
+ return results
src/populate.py ADDED
@@ -0,0 +1,62 @@
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+
6
+ from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
+ from src.leaderboard.read_evals import get_raw_eval_results
9
+
10
+
11
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
+ """Creates a dataframe from all the individual experiment results"""
13
+ print (results_path, requests_path)
14
+ raw_data = get_raw_eval_results(results_path, requests_path)
15
+ print(raw_data)
16
+ all_data_json = [v.to_dict() for v in raw_data]
17
+
18
+ df = pd.DataFrame.from_records(all_data_json)
19
+
20
+ print ("all_data_json: ", all_data_json)
21
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
22
+ df = df[cols].round(decimals=2)
23
+
24
+ # filter out if any of the benchmarks have not been produced
25
+ df = df[has_no_nan_values(df, benchmark_cols)]
26
+ return df
27
+
28
+
29
+ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
30
+ """Creates the different dataframes for the evaluation queues requestes"""
31
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
32
+ all_evals = []
33
+
34
+ for entry in entries:
35
+ if ".json" in entry:
36
+ file_path = os.path.join(save_path, entry)
37
+ with open(file_path) as fp:
38
+ data = json.load(fp)
39
+
40
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
41
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
42
+
43
+ all_evals.append(data)
44
+ elif ".md" not in entry:
45
+ # this is a folder
46
+ sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")]
47
+ for sub_entry in sub_entries:
48
+ file_path = os.path.join(save_path, entry, sub_entry)
49
+ with open(file_path) as fp:
50
+ data = json.load(fp)
51
+
52
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
53
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
54
+ all_evals.append(data)
55
+
56
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
57
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
58
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
59
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
60
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
61
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
62
+ return df_finished[cols], df_running[cols], df_pending[cols]
src/submission/__pycache__/check_validity.cpython-310.pyc ADDED
Binary file (3.84 kB). View file
 
src/submission/__pycache__/submit.cpython-310.pyc ADDED
Binary file (2.84 kB). View file
 
src/submission/check_validity.py ADDED
@@ -0,0 +1,99 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from datetime import datetime, timedelta, timezone
6
+
7
+ import huggingface_hub
8
+ from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo
10
+ from transformers import AutoConfig
11
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
12
+
13
+ def check_model_card(repo_id: str) -> tuple[bool, str]:
14
+ """Checks if the model card and license exist and have been filled"""
15
+ try:
16
+ card = ModelCard.load(repo_id)
17
+ except huggingface_hub.utils.EntryNotFoundError:
18
+ return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
+
20
+ # Enforce license metadata
21
+ if card.data.license is None:
22
+ if not ("license_name" in card.data and "license_link" in card.data):
23
+ return False, (
24
+ "License not found. Please add a license to your model card using the `license` metadata or a"
25
+ " `license_name`/`license_link` pair."
26
+ )
27
+
28
+ # Enforce card content
29
+ if len(card.text) < 200:
30
+ return False, "Please add a description to your model card, it is too short."
31
+
32
+ return True, ""
33
+
34
+ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
+ """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
+ try:
37
+ config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
+ if test_tokenizer:
39
+ try:
40
+ tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
+ except ValueError as e:
42
+ return (
43
+ False,
44
+ f"uses a tokenizer which is not in a transformers release: {e}",
45
+ None
46
+ )
47
+ except Exception as e:
48
+ return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
+ return True, None, config
50
+
51
+ except ValueError:
52
+ return (
53
+ False,
54
+ "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
+ None
56
+ )
57
+
58
+ except Exception as e:
59
+ return False, "was not found on hub!", None
60
+
61
+
62
+ def get_model_size(model_info: ModelInfo, precision: str):
63
+ """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
+ try:
65
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
+ except (AttributeError, TypeError):
67
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in example_app.py
68
+
69
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
+ model_size = size_factor * model_size
71
+ return model_size
72
+
73
+ def get_model_arch(model_info: ModelInfo):
74
+ """Gets the model architecture from the configuration"""
75
+ return model_info.config.get("architectures", "Unknown")
76
+
77
+ def already_submitted_models(requested_models_dir: str) -> tuple[set[str], dict[str, list[str]]]:
78
+ """Gather a list of already submitted models to avoid duplicates"""
79
+ depth = 1
80
+ file_names = []
81
+ users_to_submission_dates = defaultdict(list)
82
+
83
+ for root, _, files in os.walk(requested_models_dir):
84
+ current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
+ if current_depth == depth:
86
+ for file in files:
87
+ if not file.endswith(".json"):
88
+ continue
89
+ with open(os.path.join(root, file), "r") as f:
90
+ info = json.load(f)
91
+ file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
+
93
+ # Select organisation
94
+ if info["model"].count("/") == 0 or "submitted_time" not in info:
95
+ continue
96
+ organisation, _ = info["model"].split("/")
97
+ users_to_submission_dates[organisation].append(info["submitted_time"])
98
+
99
+ return set(file_names), users_to_submission_dates
src/submission/submit.py ADDED
@@ -0,0 +1,119 @@
1
+ import json
2
+ import os
3
+ from datetime import datetime, timezone
4
+
5
+ from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
+ from src.submission.check_validity import (
8
+ already_submitted_models,
9
+ check_model_card,
10
+ get_model_size,
11
+ is_model_on_hub,
12
+ )
13
+
14
+ REQUESTED_MODELS = None
15
+ USERS_TO_SUBMISSION_DATES = None
16
+
17
+ def add_new_eval(
18
+ model: str,
19
+ base_model: str,
20
+ revision: str,
21
+ precision: str,
22
+ weight_type: str,
23
+ model_type: str,
24
+ ):
25
+ global REQUESTED_MODELS
26
+ global USERS_TO_SUBMISSION_DATES
27
+ if not REQUESTED_MODELS:
28
+ REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
+
30
+ user_name = ""
31
+ model_path = model
32
+ if "/" in model:
33
+ user_name = model.split("/")[0]
34
+ model_path = model.split("/")[1]
35
+
36
+ precision = precision.split(" ")[0]
37
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
+
39
+ if model_type is None or model_type == "":
40
+ return styled_error("Please select a model type.")
41
+
42
+ # Does the model actually exist?
43
+ if revision == "":
44
+ revision = "main"
45
+
46
+ # Is the model on the hub?
47
+ if weight_type in ["Delta", "Adapter"]:
48
+ base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
+ if not base_model_on_hub:
50
+ return styled_error(f'Base model "{base_model}" {error}')
51
+
52
+ if not weight_type == "Adapter":
53
+ model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
+ if not model_on_hub:
55
+ return styled_error(f'Model "{model}" {error}')
56
+
57
+ # Is the model info correctly filled?
58
+ try:
59
+ model_info = API.model_info(repo_id=model, revision=revision)
60
+ except Exception:
61
+ return styled_error("Could not get your model information. Please fill it up properly.")
62
+
63
+ model_size = get_model_size(model_info=model_info, precision=precision)
64
+
65
+ # Were the model card and license filled?
66
+ try:
67
+ license = model_info.cardData["license"]
68
+ except Exception:
69
+ return styled_error("Please select a license for your model")
70
+
71
+ modelcard_OK, error_msg = check_model_card(model)
72
+ if not modelcard_OK:
73
+ return styled_error(error_msg)
74
+
75
+ # Seems good, creating the eval
76
+ print("Adding new eval")
77
+
78
+ eval_entry = {
79
+ "model": model,
80
+ "base_model": base_model,
81
+ "revision": revision,
82
+ "precision": precision,
83
+ "weight_type": weight_type,
84
+ "status": "PENDING",
85
+ "submitted_time": current_time,
86
+ "model_type": model_type,
87
+ "likes": model_info.likes,
88
+ "params": model_size,
89
+ "license": license,
90
+ "private": False,
91
+ }
92
+
93
+ # Check for duplicate submission
94
+ if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
+ return styled_warning("This model has been already submitted.")
96
+
97
+ print("Creating eval file")
98
+ OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
+ os.makedirs(OUT_DIR, exist_ok=True)
100
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
+
102
+ with open(out_path, "w") as f:
103
+ f.write(json.dumps(eval_entry))
104
+
105
+ print("Uploading eval file")
106
+ API.upload_file(
107
+ path_or_fileobj=out_path,
108
+ path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1], # path relative to the requests dataset repo root
109
+ repo_id=QUEUE_REPO,
110
+ repo_type="dataset",
111
+ commit_message=f"Add {model} to eval queue",
112
+ )
113
+
114
+ # Remove the local file
115
+ os.remove(out_path)
116
+
117
+ return styled_message(
118
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
+ )
src/tasks.py ADDED
@@ -0,0 +1,183 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+
4
+ @dataclass
5
+ class Task:
6
+ benchmark: str
7
+ # metric: str
8
+ accuracy: str
9
+ col_name: str
10
+
11
+ NUM_FEWSHOT = 0 # Change to match your few-shot setting
12
+ # ---------------------------------------------------
13
+
14
+ # Your leaderboard name
15
+ TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀</h1>"""
16
+
17
+ # What does your leaderboard evaluate?
18
+ INTRODUCTION_TEXT = """
19
+ Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, thus mitigating model sensitivity to specific prompts and allowing a fairer evaluation.
20
+ """
21
+
22
+ #MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)"
23
+ MEASURE_DESCRIPTION = "<small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the assessed prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above).</small>"
24
+ #MEASURE_DESCRIPTION = "<small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = avg. accuracy over prompts. **Best Prompt** = accuracy of best prompt. **Prompt ID** = ID of the best prompt (see legend above).</small>"
25
+
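+ # Illustrative helper (not used by the app): how the Combined Performance measure
+ # described above can be computed, assuming scores are expressed on a 0-100 scale.
+ def combined_performance_example(best_prompt: float, prompt_average: float) -> float:
+     """CPS = (1 - (best_prompt - prompt_average) / 100) * best_prompt."""
+     return (1 - (best_prompt - prompt_average) / 100) * best_prompt
+ # e.g. combined_performance_example(80.0, 70.0) == 72.0
+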
26
+ # Tasks Descriptions
27
+ TE_DESCRIPTION = """### Textual Entailment (TE) --- *Multiple-choice task*
28
+ The input consists of two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.
29
+
30
+ | # | Prompt | Answer Choices |
31
+ |-----|------------|--------------|
32
+ | 1 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
33
+ | 2 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
34
+ | 3 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
35
+ | 4 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
36
+ | 5 | Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
37
+ | 6 | Devi risolvere un compito di inferenza semantica. Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
38
+
39
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
40
+
41
+ """
42
+
43
+ SA_DESCRIPTION = """### Sentiment Analysis (SA) --- *Multiple-choice task*
44
+ The input is a tweet. The model has to determine the sentiment polarity of the text, categorizing it into one of four classes: positive, negative, neutral, or mixed.
45
+
46
+ | # | Prompt | Answer Choices |
47
+ |-----|--------------------------------------------------------------------------------|-----------------------------|
48
+ | 1 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] |
49
+ | 2 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] |
50
+ | 3 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: Misto\\nRisposta: | ["A", "B", "C", "D"] |
51
+ | 4 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: Misto\\nRisposta: | ["A", "B", "C", "D"] |
52
+ | 5 | Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] |
53
+ | 6 | Devi svolgere un compito di analisi del sentiment. Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] |
54
+
55
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-macro averaged over the 6 prompts. **Best Prompt** = F1-macro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
56
+
57
+ """
58
+
59
+ HS_DESCRIPTION = """### Hate Speech (HS) --- *Multiple-choice task*
60
+ The input is a tweet. The model has to determine whether the text contains hateful content directed towards marginalized or minority groups. The output is a binary classification: hateful or not hateful.
61
+
62
+ | # | Prompt | Answer Choices |
63
+ |-----|--------------------------------------------------------------------------------|-------------------------------------------------|
64
+ | 1 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
65
+ | 2 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
66
+ | 3 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"] |
67
+ | 4 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"] |
68
+ | 5 | Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |
69
+ | 6 | Devi svolgere un compito di identificazione di incitamento all'odio. Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |
70
+
71
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-micro averaged over the 6 prompts. **Best Prompt** = F1-micro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
72
+
73
+ """
74
+
75
+ AT_DESCRIPTION = """### Admission Tests (AT) --- *Multiple-choice task*
+ The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer.
+
+ | # | Prompt | Answer Choices |
+ |-----|--------------------------------------------------------------------------------|-----------------------------|
+ | 1 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] |
+ | 2 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] |
+ | 3 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] |
+ | 4 | Devi risolvere un compito a scelta multipla. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] |
+ | 5 | Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] |
+ | 6 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] |
+
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
+
+ """
+
+ WIC_DESCRIPTION = """### Word in Context (WIC) --- *Multiple-choice task*
+ The input consists of a word (w) and two sentences. The model has to determine whether the word w has the same meaning in both sentences. The output is a binary classification: 1 (same meaning) or 0 (different meaning).
+
+ | # | Prompt | Answer Choices |
+ |-----|--------------------------------------------------------------------------------|-------------------------------------------------|
+ | 1 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] |
+ | 2 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] |
+ | 3 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] |
+ | 4 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] |
+ | 5 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] |
+ | 6 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] |
+
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1-macro averaged over the 6 prompts. **Best Prompt** = F1-macro of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
+
+ """
+
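The WIC templates above use slice placeholders such as `{{sentence1[start1:end1]}}`. A minimal sketch of how such a placeholder resolves, assuming the offsets are character positions within the sentence (the example sentence and offsets are invented):

```python
# The target word is recovered by slicing the sentence with character offsets,
# mirroring the {{sentence1[start1:end1]}} placeholder in the WIC prompts above.
# Example values are made up for illustration only.

sentence1 = "Ho depositato i soldi in banca."
start1, end1 = 25, 30            # character span of the target word
target_word = sentence1[start1:end1]
print(target_word)               # -> "banca"
```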
+ FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ) --- *Multiple-choice task*
+ The input is a user query regarding the water supply service. The model must identify the correct answer from the 4 available options.
+
+ | # | Prompt | Answer Choices |
+ |-----|--------------------------------------------------------------------------------|-----------------------------|
+ | 1 | Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} |
+ | 2 | Devi risolvere un compito di risposte a domande. Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} |
+ | 3 | Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] |
+ | 4 | Devi risolvere un compito a scelta multipla. Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] |
+ | 5 | La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} |
+ | 6 | Devi risolvere un compito di risposte a domande. La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} |
+
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the 6 prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
+
+ """
+
+ LS_DESCRIPTION = """### Lexical Substitution (LS) --- *Generative task*
+ The input is a sentence containing a target word (w). The model has to propose contextually appropriate synonyms that can replace the target word w while preserving the sentence's meaning.
+
+ | # | Prompt |
+ |-----|--------------------------------------------------------------------------------|
+ | 1 | Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
+ | 2 | Devi risolvere un compito di sostituzione lessicale. Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
+
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
+
+ """
+
+ SU_DESCRIPTION = """### Summarization (SUM) --- *Generative task*
+ The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points.
+
+ | # | Prompt |
+ |-----|--------------------------------------------------------------------------------|
+ | 1 | Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
+ | 2 | Devi risolvere un compito di sintesi automatica del testo. Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
+
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
+
+ """
+
+ NER_DESCRIPTION = """### Named Entity Recognition (NER) --- *Generative task*
+ The input is a sentence. The model has to identify and classify Named Entities into predefined categories such as person, organization, and location.
+
+ | # | Prompt |
+ |-----|--------------------------------------------------------------------------------|
+ | 1 | Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
+ | 2 | Devi svolgere un compito di riconoscimento delle entità nei testi. Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
+
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
+
+ """
+
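The NER prompts above ask for `Entità$Tipo` pairs separated by commas, with the sentinel `&&NOENT&&` when nothing should be extracted. The repository's evaluation code is not shown in this file, so the parser below is a hypothetical sketch of how such generations could be turned into (entity, type) pairs; the function name and the example generation are invented.

```python
# Hypothetical parser for the NER output format requested by the prompts above:
# "Entità$Tipo" pairs separated by ',' and the sentinel '&&NOENT&&' for "no entities".
# Illustrative sketch only, not this repository's evaluation code.

def parse_ner_output(generation: str) -> list[tuple[str, str]]:
    generation = generation.strip()
    if not generation or generation == "&&NOENT&&":
        return []
    pairs = []
    for chunk in generation.split(","):
        if "$" not in chunk:
            continue  # skip malformed chunks instead of failing
        entity, ent_type = chunk.rsplit("$", 1)
        pairs.append((entity.strip(), ent_type.strip()))
    return pairs

print(parse_ner_output("Mario Rossi$PER, Roma$LOC, FBK$ORG"))
# -> [('Mario Rossi', 'PER'), ('Roma', 'LOC'), ('FBK', 'ORG')]
```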
+ REL_DESCRIPTION = """### Relation Extraction (REL) --- *Generative task*
+ The input is a sentence from a clinical text. The model must identify and extract the relations between laboratory test results (e.g., a blood pressure reading) and the corresponding tests or procedures that produced them (e.g., the blood pressure measurement).
+
+ | # | Prompt |
+ |-----|--------------------------------------------------------------------------------|
+ | 1 | Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
+ | 2 | Devi svolgere un compito di estrazione di relazioni da documenti medici. Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
+
+ <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
+
+ """
+
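For REL the generations are `misurazione$esame` pairs separated by `%`, scored with F1. The exact matching criterion used by the leaderboard is not visible in this file, so the sketch below assumes a simple exact-match F1 over sets of extracted pairs; the helper name and the example pairs are invented.

```python
# Editor-added sketch of an exact-match F1 over extracted "misurazione$esame" pairs.
# The leaderboard's real matching criteria are not shown here; treat this as an
# assumption, not the official scorer.

def pair_f1(predicted: set[tuple[str, str]], gold: set[tuple[str, str]]) -> float:
    if not predicted and not gold:
        return 1.0  # both empty: correct use of the '&&NOREL&&' sentinel
    if not predicted or not gold:
        return 0.0
    true_positives = len(predicted & gold)
    precision = true_positives / len(predicted)
    recall = true_positives / len(gold)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)

gold = {("120/80 mmHg", "pressione arteriosa"), ("98%", "saturazione")}
pred = {("120/80 mmHg", "pressione arteriosa")}
print(round(pair_f1(pred, gold), 2))  # precision 1.0, recall 0.5 -> F1 0.67
```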
+ # Create a dictionary to map task names to their descriptions
+ TASK_DESCRIPTIONS = {
+     "TE": TE_DESCRIPTION,
+     "SA": SA_DESCRIPTION,
+     "HS": HS_DESCRIPTION,
+     "AT": AT_DESCRIPTION,
+     "WIC": WIC_DESCRIPTION,
+     "FAQ": FAQ_DESCRIPTION,
+     "LS": LS_DESCRIPTION,
+     "SU": SU_DESCRIPTION,
+     "NER": NER_DESCRIPTION,
+     "REL": REL_DESCRIPTION
+ }
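A minimal sketch of how a task-to-description mapping like `TASK_DESCRIPTIONS` could be consumed by a Gradio UI, for example to render one documentation tab per task. This is not the repository's actual app.py; the abbreviated descriptions and the tab layout are assumptions for illustration only.

```python
# Hypothetical consumer of a TASK_DESCRIPTIONS-style mapping: one Markdown tab per task.
import gradio as gr

TASK_DESCRIPTIONS = {
    "SA": "### Sentiment Analysis (SA)\nPlaceholder description.",
    "HS": "### Hate Speech (HS)\nPlaceholder description.",
}

with gr.Blocks() as demo:
    for task_name, description in TASK_DESCRIPTIONS.items():
        with gr.Tab(task_name):
            gr.Markdown(description)  # render the task card shown above

if __name__ == "__main__":
    demo.launch()
```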