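"""Gradio app for the EVALITA-LLM leaderboard Space.

On startup it downloads the evaluation-request and result snapshots from the
Hugging Face Hub, builds the main and per-task leaderboards, and schedules a
periodic restart of the Space to refresh the data.
"""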
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE
)
from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn,
    ModelType, fields, WeightType, Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
# Define the task icons and names
TASK_ICONS = {
    "TE": "🔍",   # Textual Entailment
    "SA": "😊",   # Sentiment Analysis
    "HS": "⚠️",   # Hate Speech
    "AT": "🏥",   # Admission Test
    "WIC": "🤔",  # Word in Context
    "FAQ": "❓",  # Frequently Asked Questions
    "LS": "🔄",   # Lexical Substitution
    "SU": "📝",   # Summarization
    "NER": "🏷️",  # Named Entity Recognition
    "REL": "🔗",  # Relation Extraction
}
TASK_NAMES = {
    "TE": "Textual Entailment",
    "SA": "Sentiment Analysis",
    "HS": "Hate Speech",
    "AT": "Admission Test",
    "WIC": "Word in Context",
    "FAQ": "Frequently Asked Questions",
    "LS": "Lexical Substitution",
    "SU": "Summarization",
    "NER": "Named Entity Recognition",
    "REL": "Relation Extraction",
}
# Tooltip descriptions for each task
TASK_TOOLTIPS = {
    "TE": "Identify logical relationships between two text segments.",
    "SA": "Classify the sentiment (positive, negative, neutral) of a text.",
    "HS": "Detect hate speech in a text.",
    "AT": "Classify whether a clinical statement pertains to an admission test.",
    "WIC": "Identify words in context and their meaning.",
    "FAQ": "Answer frequently asked questions based on given text.",
    "LS": "Identify alternative words in a given context.",
    "SU": "Summarize long text into a shorter version.",
    "NER": "Identify named entities (e.g., persons, locations, organizations) in text.",
    "REL": "Extract and link laboratory test results to the respective tests in clinical narratives.",
}
def restart_space():
    """Restart the Hugging Face Space."""
    API.restart_space(repo_id=REPO_ID)


def download_snapshot(repo, local_dir):
    """Try to download a snapshot from the Hugging Face Hub, restarting the Space on failure."""
    try:
        print(f"Downloading from {repo} to {local_dir}...")
        snapshot_download(
            repo_id=repo, local_dir=local_dir, repo_type="dataset",
            tqdm_class=None, etag_timeout=30, token=TOKEN,
        )
    except Exception as e:
        print(f"Error downloading {repo}: {e}")
        restart_space()
# Space initialization
download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)

# Load leaderboard and evaluation queue data
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """Initialize a leaderboard with specific columns."""
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="Few-Shot Learning (FS)"),
            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
            # ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
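# Per-task metrics are stored with a task prefix (e.g. "TE Prompt Average");
# renaming them to generic names lets every task tab reuse the same column layout.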
def prepare_leaderboard_df(df, task_prefix):
    """Rename columns for a specific task to a standard format."""
    return df.rename(columns={
        f"{task_prefix} Prompt Average": "Prompt Average",
        f"{task_prefix} Best Prompt": "Best Prompt",
        f"{task_prefix} Best Prompt Id": "Best Prompt Id",
        task_prefix: "Combined Performance",
    })
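# Build the Gradio UI: intro, main leaderboard tab, About tab, one tab per task,
# and a collapsible citation section.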
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:

        # Main leaderboard tab
        with gr.TabItem("🏅 EVALITA-LLM Benchmark", elem_id="llm-benchmark-tab-table"):
            leaderboard = init_leaderboard(
                LEADERBOARD_DF,
                default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️",
                                   "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in
                                ['FS', 'Model', "Avg. Combined Performance ⬆️",
                                 "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
            )

        # About tab
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
        '''
        # Submission tab
        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table"):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

            for queue_name, queue_df in [
                ("✅ Finished Evaluations", finished_eval_queue_df),
                ("🔄 Running Evaluation Queue", running_eval_queue_df),
                ("⏳ Pending Evaluation Queue", pending_eval_queue_df),
            ]:
                with gr.Accordion(f"{queue_name} ({len(queue_df)})", open=False):
                    gr.components.Dataframe(value=queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5)

            gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

            with gr.Row():
                model_name_textbox = gr.Textbox(label="Model name")
                revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                model_type = gr.Dropdown(choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                                         label="Model type", multiselect=False, interactive=True)
                precision = gr.Dropdown(choices=[i.value.name for i in Precision if i != Precision.Unknown],
                                        label="Precision", multiselect=False, value="float16", interactive=True)
                weight_type = gr.Dropdown(choices=[i.value.name for i in WeightType],
                                          label="Weights type", multiselect=False, value="Original", interactive=True)
                base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [model_name_textbox, base_model_name_textbox, revision_name_textbox, precision, weight_type, model_type],
                submission_result,
            )
        '''
        # Task-specific leaderboards
        for task in ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]:
            with gr.TabItem(f"{TASK_ICONS[task]}{task}", elem_id="llm-benchmark-tab-table"):
                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
                gr.Markdown(task_description, elem_classes="markdown-text")
                gr.Markdown(MEASURE_DESCRIPTION, elem_classes="markdown-text")

                leaderboard = init_leaderboard(
                    prepare_leaderboard_df(LEADERBOARD_DF, task),
                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
                    hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in
                                    ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
                )
    # Citation section
    with gr.Accordion("📙 Citation", open=False):
        gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
# Background job to restart the Space every 30 minutes; each restart re-runs the
# snapshot downloads above, keeping the leaderboard in sync with the results repo.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

demo.queue(default_concurrency_limit=40).launch()