import gradio as gr
import time
from apscheduler.schedulers.background import BackgroundScheduler
import threading

import globals
from utils.io import save_results, load_results, load_models_providers, get_results_table, load_models_providers_str, get_summary_stats
from utils.jobs import run_single_job, run_multiple_jobs, launch_jobs, update_job_statuses, relaunch_failed_jobs
from typing import List, Optional


def status_monitor() -> None:
    """Background thread to monitor job statuses."""
    while True:
        update_job_statuses()
        time.sleep(240)  # Check every 4 minutes


def daily_checkpoint() -> None:
    """Daily checkpoint - save current state."""
    print("Daily checkpoint - saving current state")
    save_results()


# Create Gradio interface
def create_app() -> gr.Blocks:
    with gr.Blocks(title="Inference Provider Testing Dashboard") as demo:
        with gr.Tab("Main"):
            gr.Markdown("# Inference Provider Testing Dashboard")
            gr.Markdown("Launch and monitor evaluation jobs for multiple models and providers.")

            # Manual job launch section
            with gr.Row():
                with gr.Column(scale=2):
                    model_input = gr.Textbox(
                        label="Model",
                        placeholder="e.g., meta-llama/Llama-3.3-70B-Instruct",
                        info="Enter HuggingFace model ID"
                    )
                with gr.Column(scale=1):
                    provider_input = gr.Textbox(
                        label="Provider",
                        placeholder="e.g., together-ai",
                        info="Enter inference provider name"
                    )
                with gr.Column(scale=1):
                    launch_single_btn = gr.Button("Launch Job", variant="primary")

            # Batch action buttons
            with gr.Row():
                launch_btn = gr.Button("Launch All Jobs", variant="secondary", scale=2)
                relaunch_failed_btn = gr.Button("Relaunch Failed", variant="stop", scale=1)
                refresh_btn = gr.Button("🔄 Refresh", variant="secondary", scale=1)

            output = gr.Textbox(label="Status", interactive=False)

            # Summary statistics
            summary_stats = gr.Markdown(value=get_summary_stats())

            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Job Results")
                    results_table = gr.Dataframe(
                        value=get_results_table(),
                        interactive=True,
                        show_search="search",
                        show_copy_button=True,
                        show_fullscreen_button=True,
                        wrap=True,
                        static_columns=list(range(11)),
                        datatype=["str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "html", "str"],
                        elem_id="results_table"
                    )

            # Event handlers
            def launch_single_and_update(model: str, provider: str):
                """Launch multiple jobs for a model-provider combination and return updated table and stats."""
                if not model or not provider:
                    return "❌ Please provide both model and provider", get_results_table(), get_summary_stats()
                job_ids = run_multiple_jobs(model, provider, globals.TASKS)
                if not job_ids:
                    return "❌ Failed to launch jobs (may already be running)", get_results_table(), get_summary_stats()
                save_results()
                return f"✅ Launched {len(job_ids)} jobs for {model} on {provider}", get_results_table(), get_summary_stats()

            launch_single_btn.click(
                fn=launch_single_and_update,
                inputs=[model_input, provider_input],
                outputs=[output, results_table, summary_stats]
            )

            def launch_and_update():
                """Launch jobs and return updated table and stats."""
                result = launch_jobs()
                return result, get_results_table(), get_summary_stats()

            def relaunch_and_update():
                """Relaunch failed jobs and return updated table and stats."""
                result = relaunch_failed_jobs()
                return result, get_results_table(), get_summary_stats()

            launch_btn.click(
                fn=launch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            relaunch_failed_btn.click(
                fn=relaunch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            def refresh_display():
                """Refresh the table and stats display."""
                return get_results_table(), get_summary_stats()

            refresh_btn.click(
                fn=refresh_display,
                outputs=[results_table, summary_stats]
            )

            # Handle dataframe cell selection for relaunch
            def handle_table_select(evt: gr.SelectData):
                """Handle when a cell in the results table is clicked."""
                print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")
                # If we selected a "rerun" cell, we relaunch a job
                if evt.index[1] == 11:
                    # Get the full row data from the dataframe
                    df = get_results_table()
                    row_data = df.data.iloc[evt.index[0]]
                    model = row_data['Model']
                    provider = row_data['Provider']
                    print(f"[Relaunch] Relaunching {globals.NUM_RUNS_PER_JOB} jobs - Model: {model}, Provider: {provider}")
                    run_multiple_jobs(model, provider, globals.TASKS)
                    # Save after relaunch
                    save_results()
                # Then update the table and stats
                return get_results_table(), get_summary_stats()

            results_table.select(
                fn=handle_table_select,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

            # Auto-refresh table and stats every 30 seconds
            def auto_refresh():
                """Auto-refresh table and summary stats."""
                return get_results_table(), get_summary_stats()

            # Create a timer for auto-refresh
            timer = gr.Timer(value=30, active=True)
            timer.tick(
                fn=auto_refresh,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

        with gr.Tab("About"):
            gr.Markdown("""
            In this demo, we run 10 samples each of 3 evaluations: ifeval (instruction following), gsm_plus (grade-school math problems, less contaminated than gsm8k), and the diamond subset of gpqa (knowledge), using `lighteval`, `inference-providers` and `jobs`.

            The "status" column indicates whether the evaluation failed completely (usually because the provider was down or because we were rate limited).

            To run any of these evaluations yourself, you can use the following:
            ```python
            from huggingface_hub import run_job, inspect_job, whoami

            job = run_job(
                image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
                command=[
                    "lighteval", "endpoint", "inference-providers",
                    "model_name=MODEL,provider=PROVIDER",
                    "extended|ifeval|0,lighteval|gpqa:diamond|0",
                    "--push-to-hub", "--save-details",
                    "--results-org", "YOURORG"
                ],
                namespace="huggingface",
                secrets={"HF_TOKEN": YOURTOKEN},
                token=YOURTOKEN
            )
            ```
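
            The example also imports `inspect_job`, which you can use to poll a launched job until it finishes. A minimal sketch, assuming the object returned by `run_job` exposes an `id` and that its status is readable as `status.stage` (field and stage names may differ between `huggingface_hub` versions):

            ```python
            import time
            from huggingface_hub import inspect_job

            # Poll every 30 seconds until the job leaves the pending/running stages
            # (the stage names below are assumptions, check your huggingface_hub version).
            while inspect_job(job_id=job.id).status.stage in ("PENDING", "RUNNING"):
                time.sleep(30)
            print(inspect_job(job_id=job.id).status.stage)
            ```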
| """) | |
| return demo | |


if __name__ == "__main__":
    # Load previous results
    load_results()
    print("Starting Inference Provider Testing Dashboard")

    # Start status monitor thread
    monitor_thread = threading.Thread(target=status_monitor, daemon=True)
    monitor_thread.start()
    print("Job status monitor started")

    # Start APScheduler for daily checkpoint
    scheduler = BackgroundScheduler()
    scheduler.add_job(daily_checkpoint, 'cron', hour=0, minute=0)  # Run at midnight
    scheduler.start()
    print("Daily checkpoint scheduler started (saves at 00:00)")

    # Create and launch the Gradio interface
    demo = create_app()
    demo.launch(server_name="0.0.0.0", server_port=7860)