import gradio as gr
import time
from apscheduler.schedulers.background import BackgroundScheduler
import threading

import globals
from utils.io import save_results, load_results, load_models_providers, get_results_table, load_models_providers_str, get_summary_stats
from utils.jobs import run_single_job, run_multiple_jobs, launch_jobs, update_job_statuses, relaunch_failed_jobs
from typing import List, Optional


def status_monitor() -> None:
    """Background thread to monitor job statuses."""
    while True:
        update_job_statuses()
        time.sleep(240)  # Check every 4 minutes


def daily_checkpoint() -> None:
    """Daily checkpoint - save current state."""
    print("Daily checkpoint - saving current state")
    save_results()


# Create Gradio interface
def create_app() -> gr.Blocks:
    with gr.Blocks(title="Inference Provider Testing Dashboard") as demo:
        with gr.Tab("Main"):
            gr.Markdown("# Inference Provider Testing Dashboard")
            gr.Markdown("Launch and monitor evaluation jobs for multiple models and providers.")

            # Manual job launch section
            with gr.Row():
                with gr.Column(scale=2):
                    model_input = gr.Textbox(
                        label="Model",
                        placeholder="e.g., meta-llama/Llama-3.3-70B-Instruct",
                        info="Enter HuggingFace model ID"
                    )
                with gr.Column(scale=1):
                    provider_input = gr.Textbox(
                        label="Provider",
                        placeholder="e.g., together-ai",
                        info="Enter inference provider name"
                    )
                with gr.Column(scale=1):
                    launch_single_btn = gr.Button("Launch Job", variant="primary")

            # Batch action buttons
            with gr.Row():
                launch_btn = gr.Button("Launch All Jobs", variant="secondary", scale=2)
                relaunch_failed_btn = gr.Button("Relaunch Failed", variant="stop", scale=1)
                refresh_btn = gr.Button("🔄 Refresh", variant="secondary", scale=1)

            output = gr.Textbox(label="Status", interactive=False)

            # Summary statistics
            summary_stats = gr.Markdown(value=get_summary_stats())

            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Job Results")
                    results_table = gr.Dataframe(
                        value=get_results_table(),
                        interactive=True,
                        show_search="search",
                        show_copy_button=True,
                        show_fullscreen_button=True,
                        wrap=True,
                        static_columns=list(range(11)),
                        datatype=["str", "str", "str", "str", "str", "str", "str", "str", "str", "str", "html", "str"],
                        elem_id="results_table"
                    )

            # Event handlers
            def launch_single_and_update(model: str, provider: str):
                """Launch multiple jobs for a model-provider combination and return updated table and stats."""
                if not model or not provider:
                    return "❌ Please provide both model and provider", get_results_table(), get_summary_stats()
                job_ids = run_multiple_jobs(model, provider, globals.TASKS)
                if not job_ids:
                    return "❌ Failed to launch jobs (may already be running)", get_results_table(), get_summary_stats()
                save_results()
                return f"✅ Launched {len(job_ids)} jobs for {model} on {provider}", get_results_table(), get_summary_stats()

            launch_single_btn.click(
                fn=launch_single_and_update,
                inputs=[model_input, provider_input],
                outputs=[output, results_table, summary_stats]
            )

            def launch_and_update():
                """Launch jobs and return updated table and stats."""
                result = launch_jobs()
                return result, get_results_table(), get_summary_stats()

            def relaunch_and_update():
                """Relaunch failed jobs and return updated table and stats."""
                result = relaunch_failed_jobs()
                return result, get_results_table(), get_summary_stats()

            launch_btn.click(
                fn=launch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            relaunch_failed_btn.click(
                fn=relaunch_and_update,
                outputs=[output, results_table, summary_stats]
            )

            def refresh_display():
                """Refresh the table and stats display."""
                return get_results_table(), get_summary_stats()

            refresh_btn.click(
                fn=refresh_display,
                outputs=[results_table, summary_stats]
            )

            # Handle dataframe cell selection for relaunch
            def handle_table_select(evt: gr.SelectData):
                """Handle when a cell in the results table is clicked."""
                print(f"[Relaunch] Cell selected - Row: {evt.index[0]}, Col: {evt.index[1]}, Value: {evt.value}")
                # If we selected a "rerun" cell, we relaunch a job
                if evt.index[1] == 11:
                    # Get the full row data from the dataframe
                    df = get_results_table()
                    row_data = df.data.iloc[evt.index[0]]
                    model = row_data['Model']
                    provider = row_data['Provider']
                    print(f"[Relaunch] Relaunching {globals.NUM_RUNS_PER_JOB} jobs - Model: {model}, Provider: {provider}")
                    run_multiple_jobs(model, provider, globals.TASKS)
                    # Save after relaunch
                    save_results()
                # Then update the table and stats
                return get_results_table(), get_summary_stats()

            results_table.select(
                fn=handle_table_select,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

            # Auto-refresh table and stats every 30 seconds
            def auto_refresh():
                """Auto-refresh table and summary stats."""
                return get_results_table(), get_summary_stats()

            # Create a timer for auto-refresh
            timer = gr.Timer(value=30, active=True)
            timer.tick(
                fn=auto_refresh,
                inputs=[],
                outputs=[results_table, summary_stats]
            )

        with gr.Tab("About"):
            gr.Markdown("""
            In this demo, we run 10 samples each of 3 evaluations: ifeval (instruction following), gsm_plus (grade-school math problems, less contaminated than gsm8k), and the diamond subset of gpqa (knowledge), using `lighteval`, `inference-providers` and `jobs`.

            The "status" column indicates whether the evaluation failed completely (usually because the provider was down or because we were rate limited).

            To run any of these evaluations yourself, you can use the following:
            ```python
            from huggingface_hub import run_job, inspect_job, whoami

            job = run_job(
                image="hf.co/spaces/OpenEvals/EvalsOnTheHub",
                command=[
                    "lighteval", "endpoint", "inference-providers",
                    "model_name=MODEL,provider=PROVIDER",
                    "extended|ifeval|0,lighteval|gpqa:diamond|0",
                    "--push-to-hub", "--save-details",
                    "--results-org", "YOURORG"
                ],
                namespace="huggingface",
                secrets={"HF_TOKEN": YOURTOKEN},
                token=YOURTOKEN
            )
            ```
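
            The example also imports `inspect_job`, which you can use to poll a launched job until it finishes. A minimal sketch, assuming the object returned by `run_job` exposes an `id` and that its status is readable as `status.stage` (field and stage names may differ between `huggingface_hub` versions):

            ```python
            import time
            from huggingface_hub import inspect_job

            # Poll every 30 seconds until the job leaves the pending/running stages
            # (the stage names below are assumptions, check your huggingface_hub version).
            while inspect_job(job_id=job.id).status.stage in ("PENDING", "RUNNING"):
                time.sleep(30)
            print(inspect_job(job_id=job.id).status.stage)
            ```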
| """) | |
| return demo | |


if __name__ == "__main__":
    # Load previous results
    load_results()
    print("Starting Inference Provider Testing Dashboard")

    # Start status monitor thread
    monitor_thread = threading.Thread(target=status_monitor, daemon=True)
    monitor_thread.start()
    print("Job status monitor started")

    # Start APScheduler for daily checkpoint
    scheduler = BackgroundScheduler()
    scheduler.add_job(daily_checkpoint, 'cron', hour=0, minute=0)  # Run at midnight
    scheduler.start()
    print("Daily checkpoint scheduler started (saves at 00:00)")

    # Create and launch the Gradio interface
    demo = create_app()
    demo.launch(server_name="0.0.0.0", server_port=7860)