Spaces:

gjoliveira
/

data-ai-llm-eval-app

Running

Guilherme

Improved UI ; removed per section scores ; Changed color scale

d906888 22 days ago

23.7 kB

	# ui/csv_tab.py
	"""
	Builds the CSV-upload tab (batch metrics).
	- Summary table: only global scores (no S/O/A/P). Labels are short (e.g., "BLEU", not "BLEU GLOBAL").
	- Detailed table: shows only global F1 columns (colored) and, when available, dark badges for P/R.
	- CSV export includes whatever columns the backend produced; UI renders only the globals.
	- Upload "Status" is collapsed into the file input's label.
	- Errors (missing CSV, columns not chosen, etc.) are displayed in the status textbox under "Run Evaluation".
	"""
	import os
	import time
	import tempfile
	import gradio as gr
	import pandas as pd

	from metrics import compute_all_metrics_batch, BERT_FRIENDLY_TO_MODEL
	from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup
	from utils.file_utils import smart_read_csv
	from utils.colors_utils import get_metric_color
	from utils.tokenizer_refgen import generate_diff_html


	# ------------------- Summary HTML builder (GLOBAL ONLY) -------------------
	def build_summary_html(result_df: pd.DataFrame, selected_metrics: list, bert_models: list \| None = None) -> str:
	def safe_stats(col):
	if col not in result_df.columns:
	return None
	s = result_df[col].dropna()
	if s.empty:
	return None
	s = s.astype(float)
	avg, mn, mx = s.mean(), s.min(), s.max()

	def audio_id_for(v):
	subset = result_df[result_df[col].astype(float) == v]
	if not subset.empty and "code_audio_transcription" in subset.columns:
	aid = subset.iloc[0]["code_audio_transcription"]
	try:
	return int(aid)
	except Exception:
	return aid
	return ""

	return {"avg": avg, "min": mn, "min_id": audio_id_for(mn), "max": mx, "max_id": audio_id_for(mx)}

	rows = []

	# NOTE: We used to show per-section rows (S/O/A/P) when a single metric was selected.
	# That logic has been removed; we now present only global rows for all metrics.

	if "BLEU" in selected_metrics:
	s = safe_stats("bleu_global")
	if s:
	rows.append(("bleu_global", s))
	if "BLEURT" in selected_metrics:
	s = safe_stats("bleurt_global")
	if s:
	rows.append(("bleurt_global", s))
	if "ROUGE" in selected_metrics:
	s = safe_stats("rougeL_global_f1")
	if s:
	rows.append(("rougeL_global_f1", s))

	# BERTScore (global only)
	if "BERTSCORE" in selected_metrics and bert_models:
	# NOTE: Previously, if only BERTScore with one model was selected, we added per-section rows.
	# That behavior is disabled. We only show global columns:
	# - bertscore_<short>_f1 (multi-model)
	# - or bertscore_global_f1 (if that's what backend produced)
	for friendly in bert_models:
	mid = BERT_FRIENDLY_TO_MODEL.get(friendly)
	if not mid:
	continue
	short = mid.split("/")[-1].replace("-", "_")
	col = f"bertscore_{short}_f1" if f"bertscore_{short}_f1" in result_df.columns else "bertscore_global_f1"
	s = safe_stats(col)
	if s:
	rows.append((col, s))

	if not rows:
	return "<div style='padding:8px;background:#1f1f1f;color:#eee;border-radius:6px;'>No summary available.</div>"

	# Build HTML table
	html = """
	<div style="margin-bottom:12px;overflow-x:auto;">
	<div style="font-weight:600;margin-bottom:4px;color:#f5f5f5;font-size:16px;">Summary Statistics</div>
	<table style="border-collapse:collapse;width:100%;font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,sans-serif;border-radius:8px;overflow:hidden;min-width:500px;">
	<thead><tr>
	<th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:left;font-weight:600;">Metric</th>
	<th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Avg</th>
	<th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Min (ID)</th>
	<th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Max (ID)</th>
	</tr></thead><tbody>
	"""

	for col, stat in rows:
	# Pretty names (drop "GLOBAL")
	if col == "bleu_global":
	name = "BLEU"
	elif col == "bleurt_global":
	name = "BLEURT"
	elif col == "rougeL_global_f1":
	name = "ROUGE-L"
	elif col.startswith("bertscore_"):
	if col == "bertscore_global_f1":
	name = "BERTSCORE"
	else:
	label = " ".join(col.split("_")[1:-1]).upper()
	name = f"BERTSCORE {label}" if label else "BERTSCORE"
	else:
	name = col.replace("_", " ").upper()

	avg = f"{stat['avg']:.4f}"
	mn = f"{stat['min']:.4f} ({stat['min_id']})" if stat['min_id'] != "" else f"{stat['min']:.4f}"
	mx = f"{stat['max']:.4f} ({stat['max_id']})" if stat['max_id'] != "" else f"{stat['max']:.4f}"

	# Color scale by metric family (F1)
	if col.startswith("bleu_"):
	ca, cm, cx = get_metric_color(stat['avg'], "BLEU"), get_metric_color(stat['min'], "BLEU"), get_metric_color(stat['max'], "BLEU")
	elif col.startswith("bleurt_"):
	ca, cm, cx = get_metric_color(stat['avg'], "BLEURT"), get_metric_color(stat['min'], "BLEURT"), get_metric_color(stat['max'], "BLEURT")
	elif col.startswith("rougeL_"):
	ca, cm, cx = get_metric_color(stat['avg'], "ROUGE"), get_metric_color(stat['min'], "ROUGE"), get_metric_color(stat['max'], "ROUGE")
	else:
	ca, cm, cx = get_metric_color(stat['avg'], "BERTSCORE"), get_metric_color(stat['min'], "BERTSCORE"), get_metric_color(stat['max'], "BERTSCORE")

	html += f"""
	<tr style="background:#0f1218;">
	<td style="padding:8px 12px;border:1px solid #2f3240;color:#fff;white-space:nowrap;">{name}</td>
	<td style="padding:8px 12px;border:1px solid #2f3240;background:{ca};color:#fff;text-align:center;white-space:nowrap;">{avg}</td>
	<td style="padding:8px 12px;border:1px solid #2f3240;background:{cm};color:#fff;text-align:center;white-space:nowrap;">{mn}</td>
	<td style="padding:8px 12px;border:1px solid #2f3240;background:{cx};color:#fff;text-align:center;white-space:nowrap;">{mx}</td>
	</tr>
	"""
	html += "</tbody></table></div>"
	return html


	# ------------------- Detailed table (GLOBAL ONLY, F1 colored + dark P/R badges) -------------------
	def render_results_table_html(result_df: pd.DataFrame) -> str:
	    """Render the per-row "Individual Results" HTML table (global scores only).

	    One column is rendered for each global score column found in ``result_df``
	    (``bleu_global``, ``bleurt_global``, ``rougeL_global_f1`` and BERTScore
	    ``*_f1`` columns that are not per-section S/O/A/P). The cell background is
	    chosen by ``get_metric_color``. For ROUGE-L and BERTScore, precision and
	    recall are shown as dark badges below the score when the matching
	    ``*_p`` / ``*_r`` values are numeric.

	    Args:
	        result_df: DataFrame with a ``code_audio_transcription`` ID column (optional)
	            and the computed metric columns.

	    Returns:
	        An HTML fragment with the table, or a "No results." notice when
	        ``result_df`` is None or empty.
	    """
	    # Function-scope imports so the block does not depend on new file-level imports.
	    from html import escape
	    from numbers import Real

	    if result_df is None or result_df.empty:
	        return "<div style='padding:8px;background:#1f1f1f;color:#eee;border-radius:6px;'>No results.</div>"

	    def as_score(value):
	        """Return ``value`` as a float when it is a real, non-NaN number, else None.

	        ``numbers.Real`` also accepts NumPy scalars (e.g. float32/int64), which a
	        plain ``isinstance(value, (int, float))`` check rejects.
	        """
	        if isinstance(value, Real) and not pd.isna(value):
	            return float(value)
	        return None

	    def is_global_f1(col: str) -> bool:
	        """True for the global score columns that get their own table column."""
	        if col == "code_audio_transcription":
	            return False
	        if col.endswith(("_p", "_r")):
	            return False
	        if col.startswith("bleu_"):
	            return col == "bleu_global"
	        if col.startswith("bleurt_"):
	            return col == "bleurt_global"
	        if col.startswith("rougeL_"):
	            return col == "rougeL_global_f1"
	        if col.startswith("bertscore_"):
	            parts = col.split("_")
	            # Exclude per-section columns such as bertscore_S_f1.
	            if len(parts) >= 2 and parts[1] in {"S", "O", "A", "P"}:
	                return False
	            # Model-specific columns and bertscore_global_f1 both end in "f1".
	            return parts[-1] == "f1"
	        return False

	    def family_rank(col: str):
	        """Sort key: BLEU, BLEURT, ROUGE-L, BERTSCORE, then anything else; ties by name."""
	        for rank, prefix in enumerate(("bleu_", "bleurt_", "rougeL_", "bertscore_")):
	            if col.startswith(prefix):
	                return (rank, col)
	        return (9, col)

	    def pretty_header(col: str) -> str:
	        """Short display label for a metric column (the word "GLOBAL" is dropped)."""
	        fixed = {"bleu_global": "BLEU", "bleurt_global": "BLEURT", "rougeL_global_f1": "ROUGE-L"}
	        if col in fixed:
	            return fixed[col]
	        if col.startswith("bertscore_"):
	            if col == "bertscore_global_f1":
	                return "BERTSCORE"
	            label = " ".join(col.split("_")[1:-1]).upper()
	            return f"BERTSCORE {label}" if label else "BERTSCORE"
	        return col.replace("_", " ").upper()

	    def describe(col: str):
	        """Return (metric family, precision column, recall column) for ``col``.

	        The P/R column names are None for metrics without precision/recall.
	        """
	        if col.startswith("bleu_"):
	            return "BLEU", None, None
	        if col.startswith("bleurt_"):
	            return "BLEURT", None, None
	        if col.startswith("rougeL_"):
	            return "ROUGE", "rougeL_global_p", "rougeL_global_r"
	        # BERTScore: try the model-specific P/R columns first.
	        base = col[:-3] if col.endswith("_f1") else col
	        pcol, rcol = f"{base}_p", f"{base}_r"
	        if pcol not in result_df.columns and rcol not in result_df.columns:
	            # Fall back to the "bertscore_global" naming.
	            pcol, rcol = "bertscore_global_p", "bertscore_global_r"
	        return "BERTSCORE", pcol, rcol

	    f1_cols = sorted((c for c in result_df.columns if is_global_f1(c)), key=family_rank)
	    # Per-column layout is the same for every row, so resolve it once up front.
	    layout = [(col,) + describe(col) for col in f1_cols]

	    html = [
	        "<div style='overflow-x:auto;'>",
	        "<div style='font-weight:600;margin:8px 0;color:#f5f5f5;font-size:16px;'>Individual Results</div>",
	        "<table style='border-collapse:collapse;width:100%;font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,sans-serif;border-radius:8px;overflow:hidden;'>",
	        "<thead><tr>",
	        "<th style='padding:8px 12px;background:#2d3748;color:#fff;text-align:left;font-weight:600;white-space:nowrap;'>ID</th>",
	    ]
	    html.extend(
	        f"<th style='padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;white-space:nowrap;'>{escape(pretty_header(col))}</th>"
	        for col in f1_cols
	    )
	    html.append("</tr></thead><tbody>")

	    for _, row in result_df.iterrows():
	        rid = row.get("code_audio_transcription", "")
	        try:
	            rid = int(rid)
	        except (TypeError, ValueError, OverflowError):
	            pass  # keep non-numeric IDs as they are
	        html.append("<tr style='background:#0f1218;'>")
	        # IDs come straight from the uploaded CSV, so escape them before embedding.
	        html.append(
	            f"<td style='padding:8px 12px;border:1px solid #2f3240;color:#fff;white-space:nowrap;'>{escape(str(rid))}</td>"
	        )

	        for col, metric_kind, pcol, rcol in layout:
	            score = as_score(row.get(col, None))
	            if score is None:
	                bg = "transparent"
	                val_text = "—"
	            else:
	                bg = get_metric_color(score, metric_kind)
	                val_text = f"{score:.4f}"

	            # Dark badges for precision / recall (only when a numeric value exists).
	            pills = []
	            p = as_score(row.get(pcol, None)) if pcol else None
	            r = as_score(row.get(rcol, None)) if rcol else None
	            if p is not None:
	                pills.append(
	                    "<span style='padding:1px 6px;border-radius:999px;background:rgba(0,0,0,.48);color:#fff;display:inline-block;'>"
	                    f"P: {p:.4f}</span>"
	                )
	            if r is not None:
	                pills.append(
	                    "<span style='padding:1px 6px;border-radius:999px;background:rgba(0,0,0,.48);color:#fff;display:inline-block;margin-left:6px;'>"
	                    f"R: {r:.4f}</span>"
	                )
	            badges = ""
	            if pills:
	                badges = "<div style='font-size:12px;margin-top:4px;line-height:1.2;'>" + "".join(pills) + "</div>"

	            html.append(
	                f"<td style='padding:8px 12px;border:1px solid #2f3240;background:{bg};color:#fff;text-align:center;white-space:nowrap;'>"
	                f"{val_text}{badges}</td>"
	            )
	        html.append("</tr>")

	    html.append("</tbody></table></div>")
	    return "".join(html)


	# ------------------- Tab builder -------------------
	def build_csv_tab():
	    """Build the CSV-upload tab (batch metrics) and return its ``gr.Blocks``.

	    The tab lets the user upload a CSV, map its reference / generated / ID
	    columns, pick metrics, run a batch evaluation, inspect individual examples
	    and export the full results as CSV. Success and error messages from the run
	    are shown in the status text box below the "Run Evaluation" button; the
	    upload status is shown in the file input's label.

	    Returns:
	        The ``gr.Blocks`` container holding the tab's components and event wiring.
	    """
	    with gr.Blocks() as tab:
	        state_df = gr.State()      # original uploaded DataFrame
	        state_pairs = gr.State()   # standardized pairs: id + reference + generated
	        state_result = gr.State()  # metrics result DataFrame for export

	        gr.Markdown("# RUN AN EXPERIMENT VIA CSV UPLOAD")
	        gr.Markdown(
	            "Upload a CSV of reference/generated text pairs, map the columns, pick metrics, and run a batch evaluation. \n "
	            "F1 is highlighted in color; Precision/Recall appear as small dark badges."
	        )

	        gr.Markdown("## Experiment Configuration")

	        # 1) Upload CSV (status collapsed into the label)
	        gr.Markdown("### Upload CSV")
	        gr.Markdown("Provide a CSV file containing your data. It should include columns for the reference text, the generated text, and an identifier (e.g., audio ID).")
	        with gr.Row():
	            file_input = gr.File(label="Upload CSV", file_types=[".csv"])

	        # 2) Map Columns
	        gr.Markdown("### Map Columns")
	        gr.Markdown("Select which columns in your CSV correspond to the reference text, generated text, and audio/example ID.")
	        with gr.Row(visible=False) as mapping:
	            ref_col = gr.Dropdown(label="Reference Column", choices=[])
	            gen_col = gr.Dropdown(label="Generated Column", choices=[])
	            id_col = gr.Dropdown(label="Audio ID Column", choices=[])

	        # 3) Select Metrics
	        gr.Markdown("### Select Metrics")
	        metric_selector = MetricCheckboxGroup()
	        bert_model_selector = BertCheckboxGroup()

	        # ---------- Divider before RESULTS ----------
	        gr.HTML("""<div style="height:1px;margin:22px 0;background:
	linear-gradient(90deg, rgba(0,0,0,0) 0%, #4a5568 35%, #4a5568 65%, rgba(0,0,0,0) 100%);"></div>""")

	        gr.Markdown("# RESULTS")

	        # Emphasize the run button
	        gr.HTML("""
	<style>
	#run-eval-btn button {
	background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%) !important;
	color: #fff !important;
	border: none !important;
	box-shadow: 0 6px 16px rgba(0,0,0,.25);
	}
	#run-eval-btn button:hover { filter: brightness(1.08); transform: translateY(-1px); }
	</style>
	""")

	        # 4) Run Evaluation (+ Export control)
	        with gr.Row():
	            run_btn = gr.Button("🚀 Run Evaluation", variant="primary", elem_id="run-eval-btn")
	            download_btn = gr.DownloadButton(label="⬇️ Export full results (CSV)", visible=False)

	        # This text box displays both success and error messages
	        output_status = gr.Text()
	        summary_output = gr.HTML()
	        table_output = gr.HTML()

	        # 5) Inspect example
	        gr.Markdown("### Inspect an Example")
	        gr.Markdown("Pick an example by its ID to view the reference vs generated text with token-level differences highlighted.")
	        with gr.Accordion("🔍 Show reference & generated text", open=False):
	            pick_id = gr.Dropdown(label="Pick an Audio ID", choices=[])
	            ref_disp = gr.Textbox(label="Reference Text", lines=6, interactive=False)
	            gen_disp = gr.Textbox(label="Generated Text", lines=6, interactive=False)
	            diff_disp = gr.HTML()

	        # ---- Handlers ----
	        def handle_upload(f):
	            """Load the uploaded CSV and populate the column-mapping dropdowns.

	            Returns updates for: state_df, ref_col, gen_col, id_col, mapping row,
	            and the file input (whose label doubles as the upload status).
	            """
	            if not f:
	                # Reset label & hide mapping
	                return (
	                    None,
	                    gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]),
	                    gr.update(visible=False),
	                    gr.update(label="Upload CSV"),
	                )
	            # Accept both a file-like object exposing .name and a plain path string.
	            path = getattr(f, "name", f)
	            try:
	                df = smart_read_csv(path)
	            except Exception as e:  # UI boundary: report the failure instead of crashing the handler
	                return (
	                    None,
	                    gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]),
	                    gr.update(visible=False),
	                    gr.update(label=f"Upload CSV — Erro ao ler o CSV: {e}"),
	                )
	            cols = list(df.columns)
	            return (
	                df,
	                gr.update(choices=cols, value=None),
	                gr.update(choices=cols, value=None),
	                gr.update(choices=cols, value=None),
	                gr.update(visible=True),
	                gr.update(label="Upload CSV — OK: selecione as colunas."),
	            )

	        def _failure(message, ids=None):
	            """Build the 7-output tuple for a failed run (message shown in the status box)."""
	            picker = gr.update(choices=[]) if ids is None else gr.update(choices=ids, value=None)
	            return (message, "", "", picker, None, None, gr.update(visible=False))

	        def run_batch(df, r, g, i, mets, berts):
	            """Validate the inputs, compute the metrics and render the result views.

	            Returns updates for: status text, summary HTML, table HTML, example-ID
	            dropdown, state_result, state_pairs and the download button.
	            """
	            # Pre-flight validation: CSV uploaded?
	            if df is None:
	                return _failure("Erro: por favor faça upload de um CSV e selecione as colunas.")

	            # Columns chosen?
	            if not r or not g or not i:
	                return _failure("Erro: selecione as colunas de Reference, Generated e Audio ID.")

	            # Columns exist?
	            missing = [c for c in [i, r, g] if c not in df.columns]
	            if missing:
	                return _failure(f"Erro: as colunas não existem no CSV: {missing}")

	            # Metrics chosen?
	            if not mets:
	                return _failure("Erro: selecione pelo menos uma métrica.")

	            # BERT models only matter when BERTScore is among the selected metrics.
	            bert_choice = berts if "BERTSCORE" in mets else None

	            # Rename into the standard schema (also used by "Inspect an Example")
	            try:
	                sub = df[[i, r, g]].rename(
	                    columns={i: "code_audio_transcription", r: "dsc_reference_free_text", g: "dsc_generated_clinical_report"}
	                )
	            except Exception as e:
	                return _failure(f"Erro ao preparar dados: {e}")

	            # Compute metrics
	            try:
	                result = compute_all_metrics_batch(sub, mets, bert_choice)
	            except Exception as e:
	                return _failure(f"Erro ao calcular métricas: {e}")

	            # Normalize IDs for the dropdown: ints where possible, raw value otherwise.
	            try:
	                raw_ids = result["code_audio_transcription"].dropna().unique().tolist()
	            except (KeyError, AttributeError, TypeError):
	                raw_ids = []
	            ids = []
	            for x in raw_ids:
	                try:
	                    ids.append(int(x))
	                except (TypeError, ValueError, OverflowError):
	                    ids.append(x)
	            # Ints first (numeric order), then everything else ordered by its text form,
	            # so mixed ID types can never make the sort itself fail.
	            ids.sort(key=lambda z: (0, z, "") if isinstance(z, int) else (1, 0, str(z)))

	            # Build HTML views
	            try:
	                summary = build_summary_html(result, mets, bert_choice)
	                table = render_results_table_html(result)
	            except Exception as e:
	                return _failure(f"Erro ao renderizar resultados: {e}", ids=ids)

	            # Keep results for export, keep standardized pairs for the example view,
	            # and reveal the download button.
	            return (
	                "Métricas calculadas com sucesso.",
	                summary,
	                table,
	                gr.update(choices=ids, value=None),
	                result,
	                sub,
	                gr.update(visible=True),
	            )

	        def show_example(pairs_df, audio_id):
	            """Return (reference text, generated text, diff HTML) for the picked ID.

	            Looks the ID up in the standardized pairs DataFrame; returns three empty
	            strings when nothing is selected or no row matches.
	            """
	            if pairs_df is None or audio_id is None:
	                return "", "", ""
	            try:
	                id_values = pairs_df["code_audio_transcription"]
	                matches = pairs_df[id_values == audio_id]
	                if matches.empty:
	                    # IDs stored as floats while the dropdown delivers a string.
	                    try:
	                        matches = pairs_df[id_values == float(audio_id)]
	                    except (TypeError, ValueError):
	                        pass
	                if matches.empty:
	                    # IDs stored as text (e.g. "12") while the dropdown offers ints.
	                    matches = pairs_df[id_values.astype(str) == str(audio_id)]
	                if matches.empty:
	                    return "", "", ""
	                first = matches.iloc[0]
	                ref_txt = first["dsc_reference_free_text"]
	                gen_txt = first["dsc_generated_clinical_report"]
	            except Exception:  # UI boundary: an unusable frame yields an empty view
	                return "", "", ""

	            # Missing texts are shown as empty strings rather than NaN.
	            ref_txt = "" if pd.isna(ref_txt) else str(ref_txt)
	            gen_txt = "" if pd.isna(gen_txt) else str(gen_txt)
	            try:
	                diff_html = generate_diff_html(ref_txt, gen_txt)
	            except Exception:  # best effort: still show the texts if the diff fails
	                diff_html = ""
	            return ref_txt, gen_txt, diff_html

	        def _export_results_csv(df: pd.DataFrame | None) -> str:
	            """Write the results to a temporary CSV file and return its path.

	            Always uses a comma separator and includes every computed column.
	            When there is nothing to export, the file contains the single line
	            ``no_data``.
	            """
	            if df is None or df.empty:
	                # The context manager closes the handle before the path is returned.
	                with tempfile.NamedTemporaryFile(
	                    "w", delete=False, suffix=".csv", encoding="utf-8"
	                ) as tmp:
	                    tmp.write("no_data\n")
	                return tmp.name
	            ts = time.strftime("%Y%m%d_%H%M%S")
	            # A fresh directory per export keeps the readable file name while ensuring
	            # two exports within the same second never overwrite each other.
	            out_dir = tempfile.mkdtemp(prefix="automatic_metrics_")
	            tmp_path = os.path.join(out_dir, f"automatic_metrics_{ts}.csv")
	            df.to_csv(tmp_path, sep=",", index=False)
	            return tmp_path

	        # ---- Wiring ----
	        file_input.change(
	            fn=handle_upload,
	            inputs=[file_input],
	            outputs=[state_df, ref_col, gen_col, id_col, mapping, file_input],  # update label in place
	        )
	        metric_selector.change(
	            # Guard against a None value so the handler cannot raise on an empty selection.
	            fn=lambda ms: gr.update(visible="BERTSCORE" in (ms or [])),
	            inputs=[metric_selector],
	            outputs=[bert_model_selector],
	        )
	        run_btn.click(
	            fn=run_batch,
	            inputs=[state_df, ref_col, gen_col, id_col, metric_selector, bert_model_selector],
	            outputs=[output_status, summary_output, table_output, pick_id, state_result, state_pairs, download_btn],
	        )
	        # Use the standardized pairs DataFrame for the example view
	        pick_id.change(
	            fn=show_example,
	            inputs=[state_pairs, pick_id],
	            outputs=[ref_disp, gen_disp, diff_disp],
	        )
	        download_btn.click(
	            fn=_export_results_csv,
	            inputs=[state_result],
	            outputs=download_btn,  # path returned; Gradio serves it
	        )

	    return tab