# ui/csv_tab.py
"""
Builds the CSV-upload tab (batch metrics).

- Summary table: **only global scores** (no S/O/A/P). Labels are short (e.g., "BLEU", not "BLEU GLOBAL").
- Detailed table: shows only global F1 columns (colored) and, when available, dark badges for P/R.
- CSV export includes whatever columns the backend produced; UI renders only the globals.
- Upload "Status" is collapsed into the file input's label.
- Errors (missing CSV, columns not chosen, etc.) are displayed in the status textbox under "Run Evaluation".
"""

import os
import time
import tempfile

import gradio as gr
import pandas as pd

from metrics import compute_all_metrics_batch, BERT_FRIENDLY_TO_MODEL
from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup
from utils.file_utils import smart_read_csv
from utils.colors_utils import get_metric_color
from utils.tokenizer_refgen import generate_diff_html


# ------------------- Summary HTML builder (GLOBAL ONLY) -------------------

def build_summary_html(result_df: pd.DataFrame, selected_metrics: list, bert_models: list | None = None) -> str:
    def safe_stats(col):
        if col not in result_df.columns:
            return None
        s = result_df[col].dropna()
        if s.empty:
            return None
        s = s.astype(float)
        avg, mn, mx = s.mean(), s.min(), s.max()

        def audio_id_for(v):
            subset = result_df[result_df[col].astype(float) == v]
            if not subset.empty and "code_audio_transcription" in subset.columns:
                aid = subset.iloc[0]["code_audio_transcription"]
                try:
                    return int(aid)
                except Exception:
                    return aid
            return ""

        return {"avg": avg, "min": mn, "min_id": audio_id_for(mn), "max": mx, "max_id": audio_id_for(mx)}

    rows = []

    # NOTE: We used to show per-section rows (S/O/A/P) when a single metric was selected.
    # That logic has been **removed**; we now present **only global** rows for all metrics.
    if "BLEU" in selected_metrics:
        s = safe_stats("bleu_global")
        if s:
            rows.append(("bleu_global", s))
    if "BLEURT" in selected_metrics:
        s = safe_stats("bleurt_global")
        if s:
            rows.append(("bleurt_global", s))
    if "ROUGE" in selected_metrics:
        s = safe_stats("rougeL_global_f1")
        if s:
            rows.append(("rougeL_global_f1", s))

    # BERTScore (global only)
    if "BERTSCORE" in selected_metrics and bert_models:
        # NOTE: Previously, if only BERTScore with one model was selected, we added per-section rows.
        # That behavior is **disabled**. We only show global columns:
        #   - bertscore_<model>_f1 (multi-model)
        #   - or bertscore_global_f1 (if that's what the backend produced)
        for friendly in bert_models:
            mid = BERT_FRIENDLY_TO_MODEL.get(friendly)
            if not mid:
                continue
            short = mid.split("/")[-1].replace("-", "_")
            col = f"bertscore_{short}_f1" if f"bertscore_{short}_f1" in result_df.columns else "bertscore_global_f1"
            s = safe_stats(col)
            if s:
                rows.append((col, s))

    if not rows:
        return "<div>No summary available.</div>"

    # Build HTML table
    html = """
    <div>
      <div style="font-weight:600;margin-bottom:6px;">Summary Statistics</div>
      <table style="border-collapse:collapse;width:100%;">
        <tr>
          <th style="text-align:left;padding:4px 8px;">Metric</th>
          <th style="text-align:left;padding:4px 8px;">Avg</th>
          <th style="text-align:left;padding:4px 8px;">Min (ID)</th>
          <th style="text-align:left;padding:4px 8px;">Max (ID)</th>
        </tr>
    """
    for col, stat in rows:
        # Pretty names (drop "GLOBAL")
        if col == "bleu_global":
            name = "BLEU"
        elif col == "bleurt_global":
            name = "BLEURT"
        elif col == "rougeL_global_f1":
            name = "ROUGE-L"
        elif col.startswith("bertscore_"):
            if col == "bertscore_global_f1":
                name = "BERTSCORE"
            else:
                label = " ".join(col.split("_")[1:-1]).upper()
                name = f"BERTSCORE {label}" if label else "BERTSCORE"
        else:
            name = col.replace("_", " ").upper()

        avg = f"{stat['avg']:.4f}"
        mn = f"{stat['min']:.4f} ({stat['min_id']})" if stat['min_id'] != "" else f"{stat['min']:.4f}"
        mx = f"{stat['max']:.4f} ({stat['max_id']})" if stat['max_id'] != "" else f"{stat['max']:.4f}"

        # Color scale by metric family (F1)
        if col.startswith("bleu_"):
            ca, cm, cx = get_metric_color(stat['avg'], "BLEU"), get_metric_color(stat['min'], "BLEU"), get_metric_color(stat['max'], "BLEU")
        elif col.startswith("bleurt_"):
            ca, cm, cx = get_metric_color(stat['avg'], "BLEURT"), get_metric_color(stat['min'], "BLEURT"), get_metric_color(stat['max'], "BLEURT")
        elif col.startswith("rougeL_"):
            ca, cm, cx = get_metric_color(stat['avg'], "ROUGE"), get_metric_color(stat['min'], "ROUGE"), get_metric_color(stat['max'], "ROUGE")
        else:
            ca, cm, cx = get_metric_color(stat['avg'], "BERTSCORE"), get_metric_color(stat['min'], "BERTSCORE"), get_metric_color(stat['max'], "BERTSCORE")

        html += f"""
        <tr>
          <td style="padding:4px 8px;">{name}</td>
          <td style="padding:4px 8px;background:{ca};">{avg}</td>
          <td style="padding:4px 8px;background:{cm};">{mn}</td>
          <td style="padding:4px 8px;background:{cx};">{mx}</td>
        </tr>
        """
    html += "</table></div>"
    return html
# ------------------- Detailed table (GLOBAL ONLY, F1 colored + dark P/R badges) -------------------

def render_results_table_html(result_df: pd.DataFrame) -> str:
    if result_df is None or result_df.empty:
        return "<div>No results.</div>"

    # Keep only *global* F1 columns (skip *_p/_r and any S/O/A/P)
    def is_global_f1(col: str) -> bool:
        if col == "code_audio_transcription":
            return False
        if col.endswith("_p") or col.endswith("_r"):
            return False
        if col.startswith("bleu_"):
            return col == "bleu_global"
        if col.startswith("bleurt_"):
            return col == "bleurt_global"
        if col.startswith("rougeL_"):
            return col == "rougeL_global_f1"
        if col.startswith("bertscore_"):
            parts = col.split("_")
            # Exclude per-section: bertscore_S_f1, etc.
            if len(parts) >= 2 and parts[1] in {"S", "O", "A", "P"}:
                return False
            # Allow model-specific or "bertscore_global_f1"
            return parts[-1] == "f1" or col == "bertscore_global_f1"
        return False

    f1_cols = [c for c in result_df.columns if is_global_f1(c)]

    # Sort for readability: BLEU, BLEURT, ROUGE-L, BERTSCORE (...)
    def _grp_key(col):
        if col.startswith("bleu_"):
            g = 0
        elif col.startswith("bleurt_"):
            g = 1
        elif col.startswith("rougeL_"):
            g = 2
        elif col.startswith("bertscore_"):
            g = 3
        else:
            g = 9
        return (g, col)

    f1_cols = sorted(f1_cols, key=_grp_key)

    # HTML table
    html = [
        "<div>",
        "<div style='font-weight:600;margin-bottom:6px;'>Individual Results</div>",
        "<table style='border-collapse:collapse;width:100%;'>",
        "<thead>",
        "<tr><th style='text-align:left;padding:4px 8px;'>ID</th>",
    ]

    def pretty_header(col: str) -> str:
        if col == "bleu_global":
            return "BLEU"
        if col == "bleurt_global":
            return "BLEURT"
        if col == "rougeL_global_f1":
            return "ROUGE-L"
        if col.startswith("bertscore_"):
            if col == "bertscore_global_f1":
                return "BERTSCORE"
            label = " ".join(col.split("_")[1:-1]).upper()
            return f"BERTSCORE {label}" if label else "BERTSCORE"
        return col.replace("_", " ").upper()

    for col in f1_cols:
        html.append(
            f"<th style='text-align:left;padding:4px 8px;'>{pretty_header(col)}</th>"
        )
    html.append("</tr></thead><tbody>")

    for _, row in result_df.iterrows():
        rid = row.get("code_audio_transcription", "")
        try:
            rid = int(rid)
        except Exception:
            pass
        html.append("<tr>")
        html.append(f"<td style='padding:4px 8px;'>{rid}</td>")
        for col in f1_cols:
            val = row.get(col, None)

            # figure metric family & pick P/R columns accordingly
            metric_kind = "BERTSCORE"
            p_text = r_text = ""
            if col.startswith("bleu_"):
                metric_kind = "BLEU"  # BLEU: no P/R
            elif col.startswith("bleurt_"):
                metric_kind = "BLEURT"
            elif col.startswith("rougeL_"):
                metric_kind = "ROUGE"
                base = "rougeL_global"  # global root
                pcol, rcol = f"{base}_p", f"{base}_r"
                p = row.get(pcol, None)
                r = row.get(rcol, None)
                p_text = f"P: {p:.4f}" if isinstance(p, (int, float)) else ""
                r_text = f"R: {r:.4f}" if isinstance(r, (int, float)) else ""
            elif col.startswith("bertscore_"):
                metric_kind = "BERTSCORE"
                # try model-specific first
                base = col[:-3] if col.endswith("_f1") else col  # strip trailing _f1
                pcol, rcol = f"{base}_p", f"{base}_r"
                if pcol not in result_df.columns and rcol not in result_df.columns:
                    # fallback to "bertscore_global" naming
                    pcol, rcol = "bertscore_global_p", "bertscore_global_r"
                p = row.get(pcol, None)
                r = row.get(rcol, None)
                p_text = f"P: {p:.4f}" if isinstance(p, (int, float)) else ""
                r_text = f"R: {r:.4f}" if isinstance(r, (int, float)) else ""

            if isinstance(val, (int, float)):
                bg = get_metric_color(float(val), metric_kind)
                val_text = f"{float(val):.4f}"
            else:
                bg = "transparent"
                val_text = "—"

            # Dark badges for P/R
            pills = []
            if p_text:
                pills.append("<span style='background:#222;color:#fff;border-radius:8px;padding:1px 6px;margin-right:4px;font-size:0.75em;'>"
                             f"{p_text}</span>")
            if r_text:
                pills.append("<span style='background:#222;color:#fff;border-radius:8px;padding:1px 6px;font-size:0.75em;'>"
                             f"{r_text}</span>")
            badges = ""
            if pills:
                badges = "<div style='margin-top:2px;'>" + "".join(pills) + "</div>"

            html.append(
                f"<td style='padding:4px 8px;background:{bg};'>"
                f"{val_text}{badges}</td>"
            )
        html.append("</tr>")
    html.append("</tbody></table></div>")
    return "".join(html)
""") gr.Markdown("# RESULTS") # Emphasize the run button gr.HTML(""" """) # 4) Run Evaluation (+ Export control) with gr.Row(): run_btn = gr.Button("🚀 Run Evaluation", variant="primary", elem_id="run-eval-btn") download_btn = gr.DownloadButton(label="⬇️ Export full results (CSV)", visible=False) # This Text box will display both success and error messages output_status = gr.Text() summary_output = gr.HTML() table_output = gr.HTML() # 5) Inspect example gr.Markdown("### Inspect an Example") gr.Markdown("Pick an example by its ID to view the reference vs generated text with token-level differences highlighted.") with gr.Accordion("🔍 Show reference & generated text", open=False): pick_id = gr.Dropdown(label="Pick an Audio ID", choices=[]) ref_disp = gr.Textbox(label="Reference Text", lines=6, interactive=False) gen_disp = gr.Textbox(label="Generated Text", lines=6, interactive=False) diff_disp= gr.HTML() # ---- Handlers ---- def handle_upload(f): if not f: # reset label & hide mapping return ( None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]), gr.update(visible=False), gr.update(label="Upload CSV") ) df = smart_read_csv(f.name) cols = list(df.columns) return ( df, gr.update(choices=cols, value=None), gr.update(choices=cols, value=None), gr.update(choices=cols, value=None), gr.update(visible=True), gr.update(label="Upload CSV — OK: selecione as colunas.") ) def run_batch(df, r, g, i, mets, berts): # Pre-flight validation: CSV uploaded? if df is None: return ( "Erro: por favor faça upload de um CSV e selecione as colunas.", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Columns chosen? if not r or not g or not i: return ( "Erro: selecione as colunas de Reference, Generated e Audio ID.", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Columns exist? missing = [c for c in [i, r, g] if c not in df.columns] if missing: return ( f"Erro: as colunas não existem no CSV: {missing}", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Metrics chosen? 
if not mets: return ( "Erro: selecione pelo menos uma métrica.", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Rename into standard schema (this is what we'll use for "Inspect an Example") try: sub = df[[i, r, g]].rename( columns={i: "code_audio_transcription", r: "dsc_reference_free_text", g: "dsc_generated_clinical_report"} ) except Exception as e: return ( f"Erro ao preparar dados: {e}", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Compute metrics try: result = compute_all_metrics_batch( sub, mets, berts if "BERTSCORE" in (mets or []) else None ) except Exception as e: return ( f"Erro ao calcular métricas: {e}", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Normalize IDs for dropdown try: raw_ids = result["code_audio_transcription"].dropna().unique().tolist() ids = [] for x in raw_ids: try: ids.append(int(x)) except Exception: ids.append(x) ids = sorted(ids, key=lambda z: (not isinstance(z, int), z)) except Exception: ids = [] # Build HTML views try: summary = build_summary_html(result, mets, berts if "BERTSCORE" in (mets or []) else None) table = render_results_table_html(result) except Exception as e: return ( f"Erro ao renderizar resultados: {e}", "", "", gr.update(choices=ids, value=None), None, None, gr.update(visible=False) ) # Keep results for export & show download button # Also keep standardized pairs (sub) for the "Inspect an Example" view return ( "Métricas calculadas com sucesso.", summary, table, gr.update(choices=ids, value=None), result, sub, gr.update(visible=True), ) def show_example(pairs_df, audio_id): # Use the standardized pairs dataframe (id + reference + generated) if pairs_df is None or audio_id is None: return "", "", "" try: row = pairs_df[pairs_df["code_audio_transcription"] == audio_id] if row.empty: # Try float cast fallback for IDs that come as strings try: audio_id2 = float(audio_id) row = pairs_df[pairs_df["code_audio_transcription"] == audio_id2] except Exception: return "", "", "" if row.empty: return "", "", "" row = row.iloc[0] ref_txt = row["dsc_reference_free_text"] gen_txt = row["dsc_generated_clinical_report"] return ref_txt, gen_txt, generate_diff_html(ref_txt, gen_txt) except Exception: return "", "", "" def _export_results_csv(df: pd.DataFrame | None) -> str: # Always export with comma separator; include ALL columns that were computed if df is None or df.empty: tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv") with open(tmp.name, "w", encoding="utf-8") as f: f.write("no_data\n") return tmp.name ts = time.strftime("%Y%m%d_%H%M%S") tmp_path = os.path.join(tempfile.gettempdir(), f"automatic_metrics_{ts}.csv") df.to_csv(tmp_path, sep=",", index=False) return tmp_path # ---- Wiring ---- file_input.change( fn=handle_upload, inputs=[file_input], outputs=[state_df, ref_col, gen_col, id_col, mapping, file_input], # update label in place ) metric_selector.change( fn=lambda ms: gr.update(visible="BERTSCORE" in ms), inputs=[metric_selector], outputs=[bert_model_selector], ) run_btn.click( fn=run_batch, inputs=[state_df, ref_col, gen_col, id_col, metric_selector, bert_model_selector], outputs=[output_status, summary_output, table_output, pick_id, state_result, state_pairs, download_btn], ) # Use standardized pairs DF for example view (fixes KeyError on original DF) pick_id.change( fn=show_example, inputs=[state_pairs, pick_id], outputs=[ref_disp, gen_disp, diff_disp], ) download_btn.click( fn=_export_results_csv, inputs=[state_result], outputs=download_btn, # path 
returned; Gradio serves it ) return tab
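

# ------------------- Local preview (sketch) -------------------
# A minimal sketch for previewing this tab on its own, assuming the `metrics`,
# `ui.widgets`, and `utils` modules are importable from the working directory.
# In the full application the tab is presumably mounted inside a parent
# Blocks/TabbedInterface, so this guard is for illustration only.
if __name__ == "__main__":
    build_csv_tab().launch()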