# ui/csv_tab.py
"""
Builds the CSV-upload tab (batch metrics).

- Summary table: **only global scores** (no S/O/A/P). Labels are short (e.g., "BLEU", not "BLEU GLOBAL").
- Detailed table: shows only global F1 columns (colored) and, when available, dark badges for P/R.
- CSV export includes whatever columns the backend produced; UI renders only the globals.
- Upload "Status" is collapsed into the file input's label.
- Errors (missing CSV, columns not chosen, etc.) are displayed in the status textbox under "Run Evaluation".
"""

import os
import time
import tempfile

import gradio as gr
import pandas as pd

from metrics import compute_all_metrics_batch, BERT_FRIENDLY_TO_MODEL
from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup
from utils.file_utils import smart_read_csv
from utils.colors_utils import get_metric_color
from utils.tokenizer_refgen import generate_diff_html


# ------------------- Summary HTML builder (GLOBAL ONLY) -------------------

def build_summary_html(result_df: pd.DataFrame, selected_metrics: list, bert_models: list | None = None) -> str:
    def safe_stats(col):
        if col not in result_df.columns:
            return None
        s = result_df[col].dropna()
        if s.empty:
            return None
        s = s.astype(float)
        avg, mn, mx = s.mean(), s.min(), s.max()

        def audio_id_for(v):
            subset = result_df[result_df[col].astype(float) == v]
            if not subset.empty and "code_audio_transcription" in subset.columns:
                aid = subset.iloc[0]["code_audio_transcription"]
                try:
                    return int(aid)
                except Exception:
                    return aid
            return ""

        return {"avg": avg, "min": mn, "min_id": audio_id_for(mn), "max": mx, "max_id": audio_id_for(mx)}

    rows = []

    # NOTE: We used to show per-section rows (S/O/A/P) when a single metric was selected.
    # That logic has been **removed**; we now present **only global** rows for all metrics.
    if "BLEU" in selected_metrics:
        s = safe_stats("bleu_global")
        if s:
            rows.append(("bleu_global", s))
    if "BLEURT" in selected_metrics:
        s = safe_stats("bleurt_global")
        if s:
            rows.append(("bleurt_global", s))
    if "ROUGE" in selected_metrics:
        s = safe_stats("rougeL_global_f1")
        if s:
            rows.append(("rougeL_global_f1", s))

    # BERTScore (global only)
    if "BERTSCORE" in selected_metrics and bert_models:
        # NOTE: Previously, if only BERTScore with one model was selected, we added per-section rows.
        # That behavior is **disabled**. We only show global columns:
        #   - bertscore_<model>_f1 (multi-model)
        #   - or bertscore_global_f1 (if that's what the backend produced)
        for friendly in bert_models:
            mid = BERT_FRIENDLY_TO_MODEL.get(friendly)
            if not mid:
                continue
            short = mid.split("/")[-1].replace("-", "_")
            col = f"bertscore_{short}_f1" if f"bertscore_{short}_f1" in result_df.columns else "bertscore_global_f1"
            s = safe_stats(col)
            if s:
                rows.append((col, s))

    if not rows:
        return "<div>No summary available.</div>"

    # Build HTML table
    html = """
    <div>
      <div style="font-weight:600;margin-bottom:6px;">Summary Statistics</div>
      <table style="border-collapse:collapse;width:100%;">
        <tr>
          <th style="text-align:left;padding:4px 8px;">Metric</th>
          <th style="text-align:left;padding:4px 8px;">Avg</th>
          <th style="text-align:left;padding:4px 8px;">Min (ID)</th>
          <th style="text-align:left;padding:4px 8px;">Max (ID)</th>
        </tr>
    """
    for col, stat in rows:
        # Pretty names (drop "GLOBAL")
        if col == "bleu_global":
            name = "BLEU"
        elif col == "bleurt_global":
            name = "BLEURT"
        elif col == "rougeL_global_f1":
            name = "ROUGE-L"
        elif col.startswith("bertscore_"):
            if col == "bertscore_global_f1":
                name = "BERTSCORE"
            else:
                label = " ".join(col.split("_")[1:-1]).upper()
                name = f"BERTSCORE {label}" if label else "BERTSCORE"
        else:
            name = col.replace("_", " ").upper()

        avg = f"{stat['avg']:.4f}"
        mn = f"{stat['min']:.4f} ({stat['min_id']})" if stat['min_id'] != "" else f"{stat['min']:.4f}"
        mx = f"{stat['max']:.4f} ({stat['max_id']})" if stat['max_id'] != "" else f"{stat['max']:.4f}"

        # Color scale by metric family (F1)
        if col.startswith("bleu_"):
            ca, cm, cx = get_metric_color(stat['avg'], "BLEU"), get_metric_color(stat['min'], "BLEU"), get_metric_color(stat['max'], "BLEU")
        elif col.startswith("bleurt_"):
            ca, cm, cx = get_metric_color(stat['avg'], "BLEURT"), get_metric_color(stat['min'], "BLEURT"), get_metric_color(stat['max'], "BLEURT")
        elif col.startswith("rougeL_"):
            ca, cm, cx = get_metric_color(stat['avg'], "ROUGE"), get_metric_color(stat['min'], "ROUGE"), get_metric_color(stat['max'], "ROUGE")
        else:
            ca, cm, cx = get_metric_color(stat['avg'], "BERTSCORE"), get_metric_color(stat['min'], "BERTSCORE"), get_metric_color(stat['max'], "BERTSCORE")

        html += f"""
        <tr>
          <td style="padding:4px 8px;">{name}</td>
          <td style="padding:4px 8px;background:{ca};">{avg}</td>
          <td style="padding:4px 8px;background:{cm};">{mn}</td>
          <td style="padding:4px 8px;background:{cx};">{mx}</td>
        </tr>
        """
    html += "</table></div>"
    return html
# ------------------- Detailed table (GLOBAL ONLY, F1 colored + dark P/R badges) -------------------

def render_results_table_html(result_df: pd.DataFrame) -> str:
    if result_df is None or result_df.empty:
        return "<div>No results.</div>"

    # Keep only *global* F1 columns (skip *_p/_r and any S/O/A/P)
    def is_global_f1(col: str) -> bool:
        if col == "code_audio_transcription":
            return False
        if col.endswith("_p") or col.endswith("_r"):
            return False
        if col.startswith("bleu_"):
            return col == "bleu_global"
        if col.startswith("bleurt_"):
            return col == "bleurt_global"
        if col.startswith("rougeL_"):
            return col == "rougeL_global_f1"
        if col.startswith("bertscore_"):
            parts = col.split("_")
            # Exclude per-section: bertscore_S_f1, etc.
            if len(parts) >= 2 and parts[1] in {"S", "O", "A", "P"}:
                return False
            # Allow model-specific or "bertscore_global_f1"
            return parts[-1] == "f1" or col == "bertscore_global_f1"
        return False

    f1_cols = [c for c in result_df.columns if is_global_f1(c)]

    # Sort for readability: BLEU, BLEURT, ROUGE-L, BERTSCORE (...)
    def _grp_key(col):
        if col.startswith("bleu_"):
            g = 0
        elif col.startswith("bleurt_"):
            g = 1
        elif col.startswith("rougeL_"):
            g = 2
        elif col.startswith("bertscore_"):
            g = 3
        else:
            g = 9
        return (g, col)

    f1_cols = sorted(f1_cols, key=_grp_key)

    # HTML table
    html = [
        "<div>",
        "<div style='font-weight:600;margin-bottom:6px;'>Individual Results</div>",
        "<table style='border-collapse:collapse;width:100%;'>",
        "<thead>",
        "<tr><th style='text-align:left;padding:4px 8px;'>ID</th>",
    ]

    def pretty_header(col: str) -> str:
        if col == "bleu_global":
            return "BLEU"
        if col == "bleurt_global":
            return "BLEURT"
        if col == "rougeL_global_f1":
            return "ROUGE-L"
        if col.startswith("bertscore_"):
            if col == "bertscore_global_f1":
                return "BERTSCORE"
            label = " ".join(col.split("_")[1:-1]).upper()
            return f"BERTSCORE {label}" if label else "BERTSCORE"
        return col.replace("_", " ").upper()

    for col in f1_cols:
        html.append(
            f"<th style='text-align:left;padding:4px 8px;'>{pretty_header(col)}</th>"
        )
    html.append("</tr></thead><tbody>")

    for _, row in result_df.iterrows():
        rid = row.get("code_audio_transcription", "")
        try:
            rid = int(rid)
        except Exception:
            pass
        html.append("<tr>")
        html.append(f"<td style='padding:4px 8px;'>{rid}</td>")
        for col in f1_cols:
            val = row.get(col, None)

            # figure metric family & pick P/R columns accordingly
            metric_kind = "BERTSCORE"
            p_text = r_text = ""
            if col.startswith("bleu_"):
                metric_kind = "BLEU"  # BLEU: no P/R
            elif col.startswith("bleurt_"):
                metric_kind = "BLEURT"
            elif col.startswith("rougeL_"):
                metric_kind = "ROUGE"
                base = "rougeL_global"  # global root
                pcol, rcol = f"{base}_p", f"{base}_r"
                p = row.get(pcol, None)
                r = row.get(rcol, None)
                p_text = f"P: {p:.4f}" if isinstance(p, (int, float)) else ""
                r_text = f"R: {r:.4f}" if isinstance(r, (int, float)) else ""
            elif col.startswith("bertscore_"):
                metric_kind = "BERTSCORE"
                # try model-specific first
                base = col[:-3] if col.endswith("_f1") else col  # strip trailing _f1
                pcol, rcol = f"{base}_p", f"{base}_r"
                if pcol not in result_df.columns and rcol not in result_df.columns:
                    # fallback to "bertscore_global" naming
                    pcol, rcol = "bertscore_global_p", "bertscore_global_r"
                p = row.get(pcol, None)
                r = row.get(rcol, None)
                p_text = f"P: {p:.4f}" if isinstance(p, (int, float)) else ""
                r_text = f"R: {r:.4f}" if isinstance(r, (int, float)) else ""

            if isinstance(val, (int, float)):
                bg = get_metric_color(float(val), metric_kind)
                val_text = f"{float(val):.4f}"
            else:
                bg = "transparent"
                val_text = "—"

            # Dark badges for P/R
            pills = []
            if p_text:
                pills.append("<span style='background:#222;color:#fff;border-radius:8px;padding:1px 6px;margin-right:4px;font-size:0.75em;'>"
                             f"{p_text}</span>")
            if r_text:
                pills.append("<span style='background:#222;color:#fff;border-radius:8px;padding:1px 6px;font-size:0.75em;'>"
                             f"{r_text}</span>")
            badges = ""
            if pills:
                badges = "<div style='margin-top:2px;'>" + "".join(pills) + "</div>"

            html.append(
                f"<td style='padding:4px 8px;background:{bg};'>"
                f"{val_text}{badges}</td>"
            )
        html.append("</tr>")
    html.append("</tbody></table></div>")
    return "".join(html)
""") gr.Markdown("# RESULTS") # Emphasize the run button gr.HTML(""" """) # 4) Run Evaluation (+ Export control) with gr.Row(): run_btn = gr.Button("🚀 Run Evaluation", variant="primary", elem_id="run-eval-btn") download_btn = gr.DownloadButton(label="⬇️ Export full results (CSV)", visible=False) # This Text box will display both success and error messages output_status = gr.Text() summary_output = gr.HTML() table_output = gr.HTML() # 5) Inspect example gr.Markdown("### Inspect an Example") gr.Markdown("Pick an example by its ID to view the reference vs generated text with token-level differences highlighted.") with gr.Accordion("🔍 Show reference & generated text", open=False): pick_id = gr.Dropdown(label="Pick an Audio ID", choices=[]) ref_disp = gr.Textbox(label="Reference Text", lines=6, interactive=False) gen_disp = gr.Textbox(label="Generated Text", lines=6, interactive=False) diff_disp= gr.HTML() # ---- Handlers ---- def handle_upload(f): if not f: # reset label & hide mapping return ( None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]), gr.update(visible=False), gr.update(label="Upload CSV") ) df = smart_read_csv(f.name) cols = list(df.columns) return ( df, gr.update(choices=cols, value=None), gr.update(choices=cols, value=None), gr.update(choices=cols, value=None), gr.update(visible=True), gr.update(label="Upload CSV — OK: selecione as colunas.") ) def run_batch(df, r, g, i, mets, berts): # Pre-flight validation: CSV uploaded? if df is None: return ( "Erro: por favor faça upload de um CSV e selecione as colunas.", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Columns chosen? if not r or not g or not i: return ( "Erro: selecione as colunas de Reference, Generated e Audio ID.", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Columns exist? missing = [c for c in [i, r, g] if c not in df.columns] if missing: return ( f"Erro: as colunas não existem no CSV: {missing}", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Metrics chosen? 
if not mets: return ( "Erro: selecione pelo menos uma métrica.", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Rename into standard schema (this is what we'll use for "Inspect an Example") try: sub = df[[i, r, g]].rename( columns={i: "code_audio_transcription", r: "dsc_reference_free_text", g: "dsc_generated_clinical_report"} ) except Exception as e: return ( f"Erro ao preparar dados: {e}", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Compute metrics try: result = compute_all_metrics_batch( sub, mets, berts if "BERTSCORE" in (mets or []) else None ) except Exception as e: return ( f"Erro ao calcular métricas: {e}", "", "", gr.update(choices=[]), None, None, gr.update(visible=False) ) # Normalize IDs for dropdown try: raw_ids = result["code_audio_transcription"].dropna().unique().tolist() ids = [] for x in raw_ids: try: ids.append(int(x)) except Exception: ids.append(x) ids = sorted(ids, key=lambda z: (not isinstance(z, int), z)) except Exception: ids = [] # Build HTML views try: summary = build_summary_html(result, mets, berts if "BERTSCORE" in (mets or []) else None) table = render_results_table_html(result) except Exception as e: return ( f"Erro ao renderizar resultados: {e}", "", "", gr.update(choices=ids, value=None), None, None, gr.update(visible=False) ) # Keep results for export & show download button # Also keep standardized pairs (sub) for the "Inspect an Example" view return ( "Métricas calculadas com sucesso.", summary, table, gr.update(choices=ids, value=None), result, sub, gr.update(visible=True), ) def show_example(pairs_df, audio_id): # Use the standardized pairs dataframe (id + reference + generated) if pairs_df is None or audio_id is None: return "", "", "" try: row = pairs_df[pairs_df["code_audio_transcription"] == audio_id] if row.empty: # Try float cast fallback for IDs that come as strings try: audio_id2 = float(audio_id) row = pairs_df[pairs_df["code_audio_transcription"] == audio_id2] except Exception: return "", "", "" if row.empty: return "", "", "" row = row.iloc[0] ref_txt = row["dsc_reference_free_text"] gen_txt = row["dsc_generated_clinical_report"] return ref_txt, gen_txt, generate_diff_html(ref_txt, gen_txt) except Exception: return "", "", "" def _export_results_csv(df: pd.DataFrame | None) -> str: # Always export with comma separator; include ALL columns that were computed if df is None or df.empty: tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".csv") with open(tmp.name, "w", encoding="utf-8") as f: f.write("no_data\n") return tmp.name ts = time.strftime("%Y%m%d_%H%M%S") tmp_path = os.path.join(tempfile.gettempdir(), f"automatic_metrics_{ts}.csv") df.to_csv(tmp_path, sep=",", index=False) return tmp_path # ---- Wiring ---- file_input.change( fn=handle_upload, inputs=[file_input], outputs=[state_df, ref_col, gen_col, id_col, mapping, file_input], # update label in place ) metric_selector.change( fn=lambda ms: gr.update(visible="BERTSCORE" in ms), inputs=[metric_selector], outputs=[bert_model_selector], ) run_btn.click( fn=run_batch, inputs=[state_df, ref_col, gen_col, id_col, metric_selector, bert_model_selector], outputs=[output_status, summary_output, table_output, pick_id, state_result, state_pairs, download_btn], ) # Use standardized pairs DF for example view (fixes KeyError on original DF) pick_id.change( fn=show_example, inputs=[state_pairs, pick_id], outputs=[ref_disp, gen_disp, diff_disp], ) download_btn.click( fn=_export_results_csv, inputs=[state_result], outputs=download_btn, # path 
returned; Gradio serves it ) return tab
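

# ------------------- Local preview (sketch) -------------------
# A minimal sketch for previewing this tab on its own, assuming the `metrics`,
# `ui.widgets`, and `utils` modules are importable from the working directory.
# In the full application the tab is presumably mounted inside a parent
# Blocks/TabbedInterface, so this guard is for illustration only.
if __name__ == "__main__":
    build_csv_tab().launch()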