Commit 4b112ae · Guilherme committed
1 Parent(s): ca4ba7f

Deploy to HF Space
Files changed:
- app.py +39 -0
- config.py +11 -0
- metrics/__init__.py +21 -0
- metrics/__pycache__/__init__.cpython-312.pyc +0 -0
- metrics/__pycache__/bertscore.cpython-312.pyc +0 -0
- metrics/__pycache__/bleu.cpython-312.pyc +0 -0
- metrics/__pycache__/bleurt.cpython-312.pyc +0 -0
- metrics/__pycache__/core.cpython-312.pyc +0 -0
- metrics/__pycache__/rouge.cpython-312.pyc +0 -0
- metrics/bertscore.py +187 -0
- metrics/bleu.py +41 -0
- metrics/bleurt.py +30 -0
- metrics/core.py +122 -0
- metrics/rouge.py +38 -0
- requirements.txt +10 -0
- ui/__init__.py +8 -0
- ui/__pycache__/__init__.cpython-312.pyc +0 -0
- ui/__pycache__/common.cpython-312.pyc +0 -0
- ui/__pycache__/csv_tab.cpython-312.pyc +0 -0
- ui/__pycache__/manual_tab.cpython-312.pyc +0 -0
- ui/__pycache__/widgets.cpython-312.pyc +0 -0
- ui/common.py +15 -0
- ui/csv_tab.py +238 -0
- ui/manual_tab.py +68 -0
- ui/widgets.py +25 -0
app.py
ADDED
@@ -0,0 +1,39 @@
+# app.py
+import os
+import threading
+import gradio as gr
+
+from metrics import get_hf_bleurt, get_hf_rouge
+from ui.manual_tab import build_manual_tab
+from ui.csv_tab import build_csv_tab
+
+# (Optional) force CPU so TensorFlow/BLEURT doesn't try CUDA on Spaces
+os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")
+
+def create_interface() -> gr.TabbedInterface:
+    return gr.TabbedInterface(
+        interface_list=[
+            build_manual_tab(),
+            build_csv_tab(),
+        ],
+        tab_names=["Manual Input", "CSV Upload"],
+    )
+
+if __name__ == "__main__":
+    # Preload heavy HF metrics so the UI stays responsive
+    threading.Thread(target=get_hf_bleurt, daemon=True).start()
+    threading.Thread(target=get_hf_rouge, daemon=True).start()
+
+    app = create_interface()
+
+    # Spaces (and most PaaS) provide PORT; default to 7860 for local runs
+    port = int(os.getenv("PORT", "7860"))
+
+    # Use queue for long-running tasks to avoid timeouts
+    app.queue(concurrency_count=1, max_size=20).launch(
+        server_name="0.0.0.0",
+        server_port=port,
+        show_error=True,
+    )
+
+
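Note on the launch block above: requirements.txt pins gradio>=5.40.0, and recent Gradio releases (4.x and later) appear to have dropped the `concurrency_count` argument of `queue()` in favor of `default_concurrency_limit`, so the call as committed would likely fail at startup. A minimal sketch of the launch under that assumption:

    # Sketch only — assumes Gradio >= 4, where queue() exposes default_concurrency_limit
    app.queue(default_concurrency_limit=1, max_size=20).launch(
        server_name="0.0.0.0",
        server_port=port,
        show_error=True,
    )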
config.py
ADDED
@@ -0,0 +1,11 @@
+from metrics import BERT_FRIENDLY_TO_MODEL
+
+# Available evaluation metrics for selection
+METRIC_CHOICES = ["BLEU", "BLEURT", "ROUGE", "BERTSCORE"]
+# Default metrics pre-selected in the UI
+DEFAULT_METRICS = ["BLEU"]
+
+# Available BERT models for BERTScore
+BERT_CHOICES = list(BERT_FRIENDLY_TO_MODEL.keys())
+# Default BERT model pre-selected in the UI
+DEFAULT_BERTS = [BERT_CHOICES[0]]
metrics/__init__.py
ADDED
@@ -0,0 +1,21 @@
+# metrics/__init__.py
+"""
+Central metrics entrypoint: import and expose all metric functions.
+"""
+from .core import compute_all_metrics_batch
+from .bleu import compute_bleu_single, section_bleu, full_bleu
+from .bleurt import compute_bleurt_single, get_hf_bleurt
+from .rouge import compute_rouge_single, get_hf_rouge
+from .bertscore import compute_bertscore_single, BERT_FRIENDLY_TO_MODEL, BERT_MODEL_TO_FRIENDLY
+
+__all__ = [
+    "compute_all_metrics_batch",
+    "compute_bleu_single",
+    "compute_bleurt_single",
+    "compute_rouge_single",
+    "get_hf_bleurt",
+    "get_hf_rouge",
+    "compute_bertscore_single",
+    "BERT_FRIENDLY_TO_MODEL",
+    "BERT_MODEL_TO_FRIENDLY",
+]
metrics/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (776 Bytes)

metrics/__pycache__/bertscore.cpython-312.pyc
ADDED
Binary file (8.73 kB)

metrics/__pycache__/bleu.cpython-312.pyc
ADDED
Binary file (1.93 kB)

metrics/__pycache__/bleurt.cpython-312.pyc
ADDED
Binary file (1.11 kB)

metrics/__pycache__/core.cpython-312.pyc
ADDED
Binary file (5.76 kB)

metrics/__pycache__/rouge.cpython-312.pyc
ADDED
Binary file (1.59 kB)
metrics/bertscore.py
ADDED
@@ -0,0 +1,187 @@
+# metrics/bertscore.py
+"""
+BERTScore helpers: scorer init, single and batch computation.
+"""
+from bert_score import BERTScorer
+from functools import lru_cache
+from transformers import AutoTokenizer
+from utils.file_utils import extract_sections, has_sections
+import pandas as pd
+
+# manual layer mapping
+_MANUAL_BERT_LAYERS = {
+    "neuralmind/bert-base-portuguese-cased": 12,
+    "pucpr/biobertpt-clin": 12,
+    "xlm-roberta-large": 24,
+}
+
+# friendly label ↔ model id mapping
+BERT_FRIENDLY_TO_MODEL = {
+    "Portuguese (Br) Bert": "neuralmind/bert-base-portuguese-cased",
+    "Portuguese (Br) Clinical BioBert": "pucpr/biobertpt-clin",
+    "Multilingual Bert ( RoBerta)": "xlm-roberta-large",
+}
+BERT_MODEL_TO_FRIENDLY = {v: k for k, v in BERT_FRIENDLY_TO_MODEL.items()}
+
+_USE_RESCALE_BASELINE = False
+
+@lru_cache(maxsize=6)
+def get_bertscore_scorer(model_type: str):
+    lang = "pt" if any(model_type.startswith(p) for p in ("neuralmind","pucpr")) else ""
+    num_layers = _MANUAL_BERT_LAYERS.get(model_type)
+    kwargs = {"lang": lang, "rescale_with_baseline": _USE_RESCALE_BASELINE}
+    if num_layers is not None:
+        kwargs["num_layers"] = num_layers
+    return BERTScorer(model_type=model_type, **kwargs)
+
+
+def chunk_text_with_stride(text: str, tokenizer, max_len: int = 512, stride: int = 50):
+    ids = tokenizer.encode(text, add_special_tokens=True)
+    if len(ids) <= max_len:
+        return [tokenizer.decode(ids, skip_special_tokens=True)]
+    chunks, step = [], max_len - stride
+    for i in range(0, len(ids), step):
+        subset = ids[i:i+max_len]
+        if not subset:
+            break
+        chunks.append(tokenizer.decode(subset, skip_special_tokens=True))
+        if i+max_len >= len(ids):
+            break
+    return chunks
+
+def compute_bertscore_single(reference: str, prediction: str, model_type: str, per_section: bool = False):
+    """
+    If per_section=False: returns float global F1 (0..1) or None on error.
+    If per_section=True: returns dict with keys:
+      - bertscore_S_f1, bertscore_O_f1, bertscore_A_f1, bertscore_P_f1 (if sections exist)
+      - bertscore_global_f1
+    Missing/invalid sections are omitted or set to None.
+    """
+    if not reference or not prediction:
+        return None if not per_section else {}
+
+    try:
+        scorer = get_bertscore_scorer(model_type)
+        tokenizer = AutoTokenizer.from_pretrained(model_type, use_fast=True)
+
+        def score_pair(pred_text, ref_text):
+            if not pred_text or not ref_text:
+                return None
+            try:
+                _, _, F1 = scorer.score([pred_text], [ref_text])
+                return float(F1[0])
+            except Exception:
+                return None
+
+        # global
+        pred_chunks = chunk_text_with_stride(prediction, tokenizer)
+        ref_chunks = chunk_text_with_stride(reference, tokenizer)
+        paired = list(zip(pred_chunks, ref_chunks))
+        global_f1s = []
+        for pc, rc in paired:
+            f1 = score_pair(pc, rc)
+            if f1 is not None:
+                global_f1s.append(f1)
+        global_avg = sum(global_f1s) / len(global_f1s) if global_f1s else 0.0
+
+        if not per_section:
+            return global_avg
+
+        out = {}
+        out["bertscore_global_f1"] = global_avg
+
+        # per-section only if both texts have sections
+        ref_has = has_sections(reference)
+        pred_has = has_sections(prediction)
+        if ref_has and pred_has:
+            sections_ref = extract_sections(reference)
+            sections_pred = extract_sections(prediction)
+            for tag in ["S", "O", "A", "P"]:
+                pred_sec = sections_pred.get(tag, "")
+                ref_sec = sections_ref.get(tag, "")
+                if pred_sec and ref_sec:
+                    pred_chunks = chunk_text_with_stride(pred_sec, tokenizer)
+                    ref_chunks = chunk_text_with_stride(ref_sec, tokenizer)
+                    paired_sec = list(zip(pred_chunks, ref_chunks))
+                    f1s = []
+                    for pc, rc in paired_sec:
+                        f1 = score_pair(pc, rc)
+                        if f1 is not None:
+                            f1s.append(f1)
+                    avg_f1 = sum(f1s) / len(f1s) if f1s else 0.0
+                    out[f"bertscore_{tag}_f1"] = avg_f1
+                else:
+                    out[f"bertscore_{tag}_f1"] = None
+        else:
+            for tag in ["S", "O", "A", "P"]:
+                out[f"bertscore_{tag}_f1"] = None
+
+        return out
+    except Exception:
+        return None if not per_section else {}
+
+def compute_batch_bertscore(df: pd.DataFrame, bert_models: list, per_section: bool = False) -> pd.DataFrame:
+    """
+    If per_section=True and single model: returns per-section + global bertscore.
+    Otherwise: only global scores (one per model).
+    Expects df with columns:
+      - dsc_reference_free_text
+      - dsc_generated_clinical_report
+    bert_models is a list of friendly names (e.g., "Portuguese (Br) Bert").
+    """
+    if bert_models is None or not bert_models:
+        return pd.DataFrame(index=df.index)
+
+    preds = df["dsc_generated_clinical_report"].astype(str).tolist()
+    refs = df["dsc_reference_free_text"].astype(str).tolist()
+
+    add = {}
+    single_model = len(bert_models) == 1
+
+    for friendly in bert_models:
+        model_id = BERT_FRIENDLY_TO_MODEL.get(friendly, friendly)
+        short = model_id.split("/")[-1].replace("-", "_")
+
+        if per_section and single_model:
+            col_data = {
+                "bertscore_global_f1": [],
+                "bertscore_S_f1": [],
+                "bertscore_O_f1": [],
+                "bertscore_A_f1": [],
+                "bertscore_P_f1": [],
+            }
+            for pred, ref in zip(preds, refs):
+                scores = compute_bertscore_single(ref, pred, model_id, per_section=True)
+                if not scores:
+                    col_data["bertscore_global_f1"].append(None)
+                    for tag in ["S", "O", "A", "P"]:
+                        col_data[f"bertscore_{tag}_f1"].append(None)
+                else:
+                    col_data["bertscore_global_f1"].append(scores.get("bertscore_global_f1"))
+                    for tag in ["S", "O", "A", "P"]:
+                        col_data[f"bertscore_{tag}_f1"].append(scores.get(f"bertscore_{tag}_f1"))
+            add.update(col_data)
+        else:
+            col_name = f"bertscore_{short}_f1"
+            scorer = get_bertscore_scorer(model_id)
+            tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+            f1_list = []
+            for pred, ref in zip(preds, refs):
+                try:
+                    pred_chunks = chunk_text_with_stride(pred, tokenizer)
+                    ref_chunks = chunk_text_with_stride(ref, tokenizer)
+                    paired = list(zip(pred_chunks, ref_chunks))
+                    if not paired:
+                        f1_list.append(None)
+                        continue
+                    per_pair_f1s = []
+                    for pc, rc in paired:
+                        _, _, F1 = scorer.score([pc], [rc])
+                        per_pair_f1s.append(float(F1[0]))
+                    avg_f1 = sum(per_pair_f1s) / len(per_pair_f1s) if per_pair_f1s else None
+                    f1_list.append(avg_f1)
+                except Exception:
+                    f1_list.append(None)
+            add[col_name] = f1_list
+
+    return pd.DataFrame(add, index=df.index)
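A quick way to exercise the helpers above from a Python shell (a minimal sketch; it assumes the listed checkpoints can be downloaded and that utils.file_utils is importable — the example strings are purely hypothetical):

    from metrics.bertscore import compute_bertscore_single

    f1 = compute_bertscore_single(
        reference="Paciente relata dor torácica há dois dias.",   # hypothetical reference text
        prediction="Paciente refere dor no peito há dois dias.",  # hypothetical generated text
        model_type="neuralmind/bert-base-portuguese-cased",
        per_section=False,
    )
    print(f1)  # global F1 in 0..1, or None if scoring failed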
metrics/bleu.py
ADDED
@@ -0,0 +1,41 @@
+# metrics/bleu.py
+"""
+BLEU metric wrappers using sacreBLEU and file_utils.
+"""
+from utils.file_utils import *
+from sacrebleu.metrics import BLEU
+
+# Global BLEU instance with 'intl' tokenization, lowercasing, and 'exp' smoothing
+_bleu_scorer = BLEU(tokenize='intl', lowercase=True, smooth_method='exp')
+
+def section_bleu(gen_txt: str, ref_txt: str) -> float:
+    """
+    Compute BLEU for a pair of strings (one section), returning a score from 0 to 100.
+    """
+    if not gen_txt.strip() and not ref_txt.strip():
+        return 100.0
+    if (not gen_txt.strip()) ^ (not ref_txt.strip()):
+        return 0.0
+    return _bleu_scorer.sentence_score(gen_txt, [ref_txt]).score
+
+def full_bleu(gen_raw: str, ref_raw: str) -> float:
+    """
+    Compute global BLEU for full texts, returning a score from 0 to 100.
+    """
+    gen = normalize_and_flatten(gen_raw)
+    ref = normalize_and_flatten(ref_raw)
+    if not gen and not ref:
+        return 100.0
+    if (not gen) ^ (not ref):
+        return 0.0
+    return _bleu_scorer.sentence_score(gen, [ref]).score
+
+
+def compute_bleu_single(reference: str, prediction: str) -> str:
+    """
+    Compute and format BLEU score for a single pair.
+    """
+    if not reference or not prediction:
+        return "Please provide both texts."
+    score = full_bleu(prediction, reference) / 100.0
+    return f"BLEU Score: {score:.4f}"
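The sacreBLEU wrappers above work on a 0–100 scale; only compute_bleu_single rescales to 0–1 for display. A minimal usage sketch (assumes utils.file_utils provides normalize_and_flatten, as the star import implies; the strings are hypothetical):

    from metrics.bleu import section_bleu, full_bleu, compute_bleu_single

    gen = "patient denies fever"       # hypothetical generated section
    ref = "the patient denies fever"   # hypothetical reference section
    print(section_bleu(gen, ref))         # sentence-level BLEU, 0..100
    print(full_bleu(gen, ref))            # BLEU on normalized/flattened full texts, 0..100
    print(compute_bleu_single(ref, gen))  # "BLEU Score: 0.xxxx" (rescaled to 0..1)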
metrics/bleurt.py
ADDED
@@ -0,0 +1,30 @@
+# metrics/bleurt.py
+import os
+# disable GPU (and XLA/PTX) so BLEURT runs on CPU and avoids the libdevice error
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+
+"""
+BLEURT metric wrappers using HuggingFace evaluate.
+"""
+from evaluate import load
+
+_hf_bleurt = None
+
+
+def get_hf_bleurt():
+    global _hf_bleurt
+    if _hf_bleurt is None:
+        _hf_bleurt = load("bleurt", module_type="metric", checkpoint="BLEURT-20")
+    return _hf_bleurt
+
+
+def compute_bleurt_single(reference: str, prediction: str) -> str:
+    """
+    Compute and format BLEURT score for a single pair.
+    """
+    if not reference or not prediction:
+        return "Please provide both texts."
+    bleurt = get_hf_bleurt()
+    result = bleurt.compute(predictions=[prediction], references=[reference])
+    return f"BLEURT Score: {result['scores'][0]:.4f}"
metrics/core.py
ADDED
@@ -0,0 +1,122 @@
+# metrics/core.py
+"""
+Orchestrates batch computation of selected metrics for the CSV-upload tab.
+"""
+import pandas as pd
+from .bleu import compute_bleu_single, section_bleu, full_bleu
+from .bleurt import get_hf_bleurt, compute_bleurt_single
+from .rouge import get_hf_rouge, compute_rouge_single, rougeL_score
+from .bertscore import compute_batch_bertscore
+from utils.file_utils import extract_sections, has_sections
+
+
+def compute_all_metrics_batch(
+    df: pd.DataFrame,
+    selected_metrics: list = None,
+    bert_models: list | None = None
+) -> pd.DataFrame:
+    if selected_metrics is None:
+        selected_metrics = ["BLEU"]
+
+    df = df.dropna(
+        subset=["dsc_reference_free_text", "dsc_generated_clinical_report"]
+    ).copy()
+
+    if "code_audio_transcription" not in df.columns:
+        df["code_audio_transcription"] = list(range(len(df)))
+
+    df["has_sections"] = df.apply(
+        lambda r: has_sections(r["dsc_reference_free_text"])
+        and has_sections(r["dsc_generated_clinical_report"]),
+        axis=1
+    )
+
+    only_one_metric = len(selected_metrics) == 1
+    core_metrics = [m for m in selected_metrics if m in {"BLEU", "BLEURT", "ROUGE"}]
+    only_bertscore_alone = only_one_metric and selected_metrics == ["BERTSCORE"]
+    out_cols = ["code_audio_transcription"]
+    tags = ["S", "O", "A", "P"]
+
+    # BLEU
+    if "BLEU" in selected_metrics:
+        if only_one_metric and "BLEU" in selected_metrics:
+            for tag in tags:
+                def _sec_bleu(row, tag=tag):
+                    gen = extract_sections(row["dsc_generated_clinical_report"])[tag]
+                    ref = extract_sections(row["dsc_reference_free_text"])[tag]
+                    if row["has_sections"] and gen and ref:
+                        return section_bleu(gen, ref) / 100.0
+                    return None
+                df[f"bleu_{tag}"] = df.apply(_sec_bleu, axis=1)
+                out_cols.append(f"bleu_{tag}")
+        df["bleu_global"] = df.apply(
+            lambda r: full_bleu(
+                r["dsc_generated_clinical_report"],
+                r["dsc_reference_free_text"]
+            ) / 100.0,
+            axis=1
+        )
+        out_cols.append("bleu_global")
+
+    # BLEURT
+    if "BLEURT" in selected_metrics:
+        bleurt = get_hf_bleurt()
+        if only_one_metric and "BLEURT" in selected_metrics:
+            for tag in tags:
+                idxs, gens, refs = [], [], []
+                for i, row in df.iterrows():
+                    gen = extract_sections(row["dsc_generated_clinical_report"])[tag]
+                    ref = extract_sections(row["dsc_reference_free_text"])[tag]
+                    if row["has_sections"] and gen and ref:
+                        idxs.append(i); gens.append(gen); refs.append(ref)
+                scores = (
+                    bleurt.compute(predictions=gens, references=refs)["scores"]
+                    if gens else []
+                )
+                col = [None] * len(df)
+                for i, sc in zip(idxs, scores):
+                    col[i] = sc
+                df[f"bleurt_{tag}"] = col
+                out_cols.append(f"bleurt_{tag}")
+        df["bleurt_global"] = bleurt.compute(
+            predictions=df["dsc_generated_clinical_report"].tolist(),
+            references=df["dsc_reference_free_text"].tolist()
+        )["scores"]
+        out_cols.append("bleurt_global")
+
+    # ROUGE-L
+    if "ROUGE" in selected_metrics:
+        if only_one_metric and "ROUGE" in selected_metrics:
+            for tag in tags:
+                df[f"rougeL_{tag}"] = df.apply(
+                    lambda row: rougeL_score(
+                        extract_sections(row["dsc_generated_clinical_report"])[tag],
+                        extract_sections(row["dsc_reference_free_text"])[tag]
+                    ) if row["has_sections"] else None,
+                    axis=1
+                )
+                out_cols.append(f"rougeL_{tag}")
+        df["rougeL_global"] = df.apply(
+            lambda row: rougeL_score(
+                row["dsc_generated_clinical_report"],
+                row["dsc_reference_free_text"]
+            ),
+            axis=1
+        )
+        out_cols.append("rougeL_global")
+
+    # BERTScore
+    if "BERTSCORE" in (selected_metrics or []) and bert_models:
+        per_section_bertscore = only_bertscore_alone and bert_models and len(bert_models) == 1
+        bert_df = compute_batch_bertscore(df, bert_models, per_section=per_section_bertscore)
+        for col in bert_df.columns:
+            df[col] = bert_df[col]
+            out_cols.append(col)
+
+    # clip BLEU
+    for c in df.columns:
+        if c.startswith("bleu_"):
+            df[c] = df[c].clip(0.0, 1.0)
+
+    return df[out_cols]
+
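A minimal batch-usage sketch for the orchestrator above (the column names are the ones it expects; resetting the index is an added precaution, since the per-section BLEURT branch writes scores back into a positional list keyed by the DataFrame index):

    import pandas as pd
    from metrics.core import compute_all_metrics_batch

    df = pd.DataFrame({
        "code_audio_transcription": [1, 2],                                   # hypothetical IDs
        "dsc_reference_free_text": ["reference one", "reference two"],        # hypothetical references
        "dsc_generated_clinical_report": ["generated one", "generated two"],  # hypothetical outputs
    }).reset_index(drop=True)  # keep a 0..n-1 index so positional writes line up

    result = compute_all_metrics_batch(df, selected_metrics=["BLEU"])
    print(result[["code_audio_transcription", "bleu_global"]])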
metrics/rouge.py
ADDED
@@ -0,0 +1,38 @@
+# metrics/rouge.py
+"""
+ROUGE metric wrappers using HuggingFace evaluate.
+"""
+from evaluate import load
+
+_hf_rouge = None
+
+
+def get_hf_rouge():
+    global _hf_rouge
+    if _hf_rouge is None:
+        _hf_rouge = load("rouge")
+    return _hf_rouge
+
+
+def compute_rouge_single(reference: str, prediction: str) -> str:
+    """
+    Compute and format ROUGE-L score for a single pair.
+    """
+    if not reference or not prediction:
+        return "Please provide both texts."
+    rouge = get_hf_rouge()
+    res = rouge.compute(predictions=[prediction], references=[reference], rouge_types=["rougeL"])
+    score = res["rougeL"]
+    if isinstance(score, (list, tuple)):
+        score = score[0]
+    return f"ROUGE-L Score: {score:.4f}"
+
+
+def rougeL_score(pred: str, ref: str) -> float:
+    """
+    Raw ROUGE-L score (0..1) for text pairs.
+    """
+    rouge = get_hf_rouge()
+    res = rouge.compute(predictions=[pred], references=[ref], rouge_types=["rougeL"])
+    s = res["rougeL"]
+    return s[0] if isinstance(s, (list, tuple)) else s
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+gradio>=5.40.0
+pandas>=2.0.0
+evaluate>=0.4.5
+transformers>=4.54.1
+tokenizers>=0.21.4
+bert-score>=0.3.13
+bleurt@git+https://github.com/google-research/bleurt.git@cebe7e6f996b40910cfaa520a63db47807e3bf5c
+sacrebleu>=2.5.1
+rouge_score>=0.1.2
+chardet
ui/__init__.py
ADDED
@@ -0,0 +1,8 @@
+# ui/__init__.py
+"""
+UI package: exports tab builders.
+"""
+__all__ = ["build_manual_tab", "build_csv_tab"]
+
+from .manual_tab import build_manual_tab
+from .csv_tab import build_csv_tab
ui/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (363 Bytes)

ui/__pycache__/common.cpython-312.pyc
ADDED
Binary file (841 Bytes)

ui/__pycache__/csv_tab.cpython-312.pyc
ADDED
Binary file (13.7 kB)

ui/__pycache__/manual_tab.cpython-312.pyc
ADDED
Binary file (3.74 kB)

ui/__pycache__/widgets.cpython-312.pyc
ADDED
Binary file (1.1 kB)
ui/common.py
ADDED
@@ -0,0 +1,15 @@
+import gradio as gr
+
+
+def toggle_manual_visibility(selected_metrics):
+    """
+    Returns visibility updates for manual-tab outputs:
+    (BLEU_out, BLEURT_out, ROUGE_out, BERT_out, BERT_model_selector)
+    """
+    return (
+        gr.update(visible="BLEU" in selected_metrics),
+        gr.update(visible="BLEURT" in selected_metrics),
+        gr.update(visible="ROUGE" in selected_metrics),
+        gr.update(visible="BERTSCORE" in selected_metrics),
+        gr.update(visible="BERTSCORE" in selected_metrics),
+    )
ui/csv_tab.py
ADDED
@@ -0,0 +1,238 @@
+# ui/csv_tab.py
+
+"""
+Builds the CSV-upload tab (batch metrics).
+"""
+import gradio as gr
+import pandas as pd
+
+from metrics import compute_all_metrics_batch, BERT_FRIENDLY_TO_MODEL
+from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup
+from utils.file_utils import smart_read_csv
+from utils.colors_utils import df_to_colored_html, get_metric_color
+from ui.common import toggle_manual_visibility as _toggle  # reuse for BERT selector
+from utils.tokenizer_refgen import generate_diff_html
+
+# ------------------- Summary HTML builder -------------------
+def build_summary_html(result_df: pd.DataFrame, selected_metrics: list, bert_models: list | None = None) -> str:
+    def safe_stats(col):
+        if col not in result_df.columns:
+            return None
+        series = result_df[col].dropna().astype(float)
+        if series.empty:
+            return None
+        avg, mn, mx = series.mean(), series.min(), series.max()
+        def audio_id_for(v):
+            subset = result_df[result_df[col].astype(float) == v]
+            if not subset.empty and "code_audio_transcription" in subset.columns:
+                aid = subset.iloc[0]["code_audio_transcription"]
+                try: return int(aid)
+                except: return aid
+            return ""
+        return {"avg": avg, "min": mn, "min_id": audio_id_for(mn), "max": mx, "max_id": audio_id_for(mx)}
+
+    rows = []
+    # Core metrics
+    core = [m for m in selected_metrics if m in {"BLEU","BLEURT","ROUGE"}]
+    if len(core) == 1:
+        m = core[0]
+        prefix = {"BLEU":"bleu_","BLEURT":"bleurt_","ROUGE":"rougeL_"}[m]
+        for tag in ('S','O','A','P'):
+            s = safe_stats(f"{prefix}{tag}")
+            if s: rows.append((f"{prefix}{tag}", s))
+        g = safe_stats(f"{prefix}global")
+        if g: rows.append((f"{prefix}global", g))
+    else:
+        for m,pref in [("BLEU","bleu_global"),("BLEURT","bleurt_global"),("ROUGE","rougeL_global")]:
+            if m in selected_metrics:
+                s = safe_stats(pref)
+                if s: rows.append((pref, s))
+
+    # BERTScore
+    if "BERTSCORE" in selected_metrics and bert_models:
+        only_bs = selected_metrics == ["BERTSCORE"]
+        single = len(bert_models) == 1
+        per_sec = only_bs and single
+        for friendly in bert_models:
+            mid = BERT_FRIENDLY_TO_MODEL[friendly]
+            short = mid.split("/")[-1].replace("-","_")
+            if per_sec:
+                for tag in ('S','O','A','P'):
+                    s = safe_stats(f"bertscore_{tag}_f1")
+                    if s: rows.append((f"bertscore_{tag}_f1", s))
+                sg = safe_stats("bertscore_global_f1")
+                if sg: rows.append(("bertscore_global_f1", sg))
+            else:
+                s = safe_stats(f"bertscore_{short}_f1")
+                if s: rows.append((f"bertscore_{short}_f1", s))
+
+    if not rows:
+        return "<div style='padding:8px;background:#1f1f1f;color:#eee;border-radius:6px;'>No summary available.</div>"
+
+    html = """
+    <div style="margin-bottom:12px;overflow-x:auto;">
+      <div style="font-weight:600;margin-bottom:4px;color:#f5f5f5;font-size:16px;">Summary Statistics</div>
+      <table style="border-collapse:collapse;width:100%;font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,sans-serif;border-radius:8px;overflow:hidden;min-width:500px;">
+        <thead>
+          <tr>
+            <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:left;font-weight:600;">Metric</th>
+            <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Avg</th>
+            <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Min (ID)</th>
+            <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Max (ID)</th>
+          </tr>
+        </thead><tbody>
+    """
+    for col, stat in rows:
+        if col.startswith("bertscore_"):
+            name = "BERTScore Global" if col=="bertscore_global_f1" else f"BERTScore {col.split('_')[1].upper()}"
+        else:
+            name = col.replace("_"," ").upper()
+        avg = f"{stat['avg']:.4f}"
+        mn = f"{stat['min']:.4f} ({stat['min_id']})" if stat['min_id'] else f"{stat['min']:.4f}"
+        mx = f"{stat['max']:.4f} ({stat['max_id']})" if stat['max_id'] else f"{stat['max']:.4f}"
+        if col.startswith("bleu_"):
+            ca,cm,cx = get_metric_color(stat['avg'], "BLEU"), get_metric_color(stat['min'], "BLEU"), get_metric_color(stat['max'], "BLEU")
+        elif col.startswith("bleurt_"):
+            ca,cm,cx = get_metric_color(stat['avg'], "BLEURT"), get_metric_color(stat['min'], "BLEURT"), get_metric_color(stat['max'], "BLEURT")
+        elif col.startswith("rougeL_"):
+            ca,cm,cx = get_metric_color(stat['avg'], "ROUGE"), get_metric_color(stat['min'], "ROUGE"), get_metric_color(stat['max'], "ROUGE")
+        else:
+            ca,cm,cx = get_metric_color(stat['avg'], "BERTSCORE"), get_metric_color(stat['min'], "BERTSCORE"), get_metric_color(stat['max'], "BERTSCORE")
+        html += f"""
+        <tr style="background:#0f1218;">
+          <td style="padding:8px 12px;border:1px solid #2f3240;color:#fff;white-space:nowrap;">{name}</td>
+          <td style="padding:8px 12px;border:1px solid #2f3240;background:{ca};color:#fff;text-align:center;white-space:nowrap;">{avg}</td>
+          <td style="padding:8px 12px;border:1px solid #2f3240;background:{cm};color:#fff;text-align:center;white-space:nowrap;">{mn}</td>
+          <td style="padding:8px 12px;border:1px solid #2f3240;background:{cx};color:#fff;text-align:center;white-space:nowrap;">{mx}</td>
+        </tr>
+        """
+    html += "</tbody></table></div>"
+    return html
+
+def build_csv_tab():
+    with gr.Blocks() as tab:
+        state_df = gr.State()
+
+        gr.Markdown("# Run an Experiment via CSV upload")
+        gr.Markdown(
+            "This section lets you upload a CSV of reference/generated text pairs, "
+            "select which columns correspond to reference, generated output, and a shared ID, "
+            "choose metrics (including BERTScore variants), and compute scores in bulk."
+        )
+
+        # 1. Upload CSV
+        gr.Markdown("## Upload CSV")
+        gr.Markdown(
+            "Provide a CSV file containing your data. It should include columns for the reference text, "
+            "the generated text, and an identifier (e.g., audio ID)."
+        )
+        with gr.Row():
+            file_input = gr.File(label="Upload CSV", file_types=[".csv"])
+            status = gr.Text(label="Status")
+
+        # 2. Map Columns
+        gr.Markdown("## Map Columns")
+        gr.Markdown(
+            "Select which columns in your CSV correspond to the reference text, generated text, and audio/example ID."
+        )
+        with gr.Row(visible=False) as mapping:
+            ref_col = gr.Dropdown(label="Reference Column", choices=[])
+            gen_col = gr.Dropdown(label="Generated Column", choices=[])
+            id_col = gr.Dropdown(label="Audio ID Column", choices=[])
+
+        # 3. Select Metrics
+        gr.Markdown("## Select Metrics")
+        metric_selector = MetricCheckboxGroup()
+        bert_model_selector = BertCheckboxGroup()
+
+        # 4. Compute
+        gr.Markdown("## Run Evaluation")
+        run_btn = gr.Button("Run an Evaluation")
+        output_status = gr.Text()
+        summary_output = gr.HTML()
+        table_output = gr.HTML()
+
+        # 5. Inspect an Example
+        gr.Markdown("## Inspect an Example")
+        gr.Markdown(
+            "Pick an example by its ID to view the reference vs generated text with token-level differences highlighted."
+        )
+        with gr.Accordion("🔍 Show reference & generated text", open=False):
+            pick_id = gr.Dropdown(label="Pick an Audio ID", choices=[])
+            ref_disp = gr.Textbox(label="Reference Text", lines=6, interactive=False)
+            gen_disp = gr.Textbox(label="Generated Text", lines=6, interactive=False)
+            diff_disp = gr.HTML()
+
+        # --- Handlers ---
+
+        def handle_upload(f):
+            if not f:
+                return None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]), gr.update(visible=False), ""
+            df = smart_read_csv(f.name)
+            cols = df.columns.tolist()
+            return (
+                df,
+                gr.update(choices=cols, value=None),
+                gr.update(choices=cols, value=None),
+                gr.update(choices=cols, value=None),
+                gr.update(visible=True),
+                "Upload OK",
+            )
+
+        def run_batch(df, r, g, i, mets, berts):
+            if df is None:
+                return "No data.", "", "", gr.update(choices=[])
+            sub = df[[i, r, g]].rename(
+                columns={i: "code_audio_transcription", r: "dsc_reference_free_text", g: "dsc_generated_clinical_report"}
+            )
+            result = compute_all_metrics_batch(sub, mets, berts if "BERTSCORE" in (mets or []) else None)
+
+            # Cast IDs to Python int to avoid np.int64 dropdown issues
+            raw_ids = result["code_audio_transcription"].dropna().unique()
+            ids = sorted(int(x) for x in raw_ids)
+
+            summary = build_summary_html(result, mets, berts if "BERTSCORE" in (mets or []) else None)
+            table = df_to_colored_html(result)
+            return "Metrics computed successfully.", summary, table, gr.update(choices=ids)
+
+        def show_example(df, audio_id):
+            if df is None or audio_id is None:
+                return "", "", ""
+            row = df[df["code_audio_transcription"] == audio_id]
+            if row.empty:
+                try:
+                    row = df[df["code_audio_transcription"] == float(audio_id)]
+                except:
+                    return "", "", ""
+            row = row.iloc[0]
+            return (
+                row["dsc_reference_free_text"],
+                row["dsc_generated_clinical_report"],
+                generate_diff_html(row["dsc_reference_free_text"], row["dsc_generated_clinical_report"])
+            )
+
+        # --- Wiring ---
+
+        file_input.change(
+            fn=handle_upload,
+            inputs=[file_input],
+            outputs=[state_df, ref_col, gen_col, id_col, mapping, status],
+        )
+        metric_selector.change(
+            lambda ms: gr.update(visible="BERTSCORE" in ms),
+            inputs=[metric_selector],
+            outputs=[bert_model_selector],
+        )
+        run_btn.click(
+            fn=run_batch,
+            inputs=[state_df, ref_col, gen_col, id_col, metric_selector, bert_model_selector],
+            outputs=[output_status, summary_output, table_output, pick_id],
+        )
+        pick_id.change(
+            fn=show_example,
+            inputs=[state_df, pick_id],
+            outputs=[ref_disp, gen_disp, diff_disp],
+        )
+
+    return tab
+
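The upload tab expects a CSV with at least three columns to map; a minimal sketch of a compatible file (the column names here are arbitrary, since they are chosen in the "Map Columns" dropdowns and renamed internally to the canonical names used by the metrics):

    # Sketch: generate a small CSV compatible with the upload tab
    import pandas as pd

    pd.DataFrame({
        "audio_id": [101, 102],                                           # hypothetical IDs
        "reference_text": ["reference report 1", "reference report 2"],   # hypothetical references
        "generated_text": ["generated report 1", "generated report 2"],   # hypothetical model outputs
    }).to_csv("example_pairs.csv", index=False)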
ui/manual_tab.py
ADDED
@@ -0,0 +1,68 @@
+# ui/manual_tab.py
+"""
+Builds the manual-evaluation tab (single pair metrics).
+"""
+import gradio as gr
+from metrics import (
+    compute_bleu_single,
+    compute_bleurt_single,
+    compute_rouge_single,
+    compute_bertscore_single,
+    BERT_FRIENDLY_TO_MODEL,
+)
+from ui.common import toggle_manual_visibility
+from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup
+
+
+def build_manual_tab():
+    with gr.Blocks() as tab:
+        gr.Markdown("## Manual Evaluation")
+        gr.Markdown("Compute selected metrics (BLEU, BLEURT, ROUGE, BERTScore) for a single pair of texts.")
+
+        with gr.Row():
+            reference_input = gr.Textbox(label="Reference Text", lines=3)
+            generated_input = gr.Textbox(label="Generated Text", lines=3)
+
+        metric_selector = MetricCheckboxGroup()
+        bert_model_selector = BertCheckboxGroup()
+
+        with gr.Row():
+            run_btn = gr.Button("Run an Evaluation")
+            clear_btn = gr.Button("Clear")
+
+        bleu_out = gr.Textbox(label="BLEU Score", interactive=False)
+        bleurt_out = gr.Textbox(label="BLEURT Score", interactive=False)
+        rouge_out = gr.Textbox(label="ROUGE Score", interactive=False)
+        bert_out = gr.Textbox(label="BERTScore Results", interactive=False)
+
+        def compute_manual(reference, generated, metrics, berts):
+            bleu = compute_bleu_single(reference, generated) if "BLEU" in metrics else ""
+            bleurt = compute_bleurt_single(reference, generated) if "BLEURT" in metrics else ""
+            rouge = compute_rouge_single(reference, generated) if "ROUGE" in metrics else ""
+            bertscore = ""
+            if "BERTSCORE" in metrics and berts:
+                parts = []
+                for f in berts:
+                    mid = BERT_FRIENDLY_TO_MODEL[f]
+                    score = compute_bertscore_single(reference, generated, mid, per_section=False)
+                    parts.append(f"{f} Global F1: {score:.4f}" if score is not None else f"{f}: error")
+                bertscore = "\n".join(parts)
+            return bleu, bleurt, rouge, bertscore
+
+        run_btn.click(
+            fn=compute_manual,
+            inputs=[reference_input, generated_input, metric_selector, bert_model_selector],
+            outputs=[bleu_out, bleurt_out, rouge_out, bert_out],
+        )
+        metric_selector.change(
+            fn=toggle_manual_visibility,
+            inputs=[metric_selector],
+            outputs=[bleu_out, bleurt_out, rouge_out, bert_out, bert_model_selector],
+        )
+        clear_btn.click(
+            fn=lambda: ("", "", "", "", "", ["BLEU"], [list(BERT_FRIENDLY_TO_MODEL.keys())[0]]),  # one value per output below
+            inputs=[],
+            outputs=[reference_input, generated_input, bleu_out, bleurt_out, rouge_out, metric_selector, bert_model_selector],
+        )
+
+    return tab
ui/widgets.py
ADDED
@@ -0,0 +1,25 @@
+# ui/widgets.py
+"""
+Factory functions for common Gradio widgets.
+"""
+import gradio as gr
+from metrics import BERT_FRIENDLY_TO_MODEL
+from config import METRIC_CHOICES, DEFAULT_METRICS, DEFAULT_BERTS
+
+
+def MetricCheckboxGroup(label="Which metrics to compute", default=None, visible=True):
+    return gr.CheckboxGroup(
+        choices=METRIC_CHOICES,
+        label=label,
+        value=default or DEFAULT_METRICS,
+        visible=visible,
+    )
+
+
+def BertCheckboxGroup(label="Which BERT models (for BERTScore)", default=None, visible=False):
+    return gr.CheckboxGroup(
+        choices=list(BERT_FRIENDLY_TO_MODEL.keys()),
+        label=label,
+        value=default or DEFAULT_BERTS,
+        visible=visible,
+    )