Commit 4b112ae · Guilherme committed
1 Parent(s): ca4ba7f

Deploy to HF Space
Files changed:
- app.py +39 -0
- config.py +11 -0
- metrics/__init__.py +21 -0
- metrics/__pycache__/__init__.cpython-312.pyc +0 -0
- metrics/__pycache__/bertscore.cpython-312.pyc +0 -0
- metrics/__pycache__/bleu.cpython-312.pyc +0 -0
- metrics/__pycache__/bleurt.cpython-312.pyc +0 -0
- metrics/__pycache__/core.cpython-312.pyc +0 -0
- metrics/__pycache__/rouge.cpython-312.pyc +0 -0
- metrics/bertscore.py +187 -0
- metrics/bleu.py +41 -0
- metrics/bleurt.py +30 -0
- metrics/core.py +122 -0
- metrics/rouge.py +38 -0
- requirements.txt +10 -0
- ui/__init__.py +8 -0
- ui/__pycache__/__init__.cpython-312.pyc +0 -0
- ui/__pycache__/common.cpython-312.pyc +0 -0
- ui/__pycache__/csv_tab.cpython-312.pyc +0 -0
- ui/__pycache__/manual_tab.cpython-312.pyc +0 -0
- ui/__pycache__/widgets.cpython-312.pyc +0 -0
- ui/common.py +15 -0
- ui/csv_tab.py +238 -0
- ui/manual_tab.py +68 -0
- ui/widgets.py +25 -0
app.py
ADDED
@@ -0,0 +1,39 @@
+# app.py
+import os
+import threading
+import gradio as gr
+
+from metrics import get_hf_bleurt, get_hf_rouge
+from ui.manual_tab import build_manual_tab
+from ui.csv_tab import build_csv_tab
+
+# (Optional) force CPU so TensorFlow/BLEURT doesn't try CUDA on Spaces
+os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")
+
+def create_interface() -> gr.TabbedInterface:
+    return gr.TabbedInterface(
+        interface_list=[
+            build_manual_tab(),
+            build_csv_tab(),
+        ],
+        tab_names=["Manual Input", "CSV Upload"],
+    )
+
+if __name__ == "__main__":
+    # Preload heavy HF metrics so the UI stays responsive
+    threading.Thread(target=get_hf_bleurt, daemon=True).start()
+    threading.Thread(target=get_hf_rouge, daemon=True).start()
+
+    app = create_interface()
+
+    # Spaces (and most PaaS) provide PORT; default to 7860 for local runs
+    port = int(os.getenv("PORT", "7860"))
+
+    # Use queue for long-running tasks to avoid timeouts
+    app.queue(concurrency_count=1, max_size=20).launch(
+        server_name="0.0.0.0",
+        server_port=port,
+        show_error=True,
+    )
+
+
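Note on the launch block above: requirements.txt pins gradio>=5.40.0, and recent Gradio releases (4.x and later) appear to have dropped the `concurrency_count` argument of `queue()` in favor of `default_concurrency_limit`, so the call as committed would likely fail at startup. A minimal sketch of the launch under that assumption:

    # Sketch only — assumes Gradio >= 4, where queue() exposes default_concurrency_limit
    app.queue(default_concurrency_limit=1, max_size=20).launch(
        server_name="0.0.0.0",
        server_port=port,
        show_error=True,
    )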
config.py
ADDED
@@ -0,0 +1,11 @@
+from metrics import BERT_FRIENDLY_TO_MODEL
+
+# Available evaluation metrics for selection
+METRIC_CHOICES = ["BLEU", "BLEURT", "ROUGE", "BERTSCORE"]
+# Default metrics pre-selected in the UI
+DEFAULT_METRICS = ["BLEU"]
+
+# Available BERT models for BERTScore
+BERT_CHOICES = list(BERT_FRIENDLY_TO_MODEL.keys())
+# Default BERT model pre-selected in the UI
+DEFAULT_BERTS = [BERT_CHOICES[0]]
metrics/__init__.py
ADDED
@@ -0,0 +1,21 @@
+# metrics/__init__.py
+"""
+Central metrics entrypoint: import and expose all metric functions.
+"""
+from .core import compute_all_metrics_batch
+from .bleu import compute_bleu_single, section_bleu, full_bleu
+from .bleurt import compute_bleurt_single, get_hf_bleurt
+from .rouge import compute_rouge_single, get_hf_rouge
+from .bertscore import compute_bertscore_single, BERT_FRIENDLY_TO_MODEL, BERT_MODEL_TO_FRIENDLY
+
+__all__ = [
+    "compute_all_metrics_batch",
+    "compute_bleu_single",
+    "compute_bleurt_single",
+    "compute_rouge_single",
+    "get_hf_bleurt",
+    "get_hf_rouge",
+    "compute_bertscore_single",
+    "BERT_FRIENDLY_TO_MODEL",
+    "BERT_MODEL_TO_FRIENDLY",
+]
metrics/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (776 Bytes)

metrics/__pycache__/bertscore.cpython-312.pyc
ADDED
Binary file (8.73 kB)

metrics/__pycache__/bleu.cpython-312.pyc
ADDED
Binary file (1.93 kB)

metrics/__pycache__/bleurt.cpython-312.pyc
ADDED
Binary file (1.11 kB)

metrics/__pycache__/core.cpython-312.pyc
ADDED
Binary file (5.76 kB)

metrics/__pycache__/rouge.cpython-312.pyc
ADDED
Binary file (1.59 kB)
metrics/bertscore.py
ADDED
@@ -0,0 +1,187 @@
+# metrics/bertscore.py
+"""
+BERTScore helpers: scorer init, single and batch computation.
+"""
+from bert_score import BERTScorer
+from functools import lru_cache
+from transformers import AutoTokenizer
+from utils.file_utils import extract_sections, has_sections
+import pandas as pd
+
+# manual layer mapping
+_MANUAL_BERT_LAYERS = {
+    "neuralmind/bert-base-portuguese-cased": 12,
+    "pucpr/biobertpt-clin": 12,
+    "xlm-roberta-large": 24,
+}
+
+# friendly label ↔ model id mapping
+BERT_FRIENDLY_TO_MODEL = {
+    "Portuguese (Br) Bert": "neuralmind/bert-base-portuguese-cased",
+    "Portuguese (Br) Clinical BioBert": "pucpr/biobertpt-clin",
+    "Multilingual Bert ( RoBerta)": "xlm-roberta-large",
+}
+BERT_MODEL_TO_FRIENDLY = {v: k for k, v in BERT_FRIENDLY_TO_MODEL.items()}
+
+_USE_RESCALE_BASELINE = False
+
+@lru_cache(maxsize=6)
+def get_bertscore_scorer(model_type: str):
+    lang = "pt" if any(model_type.startswith(p) for p in ("neuralmind","pucpr")) else ""
+    num_layers = _MANUAL_BERT_LAYERS.get(model_type)
+    kwargs = {"lang": lang, "rescale_with_baseline": _USE_RESCALE_BASELINE}
+    if num_layers is not None:
+        kwargs["num_layers"] = num_layers
+    return BERTScorer(model_type=model_type, **kwargs)
+
+
+def chunk_text_with_stride(text: str, tokenizer, max_len: int = 512, stride: int = 50):
+    ids = tokenizer.encode(text, add_special_tokens=True)
+    if len(ids) <= max_len:
+        return [tokenizer.decode(ids, skip_special_tokens=True)]
+    chunks, step = [], max_len - stride
+    for i in range(0, len(ids), step):
+        subset = ids[i:i+max_len]
+        if not subset:
+            break
+        chunks.append(tokenizer.decode(subset, skip_special_tokens=True))
+        if i+max_len >= len(ids):
+            break
+    return chunks
+
+def compute_bertscore_single(reference: str, prediction: str, model_type: str, per_section: bool = False):
+    """
+    If per_section=False: returns float global F1 (0..1) or None on error.
+    If per_section=True: returns dict with keys:
+      - bertscore_S_f1, bertscore_O_f1, bertscore_A_f1, bertscore_P_f1 (if sections exist)
+      - bertscore_global_f1
+    Missing/invalid sections are omitted or set to None.
+    """
+    if not reference or not prediction:
+        return None if not per_section else {}
+
+    try:
+        scorer = get_bertscore_scorer(model_type)
+        tokenizer = AutoTokenizer.from_pretrained(model_type, use_fast=True)
+
+        def score_pair(pred_text, ref_text):
+            if not pred_text or not ref_text:
+                return None
+            try:
+                _, _, F1 = scorer.score([pred_text], [ref_text])
+                return float(F1[0])
+            except Exception:
+                return None
+
+        # global
+        pred_chunks = chunk_text_with_stride(prediction, tokenizer)
+        ref_chunks = chunk_text_with_stride(reference, tokenizer)
+        paired = list(zip(pred_chunks, ref_chunks))
+        global_f1s = []
+        for pc, rc in paired:
+            f1 = score_pair(pc, rc)
+            if f1 is not None:
+                global_f1s.append(f1)
+        global_avg = sum(global_f1s) / len(global_f1s) if global_f1s else 0.0
+
+        if not per_section:
+            return global_avg
+
+        out = {}
+        out["bertscore_global_f1"] = global_avg
+
+        # per-section only if both texts have sections
+        ref_has = has_sections(reference)
+        pred_has = has_sections(prediction)
+        if ref_has and pred_has:
+            sections_ref = extract_sections(reference)
+            sections_pred = extract_sections(prediction)
+            for tag in ["S", "O", "A", "P"]:
+                pred_sec = sections_pred.get(tag, "")
+                ref_sec = sections_ref.get(tag, "")
+                if pred_sec and ref_sec:
+                    pred_chunks = chunk_text_with_stride(pred_sec, tokenizer)
+                    ref_chunks = chunk_text_with_stride(ref_sec, tokenizer)
+                    paired_sec = list(zip(pred_chunks, ref_chunks))
+                    f1s = []
+                    for pc, rc in paired_sec:
+                        f1 = score_pair(pc, rc)
+                        if f1 is not None:
+                            f1s.append(f1)
+                    avg_f1 = sum(f1s) / len(f1s) if f1s else 0.0
+                    out[f"bertscore_{tag}_f1"] = avg_f1
+                else:
+                    out[f"bertscore_{tag}_f1"] = None
+        else:
+            for tag in ["S", "O", "A", "P"]:
+                out[f"bertscore_{tag}_f1"] = None
+
+        return out
+    except Exception:
+        return None if not per_section else {}
+
+def compute_batch_bertscore(df: pd.DataFrame, bert_models: list, per_section: bool = False) -> pd.DataFrame:
+    """
+    If per_section=True and single model: returns per-section + global bertscore.
+    Otherwise: only global scores (one per model).
+    Expects df with columns:
+      - dsc_reference_free_text
+      - dsc_generated_clinical_report
+    bert_models is a list of friendly names (e.g., "Portuguese (Br) Bert").
+    """
+    if bert_models is None or not bert_models:
+        return pd.DataFrame(index=df.index)
+
+    preds = df["dsc_generated_clinical_report"].astype(str).tolist()
+    refs = df["dsc_reference_free_text"].astype(str).tolist()
+
+    add = {}
+    single_model = len(bert_models) == 1
+
+    for friendly in bert_models:
+        model_id = BERT_FRIENDLY_TO_MODEL.get(friendly, friendly)
+        short = model_id.split("/")[-1].replace("-", "_")
+
+        if per_section and single_model:
+            col_data = {
+                "bertscore_global_f1": [],
+                "bertscore_S_f1": [],
+                "bertscore_O_f1": [],
+                "bertscore_A_f1": [],
+                "bertscore_P_f1": [],
+            }
+            for pred, ref in zip(preds, refs):
+                scores = compute_bertscore_single(ref, pred, model_id, per_section=True)
+                if not scores:
+                    col_data["bertscore_global_f1"].append(None)
+                    for tag in ["S", "O", "A", "P"]:
+                        col_data[f"bertscore_{tag}_f1"].append(None)
+                else:
+                    col_data["bertscore_global_f1"].append(scores.get("bertscore_global_f1"))
+                    for tag in ["S", "O", "A", "P"]:
+                        col_data[f"bertscore_{tag}_f1"].append(scores.get(f"bertscore_{tag}_f1"))
+            add.update(col_data)
+        else:
+            col_name = f"bertscore_{short}_f1"
+            scorer = get_bertscore_scorer(model_id)
+            tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+            f1_list = []
+            for pred, ref in zip(preds, refs):
+                try:
+                    pred_chunks = chunk_text_with_stride(pred, tokenizer)
+                    ref_chunks = chunk_text_with_stride(ref, tokenizer)
+                    paired = list(zip(pred_chunks, ref_chunks))
+                    if not paired:
+                        f1_list.append(None)
+                        continue
+                    per_pair_f1s = []
+                    for pc, rc in paired:
+                        _, _, F1 = scorer.score([pc], [rc])
+                        per_pair_f1s.append(float(F1[0]))
+                    avg_f1 = sum(per_pair_f1s) / len(per_pair_f1s) if per_pair_f1s else None
+                    f1_list.append(avg_f1)
+                except Exception:
+                    f1_list.append(None)
+            add[col_name] = f1_list
+
+    return pd.DataFrame(add, index=df.index)
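A quick way to exercise the helpers above from a Python shell (a minimal sketch; it assumes the listed checkpoints can be downloaded and that utils.file_utils is importable — the example strings are purely hypothetical):

    from metrics.bertscore import compute_bertscore_single

    f1 = compute_bertscore_single(
        reference="Paciente relata dor torácica há dois dias.",   # hypothetical reference text
        prediction="Paciente refere dor no peito há dois dias.",  # hypothetical generated text
        model_type="neuralmind/bert-base-portuguese-cased",
        per_section=False,
    )
    print(f1)  # global F1 in 0..1, or None if scoring failed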
metrics/bleu.py
ADDED
@@ -0,0 +1,41 @@
+# metrics/bleu.py
+"""
+BLEU metric wrappers using sacreBLEU and file_utils.
+"""
+from utils.file_utils import *
+from sacrebleu.metrics import BLEU
+
+# Global BLEU instance with 'intl' tokenization, lowercasing, and 'exp' smoothing
+_bleu_scorer = BLEU(tokenize='intl', lowercase=True, smooth_method='exp')
+
+def section_bleu(gen_txt: str, ref_txt: str) -> float:
+    """
+    Compute BLEU for a pair of strings (one section), returning a score from 0 to 100.
+    """
+    if not gen_txt.strip() and not ref_txt.strip():
+        return 100.0
+    if (not gen_txt.strip()) ^ (not ref_txt.strip()):
+        return 0.0
+    return _bleu_scorer.sentence_score(gen_txt, [ref_txt]).score
+
+def full_bleu(gen_raw: str, ref_raw: str) -> float:
+    """
+    Compute global BLEU for full texts, returning a score from 0 to 100.
+    """
+    gen = normalize_and_flatten(gen_raw)
+    ref = normalize_and_flatten(ref_raw)
+    if not gen and not ref:
+        return 100.0
+    if (not gen) ^ (not ref):
+        return 0.0
+    return _bleu_scorer.sentence_score(gen, [ref]).score
+
+
+def compute_bleu_single(reference: str, prediction: str) -> str:
+    """
+    Compute and format BLEU score for a single pair.
+    """
+    if not reference or not prediction:
+        return "Please provide both texts."
+    score = full_bleu(prediction, reference) / 100.0
+    return f"BLEU Score: {score:.4f}"
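The sacreBLEU wrappers above work on a 0–100 scale; only compute_bleu_single rescales to 0–1 for display. A minimal usage sketch (assumes utils.file_utils provides normalize_and_flatten, as the star import implies; the strings are hypothetical):

    from metrics.bleu import section_bleu, full_bleu, compute_bleu_single

    gen = "patient denies fever"       # hypothetical generated section
    ref = "the patient denies fever"   # hypothetical reference section
    print(section_bleu(gen, ref))         # sentence-level BLEU, 0..100
    print(full_bleu(gen, ref))            # BLEU on normalized/flattened full texts, 0..100
    print(compute_bleu_single(ref, gen))  # "BLEU Score: 0.xxxx" (rescaled to 0..1)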
metrics/bleurt.py
ADDED
@@ -0,0 +1,30 @@
+# metrics/bleurt.py
+import os
+# disable GPU (and XLA/PTX) so BLEURT runs on CPU and avoids the libdevice error
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+
+"""
+BLEURT metric wrappers using HuggingFace evaluate.
+"""
+from evaluate import load
+
+_hf_bleurt = None
+
+
+def get_hf_bleurt():
+    global _hf_bleurt
+    if _hf_bleurt is None:
+        _hf_bleurt = load("bleurt", module_type="metric", checkpoint="BLEURT-20")
+    return _hf_bleurt
+
+
+def compute_bleurt_single(reference: str, prediction: str) -> str:
+    """
+    Compute and format BLEURT score for a single pair.
+    """
+    if not reference or not prediction:
+        return "Please provide both texts."
+    bleurt = get_hf_bleurt()
+    result = bleurt.compute(predictions=[prediction], references=[reference])
+    return f"BLEURT Score: {result['scores'][0]:.4f}"
metrics/core.py
ADDED
@@ -0,0 +1,122 @@
+# metrics/core.py
+"""
+Orchestrates batch computation of selected metrics for the CSV-upload tab.
+"""
+import pandas as pd
+from .bleu import compute_bleu_single, section_bleu, full_bleu
+from .bleurt import get_hf_bleurt, compute_bleurt_single
+from .rouge import get_hf_rouge, compute_rouge_single, rougeL_score
+from .bertscore import compute_batch_bertscore
+from utils.file_utils import extract_sections, has_sections
+
+
+def compute_all_metrics_batch(
+    df: pd.DataFrame,
+    selected_metrics: list = None,
+    bert_models: list | None = None
+) -> pd.DataFrame:
+    if selected_metrics is None:
+        selected_metrics = ["BLEU"]
+
+    df = df.dropna(
+        subset=["dsc_reference_free_text", "dsc_generated_clinical_report"]
+    ).copy()
+
+    if "code_audio_transcription" not in df.columns:
+        df["code_audio_transcription"] = list(range(len(df)))
+
+    df["has_sections"] = df.apply(
+        lambda r: has_sections(r["dsc_reference_free_text"])
+        and has_sections(r["dsc_generated_clinical_report"]),
+        axis=1
+    )
+
+    only_one_metric = len(selected_metrics) == 1
+    core_metrics = [m for m in selected_metrics if m in {"BLEU", "BLEURT", "ROUGE"}]
+    only_bertscore_alone = only_one_metric and selected_metrics == ["BERTSCORE"]
+    out_cols = ["code_audio_transcription"]
+    tags = ["S", "O", "A", "P"]
+
+    # BLEU
+    if "BLEU" in selected_metrics:
+        if only_one_metric and "BLEU" in selected_metrics:
+            for tag in tags:
+                def _sec_bleu(row, tag=tag):
+                    gen = extract_sections(row["dsc_generated_clinical_report"])[tag]
+                    ref = extract_sections(row["dsc_reference_free_text"])[tag]
+                    if row["has_sections"] and gen and ref:
+                        return section_bleu(gen, ref) / 100.0
+                    return None
+                df[f"bleu_{tag}"] = df.apply(_sec_bleu, axis=1)
+                out_cols.append(f"bleu_{tag}")
+        df["bleu_global"] = df.apply(
+            lambda r: full_bleu(
+                r["dsc_generated_clinical_report"],
+                r["dsc_reference_free_text"]
+            ) / 100.0,
+            axis=1
+        )
+        out_cols.append("bleu_global")
+
+    # BLEURT
+    if "BLEURT" in selected_metrics:
+        bleurt = get_hf_bleurt()
+        if only_one_metric and "BLEURT" in selected_metrics:
+            for tag in tags:
+                idxs, gens, refs = [], [], []
+                for i, row in df.iterrows():
+                    gen = extract_sections(row["dsc_generated_clinical_report"])[tag]
+                    ref = extract_sections(row["dsc_reference_free_text"])[tag]
+                    if row["has_sections"] and gen and ref:
+                        idxs.append(i); gens.append(gen); refs.append(ref)
+                scores = (
+                    bleurt.compute(predictions=gens, references=refs)["scores"]
+                    if gens else []
+                )
+                col = [None] * len(df)
+                for i, sc in zip(idxs, scores):
+                    col[i] = sc
+                df[f"bleurt_{tag}"] = col
+                out_cols.append(f"bleurt_{tag}")
+        df["bleurt_global"] = bleurt.compute(
+            predictions=df["dsc_generated_clinical_report"].tolist(),
+            references=df["dsc_reference_free_text"].tolist()
+        )["scores"]
+        out_cols.append("bleurt_global")
+
+    # ROUGE-L
+    if "ROUGE" in selected_metrics:
+        if only_one_metric and "ROUGE" in selected_metrics:
+            for tag in tags:
+                df[f"rougeL_{tag}"] = df.apply(
+                    lambda row: rougeL_score(
+                        extract_sections(row["dsc_generated_clinical_report"])[tag],
+                        extract_sections(row["dsc_reference_free_text"])[tag]
+                    ) if row["has_sections"] else None,
+                    axis=1
+                )
+                out_cols.append(f"rougeL_{tag}")
+        df["rougeL_global"] = df.apply(
+            lambda row: rougeL_score(
+                row["dsc_generated_clinical_report"],
+                row["dsc_reference_free_text"]
+            ),
+            axis=1
+        )
+        out_cols.append("rougeL_global")
+
+    # BERTScore
+    if "BERTSCORE" in (selected_metrics or []) and bert_models:
+        per_section_bertscore = only_bertscore_alone and bert_models and len(bert_models) == 1
+        bert_df = compute_batch_bertscore(df, bert_models, per_section=per_section_bertscore)
+        for col in bert_df.columns:
+            df[col] = bert_df[col]
+            out_cols.append(col)
+
+    # clip BLEU
+    for c in df.columns:
+        if c.startswith("bleu_"):
+            df[c] = df[c].clip(0.0, 1.0)
+
+    return df[out_cols]
+
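A minimal batch-usage sketch for the orchestrator above (the column names are the ones it expects; resetting the index is an added precaution, since the per-section BLEURT branch writes scores back into a positional list keyed by the DataFrame index):

    import pandas as pd
    from metrics.core import compute_all_metrics_batch

    df = pd.DataFrame({
        "code_audio_transcription": [1, 2],                                   # hypothetical IDs
        "dsc_reference_free_text": ["reference one", "reference two"],        # hypothetical references
        "dsc_generated_clinical_report": ["generated one", "generated two"],  # hypothetical outputs
    }).reset_index(drop=True)  # keep a 0..n-1 index so positional writes line up

    result = compute_all_metrics_batch(df, selected_metrics=["BLEU"])
    print(result[["code_audio_transcription", "bleu_global"]])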
metrics/rouge.py
ADDED
@@ -0,0 +1,38 @@
+# metrics/rouge.py
+"""
+ROUGE metric wrappers using HuggingFace evaluate.
+"""
+from evaluate import load
+
+_hf_rouge = None
+
+
+def get_hf_rouge():
+    global _hf_rouge
+    if _hf_rouge is None:
+        _hf_rouge = load("rouge")
+    return _hf_rouge
+
+
+def compute_rouge_single(reference: str, prediction: str) -> str:
+    """
+    Compute and format ROUGE-L score for a single pair.
+    """
+    if not reference or not prediction:
+        return "Please provide both texts."
+    rouge = get_hf_rouge()
+    res = rouge.compute(predictions=[prediction], references=[reference], rouge_types=["rougeL"])
+    score = res["rougeL"]
+    if isinstance(score, (list, tuple)):
+        score = score[0]
+    return f"ROUGE-L Score: {score:.4f}"
+
+
+def rougeL_score(pred: str, ref: str) -> float:
+    """
+    Raw ROUGE-L score (0..1) for text pairs.
+    """
+    rouge = get_hf_rouge()
+    res = rouge.compute(predictions=[pred], references=[ref], rouge_types=["rougeL"])
+    s = res["rougeL"]
+    return s[0] if isinstance(s, (list, tuple)) else s
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+gradio>=5.40.0
+pandas>=2.0.0
+evaluate>=0.4.5
+transformers>=4.54.1
+tokenizers>=0.21.4
+bert-score>=0.3.13
+bleurt@git+https://github.com/google-research/bleurt.git@cebe7e6f996b40910cfaa520a63db47807e3bf5c
+sacrebleu>=2.5.1
+rouge_score>=0.1.2
+chardet
ui/__init__.py
ADDED
@@ -0,0 +1,8 @@
+# ui/__init__.py
+"""
+UI package: exports tab builders.
+"""
+__all__ = ["build_manual_tab", "build_csv_tab"]
+
+from .manual_tab import build_manual_tab
+from .csv_tab import build_csv_tab
ui/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (363 Bytes)

ui/__pycache__/common.cpython-312.pyc
ADDED
Binary file (841 Bytes)

ui/__pycache__/csv_tab.cpython-312.pyc
ADDED
Binary file (13.7 kB)

ui/__pycache__/manual_tab.cpython-312.pyc
ADDED
Binary file (3.74 kB)

ui/__pycache__/widgets.cpython-312.pyc
ADDED
Binary file (1.1 kB)
ui/common.py
ADDED
@@ -0,0 +1,15 @@
+import gradio as gr
+
+
+def toggle_manual_visibility(selected_metrics):
+    """
+    Returns visibility updates for manual-tab outputs:
+    (BLEU_out, BLEURT_out, ROUGE_out, BERT_out, BERT_model_selector)
+    """
+    return (
+        gr.update(visible="BLEU" in selected_metrics),
+        gr.update(visible="BLEURT" in selected_metrics),
+        gr.update(visible="ROUGE" in selected_metrics),
+        gr.update(visible="BERTSCORE" in selected_metrics),
+        gr.update(visible="BERTSCORE" in selected_metrics),
+    )
ui/csv_tab.py
ADDED
@@ -0,0 +1,238 @@
+# ui/csv_tab.py
+
+"""
+Builds the CSV-upload tab (batch metrics).
+"""
+import gradio as gr
+import pandas as pd
+
+from metrics import compute_all_metrics_batch, BERT_FRIENDLY_TO_MODEL
+from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup
+from utils.file_utils import smart_read_csv
+from utils.colors_utils import df_to_colored_html, get_metric_color
+from ui.common import toggle_manual_visibility as _toggle  # reuse for BERT selector
+from utils.tokenizer_refgen import generate_diff_html
+
+# ------------------- Summary HTML builder -------------------
+def build_summary_html(result_df: pd.DataFrame, selected_metrics: list, bert_models: list | None = None) -> str:
+    def safe_stats(col):
+        if col not in result_df.columns:
+            return None
+        series = result_df[col].dropna().astype(float)
+        if series.empty:
+            return None
+        avg, mn, mx = series.mean(), series.min(), series.max()
+        def audio_id_for(v):
+            subset = result_df[result_df[col].astype(float) == v]
+            if not subset.empty and "code_audio_transcription" in subset.columns:
+                aid = subset.iloc[0]["code_audio_transcription"]
+                try: return int(aid)
+                except: return aid
+            return ""
+        return {"avg": avg, "min": mn, "min_id": audio_id_for(mn), "max": mx, "max_id": audio_id_for(mx)}
+
+    rows = []
+    # Core metrics
+    core = [m for m in selected_metrics if m in {"BLEU","BLEURT","ROUGE"}]
+    if len(core) == 1:
+        m = core[0]
+        prefix = {"BLEU":"bleu_","BLEURT":"bleurt_","ROUGE":"rougeL_"}[m]
+        for tag in ('S','O','A','P'):
+            s = safe_stats(f"{prefix}{tag}")
+            if s: rows.append((f"{prefix}{tag}", s))
+        g = safe_stats(f"{prefix}global")
+        if g: rows.append((f"{prefix}global", g))
+    else:
+        for m,pref in [("BLEU","bleu_global"),("BLEURT","bleurt_global"),("ROUGE","rougeL_global")]:
+            if m in selected_metrics:
+                s = safe_stats(pref)
+                if s: rows.append((pref, s))
+
+    # BERTScore
+    if "BERTSCORE" in selected_metrics and bert_models:
+        only_bs = selected_metrics == ["BERTSCORE"]
+        single = len(bert_models) == 1
+        per_sec = only_bs and single
+        for friendly in bert_models:
+            mid = BERT_FRIENDLY_TO_MODEL[friendly]
+            short = mid.split("/")[-1].replace("-","_")
+            if per_sec:
+                for tag in ('S','O','A','P'):
+                    s = safe_stats(f"bertscore_{tag}_f1")
+                    if s: rows.append((f"bertscore_{tag}_f1", s))
+                sg = safe_stats("bertscore_global_f1")
+                if sg: rows.append(("bertscore_global_f1", sg))
+            else:
+                s = safe_stats(f"bertscore_{short}_f1")
+                if s: rows.append((f"bertscore_{short}_f1", s))
+
+    if not rows:
+        return "<div style='padding:8px;background:#1f1f1f;color:#eee;border-radius:6px;'>No summary available.</div>"
+
+    html = """
+    <div style="margin-bottom:12px;overflow-x:auto;">
+      <div style="font-weight:600;margin-bottom:4px;color:#f5f5f5;font-size:16px;">Summary Statistics</div>
+      <table style="border-collapse:collapse;width:100%;font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,sans-serif;border-radius:8px;overflow:hidden;min-width:500px;">
+        <thead>
+          <tr>
+            <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:left;font-weight:600;">Metric</th>
+            <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Avg</th>
+            <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Min (ID)</th>
+            <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Max (ID)</th>
+          </tr>
+        </thead><tbody>
+    """
+    for col, stat in rows:
+        if col.startswith("bertscore_"):
+            name = "BERTScore Global" if col=="bertscore_global_f1" else f"BERTScore {col.split('_')[1].upper()}"
+        else:
+            name = col.replace("_"," ").upper()
+        avg = f"{stat['avg']:.4f}"
+        mn = f"{stat['min']:.4f} ({stat['min_id']})" if stat['min_id'] else f"{stat['min']:.4f}"
+        mx = f"{stat['max']:.4f} ({stat['max_id']})" if stat['max_id'] else f"{stat['max']:.4f}"
+        if col.startswith("bleu_"):
+            ca,cm,cx = get_metric_color(stat['avg'], "BLEU"), get_metric_color(stat['min'], "BLEU"), get_metric_color(stat['max'], "BLEU")
+        elif col.startswith("bleurt_"):
+            ca,cm,cx = get_metric_color(stat['avg'], "BLEURT"), get_metric_color(stat['min'], "BLEURT"), get_metric_color(stat['max'], "BLEURT")
+        elif col.startswith("rougeL_"):
+            ca,cm,cx = get_metric_color(stat['avg'], "ROUGE"), get_metric_color(stat['min'], "ROUGE"), get_metric_color(stat['max'], "ROUGE")
+        else:
+            ca,cm,cx = get_metric_color(stat['avg'], "BERTSCORE"), get_metric_color(stat['min'], "BERTSCORE"), get_metric_color(stat['max'], "BERTSCORE")
+        html += f"""
+        <tr style="background:#0f1218;">
+          <td style="padding:8px 12px;border:1px solid #2f3240;color:#fff;white-space:nowrap;">{name}</td>
+          <td style="padding:8px 12px;border:1px solid #2f3240;background:{ca};color:#fff;text-align:center;white-space:nowrap;">{avg}</td>
+          <td style="padding:8px 12px;border:1px solid #2f3240;background:{cm};color:#fff;text-align:center;white-space:nowrap;">{mn}</td>
+          <td style="padding:8px 12px;border:1px solid #2f3240;background:{cx};color:#fff;text-align:center;white-space:nowrap;">{mx}</td>
+        </tr>
+        """
+    html += "</tbody></table></div>"
+    return html
+
+def build_csv_tab():
+    with gr.Blocks() as tab:
+        state_df = gr.State()
+
+        gr.Markdown("# Run an Experiment via CSV upload")
+        gr.Markdown(
+            "This section lets you upload a CSV of reference/generated text pairs, "
+            "select which columns correspond to reference, generated output, and a shared ID, "
+            "choose metrics (including BERTScore variants), and compute scores in bulk."
+        )
+
+        # 1. Upload CSV
+        gr.Markdown("## Upload CSV")
+        gr.Markdown(
+            "Provide a CSV file containing your data. It should include columns for the reference text, "
+            "the generated text, and an identifier (e.g., audio ID)."
+        )
+        with gr.Row():
+            file_input = gr.File(label="Upload CSV", file_types=[".csv"])
+            status = gr.Text(label="Status")
+
+        # 2. Map Columns
+        gr.Markdown("## Map Columns")
+        gr.Markdown(
+            "Select which columns in your CSV correspond to the reference text, generated text, and audio/example ID."
+        )
+        with gr.Row(visible=False) as mapping:
+            ref_col = gr.Dropdown(label="Reference Column", choices=[])
+            gen_col = gr.Dropdown(label="Generated Column", choices=[])
+            id_col = gr.Dropdown(label="Audio ID Column", choices=[])
+
+        # 3. Select Metrics
+        gr.Markdown("## Select Metrics")
+        metric_selector = MetricCheckboxGroup()
+        bert_model_selector = BertCheckboxGroup()
+
+        # 4. Compute
+        gr.Markdown("## Run Evaluation")
+        run_btn = gr.Button("Run an Evaluation")
+        output_status = gr.Text()
+        summary_output = gr.HTML()
+        table_output = gr.HTML()
+
+        # 5. Inspect an Example
+        gr.Markdown("## Inspect an Example")
+        gr.Markdown(
+            "Pick an example by its ID to view the reference vs generated text with token-level differences highlighted."
+        )
+        with gr.Accordion("🔍 Show reference & generated text", open=False):
+            pick_id = gr.Dropdown(label="Pick an Audio ID", choices=[])
+            ref_disp = gr.Textbox(label="Reference Text", lines=6, interactive=False)
+            gen_disp = gr.Textbox(label="Generated Text", lines=6, interactive=False)
+            diff_disp = gr.HTML()
+
+        # --- Handlers ---
+
+        def handle_upload(f):
+            if not f:
+                return None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]), gr.update(visible=False), ""
+            df = smart_read_csv(f.name)
+            cols = df.columns.tolist()
+            return (
+                df,
+                gr.update(choices=cols, value=None),
+                gr.update(choices=cols, value=None),
+                gr.update(choices=cols, value=None),
+                gr.update(visible=True),
+                "Upload OK",
+            )
+
+        def run_batch(df, r, g, i, mets, berts):
+            if df is None:
+                return "No data.", "", "", gr.update(choices=[])
+            sub = df[[i, r, g]].rename(
+                columns={i: "code_audio_transcription", r: "dsc_reference_free_text", g: "dsc_generated_clinical_report"}
+            )
+            result = compute_all_metrics_batch(sub, mets, berts if "BERTSCORE" in (mets or []) else None)
+
+            # Cast IDs to Python int to avoid np.int64 dropdown issues
+            raw_ids = result["code_audio_transcription"].dropna().unique()
+            ids = sorted(int(x) for x in raw_ids)
+
+            summary = build_summary_html(result, mets, berts if "BERTSCORE" in (mets or []) else None)
+            table = df_to_colored_html(result)
+            return "Metrics computed successfully.", summary, table, gr.update(choices=ids)
+
+        def show_example(df, audio_id):
+            if df is None or audio_id is None:
+                return "", "", ""
+            row = df[df["code_audio_transcription"] == audio_id]
+            if row.empty:
+                try:
+                    row = df[df["code_audio_transcription"] == float(audio_id)]
+                except:
+                    return "", "", ""
+            row = row.iloc[0]
+            return (
+                row["dsc_reference_free_text"],
+                row["dsc_generated_clinical_report"],
+                generate_diff_html(row["dsc_reference_free_text"], row["dsc_generated_clinical_report"])
+            )
+
+        # --- Wiring ---
+
+        file_input.change(
+            fn=handle_upload,
+            inputs=[file_input],
+            outputs=[state_df, ref_col, gen_col, id_col, mapping, status],
+        )
+        metric_selector.change(
+            lambda ms: gr.update(visible="BERTSCORE" in ms),
+            inputs=[metric_selector],
+            outputs=[bert_model_selector],
+        )
+        run_btn.click(
+            fn=run_batch,
+            inputs=[state_df, ref_col, gen_col, id_col, metric_selector, bert_model_selector],
+            outputs=[output_status, summary_output, table_output, pick_id],
+        )
+        pick_id.change(
+            fn=show_example,
+            inputs=[state_df, pick_id],
+            outputs=[ref_disp, gen_disp, diff_disp],
+        )
+
+    return tab
+
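The upload tab expects a CSV with at least three columns to map; a minimal sketch of a compatible file (the column names here are arbitrary, since they are chosen in the "Map Columns" dropdowns and renamed internally to the canonical names used by the metrics):

    # Sketch: generate a small CSV compatible with the upload tab
    import pandas as pd

    pd.DataFrame({
        "audio_id": [101, 102],                                           # hypothetical IDs
        "reference_text": ["reference report 1", "reference report 2"],   # hypothetical references
        "generated_text": ["generated report 1", "generated report 2"],   # hypothetical model outputs
    }).to_csv("example_pairs.csv", index=False)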
ui/manual_tab.py
ADDED
@@ -0,0 +1,68 @@
+# ui/manual_tab.py
+"""
+Builds the manual-evaluation tab (single pair metrics).
+"""
+import gradio as gr
+from metrics import (
+    compute_bleu_single,
+    compute_bleurt_single,
+    compute_rouge_single,
+    compute_bertscore_single,
+    BERT_FRIENDLY_TO_MODEL,
+)
+from ui.common import toggle_manual_visibility
+from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup
+
+
+def build_manual_tab():
+    with gr.Blocks() as tab:
+        gr.Markdown("## Manual Evaluation")
+        gr.Markdown("Compute selected metrics (BLEU, BLEURT, ROUGE, BERTScore) for a single pair of texts.")
+
+        with gr.Row():
+            reference_input = gr.Textbox(label="Reference Text", lines=3)
+            generated_input = gr.Textbox(label="Generated Text", lines=3)
+
+        metric_selector = MetricCheckboxGroup()
+        bert_model_selector = BertCheckboxGroup()
+
+        with gr.Row():
+            run_btn = gr.Button("Run an Evaluation")
+            clear_btn = gr.Button("Clear")
+
+        bleu_out = gr.Textbox(label="BLEU Score", interactive=False)
+        bleurt_out = gr.Textbox(label="BLEURT Score", interactive=False)
+        rouge_out = gr.Textbox(label="ROUGE Score", interactive=False)
+        bert_out = gr.Textbox(label="BERTScore Results", interactive=False)
+
+        def compute_manual(reference, generated, metrics, berts):
+            bleu = compute_bleu_single(reference, generated) if "BLEU" in metrics else ""
+            bleurt = compute_bleurt_single(reference, generated) if "BLEURT" in metrics else ""
+            rouge = compute_rouge_single(reference, generated) if "ROUGE" in metrics else ""
+            bertscore = ""
+            if "BERTSCORE" in metrics and berts:
+                parts = []
+                for f in berts:
+                    mid = BERT_FRIENDLY_TO_MODEL[f]
+                    score = compute_bertscore_single(reference, generated, mid, per_section=False)
+                    parts.append(f"{f} Global F1: {score:.4f}" if score is not None else f"{f}: error")
+                bertscore = "\n".join(parts)
+            return bleu, bleurt, rouge, bertscore
+
+        run_btn.click(
+            fn=compute_manual,
+            inputs=[reference_input, generated_input, metric_selector, bert_model_selector],
+            outputs=[bleu_out, bleurt_out, rouge_out, bert_out],
+        )
+        metric_selector.change(
+            fn=toggle_manual_visibility,
+            inputs=[metric_selector],
+            outputs=[bleu_out, bleurt_out, rouge_out, bert_out, bert_model_selector],
+        )
+        clear_btn.click(
+            fn=lambda: ("", "", "", "", "", ["BLEU"], [list(BERT_FRIENDLY_TO_MODEL.keys())[0]]),  # one value per output below
+            inputs=[],
+            outputs=[reference_input, generated_input, bleu_out, bleurt_out, rouge_out, metric_selector, bert_model_selector],
+        )
+
+    return tab
ui/widgets.py
ADDED
@@ -0,0 +1,25 @@
+# ui/widgets.py
+"""
+Factory functions for common Gradio widgets.
+"""
+import gradio as gr
+from metrics import BERT_FRIENDLY_TO_MODEL
+from config import METRIC_CHOICES, DEFAULT_METRICS, DEFAULT_BERTS
+
+
+def MetricCheckboxGroup(label="Which metrics to compute", default=None, visible=True):
+    return gr.CheckboxGroup(
+        choices=METRIC_CHOICES,
+        label=label,
+        value=default or DEFAULT_METRICS,
+        visible=visible,
+    )
+
+
+def BertCheckboxGroup(label="Which BERT models (for BERTScore)", default=None, visible=False):
+    return gr.CheckboxGroup(
+        choices=list(BERT_FRIENDLY_TO_MODEL.keys()),
+        label=label,
+        value=default or DEFAULT_BERTS,
+        visible=visible,
+    )