Guilherme committed on
Commit 4b112ae · 1 Parent(s): ca4ba7f

Deploy to HF Space

app.py ADDED
@@ -0,0 +1,39 @@
+ # app.py
+ import os
+ import threading
+ import gradio as gr
+
+ from metrics import get_hf_bleurt, get_hf_rouge
+ from ui.manual_tab import build_manual_tab
+ from ui.csv_tab import build_csv_tab
+
+ # (Optional) force CPU so TensorFlow/BLEURT doesn't try CUDA on Spaces
+ os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")
+
+
+ def create_interface() -> gr.TabbedInterface:
+     return gr.TabbedInterface(
+         interface_list=[
+             build_manual_tab(),
+             build_csv_tab(),
+         ],
+         tab_names=["Manual Input", "CSV Upload"],
+     )
+
+
+ if __name__ == "__main__":
+     # Preload heavy HF metrics so the UI stays responsive
+     threading.Thread(target=get_hf_bleurt, daemon=True).start()
+     threading.Thread(target=get_hf_rouge, daemon=True).start()
+
+     app = create_interface()
+
+     # Spaces (and most PaaS) provide PORT; default to 7860 for local runs
+     port = int(os.getenv("PORT", "7860"))
+
+     # Use the queue for long-running tasks to avoid timeouts
+     # (Gradio 4+ renamed queue's `concurrency_count` to `default_concurrency_limit`)
+     app.queue(default_concurrency_limit=1, max_size=20).launch(
+         server_name="0.0.0.0",
+         server_port=port,
+         show_error=True,
+     )
config.py ADDED
@@ -0,0 +1,11 @@
+ from metrics import BERT_FRIENDLY_TO_MODEL
+
+ # Available evaluation metrics for selection
+ METRIC_CHOICES = ["BLEU", "BLEURT", "ROUGE", "BERTSCORE"]
+ # Default metrics pre-selected in the UI
+ DEFAULT_METRICS = ["BLEU"]
+
+ # Available BERT models for BERTScore
+ BERT_CHOICES = list(BERT_FRIENDLY_TO_MODEL.keys())
+ # Default BERT model pre-selected in the UI
+ DEFAULT_BERTS = [BERT_CHOICES[0]]
metrics/__init__.py ADDED
@@ -0,0 +1,21 @@
+ # metrics/__init__.py
+ """
+ Central metrics entrypoint: import and expose all metric functions.
+ """
+ from .core import compute_all_metrics_batch
+ from .bleu import compute_bleu_single, section_bleu, full_bleu
+ from .bleurt import compute_bleurt_single, get_hf_bleurt
+ from .rouge import compute_rouge_single, get_hf_rouge
+ from .bertscore import compute_bertscore_single, BERT_FRIENDLY_TO_MODEL, BERT_MODEL_TO_FRIENDLY
+
+ __all__ = [
+     "compute_all_metrics_batch",
+     "compute_bleu_single",
+     "compute_bleurt_single",
+     "compute_rouge_single",
+     "get_hf_bleurt",
+     "get_hf_rouge",
+     "compute_bertscore_single",
+     "BERT_FRIENDLY_TO_MODEL",
+     "BERT_MODEL_TO_FRIENDLY",
+ ]
metrics/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (776 Bytes).
metrics/__pycache__/bertscore.cpython-312.pyc ADDED
Binary file (8.73 kB).
metrics/__pycache__/bleu.cpython-312.pyc ADDED
Binary file (1.93 kB).
metrics/__pycache__/bleurt.cpython-312.pyc ADDED
Binary file (1.11 kB).
metrics/__pycache__/core.cpython-312.pyc ADDED
Binary file (5.76 kB).
metrics/__pycache__/rouge.cpython-312.pyc ADDED
Binary file (1.59 kB).
metrics/bertscore.py ADDED
@@ -0,0 +1,187 @@
+ # metrics/bertscore.py
+ """
+ BERTScore helpers: scorer init, single and batch computation.
+ """
+ from functools import lru_cache
+
+ import pandas as pd
+ from bert_score import BERTScorer
+ from transformers import AutoTokenizer
+
+ from utils.file_utils import extract_sections, has_sections
+
+ # manual layer mapping
+ _MANUAL_BERT_LAYERS = {
+     "neuralmind/bert-base-portuguese-cased": 12,
+     "pucpr/biobertpt-clin": 12,
+     "xlm-roberta-large": 24,
+ }
+
+ # friendly label ↔ model id mapping
+ BERT_FRIENDLY_TO_MODEL = {
+     "Portuguese (Br) Bert": "neuralmind/bert-base-portuguese-cased",
+     "Portuguese (Br) Clinical BioBert": "pucpr/biobertpt-clin",
+     "Multilingual BERT (XLM-RoBERTa)": "xlm-roberta-large",
+ }
+ BERT_MODEL_TO_FRIENDLY = {v: k for k, v in BERT_FRIENDLY_TO_MODEL.items()}
+
+ _USE_RESCALE_BASELINE = False
+
+
+ @lru_cache(maxsize=6)
+ def get_bertscore_scorer(model_type: str):
+     lang = "pt" if any(model_type.startswith(p) for p in ("neuralmind", "pucpr")) else ""
+     num_layers = _MANUAL_BERT_LAYERS.get(model_type)
+     kwargs = {"lang": lang, "rescale_with_baseline": _USE_RESCALE_BASELINE}
+     if num_layers is not None:
+         kwargs["num_layers"] = num_layers
+     return BERTScorer(model_type=model_type, **kwargs)
+
+
+ def chunk_text_with_stride(text: str, tokenizer, max_len: int = 512, stride: int = 50):
+     """Split long texts into overlapping, decodable chunks that fit the model window."""
+     ids = tokenizer.encode(text, add_special_tokens=True)
+     if len(ids) <= max_len:
+         return [tokenizer.decode(ids, skip_special_tokens=True)]
+     chunks, step = [], max_len - stride
+     for i in range(0, len(ids), step):
+         subset = ids[i:i + max_len]
+         if not subset:
+             break
+         chunks.append(tokenizer.decode(subset, skip_special_tokens=True))
+         if i + max_len >= len(ids):
+             break
+     return chunks
+
+
+ def compute_bertscore_single(reference: str, prediction: str, model_type: str, per_section: bool = False):
+     """
+     If per_section=False: returns the global F1 as a float (0..1), or None on error.
+     If per_section=True: returns a dict with keys:
+       - bertscore_S_f1, bertscore_O_f1, bertscore_A_f1, bertscore_P_f1 (if sections exist)
+       - bertscore_global_f1
+     Missing/invalid sections are omitted or set to None.
+     """
+     if not reference or not prediction:
+         return None if not per_section else {}
+
+     try:
+         scorer = get_bertscore_scorer(model_type)
+         tokenizer = AutoTokenizer.from_pretrained(model_type, use_fast=True)
+
+         def score_pair(pred_text, ref_text):
+             if not pred_text or not ref_text:
+                 return None
+             try:
+                 _, _, F1 = scorer.score([pred_text], [ref_text])
+                 return float(F1[0])
+             except Exception:
+                 return None
+
+         # global score: average F1 over aligned chunk pairs
+         pred_chunks = chunk_text_with_stride(prediction, tokenizer)
+         ref_chunks = chunk_text_with_stride(reference, tokenizer)
+         paired = list(zip(pred_chunks, ref_chunks))
+         global_f1s = []
+         for pc, rc in paired:
+             f1 = score_pair(pc, rc)
+             if f1 is not None:
+                 global_f1s.append(f1)
+         global_avg = sum(global_f1s) / len(global_f1s) if global_f1s else 0.0
+
+         if not per_section:
+             return global_avg
+
+         out = {}
+         out["bertscore_global_f1"] = global_avg
+
+         # per-section scores only if both texts have SOAP sections
+         ref_has = has_sections(reference)
+         pred_has = has_sections(prediction)
+         if ref_has and pred_has:
+             sections_ref = extract_sections(reference)
+             sections_pred = extract_sections(prediction)
+             for tag in ["S", "O", "A", "P"]:
+                 pred_sec = sections_pred.get(tag, "")
+                 ref_sec = sections_ref.get(tag, "")
+                 if pred_sec and ref_sec:
+                     pred_chunks = chunk_text_with_stride(pred_sec, tokenizer)
+                     ref_chunks = chunk_text_with_stride(ref_sec, tokenizer)
+                     paired_sec = list(zip(pred_chunks, ref_chunks))
+                     f1s = []
+                     for pc, rc in paired_sec:
+                         f1 = score_pair(pc, rc)
+                         if f1 is not None:
+                             f1s.append(f1)
+                     avg_f1 = sum(f1s) / len(f1s) if f1s else 0.0
+                     out[f"bertscore_{tag}_f1"] = avg_f1
+                 else:
+                     out[f"bertscore_{tag}_f1"] = None
+         else:
+             for tag in ["S", "O", "A", "P"]:
+                 out[f"bertscore_{tag}_f1"] = None
+
+         return out
+     except Exception:
+         return None if not per_section else {}
+
+
+ def compute_batch_bertscore(df: pd.DataFrame, bert_models: list, per_section: bool = False) -> pd.DataFrame:
+     """
+     If per_section=True and a single model is selected: returns per-section + global BERTScore columns.
+     Otherwise: only global scores (one column per model).
+     Expects df with columns:
+       - dsc_reference_free_text
+       - dsc_generated_clinical_report
+     bert_models is a list of friendly names (e.g., "Portuguese (Br) Bert").
+     """
+     if not bert_models:
+         return pd.DataFrame(index=df.index)
+
+     preds = df["dsc_generated_clinical_report"].astype(str).tolist()
+     refs = df["dsc_reference_free_text"].astype(str).tolist()
+
+     add = {}
+     single_model = len(bert_models) == 1
+
+     for friendly in bert_models:
+         model_id = BERT_FRIENDLY_TO_MODEL.get(friendly, friendly)
+         short = model_id.split("/")[-1].replace("-", "_")
+
+         if per_section and single_model:
+             col_data = {
+                 "bertscore_global_f1": [],
+                 "bertscore_S_f1": [],
+                 "bertscore_O_f1": [],
+                 "bertscore_A_f1": [],
+                 "bertscore_P_f1": [],
+             }
+             for pred, ref in zip(preds, refs):
+                 scores = compute_bertscore_single(ref, pred, model_id, per_section=True)
+                 if not scores:
+                     col_data["bertscore_global_f1"].append(None)
+                     for tag in ["S", "O", "A", "P"]:
+                         col_data[f"bertscore_{tag}_f1"].append(None)
+                 else:
+                     col_data["bertscore_global_f1"].append(scores.get("bertscore_global_f1"))
+                     for tag in ["S", "O", "A", "P"]:
+                         col_data[f"bertscore_{tag}_f1"].append(scores.get(f"bertscore_{tag}_f1"))
+             add.update(col_data)
+         else:
+             col_name = f"bertscore_{short}_f1"
+             scorer = get_bertscore_scorer(model_id)
+             tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+             f1_list = []
+             for pred, ref in zip(preds, refs):
+                 try:
+                     pred_chunks = chunk_text_with_stride(pred, tokenizer)
+                     ref_chunks = chunk_text_with_stride(ref, tokenizer)
+                     paired = list(zip(pred_chunks, ref_chunks))
+                     if not paired:
+                         f1_list.append(None)
+                         continue
+                     per_pair_f1s = []
+                     for pc, rc in paired:
+                         _, _, F1 = scorer.score([pc], [rc])
+                         per_pair_f1s.append(float(F1[0]))
+                     avg_f1 = sum(per_pair_f1s) / len(per_pair_f1s) if per_pair_f1s else None
+                     f1_list.append(avg_f1)
+                 except Exception:
+                     f1_list.append(None)
+             add[col_name] = f1_list
+
+     return pd.DataFrame(add, index=df.index)
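As a quick sanity check, these helpers can be driven directly from a Python shell. A minimal sketch, assuming the utils package referenced above is importable and that the first call is allowed to download the model from the Hub (the example strings are illustrative):

    from metrics.bertscore import compute_bertscore_single, BERT_FRIENDLY_TO_MODEL

    model_id = BERT_FRIENDLY_TO_MODEL["Portuguese (Br) Bert"]
    reference = "Paciente relata cefaleia há dois dias."
    generated = "Paciente com dor de cabeça há dois dias."
    f1 = compute_bertscore_single(reference, generated, model_id, per_section=False)
    print(f"global F1: {f1:.4f}" if f1 is not None else "scoring failed")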
metrics/bleu.py ADDED
@@ -0,0 +1,41 @@
+ # metrics/bleu.py
+ """
+ BLEU metric wrappers using sacreBLEU and file_utils.
+ """
+ from sacrebleu.metrics import BLEU
+
+ from utils.file_utils import normalize_and_flatten
+
+ # Global BLEU instance with 'intl' tokenization, lowercasing and 'exp' smoothing
+ _bleu_scorer = BLEU(tokenize='intl', lowercase=True, smooth_method='exp')
+
+
+ def section_bleu(gen_txt: str, ref_txt: str) -> float:
+     """
+     Compute BLEU for one pair of section strings, returning a score from 0 to 100.
+     """
+     if not gen_txt.strip() and not ref_txt.strip():
+         return 100.0
+     if (not gen_txt.strip()) ^ (not ref_txt.strip()):
+         return 0.0
+     return _bleu_scorer.sentence_score(gen_txt, [ref_txt]).score
+
+
+ def full_bleu(gen_raw: str, ref_raw: str) -> float:
+     """
+     Compute global BLEU for full strings, returning a score from 0 to 100.
+     """
+     gen = normalize_and_flatten(gen_raw)
+     ref = normalize_and_flatten(ref_raw)
+     if not gen and not ref:
+         return 100.0
+     if (not gen) ^ (not ref):
+         return 0.0
+     return _bleu_scorer.sentence_score(gen, [ref]).score
+
+
+ def compute_bleu_single(reference: str, prediction: str) -> str:
+     """
+     Compute and format the BLEU score for a single pair.
+     """
+     if not reference or not prediction:
+         return "Please provide both texts."
+     score = full_bleu(prediction, reference) / 100.0
+     return f"BLEU Score: {score:.4f}"
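A small usage sketch of the two scales involved, assuming utils.file_utils.normalize_and_flatten is available as imported above (the texts are illustrative):

    from metrics.bleu import full_bleu, compute_bleu_single

    ref = "o paciente nega febre e tosse"
    gen = "paciente nega febre e tosse"
    print(full_bleu(gen, ref))            # sacreBLEU sentence score on the 0-100 scale
    print(compute_bleu_single(ref, gen))  # "BLEU Score: 0.xxxx", the same value divided by 100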
metrics/bleurt.py ADDED
@@ -0,0 +1,30 @@
+ # metrics/bleurt.py
+ """
+ BLEURT metric wrappers using HuggingFace evaluate.
+ """
+ import os
+
+ # disable GPU (and XLA/PTX) so BLEURT runs on CPU and avoids the libdevice error
+ os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+ from evaluate import load
+
+ _hf_bleurt = None
+
+
+ def get_hf_bleurt():
+     """Lazily load and cache the BLEURT-20 metric (the checkpoint is large)."""
+     global _hf_bleurt
+     if _hf_bleurt is None:
+         _hf_bleurt = load("bleurt", module_type="metric", checkpoint="BLEURT-20")
+     return _hf_bleurt
+
+
+ def compute_bleurt_single(reference: str, prediction: str) -> str:
+     """
+     Compute and format the BLEURT score for a single pair.
+     """
+     if not reference or not prediction:
+         return "Please provide both texts."
+     bleurt = get_hf_bleurt()
+     result = bleurt.compute(predictions=[prediction], references=[reference])
+     return f"BLEURT Score: {result['scores'][0]:.4f}"
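Because the BLEURT-20 checkpoint is heavy, the lazy cache is the point of get_hf_bleurt(); a minimal sketch of warming it up ahead of time, which is what app.py does in a background thread (example strings are illustrative):

    from metrics.bleurt import get_hf_bleurt, compute_bleurt_single

    get_hf_bleurt()  # first call downloads and builds the metric; later calls reuse it
    print(compute_bleurt_single("o paciente nega febre", "paciente nega febre"))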
metrics/core.py ADDED
@@ -0,0 +1,122 @@
+ # metrics/core.py
+ """
+ Orchestrates batch computation of the selected metrics for the CSV-upload tab.
+ """
+ import pandas as pd
+
+ from .bleu import compute_bleu_single, section_bleu, full_bleu
+ from .bleurt import get_hf_bleurt, compute_bleurt_single
+ from .rouge import get_hf_rouge, compute_rouge_single, rougeL_score
+ from .bertscore import compute_batch_bertscore
+ from utils.file_utils import extract_sections, has_sections
+
+
+ def compute_all_metrics_batch(
+     df: pd.DataFrame,
+     selected_metrics: list = None,
+     bert_models: list | None = None
+ ) -> pd.DataFrame:
+     if selected_metrics is None:
+         selected_metrics = ["BLEU"]
+
+     # Drop incomplete rows and reset the index so the positional bookkeeping
+     # below (e.g. the per-section BLEURT column fill) lines up with row order.
+     df = df.dropna(
+         subset=["dsc_reference_free_text", "dsc_generated_clinical_report"]
+     ).reset_index(drop=True).copy()
+
+     if "code_audio_transcription" not in df.columns:
+         df["code_audio_transcription"] = list(range(len(df)))
+
+     df["has_sections"] = df.apply(
+         lambda r: has_sections(r["dsc_reference_free_text"])
+         and has_sections(r["dsc_generated_clinical_report"]),
+         axis=1
+     )
+
+     only_one_metric = len(selected_metrics) == 1
+     core_metrics = [m for m in selected_metrics if m in {"BLEU", "BLEURT", "ROUGE"}]
+     only_bertscore_alone = only_one_metric and selected_metrics == ["BERTSCORE"]
+     out_cols = ["code_audio_transcription"]
+     tags = ["S", "O", "A", "P"]
+
+     # BLEU
+     if "BLEU" in selected_metrics:
+         if only_one_metric:
+             # Per-section BLEU is only computed when BLEU is the sole metric
+             for tag in tags:
+                 def _sec_bleu(row, tag=tag):
+                     gen = extract_sections(row["dsc_generated_clinical_report"])[tag]
+                     ref = extract_sections(row["dsc_reference_free_text"])[tag]
+                     if row["has_sections"] and gen and ref:
+                         return section_bleu(gen, ref) / 100.0
+                     return None
+                 df[f"bleu_{tag}"] = df.apply(_sec_bleu, axis=1)
+                 out_cols.append(f"bleu_{tag}")
+         df["bleu_global"] = df.apply(
+             lambda r: full_bleu(
+                 r["dsc_generated_clinical_report"],
+                 r["dsc_reference_free_text"]
+             ) / 100.0,
+             axis=1
+         )
+         out_cols.append("bleu_global")
+
+     # BLEURT
+     if "BLEURT" in selected_metrics:
+         bleurt = get_hf_bleurt()
+         if only_one_metric:
+             # Per-section BLEURT is only computed when BLEURT is the sole metric
+             for tag in tags:
+                 idxs, gens, refs = [], [], []
+                 for i, row in df.iterrows():
+                     gen = extract_sections(row["dsc_generated_clinical_report"])[tag]
+                     ref = extract_sections(row["dsc_reference_free_text"])[tag]
+                     if row["has_sections"] and gen and ref:
+                         idxs.append(i); gens.append(gen); refs.append(ref)
+                 scores = (
+                     bleurt.compute(predictions=gens, references=refs)["scores"]
+                     if gens else []
+                 )
+                 col = [None] * len(df)
+                 for i, sc in zip(idxs, scores):
+                     col[i] = sc
+                 df[f"bleurt_{tag}"] = col
+                 out_cols.append(f"bleurt_{tag}")
+         df["bleurt_global"] = bleurt.compute(
+             predictions=df["dsc_generated_clinical_report"].tolist(),
+             references=df["dsc_reference_free_text"].tolist()
+         )["scores"]
+         out_cols.append("bleurt_global")
+
+     # ROUGE-L
+     if "ROUGE" in selected_metrics:
+         if only_one_metric:
+             # Per-section ROUGE-L is only computed when ROUGE is the sole metric
+             for tag in tags:
+                 df[f"rougeL_{tag}"] = df.apply(
+                     lambda row: rougeL_score(
+                         extract_sections(row["dsc_generated_clinical_report"])[tag],
+                         extract_sections(row["dsc_reference_free_text"])[tag]
+                     ) if row["has_sections"] else None,
+                     axis=1
+                 )
+                 out_cols.append(f"rougeL_{tag}")
+         df["rougeL_global"] = df.apply(
+             lambda row: rougeL_score(
+                 row["dsc_generated_clinical_report"],
+                 row["dsc_reference_free_text"]
+             ),
+             axis=1
+         )
+         out_cols.append("rougeL_global")
+
+     # BERTScore
+     if "BERTSCORE" in selected_metrics and bert_models:
+         per_section_bertscore = only_bertscore_alone and len(bert_models) == 1
+         bert_df = compute_batch_bertscore(df, bert_models, per_section=per_section_bertscore)
+         for col in bert_df.columns:
+             df[col] = bert_df[col]
+             out_cols.append(col)
+
+     # Clip BLEU scores into [0, 1]
+     for c in df.columns:
+         if c.startswith("bleu_"):
+             df[c] = df[c].clip(0.0, 1.0)
+
+     return df[out_cols]
+
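A minimal sketch of calling the batch orchestrator directly with a toy DataFrame that already uses the canonical column names (the CSV tab further below performs this renaming for you; the texts are illustrative):

    import pandas as pd
    from metrics import compute_all_metrics_batch

    toy = pd.DataFrame({
        "code_audio_transcription": [1, 2],
        "dsc_reference_free_text": ["o paciente nega febre", "dor abdominal há dois dias"],
        "dsc_generated_clinical_report": ["paciente nega febre", "dor abdominal há 2 dias"],
    })
    scores = compute_all_metrics_batch(toy, selected_metrics=["BLEU", "ROUGE"])
    print(scores.columns.tolist())  # ['code_audio_transcription', 'bleu_global', 'rougeL_global']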
metrics/rouge.py ADDED
@@ -0,0 +1,38 @@
+ # metrics/rouge.py
+ """
+ ROUGE metric wrappers using HuggingFace evaluate.
+ """
+ from evaluate import load
+
+ _hf_rouge = None
+
+
+ def get_hf_rouge():
+     """Lazily load and cache the ROUGE metric."""
+     global _hf_rouge
+     if _hf_rouge is None:
+         _hf_rouge = load("rouge")
+     return _hf_rouge
+
+
+ def compute_rouge_single(reference: str, prediction: str) -> str:
+     """
+     Compute and format the ROUGE-L score for a single pair.
+     """
+     if not reference or not prediction:
+         return "Please provide both texts."
+     rouge = get_hf_rouge()
+     res = rouge.compute(predictions=[prediction], references=[reference], rouge_types=["rougeL"])
+     score = res["rougeL"]
+     if isinstance(score, (list, tuple)):
+         score = score[0]
+     return f"ROUGE-L Score: {score:.4f}"
+
+
+ def rougeL_score(pred: str, ref: str) -> float:
+     """
+     Raw ROUGE-L score (0..1) for a text pair.
+     """
+     rouge = get_hf_rouge()
+     res = rouge.compute(predictions=[pred], references=[ref], rouge_types=["rougeL"])
+     s = res["rougeL"]
+     return s[0] if isinstance(s, (list, tuple)) else s
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ gradio>=5.40.0
+ pandas>=2.0.0
+ evaluate>=0.4.5
+ transformers>=4.54.1
+ tokenizers>=0.21.4
+ bert-score>=0.3.13
+ bleurt@git+https://github.com/google-research/bleurt.git@cebe7e6f996b40910cfaa520a63db47807e3bf5c
+ sacrebleu>=2.5.1
+ rouge_score>=0.1.2
+ chardet
ui/__init__.py ADDED
@@ -0,0 +1,8 @@
+ # ui/__init__.py
+ """
+ UI package: exports the tab builders.
+ """
+ from .manual_tab import build_manual_tab
+ from .csv_tab import build_csv_tab
+
+ __all__ = ["build_manual_tab", "build_csv_tab"]
ui/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (363 Bytes).
ui/__pycache__/common.cpython-312.pyc ADDED
Binary file (841 Bytes).
ui/__pycache__/csv_tab.cpython-312.pyc ADDED
Binary file (13.7 kB).
ui/__pycache__/manual_tab.cpython-312.pyc ADDED
Binary file (3.74 kB).
ui/__pycache__/widgets.cpython-312.pyc ADDED
Binary file (1.1 kB).
ui/common.py ADDED
@@ -0,0 +1,15 @@
+ import gradio as gr
+
+
+ def toggle_manual_visibility(selected_metrics):
+     """
+     Returns visibility updates for the manual-tab outputs:
+     (BLEU_out, BLEURT_out, ROUGE_out, BERT_out, BERT_model_selector)
+     """
+     return (
+         gr.update(visible="BLEU" in selected_metrics),
+         gr.update(visible="BLEURT" in selected_metrics),
+         gr.update(visible="ROUGE" in selected_metrics),
+         gr.update(visible="BERTSCORE" in selected_metrics),
+         gr.update(visible="BERTSCORE" in selected_metrics),
+     )
ui/csv_tab.py ADDED
@@ -0,0 +1,238 @@
+ # ui/csv_tab.py
+
+ """
+ Builds the CSV-upload tab (batch metrics).
+ """
+ import gradio as gr
+ import pandas as pd
+
+ from metrics import compute_all_metrics_batch, BERT_FRIENDLY_TO_MODEL
+ from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup
+ from utils.file_utils import smart_read_csv
+ from utils.colors_utils import df_to_colored_html, get_metric_color
+ from utils.tokenizer_refgen import generate_diff_html
+
+
+ # ------------------- Summary HTML builder -------------------
+ def build_summary_html(result_df: pd.DataFrame, selected_metrics: list, bert_models: list | None = None) -> str:
+     def safe_stats(col):
+         if col not in result_df.columns:
+             return None
+         series = result_df[col].dropna().astype(float)
+         if series.empty:
+             return None
+         avg, mn, mx = series.mean(), series.min(), series.max()
+
+         def audio_id_for(v):
+             subset = result_df[result_df[col].astype(float) == v]
+             if not subset.empty and "code_audio_transcription" in subset.columns:
+                 aid = subset.iloc[0]["code_audio_transcription"]
+                 try:
+                     return int(aid)
+                 except (TypeError, ValueError):
+                     return aid
+             return ""
+
+         return {"avg": avg, "min": mn, "min_id": audio_id_for(mn), "max": mx, "max_id": audio_id_for(mx)}
+
+     rows = []
+     # Core metrics
+     core = [m for m in selected_metrics if m in {"BLEU", "BLEURT", "ROUGE"}]
+     if len(core) == 1:
+         m = core[0]
+         prefix = {"BLEU": "bleu_", "BLEURT": "bleurt_", "ROUGE": "rougeL_"}[m]
+         for tag in ('S', 'O', 'A', 'P'):
+             s = safe_stats(f"{prefix}{tag}")
+             if s:
+                 rows.append((f"{prefix}{tag}", s))
+         g = safe_stats(f"{prefix}global")
+         if g:
+             rows.append((f"{prefix}global", g))
+     else:
+         for m, pref in [("BLEU", "bleu_global"), ("BLEURT", "bleurt_global"), ("ROUGE", "rougeL_global")]:
+             if m in selected_metrics:
+                 s = safe_stats(pref)
+                 if s:
+                     rows.append((pref, s))
+
+     # BERTScore
+     if "BERTSCORE" in selected_metrics and bert_models:
+         only_bs = selected_metrics == ["BERTSCORE"]
+         single = len(bert_models) == 1
+         per_sec = only_bs and single
+         for friendly in bert_models:
+             mid = BERT_FRIENDLY_TO_MODEL[friendly]
+             short = mid.split("/")[-1].replace("-", "_")
+             if per_sec:
+                 for tag in ('S', 'O', 'A', 'P'):
+                     s = safe_stats(f"bertscore_{tag}_f1")
+                     if s:
+                         rows.append((f"bertscore_{tag}_f1", s))
+                 sg = safe_stats("bertscore_global_f1")
+                 if sg:
+                     rows.append(("bertscore_global_f1", sg))
+             else:
+                 s = safe_stats(f"bertscore_{short}_f1")
+                 if s:
+                     rows.append((f"bertscore_{short}_f1", s))
+
+     if not rows:
+         return "<div style='padding:8px;background:#1f1f1f;color:#eee;border-radius:6px;'>No summary available.</div>"
+
+     html = """
+     <div style="margin-bottom:12px;overflow-x:auto;">
+       <div style="font-weight:600;margin-bottom:4px;color:#f5f5f5;font-size:16px;">Summary Statistics</div>
+       <table style="border-collapse:collapse;width:100%;font-family:system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,sans-serif;border-radius:8px;overflow:hidden;min-width:500px;">
+         <thead>
+           <tr>
+             <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:left;font-weight:600;">Metric</th>
+             <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Avg</th>
+             <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Min (ID)</th>
+             <th style="padding:8px 12px;background:#2d3748;color:#fff;text-align:center;font-weight:600;">Max (ID)</th>
+           </tr>
+         </thead><tbody>
+     """
+     for col, stat in rows:
+         if col.startswith("bertscore_"):
+             name = "BERTScore Global" if col == "bertscore_global_f1" else f"BERTScore {col.split('_')[1].upper()}"
+         else:
+             name = col.replace("_", " ").upper()
+         avg = f"{stat['avg']:.4f}"
+         mn = f"{stat['min']:.4f} ({stat['min_id']})" if stat['min_id'] else f"{stat['min']:.4f}"
+         mx = f"{stat['max']:.4f} ({stat['max_id']})" if stat['max_id'] else f"{stat['max']:.4f}"
+         if col.startswith("bleu_"):
+             ca, cm, cx = get_metric_color(stat['avg'], "BLEU"), get_metric_color(stat['min'], "BLEU"), get_metric_color(stat['max'], "BLEU")
+         elif col.startswith("bleurt_"):
+             ca, cm, cx = get_metric_color(stat['avg'], "BLEURT"), get_metric_color(stat['min'], "BLEURT"), get_metric_color(stat['max'], "BLEURT")
+         elif col.startswith("rougeL_"):
+             ca, cm, cx = get_metric_color(stat['avg'], "ROUGE"), get_metric_color(stat['min'], "ROUGE"), get_metric_color(stat['max'], "ROUGE")
+         else:
+             ca, cm, cx = get_metric_color(stat['avg'], "BERTSCORE"), get_metric_color(stat['min'], "BERTSCORE"), get_metric_color(stat['max'], "BERTSCORE")
+         html += f"""
+           <tr style="background:#0f1218;">
+             <td style="padding:8px 12px;border:1px solid #2f3240;color:#fff;white-space:nowrap;">{name}</td>
+             <td style="padding:8px 12px;border:1px solid #2f3240;background:{ca};color:#fff;text-align:center;white-space:nowrap;">{avg}</td>
+             <td style="padding:8px 12px;border:1px solid #2f3240;background:{cm};color:#fff;text-align:center;white-space:nowrap;">{mn}</td>
+             <td style="padding:8px 12px;border:1px solid #2f3240;background:{cx};color:#fff;text-align:center;white-space:nowrap;">{mx}</td>
+           </tr>
+         """
+     html += "</tbody></table></div>"
+     return html
+
+
+ def build_csv_tab():
+     with gr.Blocks() as tab:
+         state_df = gr.State()    # raw uploaded DataFrame
+         state_norm = gr.State()  # normalized copy with the canonical column names
+
+         gr.Markdown("# Run an Experiment via CSV Upload")
+         gr.Markdown(
+             "This section lets you upload a CSV of reference/generated text pairs, "
+             "select which columns correspond to the reference, the generated output, and a shared ID, "
+             "choose metrics (including BERTScore variants), and compute scores in bulk."
+         )
+
+         # 1. Upload CSV
+         gr.Markdown("## Upload CSV")
+         gr.Markdown(
+             "Provide a CSV file containing your data. It should include columns for the reference text, "
+             "the generated text, and an identifier (e.g., audio ID)."
+         )
+         with gr.Row():
+             file_input = gr.File(label="Upload CSV", file_types=[".csv"])
+             status = gr.Text(label="Status")
+
+         # 2. Map Columns
+         gr.Markdown("## Map Columns")
+         gr.Markdown(
+             "Select which columns in your CSV correspond to the reference text, generated text, and audio/example ID."
+         )
+         with gr.Row(visible=False) as mapping:
+             ref_col = gr.Dropdown(label="Reference Column", choices=[])
+             gen_col = gr.Dropdown(label="Generated Column", choices=[])
+             id_col = gr.Dropdown(label="Audio ID Column", choices=[])
+
+         # 3. Select Metrics
+         gr.Markdown("## Select Metrics")
+         metric_selector = MetricCheckboxGroup()
+         bert_model_selector = BertCheckboxGroup()
+
+         # 4. Compute
+         gr.Markdown("## Run Evaluation")
+         run_btn = gr.Button("Run Evaluation")
+         output_status = gr.Text()
+         summary_output = gr.HTML()
+         table_output = gr.HTML()
+
+         # 5. Inspect an Example
+         gr.Markdown("## Inspect an Example")
+         gr.Markdown(
+             "Pick an example by its ID to view the reference vs. generated text with token-level differences highlighted."
+         )
+         with gr.Accordion("🔍 Show reference & generated text", open=False):
+             pick_id = gr.Dropdown(label="Pick an Audio ID", choices=[])
+             ref_disp = gr.Textbox(label="Reference Text", lines=6, interactive=False)
+             gen_disp = gr.Textbox(label="Generated Text", lines=6, interactive=False)
+             diff_disp = gr.HTML()
+
+         # --- Handlers ---
+
+         def handle_upload(f):
+             if not f:
+                 return None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]), gr.update(visible=False), ""
+             df = smart_read_csv(f.name)
+             cols = df.columns.tolist()
+             return (
+                 df,
+                 gr.update(choices=cols, value=None),
+                 gr.update(choices=cols, value=None),
+                 gr.update(choices=cols, value=None),
+                 gr.update(visible=True),
+                 "Upload OK",
+             )
+
+         def run_batch(df, r, g, i, mets, berts):
+             if df is None:
+                 return "No data.", "", "", gr.update(choices=[]), None
+             if not all([r, g, i]):
+                 return "Please map the reference, generated, and ID columns first.", "", "", gr.update(choices=[]), None
+             sub = df[[i, r, g]].rename(
+                 columns={i: "code_audio_transcription", r: "dsc_reference_free_text", g: "dsc_generated_clinical_report"}
+             )
+             result = compute_all_metrics_batch(sub, mets, berts if "BERTSCORE" in (mets or []) else None)
+
+             # Cast IDs to Python int to avoid np.int64 dropdown issues; fall back to strings
+             raw_ids = result["code_audio_transcription"].dropna().unique()
+             try:
+                 ids = sorted(int(x) for x in raw_ids)
+             except (TypeError, ValueError):
+                 ids = sorted(str(x) for x in raw_ids)
+
+             summary = build_summary_html(result, mets, berts if "BERTSCORE" in (mets or []) else None)
+             table = df_to_colored_html(result)
+             return "Metrics computed successfully.", summary, table, gr.update(choices=ids), sub
+
+         def show_example(sub, audio_id):
+             # `sub` is the normalized DataFrame stored by run_batch, so the canonical columns exist
+             if sub is None or audio_id is None:
+                 return "", "", ""
+             row = sub[sub["code_audio_transcription"] == audio_id]
+             if row.empty:
+                 try:
+                     row = sub[sub["code_audio_transcription"] == float(audio_id)]
+                 except (TypeError, ValueError):
+                     return "", "", ""
+             if row.empty:
+                 return "", "", ""
+             row = row.iloc[0]
+             return (
+                 row["dsc_reference_free_text"],
+                 row["dsc_generated_clinical_report"],
+                 generate_diff_html(row["dsc_reference_free_text"], row["dsc_generated_clinical_report"])
+             )
+
+         # --- Wiring ---
+
+         file_input.change(
+             fn=handle_upload,
+             inputs=[file_input],
+             outputs=[state_df, ref_col, gen_col, id_col, mapping, status],
+         )
+         metric_selector.change(
+             lambda ms: gr.update(visible="BERTSCORE" in ms),
+             inputs=[metric_selector],
+             outputs=[bert_model_selector],
+         )
+         run_btn.click(
+             fn=run_batch,
+             inputs=[state_df, ref_col, gen_col, id_col, metric_selector, bert_model_selector],
+             outputs=[output_status, summary_output, table_output, pick_id, state_norm],
+         )
+         pick_id.change(
+             fn=show_example,
+             inputs=[state_norm, pick_id],
+             outputs=[ref_disp, gen_disp, diff_disp],
+         )
+
+     return tab
+
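For reference, a toy input file for this tab could be produced as below; the column names here are hypothetical, since the three dropdowns map whatever names you use onto the canonical ones used internally:

    import pandas as pd

    pd.DataFrame({
        "audio_id": [1, 2],                                                    # mapped via "Audio ID Column"
        "reference": ["o paciente nega febre", "dor abdominal há dois dias"],  # "Reference Column"
        "generated": ["paciente nega febre", "dor abdominal há 2 dias"],       # "Generated Column"
    }).to_csv("sample_eval.csv", index=False)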
ui/manual_tab.py ADDED
@@ -0,0 +1,68 @@
+ # ui/manual_tab.py
+ """
+ Builds the manual-evaluation tab (single-pair metrics).
+ """
+ import gradio as gr
+
+ from metrics import (
+     compute_bleu_single,
+     compute_bleurt_single,
+     compute_rouge_single,
+     compute_bertscore_single,
+     BERT_FRIENDLY_TO_MODEL,
+ )
+ from ui.common import toggle_manual_visibility
+ from ui.widgets import MetricCheckboxGroup, BertCheckboxGroup
+
+
+ def build_manual_tab():
+     with gr.Blocks() as tab:
+         gr.Markdown("## Manual Evaluation")
+         gr.Markdown("Compute selected metrics (BLEU, BLEURT, ROUGE, BERTScore) for a single pair of texts.")
+
+         with gr.Row():
+             reference_input = gr.Textbox(label="Reference Text", lines=3)
+             generated_input = gr.Textbox(label="Generated Text", lines=3)
+
+         metric_selector = MetricCheckboxGroup()
+         bert_model_selector = BertCheckboxGroup()
+
+         with gr.Row():
+             run_btn = gr.Button("Run Evaluation")
+             clear_btn = gr.Button("Clear")
+
+         bleu_out = gr.Textbox(label="BLEU Score", interactive=False)
+         bleurt_out = gr.Textbox(label="BLEURT Score", interactive=False)
+         rouge_out = gr.Textbox(label="ROUGE Score", interactive=False)
+         bert_out = gr.Textbox(label="BERTScore Results", interactive=False)
+
+         def compute_manual(reference, generated, metrics, berts):
+             bleu = compute_bleu_single(reference, generated) if "BLEU" in metrics else ""
+             bleurt = compute_bleurt_single(reference, generated) if "BLEURT" in metrics else ""
+             rouge = compute_rouge_single(reference, generated) if "ROUGE" in metrics else ""
+             bertscore = ""
+             if "BERTSCORE" in metrics and berts:
+                 parts = []
+                 for f in berts:
+                     mid = BERT_FRIENDLY_TO_MODEL[f]
+                     score = compute_bertscore_single(reference, generated, mid, per_section=False)
+                     parts.append(f"{f} Global F1: {score:.4f}" if score is not None else f"{f}: error")
+                 bertscore = "\n".join(parts)
+             return bleu, bleurt, rouge, bertscore
+
+         run_btn.click(
+             fn=compute_manual,
+             inputs=[reference_input, generated_input, metric_selector, bert_model_selector],
+             outputs=[bleu_out, bleurt_out, rouge_out, bert_out],
+         )
+         metric_selector.change(
+             fn=toggle_manual_visibility,
+             inputs=[metric_selector],
+             outputs=[bleu_out, bleurt_out, rouge_out, bert_out, bert_model_selector],
+         )
+         clear_btn.click(
+             # Reset both inputs, all four outputs, and the metric/model selections
+             fn=lambda: ("", "", "", "", "", "", ["BLEU"], [list(BERT_FRIENDLY_TO_MODEL.keys())[0]]),
+             inputs=[],
+             outputs=[reference_input, generated_input, bleu_out, bleurt_out, rouge_out, bert_out, metric_selector, bert_model_selector],
+         )
+
+     return tab
ui/widgets.py ADDED
@@ -0,0 +1,25 @@
+ # ui/widgets.py
+ """
+ Factory functions for common Gradio widgets.
+ """
+ import gradio as gr
+
+ from metrics import BERT_FRIENDLY_TO_MODEL
+ from config import METRIC_CHOICES, DEFAULT_METRICS, DEFAULT_BERTS
+
+
+ def MetricCheckboxGroup(label="Which metrics to compute", default=None, visible=True):
+     return gr.CheckboxGroup(
+         choices=METRIC_CHOICES,
+         label=label,
+         value=default or DEFAULT_METRICS,
+         visible=visible,
+     )
+
+
+ def BertCheckboxGroup(label="Which BERT models (for BERTScore)", default=None, visible=False):
+     return gr.CheckboxGroup(
+         choices=list(BERT_FRIENDLY_TO_MODEL.keys()),
+         label=label,
+         value=default or DEFAULT_BERTS,
+         visible=visible,
+     )