import unicodedata
import re
import difflib

__all__ = ["strip_accents", "tokenize_for_diff", "generate_diff_html"]

# -------------------------------------------------------------------
# Diff helpers (can be moved to another file if desired)
# -------------------------------------------------------------------


def strip_accents(text: str) -> str:
    """
    Remove diacritics/accents for comparison while preserving original
    for display.

    Uses NFKD decomposition so each accented character splits into a base
    character plus combining marks, then drops the combining marks.
    """
    nfkd = unicodedata.normalize("NFKD", text)
    return "".join(c for c in nfkd if not unicodedata.combining(c))


def tokenize_for_diff(text: str) -> tuple[list[str], list[str]]:
    """
    Return (original word tokens, normalized tokens) for diff matching.

    Normalized tokens are lowercased with accents stripped so that the
    diff is case- and accent-insensitive.  SOAP section headers such as
    '## S:' are removed before tokenizing so they never appear as tokens.
    """
    # Strip section markers like "## S:", "## o:", etc. (case-insensitive).
    cleaned = re.sub(r"##\s*[SOAP]:", "", text, flags=re.IGNORECASE)
    words = re.findall(r"\b\w+\b", cleaned, flags=re.UNICODE)
    orig_tokens = words
    norm_tokens = [strip_accents(w).lower() for w in words]
    return orig_tokens, norm_tokens


def generate_diff_html(ref_txt: str, gen_txt: str) -> str:
    """
    Side-by-side token-level diff, case- and accent-insensitive.
    Reference deletions in dark red, inserts in gold.

    Returns a single HTML string with two flex columns: the reference
    text on the left (deleted tokens highlighted) and the generated text
    on the right (inserted tokens highlighted).  Equal tokens are shown
    unstyled, using their original (non-normalized) spelling.
    """
    ref_orig, ref_norm = tokenize_for_diff(ref_txt)
    gen_orig, gen_norm = tokenize_for_diff(gen_txt)

    # Match on normalized tokens; render the original tokens.
    matcher = difflib.SequenceMatcher(a=ref_norm, b=gen_norm)

    ref_html, gen_html = [], []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            ref_html.extend(ref_orig[i1:i2])
            gen_html.extend(gen_orig[j1:j2])
        # NOTE(review): the <span> markup below was reconstructed — the
        # original tags were stripped when this file was mangled.  Colors
        # follow the documented contract (deletions dark red, inserts
        # gold); confirm exact styling against the rendering template.
        if tag in ("delete", "replace"):
            for tok in ref_orig[i1:i2]:
                ref_html.append(
                    f'<span style="color:darkred; font-weight:bold;">{tok}</span>'
                )
        if tag in ("insert", "replace"):
            for tok in gen_orig[j1:j2]:
                gen_html.append(
                    f'<span style="color:gold; font-weight:bold;">{tok}</span>'
                )

    ref_col = " ".join(ref_html)
    gen_col = " ".join(gen_html)
    # NOTE(review): the original return statement was truncated mid-string;
    # reconstructed here as a two-column flex layout matching the
    # "side-by-side" contract in the docstring.
    return (
        '<div style="display:flex; gap:1rem;">'
        f'<div style="flex:1;">{ref_col}</div>'
        f'<div style="flex:1;">{gen_col}</div>'
        '</div>'
    )