# Spaces:
#
# gjoliveira
# /
#
# data-ai-llm-eval-app
#
# Sleeping
#
# File size: 3,740 Bytes
# file_utils.py
# file_utils.py

import pandas as pd
import chardet
import re
import os
from sacrebleu.metrics import BLEU

# Module-wide BLEU scorer configured with 'intl' tokenization, lowercasing and 'exp' smoothing
_bleu_scorer = BLEU(tokenize='intl', lowercase=True, smooth_method='exp')

def smart_read_csv(file_obj):
    """
    Read a CSV while auto-detecting its encoding and column separator.

    The raw bytes are read exactly once; any file this function opens itself
    is closed before parsing starts. Each separator attempt then parses a
    fresh in-memory buffer, so the input stream does not need to be seekable.

    Args:
        file_obj: One of
            * a string path to an existing file,
            * an object whose ``name`` attribute is a string path that can be
              opened (if opening fails, the object itself is read instead),
            * any object exposing ``read()`` that returns bytes.

    Returns:
        pandas.DataFrame: the first parse that yields at least two columns,
        trying the separators ``','``, ``';'`` and tab in that order.

    Raises:
        ValueError: if none of the separators produces a frame with two or
            more columns. The last parser error, if any, is chained as the
            cause.
    """
    # Local import keeps this block self-contained; ``io`` is standard library.
    import io

    raw = None
    if isinstance(file_obj, str) and os.path.exists(file_obj):
        with open(file_obj, 'rb') as handle:
            raw = handle.read()
    elif hasattr(file_obj, 'name') and isinstance(file_obj.name, str):
        try:
            with open(file_obj.name, 'rb') as handle:
                raw = handle.read()
        except (OSError, ValueError):
            # The name is not a usable path; fall back to reading the object.
            raw = None
    if raw is None:
        raw = file_obj.read()

    enc = chardet.detect(raw).get('encoding', 'utf-8') or 'utf-8'

    last_error = None
    for sep in (',', ';', '\t'):
        try:
            df = pd.read_csv(io.BytesIO(raw), encoding=enc, sep=sep)
        except Exception as exc:
            # Deliberate best-effort: a wrong separator or encoding guess may
            # fail in several ways, so remember the error and try the next.
            last_error = exc
            continue
        if df.shape[1] >= 2:
            return df

    raise ValueError(
        f"Não foi possível ler o CSV com encoding {enc} e separadores comuns."
    ) from last_error

def normalize_sections(txt: str) -> str:
    """
    Rewrite section headers into the canonical ``## S:``, ``## O:``,
    ``## A:`` and ``## P:`` forms.

    The input is coerced with ``str()`` and a fixed sequence of
    case-insensitive regex substitutions is applied in order.
    """
    section_tags = ['S', 'O', 'A', 'P']

    # Ordered (pattern, replacement) pairs; order matters because later rules
    # operate on the output of earlier ones.
    rules = [
        # Symptoms header variants
        (r'(?m)^\s*S\s*C\s*telemedicina', '## S:'),
        (r'(?m)^(?:##\s*)?S\s*[:]?$', '## S:'),
        # "O" and "A" glued together at the start of a line
        (r'(?m)^\s*O\s+A\s+', '## O:\n## A: '),
    ]
    # Standalone O / A / P header lines
    rules.extend(
        (fr'(?m)^(?:##\s*)?{tag}\s*[:]?$', f'## {tag}:')
        for tag in section_tags[1:]
    )
    # Uniform spacing: "##X:" -> "## X:"
    rules.extend(
        (fr'##\s*{tag}\s*:', f'## {tag}:')
        for tag in section_tags
    )

    normalized = str(txt)
    for pattern, replacement in rules:
        normalized = re.sub(pattern, replacement, normalized, flags=re.IGNORECASE)
    return normalized

def extract_sections(txt: str) -> dict:
    """
    Extract the content following each ``## S:``, ``## O:``, ``## A:`` and
    ``## P:`` header.

    The text is normalized, flattened to a single line and whitespace-collapsed
    first. Returns a dict keyed by 'S', 'O', 'A', 'P'; a section that is not
    found maps to an empty string.
    """
    single_line = normalize_sections(txt).replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', single_line).strip()

    def _section_body(tag):
        # Capture lazily up to the next section header or the end of the text.
        found = re.search(
            fr'## {tag}:(.*?)(?=## [SOAP]:|$)', collapsed, flags=re.IGNORECASE
        )
        if found is None:
            return ''
        return found.group(1).strip()

    return {tag: _section_body(tag) for tag in ['S', 'O', 'A', 'P']}

def normalize_and_flatten(txt: str) -> str:
    """
    Prepare a full text for whole-document scoring.

    Normalizes section headers, replaces newlines with spaces, collapses
    whitespace runs into single spaces, trims the ends and lowercases.
    """
    single_line = normalize_sections(txt).replace('\n', ' ')
    return re.sub(r'\s+', ' ', single_line).strip().lower()

def has_sections(txt: str) -> bool:
    """
    Return True when the normalized text contains at least one of the
    headers ``## S:``, ``## O:``, ``## A:`` or ``## P:``.
    """
    normalized = normalize_sections(txt)
    for tag in ['S', 'O', 'A', 'P']:
        if f"## {tag}:" in normalized:
            return True
    return False

def section_bleu(gen_txt: str, ref_txt: str) -> float:
    """
    Score one generated section against its reference with the module-level
    BLEU scorer.

    Two blank inputs count as a perfect match (100.0); exactly one blank input
    scores 0.0. Otherwise the scorer's sentence-level ``score`` is returned.
    """
    gen_blank = not gen_txt.strip()
    ref_blank = not ref_txt.strip()

    if gen_blank and ref_blank:
        return 100.0
    if gen_blank != ref_blank:
        return 0.0

    result = _bleu_scorer.sentence_score(gen_txt, [ref_txt])
    return result.score

def full_bleu(gen_raw: str, ref_raw: str) -> float:
    """
    Score a complete generated text against a complete reference text.

    Both inputs are passed through ``normalize_and_flatten`` first. Two empty
    results count as a perfect match (100.0); exactly one empty result scores
    0.0. Otherwise the module-level scorer's sentence-level ``score`` is
    returned.
    """
    hypothesis = normalize_and_flatten(gen_raw)
    reference = normalize_and_flatten(ref_raw)
    hypothesis_empty = not hypothesis
    reference_empty = not reference

    if hypothesis_empty and reference_empty:
        return 100.0
    if hypothesis_empty != reference_empty:
        return 0.0

    result = _bleu_scorer.sentence_score(hypothesis, [reference])
    return result.score