Spaces:

gjoliveira
/

data-ai-llm-eval-app

Sleeping

data-ai-llm-eval-app / utils /file_utils.py

Guilherme

Add utils modules to package for Space

6352550 about 1 month ago

3.74 kB

	# file_utils.py - CSV loading, S/O/A/P section normalization and BLEU scoring helpers

	import pandas as pd
	import chardet
	import re
	import os
	from sacrebleu.metrics import BLEU

	# Module-wide BLEU scorer configured with 'intl' tokenization, lowercasing and
	# 'exp' smoothing; section_bleu and full_bleu both score through this instance.
	_bleu_scorer = BLEU(tokenize='intl', lowercase=True, smooth_method='exp')

	def smart_read_csv(file_obj):
	    """
	    Read a CSV into a DataFrame, auto-detecting its encoding and separator.

	    The content is read once into memory and every separator attempt parses
	    a fresh in-memory buffer, so the caller's stream does not need to be
	    seekable and a file opened here is always closed.

	    Args:
	        file_obj: One of: a path string to an existing file; an object whose
	            ``name`` attribute is a path string (that path is opened, falling
	            back to reading the object itself if opening fails); or any
	            object with a ``read()`` method returning bytes or text.

	    Returns:
	        The first DataFrame with at least two columns, trying ',', ';' and
	        tab as separators in that order.

	    Raises:
	        ValueError: If no separator yields a frame with two or more columns.
	    """
	    import io  # function-scope import: nothing else in this module needs it

	    # Resolve the argument to a readable handle, remembering whether this
	    # function opened it and is therefore responsible for closing it.
	    owns_handle = False
	    if isinstance(file_obj, str) and os.path.exists(file_obj):
	        handle = open(file_obj, 'rb')
	        owns_handle = True
	    elif hasattr(file_obj, 'name') and isinstance(file_obj.name, str):
	        try:
	            handle = open(file_obj.name, 'rb')
	            owns_handle = True
	        except (OSError, ValueError):
	            # Best effort: the name is not an openable path, so read the
	            # object itself instead.
	            handle = file_obj
	    else:
	        handle = file_obj

	    try:
	        raw = handle.read()
	    finally:
	        if owns_handle:
	            handle.close()

	    if isinstance(raw, str):
	        # Text-mode stream: the data is already decoded, so there is nothing
	        # to detect. Re-encode with a known codec for the parser.
	        raw, enc = raw.encode('utf-8'), 'utf-8'
	    else:
	        enc = chardet.detect(raw).get('encoding', 'utf-8') or 'utf-8'

	    for sep in (',', ';', '\t'):
	        try:
	            df = pd.read_csv(io.BytesIO(raw), encoding=enc, sep=sep)
	        except Exception:
	            # Deliberately broad: a wrong separator or encoding guess can
	            # fail in many ways, and each failure just means "try the next".
	            continue
	        # A single column means the separator guess was wrong.
	        if df.shape[1] >= 2:
	            return df

	    raise ValueError(f"Não foi possível ler o CSV com encoding {enc} e separadores comuns.")

	def normalize_sections(txt: str) -> str:
	    """
	    Rewrite section headers into the canonical ``## S:``, ``## O:``,
	    ``## A:`` and ``## P:`` form.

	    Matching is case-insensitive and whitespace around the tag letter is
	    optional, so ``##S:``, ``## s :``, ``S:`` or a line holding only ``S``
	    all become ``## S:``.

	    Args:
	        txt: Text to normalize; it is coerced with ``str()`` first.

	    Returns:
	        The text with recognized section headers rewritten.
	    """
	    txt = str(txt)
	    # Symptoms: an "S C telemedicina" heading at the start of a line, or a
	    # line consisting only of the tag (with optional "##" and ":").
	    txt = re.sub(r'(?m)^\s*S\s*C\s*telemedicina', '## S:', txt, flags=re.IGNORECASE)
	    txt = re.sub(r'(?m)^(?:##\s*)?S\s*[:]?$', '## S:', txt, flags=re.IGNORECASE)
	    # "O" and "A" run together at the start of a line: split into two headers.
	    txt = re.sub(r'(?m)^\s*O\s+A\s+', '## O:\n## A: ', txt, flags=re.IGNORECASE)
	    # Remaining tags when they stand alone on a line.
	    for tag in ('O', 'A', 'P'):
	        txt = re.sub(fr'(?m)^(?:##\s*)?{tag}\s*[:]?$', f'## {tag}:', txt, flags=re.IGNORECASE)
	    # Unify spacing and case of inline headers: "##X:" / "## x :" -> "## X:".
	    # The whitespace must be optional here, otherwise "##X:" is never matched.
	    for tag in ('S', 'O', 'A', 'P'):
	        txt = re.sub(fr'##\s*{tag}\s*:', f'## {tag}:', txt, flags=re.IGNORECASE)
	    return txt

	def extract_sections(txt: str) -> dict:
	    """
	    Extract the content of each section marked by ``## S:``, ``## O:``,
	    ``## A:`` and ``## P:``.

	    The text is normalized with ``normalize_sections`` and flattened to a
	    single line with collapsed whitespace before matching. For each tag, the
	    first occurrence is used and its content runs until the next section
	    tag or the end of the text.

	    Args:
	        txt: Text that may contain section headers.

	    Returns:
	        A dict with keys 'S', 'O', 'A' and 'P'; a section that is not found
	        maps to an empty string.
	    """
	    flat = normalize_sections(txt).replace('\n', ' ')
	    flat = re.sub(r'\s+', ' ', flat).strip()
	    sections = {}
	    for tag in ('S', 'O', 'A', 'P'):
	        # The lookahead needs an unescaped "|" (alternation): a section ends
	        # at the next tag OR at end of text. An escaped "\|" would demand a
	        # literal pipe character and leave every section empty.
	        pattern = fr'## {tag}:(.*?)(?=## [SOAP]:|$)'
	        match = re.search(pattern, flat, flags=re.IGNORECASE)
	        sections[tag] = match.group(1).strip() if match else ''
	    return sections

	def normalize_and_flatten(txt: str) -> str:
	    """
	    Prepare a whole text for global scoring: normalize section headers,
	    collapse it to one line with single spaces, trim it and lowercase it.
	    """
	    normalized = normalize_sections(txt)
	    single_spaced = re.sub(r'\s+', ' ', normalized.replace('\n', ' '))
	    return single_spaced.strip().lower()

	def has_sections(txt: str) -> bool:
	    """
	    Return True when the normalized text contains at least one of the
	    tags ``## S:``, ``## O:``, ``## A:`` or ``## P:``.
	    """
	    normalized = normalize_sections(txt)
	    for tag in ('S', 'O', 'A', 'P'):
	        if f"## {tag}:" in normalized:
	            return True
	    return False

	def section_bleu(gen_txt: str, ref_txt: str) -> float:
	    """
	    Compute BLEU for one pair of section strings, on a 0-100 scale.

	    Two blank strings count as a perfect match (100.0); exactly one blank
	    string counts as a complete miss (0.0). Otherwise the shared scorer's
	    sentence-level score is returned.
	    """
	    gen_blank = not gen_txt.strip()
	    ref_blank = not ref_txt.strip()
	    if gen_blank and ref_blank:
	        return 100.0
	    if gen_blank != ref_blank:
	        return 0.0
	    return _bleu_scorer.sentence_score(gen_txt, [ref_txt]).score

	def full_bleu(gen_raw: str, ref_raw: str) -> float:
	    """
	    Compute BLEU over two complete texts, on a 0-100 scale.

	    Both texts go through ``normalize_and_flatten`` first. Two empty results
	    score 100.0, exactly one empty result scores 0.0, and anything else is
	    scored at sentence level by the shared scorer.
	    """
	    hypothesis = normalize_and_flatten(gen_raw)
	    reference = normalize_and_flatten(ref_raw)
	    if not (hypothesis or reference):
	        return 100.0
	    # Both-empty was handled above, so this means exactly one side is empty.
	    if not (hypothesis and reference):
	        return 0.0
	    return _bleu_scorer.sentence_score(hypothesis, [reference]).score