MarCognity-AI / src /utils /support_functions.py

Create support_functions.py

ffcb401 verified 3 months ago

4.97 kB

	# © 2025 Elena Marziali — Code released under Apache 2.0 license.
	# See LICENSE in the repository for details.
	# Removal of this copyright is prohibited.

	# Evaluate the structure of the AI response from the LLM
	def validate_ai_structure(response, expected_fields=("title", "abstract", "url")):
	    """Keep only the well-formed entries of a structured LLM response.

	    Args:
	        response: Parsed model output; anything other than a list is rejected.
	        expected_fields: Keys every entry must contain to be kept.

	    Returns:
	        A new list holding, in their original order, the dict entries that
	        contain all expected keys; an empty list if ``response`` is not a list.
	    """
	    if not isinstance(response, list):
	        return []

	    def has_required_keys(entry):
	        return isinstance(entry, dict) and all(key in entry for key in expected_fields)

	    return [entry for entry in response if has_required_keys(entry)]

	import math

	# Compute semantic score of the response
	def sigmoid(x):
	    """Return the logistic function 1 / (1 + e^-x) of ``x``.

	    The computation branches on the sign of ``x`` so that the argument of
	    ``math.exp`` is never positive: large-magnitude inputs saturate to 0.0
	    or 1.0 instead of raising ``OverflowError``.
	    """
	    if x >= 0:
	        return 1 / (1 + math.exp(-x))
	    # For negative x, exp(-x) can overflow; use the equivalent e^x / (1 + e^x).
	    exp_x = math.exp(x)
	    return exp_x / (1 + exp_x)

	def evaluate_score(model_output):
	    """Map the first raw model score to a value between 0 and 1.

	    Args:
	        model_output: Indexable object whose first element converts to float.

	    Returns:
	        ``sigmoid(score)`` rounded to three decimals, or 0.0 when the score
	        is missing, not numeric, or NaN.
	    """
	    try:
	        raw_score = float(model_output[0])
	        if math.isnan(raw_score):
	            # NaN would otherwise propagate through sigmoid and round.
	            return 0.0
	        return round(sigmoid(raw_score), 3)
	    except Exception:
	        # Best-effort: any malformed output scores 0.0.  Unlike a bare
	        # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
	        return 0.0

	# Extract text from selected file
	def extract_text(file_name, max_chars=5000):
	    """Read the textual content of a .pdf, .docx, .csv or .tsv file.

	    The format is chosen from the text after the last "." of the lowercased
	    file name.

	    Args:
	        file_name: Path of the file to read.
	        max_chars: Upper bound on the number of characters returned.

	    Returns:
	        The first ``max_chars`` characters of the extracted text,
	        "No text extracted." when the text is empty, or a message starting
	        with "Error during text extraction:" when anything fails (including
	        an unsupported format).
	    """
	    suffix = file_name.lower().split(".")[-1]

	    try:
	        if suffix == "pdf":
	            with pdfplumber.open(file_name) as pdf:
	                # Pages without extractable text contribute an empty string.
	                page_texts = [page.extract_text() or "" for page in pdf.pages]
	                content = "\n".join(page_texts).strip()
	        elif suffix == "docx":
	            paragraphs = Document(file_name).paragraphs
	            content = "\n".join([paragraph.text for paragraph in paragraphs]).strip()
	        elif suffix in ["csv", "tsv"]:
	            delimiter = "\t" if suffix != "csv" else ","
	            table = pd.read_csv(file_name, sep=delimiter)
	            content = table.to_string(index=False)
	        else:
	            raise ValueError(f"Unsupported format: .{suffix}")

	        if not content:
	            return "No text extracted."
	        return content[:max_chars]

	    except Exception as e:
	        return f"Error during text extraction: {e}"

	# Safely extract textual content from an AIMessage
	def extract_text_from_ai(obj):
	    """Safely extract the textual content of an AIMessage-like object.

	    Args:
	        obj: Object exposing a ``content`` attribute, or any other value.

	    Returns:
	        The stripped text. Objects without ``content`` fall back to
	        ``str(obj)``; ``None`` content yields ``""``; list/tuple content
	        (assumed to be message content blocks: plain strings, or dicts
	        carrying a string under ``"text"``) is concatenated, skipping
	        blocks without text; any other content is converted with ``str``.
	    """
	    missing = object()
	    content = getattr(obj, "content", missing)
	    if content is missing:
	        # Only stringify the object itself when it has no content at all.
	        return str(obj).strip()
	    if content is None:
	        return ""
	    if isinstance(content, str):
	        return content.strip()
	    if isinstance(content, (list, tuple)):
	        parts = []
	        for block in content:
	            if isinstance(block, str):
	                parts.append(block)
	            elif isinstance(block, dict) and isinstance(block.get("text"), str):
	                parts.append(block["text"])
	        return "".join(parts).strip()
	    return str(content).strip()

	# Extract figure captions from text
	def extract_captions_from_text(text):
	    """Return every figure caption found in ``text``.

	    A caption starts with "Figure", "Fig" or "Fig." (case-insensitive),
	    followed by one whitespace character, a number, an optional separator
	    (":", ".", "-" or "–"), one whitespace character, and then runs to the
	    end of the line.

	    Args:
	        text: Text to scan.

	    Returns:
	        The full caption strings, in order of appearance.
	    """
	    # Non-capturing group with a real alternation: a capturing group makes
	    # ``findall`` return only the "Figure"/"Fig." prefix, and an escaped
	    # pipe matches a literal "|" instead of either keyword.
	    pattern = r"(?:Figure|Fig\.?)\s\d+[:\.\-–]?\s[^\n]+"
	    return re.findall(pattern, text, re.IGNORECASE)

	# Extract images and captions from a file
	def extract_images_with_captions(file_path, output_folder="extracted_figures"):
	    """Save the images embedded in a .pdf or .docx file and caption them.

	    Captions are taken, in order, from ``extract_captions_from_text`` run
	    over the document text; images beyond the captions found get a
	    generated "Figure ..." label.

	    Args:
	        file_path: Path of the document to process.
	        output_folder: Directory receiving the image files; created if
	            missing (even when the extension turns out to be unsupported).

	    Returns:
	        ``(images, captions)``: two parallel lists with the saved file paths
	        and their captions. Both are empty for an unsupported extension or
	        when extraction fails; the outcome is reported with ``print``.
	    """
	    os.makedirs(output_folder, exist_ok=True)
	    extension = file_path.lower().split(".")[-1]

	    try:
	        if extension == "pdf":
	            images, captions = _extract_pdf_images(file_path, output_folder)
	        elif extension == "docx":
	            images, captions = _extract_docx_images(file_path, output_folder)
	        else:
	            print(f"Unsupported extension: .{extension}")
	            images, captions = [], []

	        print(f"{len(images)} image(s) extracted.")
	        return images, captions

	    except Exception as e:
	        print(f"Error extracting images: {e}")
	        return [], []


	def _pick_caption(found_captions, index, fallback):
	    """Return the caption at ``index`` if one was found, else ``fallback``."""
	    return found_captions[index] if index < len(found_captions) else fallback


	def _extract_pdf_images(file_path, output_folder):
	    """Save every image referenced by the pages of a PDF, with captions."""
	    images, captions = [], []
	    # The context manager closes the document even if extraction fails.
	    with fitz.open(file_path) as doc:
	        full_text = "\n".join([page.get_text("text") for page in doc])
	        found_captions = extract_captions_from_text(full_text)

	        for page_no, page in enumerate(doc, start=1):
	            for img_no, img in enumerate(page.get_images(full=True), start=1):
	                base = doc.extract_image(img[0])
	                path = f"{output_folder}/page{page_no}_img{img_no}.{base['ext']}"
	                with open(path, "wb") as f:
	                    f.write(base["image"])
	                fallback = f"Figure {page_no}.{img_no}"
	                captions.append(_pick_caption(found_captions, len(images), fallback))
	                images.append(path)
	    return images, captions


	def _extract_docx_images(file_path, output_folder):
	    """Save every embedded image of a .docx file, with captions."""
	    images, captions = [], []
	    doc = Document(file_path)
	    body_text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
	    found_captions = extract_captions_from_text(body_text)

	    # rel_no counts all relationships (not only images), as file names and
	    # fallback labels have always been numbered that way.
	    for rel_no, relation in enumerate(doc.part.rels.values(), start=1):
	        # External relationships are links, not embedded parts: nothing to save.
	        if getattr(relation, "is_external", False):
	            continue
	        if "image" not in relation.target_ref:
	            continue
	        # Keep the real media type (.jpeg, .gif, ...) instead of forcing .png.
	        suffix = os.path.splitext(relation.target_ref)[1] or ".png"
	        path = f"{output_folder}/docx_image_{rel_no}{suffix}"
	        with open(path, "wb") as f:
	            f.write(relation.target_part.blob)
	        captions.append(_pick_caption(found_captions, len(images), f"Figure {rel_no}"))
	        images.append(path)
	    return images, captions

	# Generate semantic coherence note based on score
	def generate_note(score):
	    """Translate a coherence score into a short human-readable note.

	    Scores strictly above 0.85 are reported as high coherence, scores
	    strictly above 0.6 as moderate, and everything else as low.
	    """
	    tiers = (
	        (0.85, "High semantic coherence. The response is likely solid and relevant."),
	        (0.6, "Moderate coherence. The response is understandable but may contain approximations."),
	    )
	    # Tiers are ordered from the highest threshold down; first match wins.
	    for threshold, note in tiers:
	        if score > threshold:
	            return note
	    return "Low coherence. It may be helpful to rephrase the question or provide more context."

	# Simulate LLM response generation
	def generate_response(question, temperature=0.7):
	    """Return a canned stand-in for an LLM answer.

	    Questions containing "Rephrase" get a fixed rephrased question; any
	    other question gets a placeholder echoing the temperature and the
	    question itself.
	    """
	    wants_rephrasing = "Rephrase" in question
	    return (
	        "How does enthalpy change during a phase transition?"
	        if wants_rephrasing
	        else f"[Simulated response at temperature {temperature} for: {question}]"
	    )