Spaces:

tamilprabaharan
/

AI_Doctor

Build error

App Files Files Community

AI_Doctor / pdfhandle.py

tamilprabaharan

Initial commit of AI Doctor App

c7077c5 5 months ago

raw

history blame contribute delete

5.8 kB

	# pdfhandle.py (Enhanced with AI fallback)
	import pdfplumber
	import re
	import logging
	import os
	from langchain_community.chat_models import AzureChatOpenAI
	#from langchain.chat_models import AzureChatOpenAI
	from langchain.schema import HumanMessage
	from langchain.output_parsers import PydanticOutputParser
	from pydantic import BaseModel, Field
	from typing import List

	# Configure the root logger at import time so INFO-and-above records are
	# emitted; the logger.debug(...) calls in this module stay silent at this level.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger, named after this module, shared by all parsers below.
	logger = logging.getLogger(__name__)

	class MedicalParameter(BaseModel):
	    # One extracted test row: test name, observed value, reference range.
	    # All three fields are kept as strings.
	    # NOTE: documented with comments rather than a class docstring, since
	    # Pydantic copies a class docstring into the model's JSON schema.
	    test: str = Field(
	        description="Name of the medical test",
	    )
	    value: str = Field(
	        description="Observed value of the test",
	    )
	    reference: str = Field(
	        description="Reference range with units if available",
	    )

	class MedicalReport(BaseModel):
	    # Top-level container: the list of MedicalParameter rows for one report.
	    # NOTE: documented with comments rather than a class docstring, since
	    # Pydantic copies a class docstring into the model's JSON schema.
	    parameters: List[MedicalParameter] = Field(
	        description="List of medical parameters from the report",
	    )

	def parse_medical_pdf(pdf_file):
	    """Parse a medical report PDF, using the AI parser only as a fallback.

	    Args:
	        pdf_file: Passed unchanged to ``standard_parse`` and, when that
	            yields nothing, to ``ai_based_parse``.

	    Returns:
	        The non-empty result of ``standard_parse`` if there is one;
	        otherwise whatever ``ai_based_parse`` returns.
	    """
	    regex_rows = standard_parse(pdf_file)
	    if regex_rows:
	        return regex_rows

	    # Regex parsing found nothing, so hand the same file to the AI parser.
	    logger.info("Standard parsing yielded no results. Trying AI-based parsing...")
	    return ai_based_parse(pdf_file)

	def standard_parse(pdf_file):
	"""Standard regex-based parsing method"""
	results = []
	header_found = False
	header_pattern = re.compile(
	r'TEST\s+NAME\s+OBSERVED\s+VALUE\s+UNITS\s+BIO\.?\s+REF\.?\s*INTERVAL',
	re.IGNORECASE
	)

	# Extended pattern to handle common variations in medical reports
	data_pattern = re.compile(
	r'^(?P<test>.+?)\s+' # Test name (non-greedy match)
	r'(?P<value>\d+\.?\d*)\s+' # Numeric value
	r'(?P<units>[^\s]+)\s+' # Units (no spaces)
	r'(?P<ref>.+)$' # Reference range
	)

	with pdfplumber.open(pdf_file) as pdf:
	for page in pdf.pages:
	text = page.extract_text()
	lines = [line.strip() for line in text.split('\n') if line.strip()]

	for line in lines:
	# Skip disclaimers and empty lines
	if not line or line.startswith('Disclaimer'):
	continue

	# Detect header row
	if header_pattern.search(line):
	header_found = True
	logger.info(f"Header found: {line}")
	continue

	if header_found:
	# Skip section headers (all caps without numbers)
	if re.match(r'^[A-Z\s/]+$', line) and not re.search(r'\d', line):
	logger.debug(f"Skipping section: {line}")
	continue

	# Extract data using regex
	if match := data_pattern.match(line):
	data = match.groupdict()
	results.append({
	"test": data['test'].strip(),
	"value": data['value'],
	"reference": f"{data['ref']} {data['units']}".strip()
	})
	logger.info(f"Valid row: {data}")
	else:
	logger.debug(f"Skipped line: {line}")

	return results

	def ai_based_parse(pdf_file):
	    """Extract test rows from a report PDF with an Azure OpenAI chat model.

	    The text of every page is concatenated and sent to the model together
	    with a JSON example and the ``PydanticOutputParser`` format instructions
	    for ``MedicalReport``; the reply is parsed back into ``MedicalReport``.

	    The Azure client is configured from the environment variables
	    ``AZURE_OPENAI_API_VERSION`` (default ``"2024-02-15-preview"``),
	    ``AZURE_OPENAI_DEPLOYMENT_NAME``, ``AZURE_OPENAI_API_KEY`` and
	    ``AZURE_OPENAI_ENDPOINT``.

	    Args:
	        pdf_file: Anything ``pdfplumber.open`` accepts.

	    Returns:
	        A list of dicts with the keys ``"test"``, ``"value"`` and
	        ``"reference"``. Returns ``[]`` when the PDF has no extractable text
	        or when any step fails; failures are logged, never raised.
	    """
	    try:
	        # Extract text first so a text-free PDF never costs a model call.
	        # extract_text() can yield None/"" for pages without a text layer.
	        with pdfplumber.open(pdf_file) as pdf:
	            page_texts = [page.extract_text() or "" for page in pdf.pages]
	        full_text = "".join(f"{page_text}\n" for page_text in page_texts)

	        # With nothing to read, the model could only invent values.
	        if not full_text.strip():
	            logger.warning("AI-based parsing skipped: no extractable text in PDF.")
	            return []

	        # Configure Azure OpenAI client
	        llm = AzureChatOpenAI(
	            openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview"),
	            azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
	            openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
	            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
	        )

	        # Define the output parser
	        parser = PydanticOutputParser(pydantic_object=MedicalReport)

	        # Create the prompt
	        prompt = f"""
	You are a medical data extraction expert. Extract all medical test parameters from this report.

	Medical Report Text:
	{full_text}

	Extract each test with its observed value and reference range. Format your response exactly as in this example:
	{{
	"parameters": [
	{{
	"test": "Hemoglobin",
	"value": "14.5",
	"reference": "13.0 - 17.0 g/dL"
	}},
	{{
	"test": "Total Cholesterol",
	"value": "198",
	"reference": "<200 mg/dL"
	}}
	]
	}}

	Extract only actual test parameters. Include units in the reference field.
	{parser.get_format_instructions()}
	"""

	        # Get response from the LLM (invoke replaces the deprecated
	        # predict_messages and likewise returns a message with .content)
	        messages = [HumanMessage(content=prompt)]
	        response = llm.invoke(messages)

	        # Parse the response
	        report = parser.parse(response.content)

	        # Convert to the expected format
	        results = [
	            {
	                "test": param.test,
	                "value": param.value,
	                "reference": param.reference,
	            }
	            for param in report.parameters
	        ]

	        logger.info("AI parsing successful. Extracted %d parameters.", len(results))
	        return results

	    except Exception as e:
	        # Deliberate best-effort boundary: the fallback never raises, but the
	        # traceback is logged so configuration/API problems can be diagnosed.
	        logger.exception("AI-based parsing failed: %s", e)
	        return []