# AI_Doctor / pdfhandle.py
# tamilprabaharan's picture
# Initial commit of AI Doctor App
# c7077c5
# pdfhandle.py (Enhanced with AI fallback)
import pdfplumber
import re
import logging
import os
from langchain_community.chat_models import AzureChatOpenAI
#from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List
# Import-time side effect: configure the root logger at INFO level.
# (logging.basicConfig does nothing if the root logger already has handlers.)
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, shared by the parsers below.
logger = logging.getLogger(__name__)
# One test row extracted from a medical report. All three fields are plain
# strings; the observed value is not converted to a number.
# NOTE: documented with comments instead of a docstring on purpose. Pydantic
# copies a model docstring into the model's JSON schema, and format
# instructions generated from this model are inserted into the LLM prompt in
# ai_based_parse, so a docstring could change the prompt text.
class MedicalParameter(BaseModel):
    # Test name as it should appear in the parsed result.
    test: str = Field(
        description="Name of the medical test",
    )
    # Observed value, kept as text.
    value: str = Field(
        description="Observed value of the test",
    )
    # Reference range, with units when the report provides them.
    reference: str = Field(
        description="Reference range with units if available",
    )
# Top-level structure the LLM response is parsed into: a list of test rows.
# NOTE: documented with comments instead of a docstring on purpose. Pydantic
# copies a model docstring into the model's JSON schema, and format
# instructions generated from this model are inserted into the LLM prompt in
# ai_based_parse, so a docstring could change the prompt text.
class MedicalReport(BaseModel):
    # Every test row found in the report, in the order the parser returns them.
    parameters: List[MedicalParameter] = Field(
        description="List of medical parameters from the report",
    )
def parse_medical_pdf(pdf_file):
    """Extract test parameters from a medical report PDF.

    The regex-based ``standard_parse`` runs first. Only when it returns
    nothing is ``ai_based_parse`` called as a fallback.

    Args:
        pdf_file: The PDF to parse; passed unchanged to both parsers.

    Returns:
        The rows from ``standard_parse`` when it found any, otherwise
        whatever ``ai_based_parse`` returns.
    """
    parsed_rows = standard_parse(pdf_file)
    if parsed_rows:
        return parsed_rows

    # The regex parser came back empty, so fall back to the LLM-based parser.
    logger.info("Standard parsing yielded no results. Trying AI-based parsing...")
    return ai_based_parse(pdf_file)
def standard_parse(pdf_file):
"""Standard regex-based parsing method"""
results = []
header_found = False
header_pattern = re.compile(
r'TEST\s+NAME\s+OBSERVED\s+VALUE\s+UNITS\s+BIO\.?\s+REF\.?\s*INTERVAL',
re.IGNORECASE
)
# Extended pattern to handle common variations in medical reports
data_pattern = re.compile(
r'^(?P<test>.+?)\s+' # Test name (non-greedy match)
r'(?P<value>\d+\.?\d*)\s+' # Numeric value
r'(?P<units>[^\s]+)\s+' # Units (no spaces)
r'(?P<ref>.+)$' # Reference range
)
with pdfplumber.open(pdf_file) as pdf:
for page in pdf.pages:
text = page.extract_text()
lines = [line.strip() for line in text.split('\n') if line.strip()]
for line in lines:
# Skip disclaimers and empty lines
if not line or line.startswith('Disclaimer'):
continue
# Detect header row
if header_pattern.search(line):
header_found = True
logger.info(f"Header found: {line}")
continue
if header_found:
# Skip section headers (all caps without numbers)
if re.match(r'^[A-Z\s/]+$', line) and not re.search(r'\d', line):
logger.debug(f"Skipping section: {line}")
continue
# Extract data using regex
if match := data_pattern.match(line):
data = match.groupdict()
results.append({
"test": data['test'].strip(),
"value": data['value'],
"reference": f"{data['ref']} {data['units']}".strip()
})
logger.info(f"Valid row: {data}")
else:
logger.debug(f"Skipped line: {line}")
return results
def ai_based_parse(pdf_file):
    """Extract test parameters by sending the PDF's text to Azure OpenAI.

    The text of every page is concatenated, embedded in an extraction prompt
    together with the ``MedicalReport`` format instructions, and sent to an
    ``AzureChatOpenAI`` chat model. The reply is parsed back into a
    ``MedicalReport`` and flattened into plain dicts.

    Configuration is read from the environment: ``AZURE_OPENAI_API_VERSION``
    (default ``"2024-02-15-preview"``), ``AZURE_OPENAI_DEPLOYMENT_NAME``,
    ``AZURE_OPENAI_API_KEY`` and ``AZURE_OPENAI_ENDPOINT``.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts.

    Returns:
        A list of dicts with the keys ``"test"``, ``"value"`` and
        ``"reference"``. An empty list is returned when the PDF has no
        extractable text or when any step fails; errors are logged and never
        raised to the caller.
    """
    try:
        # Extract text from PDF. extract_text() may return None for a page
        # without a text layer (depending on the pdfplumber version), which
        # must not abort the whole extraction.
        with pdfplumber.open(pdf_file) as pdf:
            full_text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

        # With nothing to read, the model could only invent values, so do
        # not call it at all.
        if not full_text.strip():
            logger.warning("AI-based parsing skipped: no text could be extracted from the PDF.")
            return []

        # Configure Azure OpenAI client
        llm = AzureChatOpenAI(
            openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview"),
            azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
            openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
        )

        # Define the output parser
        parser = PydanticOutputParser(pydantic_object=MedicalReport)

        # Create the prompt. The text is kept flush-left so that no source
        # indentation ends up inside the prompt string.
        prompt = f"""
You are a medical data extraction expert. Extract all medical test parameters from this report.
Medical Report Text:
{full_text}
Extract each test with its observed value and reference range. Format your response exactly as in this example:
{{
"parameters": [
{{
"test": "Hemoglobin",
"value": "14.5",
"reference": "13.0 - 17.0 g/dL"
}},
{{
"test": "Total Cholesterol",
"value": "198",
"reference": "<200 mg/dL"
}}
]
}}
Extract only actual test parameters. Include units in the reference field.
{parser.get_format_instructions()}
"""

        # Get response from the LLM (invoke replaces the deprecated
        # predict_messages and likewise returns a message with .content).
        response = llm.invoke([HumanMessage(content=prompt)])

        # Parse the response
        report = parser.parse(response.content)

        # Convert to the same row format standard_parse produces
        results = [
            {
                "test": param.test,
                "value": param.value,
                "reference": param.reference
            }
            for param in report.parameters
        ]
        logger.info("AI parsing successful. Extracted %d parameters.", len(results))
        return results
    except Exception as e:
        # Deliberate best-effort boundary: this is a fallback path, so any
        # failure is logged with its traceback and reported as "no results".
        logger.exception("AI-based parsing failed: %s", e)
        return []