# pdfhandle.py (Enhanced with AI fallback) import pdfplumber import re import logging import os from langchain_community.chat_models import AzureChatOpenAI #from langchain.chat_models import AzureChatOpenAI from langchain.schema import HumanMessage from langchain.output_parsers import PydanticOutputParser from pydantic import BaseModel, Field from typing import List logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class MedicalParameter(BaseModel): test: str = Field(description="Name of the medical test") value: str = Field(description="Observed value of the test") reference: str = Field(description="Reference range with units if available") class MedicalReport(BaseModel): parameters: List[MedicalParameter] = Field(description="List of medical parameters from the report") def parse_medical_pdf(pdf_file): """Enhanced PDF parser with AI fallback for medical reports""" # First attempt with regex-based parsing results = standard_parse(pdf_file) # If standard parsing yields no results, try AI-based parsing if not results: logger.info("Standard parsing yielded no results. Trying AI-based parsing...") results = ai_based_parse(pdf_file) return results def standard_parse(pdf_file): """Standard regex-based parsing method""" results = [] header_found = False header_pattern = re.compile( r'TEST\s+NAME\s+OBSERVED\s+VALUE\s+UNITS\s+BIO\.?\s+REF\.?\s*INTERVAL', re.IGNORECASE ) # Extended pattern to handle common variations in medical reports data_pattern = re.compile( r'^(?P.+?)\s+' # Test name (non-greedy match) r'(?P\d+\.?\d*)\s+' # Numeric value r'(?P[^\s]+)\s+' # Units (no spaces) r'(?P.+)$' # Reference range ) with pdfplumber.open(pdf_file) as pdf: for page in pdf.pages: text = page.extract_text() lines = [line.strip() for line in text.split('\n') if line.strip()] for line in lines: # Skip disclaimers and empty lines if not line or line.startswith('Disclaimer'): continue # Detect header row if header_pattern.search(line): header_found = True logger.info(f"Header found: {line}") continue if header_found: # Skip section headers (all caps without numbers) if re.match(r'^[A-Z\s/]+$', line) and not re.search(r'\d', line): logger.debug(f"Skipping section: {line}") continue # Extract data using regex if match := data_pattern.match(line): data = match.groupdict() results.append({ "test": data['test'].strip(), "value": data['value'], "reference": f"{data['ref']} {data['units']}".strip() }) logger.info(f"Valid row: {data}") else: logger.debug(f"Skipped line: {line}") return results def ai_based_parse(pdf_file): """AI-based parsing using LangChain and Azure OpenAI""" try: # Configure Azure OpenAI client llm = AzureChatOpenAI( openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview"), azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"), openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"), azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT") ) # Extract text from PDF full_text = "" with pdfplumber.open(pdf_file) as pdf: for page in pdf.pages: full_text += page.extract_text() + "\n" # Define the output parser parser = PydanticOutputParser(pydantic_object=MedicalReport) # Create the prompt prompt = f""" You are a medical data extraction expert. Extract all medical test parameters from this report. Medical Report Text: {full_text} Extract each test with its observed value and reference range. Format your response exactly as in this example: {{ "parameters": [ {{ "test": "Hemoglobin", "value": "14.5", "reference": "13.0 - 17.0 g/dL" }}, {{ "test": "Total Cholesterol", "value": "198", "reference": "<200 mg/dL" }} ] }} Extract only actual test parameters. Include units in the reference field. {parser.get_format_instructions()} """ # Get response from the LLM messages = [HumanMessage(content=prompt)] response = llm.predict_messages(messages) # Parse the response report = parser.parse(response.content) # Convert to the expected format results = [] for param in report.parameters: results.append({ "test": param.test, "value": param.value, "reference": param.reference }) logger.info(f"AI parsing successful. Extracted {len(results)} parameters.") return results except Exception as e: logger.error(f"AI-based parsing failed: {str(e)}") return []