File size: 5,803 Bytes
c7077c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# pdfhandle.py (Enhanced with AI fallback)
import pdfplumber
import re
import logging
import os
from langchain_community.chat_models import AzureChatOpenAI
#from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List

# Configure the root logger at INFO level as an import-time side effect
# (logging.basicConfig does nothing if the root logger already has handlers),
# then create the module-level logger used by every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Schema for a single extracted lab-test row; it is the element type of
# MedicalReport.parameters.
# NOTE(review): documented with `#` comments rather than a class docstring
# because pydantic includes a model's docstring in its generated schema, which
# would presumably alter the format instructions sent to the LLM — confirm
# before converting these comments to a docstring.
class MedicalParameter(BaseModel):
    # All three fields are plain strings; the observed value is not converted
    # to a number here. The `description` texts are runtime schema metadata.
    test: str = Field(description="Name of the medical test")
    value: str = Field(description="Observed value of the test")
    reference: str = Field(description="Reference range with units if available")

# Top-level schema for a parsed report: a list of MedicalParameter entries.
# ai_based_parse hands this model to PydanticOutputParser, which supplies the
# format instructions for the prompt and parses the LLM reply into it.
class MedicalReport(BaseModel):
    parameters: List[MedicalParameter] = Field(description="List of medical parameters from the report")

def parse_medical_pdf(pdf_file):
    """Parse a medical report PDF, preferring regex parsing over the AI fallback.

    The regex-based ``standard_parse`` runs first. Only when it returns an
    empty result is ``ai_based_parse`` invoked on the same ``pdf_file``.

    Returns:
        Whatever the parser that produced the result returned: the non-empty
        output of ``standard_parse``, otherwise the output of
        ``ai_based_parse``.
    """
    regex_rows = standard_parse(pdf_file)
    if regex_rows:
        # Regex parsing found something, so the AI fallback is not needed.
        return regex_rows

    logger.info("Standard parsing yielded no results. Trying AI-based parsing...")
    return ai_based_parse(pdf_file)

def standard_parse(pdf_file):
    """Standard regex-based parsing method"""
    results = []
    header_found = False
    header_pattern = re.compile(
        r'TEST\s+NAME\s+OBSERVED\s+VALUE\s+UNITS\s+BIO\.?\s+REF\.?\s*INTERVAL',
        re.IGNORECASE
    )
    
    # Extended pattern to handle common variations in medical reports
    data_pattern = re.compile(
        r'^(?P<test>.+?)\s+'          # Test name (non-greedy match)
        r'(?P<value>\d+\.?\d*)\s+'    # Numeric value
        r'(?P<units>[^\s]+)\s+'       # Units (no spaces)
        r'(?P<ref>.+)$'               # Reference range
    )

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            lines = [line.strip() for line in text.split('\n') if line.strip()]
            
            for line in lines:
                # Skip disclaimers and empty lines
                if not line or line.startswith('Disclaimer'):
                    continue
                
                # Detect header row
                if header_pattern.search(line):
                    header_found = True
                    logger.info(f"Header found: {line}")
                    continue
                
                if header_found:
                    # Skip section headers (all caps without numbers)
                    if re.match(r'^[A-Z\s/]+$', line) and not re.search(r'\d', line):
                        logger.debug(f"Skipping section: {line}")
                        continue
                    
                    # Extract data using regex
                    if match := data_pattern.match(line):
                        data = match.groupdict()
                        results.append({
                            "test": data['test'].strip(),
                            "value": data['value'],
                            "reference": f"{data['ref']} {data['units']}".strip()
                        })
                        logger.info(f"Valid row: {data}")
                    else:
                        logger.debug(f"Skipped line: {line}")
    
    return results

def ai_based_parse(pdf_file):
    """Extract test parameters from a PDF by prompting an Azure OpenAI chat model.

    The text of every page is concatenated, embedded in a prompt together with
    the ``PydanticOutputParser`` format instructions for ``MedicalReport``, and
    sent to the model. The reply is parsed back into a ``MedicalReport``.

    Connection settings are read from the environment:
    ``AZURE_OPENAI_API_VERSION`` (default ``"2024-02-15-preview"``),
    ``AZURE_OPENAI_DEPLOYMENT_NAME``, ``AZURE_OPENAI_API_KEY`` and
    ``AZURE_OPENAI_ENDPOINT``.

    Args:
        pdf_file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        A list of dicts with the keys ``"test"``, ``"value"`` and
        ``"reference"``. An empty list is returned when the PDF has no
        extractable text or when any step fails; failures are logged with a
        traceback and never raised to the caller.
    """
    try:
        # Extract text from PDF. extract_text() may return None for a page
        # without a text layer, so fall back to "" instead of failing the
        # whole document on one blank page.
        with pdfplumber.open(pdf_file) as pdf:
            full_text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

        # With no text there is nothing to extract; skip the model call rather
        # than prompting it with an empty report.
        if not full_text.strip():
            logger.warning("No extractable text found in PDF; skipping AI-based parsing.")
            return []

        # Configure Azure OpenAI client
        llm = AzureChatOpenAI(
            openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview"),
            azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
            openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
        )

        # Define the output parser
        parser = PydanticOutputParser(pydantic_object=MedicalReport)

        # Create the prompt
        prompt = f"""
        You are a medical data extraction expert. Extract all medical test parameters from this report.
        
        Medical Report Text:
        {full_text}
        
        Extract each test with its observed value and reference range. Format your response exactly as in this example:
        {{
            "parameters": [
                {{
                    "test": "Hemoglobin",
                    "value": "14.5",
                    "reference": "13.0 - 17.0 g/dL"
                }},
                {{
                    "test": "Total Cholesterol",
                    "value": "198",
                    "reference": "<200 mg/dL"
                }}
            ]
        }}
        
        Extract only actual test parameters. Include units in the reference field.
        {parser.get_format_instructions()}
        """

        # Get response from the LLM (invoke replaces the deprecated
        # predict_messages and likewise returns a message with .content)
        response = llm.invoke([HumanMessage(content=prompt)])

        # Parse the response
        report = parser.parse(response.content)

        # Convert to the same dict shape that standard_parse returns
        results = [
            {
                "test": param.test,
                "value": param.value,
                "reference": param.reference,
            }
            for param in report.parameters
        ]

        logger.info("AI parsing successful. Extracted %d parameters.", len(results))
        return results

    except Exception as e:
        # Best-effort fallback: the caller expects a list, so log the failure
        # (with traceback) and return no results instead of propagating.
        logger.exception("AI-based parsing failed: %s", e)
        return []