Spaces:
Build error
Build error
File size: 5,803 Bytes
c7077c5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
# pdfhandle.py (Enhanced with AI fallback)
import pdfplumber
import re
import logging
import os
from langchain_community.chat_models import AzureChatOpenAI
#from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List
# Configure the root logger to emit INFO and above, then create the
# module-level logger used by every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MedicalParameter(BaseModel):
    # One extracted row of a medical report: test name, observed value and
    # reference range. All three fields are kept as strings.
    #
    # NOTE: documented with comments rather than a class docstring on purpose;
    # pydantic includes a model's docstring in its generated JSON schema, so a
    # docstring here would alter the format instructions built from the schema.
    test: str = Field(
        description="Name of the medical test",
    )
    value: str = Field(
        description="Observed value of the test",
    )
    reference: str = Field(
        description="Reference range with units if available",
    )
class MedicalReport(BaseModel):
    # Top-level schema the LLM response is parsed into: a single list of
    # MedicalParameter entries.
    #
    # NOTE: documented with comments rather than a class docstring on purpose;
    # pydantic includes a model's docstring in its generated JSON schema, so a
    # docstring here would alter the format instructions built from the schema.
    parameters: List[MedicalParameter] = Field(
        description="List of medical parameters from the report",
    )
def parse_medical_pdf(pdf_file):
    """Parse a medical report PDF, preferring regex parsing over the AI parser.

    Runs ``standard_parse`` first. Only when that returns an empty (falsy)
    result is ``ai_based_parse`` invoked on the same ``pdf_file``.

    Returns:
        Whatever the parser that ended up being used returned.
    """
    regex_rows = standard_parse(pdf_file)
    if regex_rows:
        return regex_rows

    # Regex parsing found nothing; fall back to the LLM-backed parser.
    logger.info("Standard parsing yielded no results. Trying AI-based parsing...")
    return ai_based_parse(pdf_file)
def standard_parse(pdf_file):
"""Standard regex-based parsing method"""
results = []
header_found = False
header_pattern = re.compile(
r'TEST\s+NAME\s+OBSERVED\s+VALUE\s+UNITS\s+BIO\.?\s+REF\.?\s*INTERVAL',
re.IGNORECASE
)
# Extended pattern to handle common variations in medical reports
data_pattern = re.compile(
r'^(?P<test>.+?)\s+' # Test name (non-greedy match)
r'(?P<value>\d+\.?\d*)\s+' # Numeric value
r'(?P<units>[^\s]+)\s+' # Units (no spaces)
r'(?P<ref>.+)$' # Reference range
)
with pdfplumber.open(pdf_file) as pdf:
for page in pdf.pages:
text = page.extract_text()
lines = [line.strip() for line in text.split('\n') if line.strip()]
for line in lines:
# Skip disclaimers and empty lines
if not line or line.startswith('Disclaimer'):
continue
# Detect header row
if header_pattern.search(line):
header_found = True
logger.info(f"Header found: {line}")
continue
if header_found:
# Skip section headers (all caps without numbers)
if re.match(r'^[A-Z\s/]+$', line) and not re.search(r'\d', line):
logger.debug(f"Skipping section: {line}")
continue
# Extract data using regex
if match := data_pattern.match(line):
data = match.groupdict()
results.append({
"test": data['test'].strip(),
"value": data['value'],
"reference": f"{data['ref']} {data['units']}".strip()
})
logger.info(f"Valid row: {data}")
else:
logger.debug(f"Skipped line: {line}")
return results
def ai_based_parse(pdf_file):
    """AI-based parsing using LangChain and Azure OpenAI.

    Extracts the text of every page, asks the Azure OpenAI chat deployment to
    return the test parameters as JSON, and parses that JSON into a
    ``MedicalReport``.

    The Azure client is configured from the environment variables
    ``AZURE_OPENAI_API_VERSION`` (default ``"2024-02-15-preview"``),
    ``AZURE_OPENAI_DEPLOYMENT_NAME``, ``AZURE_OPENAI_API_KEY`` and
    ``AZURE_OPENAI_ENDPOINT``.

    Args:
        pdf_file: Anything accepted by ``pdfplumber.open``.

    Returns:
        A list of dicts with the keys ``"test"``, ``"value"`` and
        ``"reference"``. This is a best-effort fallback: on any failure
        (configuration, PDF reading, LLM call, response parsing) the error is
        logged with its traceback and an empty list is returned. An empty list
        is also returned, without calling the LLM, when the PDF contains no
        extractable text.
    """
    try:
        # Extract text from PDF. extract_text() may yield None for a page
        # without a text layer; treat it as empty so one image-only page does
        # not abort the whole fallback.
        with pdfplumber.open(pdf_file) as pdf:
            full_text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

        # Nothing to extract from: skip the LLM call rather than pay for a
        # request that can only produce invented values.
        if not full_text.strip():
            logger.warning("AI-based parsing skipped: no extractable text in PDF.")
            return []

        # Configure Azure OpenAI client
        llm = AzureChatOpenAI(
            openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION", "2024-02-15-preview"),
            azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
            openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
        )

        # Define the output parser
        parser = PydanticOutputParser(pydantic_object=MedicalReport)

        # Create the prompt. The literal is kept flush-left so the text sent
        # to the model carries no indentation from this function.
        prompt = f"""
You are a medical data extraction expert. Extract all medical test parameters from this report.
Medical Report Text:
{full_text}
Extract each test with its observed value and reference range. Format your response exactly as in this example:
{{
"parameters": [
{{
"test": "Hemoglobin",
"value": "14.5",
"reference": "13.0 - 17.0 g/dL"
}},
{{
"test": "Total Cholesterol",
"value": "198",
"reference": "<200 mg/dL"
}}
]
}}
Extract only actual test parameters. Include units in the reference field.
{parser.get_format_instructions()}
"""

        # Get response from the LLM. invoke() is the supported replacement
        # for the deprecated predict_messages().
        response = llm.invoke([HumanMessage(content=prompt)])

        # Parse the response
        report = parser.parse(response.content)

        # Convert to the expected format
        results = [
            {
                "test": param.test,
                "value": param.value,
                "reference": param.reference,
            }
            for param in report.parameters
        ]
        logger.info("AI parsing successful. Extracted %d parameters.", len(results))
        return results
    except Exception as e:
        # Deliberate catch-all: this is an optional fallback path, so callers
        # get an empty result instead of an exception. logger.exception keeps
        # the traceback for diagnosis.
        logger.exception("AI-based parsing failed: %s", e)
        return []