# exactEMR / app.py
# leonsimon23's picture
# Update app.py
# c7a353d verified
import gradio as gr
import langextract as lx
import json
import os
import tempfile
import textwrap
from datetime import datetime
# --- Preset template library ---
# Maps a report-type name (the keys are offered as choices in the template
# selector) to a dict with two entries:
#   "prompt":  the extraction instruction text for that report type
#   "example": a JSON *string* (built with json.dumps) holding a list with one
#              few-shot example; each example has a "text" and a list of
#              "extractions" (extraction_class / extraction_text / attributes).
# The "影像报告" entry also supplies the initial values of the prompt and
# example editors in the UI.
REPORT_TEMPLATES = {
# Imaging report template.
"影像报告": {
"prompt": textwrap.dedent("""\
请从影像检查报告中提取以下关键信息:
- 检查部位和器官
- 病灶的位置、大小、密度/信号特征
- 增强扫描表现
- 阴性发现(未见异常的部位)
提取规则:
1. 必须使用报告中的原文,不要改写
2. 保留数值的精确度
3. 完整提取解剖学描述"""),
"example": json.dumps([{
"text": "胸部CT平扫显示:右肺上叶尖段见结节影,大小约1.2 x 0.9 cm,边缘毛糙,密度不均。纵隔淋巴结未见明显肿大。",
"extractions": [
{"extraction_class": "anatomy", "extraction_text": "右肺上叶尖段", "attributes": {"organ": "肺", "side": "右", "lobe": "上叶", "segment": "尖段"}},
{"extraction_class": "finding", "extraction_text": "结节影", "attributes": {"type": "结节"}},
{"extraction_class": "size", "extraction_text": "1.2 x 0.9 cm", "attributes": {"length": "1.2", "width": "0.9", "unit": "cm"}},
{"extraction_class": "feature", "extraction_text": "边缘毛糙", "attributes": {"feature_type": "边缘", "description": "毛糙"}},
{"extraction_class": "feature", "extraction_text": "密度不均", "attributes": {"feature_type": "密度", "description": "不均"}},
{"extraction_class": "negative_finding", "extraction_text": "纵隔淋巴结未见明显肿大", "attributes": {"location": "纵隔", "structure": "淋巴结"}}
]
}], ensure_ascii=False, indent=2)
},
# Pathology report template.
"病理报告": {
"prompt": textwrap.dedent("""\
请从病理报告中提取以下关键信息:
- 标本来源和类型
- 组织学分型
- 肿瘤分级
- 免疫组化结果
- 切缘情况
提取规则:
1. 完整保留病理诊断术语
2. 提取所有阳性和阴性的免疫组化标记
3. 保留分级和分期信息"""),
"example": json.dumps([{
"text": "(右乳腺)浸润性导管癌,组织学II级,肿瘤大小2.5cm。免疫组化:ER(+,90%),PR(+,70%),HER-2(-),Ki-67(30%)。切缘未见癌累及。",
"extractions": [
{"extraction_class": "specimen", "extraction_text": "右乳腺", "attributes": {"location": "右乳腺"}},
{"extraction_class": "diagnosis", "extraction_text": "浸润性导管癌", "attributes": {"type": "癌", "subtype": "浸润性导管癌"}},
{"extraction_class": "grade", "extraction_text": "组织学II级", "attributes": {"grading_system": "组织学", "grade": "II"}},
{"extraction_class": "size", "extraction_text": "2.5cm", "attributes": {"value": "2.5", "unit": "cm"}},
{"extraction_class": "ihc_marker", "extraction_text": "ER(+,90%)", "attributes": {"marker": "ER", "result": "阳性", "percentage": "90"}},
{"extraction_class": "ihc_marker", "extraction_text": "PR(+,70%)", "attributes": {"marker": "PR", "result": "阳性", "percentage": "70"}},
{"extraction_class": "ihc_marker", "extraction_text": "HER-2(-)", "attributes": {"marker": "HER-2", "result": "阴性"}},
{"extraction_class": "ihc_marker", "extraction_text": "Ki-67(30%)", "attributes": {"marker": "Ki-67", "percentage": "30"}},
{"extraction_class": "margin", "extraction_text": "切缘未见癌累及", "attributes": {"status": "阴性"}}
]
}], ensure_ascii=False, indent=2)
},
# Medical record summary template.
"病历摘要": {
"prompt": textwrap.dedent("""\
请从病历中提取以下关键信息:
- 主诉和现病史要点
- 既往史(重要疾病和手术史)
- 体格检查阳性体征
- 辅助检查异常结果
- 诊断和治疗方案
提取规则:
1. 提取关键时间节点
2. 保留症状的完整描述
3. 提取数值型指标"""),
"example": json.dumps([{
"text": "患者因反复胸痛3月加重1周入院。既往有高血压病史10年。查体:BP 150/95mmHg,心率92次/分。",
"extractions": [
{"extraction_class": "chief_complaint", "extraction_text": "反复胸痛3月加重1周", "attributes": {"symptom": "胸痛", "duration": "3月", "change": "加重1周"}},
{"extraction_class": "past_history", "extraction_text": "高血压病史10年", "attributes": {"disease": "高血压", "duration": "10年"}},
{"extraction_class": "vital_sign", "extraction_text": "BP 150/95mmHg", "attributes": {"type": "血压", "systolic": "150", "diastolic": "95", "unit": "mmHg"}},
{"extraction_class": "vital_sign", "extraction_text": "心率92次/分", "attributes": {"type": "心率", "value": "92", "unit": "次/分"}}
]
}], ensure_ascii=False, indent=2)
}
}
# --- Sample text library ---
# Maps a sample name (the keys are offered in the "quick load" dropdown) to a
# ready-made input text; load_sample_text() looks entries up by key.
SAMPLE_TEXTS = {
"影像报告示例": "头颅MRI平扫增强:左侧基底节区见片状异常信号影,T1WI呈低信号,T2WI及FLAIR呈高信号,大小约4.5 x 3.2 x 3.8 cm,周围见大片水肿带。增强扫描后病灶呈不均匀明显强化,周围水肿带未见强化。中线结构轻度右移约0.5cm。双侧侧脑室对称,未见明显扩大。",
"病理报告示例": "(胃窦)腺癌,中分化,浸润至肌层。肿瘤大小3.0 x 2.5 cm。免疫组化:CK(+),CK7(-),CK20(+),CDX-2(+),Her-2(1+),Ki-67阳性指数约40%。送检淋巴结12枚,见癌转移3枚(3/12)。",
"病历示例": "患者女性58岁,因发现左乳腺肿物2月入院。患者2月前无意中发现左乳腺外上象限肿物,约2cm大小,无疼痛,无乳头溢液。既往体健。查体:左乳外上象限可触及约2.5 x 2.0cm肿物,质硬,边界欠清,活动度差,无压痛。左侧腋窝可触及1枚肿大淋巴结,约1.5cm。辅助检查:乳腺超声示左乳外上象限低回声结节,BI-RADS 4C类。"
}
# --- History management ---
# Module-level list of past extraction records, newest first
# (save_to_history inserts each new record at index 0).
# NOTE(review): as a module-level global this list is presumably shared by
# every session served by this process — confirm that is acceptable, since the
# records contain the submitted medical text.
extraction_history = []
def save_to_history(input_text, result, template_name):
    """Record one extraction run and return the refreshed history markdown.

    Args:
        input_text: The text that was submitted for extraction.
        result: The extraction result dict; its "extractions" list is counted.
        template_name: Name of the template that was selected for this run.

    Returns:
        The markdown produced by ``format_history_display()`` after the new
        record has been stored.
    """
    recorded_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Store only a short preview: the first 100 characters plus an ellipsis
    # when the input is longer than that.
    if len(input_text) > 100:
        preview = input_text[:100] + "..."
    else:
        preview = input_text

    record = {
        "timestamp": recorded_at,
        "template": template_name,
        "input_preview": preview,
        "extraction_count": len(result.get("extractions", [])),
        "full_result": result,
    }

    # Newest record goes first; drop the oldest once more than 10 are held.
    extraction_history.insert(0, record)
    if len(extraction_history) > 10:
        extraction_history.pop()

    return format_history_display()
def format_history_display():
    """Render the stored extraction history as a markdown string.

    Returns:
        A placeholder message when no history exists, otherwise one markdown
        section per record (numbered from 1, in stored order).
    """
    if not extraction_history:
        return "暂无提取历史"

    sections = []
    for position, record in enumerate(extraction_history, start=1):
        sections.append(
            f"### 记录 {position} - {record['timestamp']}\n"
            f"**模板**: {record['template']} | **提取项**: {record['extraction_count']}项\n"
            f"**文本预览**: {record['input_preview']}\n\n"
        )
    return "".join(sections)
# --- Statistics generation ---
def generate_statistics(result):
    """Build a markdown summary of how many extractions each class has.

    Args:
        result: Dict whose "extractions" entry is a list of dicts, each
            optionally carrying an "extraction_class" key.

    Returns:
        A placeholder message when there are no extractions, otherwise a
        markdown block with the total count followed by per-class counts in
        descending order (ties keep first-seen order).
    """
    items = result.get("extractions", [])
    if not items:
        return "暂无统计信息"

    # Tally occurrences per class; entries without a class fall under "未分类".
    tally = {}
    for item in items:
        label = item.get("extraction_class", "未分类")
        tally.setdefault(label, 0)
        tally[label] += 1

    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

    parts = [
        "### 📊 提取统计\n",
        f"**总提取项数**: {len(items)}\n\n",
        "**分类统计**:\n",
    ]
    parts.extend(f"- {label}: {count}项\n" for label, count in ranked)
    return "".join(parts)
# --- Backend processing ---
def _build_examples(examples_json):
    """Parse the few-shot examples JSON into a list of ``lx.data.ExampleData``."""
    examples_data = json.loads(examples_json)
    return [
        lx.data.ExampleData(
            text=ex['text'],
            extractions=[lx.data.Extraction(**extr) for extr in ex['extractions']],
        )
        for ex in examples_data
    ]


def _span_to_jsonable(ext):
    """Return an extraction's character span as a JSON-safe value, or None."""
    # langextract exposes the aligned span as ``char_interval``; ``char_span``
    # (what this code used to read) is kept only as a fallback.
    span = getattr(ext, 'char_interval', None)
    if span is None:
        span = getattr(ext, 'char_span', None)
    if span is None or isinstance(span, (dict, list, tuple, str, int)):
        return span
    start = getattr(span, 'start_pos', None)
    end = getattr(span, 'end_pos', None)
    if start is None and end is None:
        # Unknown span shape: stringify so JSON serialisation cannot fail.
        return str(span)
    return {"start_pos": start, "end_pos": end}


def _extraction_to_dict(ext):
    """Convert one extraction object into a plain, JSON-serialisable dict."""
    return {
        "extraction_class": getattr(ext, 'extraction_class', ""),
        "extraction_text": getattr(ext, 'extraction_text', ""),
        "char_span": _span_to_jsonable(ext),
        "attributes": getattr(ext, 'attributes', None) or {},
    }


def _write_download_file(results, output_dict):
    """Write the results to a persistent temporary .jsonl file and return its path."""
    # The download file must outlive this call so Gradio can serve it,
    # hence delete=False.
    with tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.jsonl', encoding='utf-8'
    ) as handle:
        download_path = handle.name

    try:
        # The library writes into a directory, so give it a scratch one and
        # copy the produced file into the persistent download file.
        with tempfile.TemporaryDirectory() as tmp_dir:
            documents = results if isinstance(results, list) else [results]
            # Name the output explicitly so the lookup below does not depend
            # on the library's default file name.
            lx.io.save_annotated_documents(
                documents, output_dir=tmp_dir, output_name='output.jsonl'
            )
            source_path = os.path.join(tmp_dir, 'output.jsonl')
            if os.path.exists(source_path):
                with open(source_path, 'r', encoding='utf-8') as src, \
                        open(download_path, 'w', encoding='utf-8') as dest:
                    dest.write(src.read())
            else:
                # Fallback: write our own dict as a single JSON line so the
                # file is valid JSONL.
                with open(download_path, 'w', encoding='utf-8') as dest:
                    json.dump(output_dict, dest, ensure_ascii=False)
                    dest.write("\n")
    except Exception:
        # Do not leave an orphaned temp file behind when saving fails.
        try:
            os.remove(download_path)
        except OSError:
            pass
        raise
    return download_path


def extract_information(api_key, prompt, examples_json, input_text, template_name):
    """Run the extraction and prepare every value the UI displays.

    Args:
        api_key: Google AI Studio API key entered by the user.
        prompt: Extraction instruction text.
        examples_json: JSON string with a list of few-shot examples, each
            having "text" and "extractions".
        input_text: The text to extract from.
        template_name: Name of the selected template (stored in the history).

    Returns:
        A 5-tuple ``(output_dict, download_path, stats_markdown,
        history_markdown, gr.update(visible=True))`` matching the outputs
        wired to the submit button.

    Raises:
        gr.Error: On missing API key or input text, malformed examples JSON,
            or any failure during extraction or saving.
    """
    api_key = (api_key or "").strip()
    if not api_key:
        raise gr.Error("⚠️ 请输入您的 Google AI Studio API 密钥")
    if not input_text or not input_text.strip():
        raise gr.Error("⚠️ 请输入待提取的文本内容")

    try:
        examples = _build_examples(examples_json)
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; TypeError covers a
        # non-list payload or unexpected keys passed to lx.data.Extraction.
        raise gr.Error(f"❌ 示例JSON格式错误: {e}") from e
    if not examples:
        raise gr.Error("❌ 示例JSON格式错误: 至少需要提供一个示例")

    try:
        # Pass the key per call; writing it to os.environ would share one
        # user's key with every other request in this process.
        results = lx.extract(
            text_or_documents=input_text,
            prompt_description=prompt,
            examples=examples,
            model_id="gemini-2.5-flash",  # using the latest model is recommended
            api_key=api_key,
        )

        # Only the first document is shown when a list comes back.
        if isinstance(results, list) and len(results) > 0:
            result = results[0]
        else:
            result = results

        # ``extractions`` may be missing or None when nothing was found.
        found = getattr(result, 'extractions', None) or []
        output_dict = {
            "source_text": getattr(result, 'text', input_text),
            "extractions": [_extraction_to_dict(ext) for ext in found],
        }

        download_path = _write_download_file(results, output_dict)

        stats = generate_statistics(output_dict)
        history = save_to_history(input_text, output_dict, template_name)
        return output_dict, download_path, stats, history, gr.update(visible=True)
    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        print(f"详细错误信息:\n{error_detail}")
        raise gr.Error(f"❌ 提取失败: {str(e)}") from e
def load_template(template_name):
    """Look up a preset template by name.

    Returns:
        A ``(prompt, example)`` pair for a known template name, or a pair of
        empty strings when the name is not in ``REPORT_TEMPLATES``.
    """
    if template_name not in REPORT_TEMPLATES:
        return "", ""
    chosen = REPORT_TEMPLATES[template_name]
    return chosen["prompt"], chosen["example"]
def load_sample_text(sample_name):
    """Return the sample text stored under ``sample_name``, or "" if unknown."""
    if sample_name in SAMPLE_TEXTS:
        return SAMPLE_TEXTS[sample_name]
    return ""
# --- Gradio interface ---
# Stylesheet passed as css= to gr.Blocks. The #header rules target the row
# created with elem_id="header" and #stats-box targets the statistics markdown
# (elem_id="stats-box"); no element using .template-btn is visible in this file.
custom_css = """
#header {
text-align: center;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 2rem;
border-radius: 10px;
color: white;
margin-bottom: 2rem;
}
#header h1 {
margin: 0;
font-size: 2.5rem;
font-weight: 700;
}
#header p {
margin: 0.5rem 0 0 0;
font-size: 1.1rem;
opacity: 0.95;
}
.template-btn {
margin: 0.25rem !important;
}
#stats-box {
background: #f8f9fa;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #667eea;
}
"""
# Build the Gradio UI and bind it to `demo`.
# NOTE(review): leading indentation is not visible in this copy of the file, so
# the exact nesting of the `with` layout contexts below cannot be read off the
# text — confirm against the original before restructuring anything here.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="医学信息智能提取系统") as demo:
# Header banner (styled by the #header rules in custom_css).
with gr.Row(elem_id="header"):
gr.Markdown("""
# 🏥 医学信息智能提取系统
### 基于 AI 的结构化医学文本分析工具
""")
with gr.Row():
# Configuration column: API key, template choice, prompt and examples.
with gr.Column(scale=1):
gr.Markdown("### 🔐 API 配置")
api_key_input = gr.Textbox(
label="Google AI Studio API Key",
type="password",
placeholder="请输入您的 API 密钥...",
info="获取密钥: https://aistudio.google.com/app/apikey"
)
gr.Markdown("### 📋 选择报告类型")
template_selector = gr.Radio(
choices=list(REPORT_TEMPLATES.keys()),
value="影像报告",
label="预设模板",
info="选择适合您文本类型的模板"
)
load_template_btn = gr.Button("📥 加载模板", variant="secondary", size="sm")
gr.Markdown("### ✏️ 自定义提取规则")
# The prompt and example editors start out with the "影像报告" template.
prompt_input = gr.Textbox(
label="提取指令",
value=REPORT_TEMPLATES["影像报告"]["prompt"],
lines=8,
placeholder="描述您想提取的信息类型和规则..."
)
with gr.Accordion("🎯 提取示例 (JSON格式)", open=False):
examples_input = gr.Code(
value=REPORT_TEMPLATES["影像报告"]["example"],
language="json",
lines=15,
label="示例数据"
)
# Input column: sample picker, text box, clear/submit buttons.
with gr.Column(scale=1):
gr.Markdown("### 📄 输入医学文本")
sample_selector = gr.Dropdown(
choices=list(SAMPLE_TEXTS.keys()),
label="快速加载示例",
value=None
)
text_input = gr.Textbox(
label="待提取文本",
lines=18,
placeholder="请粘贴或输入医学报告、病历等文本...\n\n支持:\n• 影像报告 (CT/MRI/X线等)\n• 病理报告\n• 病历记录\n• 检验报告",
max_lines=25
)
with gr.Row():
clear_btn = gr.Button("🗑️ 清空", size="sm")
submit_btn = gr.Button("🚀 开始提取", variant="primary", size="lg", scale=2)
# Results column: JSON view, statistics, history and the download file.
with gr.Column(scale=1):
gr.Markdown("### ✨ 提取结果")
result_tabs = gr.Tabs()
with result_tabs:
with gr.Tab("📊 结构化数据"):
json_output = gr.JSON(label="提取结果", show_label=False)
with gr.Tab("📈 统计分析"):
stats_output = gr.Markdown("点击'开始提取'后显示统计信息", elem_id="stats-box")
with gr.Tab("📜 历史记录"):
history_output = gr.Markdown("暂无提取历史")
# Hidden at start; extract_information returns gr.update(visible=True) for it.
file_output = gr.File(label="💾 下载结果文件 (.jsonl)", visible=False)
# Collapsed usage instructions.
with gr.Accordion("ℹ️ 使用说明", open=False):
gr.Markdown("""
### 使用步骤
1. **输入 API 密钥**: 从 Google AI Studio 获取免费 API 密钥
2. **选择模板**: 根据文本类型选择预设模板,或自定义提取规则
3. **输入文本**: 粘贴您的医学报告或病历文本
4. **开始提取**: 点击提取按钮,AI 将自动识别并结构化关键信息
5. **查看结果**: 在右侧查看结构化数据、统计分析和历史记录
### 支持的报告类型
- **影像报告**: CT、MRI、X线、超声等各类影像学检查
- **病理报告**: 组织病理、细胞病理、免疫组化等
- **病历记录**: 入院记录、病程记录、出院小结等
### 提示
- 提供高质量的示例可显著提升提取准确度
- 可同时处理多份报告(用空行分隔)
- 结果可导出为 JSONL 格式供后续分析使用
""")
# "Load template" button: fill the prompt and example editors from the
# template currently selected in the radio group.
load_template_btn.click(
fn=load_template,
inputs=[template_selector],
outputs=[prompt_input, examples_input]
)
# Choosing a sample replaces the contents of the input text box.
sample_selector.change(
fn=load_sample_text,
inputs=[sample_selector],
outputs=[text_input]
)
# "Clear" button: empty the input text box.
clear_btn.click(
fn=lambda: "",
outputs=[text_input]
)
# Run the extraction. extract_information returns five values; file_output is
# listed twice on purpose so it receives the download path (2nd value) and the
# gr.update(visible=True) (5th value). Keep this list in step with that return.
submit_btn.click(
fn=extract_information,
inputs=[api_key_input, prompt_input, examples_input, text_input, template_selector],
outputs=[json_output, file_output, stats_output, history_output, file_output]
)
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
demo.launch()