Spaces:

leonsimon23
/

exactEMR

Sleeping

App Files Files Community

leonsimon23 committed on Nov 3, 2025

Commit

933b2be

verified ·

1 Parent(s): e50f65d

Create app.py

Browse files

Files changed (1) hide show

app.py +369 -0

app.py ADDED Viewed

	@@ -0,0 +1,369 @@

+import gradio as gr
+import langextract as lx
+import json
+import os
+import tempfile
+import textwrap
+from datetime import datetime
+# --- 预设模板库 ---
+REPORT_TEMPLATES = {
+    "影像报告": {
+        "prompt": textwrap.dedent("""\
+            请从影像检查报告中提取以下关键信息：
+            - 检查部位和器官
+            - 病灶的位置、大小、密度/信号特征
+            - 增强扫描表现
+            - 阴性发现（未见异常的部位）
+            提取规则：
+            1. 必须使用报告中的原文，不要改写
+            2. 保留数值的精确度
+            3. 完整提取解剖学描述"""),
+        "example": json.dumps([{
+            "text": "胸部CT平扫显示：右肺上叶尖段见结节影，大小约1.2 x 0.9 cm，边缘毛糙，密度不均。纵隔淋巴结未见明显肿大。",
+            "extractions": [
+                {"extraction_class": "anatomy", "extraction_text": "右肺上叶尖段", "attributes": {"organ": "肺", "side": "右", "lobe": "上叶", "segment": "尖段"}},
+                {"extraction_class": "finding", "extraction_text": "结节影", "attributes": {"type": "结节"}},
+                {"extraction_class": "size", "extraction_text": "1.2 x 0.9 cm", "attributes": {"length": "1.2", "width": "0.9", "unit": "cm"}},
+                {"extraction_class": "feature", "extraction_text": "边缘毛糙", "attributes": {"feature_type": "边缘", "description": "毛糙"}},
+                {"extraction_class": "feature", "extraction_text": "密度不均", "attributes": {"feature_type": "密度", "description": "不均"}},
+                {"extraction_class": "negative_finding", "extraction_text": "纵隔淋巴结未见明显肿大", "attributes": {"location": "纵隔", "structure": "淋巴结"}}
+            ]
+        }], ensure_ascii=False, indent=2)
+    },
+    "病理报告": {
+        "prompt": textwrap.dedent("""\
+            请从病理报告中提取以下关键信息：
+            - 标本来源和类型
+            - 组织学分型
+            - 肿瘤分级
+            - 免疫组化结果
+            - 切缘情况
+            提取规则：
+            1. 完整保留病理诊断术语
+            2. 提取所有阳性和阴性的免疫组化标记
+            3. 保留分级和分期信息"""),
+        "example": json.dumps([{
+            "text": "（右乳腺）浸润性导管癌，组织学II级，肿瘤大小2.5cm。免疫组化：ER(+,90%)，PR(+,70%)，HER-2(-)，Ki-67(30%)。切缘未见癌累及。",
+            "extractions": [
+                {"extraction_class": "specimen", "extraction_text": "右乳腺", "attributes": {"location": "右乳腺"}},
+                {"extraction_class": "diagnosis", "extraction_text": "浸润性导管癌", "attributes": {"type": "癌", "subtype": "浸润性导管癌"}},
+                {"extraction_class": "grade", "extraction_text": "组织学II级", "attributes": {"grading_system": "组织学", "grade": "II"}},
+                {"extraction_class": "size", "extraction_text": "2.5cm", "attributes": {"value": "2.5", "unit": "cm"}},
+                {"extraction_class": "ihc_marker", "extraction_text": "ER(+,90%)", "attributes": {"marker": "ER", "result": "阳性", "percentage": "90"}},
+                {"extraction_class": "ihc_marker", "extraction_text": "PR(+,70%)", "attributes": {"marker": "PR", "result": "阳性", "percentage": "70"}},
+                {"extraction_class": "ihc_marker", "extraction_text": "HER-2(-)", "attributes": {"marker": "HER-2", "result": "阴性"}},
+                {"extraction_class": "ihc_marker", "extraction_text": "Ki-67(30%)", "attributes": {"marker": "Ki-67", "percentage": "30"}},
+                {"extraction_class": "margin", "extraction_text": "切缘未见癌累及", "attributes": {"status": "阴性"}}
+            ]
+        }], ensure_ascii=False, indent=2)
+    },
+    "病历摘要": {
+        "prompt": textwrap.dedent("""\
+            请从病历中提取以下关键信息：
+            - 主诉和现病史要点
+            - 既往史（重要疾病和手术史）
+            - 体格检查阳性体征
+            - 辅助检查异常结果
+            - 诊断和治疗方案
+            提取规则：
+            1. 提取关键时间节点
+            2. 保留症状的完整描述
+            3. 提取数值型指标"""),
+        "example": json.dumps([{
+            "text": "患者主因"反复胸痛3月，加重1周"入院。既往高血压病史10年，2型糖尿病5年。入院查体：BP 150/95mmHg，心率92次/分。心电图示：II、III、aVF导联ST段压低0.1mV。",
+            "extractions": [
+                {"extraction_class": "chief_complaint", "extraction_text": "反复胸痛3月，加重1周", "attributes": {"symptom": "胸痛", "duration": "3月", "change": "加重1周"}},
+                {"extraction_class": "past_history", "extraction_text": "高血压病史10年", "attributes": {"disease": "高血压", "duration": "10年"}},
+                {"extraction_class": "past_history", "extraction_text": "2型糖尿病5年", "attributes": {"disease": "2型糖尿病", "duration": "5年"}},
+                {"extraction_class": "vital_sign", "extraction_text": "BP 150/95mmHg", "attributes": {"type": "血压", "systolic": "150", "diastolic": "95", "unit": "mmHg"}},
+                {"extraction_class": "vital_sign", "extraction_text": "心率92次/分", "attributes": {"type": "心率", "value": "92", "unit": "次/分"}},
+                {"extraction_class": "exam_finding", "extraction_text": "II、III、aVF导联ST段压低0.1mV", "attributes": {"exam_type": "心电图", "leads": ["II", "III", "aVF"], "finding": "ST段压低", "value": "0.1", "unit": "mV"}}
+            ]
+        }], ensure_ascii=False, indent=2)
+    }
+}
+# --- 示例文本库 ---
+SAMPLE_TEXTS = {
+    "影像报告示例": "头颅MRI平扫增强：左侧基底节区见片状异常信号影，T1WI呈低信号，T2WI及FLAIR呈高信号，大小约4.5 x 3.2 x 3.8 cm，周围见大片水肿带。增强扫描后病灶呈不均匀明显强化，周围水肿带未见强化。中线结构轻度右移约0.5cm。双侧侧脑室对称，未见明显扩大。",
+    "病理报告示例": "（胃窦）腺癌，中分化，浸润至肌层。肿瘤大小3.0 x 2.5 cm。免疫组化：CK(+)，CK7(-)，CK20(+)，CDX-2(+)，Her-2(1+)，Ki-67阳性指数约40%。送检淋巴结12枚，见癌转移3枚（3/12）。",
+    "病历示例": "患者，女性，58岁，因"发现左乳腺肿物2月"入院。患者2月前无意中发现左乳腺外上象限肿物，约2cm大小，无疼痛，无乳头溢液。既往体健。查体：左乳外上象限可触及约2.5 x 2.0cm肿物，质硬，边界欠清，活动度差，无压痛。左侧腋窝可触及1枚肿大淋巴结，约1.5cm。辅助检查：乳腺超声示左乳外上象限低回声结节，BI-RADS 4C类。"
+}
+# --- 历史记录管理 ---
+extraction_history = []
+def save_to_history(input_text, result, template_name):
+    """保存提取历史"""
+    history_entry = {
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "template": template_name,
+        "input_preview": input_text[:100] + "..." if len(input_text) > 100 else input_text,
+        "extraction_count": len(result.get("extractions", [])),
+        "full_result": result
+    }
+    extraction_history.insert(0, history_entry)  # 最新的在前
+    if len(extraction_history) > 10:  # 只保留最近10条
+        extraction_history.pop()
+    return format_history_display()
+def format_history_display():
+    """格式化历史记录显示"""
+    if not extraction_history:
+        return "暂无提取历史"
+    history_text = ""
+    for i, entry in enumerate(extraction_history, 1):
+        history_text += f"### 记录 {i} - {entry['timestamp']}\n"
+        history_text += f"**模板**: {entry['template']} | **提取项**: {entry['extraction_count']}项\n"
+        history_text += f"**文本预览**: {entry['input_preview']}\n\n"
+    return history_text
+# --- 统计信息生成 ---
+def generate_statistics(result):
+    """生成提取统计信息"""
+    extractions = result.get("extractions", [])
+    if not extractions:
+        return "暂无统计信息"
+    # 按类别统计
+    class_counts = {}
+    for ext in extractions:
+        cls = ext.get("extraction_class", "未分类")
+        class_counts[cls] = class_counts.get(cls, 0) + 1
+    stats_text = f"### 📊 提取统计\n"
+    stats_text += f"**总提取项数**: {len(extractions)}\n\n"
+    stats_text += "**分类统计**:\n"
+    for cls, count in sorted(class_counts.items(), key=lambda x: x[1], reverse=True):
+        stats_text += f"- {cls}: {count}项\n"
+    return stats_text
+# --- 后端处理函数 ---
+def extract_information(api_key, prompt, examples_json, input_text, template_name):
+    """执行信息提取"""
+    if not api_key:
+        raise gr.Error("⚠️ 请输入您的 Google AI Studio API 密钥")
+    if not input_text.strip():
+        raise gr.Error("⚠️ 请输入待提取的文本内容")
+    try:
+        examples_data = json.loads(examples_json)
+        examples = [
+            lx.data.ExampleData(
+                text=ex['text'],
+                extractions=[lx.data.Extraction(**extr) for extr in ex['extractions']]
+            ) for ex in examples_data
+        ]
+    except (json.JSONDecodeError, KeyError) as e:
+        raise gr.Error(f"❌ 示例JSON格式错误: {e}")
+    try:
+        os.environ['LANGEXTRACT_API_KEY'] = api_key
+        result = lx.extract(
+            text_or_documents=input_text,
+            prompt_description=prompt,
+            examples=examples,
+            model_id="gemini-1.5-flash",
+        )
+        output_dict = {
+            "source_text": result.source_text,
+            "extractions": [ext.to_dict() for ext in result.extractions]
+        }
+        # 创建下载文件
+        with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.jsonl', encoding='utf-8') as tmp_file:
+            lx.io.save_annotated_documents([result], file_path=tmp_file.name)
+            download_path = tmp_file.name
+        # 生成统计信息
+        stats = generate_statistics(output_dict)
+        # 保存到历史
+        history = save_to_history(input_text, output_dict, template_name)
+        return output_dict, download_path, stats, history, gr.update(visible=True)
+    except Exception as e:
+        raise gr.Error(f"❌ 提取失败: {str(e)}")
+def load_template(template_name):
+    """加载预设模板"""
+    if template_name in REPORT_TEMPLATES:
+        template = REPORT_TEMPLATES[template_name]
+        return template["prompt"], template["example"]
+    return "", ""
+def load_sample_text(sample_name):
+    """加载示例文本"""
+    return SAMPLE_TEXTS.get(sample_name, "")
+# --- Gradio interface ---
+# Stylesheet handed to gr.Blocks(css=...). The "#header" and "#stats-box"
+# selectors match the elem_id values given to the title row and to the
+# statistics markdown in the layout below. No component in the visible code
+# sets the "template-btn" class, so that rule currently matches nothing here.
+custom_css = """
+#header {
+    text-align: center;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    padding: 2rem;
+    border-radius: 10px;
+    color: white;
+    margin-bottom: 2rem;
+}
+#header h1 {
+    margin: 0;
+    font-size: 2.5rem;
+    font-weight: 700;
+}
+#header p {
+    margin: 0.5rem 0 0 0;
+    font-size: 1.1rem;
+    opacity: 0.95;
+}
+.template-btn {
+    margin: 0.25rem !important;
+}
+#stats-box {
+    background: #f8f9fa;
+    padding: 1rem;
+    border-radius: 8px;
+    border-left: 4px solid #667eea;
+}
+"""
+# Build the three-column page: configuration | text input | results.
+with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="医学信息智能提取系统") as demo:
+    # Page header
+    with gr.Row(elem_id="header"):
+        gr.Markdown("""
+        # 🏥 医学信息智能提取系统
+        ### 基于 AI 的结构化医学文本分析工具
+        """)
+    with gr.Row():
+        # Left column: configuration
+        with gr.Column(scale=1):
+            gr.Markdown("### 🔐 API 配置")
+            api_key_input = gr.Textbox(
+                label="Google AI Studio API Key",
+                type="password",
+                placeholder="请输入您的 API 密钥...",
+                info="获取密钥: https://aistudio.google.com/app/apikey"
+            )
+            gr.Markdown("### 📋 选择报告类型")
+            template_selector = gr.Radio(
+                choices=list(REPORT_TEMPLATES.keys()),
+                value="影像报告",
+                label="预设模板",
+                info="选择适合您文本类型的模板"
+            )
+            # Selecting a radio option alone changes nothing; the prompt and
+            # examples are only replaced when this button is clicked.
+            load_template_btn = gr.Button("📥 加载模板", variant="secondary", size="sm")
+            gr.Markdown("### ✏️ 自定义提取规则")
+            # Pre-filled with the template matching the radio's default value.
+            prompt_input = gr.Textbox(
+                label="提取指令",
+                value=REPORT_TEMPLATES["影像报告"]["prompt"],
+                lines=8,
+                placeholder="描述您想提取的信息类型和规则..."
+            )
+            with gr.Accordion("🎯 提取示例 (JSON格式)", open=False):
+                examples_input = gr.Code(
+                    value=REPORT_TEMPLATES["影像报告"]["example"],
+                    language="json",
+                    lines=15,
+                    label="示例数据"
+                )
+        # Middle column: text input
+        with gr.Column(scale=1):
+            gr.Markdown("### 📄 输入医学文本")
+            sample_selector = gr.Dropdown(
+                choices=list(SAMPLE_TEXTS.keys()),
+                label="快速加载示例",
+                value=None
+            )
+            text_input = gr.Textbox(
+                label="待提取文本",
+                lines=18,
+                placeholder="请粘贴或输入医学报告、病历等文本...\n\n支持:\n• 影像报告 (CT/MRI/X线等)\n• 病理报告\n• 病历记录\n• 检验报告",
+                max_lines=25
+            )
+            with gr.Row():
+                clear_btn = gr.Button("🗑️ 清空", size="sm")
+                submit_btn = gr.Button("🚀 开始提取", variant="primary", size="lg", scale=2)
+        # Right column: results
+        with gr.Column(scale=1):
+            gr.Markdown("### ✨ 提取结果")
+            result_tabs = gr.Tabs()
+            with result_tabs:
+                with gr.Tab("📊 结构化数据"):
+                    json_output = gr.JSON(label="提取结果", show_label=False)
+                with gr.Tab("📈 统计分析"):
+                    stats_output = gr.Markdown("点击'开始提取'后显示统计信息", elem_id="stats-box")
+                with gr.Tab("📜 历史记录"):
+                    history_output = gr.Markdown("暂无提取历史")
+            # Hidden until an extraction succeeds and returns a visibility update.
+            file_output = gr.File(label="💾 下载结果文件 (.jsonl)", visible=False)
+    # Footer: usage instructions
+    # NOTE(review): the tips below say several reports can be processed at
+    # once when separated by blank lines, but extract_information passes the
+    # whole textbox content to the extractor as a single text -- confirm.
+    with gr.Accordion("ℹ️ 使用说明", open=False):
+        gr.Markdown("""
+        ### 使用步骤
+        1. **输入 API 密钥**: 从 Google AI Studio 获取免费 API 密钥
+        2. **选择模板**: 根据文本类型选择预设模板，或自定义提取规则
+        3. **输入文本**: 粘贴您的医学报告或病历文本
+        4. **开始提取**: 点击提取按钮，AI 将自动识别并结构化关键信息
+        5. **查看结果**: 在右侧查看结构化数据、统计分析和历史记录
+        ### 支持的报告类型
+        - **影像报告**: CT、MRI、X线、超声等各类影像学检查
+        - **病理报告**: 组织病理、细胞病理、免疫组化等
+        - **病历记录**: 入院记录、病程记录、出院小结等
+        ### 提示
+        - 提供高质量的示例可显著提升提取准确度
+        - 可同时处理多份报告（用空行分隔）
+        - 结果可导出为 JSONL 格式供后续分析使用
+        """)
+    # Event wiring
+    # Copy the selected template's prompt and example JSON into the editors.
+    load_template_btn.click(
+        fn=load_template,
+        inputs=[template_selector],
+        outputs=[prompt_input, examples_input]
+    )
+    # Choosing a sample replaces the text box content with that sample.
+    sample_selector.change(
+        fn=load_sample_text,
+        inputs=[sample_selector],
+        outputs=[text_input]
+    )
+    # Clear only the input text; results and history are left untouched.
+    clear_btn.click(
+        fn=lambda: "",
+        outputs=[text_input]
+    )
+    # extract_information returns five values, one per entry in `outputs`.
+    # NOTE(review): file_output appears twice (once for the file path, once
+    # for the visibility update); a single combined update would be less
+    # ambiguous but requires changing the function's return value as well.
+    submit_btn.click(
+        fn=extract_information,
+        inputs=[api_key_input, prompt_input, examples_input, text_input, template_selector],
+        outputs=[json_output, file_output, stats_output, history_output, file_output]
+    )
+# Start the Gradio server only when run as a script.
+if __name__ == "__main__":
+    demo.launch()