# Hugging Face Space: leonsimon23/exactEMR (status: Sleeping)
# File size: 18,160 Bytes

import json
import os
import pathlib
import shutil
import tempfile
import textwrap
import traceback
from collections import Counter
from datetime import datetime

import gradio as gr
import langextract as lx

# --- Preset template library ---
# Maps a report-type label (the keys double as the choices of the template
# selector in the UI) to a dict with two string fields:
#   "prompt"  - extraction instructions, a dedented multi-line text
#   "example" - a JSON-encoded list holding one few-shot example: its source
#               "text" plus the "extractions" expected from that text
REPORT_TEMPLATES = {
    # Imaging report
    "影像报告": {
        "prompt": textwrap.dedent("""\
            请从影像检查报告中提取以下关键信息：
            - 检查部位和器官
            - 病灶的位置、大小、密度/信号特征
            - 增强扫描表现
            - 阴性发现（未见异常的部位）
            
            提取规则：
            1. 必须使用报告中的原文，不要改写
            2. 保留数值的精确度
            3. 完整提取解剖学描述"""),
        "example": json.dumps([{
            "text": "胸部CT平扫显示：右肺上叶尖段见结节影，大小约1.2 x 0.9 cm，边缘毛糙，密度不均。纵隔淋巴结未见明显肿大。",
            "extractions": [
                {"extraction_class": "anatomy", "extraction_text": "右肺上叶尖段", "attributes": {"organ": "肺", "side": "右", "lobe": "上叶", "segment": "尖段"}},
                {"extraction_class": "finding", "extraction_text": "结节影", "attributes": {"type": "结节"}},
                {"extraction_class": "size", "extraction_text": "1.2 x 0.9 cm", "attributes": {"length": "1.2", "width": "0.9", "unit": "cm"}},
                {"extraction_class": "feature", "extraction_text": "边缘毛糙", "attributes": {"feature_type": "边缘", "description": "毛糙"}},
                {"extraction_class": "feature", "extraction_text": "密度不均", "attributes": {"feature_type": "密度", "description": "不均"}},
                {"extraction_class": "negative_finding", "extraction_text": "纵隔淋巴结未见明显肿大", "attributes": {"location": "纵隔", "structure": "淋巴结"}}
            ]
        }], ensure_ascii=False, indent=2)
    },
    # Pathology report
    "病理报告": {
        "prompt": textwrap.dedent("""\
            请从病理报告中提取以下关键信息：
            - 标本来源和类型
            - 组织学分型
            - 肿瘤分级
            - 免疫组化结果
            - 切缘情况
            
            提取规则：
            1. 完整保留病理诊断术语
            2. 提取所有阳性和阴性的免疫组化标记
            3. 保留分级和分期信息"""),
        "example": json.dumps([{
            "text": "（右乳腺）浸润性导管癌，组织学II级，肿瘤大小2.5cm。免疫组化：ER(+,90%)，PR(+,70%)，HER-2(-)，Ki-67(30%)。切缘未见癌累及。",
            "extractions": [
                {"extraction_class": "specimen", "extraction_text": "右乳腺", "attributes": {"location": "右乳腺"}},
                {"extraction_class": "diagnosis", "extraction_text": "浸润性导管癌", "attributes": {"type": "癌", "subtype": "浸润性导管癌"}},
                {"extraction_class": "grade", "extraction_text": "组织学II级", "attributes": {"grading_system": "组织学", "grade": "II"}},
                {"extraction_class": "size", "extraction_text": "2.5cm", "attributes": {"value": "2.5", "unit": "cm"}},
                {"extraction_class": "ihc_marker", "extraction_text": "ER(+,90%)", "attributes": {"marker": "ER", "result": "阳性", "percentage": "90"}},
                {"extraction_class": "ihc_marker", "extraction_text": "PR(+,70%)", "attributes": {"marker": "PR", "result": "阳性", "percentage": "70"}},
                {"extraction_class": "ihc_marker", "extraction_text": "HER-2(-)", "attributes": {"marker": "HER-2", "result": "阴性"}},
                {"extraction_class": "ihc_marker", "extraction_text": "Ki-67(30%)", "attributes": {"marker": "Ki-67", "percentage": "30"}},
                {"extraction_class": "margin", "extraction_text": "切缘未见癌累及", "attributes": {"status": "阴性"}}
            ]
        }], ensure_ascii=False, indent=2)
    },
    # Medical record summary
    "病历摘要": {
        "prompt": textwrap.dedent("""\
            请从病历中提取以下关键信息：
            - 主诉和现病史要点
            - 既往史（重要疾病和手术史）
            - 体格检查阳性体征
            - 辅助检查异常结果
            - 诊断和治疗方案
            
            提取规则：
            1. 提取关键时间节点
            2. 保留症状的完整描述
            3. 提取数值型指标"""),
        "example": json.dumps([{
            "text": "患者因反复胸痛3月加重1周入院。既往有高血压病史10年。查体：BP 150/95mmHg，心率92次/分。",
            "extractions": [
                {"extraction_class": "chief_complaint", "extraction_text": "反复胸痛3月加重1周", "attributes": {"symptom": "胸痛", "duration": "3月", "change": "加重1周"}},
                {"extraction_class": "past_history", "extraction_text": "高血压病史10年", "attributes": {"disease": "高血压", "duration": "10年"}},
                {"extraction_class": "vital_sign", "extraction_text": "BP 150/95mmHg", "attributes": {"type": "血压", "systolic": "150", "diastolic": "95", "unit": "mmHg"}},
                {"extraction_class": "vital_sign", "extraction_text": "心率92次/分", "attributes": {"type": "心率", "value": "92", "unit": "次/分"}}
            ]
        }], ensure_ascii=False, indent=2)
    }
}

# --- Sample text library ---
# Maps a label shown in the "quick load" dropdown to a ready-made input text.
# The three entries are, in order: an imaging report, a pathology report and
# a medical record.
SAMPLE_TEXTS = {
    "影像报告示例": "头颅MRI平扫增强：左侧基底节区见片状异常信号影，T1WI呈低信号，T2WI及FLAIR呈高信号，大小约4.5 x 3.2 x 3.8 cm，周围见大片水肿带。增强扫描后病灶呈不均匀明显强化，周围水肿带未见强化。中线结构轻度右移约0.5cm。双侧侧脑室对称，未见明显扩大。",
    "病理报告示例": "（胃窦）腺癌，中分化，浸润至肌层。肿瘤大小3.0 x 2.5 cm。免疫组化：CK(+)，CK7(-)，CK20(+)，CDX-2(+)，Her-2(1+)，Ki-67阳性指数约40%。送检淋巴结12枚，见癌转移3枚（3/12）。",
    "病历示例": "患者女性58岁，因发现左乳腺肿物2月入院。患者2月前无意中发现左乳腺外上象限肿物，约2cm大小，无疼痛，无乳头溢液。既往体健。查体：左乳外上象限可触及约2.5 x 2.0cm肿物，质硬，边界欠清，活动度差，无压痛。左侧腋窝可触及1枚肿大淋巴结，约1.5cm。辅助检查：乳腺超声示左乳外上象限低回声结节，BI-RADS 4C类。"
}

# --- History management ---
# Newest-first list of extraction summaries. It is a module-level object, so
# every call in this process reads and mutates the same list.
# NOTE(review): entries keep the full result of each run; if the app serves
# several users they would all see one shared history — confirm this is intended.
extraction_history = []

def save_to_history(input_text, result, template_name):
    """Record one extraction run and return the refreshed history markdown.

    The new entry is placed at the front of ``extraction_history``. When the
    list grows past 10 entries the last (oldest) one is removed. The stored
    preview is the first 100 characters of ``input_text`` followed by "..."
    when the text is longer than that, otherwise the text itself.
    """
    if len(input_text) > 100:
        preview = input_text[:100] + "..."
    else:
        preview = input_text

    extraction_history.insert(0, {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "template": template_name,
        "input_preview": preview,
        "extraction_count": len(result.get("extractions", [])),
        "full_result": result,
    })

    if len(extraction_history) > 10:
        extraction_history.pop()

    return format_history_display()

def format_history_display():
    """Render ``extraction_history`` as markdown, one section per entry.

    Returns a placeholder message when the history is empty. Entries are
    numbered from 1 in list order.
    """
    if not extraction_history:
        return "暂无提取历史"

    sections = [
        f"### 记录 {position} - {record['timestamp']}\n"
        f"**模板**: {record['template']} | **提取项**: {record['extraction_count']}项\n"
        f"**文本预览**: {record['input_preview']}\n\n"
        for position, record in enumerate(extraction_history, start=1)
    ]
    return "".join(sections)

# --- Statistics generation ---
def generate_statistics(result):
    """Summarise an extraction result as a markdown statistics block.

    Args:
        result: Dict with an optional "extractions" list. Each item is a dict
            whose "extraction_class" names the category it belongs to.

    Returns:
        Markdown text with the total number of extractions followed by the
        per-class counts in descending order (classes with equal counts keep
        first-seen order), or a placeholder message when there is nothing to
        count.
    """
    extractions = result.get("extractions", [])
    if not extractions:
        return "暂无统计信息"

    # A missing, None or empty class is grouped under the "uncategorised"
    # label so the list never shows a bullet with a blank name.
    class_counts = Counter(
        ext.get("extraction_class") or "未分类" for ext in extractions
    )

    lines = [
        "### 📊 提取统计\n",
        f"**总提取项数**: {len(extractions)}\n\n",
        "**分类统计**:\n",
    ]
    lines.extend(
        f"- {cls}: {count}项\n" for cls, count in class_counts.most_common()
    )
    return "".join(lines)

# --- Backend processing ---
def _parse_examples(examples_json):
    """Convert the few-shot examples JSON text into ``lx.data.ExampleData`` objects."""
    try:
        examples_data = json.loads(examples_json)
        return [
            lx.data.ExampleData(
                text=ex['text'],
                extractions=[lx.data.Extraction(**extr) for extr in ex['extractions']],
            )
            for ex in examples_data
        ]
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        # TypeError covers valid JSON of the wrong shape (e.g. an object where
        # a list is expected, or unknown Extraction fields) and a None input.
        raise gr.Error(f"❌ 示例JSON格式错误: {e}") from e


def _char_span(extraction):
    """Return the extraction's source offsets as a JSON-friendly value, or None."""
    # NOTE(review): langextract is expected to expose offsets as
    # `char_interval` with `start_pos` / `end_pos` — confirm against the
    # installed version. `char_span` is kept as a fallback lookup.
    interval = getattr(extraction, 'char_interval', None)
    if interval is None:
        return getattr(extraction, 'char_span', None)
    return {
        "start": getattr(interval, 'start_pos', None),
        "end": getattr(interval, 'end_pos', None),
    }


def _document_to_dict(document, fallback_text):
    """Flatten an annotated document into the plain dict shown in the JSON panel."""
    extractions = [
        {
            "extraction_class": getattr(ext, 'extraction_class', ""),
            "extraction_text": getattr(ext, 'extraction_text', ""),
            "char_span": _char_span(ext),
            "attributes": getattr(ext, 'attributes', {}),
        }
        # `extractions` may be absent or None; treat both as "nothing found".
        for ext in getattr(document, 'extractions', None) or []
    ]
    return {
        "source_text": getattr(document, 'text', fallback_text),
        "extractions": extractions,
    }


def _write_download_file(documents, fallback_dict):
    """Write the results to a persistent temp file and return its path."""
    output_name = 'output.jsonl'

    # Reserve a path that outlives this call so Gradio can serve it later.
    with tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.jsonl', encoding='utf-8'
    ) as placeholder:
        download_path = placeholder.name

    try:
        with tempfile.TemporaryDirectory() as tmp_dir:
            out_dir = pathlib.Path(tmp_dir)
            # Name the output explicitly: the file is looked up by this name
            # right below, so it must not depend on the library's default.
            lx.io.save_annotated_documents(
                documents, output_dir=out_dir, output_name=output_name
            )
            produced = out_dir / output_name
            if produced.exists():
                shutil.copyfile(produced, download_path)
            else:
                # Nothing was written by the library: save our own dict instead.
                with open(download_path, 'w', encoding='utf-8') as dest_file:
                    json.dump(fallback_dict, dest_file, ensure_ascii=False, indent=2)
    except Exception:
        # Do not leave an orphaned temp file behind when saving fails.
        try:
            os.remove(download_path)
        except OSError:
            pass
        raise

    return download_path


def extract_information(api_key, prompt, examples_json, input_text, template_name):
    """Run the extraction and prepare every value the UI displays.

    Args:
        api_key: Google AI Studio API key. It is handed to ``lx.extract``
            directly instead of being written to ``os.environ``, so one
            request's key is never visible to another request in the process.
        prompt: Extraction instructions passed as ``prompt_description``.
        examples_json: JSON text describing the few-shot examples (a list of
            objects with "text" and "extractions").
        input_text: The text to extract from.
        template_name: Label of the selected template, stored in the history.

    Returns:
        A 5-tuple: the result dict, the path of the downloadable file, the
        statistics markdown, the history markdown, and a ``gr.update`` that
        makes the file component visible.

    Raises:
        gr.Error: When the key or text is missing, the examples JSON is
            malformed, or the extraction / saving step fails.
    """
    api_key = (api_key or "").strip()
    if not api_key:
        raise gr.Error("⚠️ 请输入您的 Google AI Studio API 密钥")
    if not input_text or not input_text.strip():
        raise gr.Error("⚠️ 请输入待提取的文本内容")

    examples = _parse_examples(examples_json)

    try:
        results = lx.extract(
            text_or_documents=input_text,
            prompt_description=prompt,
            examples=examples,
            model_id="gemini-2.5-flash",
            api_key=api_key,
        )

        documents = results if isinstance(results, list) else [results]
        # Only the first document is shown in the UI; all of them are saved.
        primary = documents[0] if documents else results

        output_dict = _document_to_dict(primary, input_text)
        download_path = _write_download_file(documents, output_dict)

        stats = generate_statistics(output_dict)
        history = save_to_history(input_text, output_dict, template_name)

        return output_dict, download_path, stats, history, gr.update(visible=True)

    except Exception as e:
        # Top-level boundary for the UI: log the full traceback server-side
        # and surface a short message to the user.
        print(f"详细错误信息:\n{traceback.format_exc()}")
        raise gr.Error(f"❌ 提取失败: {str(e)}") from e
        

def load_template(template_name):
    """Look up a preset template and return its ``(prompt, example)`` pair.

    A name that is not a key of ``REPORT_TEMPLATES`` yields two empty strings.
    """
    if template_name not in REPORT_TEMPLATES:
        return "", ""
    chosen = REPORT_TEMPLATES[template_name]
    return chosen["prompt"], chosen["example"]

def load_sample_text(sample_name):
    """Return the sample text stored under ``sample_name``, or "" if there is none."""
    if sample_name in SAMPLE_TEXTS:
        return SAMPLE_TEXTS[sample_name]
    return ""

# --- Gradio interface ---
# Extra CSS handed to gr.Blocks(css=...). The `#header` and `#stats-box`
# selectors match the elem_id values given to the header row and the
# statistics markdown in the layout; `.template-btn` is a class selector.
custom_css = """
#header {
    text-align: center;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 2rem;
    border-radius: 10px;
    color: white;
    margin-bottom: 2rem;
}
#header h1 {
    margin: 0;
    font-size: 2.5rem;
    font-weight: 700;
}
#header p {
    margin: 0.5rem 0 0 0;
    font-size: 1.1rem;
    opacity: 0.95;
}
.template-btn {
    margin: 0.25rem !important;
}
#stats-box {
    background: #f8f9fa;
    padding: 1rem;
    border-radius: 8px;
    border-left: 4px solid #667eea;
}
"""

# Build the page: a header, three equal-width columns (configuration, input
# text, results), a collapsible help section, and the event wiring.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="医学信息智能提取系统") as demo:
    
    # Page header, styled through the "#header" CSS rules.
    with gr.Row(elem_id="header"):
        gr.Markdown("""
        # 🏥 医学信息智能提取系统
        ### 基于 AI 的结构化医学文本分析工具
        """)
    
    with gr.Row():
        # Column 1: API key, template choice, prompt and few-shot examples.
        with gr.Column(scale=1):
            gr.Markdown("### 🔐 API 配置")
            api_key_input = gr.Textbox(
                label="Google AI Studio API Key",
                type="password",
                placeholder="请输入您的 API 密钥...",
                info="获取密钥: https://aistudio.google.com/app/apikey"
            )
            
            gr.Markdown("### 📋 选择报告类型")
            template_selector = gr.Radio(
                choices=list(REPORT_TEMPLATES.keys()),
                value="影像报告",
                label="预设模板",
                info="选择适合您文本类型的模板"
            )
            
            load_template_btn = gr.Button("📥 加载模板", variant="secondary", size="sm")
            
            gr.Markdown("### ✏️ 自定义提取规则")
            # Pre-filled with the prompt of the default template selected above.
            prompt_input = gr.Textbox(
                label="提取指令",
                value=REPORT_TEMPLATES["影像报告"]["prompt"],
                lines=8,
                placeholder="描述您想提取的信息类型和规则..."
            )
            
            with gr.Accordion("🎯 提取示例 (JSON格式)", open=False):
                # Pre-filled with the example JSON of the default template.
                examples_input = gr.Code(
                    value=REPORT_TEMPLATES["影像报告"]["example"],
                    language="json",
                    lines=15,
                    label="示例数据"
                )
        
        # Column 2: the text to extract from, plus sample loader and actions.
        with gr.Column(scale=1):
            gr.Markdown("### 📄 输入医学文本")
            
            sample_selector = gr.Dropdown(
                choices=list(SAMPLE_TEXTS.keys()),
                label="快速加载示例",
                value=None
            )
            
            text_input = gr.Textbox(
                label="待提取文本",
                lines=18,
                placeholder="请粘贴或输入医学报告、病历等文本...\n\n支持:\n• 影像报告 (CT/MRI/X线等)\n• 病理报告\n• 病历记录\n• 检验报告",
                max_lines=25
            )
            
            with gr.Row():
                clear_btn = gr.Button("🗑️ 清空", size="sm")
                submit_btn = gr.Button("🚀 开始提取", variant="primary", size="lg", scale=2)
        
        # Column 3: results shown as JSON, statistics and history tabs.
        with gr.Column(scale=1):
            gr.Markdown("### ✨ 提取结果")
            
            result_tabs = gr.Tabs()
            with result_tabs:
                with gr.Tab("📊 结构化数据"):
                    json_output = gr.JSON(label="提取结果", show_label=False)
                
                with gr.Tab("📈 统计分析"):
                    stats_output = gr.Markdown("点击'开始提取'后显示统计信息", elem_id="stats-box")
                
                with gr.Tab("📜 历史记录"):
                    history_output = gr.Markdown("暂无提取历史")
            
            # Hidden until an extraction returns an update that makes it visible.
            file_output = gr.File(label="💾 下载结果文件 (.jsonl)", visible=False)
    
    # NOTE(review): the help text below says several reports separated by
    # blank lines can be processed at once; extract_information passes the
    # whole text to the library as a single input — confirm the claim holds.
    with gr.Accordion("ℹ️ 使用说明", open=False):
        gr.Markdown("""
        ### 使用步骤
        1. **输入 API 密钥**: 从 Google AI Studio 获取免费 API 密钥
        2. **选择模板**: 根据文本类型选择预设模板，或自定义提取规则
        3. **输入文本**: 粘贴您的医学报告或病历文本
        4. **开始提取**: 点击提取按钮，AI 将自动识别并结构化关键信息
        5. **查看结果**: 在右侧查看结构化数据、统计分析和历史记录
        
        ### 支持的报告类型
        - **影像报告**: CT、MRI、X线、超声等各类影像学检查
        - **病理报告**: 组织病理、细胞病理、免疫组化等
        - **病历记录**: 入院记录、病程记录、出院小结等
        
        ### 提示
        - 提供高质量的示例可显著提升提取准确度
        - 可同时处理多份报告（用空行分隔）
        - 结果可导出为 JSONL 格式供后续分析使用
        """)
    
    # Copy the selected template's prompt and example into the editors.
    load_template_btn.click(
        fn=load_template,
        inputs=[template_selector],
        outputs=[prompt_input, examples_input]
    )
    
    # Fill the input box whenever a sample is picked from the dropdown.
    sample_selector.change(
        fn=load_sample_text,
        inputs=[sample_selector],
        outputs=[text_input]
    )
    
    # Reset the input box to an empty string.
    clear_btn.click(
        fn=lambda: "",
        outputs=[text_input]
    )
    
    # Run the extraction. The five outputs line up with the five values that
    # extract_information returns.
    # NOTE(review): file_output appears twice — once for the download path and
    # once for the trailing gr.update(visible=True); confirm the installed
    # Gradio version accepts a repeated output component.
    submit_btn.click(
        fn=extract_information,
        inputs=[api_key_input, prompt_input, examples_input, text_input, template_selector],
        outputs=[json_output, file_output, stats_output, history_output, file_output]
    )

# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()