"""Gradio front end that extracts structured data from medical text via langextract.

The module defines preset prompt/example templates for three report types,
a small in-memory extraction history, a statistics formatter, and the Gradio
``Blocks`` UI that wires them to ``langextract.extract``.
"""

import json
import os
import tempfile
import textwrap
import traceback
from collections import Counter
from datetime import datetime
from pathlib import Path

import gradio as gr
import langextract as lx

# Model used for every extraction request.
MODEL_ID = "gemini-2.5-flash"
# Maximum number of entries kept in ``extraction_history``.
MAX_HISTORY_ENTRIES = 10
# Number of input characters shown in a history entry before truncation.
PREVIEW_LENGTH = 100
# File name requested from langextract when saving annotated documents.
_JSONL_NAME = "output.jsonl"

# --- Preset template library ---
# Each template maps a report type to a prompt and a JSON string of few-shot
# examples (the JSON is shown in, and editable from, the UI).
REPORT_TEMPLATES = {
    "影像报告": {
        "prompt": textwrap.dedent("""\
            请从影像检查报告中提取以下关键信息:
            - 检查部位和器官
            - 病灶的位置、大小、密度/信号特征
            - 增强扫描表现
            - 阴性发现(未见异常的部位)

            提取规则:
            1. 必须使用报告中的原文,不要改写
            2. 保留数值的精确度
            3. 完整提取解剖学描述"""),
        "example": json.dumps([{
            "text": "胸部CT平扫显示:右肺上叶尖段见结节影,大小约1.2 x 0.9 cm,边缘毛糙,密度不均。纵隔淋巴结未见明显肿大。",
            "extractions": [
                {"extraction_class": "anatomy", "extraction_text": "右肺上叶尖段",
                 "attributes": {"organ": "肺", "side": "右", "lobe": "上叶", "segment": "尖段"}},
                {"extraction_class": "finding", "extraction_text": "结节影",
                 "attributes": {"type": "结节"}},
                {"extraction_class": "size", "extraction_text": "1.2 x 0.9 cm",
                 "attributes": {"length": "1.2", "width": "0.9", "unit": "cm"}},
                {"extraction_class": "feature", "extraction_text": "边缘毛糙",
                 "attributes": {"feature_type": "边缘", "description": "毛糙"}},
                {"extraction_class": "feature", "extraction_text": "密度不均",
                 "attributes": {"feature_type": "密度", "description": "不均"}},
                {"extraction_class": "negative_finding", "extraction_text": "纵隔淋巴结未见明显肿大",
                 "attributes": {"location": "纵隔", "structure": "淋巴结"}},
            ],
        }], ensure_ascii=False, indent=2),
    },
    "病理报告": {
        "prompt": textwrap.dedent("""\
            请从病理报告中提取以下关键信息:
            - 标本来源和类型
            - 组织学分型
            - 肿瘤分级
            - 免疫组化结果
            - 切缘情况

            提取规则:
            1. 完整保留病理诊断术语
            2. 提取所有阳性和阴性的免疫组化标记
            3. 保留分级和分期信息"""),
        "example": json.dumps([{
            "text": "(右乳腺)浸润性导管癌,组织学II级,肿瘤大小2.5cm。免疫组化:ER(+,90%),PR(+,70%),HER-2(-),Ki-67(30%)。切缘未见癌累及。",
            "extractions": [
                {"extraction_class": "specimen", "extraction_text": "右乳腺",
                 "attributes": {"location": "右乳腺"}},
                {"extraction_class": "diagnosis", "extraction_text": "浸润性导管癌",
                 "attributes": {"type": "癌", "subtype": "浸润性导管癌"}},
                {"extraction_class": "grade", "extraction_text": "组织学II级",
                 "attributes": {"grading_system": "组织学", "grade": "II"}},
                {"extraction_class": "size", "extraction_text": "2.5cm",
                 "attributes": {"value": "2.5", "unit": "cm"}},
                {"extraction_class": "ihc_marker", "extraction_text": "ER(+,90%)",
                 "attributes": {"marker": "ER", "result": "阳性", "percentage": "90"}},
                {"extraction_class": "ihc_marker", "extraction_text": "PR(+,70%)",
                 "attributes": {"marker": "PR", "result": "阳性", "percentage": "70"}},
                {"extraction_class": "ihc_marker", "extraction_text": "HER-2(-)",
                 "attributes": {"marker": "HER-2", "result": "阴性"}},
                {"extraction_class": "ihc_marker", "extraction_text": "Ki-67(30%)",
                 "attributes": {"marker": "Ki-67", "percentage": "30"}},
                {"extraction_class": "margin", "extraction_text": "切缘未见癌累及",
                 "attributes": {"status": "阴性"}},
            ],
        }], ensure_ascii=False, indent=2),
    },
    "病历摘要": {
        "prompt": textwrap.dedent("""\
            请从病历中提取以下关键信息:
            - 主诉和现病史要点
            - 既往史(重要疾病和手术史)
            - 体格检查阳性体征
            - 辅助检查异常结果
            - 诊断和治疗方案

            提取规则:
            1. 提取关键时间节点
            2. 保留症状的完整描述
            3. 提取数值型指标"""),
        "example": json.dumps([{
            "text": "患者因反复胸痛3月加重1周入院。既往有高血压病史10年。查体:BP 150/95mmHg,心率92次/分。",
            "extractions": [
                {"extraction_class": "chief_complaint", "extraction_text": "反复胸痛3月加重1周",
                 "attributes": {"symptom": "胸痛", "duration": "3月", "change": "加重1周"}},
                {"extraction_class": "past_history", "extraction_text": "高血压病史10年",
                 "attributes": {"disease": "高血压", "duration": "10年"}},
                {"extraction_class": "vital_sign", "extraction_text": "BP 150/95mmHg",
                 "attributes": {"type": "血压", "systolic": "150", "diastolic": "95", "unit": "mmHg"}},
                {"extraction_class": "vital_sign", "extraction_text": "心率92次/分",
                 "attributes": {"type": "心率", "value": "92", "unit": "次/分"}},
            ],
        }], ensure_ascii=False, indent=2),
    },
}

# --- Sample text library ---
SAMPLE_TEXTS = {
    "影像报告示例": "头颅MRI平扫增强:左侧基底节区见片状异常信号影,T1WI呈低信号,T2WI及FLAIR呈高信号,大小约4.5 x 3.2 x 3.8 cm,周围见大片水肿带。增强扫描后病灶呈不均匀明显强化,周围水肿带未见强化。中线结构轻度右移约0.5cm。双侧侧脑室对称,未见明显扩大。",
    "病理报告示例": "(胃窦)腺癌,中分化,浸润至肌层。肿瘤大小3.0 x 2.5 cm。免疫组化:CK(+),CK7(-),CK20(+),CDX-2(+),Her-2(1+),Ki-67阳性指数约40%。送检淋巴结12枚,见癌转移3枚(3/12)。",
    "病历示例": "患者女性58岁,因发现左乳腺肿物2月入院。患者2月前无意中发现左乳腺外上象限肿物,约2cm大小,无疼痛,无乳头溢液。既往体健。查体:左乳外上象限可触及约2.5 x 2.0cm肿物,质硬,边界欠清,活动度差,无压痛。左侧腋窝可触及1枚肿大淋巴结,约1.5cm。辅助检查:乳腺超声示左乳外上象限低回声结节,BI-RADS 4C类。",
}

# --- History management ---
# Newest entry first. NOTE(review): this is module-level state, so it appears
# to be shared by every session served by this process - confirm that is
# acceptable for medical text before deploying to multiple users.
extraction_history = []


def save_to_history(input_text, result, template_name):
    """Record one extraction at the front of the history and return its rendering.

    Args:
        input_text: The text that was submitted for extraction.
        result: Dict with an ``"extractions"`` list (missing key counts as empty).
        template_name: Name of the template selected in the UI.

    Returns:
        The Markdown produced by ``format_history_display`` after the insert.
    """
    if len(input_text) > PREVIEW_LENGTH:
        preview = input_text[:PREVIEW_LENGTH] + "..."
    else:
        preview = input_text

    extraction_history.insert(0, {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "template": template_name,
        "input_preview": preview,
        "extraction_count": len(result.get("extractions", [])),
        "full_result": result,
    })
    # Drop everything beyond the cap (oldest entries live at the tail).
    del extraction_history[MAX_HISTORY_ENTRIES:]
    return format_history_display()


def format_history_display():
    """Render ``extraction_history`` as Markdown, or a placeholder when it is empty."""
    if not extraction_history:
        return "暂无提取历史"

    return "".join(
        f"### 记录 {index} - {entry['timestamp']}\n"
        f"**模板**: {entry['template']} | **提取项**: {entry['extraction_count']}项\n"
        f"**文本预览**: {entry['input_preview']}\n\n"
        for index, entry in enumerate(extraction_history, 1)
    )


# --- Statistics ---
def generate_statistics(result):
    """Summarise an extraction result as Markdown.

    Args:
        result: Dict with an ``"extractions"`` list of dicts; each dict may
            carry an ``"extraction_class"`` key.

    Returns:
        Markdown with the total count and a per-class breakdown sorted by
        descending count, or a placeholder string when nothing was extracted.
    """
    extractions = result.get("extractions", [])
    if not extractions:
        return "暂无统计信息"

    class_counts = Counter(
        ext.get("extraction_class", "未分类") for ext in extractions
    )
    parts = [
        "### 📊 提取统计\n",
        f"**总提取项数**: {len(extractions)}\n\n",
        "**分类统计**:\n",
    ]
    # most_common() sorts by count descending and keeps first-seen order on ties.
    parts.extend(f"- {cls}: {count}项\n" for cls, count in class_counts.most_common())
    return "".join(parts)


# --- Backend processing ---
def _parse_examples(examples_json):
    """Turn the examples JSON text from the UI into ``lx.data.ExampleData`` objects.

    Raises:
        gr.Error: If the text is not valid JSON or does not have the expected
            list-of-{"text", "extractions"} structure.
    """
    try:
        examples_data = json.loads(examples_json)
        return [
            lx.data.ExampleData(
                text=example["text"],
                extractions=[
                    lx.data.Extraction(**fields) for fields in example["extractions"]
                ],
            )
            for example in examples_data
        ]
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        # TypeError covers a None/non-string payload, a non-list top level,
        # and unexpected keyword arguments for Extraction.
        raise gr.Error(f"❌ 示例JSON格式错误: {e}") from e


def _extraction_to_dict(extraction):
    """Convert one langextract extraction object into a JSON-serialisable dict."""
    # The span is looked up under "char_interval" first and "char_span" as a
    # fallback; it is flattened to plain ints so the result can be dumped as JSON.
    interval = getattr(extraction, "char_interval", None)
    if interval is None:
        interval = getattr(extraction, "char_span", None)
    if interval is None:
        span = None
    else:
        span = {
            "start_pos": getattr(interval, "start_pos", None),
            "end_pos": getattr(interval, "end_pos", None),
        }

    return {
        "extraction_class": getattr(extraction, "extraction_class", ""),
        "extraction_text": getattr(extraction, "extraction_text", ""),
        "char_span": span,
        "attributes": getattr(extraction, "attributes", None) or {},
    }


def _write_download_file(documents, fallback_record):
    """Write the downloadable ``.jsonl`` file and return its path.

    langextract is asked to save ``documents`` into a scratch directory under
    an explicit file name; if that file is not produced, ``fallback_record``
    is written as a single JSON line instead. The persistent file is created
    only once its content is known, so a failed save leaves nothing behind.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        scratch = Path(tmp_dir)
        lx.io.save_annotated_documents(
            iter(documents), output_dir=scratch, output_name=_JSONL_NAME
        )
        produced = scratch / _JSONL_NAME
        if produced.is_file():
            payload = produced.read_text(encoding="utf-8")
        else:
            # JSONL means one compact JSON object per line.
            payload = json.dumps(fallback_record, ensure_ascii=False) + "\n"

    # delete=False: Gradio serves the file after this function has returned.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".jsonl", encoding="utf-8"
    ) as download_file:
        download_file.write(payload)
        return download_file.name


def extract_information(api_key, prompt, examples_json, input_text, template_name):
    """Run a langextract extraction and prepare every UI output.

    Args:
        api_key: Google AI Studio API key entered by the user.
        prompt: Extraction instructions passed as ``prompt_description``.
        examples_json: JSON text describing the few-shot examples.
        input_text: The medical text to extract from.
        template_name: Selected template name (stored in the history only).

    Returns:
        A 5-tuple ``(output_dict, download_path, stats_markdown,
        history_markdown, visibility_update)`` matching the ``outputs`` list
        of the submit button.

    Raises:
        gr.Error: On missing API key or input text, malformed examples, or
            any failure during extraction or file writing.
    """
    if not api_key or not api_key.strip():
        raise gr.Error("⚠️ 请输入您的 Google AI Studio API 密钥")
    if not input_text or not input_text.strip():
        raise gr.Error("⚠️ 请输入待提取的文本内容")

    examples = _parse_examples(examples_json)

    try:
        # The key is passed per call instead of through os.environ, which is
        # process-wide and would be overwritten by concurrent requests.
        results = lx.extract(
            text_or_documents=input_text,
            prompt_description=prompt,
            examples=examples,
            model_id=MODEL_ID,
            api_key=api_key.strip(),
        )

        documents = results if isinstance(results, list) else [results]
        # For an empty list, fall through with the list itself: it has neither
        # .text nor .extractions, so the defaults below apply.
        result = documents[0] if documents else results

        output_dict = {
            "source_text": getattr(result, "text", input_text),
            "extractions": [
                _extraction_to_dict(ext)
                for ext in (getattr(result, "extractions", None) or [])
            ],
        }

        download_path = _write_download_file(documents, output_dict)
        stats = generate_statistics(output_dict)
        history = save_to_history(input_text, output_dict, template_name)

        return output_dict, download_path, stats, history, gr.update(visible=True)

    except Exception as e:
        # Top-level boundary for the UI callback: log the traceback to the
        # console and surface a short message in the Gradio error toast.
        error_detail = traceback.format_exc()
        print(f"详细错误信息:\n{error_detail}")
        raise gr.Error(f"❌ 提取失败: {str(e)}") from e


def load_template(template_name):
    """Return ``(prompt, example_json)`` for a preset, or two empty strings if unknown."""
    template = REPORT_TEMPLATES.get(template_name)
    if template is None:
        return "", ""
    return template["prompt"], template["example"]


def load_sample_text(sample_name):
    """Return the sample text registered under ``sample_name``, or ``""`` if none."""
    return SAMPLE_TEXTS.get(sample_name, "")


# --- Gradio interface ---
custom_css = """
#header {
    text-align: center;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 2rem;
    border-radius: 10px;
    color: white;
    margin-bottom: 2rem;
}
#header h1 {
    margin: 0;
    font-size: 2.5rem;
    font-weight: 700;
}
#header p {
    margin: 0.5rem 0 0 0;
    font-size: 1.1rem;
    opacity: 0.95;
}
.template-btn {
    margin: 0.25rem !important;
}
#stats-box {
    background: #f8f9fa;
    padding: 1rem;
    border-radius: 8px;
    border-left: 4px solid #667eea;
}
"""

with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="医学信息智能提取系统") as demo:
    with gr.Row(elem_id="header"):
        gr.Markdown("""
        # 🏥 医学信息智能提取系统
        ### 基于 AI 的结构化医学文本分析工具
        """)

    with gr.Row():
        # Left column: API key, template choice, prompt and examples.
        with gr.Column(scale=1):
            gr.Markdown("### 🔐 API 配置")
            api_key_input = gr.Textbox(
                label="Google AI Studio API Key",
                type="password",
                placeholder="请输入您的 API 密钥...",
                info="获取密钥: https://aistudio.google.com/app/apikey",
            )

            gr.Markdown("### 📋 选择报告类型")
            template_selector = gr.Radio(
                choices=list(REPORT_TEMPLATES.keys()),
                value="影像报告",
                label="预设模板",
                info="选择适合您文本类型的模板",
            )
            load_template_btn = gr.Button("📥 加载模板", variant="secondary", size="sm")

            gr.Markdown("### ✏️ 自定义提取规则")
            prompt_input = gr.Textbox(
                label="提取指令",
                value=REPORT_TEMPLATES["影像报告"]["prompt"],
                lines=8,
                placeholder="描述您想提取的信息类型和规则...",
            )

            with gr.Accordion("🎯 提取示例 (JSON格式)", open=False):
                examples_input = gr.Code(
                    value=REPORT_TEMPLATES["影像报告"]["example"],
                    language="json",
                    lines=15,
                    label="示例数据",
                )

        # Middle column: the text to analyse and the action buttons.
        with gr.Column(scale=1):
            gr.Markdown("### 📄 输入医学文本")
            sample_selector = gr.Dropdown(
                choices=list(SAMPLE_TEXTS.keys()),
                label="快速加载示例",
                value=None,
            )
            text_input = gr.Textbox(
                label="待提取文本",
                lines=18,
                placeholder="请粘贴或输入医学报告、病历等文本...\n\n支持:\n• 影像报告 (CT/MRI/X线等)\n• 病理报告\n• 病历记录\n• 检验报告",
                max_lines=25,
            )
            with gr.Row():
                clear_btn = gr.Button("🗑️ 清空", size="sm")
                submit_btn = gr.Button("🚀 开始提取", variant="primary", size="lg", scale=2)

        # Right column: results, statistics, history and the download link.
        with gr.Column(scale=1):
            gr.Markdown("### ✨ 提取结果")
            result_tabs = gr.Tabs()
            with result_tabs:
                with gr.Tab("📊 结构化数据"):
                    json_output = gr.JSON(label="提取结果", show_label=False)
                with gr.Tab("📈 统计分析"):
                    stats_output = gr.Markdown("点击'开始提取'后显示统计信息", elem_id="stats-box")
                with gr.Tab("📜 历史记录"):
                    history_output = gr.Markdown("暂无提取历史")
            # Hidden until the first successful extraction.
            file_output = gr.File(label="💾 下载结果文件 (.jsonl)", visible=False)

    with gr.Accordion("ℹ️ 使用说明", open=False):
        gr.Markdown("""
        ### 使用步骤
        1. **输入 API 密钥**: 从 Google AI Studio 获取免费 API 密钥
        2. **选择模板**: 根据文本类型选择预设模板,或自定义提取规则
        3. **输入文本**: 粘贴您的医学报告或病历文本
        4. **开始提取**: 点击提取按钮,AI 将自动识别并结构化关键信息
        5. **查看结果**: 在右侧查看结构化数据、统计分析和历史记录

        ### 支持的报告类型
        - **影像报告**: CT、MRI、X线、超声等各类影像学检查
        - **病理报告**: 组织病理、细胞病理、免疫组化等
        - **病历记录**: 入院记录、病程记录、出院小结等

        ### 提示
        - 提供高质量的示例可显著提升提取准确度
        - 可同时处理多份报告(用空行分隔)
        - 结果可导出为 JSONL 格式供后续分析使用
        """)

    # --- Event wiring ---
    load_template_btn.click(
        fn=load_template,
        inputs=[template_selector],
        outputs=[prompt_input, examples_input],
    )
    sample_selector.change(
        fn=load_sample_text,
        inputs=[sample_selector],
        outputs=[text_input],
    )
    clear_btn.click(
        fn=lambda: "",
        outputs=[text_input],
    )
    # file_output is listed twice on purpose: the second return value of
    # extract_information sets its file path, the fifth makes it visible.
    submit_btn.click(
        fn=extract_information,
        inputs=[api_key_input, prompt_input, examples_input, text_input, template_selector],
        outputs=[json_output, file_output, stats_output, history_output, file_output],
    )

if __name__ == "__main__":
    demo.launch()