Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import langextract as lx | |
| import json | |
| import os | |
| import tempfile | |
| import textwrap | |
| from datetime import datetime | |
# --- Preset template library ---
def _to_example_json(examples):
    """Serialize few-shot examples as indented JSON, keeping CJK text unescaped."""
    return json.dumps(examples, ensure_ascii=False, indent=2)


# Maps a report-type label (shown in the UI) to its extraction prompt and to
# the few-shot examples, stored as a JSON string so the UI can edit them.
REPORT_TEMPLATES = {
    "影像报告": {
        "prompt": textwrap.dedent("""\
            请从影像检查报告中提取以下关键信息:
            - 检查部位和器官
            - 病灶的位置、大小、密度/信号特征
            - 增强扫描表现
            - 阴性发现(未见异常的部位)
            提取规则:
            1. 必须使用报告中的原文,不要改写
            2. 保留数值的精确度
            3. 完整提取解剖学描述"""),
        "example": _to_example_json([
            {
                "text": "胸部CT平扫显示:右肺上叶尖段见结节影,大小约1.2 x 0.9 cm,边缘毛糙,密度不均。纵隔淋巴结未见明显肿大。",
                "extractions": [
                    {
                        "extraction_class": "anatomy",
                        "extraction_text": "右肺上叶尖段",
                        "attributes": {"organ": "肺", "side": "右", "lobe": "上叶", "segment": "尖段"},
                    },
                    {
                        "extraction_class": "finding",
                        "extraction_text": "结节影",
                        "attributes": {"type": "结节"},
                    },
                    {
                        "extraction_class": "size",
                        "extraction_text": "1.2 x 0.9 cm",
                        "attributes": {"length": "1.2", "width": "0.9", "unit": "cm"},
                    },
                    {
                        "extraction_class": "feature",
                        "extraction_text": "边缘毛糙",
                        "attributes": {"feature_type": "边缘", "description": "毛糙"},
                    },
                    {
                        "extraction_class": "feature",
                        "extraction_text": "密度不均",
                        "attributes": {"feature_type": "密度", "description": "不均"},
                    },
                    {
                        "extraction_class": "negative_finding",
                        "extraction_text": "纵隔淋巴结未见明显肿大",
                        "attributes": {"location": "纵隔", "structure": "淋巴结"},
                    },
                ],
            },
        ]),
    },
    "病理报告": {
        "prompt": textwrap.dedent("""\
            请从病理报告中提取以下关键信息:
            - 标本来源和类型
            - 组织学分型
            - 肿瘤分级
            - 免疫组化结果
            - 切缘情况
            提取规则:
            1. 完整保留病理诊断术语
            2. 提取所有阳性和阴性的免疫组化标记
            3. 保留分级和分期信息"""),
        "example": _to_example_json([
            {
                "text": "(右乳腺)浸润性导管癌,组织学II级,肿瘤大小2.5cm。免疫组化:ER(+,90%),PR(+,70%),HER-2(-),Ki-67(30%)。切缘未见癌累及。",
                "extractions": [
                    {
                        "extraction_class": "specimen",
                        "extraction_text": "右乳腺",
                        "attributes": {"location": "右乳腺"},
                    },
                    {
                        "extraction_class": "diagnosis",
                        "extraction_text": "浸润性导管癌",
                        "attributes": {"type": "癌", "subtype": "浸润性导管癌"},
                    },
                    {
                        "extraction_class": "grade",
                        "extraction_text": "组织学II级",
                        "attributes": {"grading_system": "组织学", "grade": "II"},
                    },
                    {
                        "extraction_class": "size",
                        "extraction_text": "2.5cm",
                        "attributes": {"value": "2.5", "unit": "cm"},
                    },
                    {
                        "extraction_class": "ihc_marker",
                        "extraction_text": "ER(+,90%)",
                        "attributes": {"marker": "ER", "result": "阳性", "percentage": "90"},
                    },
                    {
                        "extraction_class": "ihc_marker",
                        "extraction_text": "PR(+,70%)",
                        "attributes": {"marker": "PR", "result": "阳性", "percentage": "70"},
                    },
                    {
                        "extraction_class": "ihc_marker",
                        "extraction_text": "HER-2(-)",
                        "attributes": {"marker": "HER-2", "result": "阴性"},
                    },
                    {
                        "extraction_class": "ihc_marker",
                        "extraction_text": "Ki-67(30%)",
                        "attributes": {"marker": "Ki-67", "percentage": "30"},
                    },
                    {
                        "extraction_class": "margin",
                        "extraction_text": "切缘未见癌累及",
                        "attributes": {"status": "阴性"},
                    },
                ],
            },
        ]),
    },
    "病历摘要": {
        "prompt": textwrap.dedent("""\
            请从病历中提取以下关键信息:
            - 主诉和现病史要点
            - 既往史(重要疾病和手术史)
            - 体格检查阳性体征
            - 辅助检查异常结果
            - 诊断和治疗方案
            提取规则:
            1. 提取关键时间节点
            2. 保留症状的完整描述
            3. 提取数值型指标"""),
        "example": _to_example_json([
            {
                "text": "患者因反复胸痛3月加重1周入院。既往有高血压病史10年。查体:BP 150/95mmHg,心率92次/分。",
                "extractions": [
                    {
                        "extraction_class": "chief_complaint",
                        "extraction_text": "反复胸痛3月加重1周",
                        "attributes": {"symptom": "胸痛", "duration": "3月", "change": "加重1周"},
                    },
                    {
                        "extraction_class": "past_history",
                        "extraction_text": "高血压病史10年",
                        "attributes": {"disease": "高血压", "duration": "10年"},
                    },
                    {
                        "extraction_class": "vital_sign",
                        "extraction_text": "BP 150/95mmHg",
                        "attributes": {"type": "血压", "systolic": "150", "diastolic": "95", "unit": "mmHg"},
                    },
                    {
                        "extraction_class": "vital_sign",
                        "extraction_text": "心率92次/分",
                        "attributes": {"type": "心率", "value": "92", "unit": "次/分"},
                    },
                ],
            },
        ]),
    },
}
# --- Sample text library ---
# Ready-made inputs offered by the "quick load" dropdown, keyed by the label
# shown to the user. Each value is a single string; adjacent literals are
# concatenated by the parser purely to keep the lines readable.
SAMPLE_TEXTS = {
    "影像报告示例": (
        "头颅MRI平扫增强:左侧基底节区见片状异常信号影,T1WI呈低信号,T2WI及FLAIR呈高信号,"
        "大小约4.5 x 3.2 x 3.8 cm,周围见大片水肿带。"
        "增强扫描后病灶呈不均匀明显强化,周围水肿带未见强化。"
        "中线结构轻度右移约0.5cm。"
        "双侧侧脑室对称,未见明显扩大。"
    ),
    "病理报告示例": (
        "(胃窦)腺癌,中分化,浸润至肌层。肿瘤大小3.0 x 2.5 cm。"
        "免疫组化:CK(+),CK7(-),CK20(+),CDX-2(+),Her-2(1+),Ki-67阳性指数约40%。"
        "送检淋巴结12枚,见癌转移3枚(3/12)。"
    ),
    "病历示例": (
        "患者女性58岁,因发现左乳腺肿物2月入院。"
        "患者2月前无意中发现左乳腺外上象限肿物,约2cm大小,无疼痛,无乳头溢液。"
        "既往体健。"
        "查体:左乳外上象限可触及约2.5 x 2.0cm肿物,质硬,边界欠清,活动度差,无压痛。"
        "左侧腋窝可触及1枚肿大淋巴结,约1.5cm。"
        "辅助检查:乳腺超声示左乳外上象限低回声结节,BI-RADS 4C类。"
    ),
}
# --- Extraction history management ---
# NOTE(review): this list is module-level state, so every session served by
# the same process reads and writes the same history — confirm that sharing
# previews of submitted text between users is acceptable.
_MAX_HISTORY_ENTRIES = 10
_PREVIEW_CHARS = 100
extraction_history = []


def save_to_history(input_text, result, template_name):
    """Record one extraction run and return the refreshed history markdown.

    Args:
        input_text: The text that was submitted; only a preview of at most
            100 characters (plus "..." when truncated) is stored separately.
        result: The extraction result dict; its "extractions" entry is
            counted. A missing or ``None`` entry counts as zero.
        template_name: Label of the template that was selected for the run.

    Returns:
        The markdown produced by ``format_history_display()`` after the new
        entry has been added.
    """
    text = input_text or ""
    if len(text) > _PREVIEW_CHARS:
        preview = text[:_PREVIEW_CHARS] + "..."
    else:
        preview = text

    extraction_history.insert(0, {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "template": template_name,
        "input_preview": preview,
        # `or []` guards against an explicit None under the key.
        "extraction_count": len(result.get("extractions") or []),
        "full_result": result,
    })
    # Newest entries sit at the front; drop whatever falls past the cap.
    del extraction_history[_MAX_HISTORY_ENTRIES:]
    return format_history_display()


def format_history_display():
    """Render the stored history as markdown, newest entry first.

    Returns:
        A placeholder message when the history is empty, otherwise one
        markdown section per entry numbered from 1.
    """
    if not extraction_history:
        return "暂无提取历史"
    return "".join(
        f"### 记录 {i} - {entry['timestamp']}\n"
        f"**模板**: {entry['template']} | **提取项**: {entry['extraction_count']}项\n"
        f"**文本预览**: {entry['input_preview']}\n\n"
        for i, entry in enumerate(extraction_history, 1)
    )
# --- Statistics generation ---
def generate_statistics(result):
    """Summarize an extraction result as a markdown statistics block.

    Args:
        result: Dict whose "extractions" entry is a list of dicts, each
            optionally carrying an "extraction_class" key.

    Returns:
        A placeholder message when there are no extractions; otherwise
        markdown with the total count followed by per-class counts, most
        frequent class first (ties keep first-seen order).
    """
    from collections import Counter

    extractions = result.get("extractions") or []
    if not extractions:
        return "暂无统计信息"

    # Missing, None and empty class names all fall into the same
    # "unclassified" bucket instead of rendering as a blank label.
    class_counts = Counter(
        ext.get("extraction_class") or "未分类" for ext in extractions
    )

    parts = [
        "### 📊 提取统计\n",
        f"**总提取项数**: {len(extractions)}\n\n",
        "**分类统计**:\n",
    ]
    parts.extend(
        f"- {cls}: {count}项\n" for cls, count in class_counts.most_common()
    )
    return "".join(parts)
# --- Backend processing ---
def _char_interval_to_dict(extraction):
    """Return an extraction's source offsets as a JSON-serializable dict, or None."""
    # langextract stores offsets on `char_interval` (start_pos / end_pos).
    interval = getattr(extraction, "char_interval", None)
    if interval is None:
        return None
    return {
        "start_pos": getattr(interval, "start_pos", None),
        "end_pos": getattr(interval, "end_pos", None),
    }


def extract_information(api_key, prompt, examples_json, input_text, template_name):
    """Run a langextract extraction and package the results for the UI.

    Args:
        api_key: API key forwarded to ``lx.extract`` for this call only.
        prompt: Natural-language extraction instructions.
        examples_json: JSON text holding a list of ``{"text", "extractions"}``
            few-shot examples.
        input_text: The document to extract from.
        template_name: Label of the selected template, recorded in history.

    Returns:
        A 5-tuple: the result dict (``source_text`` and ``extractions``), the
        path of a ``.jsonl`` file to download, the statistics markdown, the
        history markdown, and a Gradio update that makes the file widget
        visible.

    Raises:
        gr.Error: If the key or text is missing, the examples cannot be
            parsed, or the extraction / saving step fails.
    """
    from pathlib import Path

    if not api_key:
        raise gr.Error("⚠️ 请输入您的 Google AI Studio API 密钥")
    if not input_text or not input_text.strip():
        raise gr.Error("⚠️ 请输入待提取的文本内容")

    try:
        examples_data = json.loads(examples_json)
        examples = [
            lx.data.ExampleData(
                text=ex['text'],
                extractions=[lx.data.Extraction(**extr) for extr in ex['extractions']],
            )
            for ex in examples_data
        ]
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        # TypeError covers wrong shapes (e.g. a non-list top level) and
        # unexpected keys passed to the Extraction constructor.
        raise gr.Error(f"❌ 示例JSON格式错误: {e}") from e

    try:
        # Pass the key per call: writing it to os.environ would share it with
        # every other request handled by this process.
        results = lx.extract(
            text_or_documents=input_text,
            prompt_description=prompt,
            examples=examples,
            model_id="gemini-2.5-flash",
            api_key=api_key,
        )

        if isinstance(results, list) and len(results) > 0:
            result = results[0]
        else:
            result = results

        extractions_list = [
            {
                "extraction_class": getattr(ext, 'extraction_class', ""),
                "extraction_text": getattr(ext, 'extraction_text', ""),
                # Key name kept for existing consumers of the output.
                "char_span": _char_interval_to_dict(ext),
                "attributes": getattr(ext, 'attributes', None) or {},
            }
            for ext in (getattr(result, 'extractions', None) or [])
        ]
        output_dict = {
            "source_text": result.text if hasattr(result, 'text') else input_text,
            "extractions": extractions_list,
        }

        # Let the library write its own JSONL into a scratch directory, naming
        # the file explicitly so it can be found again afterwards.
        output_name = 'output.jsonl'
        documents_to_save = results if isinstance(results, list) else [results]
        with tempfile.TemporaryDirectory() as tmp_dir:
            lx.io.save_annotated_documents(
                documents_to_save,
                output_dir=Path(tmp_dir),
                output_name=output_name,
            )
            saved_file = Path(tmp_dir) / output_name
            if saved_file.exists():
                payload = saved_file.read_text(encoding='utf-8')
            else:
                # Fallback: one compact JSON object per line keeps the
                # download valid JSONL.
                payload = json.dumps(output_dict, ensure_ascii=False) + "\n"

        # Create the persistent download file only once there is content, so
        # a failure above leaves no stray temp file. delete=False because
        # Gradio reads the file after this function returns.
        with tempfile.NamedTemporaryFile(
            mode='w', delete=False, suffix='.jsonl', encoding='utf-8'
        ) as final_output_file:
            final_output_file.write(payload)
            download_path = final_output_file.name

        stats = generate_statistics(output_dict)
        history = save_to_history(input_text, output_dict, template_name)
        return output_dict, download_path, stats, history, gr.update(visible=True)
    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        print(f"详细错误信息:\n{error_detail}")
        raise gr.Error(f"❌ 提取失败: {str(e)}") from e
def load_template(template_name):
    """Look up a preset template by name.

    Returns the template's ``(prompt, example)`` pair, or a pair of empty
    strings when the name is not in ``REPORT_TEMPLATES``.
    """
    try:
        preset = REPORT_TEMPLATES[template_name]
    except KeyError:
        return "", ""
    return preset["prompt"], preset["example"]
def load_sample_text(sample_name):
    """Return the sample text stored under *sample_name*, or "" if unknown."""
    if sample_name in SAMPLE_TEXTS:
        return SAMPLE_TEXTS[sample_name]
    return ""
# --- Gradio interface ---
# NOTE(review): the nesting of the layout below was reconstructed from a copy
# of this file whose indentation had been stripped — confirm the row/column
# structure against the running app.
custom_css = """
#header {
text-align: center;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 2rem;
border-radius: 10px;
color: white;
margin-bottom: 2rem;
}
#header h1 {
margin: 0;
font-size: 2.5rem;
font-weight: 700;
}
#header p {
margin: 0.5rem 0 0 0;
font-size: 1.1rem;
opacity: 0.95;
}
.template-btn {
margin: 0.25rem !important;
}
#stats-box {
background: #f8f9fa;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #667eea;
}
"""

# Template whose prompt and examples pre-fill the form on first load.
_DEFAULT_TEMPLATE = "影像报告"

with gr.Blocks(title="医学信息智能提取系统", theme=gr.themes.Soft(), css=custom_css) as demo:
    # Page banner.
    with gr.Row(elem_id="header"):
        gr.Markdown("""
# 🏥 医学信息智能提取系统
### 基于 AI 的结构化医学文本分析工具
""")

    with gr.Row():
        # Left column: credentials, template choice and extraction rules.
        with gr.Column(scale=1):
            gr.Markdown("### 🔐 API 配置")
            api_key_input = gr.Textbox(
                label="Google AI Studio API Key",
                type="password",
                placeholder="请输入您的 API 密钥...",
                info="获取密钥: https://aistudio.google.com/app/apikey",
            )

            gr.Markdown("### 📋 选择报告类型")
            template_selector = gr.Radio(
                choices=list(REPORT_TEMPLATES),
                value=_DEFAULT_TEMPLATE,
                label="预设模板",
                info="选择适合您文本类型的模板",
            )
            load_template_btn = gr.Button("📥 加载模板", variant="secondary", size="sm")

            gr.Markdown("### ✏️ 自定义提取规则")
            prompt_input = gr.Textbox(
                label="提取指令",
                value=REPORT_TEMPLATES[_DEFAULT_TEMPLATE]["prompt"],
                lines=8,
                placeholder="描述您想提取的信息类型和规则...",
            )
            with gr.Accordion("🎯 提取示例 (JSON格式)", open=False):
                examples_input = gr.Code(
                    value=REPORT_TEMPLATES[_DEFAULT_TEMPLATE]["example"],
                    language="json",
                    lines=15,
                    label="示例数据",
                )

        # Middle column: the text to analyze and the action buttons.
        with gr.Column(scale=1):
            gr.Markdown("### 📄 输入医学文本")
            sample_selector = gr.Dropdown(
                choices=list(SAMPLE_TEXTS),
                label="快速加载示例",
                value=None,
            )
            text_input = gr.Textbox(
                label="待提取文本",
                lines=18,
                placeholder="请粘贴或输入医学报告、病历等文本...\n\n支持:\n• 影像报告 (CT/MRI/X线等)\n• 病理报告\n• 病历记录\n• 检验报告",
                max_lines=25,
            )
            with gr.Row():
                clear_btn = gr.Button("🗑️ 清空", size="sm")
                submit_btn = gr.Button("🚀 开始提取", variant="primary", size="lg", scale=2)

        # Right column: results, statistics, history and the download link.
        with gr.Column(scale=1):
            gr.Markdown("### ✨ 提取结果")
            with gr.Tabs() as result_tabs:
                with gr.Tab("📊 结构化数据"):
                    json_output = gr.JSON(label="提取结果", show_label=False)
                with gr.Tab("📈 统计分析"):
                    stats_output = gr.Markdown("点击'开始提取'后显示统计信息", elem_id="stats-box")
                with gr.Tab("📜 历史记录"):
                    history_output = gr.Markdown("暂无提取历史")
            # Hidden until an extraction has produced a file.
            file_output = gr.File(label="💾 下载结果文件 (.jsonl)", visible=False)

    with gr.Accordion("ℹ️ 使用说明", open=False):
        gr.Markdown("""
### 使用步骤
1. **输入 API 密钥**: 从 Google AI Studio 获取免费 API 密钥
2. **选择模板**: 根据文本类型选择预设模板,或自定义提取规则
3. **输入文本**: 粘贴您的医学报告或病历文本
4. **开始提取**: 点击提取按钮,AI 将自动识别并结构化关键信息
5. **查看结果**: 在右侧查看结构化数据、统计分析和历史记录
### 支持的报告类型
- **影像报告**: CT、MRI、X线、超声等各类影像学检查
- **病理报告**: 组织病理、细胞病理、免疫组化等
- **病历记录**: 入院记录、病程记录、出院小结等
### 提示
- 提供高质量的示例可显著提升提取准确度
- 可同时处理多份报告(用空行分隔)
- 结果可导出为 JSONL 格式供后续分析使用
""")

    # --- Event wiring ---
    load_template_btn.click(
        fn=load_template,
        inputs=[template_selector],
        outputs=[prompt_input, examples_input],
    )
    sample_selector.change(
        fn=load_sample_text,
        inputs=[sample_selector],
        outputs=[text_input],
    )
    clear_btn.click(
        fn=lambda: "",
        outputs=[text_input],
    )
    # file_output is listed twice on purpose: the handler returns the file
    # path for the first slot and a visibility update for the second.
    submit_btn.click(
        fn=extract_information,
        inputs=[api_key_input, prompt_input, examples_input, text_input, template_selector],
        outputs=[json_output, file_output, stats_output, history_output, file_output],
    )

if __name__ == "__main__":
    demo.launch()