Spaces:

leonsimon23
/

exactEMR

Sleeping

App Files Files Community

exactEMR / app.py

leonsimon23

Update app.py

c7a353d verified about 2 months ago

raw

history blame contribute delete

18.2 kB

	import json
	import os
	import tempfile
	import textwrap
	import traceback
	from collections import Counter
	from datetime import datetime

	import gradio as gr
	import langextract as lx

	# --- Preset template library ---
	# Maps a report-type name (shown in the UI radio selector) to a template dict:
	#   "prompt":  extraction instructions, dedented with textwrap.dedent
	#   "example": a JSON *string* (json.dumps with ensure_ascii=False, indent=2)
	#              holding a list of examples; each example has a "text" and a list
	#              of "extractions" with "extraction_class", "extraction_text" and
	#              "attributes" keys.
	REPORT_TEMPLATES = {
	# Imaging report template
	"影像报告": {
	"prompt": textwrap.dedent("""\
	请从影像检查报告中提取以下关键信息：
	- 检查部位和器官
	- 病灶的位置、大小、密度/信号特征
	- 增强扫描表现
	- 阴性发现（未见异常的部位）

	提取规则：
	1. 必须使用报告中的原文，不要改写
	2. 保留数值的精确度
	3. 完整提取解剖学描述"""),
	"example": json.dumps([{
	"text": "胸部CT平扫显示：右肺上叶尖段见结节影，大小约1.2 x 0.9 cm，边缘毛糙，密度不均。纵隔淋巴结未见明显肿大。",
	"extractions": [
	{"extraction_class": "anatomy", "extraction_text": "右肺上叶尖段", "attributes": {"organ": "肺", "side": "右", "lobe": "上叶", "segment": "尖段"}},
	{"extraction_class": "finding", "extraction_text": "结节影", "attributes": {"type": "结节"}},
	{"extraction_class": "size", "extraction_text": "1.2 x 0.9 cm", "attributes": {"length": "1.2", "width": "0.9", "unit": "cm"}},
	{"extraction_class": "feature", "extraction_text": "边缘毛糙", "attributes": {"feature_type": "边缘", "description": "毛糙"}},
	{"extraction_class": "feature", "extraction_text": "密度不均", "attributes": {"feature_type": "密度", "description": "不均"}},
	{"extraction_class": "negative_finding", "extraction_text": "纵隔淋巴结未见明显肿大", "attributes": {"location": "纵隔", "structure": "淋巴结"}}
	]
	}], ensure_ascii=False, indent=2)
	},
	# Pathology report template
	"病理报告": {
	"prompt": textwrap.dedent("""\
	请从病理报告中提取以下关键信息：
	- 标本来源和类型
	- 组织学分型
	- 肿瘤分级
	- 免疫组化结果
	- 切缘情况

	提取规则：
	1. 完整保留病理诊断术语
	2. 提取所有阳性和阴性的免疫组化标记
	3. 保留分级和分期信息"""),
	"example": json.dumps([{
	"text": "（右乳腺）浸润性导管癌，组织学II级，肿瘤大小2.5cm。免疫组化：ER(+,90%)，PR(+,70%)，HER-2(-)，Ki-67(30%)。切缘未见癌累及。",
	"extractions": [
	{"extraction_class": "specimen", "extraction_text": "右乳腺", "attributes": {"location": "右乳腺"}},
	{"extraction_class": "diagnosis", "extraction_text": "浸润性导管癌", "attributes": {"type": "癌", "subtype": "浸润性导管癌"}},
	{"extraction_class": "grade", "extraction_text": "组织学II级", "attributes": {"grading_system": "组织学", "grade": "II"}},
	{"extraction_class": "size", "extraction_text": "2.5cm", "attributes": {"value": "2.5", "unit": "cm"}},
	{"extraction_class": "ihc_marker", "extraction_text": "ER(+,90%)", "attributes": {"marker": "ER", "result": "阳性", "percentage": "90"}},
	{"extraction_class": "ihc_marker", "extraction_text": "PR(+,70%)", "attributes": {"marker": "PR", "result": "阳性", "percentage": "70"}},
	{"extraction_class": "ihc_marker", "extraction_text": "HER-2(-)", "attributes": {"marker": "HER-2", "result": "阴性"}},
	{"extraction_class": "ihc_marker", "extraction_text": "Ki-67(30%)", "attributes": {"marker": "Ki-67", "percentage": "30"}},
	{"extraction_class": "margin", "extraction_text": "切缘未见癌累及", "attributes": {"status": "阴性"}}
	]
	}], ensure_ascii=False, indent=2)
	},
	# Medical record summary template
	"病历摘要": {
	"prompt": textwrap.dedent("""\
	请从病历中提取以下关键信息：
	- 主诉和现病史要点
	- 既往史（重要疾病和手术史）
	- 体格检查阳性体征
	- 辅助检查异常结果
	- 诊断和治疗方案

	提取规则：
	1. 提取关键时间节点
	2. 保留症状的完整描述
	3. 提取数值型指标"""),
	"example": json.dumps([{
	"text": "患者因反复胸痛3月加重1周入院。既往有高血压病史10年。查体：BP 150/95mmHg，心率92次/分。",
	"extractions": [
	{"extraction_class": "chief_complaint", "extraction_text": "反复胸痛3月加重1周", "attributes": {"symptom": "胸痛", "duration": "3月", "change": "加重1周"}},
	{"extraction_class": "past_history", "extraction_text": "高血压病史10年", "attributes": {"disease": "高血压", "duration": "10年"}},
	{"extraction_class": "vital_sign", "extraction_text": "BP 150/95mmHg", "attributes": {"type": "血压", "systolic": "150", "diastolic": "95", "unit": "mmHg"}},
	{"extraction_class": "vital_sign", "extraction_text": "心率92次/分", "attributes": {"type": "心率", "value": "92", "unit": "次/分"}}
	]
	}], ensure_ascii=False, indent=2)
	}
	}

	# --- Sample text library ---
	# Maps a sample name (offered in the "quick load" dropdown) to a ready-made
	# input text: one imaging report, one pathology report and one medical record.
	SAMPLE_TEXTS = {
	"影像报告示例": "头颅MRI平扫增强：左侧基底节区见片状异常信号影，T1WI呈低信号，T2WI及FLAIR呈高信号，大小约4.5 x 3.2 x 3.8 cm，周围见大片水肿带。增强扫描后病灶呈不均匀明显强化，周围水肿带未见强化。中线结构轻度右移约0.5cm。双侧侧脑室对称，未见明显扩大。",
	"病理报告示例": "（胃窦）腺癌，中分化，浸润至肌层。肿瘤大小3.0 x 2.5 cm。免疫组化：CK(+)，CK7(-)，CK20(+)，CDX-2(+)，Her-2(1+)，Ki-67阳性指数约40%。送检淋巴结12枚，见癌转移3枚（3/12）。",
	"病历示例": "患者女性58岁，因发现左乳腺肿物2月入院。患者2月前无意中发现左乳腺外上象限肿物，约2cm大小，无疼痛，无乳头溢液。既往体健。查体：左乳外上象限可触及约2.5 x 2.0cm肿物，质硬，边界欠清，活动度差，无压痛。左侧腋窝可触及1枚肿大淋巴结，约1.5cm。辅助检查：乳腺超声示左乳外上象限低回声结节，BI-RADS 4C类。"
	}

	# --- History management ---
	# In-memory list of history entries, newest first: save_to_history() inserts
	# at index 0 and drops the last entry once the list holds more than 10.
	# NOTE(review): this is a module-level list, so it is presumably shared by
	# every session served by this process and lost on restart; confirm that
	# is intended, since entries carry the submitted text and full results.
	extraction_history = []

	def save_to_history(input_text, result, template_name):
	    """Record one extraction run and return the refreshed history view.

	    The new entry goes to the front of the module-level
	    ``extraction_history`` list; when the list then holds more than 10
	    entries, the last (oldest) one is removed.

	    Args:
	        input_text: The text that was submitted for extraction.
	        result: Result dict; the length of its ``"extractions"`` list
	            (empty if absent) is stored as the item count.
	        template_name: Name of the template stored with the entry.

	    Returns:
	        The string produced by ``format_history_display()``.
	    """
	    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

	    # Only texts longer than 100 characters are shortened for the preview.
	    if len(input_text) > 100:
	        preview = input_text[:100] + "..."
	    else:
	        preview = input_text

	    found = result.get("extractions", [])
	    record = {
	        "timestamp": stamp,
	        "template": template_name,
	        "input_preview": preview,
	        "extraction_count": len(found),
	        "full_result": result,
	    }

	    extraction_history.insert(0, record)
	    if len(extraction_history) > 10:
	        del extraction_history[-1]

	    return format_history_display()

	def format_history_display(history=None):
	    """Render extraction history entries as Markdown text.

	    Args:
	        history: Optional sequence of history entries, each a dict with
	            ``"timestamp"``, ``"template"``, ``"extraction_count"`` and
	            ``"input_preview"`` keys. When omitted, the module-level
	            ``extraction_history`` list is rendered.

	    Returns:
	        One Markdown section per entry, numbered from 1 in sequence order,
	        or the placeholder ``"暂无提取历史"`` when there are no entries.
	    """
	    entries = extraction_history if history is None else history
	    if not entries:
	        return "暂无提取历史"

	    # "\\|" emits the same backslash-pipe text as before, but without the
	    # invalid "\|" escape sequence that newer Python versions warn about.
	    sections = [
	        f"### 记录 {i} - {entry['timestamp']}\n"
	        f"模板: {entry['template']} \\| 提取项: {entry['extraction_count']}项\n"
	        f"文本预览: {entry['input_preview']}\n\n"
	        for i, entry in enumerate(entries, 1)
	    ]
	    return "".join(sections)

	# --- 统计信息生成 ---
	def generate_statistics(result):
	    """Summarise an extraction result as Markdown statistics.

	    Args:
	        result: Dict whose ``"extractions"`` entry is a list of dicts; each
	            item's ``"extraction_class"`` is used as its category.

	    Returns:
	        Markdown with the total item count followed by per-class counts
	        (largest first, ties in first-seen order), or ``"暂无统计信息"``
	        when there are no extractions.
	    """
	    extractions = result.get("extractions", [])
	    if not extractions:
	        return "暂无统计信息"

	    # `or` (rather than a .get default) also sends empty or None class names
	    # to the "未分类" bucket: extract_information stores "" when a class is
	    # missing, which would otherwise be listed under a blank label.
	    class_counts = Counter(
	        ext.get("extraction_class") or "未分类" for ext in extractions
	    )

	    lines = [
	        "### 📊 提取统计",
	        f"总提取项数: {len(extractions)}",
	        "",
	        "分类统计:",
	    ]
	    lines.extend(
	        f"- {cls}: {count}项" for cls, count in class_counts.most_common()
	    )
	    return "\n".join(lines) + "\n"

	# --- 后端处理函数 ---
	def extract_information(api_key, prompt, examples_json, input_text, template_name):
	    """Run a langextract extraction and package the results for the UI.

	    Args:
	        api_key: API key handed to ``lx.extract``.
	        prompt: Extraction instructions (passed as ``prompt_description``).
	        examples_json: JSON text holding a list of objects with ``"text"``
	            and ``"extractions"``; each extraction is a dict of keyword
	            arguments for ``lx.data.Extraction``.
	        input_text: The text to extract from.
	        template_name: Template label stored with the history entry.

	    Returns:
	        A 5-tuple ``(output_dict, download_path, stats, history, update)``:
	        the result dict (``"source_text"`` and ``"extractions"``), the path
	        of a ``.jsonl`` file holding the results, the statistics Markdown,
	        the history Markdown, and ``gr.update(visible=True)``.

	    Raises:
	        gr.Error: If the key or text is missing, the examples JSON is
	            malformed, or the extraction / saving step fails.
	    """
	    if not api_key or not api_key.strip():
	        raise gr.Error("⚠️ 请输入您的 Google AI Studio API 密钥")
	    if not input_text or not input_text.strip():
	        raise gr.Error("⚠️ 请输入待提取的文本内容")

	    examples = _parse_examples(examples_json)

	    try:
	        # The key is passed per call instead of being written to os.environ:
	        # the environment is process-wide, so a stored key would remain
	        # visible to every later call made in this process.
	        results = lx.extract(
	            text_or_documents=input_text,
	            prompt_description=prompt,
	            examples=examples,
	            model_id="gemini-2.5-flash",
	            api_key=api_key.strip(),
	        )

	        # A non-empty list is represented in the UI by its first document.
	        result = results[0] if isinstance(results, list) and results else results

	        # `or []` covers a result whose `extractions` attribute is None.
	        extractions_list = [
	            _extraction_to_dict(ext)
	            for ext in (getattr(result, "extractions", None) or [])
	        ]
	        output_dict = {
	            "source_text": getattr(result, "text", input_text),
	            "extractions": extractions_list,
	        }

	        documents = results if isinstance(results, list) else [results]
	        download_path = _write_download_file(documents, output_dict)

	        stats = generate_statistics(output_dict)
	        history = save_to_history(input_text, output_dict, template_name)

	        return output_dict, download_path, stats, history, gr.update(visible=True)

	    except Exception as e:
	        # Top-level boundary: log the full traceback, show a short message.
	        print(f"详细错误信息:\n{traceback.format_exc()}")
	        raise gr.Error(f"❌ 提取失败: {str(e)}") from e


	def _parse_examples(examples_json):
	    """Turn the examples JSON text into a list of ``lx.data.ExampleData``.

	    Raises:
	        gr.Error: If the text is not valid JSON or does not have the
	            expected list-of-objects shape.
	    """
	    try:
	        examples_data = json.loads(examples_json)
	        return [
	            lx.data.ExampleData(
	                text=ex["text"],
	                extractions=[
	                    lx.data.Extraction(**extr) for extr in ex["extractions"]
	                ],
	            )
	            for ex in examples_data
	        ]
	    # TypeError covers wrong shapes (e.g. a string where an object is
	    # expected) and unexpected keyword arguments for Extraction.
	    except (json.JSONDecodeError, KeyError, TypeError) as e:
	        raise gr.Error(f"❌ 示例JSON格式错误: {e}") from e


	def _extraction_to_dict(ext):
	    """Convert one extraction object into a JSON-friendly dict."""
	    span = getattr(ext, "char_span", None)
	    if span is None:
	        # NOTE(review): langextract documents the matched position as
	        # `char_interval` (with start_pos / end_pos), not `char_span`;
	        # confirm against the installed version.
	        interval = getattr(ext, "char_interval", None)
	        if interval is not None:
	            span = {
	                "start": getattr(interval, "start_pos", None),
	                "end": getattr(interval, "end_pos", None),
	            }
	    return {
	        "extraction_class": getattr(ext, "extraction_class", ""),
	        "extraction_text": getattr(ext, "extraction_text", ""),
	        "char_span": span,
	        "attributes": getattr(ext, "attributes", {}),
	    }


	def _write_download_file(documents, output_dict):
	    """Write the results to a persistent ``.jsonl`` temp file; return its path."""
	    library_name = "output.jsonl"
	    # delete=False: the file must outlive this call so it can be downloaded.
	    handle = tempfile.NamedTemporaryFile(
	        mode="w", delete=False, suffix=".jsonl", encoding="utf-8"
	    )
	    try:
	        with handle as dest_file, tempfile.TemporaryDirectory() as tmp_dir:
	            # Name the output explicitly so the lookup below matches the file
	            # the library writes instead of depending on its default name.
	            lx.io.save_annotated_documents(
	                documents, output_dir=tmp_dir, output_name=library_name
	            )
	            source_file_path = os.path.join(tmp_dir, library_name)
	            if os.path.exists(source_file_path):
	                with open(source_file_path, "r", encoding="utf-8") as src_file:
	                    dest_file.write(src_file.read())
	            else:
	                # Fallback: a single compact object per line keeps the file
	                # valid JSON Lines (indented JSON would span many lines).
	                json.dump(output_dict, dest_file, ensure_ascii=False)
	                dest_file.write("\n")
	    except Exception:
	        # Do not leave an orphaned temp file behind when saving fails.
	        os.unlink(handle.name)
	        raise
	    return handle.name


	def load_template(template_name):
	    """Return the ``(prompt, example)`` pair of a preset template.

	    Args:
	        template_name: Key to look up in ``REPORT_TEMPLATES``.

	    Returns:
	        The template's ``"prompt"`` and ``"example"`` strings, or two empty
	        strings when no template has that name.
	    """
	    if template_name not in REPORT_TEMPLATES:
	        return "", ""
	    chosen = REPORT_TEMPLATES[template_name]
	    return chosen["prompt"], chosen["example"]

	def load_sample_text(sample_name):
	    """Return the sample text stored under *sample_name*.

	    Args:
	        sample_name: Key to look up in ``SAMPLE_TEXTS``.

	    Returns:
	        The matching sample text, or an empty string for an unknown name.
	    """
	    if sample_name in SAMPLE_TEXTS:
	        return SAMPLE_TEXTS[sample_name]
	    return ""

	# --- Gradio interface ---
	# Stylesheet passed to gr.Blocks(css=...). It styles the element with id
	# "header" (gradient banner and its h1 / p), the "template-btn" class and the
	# element with id "stats-box" (the statistics panel).
	custom_css = """
	#header {
	text-align: center;
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	padding: 2rem;
	border-radius: 10px;
	color: white;
	margin-bottom: 2rem;
	}
	#header h1 {
	margin: 0;
	font-size: 2.5rem;
	font-weight: 700;
	}
	#header p {
	margin: 0.5rem 0 0 0;
	font-size: 1.1rem;
	opacity: 0.95;
	}
	.template-btn {
	margin: 0.25rem !important;
	}
	#stats-box {
	background: #f8f9fa;
	padding: 1rem;
	border-radius: 8px;
	border-left: 4px solid #667eea;
	}
	"""

	# Build the application UI and bind it to `demo`.
	# NOTE(review): nesting indentation is not recoverable from this copy of the
	# file; the comments below describe statement order only, not layout nesting.
	with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="医学信息智能提取系统") as demo:

	# Header banner (styled through the "#header" rules in custom_css).
	with gr.Row(elem_id="header"):
	gr.Markdown("""
	# 🏥 医学信息智能提取系统
	### 基于 AI 的结构化医学文本分析工具
	""")

	with gr.Row():
	# Configuration: API key, template choice, prompt and examples.
	with gr.Column(scale=1):
	gr.Markdown("### 🔐 API 配置")
	# Password-type textbox, so the key is masked while typing.
	api_key_input = gr.Textbox(
	label="Google AI Studio API Key",
	type="password",
	placeholder="请输入您的 API 密钥...",
	info="获取密钥: https://aistudio.google.com/app/apikey"
	)

	gr.Markdown("### 📋 选择报告类型")
	# Choices are the REPORT_TEMPLATES keys; the first template is preselected.
	template_selector = gr.Radio(
	choices=list(REPORT_TEMPLATES.keys()),
	value="影像报告",
	label="预设模板",
	info="选择适合您文本类型的模板"
	)

	load_template_btn = gr.Button("📥 加载模板", variant="secondary", size="sm")

	gr.Markdown("### ✏️ 自定义提取规则")
	# Prompt and examples start out filled with the "影像报告" template.
	prompt_input = gr.Textbox(
	label="提取指令",
	value=REPORT_TEMPLATES["影像报告"]["prompt"],
	lines=8,
	placeholder="描述您想提取的信息类型和规则..."
	)

	with gr.Accordion("🎯 提取示例 (JSON格式)", open=False):
	examples_input = gr.Code(
	value=REPORT_TEMPLATES["影像报告"]["example"],
	language="json",
	lines=15,
	label="示例数据"
	)

	# Input: sample picker, text area and action buttons.
	with gr.Column(scale=1):
	gr.Markdown("### 📄 输入医学文本")

	sample_selector = gr.Dropdown(
	choices=list(SAMPLE_TEXTS.keys()),
	label="快速加载示例",
	value=None
	)

	text_input = gr.Textbox(
	label="待提取文本",
	lines=18,
	placeholder="请粘贴或输入医学报告、病历等文本...\n\n支持:\n• 影像报告 (CT/MRI/X线等)\n• 病理报告\n• 病历记录\n• 检验报告",
	max_lines=25
	)

	with gr.Row():
	clear_btn = gr.Button("🗑️ 清空", size="sm")
	submit_btn = gr.Button("🚀 开始提取", variant="primary", size="lg", scale=2)

	# Output: result tabs (JSON, statistics, history) and the download file.
	with gr.Column(scale=1):
	gr.Markdown("### ✨ 提取结果")

	result_tabs = gr.Tabs()
	with result_tabs:
	with gr.Tab("📊 结构化数据"):
	json_output = gr.JSON(label="提取结果", show_label=False)

	with gr.Tab("📈 统计分析"):
	stats_output = gr.Markdown("点击'开始提取'后显示统计信息", elem_id="stats-box")

	with gr.Tab("📜 历史记录"):
	history_output = gr.Markdown("暂无提取历史")

	# Created hidden; the submit handler's last output makes it visible.
	file_output = gr.File(label="💾 下载结果文件 (.jsonl)", visible=False)

	# Collapsed usage instructions.
	with gr.Accordion("ℹ️ 使用说明", open=False):
	gr.Markdown("""
	### 使用步骤
	1. 输入 API 密钥: 从 Google AI Studio 获取免费 API 密钥
	2. 选择模板: 根据文本类型选择预设模板，或自定义提取规则
	3. 输入文本: 粘贴您的医学报告或病历文本
	4. 开始提取: 点击提取按钮，AI 将自动识别并结构化关键信息
	5. 查看结果: 在右侧查看结构化数据、统计分析和历史记录

	### 支持的报告类型
	- 影像报告: CT、MRI、X线、超声等各类影像学检查
	- 病理报告: 组织病理、细胞病理、免疫组化等
	- 病历记录: 入院记录、病程记录、出院小结等

	### 提示
	- 提供高质量的示例可显著提升提取准确度
	- 可同时处理多份报告（用空行分隔）
	- 结果可导出为 JSONL 格式供后续分析使用
	""")

	# --- Event wiring ---
	# Load the selected template into the prompt and examples editors.
	load_template_btn.click(
	fn=load_template,
	inputs=[template_selector],
	outputs=[prompt_input, examples_input]
	)

	# Choosing a sample replaces the input text with that sample.
	sample_selector.change(
	fn=load_sample_text,
	inputs=[sample_selector],
	outputs=[text_input]
	)

	# Clearing resets only the input text box.
	clear_btn.click(
	fn=lambda: "",
	outputs=[text_input]
	)

	# NOTE(review): file_output is listed twice on purpose or by accident:
	# extract_information returns the download path for the second slot and
	# gr.update(visible=True) for the fifth. Confirm that the Gradio version in
	# use applies both updates to a component repeated in `outputs`.
	submit_btn.click(
	fn=extract_information,
	inputs=[api_key_input, prompt_input, examples_input, text_input, template_selector],
	outputs=[json_output, file_output, stats_output, history_output, file_output]
	)

	# Start the Gradio app only when this file is run as a script.
	if __name__ == "__main__":
	demo.launch()