"""Gradio web app that extracts structured information from medical text.

The extraction itself is delegated to the ``langextract`` library; this module
supplies preset prompt/example templates, sample inputs and the UI wiring.
"""
import gradio as gr
import langextract as lx
import json
import os
import tempfile
import textwrap
from datetime import datetime
# --- Preset template library ---
# Maps a report-type label (shown in the UI) to:
#   "prompt":  the extraction instructions sent to the model, and
#   "example": a JSON string holding one few-shot example (source text plus
#              the extractions expected from it).
REPORT_TEMPLATES = {
    "影像报告": {
        "prompt": textwrap.dedent("""\
            请从影像检查报告中提取以下关键信息:
            - 检查部位和器官
            - 病灶的位置、大小、密度/信号特征
            - 增强扫描表现
            - 阴性发现(未见异常的部位)
            提取规则:
            1. 必须使用报告中的原文,不要改写
            2. 保留数值的精确度
            3. 完整提取解剖学描述"""),
        "example": json.dumps(
            [
                {
                    "text": "胸部CT平扫显示:右肺上叶尖段见结节影,大小约1.2 x 0.9 cm,边缘毛糙,密度不均。纵隔淋巴结未见明显肿大。",
                    "extractions": [
                        {
                            "extraction_class": "anatomy",
                            "extraction_text": "右肺上叶尖段",
                            "attributes": {"organ": "肺", "side": "右", "lobe": "上叶", "segment": "尖段"},
                        },
                        {
                            "extraction_class": "finding",
                            "extraction_text": "结节影",
                            "attributes": {"type": "结节"},
                        },
                        {
                            "extraction_class": "size",
                            "extraction_text": "1.2 x 0.9 cm",
                            "attributes": {"length": "1.2", "width": "0.9", "unit": "cm"},
                        },
                        {
                            "extraction_class": "feature",
                            "extraction_text": "边缘毛糙",
                            "attributes": {"feature_type": "边缘", "description": "毛糙"},
                        },
                        {
                            "extraction_class": "feature",
                            "extraction_text": "密度不均",
                            "attributes": {"feature_type": "密度", "description": "不均"},
                        },
                        {
                            "extraction_class": "negative_finding",
                            "extraction_text": "纵隔淋巴结未见明显肿大",
                            "attributes": {"location": "纵隔", "structure": "淋巴结"},
                        },
                    ],
                }
            ],
            ensure_ascii=False,
            indent=2,
        ),
    },
    "病理报告": {
        "prompt": textwrap.dedent("""\
            请从病理报告中提取以下关键信息:
            - 标本来源和类型
            - 组织学分型
            - 肿瘤分级
            - 免疫组化结果
            - 切缘情况
            提取规则:
            1. 完整保留病理诊断术语
            2. 提取所有阳性和阴性的免疫组化标记
            3. 保留分级和分期信息"""),
        "example": json.dumps(
            [
                {
                    "text": "(右乳腺)浸润性导管癌,组织学II级,肿瘤大小2.5cm。免疫组化:ER(+,90%),PR(+,70%),HER-2(-),Ki-67(30%)。切缘未见癌累及。",
                    "extractions": [
                        {
                            "extraction_class": "specimen",
                            "extraction_text": "右乳腺",
                            "attributes": {"location": "右乳腺"},
                        },
                        {
                            "extraction_class": "diagnosis",
                            "extraction_text": "浸润性导管癌",
                            "attributes": {"type": "癌", "subtype": "浸润性导管癌"},
                        },
                        {
                            "extraction_class": "grade",
                            "extraction_text": "组织学II级",
                            "attributes": {"grading_system": "组织学", "grade": "II"},
                        },
                        {
                            "extraction_class": "size",
                            "extraction_text": "2.5cm",
                            "attributes": {"value": "2.5", "unit": "cm"},
                        },
                        {
                            "extraction_class": "ihc_marker",
                            "extraction_text": "ER(+,90%)",
                            "attributes": {"marker": "ER", "result": "阳性", "percentage": "90"},
                        },
                        {
                            "extraction_class": "ihc_marker",
                            "extraction_text": "PR(+,70%)",
                            "attributes": {"marker": "PR", "result": "阳性", "percentage": "70"},
                        },
                        {
                            "extraction_class": "ihc_marker",
                            "extraction_text": "HER-2(-)",
                            "attributes": {"marker": "HER-2", "result": "阴性"},
                        },
                        {
                            "extraction_class": "ihc_marker",
                            "extraction_text": "Ki-67(30%)",
                            "attributes": {"marker": "Ki-67", "percentage": "30"},
                        },
                        {
                            "extraction_class": "margin",
                            "extraction_text": "切缘未见癌累及",
                            "attributes": {"status": "阴性"},
                        },
                    ],
                }
            ],
            ensure_ascii=False,
            indent=2,
        ),
    },
    "病历摘要": {
        "prompt": textwrap.dedent("""\
            请从病历中提取以下关键信息:
            - 主诉和现病史要点
            - 既往史(重要疾病和手术史)
            - 体格检查阳性体征
            - 辅助检查异常结果
            - 诊断和治疗方案
            提取规则:
            1. 提取关键时间节点
            2. 保留症状的完整描述
            3. 提取数值型指标"""),
        "example": json.dumps(
            [
                {
                    "text": "患者因反复胸痛3月加重1周入院。既往有高血压病史10年。查体:BP 150/95mmHg,心率92次/分。",
                    "extractions": [
                        {
                            "extraction_class": "chief_complaint",
                            "extraction_text": "反复胸痛3月加重1周",
                            "attributes": {"symptom": "胸痛", "duration": "3月", "change": "加重1周"},
                        },
                        {
                            "extraction_class": "past_history",
                            "extraction_text": "高血压病史10年",
                            "attributes": {"disease": "高血压", "duration": "10年"},
                        },
                        {
                            "extraction_class": "vital_sign",
                            "extraction_text": "BP 150/95mmHg",
                            "attributes": {"type": "血压", "systolic": "150", "diastolic": "95", "unit": "mmHg"},
                        },
                        {
                            "extraction_class": "vital_sign",
                            "extraction_text": "心率92次/分",
                            "attributes": {"type": "心率", "value": "92", "unit": "次/分"},
                        },
                    ],
                }
            ],
            ensure_ascii=False,
            indent=2,
        ),
    },
}
# --- Sample text library ---
# Maps a sample label (shown in the "quick load" dropdown) to a ready-made
# input text. Each value is a single string; adjacent literals are joined by
# the parser, one sentence per source line for readability.
SAMPLE_TEXTS = {
    "影像报告示例": (
        "头颅MRI平扫增强:左侧基底节区见片状异常信号影,T1WI呈低信号,T2WI及FLAIR呈高信号,大小约4.5 x 3.2 x 3.8 cm,周围见大片水肿带。"
        "增强扫描后病灶呈不均匀明显强化,周围水肿带未见强化。"
        "中线结构轻度右移约0.5cm。"
        "双侧侧脑室对称,未见明显扩大。"
    ),
    "病理报告示例": (
        "(胃窦)腺癌,中分化,浸润至肌层。"
        "肿瘤大小3.0 x 2.5 cm。"
        "免疫组化:CK(+),CK7(-),CK20(+),CDX-2(+),Her-2(1+),Ki-67阳性指数约40%。"
        "送检淋巴结12枚,见癌转移3枚(3/12)。"
    ),
    "病历示例": (
        "患者女性58岁,因发现左乳腺肿物2月入院。"
        "患者2月前无意中发现左乳腺外上象限肿物,约2cm大小,无疼痛,无乳头溢液。"
        "既往体健。"
        "查体:左乳外上象限可触及约2.5 x 2.0cm肿物,质硬,边界欠清,活动度差,无压痛。"
        "左侧腋窝可触及1枚肿大淋巴结,约1.5cm。"
        "辅助检查:乳腺超声示左乳外上象限低回声结节,BI-RADS 4C类。"
    ),
}
# --- History management ---
# In-memory list of past extraction records. save_to_history() inserts new
# records at index 0 and drops the last one once the list exceeds 10 entries.
# NOTE(review): as a module-level global this list is shared by everything
# running in this process, so previews of one user's text may be shown to
# another user of the same app instance — confirm that is acceptable.
extraction_history: list = []
def save_to_history(input_text, result, template_name):
    """Prepend one extraction run to the history and return the rendered log.

    Args:
        input_text: The text that was submitted for extraction.
        result: Result dict; its ``"extractions"`` list is counted.
        template_name: Label of the template selected for this run.

    Returns:
        The Markdown produced by ``format_history_display()`` after the
        new record has been stored.
    """
    # Only the first 100 characters are kept as a preview; longer inputs
    # are marked with a trailing ellipsis.
    if len(input_text) > 100:
        preview = input_text[:100] + "..."
    else:
        preview = input_text

    record = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "template": template_name,
        "input_preview": preview,
        "extraction_count": len(result.get("extractions", [])),
        "full_result": result,
    }

    # Newest first; once there are more than 10 records, drop the oldest.
    extraction_history.insert(0, record)
    if len(extraction_history) > 10:
        del extraction_history[-1]

    return format_history_display()
def format_history_display():
    """Render ``extraction_history`` as Markdown.

    Returns:
        A placeholder message when the history is empty, otherwise one
        Markdown section per record (numbered from 1, in list order) with
        its timestamp, template name, extraction count and text preview.
    """
    if not extraction_history:
        return "暂无提取历史"

    sections = [
        f"### 记录 {position} - {record['timestamp']}\n"
        f"**模板**: {record['template']} | **提取项**: {record['extraction_count']}项\n"
        f"**文本预览**: {record['input_preview']}\n\n"
        for position, record in enumerate(extraction_history, start=1)
    ]
    return "".join(sections)
# --- Statistics generation ---
def generate_statistics(result):
    """Summarise the extractions in ``result`` as Markdown.

    Args:
        result: Dict whose ``"extractions"`` entry is a list of dicts, each
            optionally carrying an ``"extraction_class"`` key.

    Returns:
        A placeholder message when there are no extractions, otherwise a
        Markdown block with the total count followed by per-class counts
        ordered from most to least frequent.
    """
    items = result.get("extractions", [])
    if not items:
        return "暂无统计信息"

    # Tally extractions per class; entries without a class are grouped
    # under a shared "unclassified" label.
    tally = {}
    for item in items:
        label = item.get("extraction_class", "未分类")
        tally[label] = tally.get(label, 0) + 1

    # Stable sort: classes with equal counts keep first-seen order.
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

    parts = [
        "### 📊 提取统计\n",
        f"**总提取项数**: {len(items)}\n\n",
        "**分类统计**:\n",
    ]
    parts.extend(f"- {label}: {count}项\n" for label, count in ranked)
    return "".join(parts)
# --- Backend processing ---
def _extraction_to_dict(ext):
    """Flatten one extraction object into a JSON-serializable dict."""
    span = getattr(ext, 'char_span', None)
    if span is None:
        # NOTE(review): langextract appears to expose the aligned span as
        # `char_interval` (with start_pos/end_pos) rather than `char_span`;
        # confirm against the installed version.
        span = getattr(ext, 'char_interval', None)
    if span is not None and not isinstance(span, (dict, list, tuple, str, int, float)):
        # Span objects are not JSON-serializable; keep only their offsets.
        span = {
            "start_pos": getattr(span, 'start_pos', None),
            "end_pos": getattr(span, 'end_pos', None),
        }
    return {
        "extraction_class": getattr(ext, 'extraction_class', ""),
        "extraction_text": getattr(ext, 'extraction_text', ""),
        "char_span": span,
        "attributes": getattr(ext, 'attributes', {}),
    }


def _write_download_file(documents, fallback_payload):
    """Write the results to a persistent temporary .jsonl file; return its path."""
    # The file must outlive this call so the UI can offer it for download,
    # hence delete=False. It is closed immediately and reopened by name.
    with tempfile.NamedTemporaryFile(
        mode='w', delete=False, suffix='.jsonl', encoding='utf-8'
    ) as placeholder:
        download_path = placeholder.name

    try:
        # Let langextract serialize into a scratch directory, then copy the
        # result into the persistent file.
        with tempfile.TemporaryDirectory() as tmp_dir:
            # The output name is passed explicitly: the library's default
            # file name is not 'output.jsonl', so relying on the default
            # would make the lookup below miss the generated file.
            lx.io.save_annotated_documents(
                documents, output_dir=tmp_dir, output_name='output.jsonl'
            )
            source_file_path = os.path.join(tmp_dir, 'output.jsonl')
            if os.path.exists(source_file_path):
                with open(source_file_path, 'r', encoding='utf-8') as src_file:
                    content = src_file.read()
            else:
                # Fallback: one JSON object on one line, so the file is
                # still valid JSONL.
                content = json.dumps(fallback_payload, ensure_ascii=False) + "\n"
        with open(download_path, 'w', encoding='utf-8') as dest_file:
            dest_file.write(content)
    except Exception:
        # Do not leave an orphaned temp file behind when saving fails.
        if os.path.exists(download_path):
            os.remove(download_path)
        raise
    return download_path


def extract_information(api_key, prompt, examples_json, input_text, template_name):
    """Run the extraction and build every output the UI displays.

    Args:
        api_key: Google AI Studio API key entered by the user.
        prompt: Extraction instructions passed to the model.
        examples_json: JSON string with a list of few-shot examples, each
            having ``"text"`` and ``"extractions"`` entries.
        input_text: The medical text to extract from.
        template_name: Label of the selected template (stored in history).

    Returns:
        A 5-tuple: the result dict (``source_text`` and ``extractions``),
        the path of the downloadable .jsonl file, the statistics Markdown,
        the history Markdown, and a ``gr.update`` that makes the file
        component visible.

    Raises:
        gr.Error: If the key or text is missing, the examples JSON is
            malformed, or the extraction itself fails.
    """
    if not api_key or not api_key.strip():
        raise gr.Error("⚠️ 请输入您的 Google AI Studio API 密钥")
    if not input_text or not input_text.strip():
        raise gr.Error("⚠️ 请输入待提取的文本内容")

    try:
        examples_data = json.loads(examples_json)
        examples = [
            lx.data.ExampleData(
                text=ex['text'],
                extractions=[lx.data.Extraction(**extr) for extr in ex['extractions']]
            ) for ex in examples_data
        ]
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        # TypeError covers wrongly shaped JSON (e.g. a dict instead of a
        # list, or unexpected keys passed to Extraction) and a missing value.
        raise gr.Error(f"❌ 示例JSON格式错误: {e}") from e

    try:
        # The key is passed per call instead of being written to os.environ:
        # the environment is process-wide, so one user's key would otherwise
        # persist and be visible to every other request.
        results = lx.extract(
            text_or_documents=input_text,
            prompt_description=prompt,
            examples=examples,
            model_id="gemini-2.5-flash",  # recommended: use the latest model
            api_key=api_key.strip(),
        )

        # A list of documents may be returned; only the first is displayed.
        if isinstance(results, list) and len(results) > 0:
            result = results[0]
        else:
            result = results

        extractions_list = []
        if hasattr(result, 'extractions'):
            extractions_list = [_extraction_to_dict(ext) for ext in result.extractions]

        output_dict = {
            "source_text": result.text if hasattr(result, 'text') else input_text,
            "extractions": extractions_list
        }

        documents_to_save = results if isinstance(results, list) else [results]
        download_path = _write_download_file(documents_to_save, output_dict)

        stats = generate_statistics(output_dict)
        history = save_to_history(input_text, output_dict, template_name)
        return output_dict, download_path, stats, history, gr.update(visible=True)
    except Exception as e:
        # Top-level boundary: log the full traceback to the server console
        # and surface a short message in the UI.
        import traceback
        error_detail = traceback.format_exc()
        print(f"详细错误信息:\n{error_detail}")
        raise gr.Error(f"❌ 提取失败: {str(e)}") from e
def load_template(template_name):
    """Look up a preset template by name.

    Args:
        template_name: Key into ``REPORT_TEMPLATES``.

    Returns:
        A ``(prompt, example_json)`` pair for a known template, or a pair
        of empty strings when the name is not a known template.
    """
    if template_name not in REPORT_TEMPLATES:
        return "", ""
    chosen = REPORT_TEMPLATES[template_name]
    return chosen["prompt"], chosen["example"]
def load_sample_text(sample_name):
    """Return the sample text stored under ``sample_name``.

    Args:
        sample_name: Key into ``SAMPLE_TEXTS``.

    Returns:
        The matching sample text, or an empty string for an unknown name.
    """
    if sample_name in SAMPLE_TEXTS:
        return SAMPLE_TEXTS[sample_name]
    return ""
# --- Gradio UI ---
# Stylesheet passed to gr.Blocks(css=...). It targets the elements created
# with elem_id="header" and elem_id="stats-box", plus a .template-btn class.
custom_css = """
#header {
text-align: center;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 2rem;
border-radius: 10px;
color: white;
margin-bottom: 2rem;
}
#header h1 {
margin: 0;
font-size: 2.5rem;
font-weight: 700;
}
#header p {
margin: 0.5rem 0 0 0;
font-size: 1.1rem;
opacity: 0.95;
}
.template-btn {
margin: 0.25rem !important;
}
#stats-box {
background: #f8f9fa;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #667eea;
}
"""
# Build the whole page. Components are declared in display order.
# NOTE(review): the nesting below (header row, one row of three columns,
# help accordion, then event wiring) is inferred from the call sequence —
# confirm it matches the intended layout.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="医学信息智能提取系统") as demo:
    # Header banner, styled through the "#header" CSS rules.
    with gr.Row(elem_id="header"):
        gr.Markdown("""
# 🏥 医学信息智能提取系统
### 基于 AI 的结构化医学文本分析工具
""")
    with gr.Row():
        # Column 1: API key, template choice, prompt and few-shot examples.
        with gr.Column(scale=1):
            gr.Markdown("### 🔐 API 配置")
            api_key_input = gr.Textbox(
                label="Google AI Studio API Key",
                type="password",
                placeholder="请输入您的 API 密钥...",
                info="获取密钥: https://aistudio.google.com/app/apikey"
            )
            gr.Markdown("### 📋 选择报告类型")
            template_selector = gr.Radio(
                choices=list(REPORT_TEMPLATES.keys()),
                value="影像报告",
                label="预设模板",
                info="选择适合您文本类型的模板"
            )
            load_template_btn = gr.Button("📥 加载模板", variant="secondary", size="sm")
            gr.Markdown("### ✏️ 自定义提取规则")
            # Prompt and examples start out filled with the imaging-report
            # template, matching the radio's initial value.
            prompt_input = gr.Textbox(
                label="提取指令",
                value=REPORT_TEMPLATES["影像报告"]["prompt"],
                lines=8,
                placeholder="描述您想提取的信息类型和规则..."
            )
            with gr.Accordion("🎯 提取示例 (JSON格式)", open=False):
                examples_input = gr.Code(
                    value=REPORT_TEMPLATES["影像报告"]["example"],
                    language="json",
                    lines=15,
                    label="示例数据"
                )
        # Column 2: the text to analyse and the action buttons.
        with gr.Column(scale=1):
            gr.Markdown("### 📄 输入医学文本")
            sample_selector = gr.Dropdown(
                choices=list(SAMPLE_TEXTS.keys()),
                label="快速加载示例",
                value=None
            )
            text_input = gr.Textbox(
                label="待提取文本",
                lines=18,
                placeholder="请粘贴或输入医学报告、病历等文本...\n\n支持:\n• 影像报告 (CT/MRI/X线等)\n• 病理报告\n• 病历记录\n• 检验报告",
                max_lines=25
            )
            with gr.Row():
                clear_btn = gr.Button("🗑️ 清空", size="sm")
                submit_btn = gr.Button("🚀 开始提取", variant="primary", size="lg", scale=2)
        # Column 3: results shown as JSON, statistics and history tabs.
        with gr.Column(scale=1):
            gr.Markdown("### ✨ 提取结果")
            result_tabs = gr.Tabs()
            with result_tabs:
                with gr.Tab("📊 结构化数据"):
                    json_output = gr.JSON(label="提取结果", show_label=False)
                with gr.Tab("📈 统计分析"):
                    stats_output = gr.Markdown("点击'开始提取'后显示统计信息", elem_id="stats-box")
                with gr.Tab("📜 历史记录"):
                    history_output = gr.Markdown("暂无提取历史")
            # Hidden until extract_information returns an update making it visible.
            file_output = gr.File(label="💾 下载结果文件 (.jsonl)", visible=False)
    # Collapsible usage instructions.
    # NOTE(review): the help text says several reports can be processed at
    # once when separated by blank lines, but extract_information passes the
    # whole text to lx.extract as a single string — confirm the claim.
    with gr.Accordion("ℹ️ 使用说明", open=False):
        gr.Markdown("""
### 使用步骤
1. **输入 API 密钥**: 从 Google AI Studio 获取免费 API 密钥
2. **选择模板**: 根据文本类型选择预设模板,或自定义提取规则
3. **输入文本**: 粘贴您的医学报告或病历文本
4. **开始提取**: 点击提取按钮,AI 将自动识别并结构化关键信息
5. **查看结果**: 在右侧查看结构化数据、统计分析和历史记录
### 支持的报告类型
- **影像报告**: CT、MRI、X线、超声等各类影像学检查
- **病理报告**: 组织病理、细胞病理、免疫组化等
- **病历记录**: 入院记录、病程记录、出院小结等
### 提示
- 提供高质量的示例可显著提升提取准确度
- 可同时处理多份报告(用空行分隔)
- 结果可导出为 JSONL 格式供后续分析使用
""")
    # --- Event wiring ---
    # Fill the prompt and example editors from the selected template.
    load_template_btn.click(
        fn=load_template,
        inputs=[template_selector],
        outputs=[prompt_input, examples_input]
    )
    # Choosing a sample replaces the contents of the input box.
    sample_selector.change(
        fn=load_sample_text,
        inputs=[sample_selector],
        outputs=[text_input]
    )
    # "Clear" empties only the input text box.
    clear_btn.click(
        fn=lambda: "",
        outputs=[text_input]
    )
    # extract_information returns five values. file_output is listed twice
    # on purpose: the second return value is the file path and the fifth is
    # the visibility update for the same component.
    submit_btn.click(
        fn=extract_information,
        inputs=[api_key_input, prompt_input, examples_input, text_input, template_selector],
        outputs=[json_output, file_output, stats_output, history_output, file_output]
    )
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()