# Hugging Face Spaces status banner captured with this file: Running
import os | |
import re | |
import sys | |
import json | |
import time | |
import copy | |
import base64 | |
import asyncio | |
import tempfile | |
import subprocess | |
from pathlib import Path | |
from datetime import datetime | |
import zipfile | |
import httpx, aiofiles, os, asyncio | |
import numpy as np | |
import gradio as gr | |
from PIL import Image | |
from pdf2image import convert_from_path | |
from loguru import logger | |
from openai import OpenAI, AsyncOpenAI | |
from gradio_pdf import PDF | |
import aiohttp | |
import uuid | |
import tqdm | |
import requests | |
def setup_poppler_linux():
    """Install ``poppler-utils`` through ``apt-get`` unless it is already set up.

    The directory ``/tmp/poppler`` is used as a "setup already done" marker.
    It is created only once the tools are known to be available, so a failed
    installation is retried on the next call instead of being skipped forever.

    Raises:
        subprocess.CalledProcessError: If ``apt-get update`` or
            ``apt-get install`` exits with a non-zero status.
    """
    import shutil  # local import: only this function needs it

    poppler_dir = "/tmp/poppler"
    if os.path.exists(poppler_dir):
        return

    # ``pdftoppm`` is one of the binaries shipped by poppler-utils; when it is
    # already on PATH there is nothing to install.
    if shutil.which("pdftoppm") is None:
        subprocess.run(["apt-get", "update"], check=True)
        subprocess.run(
            ["apt-get", "install", "-y", "poppler-utils"],
            check=True,
        )

    # Record success last so an exception above leaves no stale marker behind.
    os.makedirs(poppler_dir, exist_ok=True)
# Runs at import time, before any PDF is handed to pdf2image.
setup_poppler_linux()

# Prompt suggestions offered in the UI dropdown; the first entry is used as
# the dropdown's initial value.
preset_prompts = [
    "Please convert the document into Markdown format.",
    "Generate a clean and structured Markdown version of the document.",
    "Transform this content into Markdown with proper headings and bullet points.",
    "Convert the text to Markdown, preserving structure and formatting.",
    "Reformat this document as Markdown with clear sections and lists.",
]
def send_pdf_to_parse(file_path, server_ip, port, route="/upload", api_key=None):
    """Upload a PDF to the parsing service with a blocking multipart POST.

    Args:
        file_path: Path of the PDF to upload.
        server_ip: Host name or IP of the service, or a full base URL that
            already carries a scheme (``http://...``). When falsy, the
            module-level ``openai_api_base`` is used instead.
        port: Port appended to a bare host; ignored when falsy or when
            ``server_ip`` is already a full base URL.
        route: URL path of the upload endpoint.
        api_key: Optional bearer token sent in the ``Authorization`` header.

    Returns:
        The ``requests.Response`` returned by the upload request.
    """
    if not server_ip:
        # Previous behaviour: the target was always the configured base URL.
        base = openai_api_base
    elif "://" in str(server_ip):
        # Already a complete base URL (same convention as the aiohttp helper).
        base = str(server_ip)
    elif port:
        base = f"http://{server_ip}:{port}"
    else:
        base = f"http://{server_ip}"
    url = f"{base}{route}"

    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    with open(file_path, "rb") as f:
        files = {"file": (os.path.basename(file_path), f, "application/pdf")}
        response = requests.post(url, files=files, headers=headers)
    return response
async def send_pdf_async_aiohttp(file_path, server_ip, route="/upload", Authorization=None):
    """Upload a PDF asynchronously with aiohttp as a multipart POST.

    Args:
        file_path: Path of the PDF to upload.
        server_ip: Base URL of the service (scheme and host included); the
            route is appended to it verbatim.
        route: URL path of the upload endpoint.
        Authorization: Optional bearer token for the ``Authorization`` header.

    Returns:
        The ``aiohttp.ClientResponse`` with its body already buffered, or
        ``None`` when anything went wrong (the error is printed, not raised).
    """
    url = f"{server_ip}{route}"
    headers = {}
    if Authorization:
        headers["Authorization"] = f"Bearer {Authorization}"
    try:
        async with aiohttp.ClientSession() as session:
            with open(file_path, "rb") as f:
                data = aiohttp.FormData()
                data.add_field(
                    'file',
                    f,
                    filename=os.path.basename(file_path),
                    content_type='application/pdf',
                )
                async with session.post(url, data=data, headers=headers) as response:
                    # Buffer the body now: once the context managers exit the
                    # connection is released and an unread body is lost.
                    await response.read()
                    print(f"PDF发送成功: {file_path}, 状态码: {response.status}")
                    return response
    except Exception as e:
        # Best-effort upload: report the failure and let the caller carry on.
        print(f"PDF发送失败: {file_path}, 错误: {e}")
        return None
def extract_makrdown(text):
    """Return the body of the first ```` ```markdown ```` fenced block in *text*.

    The fence may still be open (for example while a response is streaming or
    after it was cut off); in that case everything after the opening fence is
    returned. Text without a ```` ```markdown ```` opener is returned unchanged.

    Args:
        text: Raw model output.

    Returns:
        The stripped fence content, or *text* itself when no fence is found.
    """
    # ``(?:```|\Z)`` ends the capture at the closing fence when there is one
    # and at the end of the string otherwise.
    m = re.search(r'```markdown\s*([\s\S]*?)(?:```|\Z)', text)
    if m:
        return m.group(1).strip()
    return text
# Connection settings, all read from the environment.
# The API key is a placeholder; the real credential travels in the
# ``Authorization`` header that request() attaches to every call.
openai_api_key = "EMPTY"
openai_api_base = os.environ.get("openai_api_base")
IP = os.environ.get("IP")
PORT = os.environ.get("PORT")
Authorization = os.environ.get("Authorization")

if not openai_api_base:
    # Fail with an actionable message instead of ``None + "/v1"`` TypeError.
    raise RuntimeError(
        "Environment variable 'openai_api_base' must be set to the base URL "
        "of the inference service."
    )

# Shared async client for the OpenAI-compatible endpoint under ``/v1``.
client = AsyncOpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base + "/v1",
)
async def request(messages):
    """Stream a chat completion and yield its text fragments as they arrive.

    Args:
        messages: Chat messages in the OpenAI chat-completions format.

    Yields:
        Each non-empty ``delta.content`` string of the streamed response.
        Iteration stops at the first chunk that reports a finish reason.
    """
    stream = await client.chat.completions.create(
        messages=messages,
        extra_headers={
            "Authorization": f"Bearer {Authorization}"
        },
        model="Qwen2_5VL",
        max_completion_tokens=4096,
        stream=True,
        temperature=0.0,
        top_p=0.95,
    )
    async for chunk in stream:
        if not chunk.choices:
            # Chunks without choices carry no text; skip instead of indexing.
            continue
        choice = chunk.choices[0]
        content = choice.delta.content
        if content:
            # Emit the text before looking at finish_reason so a final chunk
            # that carries both does not lose its fragment.
            yield content
        if choice.finish_reason is not None:
            print(f"end reason = {choice.finish_reason}")
            break
def images_to_pdf(img_paths, pdf_path):
    """Combine one or more image files into a single PDF, one image per page.

    Args:
        img_paths: A single path (``str`` or ``Path``) or a sequence of paths.
        pdf_path: Destination of the PDF; missing parent directories are
            created.

    Returns:
        ``pdf_path`` as a ``Path``.

    Raises:
        ValueError: If ``img_paths`` is empty.
        FileNotFoundError: If one of the paths is not an existing file.
    """
    if isinstance(img_paths, (str, Path)):
        img_paths = [img_paths]
    if not img_paths:
        raise ValueError("img_paths is empty")

    # Every Image created here (originals and converted copies) is tracked so
    # the underlying file handles are released even when saving fails.
    opened = []
    try:
        pages = []
        for p in img_paths:
            p = Path(p)
            if not p.is_file():
                raise FileNotFoundError(p)
            img = Image.open(p)
            opened.append(img)
            if img.mode in ("RGBA", "P"):
                # Alpha / palette images are flattened to RGB before saving.
                img = img.convert("RGB")
                opened.append(img)
            pages.append(img)

        pdf_path = Path(pdf_path)
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        pages[0].save(
            pdf_path,
            save_all=True,
            append_images=pages[1:],
            resolution=300.0,
        )
    finally:
        for img in opened:
            img.close()
    return pdf_path
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
def build_message(image_path, prompt):
    """Build the chat messages that ask the model about one image.

    Args:
        image_path: Path of the image to embed as a base64 data URL.
        prompt: Instruction text sent together with the image.

    Returns:
        A two-element message list: a fixed system message followed by a user
        message whose content holds the image and then the prompt text.
    """
    import mimetypes  # local import: only this function needs it

    # Declare the real image type (PDF pages arrive here as PNG files); fall
    # back to the previously hard-coded JPEG when the extension is unknown.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    content = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encode_image(image_path)}"
            },
        },
        {"type": "text", 'text': prompt},
    ]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {'role': 'user', 'content': content},
    ]
    return messages
def download_markdown_file(md_text):
    """Write *md_text* to a new, randomly named file under ``downloads/``.

    Returns:
        The path of the written ``.md`` file as a string.
    """
    target_dir = Path("downloads")
    target_dir.mkdir(exist_ok=True)

    # Eight hex characters of a fresh UUID give each export its own name.
    random_part = uuid.uuid4().hex[:8]
    target = target_dir / f"markdown_{random_part}.md"

    with open(target, "w", encoding="utf-8") as out:
        out.write(md_text)
    return str(target)
async def doc_parser(doc_path, prompt):
    """Convert a PDF or image to Markdown, streaming partial results.

    A PDF is rasterised at 300 dpi and every page is sent to the model as its
    own request; any other file is sent as a single image.

    Args:
        doc_path: Path of the uploaded PDF or image.
        prompt: Instruction text sent with every page.

    Yields:
        ``(markdown, raw_text)`` tuples. While a page is streaming, both
        elements cover that page only; the final tuple joins all pages
        (Markdown separated by ``---`` lines, raw text by blank lines).

    Raises:
        gr.Error: If no file was provided.
        FileNotFoundError: If ``doc_path`` does not point to a file.
    """
    if doc_path is None:
        # Parse was triggered without an upload; show a readable message
        # instead of the TypeError that Path(None) would raise.
        raise gr.Error("Please upload a PDF or image before parsing.")
    doc_path = Path(doc_path)
    if not doc_path.is_file():
        raise FileNotFoundError(doc_path)

    queries = []
    if doc_path.suffix.lower() == ".pdf":
        # The page images are base64-encoded inside build_message(), so the
        # temporary directory is only needed while the queries are built.
        with tempfile.TemporaryDirectory() as tmpdir:
            tmpdir = Path(tmpdir)
            page_images = convert_from_path(doc_path, dpi=300)
            for idx, page_image in enumerate(page_images, start=1):
                img_path = tmpdir / f"page_{idx}.png"
                page_image.save(img_path, "PNG")
                queries.append(build_message(img_path, prompt))
    else:
        queries.append(build_message(doc_path, prompt))

    all_pages = []
    all_pages_raw = []
    for query in queries:
        page_text = ""
        async for chunk in request(query):
            page_text += chunk
            yield extract_makrdown(page_text), page_text
        all_pages.append(extract_makrdown(page_text))
        all_pages_raw.append(page_text)
    print(all_pages)
    yield "\n---\n".join(all_pages), "\n\n".join(all_pages_raw)
def compress_directory_to_zip(directory_path, output_zip_path):
    """Pack every file below *directory_path* into a deflate-compressed zip.

    Entries are stored under their path relative to *directory_path*.

    Returns:
        ``0`` on success, ``-1`` if any exception occurred (it is logged).
    """
    try:
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
            for current_dir, _subdirs, filenames in os.walk(directory_path):
                for filename in filenames:
                    absolute = os.path.join(current_dir, filename)
                    relative = os.path.relpath(absolute, directory_path)
                    archive.write(absolute, relative)
    except Exception as e:
        logger.exception(e)
        return -1
    return 0
# Math delimiter pairs handed to the Markdown component: (left, right, and
# whether the formula is rendered in display mode).
latex_delimiters = [
    {'left': opening, 'right': closing, 'display': is_display}
    for opening, closing, is_display in (
        ('$$', '$$', True),
        ('$', '$', False),
        ('\\(', '\\)', False),
        ('\\[', '\\]', True),
    )
]
def check_prompt(prompt):
    """Return *prompt* unchanged, rejecting missing or whitespace-only input.

    Raises:
        gr.Error: If the prompt is empty or consists only of whitespace.
    """
    has_text = bool(prompt) and prompt.strip() != ""
    if has_text:
        return prompt
    raise gr.Error("Please select or enter a prompt before parsing.")
def to_file(image_path):
    """Map the "Academic_Papers" example thumbnail to its PDF counterpart.

    Paths that do not end in ``Academic_Papers.png`` are returned unchanged.
    """
    thumbnail_name = "Academic_Papers.png"
    if not image_path.endswith(thumbnail_name):
        return image_path
    return image_path.replace(thumbnail_name, "Academic_Papers.pdf")
# async def process_file(file_path): | |
# if not file_path.endswith(".pdf"): | |
# tmp_path = Path(file_path).with_suffix(".pdf") | |
# images_to_pdf(file_path, tmp_path) | |
# else: | |
# tmp_path = Path(file_path) | |
# async with httpx.AsyncClient() as client: | |
# await send_pdf_to_parse_async(client, str(tmp_path), IP, PORT) | |
# return str(tmp_path) | |
# Strong references to in-flight upload tasks. The event loop only keeps weak
# references, so an unreferenced task could be garbage-collected before it
# finishes.
_background_tasks = set()


async def process_file(file_path):
    """Prepare an uploaded file for preview and push it to the server.

    Images are first converted to a PDF stored next to the upload. The PDF is
    then uploaded in a background task whose result is not awaited.

    Args:
        file_path: Path of the uploaded file, or ``None`` when the upload was
            cleared.

    Returns:
        The path of the PDF to preview as a string, or ``None`` when
        ``file_path`` is ``None``.
    """
    if file_path is None:
        return None

    # Compare case-insensitively so "scan.PDF" is not mistaken for an image.
    if file_path.lower().endswith(".pdf"):
        tmp_file_path = file_path
    else:
        tmp_file_path = Path(file_path).with_suffix(".pdf")
        images_to_pdf(file_path, tmp_file_path)

    upload_task = asyncio.create_task(
        send_pdf_async_aiohttp(
            tmp_file_path,
            server_ip=openai_api_base,
            Authorization=Authorization,
        )
    )
    _background_tasks.add(upload_task)
    upload_task.add_done_callback(_background_tasks.discard)
    return str(tmp_file_path)
# Script entry point: build the Gradio UI, wire up the events and launch it.
# NOTE(review): the nesting below was reconstructed from a copy that had lost
# its indentation — confirm the layout against the deployed app.
if __name__ == '__main__':
    with gr.Blocks() as demo:
        with gr.Row():
            # First column: upload, prompt selection, action buttons, preview.
            with gr.Column(variant='panel', scale=5):
                file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'], type="filepath")
                prompts = gr.Dropdown(
                    choices=preset_prompts,
                    label="Prompt",
                    info="Enter or select prompts...",
                    value=preset_prompts[0],
                    multiselect=False,
                    interactive=True,
                    allow_custom_value=True,
                )
                with gr.Row():
                    change_bu = gr.Button('Parse')
                    clear_bu = gr.ClearButton(value='Clear')
                pdf_show = PDF(label='Preview', interactive=False, visible=True, height=800)
                example_root = os.path.join(os.path.dirname(__file__), 'examples')
                # NOTE(review): `images` is not referenced again in this block,
                # but os.listdir() still requires the examples directory to exist.
                images = [
                    os.path.join(example_root, f)
                    for f in os.listdir(example_root)
                    if f.lower().endswith(('png', 'jpg', 'jpeg'))
                ]
            # Second column: example shortcuts, download control, result tabs.
            with gr.Column(variant='panel', scale=5):
                with gr.Accordion("Examples", open=True):
                    # Rebinds example_root to a path relative to the working directory.
                    example_root = "examples"
                    file_path = [
                        os.path.join(example_root, f)
                        for f in ["Financial_Reports.png", "Books.png", "Magazines.png", "Academic_Papers.png"]
                    ]
                    with gr.Row():
                        for i, label in enumerate(["Financial Reports(IMG)", "Books(IMG)", "Magazines(IMG)", "Academic Papers(PDF)"]):
                            with gr.Column(scale=1, min_width=120):
                                gr.Image(
                                    value=file_path[i],
                                    width=120,
                                    height=90,
                                    show_label=False,
                                    show_download_button=False
                                )
                                # Clicking loads the example into the upload box;
                                # to_file() swaps the Academic_Papers thumbnail for its PDF.
                                gr.Button(label).click(fn=to_file, inputs=gr.State(file_path[i]), outputs=file)
                download_btn = gr.Button("⬇️ Generate download link", size="sm")
                # Created hidden; the download_btn handler chain below makes it visible.
                output_file = gr.File(label='Parse result', interactive=False, elem_id="down-file-box",visible=False)
                gr.HTML("""
                <style>
                #down-file-box {
                max-height: 300px;
                }
                </style>
                """)
                with gr.Tabs():
                    with gr.Tab('Markdown rendering'):
                        md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
                                         latex_delimiters=latex_delimiters,
                                         line_breaks=True)
                    with gr.Tab('Markdown text'):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)
        # A change of the uploaded file runs process_file and shows its result in the preview.
        file.change(fn=process_file, inputs=file, outputs=pdf_show)
        # Parse: validate the prompt, hide the previous download, then stream
        # doc_parser output into the rendered and the raw Markdown views.
        change_bu.click(
            fn=check_prompt,
            inputs=prompts,
            outputs=prompts
        ).then(
            lambda f: gr.update(visible=False),
            inputs=output_file,
            outputs=output_file
        ).then(
            fn=doc_parser,
            inputs=[file, prompts],
            outputs=[md, md_text]
        )
        # Components reset by the Clear button.
        clear_bu.add([file, md, pdf_show, md_text])
        # Download: write the raw Markdown to a file, then reveal the file box.
        download_btn.click(
            fn=download_markdown_file,
            inputs=md_text,
            outputs=output_file
        ).then(
            lambda f: gr.update(visible=True),
            inputs=output_file,
            outputs=output_file
        )
    demo.launch(server_name='0.0.0.0',share=True)