zuminghuang commited on
Commit
4ce053f
·
verified ·
1 Parent(s): 8aa7e7b

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +5 -5
  2. app.py +365 -0
  3. apt.txt +1 -0
  4. requirements.txt +10 -0
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Infinity Parser
3
- emoji: 🐠
4
- colorFrom: purple
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.34.2
8
  app_file: app.py
9
  pinned: false
10
  ---
 
1
  ---
2
+ title: Test
3
+ emoji: 📈
4
+ colorFrom: pink
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.35.0
8
  app_file: app.py
9
  pinned: false
10
  ---
app.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ import json
5
+ import time
6
+ import copy
7
+ import base64
8
+ import asyncio
9
+ import tempfile
10
+ import subprocess
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+ import zipfile
14
+
15
+ import numpy as np
16
+ import gradio as gr
17
+ from PIL import Image
18
+ from pdf2image import convert_from_path
19
+ from loguru import logger
20
+ from openai import OpenAI, AsyncOpenAI
21
+ from gradio_pdf import PDF
22
+
23
+ import uuid
24
+ import tqdm
25
+
26
+ import requests
27
+
28
+
29
def setup_poppler_linux():
    """Install ``poppler-utils`` through ``apt-get`` unless it is already set up.

    The install is skipped when ``pdftoppm`` (shipped by ``poppler-utils``) is
    already on ``PATH``, or when the ``/tmp/poppler`` marker directory left by
    a previous successful run exists.

    Raises:
        subprocess.CalledProcessError: If either ``apt-get`` command exits
            with a non-zero status.
        FileNotFoundError: If ``apt-get`` itself is not available.
    """
    import shutil  # local import: only this function needs it

    poppler_dir = "/tmp/poppler"
    if shutil.which("pdftoppm") or os.path.exists(poppler_dir):
        return

    subprocess.run(["apt-get", "update"], check=True)
    subprocess.run(["apt-get", "install", "-y", "poppler-utils"], check=True)

    # Create the marker only after a successful install, so a failed attempt
    # is retried on the next start instead of being skipped forever.
    os.makedirs(poppler_dir, exist_ok=True)
39
+
40
+ setup_poppler_linux()
41
+
42
+
43
# Ready-made instructions offered in the "Prompt" dropdown; the first entry is
# used as the dropdown's initial value.
preset_prompts = [
    "Please convert the document into Markdown format.",
    "Generate a clean and structured Markdown version of the document.",
    "Transform this content into Markdown with proper headings and bullet points.",
    "Convert the text to Markdown, preserving structure and formatting.",
    "Reformat this document as Markdown with clear sections and lists.",
]
50
+
51
+
52
def send_pdf_to_parse(file_path, server_ip, port, route="/upload", api_key=None, timeout=30):
    """Upload a PDF to ``http://{server_ip}:{port}{route}`` as multipart form data.

    Args:
        file_path: Path of the PDF to upload; sent under the form field ``file``.
        server_ip: Host name or IP address of the receiving server.
        port: TCP port of the receiving server.
        route: URL path of the upload endpoint.
        api_key: Optional token sent as an ``Authorization: Bearer`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would stall the
            calling UI callback on an unreachable server.

    Returns:
        The ``requests.Response`` returned by the server.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
        OSError: If ``file_path`` cannot be opened.
    """
    url = f"http://{server_ip}:{port}{route}"
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    with open(file_path, "rb") as f:
        files = {"file": (os.path.basename(file_path), f, "application/pdf")}
        return requests.post(url, files=files, headers=headers, timeout=timeout)
62
+
63
+
64
def extract_makrdown(text):
    """Return the body of the first ```markdown fenced block found in *text*.

    The closing fence is optional: when the text stops before the fence is
    closed (a partially streamed or truncated reply), everything after the
    opening fence is returned, so callers never see the raw fence marker.

    Args:
        text: Model output that may wrap its answer in a ```markdown fence.

    Returns:
        The stripped fence body, or *text* unchanged when it contains no
        ```markdown fence.
    """
    # ``\Z`` lets the lazy body run to the end of the string when no closing
    # fence has arrived yet.
    m = re.search(r'```markdown\s*([\s\S]*?)(?:```|\Z)', text)
    if m:
        return m.group(1).strip()
    return text
70
+
71
# Connection settings. The key is a fixed placeholder string; the base URL of
# the OpenAI-compatible endpoint and the upload server's address (IP / PORT)
# are read from environment variables and are None when unset.
openai_api_key = "EMPTY"
openai_api_base = os.getenv("openai_api_base")
IP = os.getenv("IP")
PORT = os.getenv("PORT")

# Shared asynchronous client used for every chat-completion request.
client = AsyncOpenAI(api_key=openai_api_key, base_url=openai_api_base)
85
+
86
+
87
async def request(messages):
    """Stream the model's reply to *messages*, yielding text fragments.

    Args:
        messages: Chat messages in the chat-completions format (as produced
            by ``build_message``).

    Yields:
        str: Each non-empty content delta, in arrival order.
    """
    stream = await client.chat.completions.create(
        messages=messages,
        model="Qwen2_5VL",
        max_completion_tokens=4096,
        stream=True,
        temperature=0.0,
        top_p=0.95,
    )

    async for chunk in stream:
        # A chunk may carry no choices at all; indexing it would raise.
        if not chunk.choices:
            continue

        choice = chunk.choices[0]
        content = choice.delta.content
        if content:
            # Emit the text before inspecting finish_reason so that a final
            # chunk carrying both text and a finish reason is not dropped.
            yield content

        if choice.finish_reason is not None:
            print(f"end reason = {choice.finish_reason}")
            break
110
+
111
+
112
def images_to_pdf(img_paths, pdf_path):
    """Combine one or more image files into a single PDF, one image per page.

    Args:
        img_paths: A single path (``str`` or ``Path``) or a sequence of paths.
        pdf_path: Destination of the PDF; missing parent directories are
            created.

    Returns:
        Path: The destination path.

    Raises:
        ValueError: If ``img_paths`` is empty.
        FileNotFoundError: If any of the given paths is not an existing file.
    """
    if isinstance(img_paths, (str, Path)):
        img_paths = [img_paths]

    if not img_paths:
        raise ValueError("img_paths is empty")

    opened = []  # every image object created here; all closed in ``finally``
    try:
        pages = []
        for p in img_paths:
            p = Path(p)
            if not p.is_file():
                raise FileNotFoundError(p)

            img = Image.open(p)
            opened.append(img)
            # Alpha / palette images are flattened to RGB before PDF export.
            if img.mode in ("RGBA", "P"):
                img = img.convert("RGB")
                opened.append(img)
            pages.append(img)

        pdf_path = Path(pdf_path)
        pdf_path.parent.mkdir(parents=True, exist_ok=True)
        pages[0].save(pdf_path,
                      save_all=True,
                      append_images=pages[1:],
                      resolution=300.0)
        return pdf_path
    finally:
        # Release the underlying file handles even when saving fails.
        for img in opened:
            img.close()
137
+
138
+
139
def encode_image(image_path):
    """Read the file at *image_path* and return its bytes as Base64 text."""
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
142
+
143
def build_message(image_path, prompt):
    """Build the chat messages for one image plus a text instruction.

    Args:
        image_path: Path of the image to embed as a Base64 data URL.
        prompt: Instruction text sent alongside the image.

    Returns:
        list: A system message followed by a user message whose content holds
        the image part and the text part.
    """
    import mimetypes  # local import: only this function needs it

    # Label the data URL with the file's real type (PDF pages arrive here as
    # PNG); fall back to JPEG when the extension is unknown or not an image.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    content = [
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:{mime_type};base64,{encode_image(image_path)}"
            }
        },
        {"type": "text", 'text': prompt}
    ]

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {'role': 'user', 'content': content}
    ]

    return messages
163
+
164
+
165
+
166
def download_markdown_file(md_text):
    """Write *md_text* to a new ``.md`` file under ``downloads/``.

    The file name is ``markdown_`` plus eight random hex characters.

    Returns:
        str: Path of the file that was written.
    """
    out_dir = Path("downloads")
    target = out_dir / f"markdown_{uuid.uuid4().hex[:8]}.md"
    out_dir.mkdir(exist_ok=True)
    with target.open("w", encoding="utf-8") as fh:
        fh.write(md_text)
    return str(target)
173
+
174
+
175
async def doc_parser(doc_path, prompt):
    """Parse a PDF or image into Markdown, streaming partial results.

    A PDF is rasterised page by page at 300 dpi and every page is sent to the
    model separately; any other file is sent as a single image.

    Args:
        doc_path: Path of the uploaded PDF or image (``None`` when nothing
            has been uploaded).
        prompt: Instruction text sent with every page.

    Yields:
        tuple[str, str]: ``(markdown, raw)``. While a page is streaming, both
        values cover the page currently being generated; the final item holds
        all pages joined (``"\\n---\\n"`` for the Markdown, a blank line for
        the raw text).

    Raises:
        gr.Error: If no file has been uploaded.
        FileNotFoundError: If ``doc_path`` does not point to a file.
    """
    # Without this guard a missing upload surfaces as an opaque TypeError
    # from ``Path(None)``.
    if doc_path is None:
        raise gr.Error("Please upload a PDF or image before parsing.")

    doc_path = Path(doc_path)
    if not doc_path.is_file():
        raise FileNotFoundError(doc_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        if doc_path.suffix.lower() == ".pdf":
            queries = []
            page_images = convert_from_path(doc_path, dpi=300)
            for idx, page_image in enumerate(page_images, start=1):
                img_path = tmpdir / f"page_{idx}.png"
                page_image.save(img_path, "PNG")
                queries.append(build_message(img_path, prompt))
        else:
            queries = [build_message(doc_path, prompt)]

        all_pages = []
        all_pages_raw = []
        for query in queries:
            raw = ""
            async for chunk in request(query):
                raw += chunk
                yield extract_makrdown(raw), raw
            all_pages.append(extract_makrdown(raw))
            all_pages_raw.append(raw)
        print(all_pages)
        yield "\n---\n".join(all_pages), "\n\n".join(all_pages_raw)
209
+
210
+
211
def compress_directory_to_zip(directory_path, output_zip_path):
    """Zip every file under *directory_path* into *output_zip_path*.

    Entries are stored with paths relative to *directory_path*. The archive
    being written is skipped, so the output may safely live inside the
    directory that is being compressed.

    Args:
        directory_path: Directory whose files are archived recursively.
        output_zip_path: Path of the ZIP file to create.

    Returns:
        int: ``0`` on success, ``-1`` if any error occurred (the error is
        logged).
    """
    try:
        output_real = os.path.realpath(output_zip_path)
        with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _dirs, files in os.walk(directory_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Never add the half-written archive to itself.
                    if os.path.realpath(file_path) == output_real:
                        continue
                    arcname = os.path.relpath(file_path, directory_path)
                    zipf.write(file_path, arcname)
        return 0
    except Exception as e:
        logger.exception(e)
        return -1
228
+
229
# Delimiter pairs handed to the Markdown component for rendering LaTeX;
# ``display`` selects block (True) versus inline (False) rendering.
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
    {"left": "\\(", "right": "\\)", "display": False},
    {"left": "\\[", "right": "\\]", "display": True},
]
235
+
236
def check_prompt(prompt):
    """Return *prompt* unchanged, rejecting a missing or whitespace-only one.

    Raises:
        gr.Error: If *prompt* is empty, ``None`` or only whitespace.
    """
    if prompt and prompt.strip() != "":
        return prompt
    raise gr.Error("Please select or enter a prompt before parsing.")
240
+
241
def to_file(image_path):
    """Map an example thumbnail path to the file that should be loaded.

    The ``Academic_Papers.png`` thumbnail stands for ``Academic_Papers.pdf``;
    every other path is returned unchanged.
    """
    thumbnail = "Academic_Papers.png"
    if not image_path.endswith(thumbnail):
        return image_path
    return image_path.replace(thumbnail, "Academic_Papers.pdf")
247
+
248
def process_file(file_path):
    """Prepare an uploaded file for the PDF preview component.

    A PDF is previewed as-is and, when the upload server is configured, also
    sent there. Any other file is treated as an image and converted to a PDF
    stored next to it.

    Args:
        file_path: Path of the uploaded file, or ``None`` when the upload
            was cleared.

    Returns:
        str | None: Path of the PDF to preview, or ``None`` for no file.
    """
    if file_path is None:
        return None

    # Compare case-insensitively so that e.g. "scan.PDF" is not mistaken for
    # an image.
    if file_path.lower().endswith(".pdf"):
        # The upload's response is not used for the preview, so a missing
        # configuration or an unreachable server must not break it.
        if IP and PORT:
            try:
                send_pdf_to_parse(file_path, IP, PORT)
            except OSError as exc:  # requests' exceptions derive from OSError
                logger.warning(f"Uploading {file_path} to the parse server failed: {exc}")
        return str(file_path)

    pdf_path = Path(file_path).with_suffix(".pdf")
    images_to_pdf(file_path, pdf_path)
    return str(pdf_path)
261
+
262
+
263
if __name__ == '__main__':
    # Builds the Gradio UI: upload / prompt / preview on the left, examples
    # and parse results on the right, then wires the event handlers.
    # NOTE(review): the component nesting below was reconstructed from a
    # flattened listing - confirm the layout against the deployed app.
    with gr.Blocks() as demo:
        with gr.Row():
            # Left panel: input file, prompt selection, actions and preview.
            with gr.Column(variant='panel', scale=5):
                file = gr.File(label='Please upload a PDF or image', file_types=['.pdf', '.png', '.jpeg', '.jpg'], type="filepath")
                prompts = gr.Dropdown(
                    choices=preset_prompts,
                    label="Prompt",
                    info="Enter or select prompts...",
                    value=preset_prompts[0],
                    multiselect=False,
                    interactive=True,
                    allow_custom_value=True,
                )

                with gr.Row():
                    change_bu = gr.Button('Parse')
                    clear_bu = gr.ClearButton(value='Clear')
                pdf_show = PDF(label='Preview', interactive=False, visible=True, height=800)

            # Right panel: example documents, download link and results.
            with gr.Column(variant='panel', scale=5):
                with gr.Accordion("Examples", open=True):
                    example_root = "examples"
                    file_path = [
                        os.path.join(example_root, f)
                        for f in ["Financial_Reports.png", "Books.png", "Magazines.png", "Academic_Papers.png"]
                    ]

                    with gr.Row():
                        for i, label in enumerate(["Financial Reports(IMG)", "Books(IMG)", "Magazines(IMG)", "Academic Papers(PDF)"]):
                            with gr.Column(scale=1, min_width=120):
                                gr.Image(
                                    value=file_path[i],
                                    width=120,
                                    height=90,
                                    show_label=False,
                                    show_download_button=False
                                )
                                # Clicking loads the example into the upload box;
                                # to_file swaps the PDF example's thumbnail for the PDF.
                                gr.Button(label).click(fn=to_file, inputs=gr.State(file_path[i]), outputs=file)

                download_btn = gr.Button("⬇️ Generate download link", size="sm")
                output_file = gr.File(label='Parse result', interactive=False, elem_id="down-file-box",visible=False)

                gr.HTML("""
                <style>
                #down-file-box {
                max-height: 300px;
                }
                </style>
                """)
                with gr.Tabs():
                    with gr.Tab('Markdown rendering'):
                        md = gr.Markdown(label='Markdown rendering', height=1100, show_copy_button=True,
                                         latex_delimiters=latex_delimiters,
                                         line_breaks=True)
                    with gr.Tab('Markdown text'):
                        md_text = gr.TextArea(lines=45, show_copy_button=True)

        # Refresh the preview whenever the uploaded file changes.
        file.change(fn=process_file, inputs=file, outputs=pdf_show)

        # Parse: validate the prompt, hide any stale download, then stream.
        change_bu.click(
            fn=check_prompt,
            inputs=prompts,
            outputs=prompts
        ).then(
            lambda f: gr.update(visible=False),
            inputs=output_file,
            outputs=output_file
        ).then(
            fn=doc_parser,
            inputs=[file, prompts],
            outputs=[md, md_text]
        )

        clear_bu.add([file, md, pdf_show, md_text])

        # Write the Markdown text to a file, then reveal the download box.
        download_btn.click(
            fn=download_markdown_file,
            inputs=md_text,
            outputs=output_file
        ).then(
            lambda f: gr.update(visible=True),
            inputs=output_file,
            outputs=output_file
        )

    demo.launch(server_name='0.0.0.0',share=True)
apt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ poppler-utils
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ gradio-pdf>=0.0.11
3
+ pdf2image>=1.16.3
4
+ pillow>=10.0.0
5
+ numpy>=1.24.0
6
+ openai>=1.0.0
7
+ loguru>=0.7.0
8
+ tqdm>=4.66.0
9
+ requests>=2.31.0
10
+ PyMuPDF>=1.23.0