yasserrmd committed on
Commit
39ea117
·
verified ·
1 Parent(s): 2803b23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -21
app.py CHANGED
@@ -3,50 +3,49 @@ from docling.document_converter import DocumentConverter
3
  from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
4
  import spaces
5
 
 
6
  @spaces.GPU
7
  def convert_document(file, output_format):
8
  pdf_opts = PdfPipelineOptions(
9
  do_ocr=True,
10
- ocr_options=TesseractCliOcrOptions(lang=["eng"]), # or ["eng", "ara"]
11
  )
12
- # Load document and convert it using Docling
13
  converter = DocumentConverter(pipeline_options=pdf_opts)
14
  result = converter.convert(file.name)
15
 
16
- # Check available attributes in DoclingDocument
17
- available_attributes = dir(result.document)
18
-
19
- # Choose the output format
20
  if output_format == "Markdown":
21
  converted_text = result.document.export_to_markdown()
22
  elif output_format == "JSON":
23
- converted_text = result.document.export_to_json()
 
 
24
  else:
25
- converted_text = "Unsupported format"
26
 
27
- # Placeholder metadata extraction based on available attributes
28
- metadata = {
29
- "Available Attributes": available_attributes
30
- }
31
 
32
  return converted_text, metadata
33
 
34
- # Define the Gradio interface using the new component syntax
35
  with gr.Blocks() as app:
36
- gr.Markdown("# Document Converter with Docling")
37
- gr.Markdown("Upload a document, choose the output format, and get the converted text with metadata.")
 
 
 
 
38
 
39
- file_input = gr.File(label="Upload Document")
40
- format_input = gr.Radio(["Markdown", "JSON"], label="Choose Output Format")
41
- output_text = gr.Textbox(label="Converted Document")
42
  output_metadata = gr.JSON(label="Metadata")
43
 
44
- # Define the process button and bind it to the function
45
  convert_button = gr.Button("Convert")
46
  convert_button.click(
47
- convert_document,
48
  inputs=[file_input, format_input],
49
  outputs=[output_text, output_metadata]
50
  )
51
 
52
- app.launch(debug=True)
 
3
  from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
4
  import spaces
5
 
6
+ # GPU decorator not really required for Docling OCR, but kept if you want
7
  @spaces.GPU
8
  def convert_document(file, output_format):
9
  pdf_opts = PdfPipelineOptions(
10
  do_ocr=True,
11
+ ocr_options=TesseractCliOcrOptions(lang=["eng"]),
12
  )
13
+
14
  converter = DocumentConverter(pipeline_options=pdf_opts)
15
  result = converter.convert(file.name)
16
 
17
+ # Choose output format safely
 
 
 
18
  if output_format == "Markdown":
19
  converted_text = result.document.export_to_markdown()
20
  elif output_format == "JSON":
21
+ # JSON needs to be dumped into a string for the Textbox
22
+ import json
23
+ converted_text = json.dumps(result.document.export_to_json(), indent=2)
24
  else:
25
+ converted_text = "⚠️ Unsupported format"
26
 
27
+ # Metadata always JSON-friendly
28
+ metadata = {"Available Attributes": dir(result.document)}
 
 
29
 
30
  return converted_text, metadata
31
 
32
+
33
  with gr.Blocks() as app:
34
+ gr.Markdown("# 📄 Document Converter with Docling OCR")
35
+ gr.Markdown("Upload a PDF, choose the output format, and get the converted text + metadata.")
36
+
37
+ with gr.Row():
38
+ file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
39
+ format_input = gr.Radio(["Markdown", "JSON"], label="Choose Output Format")
40
 
41
+ output_text = gr.Textbox(label="Converted Document", lines=20)
 
 
42
  output_metadata = gr.JSON(label="Metadata")
43
 
 
44
  convert_button = gr.Button("Convert")
45
  convert_button.click(
46
+ fn=convert_document,
47
  inputs=[file_input, format_input],
48
  outputs=[output_text, output_metadata]
49
  )
50
 
51
+ app.launch(debug=True)