File size: 1,958 Bytes
e2d728a
ee63712
 
2803b23
ee63712
8e024f6
e2d728a
39ea117
8e024f6
e2d728a
ee63712
2803b23
 
ee63712
2803b23
39ea117
ee63712
 
 
 
 
 
 
 
e2d728a
4678d36
39ea117
e2d728a
 
 
6491463
e2d728a
39ea117
e2d728a
ee63712
39ea117
e2d728a
 
 
39ea117
a07d796
39ea117
 
 
 
 
 
e2d728a
39ea117
a07d796
 
 
 
39ea117
a07d796
 
 
e2d728a
39ea117
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import gradio as gr
import json
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.datamodel.base_models import InputFormat
import spaces

# GPU decorator not really required for Docling OCR, but kept if you want
@spaces.GPU
def convert_document(file, output_format):
    """Convert an uploaded PDF with Docling, running Tesseract CLI OCR (English).

    Args:
        file: The upload handed over by the Gradio file component. Either an
            object exposing the on-disk path as ``.name`` or a plain path
            string; ``None`` when nothing has been uploaded.
        output_format: ``"Markdown"`` or ``"JSON"``. Any other value yields a
            warning message instead of converted content.

    Returns:
        A ``(converted_text, metadata)`` tuple. ``converted_text`` is always a
        string (Markdown, pretty-printed JSON, or a warning message) so it can
        be shown in a text box. ``metadata`` is a JSON-friendly dict.
    """
    # Nothing uploaded: answer with a readable message rather than crashing
    # on ``file.name`` with an AttributeError.
    if file is None:
        return "⚠️ Please upload a PDF first", {}

    # Accept both file-like uploads (path in ``.name``) and plain path strings.
    source_path = getattr(file, "name", file)

    # Configure OCR pipeline
    pdf_opts = PdfPipelineOptions(
        do_ocr=True,
        ocr_options=TesseractCliOcrOptions(lang=["eng"]),  # or ["eng","ara"] if needed
    )

    # Pipeline options must be passed per input format via format_options
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
        }
    )

    # Convert document
    result = converter.convert(source_path)
    document = result.document

    # Choose output format safely
    if output_format == "Markdown":
        converted_text = document.export_to_markdown()
    elif output_format == "JSON":
        # export_to_dict() returns a dict; serialize it so the text output
        # receives real JSON instead of a raw Python dict.
        converted_text = json.dumps(
            document.export_to_dict(), ensure_ascii=False, indent=2
        )
    else:
        converted_text = "⚠️ Unsupported format"

    # Metadata as JSON-friendly dict (attribute names available on the document)
    metadata = {"Available Attributes": dir(document)}

    return converted_text, metadata


# Build the Gradio UI: a PDF upload and an output-format selector side by side,
# followed by the converted text, the metadata viewer, and the trigger button.
with gr.Blocks() as app:
    gr.Markdown("# 📄 Document Converter with Docling OCR")
    gr.Markdown("Upload a PDF, choose the output format, and get the converted text + metadata.")

    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        # Preselect a format: with no initial value the radio would send None
        # and the handler would answer "Unsupported format" after converting.
        format_input = gr.Radio(
            ["Markdown", "JSON"],
            value="Markdown",
            label="Choose Output Format",
        )

    output_text = gr.Textbox(label="Converted Document", lines=20)
    output_metadata = gr.JSON(label="Metadata")

    # Clicking the button runs convert_document on the upload and the chosen
    # format, then fills the text box and the metadata viewer with its results.
    convert_button = gr.Button("Convert")
    convert_button.click(
        fn=convert_document,
        inputs=[file_input, format_input],
        outputs=[output_text, output_metadata],
    )

app.launch(debug=True)