Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,958 Bytes
e2d728a ee63712 2803b23 ee63712 8e024f6 e2d728a 39ea117 8e024f6 e2d728a ee63712 2803b23 ee63712 2803b23 39ea117 ee63712 e2d728a 4678d36 39ea117 e2d728a 6491463 e2d728a 39ea117 e2d728a ee63712 39ea117 e2d728a 39ea117 a07d796 39ea117 e2d728a 39ea117 a07d796 39ea117 a07d796 e2d728a 39ea117 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import gradio as gr
import json
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions
from docling.datamodel.base_models import InputFormat
import spaces
# GPU decorator not really required for Docling OCR, but kept if you want
@spaces.GPU
def convert_document(file, output_format):
    """Convert an uploaded PDF with Docling (OCR enabled) and export it as text.

    Args:
        file: The upload handed over by the file input. Either an object
            exposing its path as ``.name`` or a plain path string; ``None``
            when nothing was uploaded.
        output_format: ``"Markdown"`` or ``"JSON"``. Any other value yields
            a warning string instead of document content.

    Returns:
        A ``(converted_text, metadata)`` tuple. ``converted_text`` is always
        a string so it can be shown in a textbox; ``metadata`` is a
        JSON-friendly dict.
    """
    if file is None:
        # Nothing to convert: report it instead of failing on ``file.name``.
        return "⚠️ No file uploaded", {}

    # Accept both file-like uploads (path in ``.name``) and plain path strings.
    source_path = getattr(file, "name", file)

    # Configure the OCR pipeline.
    pdf_opts = PdfPipelineOptions(
        do_ocr=True,
        ocr_options=TesseractCliOcrOptions(lang=["eng"]),  # or ["eng", "ara"] if needed
    )

    # Pipeline options are passed per input format via ``format_options``.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
        }
    )

    result = converter.convert(source_path)
    document = result.document

    if output_format == "Markdown":
        converted_text = document.export_to_markdown()
    elif output_format == "JSON":
        # Serialize the dict: the text output needs a string, and a raw dict
        # would otherwise be shown as a Python repr rather than valid JSON.
        converted_text = json.dumps(
            document.export_to_dict(),
            indent=2,
            ensure_ascii=False,
        )
    else:
        converted_text = "⚠️ Unsupported format"

    # Attribute names of the document object, as a JSON-friendly dict.
    metadata = {"Available Attributes": dir(document)}
    return converted_text, metadata
# Build the UI and wire the Convert button to convert_document.
# NOTE(review): the nesting below was reconstructed from a copy that had lost
# its indentation -- confirm which components belong inside the Row.
with gr.Blocks() as app:
    gr.Markdown("# 📄 Document Converter with Docling OCR")
    gr.Markdown("Upload a PDF, choose the output format, and get the converted text + metadata.")

    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        # Preselect a format so clicking "Convert" never submits an empty
        # choice (which would run the conversion only to report an
        # unsupported format).
        format_input = gr.Radio(
            ["Markdown", "JSON"],
            value="Markdown",
            label="Choose Output Format",
        )

    output_text = gr.Textbox(label="Converted Document", lines=20)
    output_metadata = gr.JSON(label="Metadata")
    convert_button = gr.Button("Convert")

    convert_button.click(
        fn=convert_document,
        inputs=[file_input, format_input],
        outputs=[output_text, output_metadata],
    )

app.launch(debug=True)
|