# Spaces: Running on Zero
import gradio as gr | |
import json | |
from docling.document_converter import DocumentConverter, PdfFormatOption | |
from docling.datamodel.pipeline_options import PdfPipelineOptions, TesseractCliOcrOptions | |
from docling.datamodel.base_models import InputFormat | |
import spaces | |
# The @spaces.GPU decorator is not really required for Docling OCR, so it is
# not applied here; add it above the function if GPU scheduling is wanted.
def convert_document(file, output_format):
    """Convert an uploaded PDF with Docling (OCR enabled) and export it.

    Args:
        file: The upload handed over by the Gradio file input. Either an
            object exposing the on-disk path as ``.name`` or a plain path
            string; ``None`` when nothing was uploaded.
        output_format: ``"Markdown"`` or ``"JSON"``. Any other value
            (including ``None`` when no choice was made) yields a warning
            string instead of document text.

    Returns:
        A ``(converted_text, metadata)`` tuple. ``converted_text`` is always
        a string so it renders correctly in a text box; ``metadata`` is a
        dict listing the attribute names available on the converted document
        (empty when no file was supplied).
    """
    # Guard first: without an upload there is nothing to convert, and
    # dereferencing ``file`` would raise AttributeError.
    if file is None:
        return "⚠️ No file uploaded", {}

    # Accept both a wrapper object carrying ``.name`` and a bare path string.
    source_path = getattr(file, "name", file)

    # Configure the OCR pipeline.
    pdf_opts = PdfPipelineOptions(
        do_ocr=True,
        ocr_options=TesseractCliOcrOptions(lang=["eng"]),  # or ["eng", "ara"] if needed
    )

    # Pipeline options are passed per input format via ``format_options``.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts),
        }
    )

    result = converter.convert(source_path)
    document = result.document

    if output_format == "Markdown":
        converted_text = document.export_to_markdown()
    elif output_format == "JSON":
        # Serialize the exported dict: the text output expects a string, and
        # handing it a raw dict would display a Python repr, not valid JSON.
        converted_text = json.dumps(
            document.export_to_dict(), indent=2, ensure_ascii=False
        )
    else:
        converted_text = "⚠️ Unsupported format"

    # Metadata as a JSON-friendly dict.
    metadata = {"Available Attributes": dir(document)}
    return converted_text, metadata
# Assemble the Gradio interface: inputs side by side, results underneath,
# and a button that runs the conversion.
# NOTE(review): the original indentation was lost; the Row is assumed to hold
# only the two input widgets — confirm against the deployed layout.
with gr.Blocks() as app:
    gr.Markdown("# 📄 Document Converter with Docling OCR")
    gr.Markdown(
        "Upload a PDF, choose the output format, and get the converted text + metadata."
    )

    # Input widgets.
    with gr.Row():
        file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
        format_input = gr.Radio(["Markdown", "JSON"], label="Choose Output Format")

    # Result widgets.
    output_text = gr.Textbox(label="Converted Document", lines=20)
    output_metadata = gr.JSON(label="Metadata")

    # Wire the button: (file, format) in -> (text, metadata) out.
    convert_button = gr.Button("Convert")
    convert_button.click(
        convert_document,
        [file_input, format_input],
        [output_text, output_metadata],
    )

app.launch(debug=True)