| """ | |
| Doctra - Document Parser for Hugging Face Spaces | |
| This is a Hugging Face Spaces deployment of the Doctra document parsing library. | |
| It provides a comprehensive web interface for PDF parsing, table/chart extraction, | |
| image restoration, and enhanced document processing. | |
| """ | |
| import os | |
| import shutil | |
| import tempfile | |
| import re | |
| import html as _html | |
| import base64 | |
| import json | |
| from pathlib import Path | |
| from typing import Optional, Tuple, List, Dict, Any | |
| import gradio as gr | |
| import pandas as pd | |
| # Mock google.genai to avoid import errors | |
| import sys | |
| from unittest.mock import MagicMock | |
| # Create a mock google.genai module | |
| mock_google_genai = MagicMock() | |
| sys.modules['google.genai'] = mock_google_genai | |
| sys.modules['google.genai.types'] = MagicMock() | |
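# NOTE: doctra (or one of its VLM providers) imports google.genai at import
# time, and on Spaces that package can conflict with other pinned dependencies
# (see the Tips section below). The MagicMock stub keeps the import chain
# working; correspondingly, "gemini" is left out of the provider dropdowns.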
# Now import Doctra components
try:
    from doctra.parsers.structured_pdf_parser import StructuredPDFParser
    from doctra.parsers.table_chart_extractor import ChartTablePDFParser
    from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
    from doctra.ui.docres_wrapper import DocResUIWrapper
    from doctra.utils.pdf_io import render_pdf_to_images
except ImportError as e:
    print(f"Warning: Some Doctra components may not be available: {e}")
    # Fall back to None so each handler can report a clear error to the user
    StructuredPDFParser = None
    ChartTablePDFParser = None
    EnhancedPDFParser = None
    DocResUIWrapper = None
    render_pdf_to_images = None
# UI Theme and Styling Constants
THEME = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")

CUSTOM_CSS = """
/* Full-width layout */
.gradio-container {max-width: 100% !important; padding-left: 24px; padding-right: 24px}
.container {max-width: 100% !important}
.app {max-width: 100% !important}

/* Header and helpers */
.header {margin-bottom: 8px}
.subtitle {color: var(--body-text-color-subdued)}
.card {border:1px solid var(--border-color); border-radius:12px; padding:8px}
.status-ok {color: var(--color-success)}

/* Scrollable gallery styling */
.scrollable-gallery {
  max-height: 600px !important;
  overflow-y: auto !important;
  border: 1px solid var(--border-color) !important;
  border-radius: 8px !important;
  padding: 8px !important;
}

/* Page content styling */
.page-content img {
  max-width: 100% !important;
  height: auto !important;
  display: block !important;
  margin: 10px auto !important;
  border: 1px solid #ddd !important;
  border-radius: 8px !important;
  box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
}
.page-content {
  max-height: none !important;
  overflow: visible !important;
}

/* Table styling */
.page-content table.doc-table {
  width: 100% !important;
  border-collapse: collapse !important;
  margin: 12px 0 !important;
}
.page-content table.doc-table th,
.page-content table.doc-table td {
  border: 1px solid #e5e7eb !important;
  padding: 8px 10px !important;
  text-align: left !important;
}
.page-content table.doc-table thead th {
  background: #f9fafb !important;
  font-weight: 600 !important;
}
.page-content table.doc-table tbody tr:nth-child(even) td {
  background: #fafafa !important;
}

/* Clickable image buttons */
.image-button {
  background: #0066cc !important;
  color: white !important;
  border: none !important;
  padding: 5px 10px !important;
  border-radius: 4px !important;
  cursor: pointer !important;
  margin: 2px !important;
  font-size: 14px !important;
}
.image-button:hover {
  background: #0052a3 !important;
}
"""
def gather_outputs(
    out_dir: Path,
    allowed_kinds: Optional[List[str]] = None,
    zip_filename: Optional[str] = None,
    is_structured_parsing: bool = False
) -> Tuple[List[tuple[str, str]], List[str], str]:
    """
    Gather output files and create a ZIP archive for download.
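
    Returns a 3-tuple ``(gallery_items, file_paths, zip_path)``:
    ``gallery_items`` holds ``(image_path, caption)`` pairs for UI galleries,
    ``file_paths`` lists the individual files offered for download, and
    ``zip_path`` is the path of a ZIP archive built from a filtered copy of
    ``out_dir`` (Office lock files and ``*.tmp``/``*.temp`` are excluded).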
| """ | |
| gallery_items: List[tuple[str, str]] = [] | |
| file_paths: List[str] = [] | |
| if out_dir.exists(): | |
| if is_structured_parsing: | |
| # For structured parsing, include all files | |
| for file_path in sorted(out_dir.rglob("*")): | |
| if file_path.is_file(): | |
| file_paths.append(str(file_path)) | |
| else: | |
| # For full parsing, include specific main files | |
| main_files = [ | |
| "result.html", | |
| "result.md", | |
| "tables.html", | |
| "tables.xlsx" | |
| ] | |
| for main_file in main_files: | |
| file_path = out_dir / main_file | |
| if file_path.exists(): | |
| file_paths.append(str(file_path)) | |
| # Include images based on allowed kinds | |
| if allowed_kinds: | |
| for kind in allowed_kinds: | |
| p = out_dir / kind | |
| if p.exists(): | |
| for img in sorted(p.glob("*.png")): | |
| file_paths.append(str(img)) | |
| images_dir = out_dir / "images" / kind | |
| if images_dir.exists(): | |
| for img in sorted(images_dir.glob("*.jpg")): | |
| file_paths.append(str(img)) | |
| else: | |
| # Include all images if no specific kinds specified | |
| for p in (out_dir / "charts").glob("*.png"): | |
| file_paths.append(str(p)) | |
| for p in (out_dir / "tables").glob("*.png"): | |
| file_paths.append(str(p)) | |
| for p in (out_dir / "images").rglob("*.jpg"): | |
| file_paths.append(str(p)) | |
| # Include Excel files based on allowed kinds | |
| if allowed_kinds: | |
| if "charts" in allowed_kinds and "tables" in allowed_kinds: | |
| excel_files = ["parsed_tables_charts.xlsx"] | |
| elif "charts" in allowed_kinds: | |
| excel_files = ["parsed_charts.xlsx"] | |
| elif "tables" in allowed_kinds: | |
| excel_files = ["parsed_tables.xlsx"] | |
| else: | |
| excel_files = [] | |
| for excel_file in excel_files: | |
| excel_path = out_dir / excel_file | |
| if excel_path.exists(): | |
| file_paths.append(str(excel_path)) | |
| # Build gallery items for image display | |
| kinds = allowed_kinds if allowed_kinds else ["tables", "charts", "figures"] | |
| for sub in kinds: | |
| p = out_dir / sub | |
| if p.exists(): | |
| for img in sorted(p.glob("*.png")): | |
| gallery_items.append((str(img), f"{sub}: {img.name}")) | |
| images_dir = out_dir / "images" / sub | |
| if images_dir.exists(): | |
| for img in sorted(images_dir.glob("*.jpg")): | |
| gallery_items.append((str(img), f"{sub}: {img.name}")) | |
| # Create ZIP archive | |
| tmp_zip_dir = Path(tempfile.mkdtemp(prefix="doctra_zip_")) | |
| if zip_filename: | |
| safe_filename = re.sub(r'[<>:"/\\|?*]', '_', zip_filename) | |
| zip_base = tmp_zip_dir / safe_filename | |
| else: | |
| zip_base = tmp_zip_dir / "doctra_outputs" | |
| filtered_dir = tmp_zip_dir / "filtered_outputs" | |
| shutil.copytree(out_dir, filtered_dir, ignore=shutil.ignore_patterns('~$*', '*.tmp', '*.temp')) | |
| zip_path = shutil.make_archive(str(zip_base), 'zip', root_dir=str(filtered_dir)) | |
| return gallery_items, file_paths, zip_path | |
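
# Example call (hypothetical paths, assuming a finished tables-extraction run):
#   gallery, files, zip_path = gather_outputs(
#       Path("outputs/report/structured_parsing"),
#       allowed_kinds=["tables"],
#       zip_filename="report",
#       is_structured_parsing=True,
#   )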
def validate_vlm_config(use_vlm: bool, vlm_api_key: str, vlm_provider: str = "gemini") -> Optional[str]:
    """
    Validate VLM configuration parameters.
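
    Returns an error-message string when the configuration is invalid, or
    None when it is acceptable (including when VLM is disabled, or when the
    provider is Ollama, which runs locally and needs no API key).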
| """ | |
| if use_vlm and vlm_provider not in ["ollama"] and not vlm_api_key: | |
| return "β Error: VLM API key is required when using VLM (except for Ollama)" | |
| if use_vlm and vlm_api_key and vlm_provider not in ["ollama"]: | |
| # Basic API key validation | |
| if len(vlm_api_key.strip()) < 10: | |
| return "β Error: VLM API key appears to be too short or invalid" | |
| if vlm_api_key.strip().startswith('sk-') and len(vlm_api_key.strip()) < 20: | |
| return "β Error: OpenAI API key appears to be invalid (too short)" | |
| return None | |
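
# For example:
#   validate_vlm_config(True, "", "openai")  -> "❌ Error: VLM API key is required ..."
#   validate_vlm_config(True, "", "ollama")  -> None  (local provider, no key needed)
#   validate_vlm_config(False, "", "openai") -> None  (VLM disabled)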
def create_page_html_content(page_content: List[str], base_dir: Optional[Path] = None) -> str:
    """
    Convert page content lines to HTML with inline images and proper formatting.
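
    Image references of the form ``![caption](path)`` are inlined as base64
    data URIs (resolved against ``base_dir``), markdown tables become styled
    HTML tables, ``#``/``##`` headings become ``<h2>``/``<h3>``, and
    consecutive plain lines are joined into paragraphs.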
| """ | |
| processed_content = [] | |
| paragraph_buffer = [] | |
| def flush_paragraph(): | |
| """Flush accumulated paragraph content to HTML""" | |
| nonlocal paragraph_buffer | |
| if paragraph_buffer: | |
| joined = '<br/>'.join(_html.escape(l) for l in paragraph_buffer) | |
| processed_content.append(f'<p>{joined}</p>') | |
| paragraph_buffer = [] | |
    def is_markdown_table_header(s: str) -> bool:
        # A separator row contains pipes plus '---' (or an em-dash variant
        # that some OCR/VLM output produces)
        return '|' in s and ('---' in s or '—' in s)
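
    # The loop below treats a line as the start of a table when the next
    # line is a separator row, i.e. the classic two-line markdown head:
    #   | Name | Value |
    #   | ---  | ---   |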
    def render_markdown_table(lines: List[str]) -> str:
        rows = [l.strip().strip('|').split('|') for l in lines]
        rows = [[_html.escape(c.strip()) for c in r] for r in rows]
        if len(rows) < 2:
            return ""
        header = rows[0]
        body = rows[2:] if len(rows) > 2 else []
        thead = '<thead><tr>' + ''.join(f'<th>{c}</th>' for c in header) + '</tr></thead>'
        tbody = '<tbody>' + ''.join('<tr>' + ''.join(f'<td>{c}</td>' for c in r) + '</tr>' for r in body) + '</tbody>'
        return f'<table class="doc-table">{thead}{tbody}</table>'
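
    # e.g. render_markdown_table(["| A | B |", "| --- | --- |", "| 1 | 2 |"])
    # yields '<table class="doc-table"><thead><tr><th>A</th><th>B</th></tr>
    # </thead><tbody><tr><td>1</td><td>2</td></tr></tbody></table>';
    # rows[1], the separator row, is intentionally skipped.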
    i = 0
    n = len(page_content)
    while i < n:
        raw_line = page_content[i]
        line = raw_line.rstrip('\r\n')
        stripped = line.strip()
        # Handle image references of the form ![caption](path)
        if stripped.startswith('!['):
            flush_paragraph()
            match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
            if match and base_dir is not None:
                caption = match.group(1)
                rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
                abs_path = (base_dir / rel_path).resolve()
                try:
                    with open(abs_path, 'rb') as f:
                        b64 = base64.b64encode(f.read()).decode('ascii')
                    processed_content.append(f'<figure><img src="data:image/jpeg;base64,{b64}" alt="{_html.escape(caption)}"/><figcaption>{_html.escape(caption)}</figcaption></figure>')
                except Exception as e:
                    print(f"❌ Failed to embed image {rel_path}: {e}")
                    processed_content.append(f'<div>{_html.escape(caption)} (image not found)</div>')
            else:
                processed_content.append(f'<div>{_html.escape(stripped)}</div>')
            i += 1
            continue
        # Handle markdown tables
        if (stripped.startswith('|') or stripped.count('|') >= 2) and i + 1 < n and is_markdown_table_header(page_content[i + 1]):
            flush_paragraph()
            table_block = [stripped]
            i += 1
            table_block.append(page_content[i].strip())
            i += 1
            while i < n:
                nxt = page_content[i].rstrip('\r\n')
                if nxt.strip() == '' or (not nxt.strip().startswith('|') and nxt.count('|') < 2):
                    break
                table_block.append(nxt.strip())
                i += 1
            html_table = render_markdown_table(table_block)
            if html_table:
                processed_content.append(html_table)
            else:
                for tl in table_block:
                    paragraph_buffer.append(tl)
            continue
        # Handle headers and plain content
        if stripped.startswith('## '):
            flush_paragraph()
            processed_content.append(f'<h3>{_html.escape(stripped[3:])}</h3>')
        elif stripped.startswith('# '):
            flush_paragraph()
            processed_content.append(f'<h2>{_html.escape(stripped[2:])}</h2>')
        elif stripped == '':
            flush_paragraph()
            processed_content.append('<br/>')
        else:
            paragraph_buffer.append(raw_line)
        i += 1

    flush_paragraph()
    return "\n".join(processed_content)
def run_full_parse(
    pdf_file: str,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
    ocr_lang: str,
    ocr_psm: int,
    ocr_oem: int,
    ocr_extra_config: str,
    box_separator: str,
) -> Tuple[str, Optional[str], List[tuple[str, str]], List[str], str]:
    """Run full PDF parsing with structured output."""
    if not pdf_file:
        return ("No file provided.", None, [], [], "")

    # Check if Doctra components are available
    if StructuredPDFParser is None:
        return ("❌ Error: Doctra library not properly installed. Please check the requirements.", None, [], [], "")

    # Validate VLM configuration
    vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
    if vlm_error:
        return (vlm_error, None, [], [], "")

    original_filename = Path(pdf_file).stem

    # Create temporary directory for processing
    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
    input_pdf = tmp_dir / f"{original_filename}.pdf"
    shutil.copy2(pdf_file, input_pdf)

    # Initialize parser with configuration
    parser = StructuredPDFParser(
        use_vlm=use_vlm,
        vlm_provider=vlm_provider,
        vlm_api_key=vlm_api_key or None,
        layout_model_name=layout_model_name,
        dpi=int(dpi),
        min_score=float(min_score),
        ocr_lang=ocr_lang,
        ocr_psm=int(ocr_psm),
        ocr_oem=int(ocr_oem),
        ocr_extra_config=ocr_extra_config or "",
        box_separator=box_separator or "\n",
    )

    try:
        parser.parse(str(input_pdf))
    except Exception as e:
        import traceback
        traceback.print_exc()
        try:
            error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
            return (f"❌ VLM processing failed: {error_msg}", None, [], [], "")
        except Exception:
            return ("❌ VLM processing failed: <Unicode encoding error>", None, [], [], "")
    # Find output directory
    outputs_root = Path("outputs")
    out_dir = outputs_root / original_filename / "full_parse"
    if not out_dir.exists():
        candidates = sorted(outputs_root.glob("*/"), key=lambda p: p.stat().st_mtime, reverse=True)
        if candidates:
            out_dir = candidates[0] / "full_parse"
        else:
            out_dir = outputs_root

    # Read markdown file if it exists
    md_file = next(out_dir.glob("*.md"), None)
    md_preview = None
    if md_file and md_file.exists():
        try:
            with md_file.open("r", encoding="utf-8", errors="ignore") as f:
                md_preview = f.read()
        except Exception:
            md_preview = None

    # Gather output files and create ZIP
    gallery_items, file_paths, zip_path = gather_outputs(
        out_dir,
        zip_filename=original_filename,
        is_structured_parsing=False
    )
    return (
        f"✅ Parsing completed successfully!\n📁 Output directory: {out_dir}",
        md_preview,
        gallery_items,
        file_paths,
        zip_path
    )
def run_extract(
    pdf_file: str,
    target: str,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
) -> Tuple[str, str, List[tuple[str, str]], List[str], str]:
    """Run table/chart extraction from PDF."""
    if not pdf_file:
        return ("No file provided.", "", [], [], "")

    # Check if Doctra components are available
    if ChartTablePDFParser is None:
        return ("❌ Error: Doctra library not properly installed. Please check the requirements.", "", [], [], "")

    # Validate VLM configuration
    vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
    if vlm_error:
        return (vlm_error, "", [], [], "")

    original_filename = Path(pdf_file).stem

    # Create temporary directory for processing
    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_"))
    input_pdf = tmp_dir / f"{original_filename}.pdf"
    shutil.copy2(pdf_file, input_pdf)

    # Initialize parser with configuration
    parser = ChartTablePDFParser(
        extract_charts=(target in ("charts", "both")),
        extract_tables=(target in ("tables", "both")),
        use_vlm=use_vlm,
        vlm_provider=vlm_provider,
        vlm_api_key=vlm_api_key or None,
        layout_model_name=layout_model_name,
        dpi=int(dpi),
        min_score=float(min_score),
    )

    # Run extraction
    output_base = Path("outputs")
    parser.parse(str(input_pdf), str(output_base))

    # Find output directory
    outputs_root = output_base
    out_dir = outputs_root / original_filename / "structured_parsing"
    if not out_dir.exists():
        if outputs_root.exists():
            candidates = sorted(outputs_root.glob("*/"), key=lambda p: p.stat().st_mtime, reverse=True)
            if candidates:
                out_dir = candidates[0] / "structured_parsing"
            else:
                out_dir = outputs_root
        else:
            outputs_root.mkdir(parents=True, exist_ok=True)
            out_dir = outputs_root

    # Determine which kinds to include in outputs based on target selection
    allowed_kinds: Optional[List[str]] = None
    if target in ("tables", "charts"):
        allowed_kinds = [target]
    elif target == "both":
        allowed_kinds = ["tables", "charts"]

    # Gather output files and create ZIP
    gallery_items, file_paths, zip_path = gather_outputs(
        out_dir,
        allowed_kinds,
        zip_filename=original_filename,
        is_structured_parsing=True
    )
    # Build tables HTML preview from Excel data (when VLM enabled)
    tables_html = ""
    try:
        if use_vlm:
            # Find Excel file based on target
            excel_filename = None
            if target in ("tables", "charts"):
                if target == "tables":
                    excel_filename = "parsed_tables.xlsx"
                else:  # charts
                    excel_filename = "parsed_charts.xlsx"
            elif target == "both":
                excel_filename = "parsed_tables_charts.xlsx"
            if excel_filename:
                excel_path = out_dir / excel_filename
                if excel_path.exists():
                    # Read Excel file and create HTML tables
                    xl_file = pd.ExcelFile(excel_path)
                    html_blocks = []
                    for sheet_name in xl_file.sheet_names:
                        df = pd.read_excel(excel_path, sheet_name=sheet_name)
                        if not df.empty:
                            # Create table with title
                            title = f"<h3>{_html.escape(sheet_name)}</h3>"
                            # Convert DataFrame to HTML table
                            table_html = df.to_html(
                                classes="doc-table",
                                table_id=None,
                                escape=True,
                                index=False,
                                na_rep=""
                            )
                            html_blocks.append(title + table_html)
                    tables_html = "\n".join(html_blocks)
    except Exception as e:
        try:
            error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
            print(f"Error building tables HTML: {error_msg}")
        except Exception:
            print("Error building tables HTML: <Unicode encoding error>")
        tables_html = ""
    return (
        f"✅ Extraction completed successfully!\n📁 Output directory: {out_dir}",
        tables_html,
        gallery_items,
        file_paths,
        zip_path
    )
def run_docres_restoration(
    pdf_file: str,
    task: str,
    device: str,
    dpi: int,
    save_enhanced: bool,
    save_images: bool
) -> Tuple[str, Optional[str], Optional[str], Optional[dict], List[str]]:
    """Run DocRes image restoration on PDF."""
    if not pdf_file:
        return ("No file provided.", None, None, None, [])

    # Check if Doctra components are available
    if DocResUIWrapper is None:
        return ("❌ Error: Doctra library not properly installed. Please check the requirements.", None, None, None, [])

    try:
        # Initialize DocRes engine
        device_str = None if device == "auto" else device
        docres = DocResUIWrapper(device=device_str)

        # Extract filename
        original_filename = Path(pdf_file).stem

        # Create output directory
        output_dir = Path("outputs") / f"{original_filename}_docres"
        output_dir.mkdir(parents=True, exist_ok=True)

        # Run DocRes restoration
        enhanced_pdf_path = output_dir / f"{original_filename}_enhanced.pdf"
        docres.restore_pdf(
            pdf_path=pdf_file,
            output_path=str(enhanced_pdf_path),
            task=task,
            dpi=dpi
        )

        # Prepare outputs
        file_paths = []
        if save_enhanced and enhanced_pdf_path.exists():
            file_paths.append(str(enhanced_pdf_path))
        if save_images:
            # Look for enhanced images
            images_dir = output_dir / "enhanced_images"
            if images_dir.exists():
                for img_path in sorted(images_dir.glob("*.jpg")):
                    file_paths.append(str(img_path))

        # Create metadata
        metadata = {
            "task": task,
            "device": str(docres.device),
            "dpi": dpi,
            "original_file": pdf_file,
            "enhanced_file": str(enhanced_pdf_path) if enhanced_pdf_path.exists() else None,
            "output_directory": str(output_dir)
        }
        status_msg = f"✅ DocRes restoration completed successfully!\n📁 Output directory: {output_dir}"
        enhanced_pdf_file = str(enhanced_pdf_path) if enhanced_pdf_path.exists() else None
        return (status_msg, pdf_file, enhanced_pdf_file, metadata, file_paths)
    except Exception as e:
        error_msg = f"❌ DocRes restoration failed: {str(e)}"
        return (error_msg, None, None, None, [])
def run_enhanced_parse(
    pdf_file: str,
    use_image_restoration: bool,
    restoration_task: str,
    restoration_device: str,
    restoration_dpi: int,
    use_vlm: bool,
    vlm_provider: str,
    vlm_api_key: str,
    layout_model_name: str,
    dpi: int,
    min_score: float,
    ocr_lang: str,
    ocr_psm: int,
    ocr_oem: int,
    ocr_extra_config: str,
    box_separator: str,
) -> Tuple[str, Optional[str], List[str], str, Optional[str], Optional[str], str]:
    """Run enhanced PDF parsing with DocRes image restoration."""
    if not pdf_file:
        return ("No file provided.", None, [], "", None, None, "")

    # Check if Doctra components are available
    if EnhancedPDFParser is None:
        return ("❌ Error: Doctra library not properly installed. Please check the requirements.", None, [], "", None, None, "")

    # Validate VLM configuration if VLM is enabled
    if use_vlm:
        vlm_error = validate_vlm_config(use_vlm, vlm_api_key, vlm_provider)
        if vlm_error:
            return (vlm_error, None, [], "", None, None, "")

    original_filename = Path(pdf_file).stem

    # Create temporary directory for processing
    tmp_dir = Path(tempfile.mkdtemp(prefix="doctra_enhanced_"))
    input_pdf = tmp_dir / f"{original_filename}.pdf"
    shutil.copy2(pdf_file, input_pdf)

    try:
        # Initialize enhanced parser with configuration
        parser = EnhancedPDFParser(
            use_image_restoration=use_image_restoration,
            restoration_task=restoration_task,
            restoration_device=restoration_device if restoration_device != "auto" else None,
            restoration_dpi=int(restoration_dpi),
            use_vlm=use_vlm,
            vlm_provider=vlm_provider,
            vlm_api_key=vlm_api_key or None,
            layout_model_name=layout_model_name,
            dpi=int(dpi),
            min_score=float(min_score),
            ocr_lang=ocr_lang,
            ocr_psm=int(ocr_psm),
            ocr_oem=int(ocr_oem),
            ocr_extra_config=ocr_extra_config or "",
            box_separator=box_separator or "\n",
        )
        # Parse the PDF with enhancement
        parser.parse(str(input_pdf))
    except Exception as e:
        import traceback
        traceback.print_exc()
        try:
            error_msg = str(e).encode('utf-8', errors='replace').decode('utf-8')
            return (f"❌ Enhanced parsing failed: {error_msg}", None, [], "", None, None, "")
        except Exception:
            return ("❌ Enhanced parsing failed: <Unicode encoding error>", None, [], "", None, None, "")
    # Find output directory
    outputs_root = Path("outputs")
    out_dir = outputs_root / original_filename / "enhanced_parse"
    if not out_dir.exists():
        candidates = sorted(outputs_root.glob("*/"), key=lambda p: p.stat().st_mtime, reverse=True)
        if candidates:
            out_dir = candidates[0] / "enhanced_parse"
        else:
            out_dir = outputs_root

    # If still no enhanced_parse directory, try to find any directory with enhanced files
    if not out_dir.exists():
        for candidate_dir in outputs_root.rglob("*"):
            if candidate_dir.is_dir():
                enhanced_pdfs = list(candidate_dir.glob("*enhanced*.pdf"))
                if enhanced_pdfs:
                    out_dir = candidate_dir
                    break

    # Load first page content initially
    md_preview = None
    try:
        pages_dir = out_dir / "pages"
        first_page_path = pages_dir / "page_001.md"
        if first_page_path.exists():
            with first_page_path.open("r", encoding="utf-8", errors="ignore") as f:
                md_content = f.read()
            md_lines = md_content.split('\n')
            md_preview = create_page_html_content(md_lines, out_dir)
        else:
            md_file = next(out_dir.glob("*.md"), None)
            if md_file and md_file.exists():
                with md_file.open("r", encoding="utf-8", errors="ignore") as f:
                    md_content = f.read()
                md_lines = md_content.split('\n')
                md_preview = create_page_html_content(md_lines, out_dir)
    except Exception as e:
        print(f"❌ Error loading initial content: {e}")
        md_preview = None

    # Gather output files and create ZIP
    _, file_paths, zip_path = gather_outputs(
        out_dir,
        zip_filename=f"{original_filename}_enhanced",
        is_structured_parsing=False
    )

    # Look for enhanced PDF file
    enhanced_pdf_path = None
    if use_image_restoration:
        enhanced_pdf_candidates = list(out_dir.glob("*enhanced*.pdf"))
        if enhanced_pdf_candidates:
            enhanced_pdf_path = str(enhanced_pdf_candidates[0])
        else:
            parent_enhanced = list(out_dir.parent.glob("*enhanced*.pdf"))
            if parent_enhanced:
                enhanced_pdf_path = str(parent_enhanced[0])

    return (
        f"✅ Enhanced parsing completed successfully!\n📁 Output directory: {out_dir}",
        md_preview,
        file_paths,
        zip_path,
        pdf_file,           # Original PDF path
        enhanced_pdf_path,  # Enhanced PDF path
        str(out_dir)        # Output directory for page-specific content
    )
def create_tips_markdown() -> str:
    """Create the tips section markdown for the UI."""
    return """
<div class="card">
  <b>Tips</b>
  <ul>
    <li>On Spaces, set a secret <code>VLM_API_KEY</code> to enable VLM features.</li>
    <li>Use <strong>Enhanced Parser</strong> for documents that need image restoration before parsing (scanned docs, low-quality PDFs).</li>
    <li>Use <strong>DocRes Image Restoration</strong> for standalone image enhancement without parsing.</li>
    <li>DocRes tasks: <code>appearance</code> (default), <code>dewarping</code>, <code>deshadowing</code>, <code>deblurring</code>, <code>binarization</code>, <code>end2end</code>.</li>
    <li>Outputs are saved under <code>outputs/&lt;pdf_stem&gt;/</code>.</li>
    <li><strong>Note:</strong> Google Gemini VLM may not be available due to dependency conflicts. Use OpenAI, Anthropic, or other VLM providers.</li>
  </ul>
</div>
"""
# Create the main Gradio interface
with gr.Blocks(title="Doctra - Document Parser", theme=THEME, css=CUSTOM_CSS) as demo:
    # Header section
    gr.Markdown(
        """
        <div class="header">
          <h2 style="margin:0">Doctra - Document Parser</h2>
          <div class="subtitle">Parse PDFs, extract tables/charts, preview markdown, and download outputs.</div>
        </div>
        """
    )
    # Full Parse Tab
    with gr.Tab("Full Parse"):
        with gr.Row():
            pdf = gr.File(file_types=[".pdf"], label="PDF")
            use_vlm = gr.Checkbox(label="Use VLM (optional)", value=False)
            vlm_provider = gr.Dropdown(["openai", "anthropic", "openrouter", "ollama"], value="openai", label="VLM Provider")
            vlm_api_key = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
        with gr.Accordion("Advanced", open=False):
            with gr.Row():
                layout_model = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
                dpi = gr.Slider(100, 400, value=200, step=10, label="DPI")
                min_score = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")
            with gr.Row():
                ocr_lang = gr.Textbox(value="eng", label="OCR Language")
                ocr_psm = gr.Slider(0, 13, value=4, step=1, label="Tesseract PSM")
                ocr_oem = gr.Slider(0, 3, value=3, step=1, label="Tesseract OEM")
            with gr.Row():
                ocr_config = gr.Textbox(value="", label="Extra OCR config")
                box_sep = gr.Textbox(value="\n", label="Box separator")
        run_btn = gr.Button("▶ Run Full Parse", variant="primary")
        status = gr.Textbox(label="Status", elem_classes=["status-ok"])

        # Full Parse preview components
        with gr.Row():
            with gr.Column():
                md_preview = gr.HTML(label="Extracted Content", visible=True, elem_classes=["page-content"])
            with gr.Column():
                page_image = gr.Image(label="Page image", interactive=False)
        files_out = gr.Files(label="Download individual output files")
        zip_out = gr.File(label="Download all outputs (ZIP)")
        def _full_parse_adapter(f, *args):
            # run_full_parse returns (status, markdown, gallery_items, file_paths,
            # zip_path); only four components are bound below, so the unused
            # gallery list is dropped here. getattr() accepts both Gradio's
            # tempfile-wrapper and plain-filepath return styles for gr.File.
            pdf_path = getattr(f, "name", f) if f else ""
            status_msg, md, _gallery, files, zip_path = run_full_parse(pdf_path, *args)
            return status_msg, md, files, zip_path

        run_btn.click(
            fn=_full_parse_adapter,
            inputs=[pdf, use_vlm, vlm_provider, vlm_api_key, layout_model, dpi, min_score, ocr_lang, ocr_psm, ocr_oem, ocr_config, box_sep],
            outputs=[status, md_preview, files_out, zip_out],
        )
    # Tables & Charts Tab
    with gr.Tab("Extract Tables/Charts"):
        with gr.Row():
            pdf_e = gr.File(file_types=[".pdf"], label="PDF")
            target = gr.Dropdown(["tables", "charts", "both"], value="both", label="Target")
            use_vlm_e = gr.Checkbox(label="Use VLM (optional)", value=False)
            vlm_provider_e = gr.Dropdown(["openai", "anthropic", "openrouter", "ollama"], value="openai", label="VLM Provider")
            vlm_api_key_e = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
        with gr.Accordion("Advanced", open=False):
            with gr.Row():
                layout_model_e = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
                dpi_e = gr.Slider(100, 400, value=200, step=10, label="DPI")
                min_score_e = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")
        run_btn_e = gr.Button("▶ Run Extraction", variant="primary")
        status_e = gr.Textbox(label="Status")
        with gr.Row():
            with gr.Column():
                tables_preview_e = gr.HTML(label="Extracted Data", elem_classes=["page-content"])
            with gr.Column():
                image_e = gr.Image(label="Selected Image", interactive=False)
        files_out_e = gr.Files(label="Download individual output files")
        zip_out_e = gr.File(label="Download all outputs (ZIP)")
        def _extract_adapter(f, *args):
            # run_extract returns (status, tables_html, gallery_items, file_paths,
            # zip_path); the gallery list has no bound component on this tab and
            # is dropped so the arity matches the four outputs below.
            pdf_path = getattr(f, "name", f) if f else ""
            status_msg, tables_html, _gallery, files, zip_path = run_extract(pdf_path, *args)
            return status_msg, tables_html, files, zip_path

        run_btn_e.click(
            fn=_extract_adapter,
            inputs=[pdf_e, target, use_vlm_e, vlm_provider_e, vlm_api_key_e, layout_model_e, dpi_e, min_score_e],
            outputs=[status_e, tables_preview_e, files_out_e, zip_out_e],
        )
    # DocRes Image Restoration Tab
    with gr.Tab("DocRes Image Restoration"):
        with gr.Row():
            pdf_docres = gr.File(file_types=[".pdf"], label="PDF")
            docres_task_standalone = gr.Dropdown(
                ["appearance", "dewarping", "deshadowing", "deblurring", "binarization", "end2end"],
                value="appearance",
                label="Restoration Task"
            )
            docres_device_standalone = gr.Dropdown(
                ["auto", "cuda", "cpu"],
                value="auto",
                label="Device"
            )
        with gr.Row():
            docres_dpi = gr.Slider(100, 400, value=200, step=10, label="DPI")
            docres_save_enhanced = gr.Checkbox(label="Save Enhanced PDF", value=True)
            docres_save_images = gr.Checkbox(label="Save Enhanced Images", value=True)
        run_docres_btn = gr.Button("▶ Run DocRes Restoration", variant="primary")
        docres_status = gr.Textbox(label="Status", elem_classes=["status-ok"])
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 📄 Original PDF")
                docres_original_pdf = gr.File(label="Original PDF File", interactive=False, visible=False)
                docres_original_page_image = gr.Image(label="Original PDF Page", interactive=False, height=800)
            with gr.Column():
                gr.Markdown("### ✨ Enhanced PDF")
                docres_enhanced_pdf = gr.File(label="Enhanced PDF File", interactive=False, visible=False)
                docres_enhanced_page_image = gr.Image(label="Enhanced PDF Page", interactive=False, height=800)
        docres_files_out = gr.Files(label="Download enhanced files")
        def _docres_adapter(f, *args):
            # run_docres_restoration returns (status, original_pdf, enhanced_pdf,
            # metadata, file_paths); metadata has no bound component and is dropped.
            pdf_path = getattr(f, "name", f) if f else ""
            status_msg, original_pdf, enhanced_pdf, _metadata, files = run_docres_restoration(pdf_path, *args)
            return status_msg, original_pdf, enhanced_pdf, files

        run_docres_btn.click(
            fn=_docres_adapter,
            inputs=[pdf_docres, docres_task_standalone, docres_device_standalone, docres_dpi, docres_save_enhanced, docres_save_images],
            outputs=[docres_status, docres_original_pdf, docres_enhanced_pdf, docres_files_out]
        )
    # Enhanced Parser Tab
    with gr.Tab("Enhanced Parser"):
        with gr.Row():
            pdf_enhanced = gr.File(file_types=[".pdf"], label="PDF")
            use_image_restoration = gr.Checkbox(label="Use Image Restoration", value=True)
            restoration_task = gr.Dropdown(
                ["appearance", "dewarping", "deshadowing", "deblurring", "binarization", "end2end"],
                value="appearance",
                label="Restoration Task"
            )
            restoration_device = gr.Dropdown(
                ["auto", "cuda", "cpu"],
                value="auto",
                label="Restoration Device"
            )
        with gr.Row():
            use_vlm_enhanced = gr.Checkbox(label="Use VLM (optional)", value=False)
            vlm_provider_enhanced = gr.Dropdown(["openai", "anthropic", "openrouter", "ollama"], value="openai", label="VLM Provider")
            vlm_api_key_enhanced = gr.Textbox(type="password", label="VLM API Key", placeholder="Optional if VLM disabled")
        with gr.Accordion("Advanced Settings", open=False):
            with gr.Row():
                restoration_dpi = gr.Slider(100, 400, value=200, step=10, label="Restoration DPI")
                layout_model_enhanced = gr.Textbox(value="PP-DocLayout_plus-L", label="Layout model")
                dpi_enhanced = gr.Slider(100, 400, value=200, step=10, label="Processing DPI")
                min_score_enhanced = gr.Slider(0, 1, value=0.0, step=0.05, label="Min layout score")
            with gr.Row():
                ocr_lang_enhanced = gr.Textbox(value="eng", label="OCR Language")
                ocr_psm_enhanced = gr.Slider(0, 13, value=4, step=1, label="Tesseract PSM")
                ocr_oem_enhanced = gr.Slider(0, 3, value=3, step=1, label="Tesseract OEM")
            with gr.Row():
                ocr_config_enhanced = gr.Textbox(value="", label="Extra OCR config")
                box_sep_enhanced = gr.Textbox(value="\n", label="Box separator")
        run_enhanced_btn = gr.Button("▶ Run Enhanced Parse", variant="primary")
        enhanced_status = gr.Textbox(label="Status", elem_classes=["status-ok"])
        with gr.Row():
            with gr.Column():
                gr.Markdown("### 📄 Original PDF")
                enhanced_original_pdf = gr.File(label="Original PDF File", interactive=False, visible=False)
                enhanced_original_page_image = gr.Image(label="Original PDF Page", interactive=False, height=600)
            with gr.Column():
                gr.Markdown("### ✨ Enhanced PDF")
                enhanced_enhanced_pdf = gr.File(label="Enhanced PDF File", interactive=False, visible=False)
                enhanced_enhanced_page_image = gr.Image(label="Enhanced PDF Page", interactive=False, height=600)
        with gr.Row():
            enhanced_md_preview = gr.HTML(label="Extracted Content", visible=True, elem_classes=["page-content"])
        enhanced_files_out = gr.Files(label="Download individual output files")
        enhanced_zip_out = gr.File(label="Download all outputs (ZIP)")
        def _enhanced_adapter(f, *args):
            # run_enhanced_parse returns seven values; the trailing output-directory
            # string has no bound component here, so only the first six are passed on.
            pdf_path = getattr(f, "name", f) if f else ""
            return run_enhanced_parse(pdf_path, *args)[:6]

        run_enhanced_btn.click(
            fn=_enhanced_adapter,
            inputs=[
                pdf_enhanced, use_image_restoration, restoration_task, restoration_device, restoration_dpi,
                use_vlm_enhanced, vlm_provider_enhanced, vlm_api_key_enhanced, layout_model_enhanced,
                dpi_enhanced, min_score_enhanced, ocr_lang_enhanced, ocr_psm_enhanced, ocr_oem_enhanced,
                ocr_config_enhanced, box_sep_enhanced
            ],
            outputs=[
                enhanced_status, enhanced_md_preview, enhanced_files_out, enhanced_zip_out,
                enhanced_original_pdf, enhanced_enhanced_pdf
            ]
        )
    # Tips section
    gr.Markdown(create_tips_markdown())
if __name__ == "__main__":
    # Launch the interface
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", "7860")),
        share=False
    )