Spaces:
Running
Running
#!/usr/bin/env python3
"""
PDF to Markdown Converter using MinerU (vendor/mineru)
This is the main conversion script that uses the local MinerU installation
"""
import argparse
import logging
import os
import subprocess
import sys
import time
from pathlib import Path
from typing import Optional
# Configure logging: INFO and above go to the console (StreamHandler) and to
# 'pdf_converter.log' in the current working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        # The log messages contain non-ASCII status glyphs; pin UTF-8 so the
        # log file does not depend on the platform's default encoding.
        logging.FileHandler('pdf_converter.log', encoding='utf-8'),
    ],
)
logger = logging.getLogger(__name__)
class PdfConverterResult:
    """Outcome of converting one PDF file.

    Attributes:
        pdf_path: Path of the source PDF, as a string.
        success: True when a Markdown file was produced.
        md_path: Path of the generated Markdown file, or None.
        time_taken: Elapsed conversion time in seconds (0 if not measured).
        error: Failure description, or None.
    """

    def __init__(self, pdf_path: str, success: bool, md_path: Optional[str] = None,
                 time_taken: float = 0, error: Optional[str] = None):
        self.pdf_path = pdf_path
        self.success = success
        self.md_path = md_path
        self.time_taken = time_taken
        self.error = error

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(pdf_path={self.pdf_path!r}, "
            f"success={self.success!r}, md_path={self.md_path!r}, "
            f"time_taken={self.time_taken!r}, error={self.error!r})"
        )

    def __str__(self) -> str:
        """Return a one-line, human-readable summary of the result."""
        # Distinct glyphs for the two outcomes so they can be told apart at a
        # glance in console output and logs.
        if self.success:
            return f"✅ Successfully converted {self.pdf_path} in {self.time_taken:.2f}s"
        return f"❌ Failed to convert {self.pdf_path}: {self.error}"
class MineruPdfConverter:
    """PDF to Markdown converter that shells out to the ``mineru`` command.

    Each PDF gets its own sub-directory of ``output_dir``, named after the
    PDF's stem.
    """

    def __init__(self, output_dir: str = "output"):
        """Remember ``output_dir`` and create it if it does not exist yet."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _build_command(pdf_path: Path, pdf_output_dir: str) -> list[str]:
        """Return the ``mineru`` argument list for converting one PDF."""
        return [
            "mineru",
            "-p", str(pdf_path),
            "-o", pdf_output_dir,
            "-m", "txt",    # Use text mode
            "-f", "false",  # Disable formula parsing for speed
            "-t", "false",  # Disable table parsing for speed
        ]

    @staticmethod
    def _locate_markdown(pdf_output_dir: str, stem: str) -> Optional[str]:
        """Return the path of the generated Markdown file, or None if absent."""
        expected_md = Path(pdf_output_dir) / stem / "txt" / f"{stem}.md"
        if expected_md.exists():
            logger.info("✅ Markdown file created: %s", expected_md)
            return str(expected_md)

        # Fall back to any .md file below the output directory; sort so the
        # choice is deterministic when several exist.
        for md_file in sorted(Path(pdf_output_dir).rglob("*.md")):
            logger.info("✅ Found markdown file: %s", md_file)
            return str(md_file)
        return None

    def convert_file(self, pdf_path: str, delete_after: bool = False) -> "PdfConverterResult":
        """Convert a single PDF file to Markdown using MinerU.

        Args:
            pdf_path: Path of the PDF to convert.
            delete_after: Delete the source PDF once a Markdown file was found.

        Returns:
            A ``PdfConverterResult``. Failures (missing input, non-zero exit
            status, no Markdown produced, or any exception) are reported via
            ``success=False`` and ``error`` rather than raised. ``time_taken``
            is filled in for failures as well as successes.
        """
        started = time.perf_counter()

        def elapsed() -> float:
            return time.perf_counter() - started

        def failure(message: str) -> "PdfConverterResult":
            return PdfConverterResult(
                str(pdf_path), False, time_taken=elapsed(), error=message
            )

        try:
            pdf_path = Path(pdf_path)
            if not pdf_path.exists():
                return failure(f"File not found: {pdf_path}")

            logger.info("Processing: %s", pdf_path)

            # Prepare output directory
            pdf_output_dir = os.path.join(self.output_dir, pdf_path.stem)

            # Run MinerU command
            cmd = self._build_command(pdf_path, pdf_output_dir)
            logger.info("Running command: %s", " ".join(cmd))
            completed = subprocess.run(cmd, capture_output=True, text=True)
            if completed.returncode != 0:
                return failure(completed.stderr or "Unknown error")

            md_path = self._locate_markdown(pdf_output_dir, pdf_path.stem)
            if not md_path:
                return failure("No markdown file generated")

            # Delete original PDF if requested
            if delete_after and pdf_path.exists():
                pdf_path.unlink()
                logger.info("🗑️ Deleted original PDF: %s", pdf_path)

            return PdfConverterResult(
                str(pdf_path), True, md_path=md_path, time_taken=elapsed()
            )
        except Exception as e:
            # Boundary handler: one bad file must not abort a batch run.
            # logger.exception records the traceback through the configured
            # handlers instead of printing it straight to stderr.
            logger.exception("Error processing %s: %s", pdf_path, e)
            return failure(str(e))
class BatchProcessor:
    """Convert every PDF found under a directory tree, one file at a time."""

    def __init__(self, batch_dir: str = "batch-files", output_dir: str = "output",
                 workers: int = 1, delete_after: bool = False):
        """Store the batch settings and create the underlying converter.

        Args:
            batch_dir: Directory searched recursively for PDF files.
            output_dir: Output directory handed to ``MineruPdfConverter``.
            workers: Kept for interface compatibility; files are processed
                sequentially, so values above 1 have no effect.
            delete_after: Forwarded to ``convert_file`` for every PDF.
        """
        self.batch_dir = batch_dir
        self.output_dir = output_dir
        self.workers = workers
        self.delete_after = delete_after
        self.converter = MineruPdfConverter(output_dir)

    def find_pdf_files(self) -> list[Path]:
        """Return all PDF files below the batch directory, sorted by path.

        Returns an empty list (and logs a warning) when the batch directory
        does not exist.
        """
        batch_path = Path(self.batch_dir)
        if not batch_path.exists():
            logger.warning(f"Batch directory not found: {self.batch_dir}")
            return []

        # Match on the lower-cased suffix so '.PDF' is found on case-sensitive
        # filesystems too, skip directories that merely end in '.pdf', and
        # sort so runs are reproducible.
        pdf_files = sorted(
            candidate
            for candidate in batch_path.rglob("*")
            if candidate.is_file() and candidate.suffix.lower() == ".pdf"
        )
        logger.info(f"Found {len(pdf_files)} PDF files in {self.batch_dir}")
        return pdf_files

    def process_batch(self) -> tuple[int, int]:
        """Convert all PDFs in the batch directory.

        Returns:
            ``(successful, failed)`` counts; ``(0, 0)`` when nothing was found.
        """
        pdf_files = self.find_pdf_files()
        if not pdf_files:
            logger.info("No PDF files found to process")
            return 0, 0

        if self.workers > 1:
            # Make the ignored setting visible instead of silently dropping it.
            logger.warning(
                "workers=%s requested, but files are processed sequentially; "
                "the setting is ignored",
                self.workers,
            )

        successful = 0
        failed = 0
        logger.info(f"Starting batch processing of {len(pdf_files)} files...")

        # Process files sequentially (MinerU already handles parallelism internally)
        for pdf_file in pdf_files:
            result = self.converter.convert_file(str(pdf_file), self.delete_after)
            # str(result) already starts with its own status glyph, so it is
            # logged as-is rather than prefixed a second time.
            if result.success:
                successful += 1
                logger.info("%s", result)
            else:
                failed += 1
                logger.error("%s", result)

        return successful, failed
def main(argv=None):
    """Command-line entry point: parse arguments and run ``convert`` or ``batch``.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]`` when None.

    The process always ends via ``sys.exit``: status 0 when the single
    conversion succeeded or no batch file failed, 1 otherwise. argparse exits
    with status 2 on usage errors.
    """
    argv = sys.argv[1:] if argv is None else list(argv)
    commands = ('convert', 'batch')

    parser = argparse.ArgumentParser(
        description="Convert PDF files to Markdown using MinerU",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert a single PDF
  %(prog)s convert path/to/file.pdf

  # Batch convert all PDFs in batch-files directory
  %(prog)s batch

  # Batch convert with custom settings
  %(prog)s batch --batch-dir /path/to/pdfs --output-dir /path/to/output --workers 4

  # Delete PDFs after successful conversion
  %(prog)s batch --delete-after
"""
    )
    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Convert command
    convert_parser = subparsers.add_parser('convert', help='Convert a single PDF file')
    convert_parser.add_argument('pdf_file', help='Path to PDF file')
    convert_parser.add_argument('--output-dir', default='output', help='Output directory')
    convert_parser.add_argument('--delete-after', action='store_true',
                                help='Delete PDF after successful conversion')

    # Batch command
    batch_parser = subparsers.add_parser('batch', help='Batch convert PDF files')
    batch_parser.add_argument('--batch-dir', default='batch-files',
                              help='Directory containing PDF files')
    batch_parser.add_argument('--output-dir', default='output',
                              help='Output directory')
    batch_parser.add_argument('--workers', type=int, default=1,
                              help='Number of parallel workers')
    batch_parser.add_argument('--delete-after', action='store_true',
                              help='Delete PDFs after successful conversion')

    # Auto-detect the command BEFORE parsing. argparse rejects an unknown
    # first positional with "invalid choice", so the shorthand forms have to
    # be rewritten into their explicit equivalents up front.
    if not argv:
        # No arguments at all: default to batch mode with its defaults.
        argv = ['batch']
    elif argv[0] not in commands and not argv[0].startswith('-'):
        # If the first argument looks like a file, assume the convert command.
        if argv[0].lower().endswith('.pdf') or Path(argv[0]).exists():
            argv = ['convert', *argv]

    args = parser.parse_args(argv)

    # Execute command
    if args.command == 'convert':
        converter = MineruPdfConverter(args.output_dir)
        result = converter.convert_file(args.pdf_file, args.delete_after)
        print(result)
        sys.exit(0 if result.success else 1)

    if args.command == 'batch':
        processor = BatchProcessor(
            args.batch_dir,
            args.output_dir,
            args.workers,
            args.delete_after
        )
        successful, failed = processor.process_batch()
        print("\n📊 Batch processing complete:")
        print(f"  ✅ Successful: {successful}")
        print(f"  ❌ Failed: {failed}")
        print(f"  📁 Output directory: {args.output_dir}")
        sys.exit(0 if failed == 0 else 1)

    # Defensive fallback: no recognised command was selected.
    parser.print_help()
    sys.exit(1)


if __name__ == "__main__":
    main()