#!/usr/bin/env python3 """ PDF to Markdown Converter using MinerU (vendor/mineru) This is the main conversion script that uses the local MinerU installation """ import os import sys import logging import argparse from pathlib import Path import subprocess # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler(), logging.FileHandler('pdf_converter.log') ] ) logger = logging.getLogger(__name__) class PdfConverterResult: """Class representing the result of a PDF conversion""" def __init__(self, pdf_path: str, success: bool, md_path: str = None, time_taken: float = 0, error: str = None): self.pdf_path = pdf_path self.success = success self.md_path = md_path self.time_taken = time_taken self.error = error def __str__(self): if self.success: return f"āœ… Successfully converted {self.pdf_path} in {self.time_taken:.2f}s" else: return f"āŒ Failed to convert {self.pdf_path}: {self.error}" class MineruPdfConverter: """ PDF to Markdown converter using MinerU """ def __init__(self, output_dir: str = "output"): self.output_dir = output_dir os.makedirs(output_dir, exist_ok=True) def convert_file(self, pdf_path: str, delete_after: bool = False) -> PdfConverterResult: """Convert a single PDF file to Markdown using MinerU""" import time start_time = time.time() try: pdf_path = Path(pdf_path) if not pdf_path.exists(): return PdfConverterResult( str(pdf_path), False, error=f"File not found: {pdf_path}" ) logger.info(f"Processing: {pdf_path}") # Prepare output directory pdf_output_dir = os.path.join(self.output_dir, pdf_path.stem) # Run MinerU command cmd = [ "mineru", "-p", str(pdf_path), "-o", pdf_output_dir, "-m", "txt", # Use text mode "-f", "false", # Disable formula parsing for speed "-t", "false", # Disable table parsing for speed ] logger.info(f"Running command: {' '.join(cmd)}") # Execute MinerU result = subprocess.run(cmd, capture_output=True, text=True) if result.returncode != 0: error_msg = result.stderr if result.stderr else "Unknown error" return PdfConverterResult( str(pdf_path), False, error=error_msg ) # Find the generated markdown file md_path = None expected_md = Path(pdf_output_dir) / pdf_path.stem / "txt" / f"{pdf_path.stem}.md" if expected_md.exists(): md_path = str(expected_md) logger.info(f"āœ… Markdown file created: {md_path}") else: # Search for any .md file in the output directory for md_file in Path(pdf_output_dir).rglob("*.md"): md_path = str(md_file) logger.info(f"āœ… Found markdown file: {md_path}") break if not md_path: return PdfConverterResult( str(pdf_path), False, error="No markdown file generated" ) # Delete original PDF if requested if delete_after and pdf_path.exists(): pdf_path.unlink() logger.info(f"šŸ—‘ļø Deleted original PDF: {pdf_path}") elapsed_time = time.time() - start_time return PdfConverterResult( str(pdf_path), True, md_path=md_path, time_taken=elapsed_time ) except Exception as e: logger.error(f"Error processing {pdf_path}: {e}") import traceback traceback.print_exc() return PdfConverterResult( str(pdf_path), False, error=str(e) ) class BatchProcessor: """Process multiple PDF files in batch""" def __init__(self, batch_dir: str = "batch-files", output_dir: str = "output", workers: int = 1, delete_after: bool = False): self.batch_dir = batch_dir self.output_dir = output_dir self.workers = workers self.delete_after = delete_after self.converter = MineruPdfConverter(output_dir) def find_pdf_files(self) -> list[Path]: """Find all PDF files in the batch directory""" pdf_files = [] batch_path = Path(self.batch_dir) if not batch_path.exists(): logger.warning(f"Batch directory not found: {self.batch_dir}") return pdf_files # Find all PDFs recursively pdf_files = list(batch_path.rglob("*.pdf")) logger.info(f"Found {len(pdf_files)} PDF files in {self.batch_dir}") return pdf_files def process_batch(self) -> tuple[int, int]: """Process all PDFs in the batch directory""" pdf_files = self.find_pdf_files() if not pdf_files: logger.info("No PDF files found to process") return 0, 0 successful = 0 failed = 0 logger.info(f"Starting batch processing of {len(pdf_files)} files...") # Process files sequentially (MinerU already handles parallelism internally) for pdf_file in pdf_files: result = self.converter.convert_file(str(pdf_file), self.delete_after) if result.success: successful += 1 logger.info(f"āœ… {result}") else: failed += 1 logger.error(f"āŒ {result}") return successful, failed def main(): """Main entry point""" parser = argparse.ArgumentParser( description="Convert PDF files to Markdown using MinerU", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Convert a single PDF %(prog)s convert path/to/file.pdf # Batch convert all PDFs in batch-files directory %(prog)s batch # Batch convert with custom settings %(prog)s batch --batch-dir /path/to/pdfs --output-dir /path/to/output --workers 4 # Delete PDFs after successful conversion %(prog)s batch --delete-after """ ) subparsers = parser.add_subparsers(dest='command', help='Command to run') # Convert command convert_parser = subparsers.add_parser('convert', help='Convert a single PDF file') convert_parser.add_argument('pdf_file', help='Path to PDF file') convert_parser.add_argument('--output-dir', default='output', help='Output directory') convert_parser.add_argument('--delete-after', action='store_true', help='Delete PDF after successful conversion') # Batch command batch_parser = subparsers.add_parser('batch', help='Batch convert PDF files') batch_parser.add_argument('--batch-dir', default='batch-files', help='Directory containing PDF files') batch_parser.add_argument('--output-dir', default='output', help='Output directory') batch_parser.add_argument('--workers', type=int, default=1, help='Number of parallel workers') batch_parser.add_argument('--delete-after', action='store_true', help='Delete PDFs after successful conversion') args = parser.parse_args() # Auto-detect command if none specified if not args.command: # If first argument looks like a file, assume convert command if len(sys.argv) > 1 and (sys.argv[1].endswith('.pdf') or Path(sys.argv[1]).exists()): args.command = 'convert' args.pdf_file = sys.argv[1] args.output_dir = 'output' args.delete_after = False else: # Default to batch mode args.command = 'batch' args.batch_dir = 'batch-files' args.output_dir = 'output' args.workers = 1 args.delete_after = False # Execute command if args.command == 'convert': converter = MineruPdfConverter(args.output_dir) result = converter.convert_file(args.pdf_file, args.delete_after) print(result) sys.exit(0 if result.success else 1) elif args.command == 'batch': processor = BatchProcessor( args.batch_dir, args.output_dir, args.workers, args.delete_after ) successful, failed = processor.process_batch() print(f"\nšŸ“Š Batch processing complete:") print(f" āœ… Successful: {successful}") print(f" āŒ Failed: {failed}") print(f" šŸ“ Output directory: {args.output_dir}") sys.exit(0 if failed == 0 else 1) else: parser.print_help() sys.exit(1) if __name__ == "__main__": main()