mineru2 / pdf_converter_mineru.py
marcosremar2's picture
Add PDF conversion API endpoints
550ec39
#!/usr/bin/env python3
"""
PDF to Markdown Converter using MinerU (vendor/mineru)
This is the main conversion script that uses the local MinerU installation
"""
import argparse
import logging
import os
import subprocess
import sys
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Optional, Sequence
# Configure logging: every record goes both to the console and to
# 'pdf_converter.log' in the current working directory.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        # Log messages in this module contain emoji. Pin the file encoding to
        # UTF-8 so writing them does not depend on the platform's default
        # (possibly non-Unicode) locale encoding.
        logging.FileHandler('pdf_converter.log', encoding='utf-8'),
    ],
)
logger = logging.getLogger(__name__)
class PdfConverterResult:
    """Outcome of converting a single PDF file.

    Attributes:
        pdf_path: Path of the source PDF as a string.
        success: True when the conversion produced a Markdown file.
        md_path: Path of the generated Markdown file, or None.
        time_taken: Elapsed time in seconds (defaults to 0).
        error: Human-readable failure description, or None.
    """

    def __init__(self, pdf_path: str, success: bool, md_path: Optional[str] = None,
                 time_taken: float = 0, error: Optional[str] = None):
        self.pdf_path = pdf_path
        self.success = success
        self.md_path = md_path
        self.time_taken = time_taken
        self.error = error

    def __repr__(self) -> str:
        """Return an unambiguous representation for debugging."""
        return (
            f"{type(self).__name__}(pdf_path={self.pdf_path!r}, "
            f"success={self.success!r}, md_path={self.md_path!r}, "
            f"time_taken={self.time_taken!r}, error={self.error!r})"
        )

    def __str__(self) -> str:
        """Return a one-line, user-facing summary of the result."""
        if self.success:
            return f"✅ Successfully converted {self.pdf_path} in {self.time_taken:.2f}s"
        return f"❌ Failed to convert {self.pdf_path}: {self.error}"
class MineruPdfConverter:
    """PDF to Markdown converter that shells out to the ``mineru`` command."""

    def __init__(self, output_dir: str = "output"):
        """Store *output_dir* and create it if it does not exist yet."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _expected_markdown(pdf_output_dir, stem: str) -> Path:
        """Return the path where the Markdown for *stem* is expected."""
        return Path(pdf_output_dir) / stem / "txt" / f"{stem}.md"

    @staticmethod
    def _locate_markdown(pdf_output_dir, stem: str) -> Optional[Path]:
        """Find the Markdown file generated under *pdf_output_dir*.

        Preference order: the expected ``<stem>/txt/<stem>.md`` location, then
        any ``<stem>.md`` in the tree, then the first ``*.md`` in sorted order.
        Sorting makes the fallback deterministic instead of depending on
        directory-listing order. Returns None when nothing is found.
        """
        root = Path(pdf_output_dir)
        expected = MineruPdfConverter._expected_markdown(root, stem)
        if expected.exists():
            return expected
        if not root.is_dir():
            return None
        candidates = sorted(root.rglob("*.md"))
        if not candidates:
            return None
        same_name = [path for path in candidates if path.stem == stem]
        return same_name[0] if same_name else candidates[0]

    def convert_file(self, pdf_path: str, delete_after: bool = False) -> "PdfConverterResult":
        """Convert a single PDF file to Markdown using MinerU.

        Args:
            pdf_path: Path of the PDF to convert.
            delete_after: Delete the source PDF after a successful conversion.

        Returns:
            A PdfConverterResult. Errors are reported through the result
            (``success=False`` and ``error`` set) rather than raised.
        """
        start_time = time.time()

        def failed(message: str):
            # Failures carry the elapsed time as well, not only successes.
            return PdfConverterResult(
                str(pdf_path), False,
                time_taken=time.time() - start_time, error=message,
            )

        try:
            pdf_path = Path(pdf_path)
            if not pdf_path.exists():
                return failed(f"File not found: {pdf_path}")

            logger.info(f"Processing: {pdf_path}")

            # Each PDF gets its own output sub-directory named after its stem.
            pdf_output_dir = os.path.join(self.output_dir, pdf_path.stem)

            # Run MinerU command
            cmd = [
                "mineru",
                "-p", str(pdf_path),
                "-o", pdf_output_dir,
                "-m", "txt",    # Use text mode
                "-f", "false",  # Disable formula parsing for speed
                "-t", "false",  # Disable table parsing for speed
            ]
            logger.info(f"Running command: {' '.join(cmd)}")

            # errors="replace": undecodable bytes in the tool's output must not
            # raise UnicodeDecodeError and turn a good run into a failure.
            result = subprocess.run(
                cmd, capture_output=True, text=True, errors="replace"
            )
            if result.returncode != 0:
                # Some failures only write to stdout; never lose the exit code.
                error_msg = (
                    result.stderr.strip()
                    or result.stdout.strip()
                    or f"mineru exited with code {result.returncode}"
                )
                return failed(error_msg)

            # Find the generated markdown file
            md_file = self._locate_markdown(pdf_output_dir, pdf_path.stem)
            if md_file is None:
                return failed("No markdown file generated")
            md_path = str(md_file)
            if md_file == self._expected_markdown(pdf_output_dir, pdf_path.stem):
                logger.info(f"✅ Markdown file created: {md_path}")
            else:
                logger.info(f"✅ Found markdown file: {md_path}")

            # Delete original PDF if requested
            if delete_after and pdf_path.exists():
                pdf_path.unlink()
                logger.info(f"🗑️ Deleted original PDF: {pdf_path}")

            return PdfConverterResult(
                str(pdf_path), True, md_path=md_path,
                time_taken=time.time() - start_time,
            )
        except Exception as e:
            # Boundary handler: one bad file must not abort a whole batch.
            # logger.exception records the traceback through the configured
            # handlers (console and log file) instead of raw stderr only.
            logger.exception(f"Error processing {pdf_path}: {e}")
            return failed(str(e))
class BatchProcessor:
    """Process multiple PDF files in batch."""

    def __init__(self, batch_dir: str = "batch-files", output_dir: str = "output",
                 workers: int = 1, delete_after: bool = False):
        """Configure the batch run.

        Args:
            batch_dir: Directory searched recursively for PDF files.
            output_dir: Directory handed to the converter for its output.
            workers: Number of PDFs converted concurrently; 1 (the default)
                processes files one after another.
            delete_after: Delete each PDF after it was converted successfully.
        """
        self.batch_dir = batch_dir
        self.output_dir = output_dir
        self.workers = workers
        self.delete_after = delete_after
        self.converter = MineruPdfConverter(output_dir)

    def find_pdf_files(self) -> list[Path]:
        """Return all PDF files below the batch directory, sorted by path."""
        batch_path = Path(self.batch_dir)
        if not batch_path.exists():
            logger.warning(f"Batch directory not found: {self.batch_dir}")
            return []

        # Compare the suffix case-insensitively so 'REPORT.PDF' is picked up
        # on case-sensitive filesystems; sort for a reproducible order.
        pdf_files = sorted(
            path for path in batch_path.rglob("*")
            if path.suffix.lower() == ".pdf" and path.is_file()
        )
        logger.info(f"Found {len(pdf_files)} PDF files in {self.batch_dir}")
        return pdf_files

    def process_batch(self) -> tuple[int, int]:
        """Convert every PDF in the batch directory.

        Returns:
            A ``(successful, failed)`` tuple of file counts.
        """
        pdf_files = self.find_pdf_files()
        if not pdf_files:
            logger.info("No PDF files found to process")
            return 0, 0

        logger.info(f"Starting batch processing of {len(pdf_files)} files...")

        def convert(pdf_file: Path):
            return self.converter.convert_file(str(pdf_file), self.delete_after)

        # With the default workers=1, files are processed sequentially (MinerU
        # already handles parallelism internally). A larger value is honoured
        # instead of being silently ignored: each worker thread just waits on
        # its own mineru subprocess.
        worker_count = min(self.workers or 1, len(pdf_files))
        executor = ThreadPoolExecutor(max_workers=worker_count) if worker_count > 1 else None

        successful = 0
        failed = 0
        try:
            results = executor.map(convert, pdf_files) if executor else map(convert, pdf_files)
            for result in results:
                if result.success:
                    successful += 1
                    logger.info(f"✅ {result}")
                else:
                    failed += 1
                    logger.error(f"❌ {result}")
        finally:
            if executor is not None:
                executor.shutdown()

        return successful, failed
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser with the 'convert' and 'batch' sub-commands."""
    parser = argparse.ArgumentParser(
        description="Convert PDF files to Markdown using MinerU",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert a single PDF
  %(prog)s convert path/to/file.pdf
  # Batch convert all PDFs in batch-files directory
  %(prog)s batch
  # Batch convert with custom settings
  %(prog)s batch --batch-dir /path/to/pdfs --output-dir /path/to/output --workers 4
  # Delete PDFs after successful conversion
  %(prog)s batch --delete-after
"""
    )
    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Convert command
    convert_parser = subparsers.add_parser('convert', help='Convert a single PDF file')
    convert_parser.add_argument('pdf_file', help='Path to PDF file')
    convert_parser.add_argument('--output-dir', default='output', help='Output directory')
    convert_parser.add_argument('--delete-after', action='store_true',
                                help='Delete PDF after successful conversion')

    # Batch command
    batch_parser = subparsers.add_parser('batch', help='Batch convert PDF files')
    batch_parser.add_argument('--batch-dir', default='batch-files',
                              help='Directory containing PDF files')
    batch_parser.add_argument('--output-dir', default='output',
                              help='Output directory')
    batch_parser.add_argument('--workers', type=int, default=1,
                              help='Number of parallel workers')
    batch_parser.add_argument('--delete-after', action='store_true',
                              help='Delete PDFs after successful conversion')
    return parser


def _with_default_command(argv: list) -> list:
    """Insert the implied sub-command when the user omitted it.

    No arguments means batch mode with defaults. A first argument that is not
    a known command or an option, and that ends in '.pdf' or names an existing
    path, means 'convert'. Anything else is returned unchanged so argparse can
    report the error.
    """
    if not argv:
        return ['batch']
    first = argv[0]
    if first in ('convert', 'batch') or first.startswith('-'):
        return argv
    if first.lower().endswith('.pdf') or Path(first).exists():
        return ['convert', *argv]
    return argv


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Main entry point.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``.

    Exits the process with status 0 when everything converted successfully
    and 1 otherwise.
    """
    parser = _build_parser()

    # The command must be auto-detected BEFORE parsing: argparse rejects a
    # first positional such as 'file.pdf' as an invalid sub-command choice, so
    # a check made after parse_args() can never see it.
    raw_args = list(sys.argv[1:] if argv is None else argv)
    args = parser.parse_args(_with_default_command(raw_args))

    # Execute command
    if args.command == 'convert':
        converter = MineruPdfConverter(args.output_dir)
        result = converter.convert_file(args.pdf_file, args.delete_after)
        print(result)
        sys.exit(0 if result.success else 1)
    elif args.command == 'batch':
        processor = BatchProcessor(
            args.batch_dir,
            args.output_dir,
            args.workers,
            args.delete_after
        )
        successful, failed = processor.process_batch()
        print("\n📊 Batch processing complete:")
        print(f" ✅ Successful: {successful}")
        print(f" ❌ Failed: {failed}")
        print(f" 📁 Output directory: {args.output_dir}")
        sys.exit(0 if failed == 0 else 1)
    else:
        # Defensive fallback; a command is always set after normalisation.
        parser.print_help()
        sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()