# ai-sl-api / document_parsing.py
# deenasun's picture
# set up app.py with fully integrated text to ASL video flow
# dbca390
# raw
# history blame
# 8.1 kB
import os
import sys
from pathlib import Path
from typing import Optional, Union
import logging
# Import document parsing libraries
try:
import PyPDF2
from docx import Document
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
except ImportError as e:
print(f"Missing required dependency: {e}")
print("Please install dependencies with: pip install -r requirements.txt")
sys.exit(1)
# Configure logging: set up the root logger at INFO level when this module is
# imported (basicConfig only takes effect if the root logger has no handlers
# yet), then create the module-level logger used by the code below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class DocumentParser:
    """
    Parse and extract plain text from several document formats.

    Supports PDF, TXT, DOC, DOCX, and EPUB files. The format is chosen from
    the file extension only; file contents are not inspected. Legacy DOC
    support shells out to the external ``antiword`` or ``catdoc`` tools.
    """

    # Lower-cased file extension -> MIME type used as the dispatch key.
    _EXTENSION_TO_MIME = {
        '.pdf': 'application/pdf',
        '.txt': 'text/plain',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.doc': 'application/msword',
        '.epub': 'application/epub+zip',
    }

    # External converters tried, in this order, for legacy .doc files.
    _DOC_CONVERTERS = ('antiword', 'catdoc')

    def __init__(self):
        """Build the MIME type -> parser method dispatch table."""
        self.supported_formats = {
            'application/pdf': self._parse_pdf,
            'text/plain': self._parse_txt,
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': self._parse_docx,
            'application/msword': self._parse_doc,
            'application/epub+zip': self._parse_epub,
        }

    def get_file_type(self, file_path: Union[str, Path]) -> str:
        """
        Detect the MIME type of a file using its file extension.

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' for an unrecognised extension
        """
        return self._get_mime_from_extension(file_path)

    def _get_mime_from_extension(self, file_path: Union[str, Path]) -> str:
        """
        Determine MIME type from the file extension (case-insensitive).

        Args:
            file_path: Path to the file

        Returns:
            MIME type string, or 'unknown' for an unrecognised extension
        """
        extension = Path(file_path).suffix.lower()
        return self._EXTENSION_TO_MIME.get(extension, 'unknown')

    def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
        """
        Extract text from a document file.

        Never raises for parser errors: every failure is logged and
        reported as ``None``.

        Args:
            file_path: Path to the document file

        Returns:
            Extracted text as string, or None if the file is missing, its
            type is unsupported, or the format-specific parser fails
        """
        file_path = Path(file_path)
        if not file_path.exists():
            logger.error("File not found: %s", file_path)
            return None

        mime_type = self.get_file_type(file_path)
        logger.info("Detected file type: %s", mime_type)

        handler = self.supported_formats.get(mime_type)
        if handler is None:
            logger.error("Unsupported file type: %s", mime_type)
            return None

        try:
            return handler(file_path)
        except Exception as e:
            # Public boundary: parsers raise, callers of extract_text get None.
            logger.error("Error extracting text from %s: %s", file_path, e)
            return None

    def _parse_pdf(self, file_path: Path) -> str:
        """
        Extract text from PDF file.

        Pages for which no text is returned are skipped; the remaining
        page texts are joined with newlines.

        Args:
            file_path: Path to PDF file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error raised while opening or reading the PDF
                is logged and re-raised unchanged.
        """
        try:
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                page_texts = [page.extract_text() for page in pdf_reader.pages]
        except Exception as e:
            logger.error("Error parsing PDF %s: %s", file_path, e)
            raise
        return "\n".join(text for text in page_texts if text).strip()

    def _parse_txt(self, file_path: Path) -> str:
        """
        Extract text from plain text file.

        The file is decoded as UTF-8 first (a leading byte-order mark, if
        present, is dropped); if that fails it is re-read as latin-1.

        Args:
            file_path: Path to text file

        Returns:
            File contents, not stripped

        Raises:
            Exception: Any error raised while reading the file is logged
                and re-raised unchanged.
        """
        try:
            try:
                # 'utf-8-sig' reads plain UTF-8 too, but does not leak a
                # leading BOM ('\ufeff') into the extracted text.
                with open(file_path, 'r', encoding='utf-8-sig') as file:
                    return file.read()
            except UnicodeDecodeError:
                # Not valid UTF-8: latin-1 maps every byte to a character,
                # so this second attempt cannot fail on decoding.
                with open(file_path, 'r', encoding='latin-1') as file:
                    return file.read()
        except Exception as e:
            logger.error("Error reading text file %s: %s", file_path, e)
            raise

    def _parse_docx(self, file_path: Path) -> str:
        """
        Extract text from DOCX file.

        Text is taken from ``doc.paragraphs`` only, one paragraph per line.

        Args:
            file_path: Path to DOCX file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error raised while loading the document is
                logged and re-raised unchanged.
        """
        try:
            doc = Document(file_path)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()
        except Exception as e:
            logger.error("Error parsing DOCX %s: %s", file_path, e)
            raise

    def _parse_doc(self, file_path: Path) -> str:
        """
        Extract text from DOC file (legacy Word format).

        Note: This requires an external converter, antiword or catdoc. Each
        is tried in turn; a converter that is not installed, or that exits
        with a non-zero status, is skipped in favour of the next one.

        Args:
            file_path: Path to DOC file

        Returns:
            Converter output with surrounding whitespace stripped

        Raises:
            RuntimeError: If no converter is installed, or every installed
                converter failed. (A subclass of Exception, so existing
                ``except Exception`` handlers keep working.)
        """
        # Local import: only needed when a legacy .doc file is parsed.
        import subprocess

        failures = []
        for tool in self._DOC_CONVERTERS:
            try:
                result = subprocess.run([tool, str(file_path)],
                                        capture_output=True, text=True)
            except FileNotFoundError:
                # This converter is not installed; the next one may be.
                continue
            if result.returncode == 0:
                return result.stdout.strip()
            failures.append(
                f"{tool} exited with code {result.returncode}: {result.stderr.strip()}"
            )

        if failures:
            # At least one converter ran, so report why it failed instead
            # of claiming that nothing is installed.
            message = "Could not convert DOC file: " + "; ".join(failures)
        else:
            message = ("antiword or catdoc not found. "
                       "Please install one of them for DOC file support.")
        logger.error("Error parsing DOC %s: %s", file_path, message)
        raise RuntimeError(message)

    def _parse_epub(self, file_path: Path) -> str:
        """
        Extract text from EPUB file.

        Every item of type ``ebooklib.ITEM_DOCUMENT`` is decoded as UTF-8,
        reduced to its text with BeautifulSoup, and the results are joined
        with newlines.

        Args:
            file_path: Path to EPUB file

        Returns:
            Extracted text with surrounding whitespace stripped

        Raises:
            Exception: Any error raised while reading or decoding the book
                is logged and re-raised unchanged.
        """
        try:
            book = epub.read_epub(file_path)
            chapters = []
            for item in book.get_items():
                if item.get_type() != ebooklib.ITEM_DOCUMENT:
                    continue
                content = item.get_content().decode('utf-8')
                chapters.append(BeautifulSoup(content, 'html.parser').get_text())
            return "\n".join(chapters).strip()
        except Exception as e:
            logger.error("Error parsing EPUB %s: %s", file_path, e)
            raise
def main():
    """
    Command-line entry point demonstrating usage of the DocumentParser.

    Expects exactly one argument, the path of the document to read. Prints
    the extracted text and its character count. Exits with status 1 when
    the argument count is wrong or when extraction fails.
    """
    if len(sys.argv) != 2:
        print("Usage: python document_parsing.py <file_path>")
        print("Supported formats: PDF, TXT, DOC, DOCX, EPUB")
        sys.exit(1)

    file_path = sys.argv[1]
    parser = DocumentParser()

    print(f"Extracting text from: {file_path}")
    print("-" * 50)

    extracted_text = parser.extract_text(file_path)
    # extract_text reports failure with None. An empty string is a
    # successful extraction from a document with no text, so it must not
    # be treated as an error.
    if extracted_text is None:
        print("Failed to extract text from the file.")
        sys.exit(1)

    print("Extracted text:")
    print(extracted_text)
    print(f"\nTotal characters: {len(extracted_text)}")


if __name__ == "__main__":
    main()