Spaces:

OrganizedProgrammers
/

Docxtract

Running

App Files Community

Lucas ARRIESSE committed 2 days ago

Commit

2dc9b4d

1 Parent(s): d952e74

Extract first supported file type in archive

Browse files

Files changed (1) hide show

api/docs.py +38 -22

api/docs.py CHANGED Viewed

@@ -41,11 +41,18 @@ KREUZBERG_CONFIG: ExtractionConfig = ExtractionConfig(
 # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
 LO_CONVERSION_MUTEX = asyncio.Semaphore(1)
-async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
     """
     Converts the given file bytes using Libreoffice headless to the specified file type.
-    This is an asynchronous version.
     Args:
         contents: File contents
@@ -105,13 +112,16 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
         return out_bytes
-async def convert_to_txt(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
-    """Convert given file represented as a (filename, ext, bytes) to a list of lines"""
     final_text: str = None
     if ext == ".doc":
         logging.debug(f"Converting {filename} .doc --> .docx")
-        docx_bytes = await convert_file(bytes, filename, "doc", "docx")
         extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
         final_text = extracted_data.content
     elif ext == ".docx":
@@ -122,11 +132,11 @@ async def convert_to_txt(filename: str, ext: str, bytes: io.BytesIO) -> list[str
         final_text = extracted_data.content
     elif ext == ".ppt":
         logging.debug(f"Converting {filename} .ppt --> .pptx")
-        docx_bytes = await convert_file(bytes, filename, "ppt", "pptx")
         extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
         final_text = extracted_data.content
     else:
-        if ext in FORMAT_MIME_TYPES:  # file extension is supported
             extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
             final_text = extracted_data.content
         else:
@@ -146,14 +156,17 @@ FTP_MAX_PARALLEL_WORKERS = asyncio.Semaphore(4)
 async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
-    """Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
     async with FTP_DOWNLOAD_RATE_LIMITER:
         async with FTP_MAX_PARALLEL_WORKERS:
             if not url.endswith("zip"):
                 raise ValueError("URL doit pointer vers un fichier ZIP")
-            doc_id = os.path.splitext(os.path.basename(url))[0]
             resp = await client.get(url, headers={
                 "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
             })
@@ -168,10 +181,20 @@ async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.B
                     file_name = entry.filename
                     root, ext = os.path.splitext(file_name)
                     doc_bytes = zf.read(file_name)
-                    return (root, ext.lower(), io.BytesIO(doc_bytes))
-            raise ValueError("Aucun fichier trouvé dans l'archive")
 def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
@@ -235,13 +258,6 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
     output.seek(0)
     return output
-FORMAT_MIME_TYPES = {
-    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-    ".pdf": "application/pdf",
-    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation"
-}
 # ============================================= Doc routes =========================================================
@@ -356,7 +372,7 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
         """Attempts to convert a document to text and returns success status and content."""
         try:
             filename, ext, bytes = await get_doc_archive(item.url, http_client)
-            text_lines = await convert_to_txt(filename, ext, bytes)
             content_bytes = "\n".join(text_lines).encode("utf-8")
             return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
         except Exception as e:
@@ -410,12 +426,12 @@ async def download_user_docs(files: list[UploadFile] = File(...)):
     filenames = [file["filename"] for file in file_infos]
     logging.info(f"Got {len(file_infos)} user files to convert.")
-    logging.info(f"Filenames: {filenames}")
     # convert files to text
     async def _process_single_document(item: dict):
         try:
-            text_lines = await convert_to_txt(item["filename"], item["extension"], item["content"])
             content_bytes = "\n".join(text_lines).encode("utf-8")
             return {"doc_id": item["filename"], "content": content_bytes}
         except Exception as e:
@@ -478,7 +494,7 @@ async def extract_requirements_from_docs(req: ExtractRequirementsRequest, llm_ro
         # convert the docx to txt for use
         try:
             filename, ext, bytes = await get_doc_archive(url, http_client)
-            txt_data = await convert_to_txt(filename, ext, bytes)
             full = "\n".join(txt_data)
         except Exception as e:
             fmt = "".join(traceback.format_exception(e))

 # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
 LO_CONVERSION_MUTEX = asyncio.Semaphore(1)
+# Supported file types for text extraction and their MIME type
+FORMAT_MIME_TYPES = {
+    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    ".pdf": "application/pdf",
+    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+    ".zip": ""
+}
+async def convert_file_type(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
     """
     Converts the given file bytes using Libreoffice headless to the specified file type.
     Args:
         contents: File contents
         return out_bytes
+async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
+    """
+    Convert given file represented as a (filename, ext, bytes) to a list of lines.
+    File types which require conversion for handling are converted to the appropriate format before being converted to text.
+    """
     final_text: str = None
     if ext == ".doc":
         logging.debug(f"Converting {filename} .doc --> .docx")
+        docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
         extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
         final_text = extracted_data.content
     elif ext == ".docx":
         final_text = extracted_data.content
     elif ext == ".ppt":
         logging.debug(f"Converting {filename} .ppt --> .pptx")
+        docx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
         extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
         final_text = extracted_data.content
     else:
+        if ext in FORMAT_MIME_TYPES:  # check if file extension is supported
             extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
             final_text = extracted_data.content
         else:
 async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
+    """
+    Récupère le document zippé depuis l'URL et le retourne sous forme de tuple (nom, extension, contenu).
+    Le premier document avec une extension convertible en texte est sélectionné.
+    """
     async with FTP_DOWNLOAD_RATE_LIMITER:
         async with FTP_MAX_PARALLEL_WORKERS:
             if not url.endswith("zip"):
                 raise ValueError("URL doit pointer vers un fichier ZIP")
+            # doc_id = os.path.splitext(os.path.basename(url))[0]
             resp = await client.get(url, headers={
                 "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
             })
                     file_name = entry.filename
                     root, ext = os.path.splitext(file_name)
+                    ext = ext.lower()
+                    # skip the file if it isn't supported
+                    if ext not in FORMAT_MIME_TYPES:
+                        logging.debug(
+                            f"Skipping unsupported filetype found in archive: {ext}")
+                        continue
                     doc_bytes = zf.read(file_name)
+                    return (root, ext, io.BytesIO(doc_bytes))
+            raise ValueError(
+                "No file with a supported extension type was found in the archive file")
 def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
     output.seek(0)
     return output
 # ============================================= Doc routes =========================================================
         """Attempts to convert a document to text and returns success status and content."""
         try:
             filename, ext, bytes = await get_doc_archive(item.url, http_client)
+            text_lines = await extract_text_contents(filename, ext, bytes)
             content_bytes = "\n".join(text_lines).encode("utf-8")
             return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
         except Exception as e:
     filenames = [file["filename"] for file in file_infos]
     logging.info(f"Got {len(file_infos)} user files to convert.")
+    logging.debug(f"Filenames: {filenames}")
     # convert files to text
     async def _process_single_document(item: dict):
         try:
+            text_lines = await extract_text_contents(item["filename"], item["extension"], item["content"])
             content_bytes = "\n".join(text_lines).encode("utf-8")
             return {"doc_id": item["filename"], "content": content_bytes}
         except Exception as e:
         # convert the docx to txt for use
         try:
             filename, ext, bytes = await get_doc_archive(url, http_client)
+            txt_data = await extract_text_contents(filename, ext, bytes)
             full = "\n".join(txt_data)
         except Exception as e:
             fmt = "".join(traceback.format_exception(e))