Lucas ARRIESSE committed on
Commit
2dc9b4d
·
1 Parent(s): d952e74

Extract first supported file type in archive

Browse files
Files changed (1) hide show
  1. api/docs.py +38 -22
api/docs.py CHANGED
@@ -41,11 +41,18 @@ KREUZBERG_CONFIG: ExtractionConfig = ExtractionConfig(
41
  # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
42
  LO_CONVERSION_MUTEX = asyncio.Semaphore(1)
43
 
 
 
 
 
 
 
 
 
44
 
45
- async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
46
  """
47
  Converts the given file bytes using Libreoffice headless to the specified file type.
48
- This is an asynchronous version.
49
 
50
  Args:
51
  contents: File contents
@@ -105,13 +112,16 @@ async def convert_file(contents: io.BytesIO, filename: str, input_ext: str, outp
105
  return out_bytes
106
 
107
 
108
- async def convert_to_txt(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
109
- """Convert given file represented as a (filename, ext, bytes) to a list of lines"""
 
 
 
110
 
111
  final_text: str = None
112
  if ext == ".doc":
113
  logging.debug(f"Converting {filename} .doc --> .docx")
114
- docx_bytes = await convert_file(bytes, filename, "doc", "docx")
115
  extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
116
  final_text = extracted_data.content
117
  elif ext == ".docx":
@@ -122,11 +132,11 @@ async def convert_to_txt(filename: str, ext: str, bytes: io.BytesIO) -> list[str
122
  final_text = extracted_data.content
123
  elif ext == ".ppt":
124
  logging.debug(f"Converting {filename} .ppt --> .pptx")
125
- docx_bytes = await convert_file(bytes, filename, "ppt", "pptx")
126
  extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
127
  final_text = extracted_data.content
128
  else:
129
- if ext in FORMAT_MIME_TYPES: # file extension is supported
130
  extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
131
  final_text = extracted_data.content
132
  else:
@@ -146,14 +156,17 @@ FTP_MAX_PARALLEL_WORKERS = asyncio.Semaphore(4)
146
 
147
 
148
  async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
149
- """Récupère le docx depuis l'URL et le retourne un tuple (nom, extension, contenu)"""
 
 
 
150
 
151
  async with FTP_DOWNLOAD_RATE_LIMITER:
152
  async with FTP_MAX_PARALLEL_WORKERS:
153
  if not url.endswith("zip"):
154
  raise ValueError("URL doit pointer vers un fichier ZIP")
155
 
156
- doc_id = os.path.splitext(os.path.basename(url))[0]
157
  resp = await client.get(url, headers={
158
  "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
159
  })
@@ -168,10 +181,20 @@ async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.B
168
 
169
  file_name = entry.filename
170
  root, ext = os.path.splitext(file_name)
 
 
 
 
 
 
 
 
171
  doc_bytes = zf.read(file_name)
172
- return (root, ext.lower(), io.BytesIO(doc_bytes))
173
 
174
- raise ValueError("Aucun fichier trouvé dans l'archive")
 
 
 
175
 
176
 
177
  def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
@@ -235,13 +258,6 @@ def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
235
  output.seek(0)
236
  return output
237
 
238
-
239
- FORMAT_MIME_TYPES = {
240
- ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
241
- ".pdf": "application/pdf",
242
- ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation"
243
- }
244
-
245
  # ============================================= Doc routes =========================================================
246
 
247
 
@@ -356,7 +372,7 @@ async def download_docs(req: DownloadDocsRequest, http_client: AsyncClient = Dep
356
  """Attempts to convert a document to text and returns success status and content."""
357
  try:
358
  filename, ext, bytes = await get_doc_archive(item.url, http_client)
359
- text_lines = await convert_to_txt(filename, ext, bytes)
360
  content_bytes = "\n".join(text_lines).encode("utf-8")
361
  return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
362
  except Exception as e:
@@ -410,12 +426,12 @@ async def download_user_docs(files: list[UploadFile] = File(...)):
410
 
411
  filenames = [file["filename"] for file in file_infos]
412
  logging.info(f"Got {len(file_infos)} user files to convert.")
413
- logging.info(f"Filenames: {filenames}")
414
 
415
  # convert files to text
416
  async def _process_single_document(item: dict):
417
  try:
418
- text_lines = await convert_to_txt(item["filename"], item["extension"], item["content"])
419
  content_bytes = "\n".join(text_lines).encode("utf-8")
420
  return {"doc_id": item["filename"], "content": content_bytes}
421
  except Exception as e:
@@ -478,7 +494,7 @@ async def extract_requirements_from_docs(req: ExtractRequirementsRequest, llm_ro
478
  # convert the docx to txt for use
479
  try:
480
  filename, ext, bytes = await get_doc_archive(url, http_client)
481
- txt_data = await convert_to_txt(filename, ext, bytes)
482
  full = "\n".join(txt_data)
483
  except Exception as e:
484
  fmt = "".join(traceback.format_exception(e))
 
41
  # Unfortunately needs to be kept to 1, as libreoffice isn't built to support parallel instances
42
  LO_CONVERSION_MUTEX = asyncio.Semaphore(1)
43
 
44
+ # Supported file types for text extraction and their MIME type
45
+ FORMAT_MIME_TYPES = {
46
+ ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
47
+ ".pdf": "application/pdf",
48
+ ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
49
+ ".zip": ""
50
+ }
51
+
52
 
53
+ async def convert_file_type(contents: io.BytesIO, filename: str, input_ext: str, output_ext: str, filter: str = None) -> io.BytesIO:
54
  """
55
  Converts the given file bytes using Libreoffice headless to the specified file type.
 
56
 
57
  Args:
58
  contents: File contents
 
112
  return out_bytes
113
 
114
 
115
+ async def extract_text_contents(filename: str, ext: str, bytes: io.BytesIO) -> list[str]:
116
+ """
117
+ Convert given file represented as a (filename, ext, bytes) to a list of lines.
118
+ File types which require conversion for handling are converted to the appropriate format before being converted to text.
119
+ """
120
 
121
  final_text: str = None
122
  if ext == ".doc":
123
  logging.debug(f"Converting {filename} .doc --> .docx")
124
+ docx_bytes = await convert_file_type(bytes, filename, "doc", "docx")
125
  extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".docx"], config=KREUZBERG_CONFIG)
126
  final_text = extracted_data.content
127
  elif ext == ".docx":
 
132
  final_text = extracted_data.content
133
  elif ext == ".ppt":
134
  logging.debug(f"Converting {filename} .ppt --> .pptx")
135
+ docx_bytes = await convert_file_type(bytes, filename, "ppt", "pptx")
136
  extracted_data = await extract_bytes(docx_bytes.read(), FORMAT_MIME_TYPES[".pptx"], config=KREUZBERG_CONFIG)
137
  final_text = extracted_data.content
138
  else:
139
+ if ext in FORMAT_MIME_TYPES: # check if file extension is supported
140
  extracted_data = await extract_bytes(bytes.read(), FORMAT_MIME_TYPES[ext], config=KREUZBERG_CONFIG)
141
  final_text = extracted_data.content
142
  else:
 
156
 
157
 
158
  async def get_doc_archive(url: str, client: AsyncClient) -> tuple[str, str, io.BytesIO]:
159
+ """
160
+ Récupère le document zippé depuis l'URL et le retourne sous forme de tuple (nom, extension, contenu).
161
+ Le premier document avec une extension convertible en texte est sélectionné.
162
+ """
163
 
164
  async with FTP_DOWNLOAD_RATE_LIMITER:
165
  async with FTP_MAX_PARALLEL_WORKERS:
166
  if not url.endswith("zip"):
167
  raise ValueError("URL doit pointer vers un fichier ZIP")
168
 
169
+ # doc_id = os.path.splitext(os.path.basename(url))[0]
170
  resp = await client.get(url, headers={
171
  "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
172
  })
 
181
 
182
  file_name = entry.filename
183
  root, ext = os.path.splitext(file_name)
184
+ ext = ext.lower()
185
+
186
+ # skip the file if it isn't supported
187
+ if ext not in FORMAT_MIME_TYPES:
188
+ logging.debug(
189
+ f"Skipping unsupported filetype found in archive: {ext}")
190
+ continue
191
+
192
  doc_bytes = zf.read(file_name)
 
193
 
194
+ return (root, ext, io.BytesIO(doc_bytes))
195
+
196
+ raise ValueError(
197
+ "No file with a supported extension type was found in the archive file")
198
 
199
 
200
  def apply_docx_revisions(docx_zip: zipfile.ZipFile) -> io.BytesIO:
 
258
  output.seek(0)
259
  return output
260
 
 
 
 
 
 
 
 
261
  # ============================================= Doc routes =========================================================
262
 
263
 
 
372
  """Attempts to convert a document to text and returns success status and content."""
373
  try:
374
  filename, ext, bytes = await get_doc_archive(item.url, http_client)
375
+ text_lines = await extract_text_contents(filename, ext, bytes)
376
  content_bytes = "\n".join(text_lines).encode("utf-8")
377
  return {"doc_id": item.document, "content": content_bytes, "agenda_item": item.agenda_item}
378
  except Exception as e:
 
426
 
427
  filenames = [file["filename"] for file in file_infos]
428
  logging.info(f"Got {len(file_infos)} user files to convert.")
429
+ logging.debug(f"Filenames: {filenames}")
430
 
431
  # convert files to text
432
  async def _process_single_document(item: dict):
433
  try:
434
+ text_lines = await extract_text_contents(item["filename"], item["extension"], item["content"])
435
  content_bytes = "\n".join(text_lines).encode("utf-8")
436
  return {"doc_id": item["filename"], "content": content_bytes}
437
  except Exception as e:
 
494
  # convert the docx to txt for use
495
  try:
496
  filename, ext, bytes = await get_doc_archive(url, http_client)
497
+ txt_data = await extract_text_contents(filename, ext, bytes)
498
  full = "\n".join(txt_data)
499
  except Exception as e:
500
  fmt = "".join(traceback.format_exception(e))