zrguo committed on
Commit
c7cae5d
·
unverified ·
2 Parent(s): e8ae5e8 5f68aa6

Merge pull request #689 from ParisNeo/main

Browse files
Files changed (2) hide show
  1. lightrag/api/lightrag_server.py +26 -79
  2. requirements.txt +3 -5
lightrag/api/lightrag_server.py CHANGED
@@ -556,7 +556,7 @@ class DocumentManager:
556
  def __init__(
557
  self,
558
  input_dir: str,
559
- supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"),
560
  ):
561
  self.input_dir = Path(input_dir)
562
  self.supported_extensions = supported_extensions
@@ -973,38 +973,14 @@ def create_app(args):
973
  async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
974
  content = await f.read()
975
 
976
- case ".pdf":
977
- if not pm.is_installed("pypdf2"):
978
- pm.install("pypdf2")
979
- from PyPDF2 import PdfReader
980
-
981
- # PDF handling
982
- reader = PdfReader(str(file_path))
983
- content = ""
984
- for page in reader.pages:
985
- content += page.extract_text() + "\n"
986
-
987
- case ".docx":
988
- if not pm.is_installed("python-docx"):
989
- pm.install("python-docx")
990
- from docx import Document
991
-
992
- # Word document handling
993
- doc = Document(file_path)
994
- content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
995
-
996
- case ".pptx":
997
- if not pm.is_installed("pptx"):
998
- pm.install("pptx")
999
- from pptx import Presentation # type: ignore
1000
-
1001
- # PowerPoint handling
1002
- prs = Presentation(file_path)
1003
- content = ""
1004
- for slide in prs.slides:
1005
- for shape in slide.shapes:
1006
- if hasattr(shape, "text"):
1007
- content += shape.text + "\n"
1008
 
1009
  case _:
1010
  raise ValueError(f"Unsupported file format: {ext}")
@@ -1282,55 +1258,26 @@ def create_app(args):
1282
  text_content = await file.read()
1283
  content = text_content.decode("utf-8")
1284
 
1285
- case ".pdf":
1286
- if not pm.is_installed("pypdf2"):
1287
- pm.install("pypdf2")
1288
- from PyPDF2 import PdfReader
1289
- from io import BytesIO
1290
 
1291
- # Read PDF from memory
1292
- pdf_content = await file.read()
1293
- pdf_file = BytesIO(pdf_content)
1294
- reader = PdfReader(pdf_file)
1295
- content = ""
1296
- for page in reader.pages:
1297
- content += page.extract_text() + "\n"
1298
-
1299
- case ".docx":
1300
- if not pm.is_installed("python-docx"):
1301
- pm.install("python-docx")
1302
- from docx import Document
1303
- from io import BytesIO
1304
-
1305
- # Read DOCX from memory
1306
- docx_content = await file.read()
1307
- docx_file = BytesIO(docx_content)
1308
- doc = Document(docx_file)
1309
- content = "\n".join(
1310
- [paragraph.text for paragraph in doc.paragraphs]
1311
- )
1312
 
1313
- case ".pptx":
1314
- if not pm.is_installed("pptx"):
1315
- pm.install("pptx")
1316
- from pptx import Presentation # type: ignore
1317
- from io import BytesIO
1318
 
1319
- # Read PPTX from memory
1320
- pptx_content = await file.read()
1321
- pptx_file = BytesIO(pptx_content)
1322
- prs = Presentation(pptx_file)
1323
- content = ""
1324
- for slide in prs.slides:
1325
- for shape in slide.shapes:
1326
- if hasattr(shape, "text"):
1327
- content += shape.text + "\n"
1328
-
1329
- case _:
1330
- raise HTTPException(
1331
- status_code=400,
1332
- detail=f"Unsupported file type. Supported types: {doc_manager.supported_extensions}",
1333
- )
1334
 
1335
  # Insert content into RAG system
1336
  if content:
 
556
  def __init__(
557
  self,
558
  input_dir: str,
559
+ supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx", "xlsx"),
560
  ):
561
  self.input_dir = Path(input_dir)
562
  self.supported_extensions = supported_extensions
 
973
  async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
974
  content = await f.read()
975
 
976
+ case ".pdf" | ".docx" | ".pptx" | ".xlsx":
977
+ if not pm.is_installed("docling"):
978
+ pm.install("docling")
979
+ from docling.document_converter import DocumentConverter
980
+
981
+ converter = DocumentConverter()
982
+ result = converter.convert(file_path)
983
+ content = result.document.export_to_markdown()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
984
 
985
  case _:
986
  raise ValueError(f"Unsupported file format: {ext}")
 
1258
  text_content = await file.read()
1259
  content = text_content.decode("utf-8")
1260
 
1261
+ case ".pdf" | ".docx" | ".pptx" | ".xlsx":
1262
+ if not pm.is_installed("docling"):
1263
+ pm.install("docling")
1264
+ from docling.document_converter import DocumentConverter
 
1265
 
1266
+ # Create a temporary file to save the uploaded content
1267
+ temp_path = Path("temp") / file.filename
1268
+ temp_path.parent.mkdir(exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1269
 
1270
+ # Save the uploaded file
1271
+ with temp_path.open("wb") as f:
1272
+ f.write(await file.read())
 
 
1273
 
1274
+ try:
1275
+ converter = DocumentConverter()
1276
+ result = converter.convert(str(temp_path))
1277
+ content = result.document.export_to_markdown()
1278
+ finally:
1279
+ # Clean up the temporary file
1280
+ temp_path.unlink()
 
 
 
 
 
 
 
 
1281
 
1282
  # Insert content into RAG system
1283
  if content:
requirements.txt CHANGED
@@ -2,6 +2,9 @@ accelerate
2
  aiofiles
3
  aiohttp
4
  configparser
 
 
 
5
  graspologic
6
 
7
  # database packages
@@ -11,12 +14,7 @@ networkx
11
  numpy
12
  pipmaster
13
  pydantic
14
-
15
- # File manipulation libraries
16
- PyPDF2
17
- python-docx
18
  python-dotenv
19
- python-pptx
20
 
21
  setuptools
22
  tenacity
 
2
  aiofiles
3
  aiohttp
4
  configparser
5
+
6
+ # File manipulation libraries
7
+ docling
8
  graspologic
9
 
10
  # database packages
 
14
  numpy
15
  pipmaster
16
  pydantic
 
 
 
 
17
  python-dotenv
 
18
 
19
  setuptools
20
  tenacity