Spaces:

rm-lht
/

lightrag

Configuration error

App Files Files Community

zrguo committed on Feb 1

Commit

c7cae5d

unverified ·

2 Parent(s): e8ae5e8 5f68aa6

Merge pull request #689 from ParisNeo/main

Browse files

Files changed (2) hide show

lightrag/api/lightrag_server.py +26 -79
requirements.txt +3 -5

lightrag/api/lightrag_server.py CHANGED Viewed

@@ -556,7 +556,7 @@ class DocumentManager:
     def __init__(
         self,
         input_dir: str,
-        supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"),
     ):
         self.input_dir = Path(input_dir)
         self.supported_extensions = supported_extensions
@@ -973,38 +973,14 @@ def create_app(args):
                 async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
                     content = await f.read()
-            case ".pdf":
-                if not pm.is_installed("pypdf2"):
-                    pm.install("pypdf2")
-                from PyPDF2 import PdfReader
-                # PDF handling
-                reader = PdfReader(str(file_path))
-                content = ""
-                for page in reader.pages:
-                    content += page.extract_text() + "\n"
-            case ".docx":
-                if not pm.is_installed("python-docx"):
-                    pm.install("python-docx")
-                from docx import Document
-                # Word document handling
-                doc = Document(file_path)
-                content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
-            case ".pptx":
-                if not pm.is_installed("pptx"):
-                    pm.install("pptx")
-                from pptx import Presentation  # type: ignore
-                # PowerPoint handling
-                prs = Presentation(file_path)
-                content = ""
-                for slide in prs.slides:
-                    for shape in slide.shapes:
-                        if hasattr(shape, "text"):
-                            content += shape.text + "\n"
             case _:
                 raise ValueError(f"Unsupported file format: {ext}")
@@ -1282,55 +1258,26 @@ def create_app(args):
                     text_content = await file.read()
                     content = text_content.decode("utf-8")
-                case ".pdf":
-                    if not pm.is_installed("pypdf2"):
-                        pm.install("pypdf2")
-                    from PyPDF2 import PdfReader
-                    from io import BytesIO
-                    # Read PDF from memory
-                    pdf_content = await file.read()
-                    pdf_file = BytesIO(pdf_content)
-                    reader = PdfReader(pdf_file)
-                    content = ""
-                    for page in reader.pages:
-                        content += page.extract_text() + "\n"
-                case ".docx":
-                    if not pm.is_installed("python-docx"):
-                        pm.install("python-docx")
-                    from docx import Document
-                    from io import BytesIO
-                    # Read DOCX from memory
-                    docx_content = await file.read()
-                    docx_file = BytesIO(docx_content)
-                    doc = Document(docx_file)
-                    content = "\n".join(
-                        [paragraph.text for paragraph in doc.paragraphs]
-                    )
-                case ".pptx":
-                    if not pm.is_installed("pptx"):
-                        pm.install("pptx")
-                    from pptx import Presentation  # type: ignore
-                    from io import BytesIO
-                    # Read PPTX from memory
-                    pptx_content = await file.read()
-                    pptx_file = BytesIO(pptx_content)
-                    prs = Presentation(pptx_file)
-                    content = ""
-                    for slide in prs.slides:
-                        for shape in slide.shapes:
-                            if hasattr(shape, "text"):
-                                content += shape.text + "\n"
-                case _:
-                    raise HTTPException(
-                        status_code=400,
-                        detail=f"Unsupported file type. Supported types: {doc_manager.supported_extensions}",
-                    )
             # Insert content into RAG system
             if content:

     def __init__(
         self,
         input_dir: str,
+        supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx"),
     ):
         self.input_dir = Path(input_dir)
         self.supported_extensions = supported_extensions
                 async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
                     content = await f.read()
+            case ".pdf" | ".docx" | ".pptx" | ".xlsx":
+                if not pm.is_installed("docling"):
+                    pm.install("docling")
+                from docling.document_converter import DocumentConverter
+                converter = DocumentConverter()
+                result = converter.convert(file_path)
+                content = result.document.export_to_markdown()
             case _:
                 raise ValueError(f"Unsupported file format: {ext}")
                     text_content = await file.read()
                     content = text_content.decode("utf-8")
+                case ".pdf" | ".docx" | ".pptx" | ".xlsx":
+                    if not pm.is_installed("docling"):
+                        pm.install("docling")
+                    from docling.document_converter import DocumentConverter
+                    # Create a temporary file to save the uploaded content
+                    temp_path = Path("temp") / file.filename
+                    temp_path.parent.mkdir(exist_ok=True)
+                    # Save the uploaded file
+                    with temp_path.open("wb") as f:
+                        f.write(await file.read())
+                    try:
+                        converter = DocumentConverter()
+                        result = converter.convert(str(temp_path))
+                        content = result.document.export_to_markdown()
+                    finally:
+                        # Clean up the temporary file
+                        temp_path.unlink()
             # Insert content into RAG system
             if content:

requirements.txt CHANGED Viewed

@@ -2,6 +2,9 @@ accelerate
 aiofiles
 aiohttp
 configparser
 graspologic
 # database packages
@@ -11,12 +14,7 @@ networkx
 numpy
 pipmaster
 pydantic
-# File manipulation libraries
-PyPDF2
-python-docx
 python-dotenv
-python-pptx
 setuptools
 tenacity

 aiofiles
 aiohttp
 configparser
+# File manipulation libraries
+docling
 graspologic
 # database packages
 numpy
 pipmaster
 pydantic
 python-dotenv
 setuptools
 tenacity