Merge pull request #689 from ParisNeo/main
Browse files
- lightrag/api/lightrag_server.py +26 -79
- requirements.txt +3 -5
lightrag/api/lightrag_server.py
CHANGED
@@ -556,7 +556,7 @@ class DocumentManager:
|
|
556 |
def __init__(
|
557 |
self,
|
558 |
input_dir: str,
|
559 |
-
supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx"),
|
560 |
):
|
561 |
self.input_dir = Path(input_dir)
|
562 |
self.supported_extensions = supported_extensions
|
@@ -973,38 +973,14 @@ def create_app(args):
|
|
973 |
async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
|
974 |
content = await f.read()
|
975 |
|
976 |
-
case ".pdf":
|
977 |
-
if not pm.is_installed("
|
978 |
-
pm.install("
|
979 |
-
from
|
980 |
-
|
981 |
-
|
982 |
-
|
983 |
-
content =
|
984 |
-
for page in reader.pages:
|
985 |
-
content += page.extract_text() + "\n"
|
986 |
-
|
987 |
-
case ".docx":
|
988 |
-
if not pm.is_installed("python-docx"):
|
989 |
-
pm.install("python-docx")
|
990 |
-
from docx import Document
|
991 |
-
|
992 |
-
# Word document handling
|
993 |
-
doc = Document(file_path)
|
994 |
-
content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
|
995 |
-
|
996 |
-
case ".pptx":
|
997 |
-
if not pm.is_installed("pptx"):
|
998 |
-
pm.install("pptx")
|
999 |
-
from pptx import Presentation # type: ignore
|
1000 |
-
|
1001 |
-
# PowerPoint handling
|
1002 |
-
prs = Presentation(file_path)
|
1003 |
-
content = ""
|
1004 |
-
for slide in prs.slides:
|
1005 |
-
for shape in slide.shapes:
|
1006 |
-
if hasattr(shape, "text"):
|
1007 |
-
content += shape.text + "\n"
|
1008 |
|
1009 |
case _:
|
1010 |
raise ValueError(f"Unsupported file format: {ext}")
|
@@ -1282,55 +1258,26 @@ def create_app(args):
|
|
1282 |
text_content = await file.read()
|
1283 |
content = text_content.decode("utf-8")
|
1284 |
|
1285 |
-
case ".pdf":
|
1286 |
-
if not pm.is_installed("
|
1287 |
-
pm.install("
|
1288 |
-
from
|
1289 |
-
from io import BytesIO
|
1290 |
|
1291 |
-
#
|
1292 |
-
|
1293 |
-
|
1294 |
-
reader = PdfReader(pdf_file)
|
1295 |
-
content = ""
|
1296 |
-
for page in reader.pages:
|
1297 |
-
content += page.extract_text() + "\n"
|
1298 |
-
|
1299 |
-
case ".docx":
|
1300 |
-
if not pm.is_installed("python-docx"):
|
1301 |
-
pm.install("python-docx")
|
1302 |
-
from docx import Document
|
1303 |
-
from io import BytesIO
|
1304 |
-
|
1305 |
-
# Read DOCX from memory
|
1306 |
-
docx_content = await file.read()
|
1307 |
-
docx_file = BytesIO(docx_content)
|
1308 |
-
doc = Document(docx_file)
|
1309 |
-
content = "\n".join(
|
1310 |
-
[paragraph.text for paragraph in doc.paragraphs]
|
1311 |
-
)
|
1312 |
|
1313 |
-
|
1314 |
-
|
1315 |
-
|
1316 |
-
from pptx import Presentation # type: ignore
|
1317 |
-
from io import BytesIO
|
1318 |
|
1319 |
-
|
1320 |
-
|
1321 |
-
|
1322 |
-
|
1323 |
-
|
1324 |
-
|
1325 |
-
|
1326 |
-
if hasattr(shape, "text"):
|
1327 |
-
content += shape.text + "\n"
|
1328 |
-
|
1329 |
-
case _:
|
1330 |
-
raise HTTPException(
|
1331 |
-
status_code=400,
|
1332 |
-
detail=f"Unsupported file type. Supported types: {doc_manager.supported_extensions}",
|
1333 |
-
)
|
1334 |
|
1335 |
# Insert content into RAG system
|
1336 |
if content:
|
|
|
556 |
def __init__(
|
557 |
self,
|
558 |
input_dir: str,
|
559 |
+
supported_extensions: tuple = (".txt", ".md", ".pdf", ".docx", ".pptx", ".xlsx"),
|
560 |
):
|
561 |
self.input_dir = Path(input_dir)
|
562 |
self.supported_extensions = supported_extensions
|
|
|
973 |
async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
|
974 |
content = await f.read()
|
975 |
|
976 |
+
case ".pdf" | ".docx" | ".pptx" | ".xlsx":
|
977 |
+
if not pm.is_installed("docling"):
|
978 |
+
pm.install("docling")
|
979 |
+
from docling.document_converter import DocumentConverter
|
980 |
+
|
981 |
+
converter = DocumentConverter()
|
982 |
+
result = converter.convert(file_path)
|
983 |
+
content = result.document.export_to_markdown()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
984 |
|
985 |
case _:
|
986 |
raise ValueError(f"Unsupported file format: {ext}")
|
|
|
1258 |
text_content = await file.read()
|
1259 |
content = text_content.decode("utf-8")
|
1260 |
|
1261 |
+
case ".pdf" | ".docx" | ".pptx" | ".xlsx":
|
1262 |
+
if not pm.is_installed("docling"):
|
1263 |
+
pm.install("docling")
|
1264 |
+
from docling.document_converter import DocumentConverter
|
|
|
1265 |
|
1266 |
+
# Create a temporary file to save the uploaded content
|
1267 |
+
temp_path = Path("temp") / file.filename
|
1268 |
+
temp_path.parent.mkdir(exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1269 |
|
1270 |
+
# Save the uploaded file
|
1271 |
+
with temp_path.open("wb") as f:
|
1272 |
+
f.write(await file.read())
|
|
|
|
|
1273 |
|
1274 |
+
try:
|
1275 |
+
converter = DocumentConverter()
|
1276 |
+
result = converter.convert(str(temp_path))
|
1277 |
+
content = result.document.export_to_markdown()
|
1278 |
+
finally:
|
1279 |
+
# Clean up the temporary file
|
1280 |
+
temp_path.unlink()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1281 |
|
1282 |
# Insert content into RAG system
|
1283 |
if content:
|
requirements.txt
CHANGED
@@ -2,6 +2,9 @@ accelerate
|
|
2 |
aiofiles
|
3 |
aiohttp
|
4 |
configparser
|
|
|
|
|
|
|
5 |
graspologic
|
6 |
|
7 |
# database packages
|
@@ -11,12 +14,7 @@ networkx
|
|
11 |
numpy
|
12 |
pipmaster
|
13 |
pydantic
|
14 |
-
|
15 |
-
# File manipulation libraries
|
16 |
-
PyPDF2
|
17 |
-
python-docx
|
18 |
python-dotenv
|
19 |
-
python-pptx
|
20 |
|
21 |
setuptools
|
22 |
tenacity
|
|
|
2 |
aiofiles
|
3 |
aiohttp
|
4 |
configparser
|
5 |
+
|
6 |
+
# File manipulation libraries
|
7 |
+
docling
|
8 |
graspologic
|
9 |
|
10 |
# database packages
|
|
|
14 |
numpy
|
15 |
pipmaster
|
16 |
pydantic
|
|
|
|
|
|
|
|
|
17 |
python-dotenv
|
|
|
18 |
|
19 |
setuptools
|
20 |
tenacity
|