Merge branch 'main' into neo4j-add-min-degree
Browse files
lightrag/api/routers/document_routes.py
CHANGED
@@ -16,7 +16,11 @@ from pydantic import BaseModel, Field, field_validator
|
|
16 |
|
17 |
from lightrag import LightRAG
|
18 |
from lightrag.base import DocProcessingStatus, DocStatus
|
19 |
-
from
|
|
|
|
|
|
|
|
|
20 |
|
21 |
router = APIRouter(
|
22 |
prefix="/documents",
|
@@ -240,54 +244,93 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
|
|
240 |
)
|
241 |
return False
|
242 |
case ".pdf":
|
243 |
-
if
|
244 |
-
pm.
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
252 |
case ".docx":
|
253 |
-
if
|
254 |
-
pm.
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
case ".pptx":
|
262 |
-
if
|
263 |
-
pm.
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
case ".xlsx":
|
274 |
-
if
|
275 |
-
pm.
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
)
|
288 |
-
|
289 |
-
)
|
290 |
-
content += "\n"
|
291 |
case _:
|
292 |
logger.error(
|
293 |
f"Unsupported file type: {file_path.name} (extension {ext})"
|
|
|
16 |
|
17 |
from lightrag import LightRAG
|
18 |
from lightrag.base import DocProcessingStatus, DocStatus
|
19 |
+
from lightrag.api.utils_api import (
|
20 |
+
get_api_key_dependency,
|
21 |
+
global_args,
|
22 |
+
get_auth_dependency,
|
23 |
+
)
|
24 |
|
25 |
router = APIRouter(
|
26 |
prefix="/documents",
|
|
|
244 |
)
|
245 |
return False
|
246 |
case ".pdf":
|
247 |
+
if global_args["main_args"].document_loading_engine == "DOCLING":
|
248 |
+
if not pm.is_installed("docling"): # type: ignore
|
249 |
+
pm.install("docling")
|
250 |
+
from docling.document_converter import DocumentConverter
|
251 |
+
|
252 |
+
converter = DocumentConverter()
|
253 |
+
result = converter.convert(file_path)
|
254 |
+
content = result.document.export_to_markdown()
|
255 |
+
else:
|
256 |
+
if not pm.is_installed("pypdf2"): # type: ignore
|
257 |
+
pm.install("pypdf2")
|
258 |
+
from PyPDF2 import PdfReader # type: ignore
|
259 |
+
from io import BytesIO
|
260 |
+
|
261 |
+
pdf_file = BytesIO(file)
|
262 |
+
reader = PdfReader(pdf_file)
|
263 |
+
for page in reader.pages:
|
264 |
+
content += page.extract_text() + "\n"
|
265 |
case ".docx":
|
266 |
+
if global_args["main_args"].document_loading_engine == "DOCLING":
|
267 |
+
if not pm.is_installed("docling"): # type: ignore
|
268 |
+
pm.install("docling")
|
269 |
+
from docling.document_converter import DocumentConverter
|
270 |
+
|
271 |
+
converter = DocumentConverter()
|
272 |
+
result = converter.convert(file_path)
|
273 |
+
content = result.document.export_to_markdown()
|
274 |
+
else:
|
275 |
+
if not pm.is_installed("python-docx"): # type: ignore
|
276 |
+
pm.install("docx")
|
277 |
+
from docx import Document # type: ignore
|
278 |
+
from io import BytesIO
|
279 |
+
|
280 |
+
docx_file = BytesIO(file)
|
281 |
+
doc = Document(docx_file)
|
282 |
+
content = "\n".join(
|
283 |
+
[paragraph.text for paragraph in doc.paragraphs]
|
284 |
+
)
|
285 |
case ".pptx":
|
286 |
+
if global_args["main_args"].document_loading_engine == "DOCLING":
|
287 |
+
if not pm.is_installed("docling"): # type: ignore
|
288 |
+
pm.install("docling")
|
289 |
+
from docling.document_converter import DocumentConverter
|
290 |
+
|
291 |
+
converter = DocumentConverter()
|
292 |
+
result = converter.convert(file_path)
|
293 |
+
content = result.document.export_to_markdown()
|
294 |
+
else:
|
295 |
+
if not pm.is_installed("python-pptx"): # type: ignore
|
296 |
+
pm.install("pptx")
|
297 |
+
from pptx import Presentation # type: ignore
|
298 |
+
from io import BytesIO
|
299 |
+
|
300 |
+
pptx_file = BytesIO(file)
|
301 |
+
prs = Presentation(pptx_file)
|
302 |
+
for slide in prs.slides:
|
303 |
+
for shape in slide.shapes:
|
304 |
+
if hasattr(shape, "text"):
|
305 |
+
content += shape.text + "\n"
|
306 |
case ".xlsx":
|
307 |
+
if global_args["main_args"].document_loading_engine == "DOCLING":
|
308 |
+
if not pm.is_installed("docling"): # type: ignore
|
309 |
+
pm.install("docling")
|
310 |
+
from docling.document_converter import DocumentConverter
|
311 |
+
|
312 |
+
converter = DocumentConverter()
|
313 |
+
result = converter.convert(file_path)
|
314 |
+
content = result.document.export_to_markdown()
|
315 |
+
else:
|
316 |
+
if not pm.is_installed("openpyxl"): # type: ignore
|
317 |
+
pm.install("openpyxl")
|
318 |
+
from openpyxl import load_workbook # type: ignore
|
319 |
+
from io import BytesIO
|
320 |
+
|
321 |
+
xlsx_file = BytesIO(file)
|
322 |
+
wb = load_workbook(xlsx_file)
|
323 |
+
for sheet in wb:
|
324 |
+
content += f"Sheet: {sheet.title}\n"
|
325 |
+
for row in sheet.iter_rows(values_only=True):
|
326 |
+
content += (
|
327 |
+
"\t".join(
|
328 |
+
str(cell) if cell is not None else ""
|
329 |
+
for cell in row
|
330 |
+
)
|
331 |
+
+ "\n"
|
332 |
)
|
333 |
+
content += "\n"
|
|
|
|
|
334 |
case _:
|
335 |
logger.error(
|
336 |
f"Unsupported file type: {file_path.name} (extension {ext})"
|
lightrag/api/routers/ollama_api.py
CHANGED
@@ -11,7 +11,7 @@ import asyncio
|
|
11 |
from ascii_colors import trace_exception
|
12 |
from lightrag import LightRAG, QueryParam
|
13 |
from lightrag.utils import encode_string_by_tiktoken
|
14 |
-
from
|
15 |
|
16 |
|
17 |
# query mode according to query prefix (bypass is not a LightRAG query mode)
|
|
|
11 |
from ascii_colors import trace_exception
|
12 |
from lightrag import LightRAG, QueryParam
|
13 |
from lightrag.utils import encode_string_by_tiktoken
|
14 |
+
from lightrag.api.utils_api import ollama_server_infos
|
15 |
|
16 |
|
17 |
# query mode according to query prefix (bypass is not a LightRAG query mode)
|
lightrag/api/utils_api.py
CHANGED
@@ -18,6 +18,8 @@ from .auth import auth_handler
|
|
18 |
# Load environment variables
|
19 |
load_dotenv(override=True)
|
20 |
|
|
|
|
|
21 |
|
22 |
class OllamaServerInfos:
|
23 |
# Constants for emulated Ollama model information
|
@@ -365,8 +367,12 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:
|
|
365 |
"ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool
|
366 |
)
|
367 |
|
|
|
|
|
|
|
368 |
ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name
|
369 |
|
|
|
370 |
return args
|
371 |
|
372 |
|
|
|
18 |
# Load environment variables
|
19 |
load_dotenv(override=True)
|
20 |
|
21 |
+
global_args = {"main_args": None}
|
22 |
+
|
23 |
|
24 |
class OllamaServerInfos:
|
25 |
# Constants for emulated Ollama model information
|
|
|
367 |
"ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool
|
368 |
)
|
369 |
|
370 |
+
# Select Document loading tool (DOCLING, DEFAULT)
|
371 |
+
args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
|
372 |
+
|
373 |
ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name
|
374 |
|
375 |
+
global_args["main_args"] = args
|
376 |
return args
|
377 |
|
378 |
|
lightrag/lightrag.py
CHANGED
@@ -2084,6 +2084,7 @@ class LightRAG:
|
|
2084 |
cast(StorageNameSpace, storage_inst).index_done_callback()
|
2085 |
for storage_inst in [ # type: ignore
|
2086 |
self.entities_vdb,
|
|
|
2087 |
self.chunk_entity_relation_graph,
|
2088 |
]
|
2089 |
]
|
|
|
2084 |
cast(StorageNameSpace, storage_inst).index_done_callback()
|
2085 |
for storage_inst in [ # type: ignore
|
2086 |
self.entities_vdb,
|
2087 |
+
self.relationships_vdb,
|
2088 |
self.chunk_entity_relation_graph,
|
2089 |
]
|
2090 |
]
|