zrguo committed on
Commit
9b78d7d
·
unverified ·
2 Parent(s): 7e48475 66250ac

Merge branch 'main' into neo4j-add-min-degree

Browse files
lightrag/api/routers/document_routes.py CHANGED
@@ -16,7 +16,11 @@ from pydantic import BaseModel, Field, field_validator
16
 
17
  from lightrag import LightRAG
18
  from lightrag.base import DocProcessingStatus, DocStatus
19
- from ..utils_api import get_api_key_dependency, get_auth_dependency
 
 
 
 
20
 
21
  router = APIRouter(
22
  prefix="/documents",
@@ -240,54 +244,93 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
240
  )
241
  return False
242
  case ".pdf":
243
- if not pm.is_installed("pypdf2"): # type: ignore
244
- pm.install("pypdf2")
245
- from PyPDF2 import PdfReader # type: ignore
246
- from io import BytesIO
247
-
248
- pdf_file = BytesIO(file)
249
- reader = PdfReader(pdf_file)
250
- for page in reader.pages:
251
- content += page.extract_text() + "\n"
 
 
 
 
 
 
 
 
 
252
  case ".docx":
253
- if not pm.is_installed("python-docx"): # type: ignore
254
- pm.install("docx")
255
- from docx import Document # type: ignore
256
- from io import BytesIO
257
-
258
- docx_file = BytesIO(file)
259
- doc = Document(docx_file)
260
- content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
 
 
 
 
 
 
 
 
 
 
 
261
  case ".pptx":
262
- if not pm.is_installed("python-pptx"): # type: ignore
263
- pm.install("pptx")
264
- from pptx import Presentation # type: ignore
265
- from io import BytesIO
266
-
267
- pptx_file = BytesIO(file)
268
- prs = Presentation(pptx_file)
269
- for slide in prs.slides:
270
- for shape in slide.shapes:
271
- if hasattr(shape, "text"):
272
- content += shape.text + "\n"
 
 
 
 
 
 
 
 
 
273
  case ".xlsx":
274
- if not pm.is_installed("openpyxl"): # type: ignore
275
- pm.install("openpyxl")
276
- from openpyxl import load_workbook # type: ignore
277
- from io import BytesIO
278
-
279
- xlsx_file = BytesIO(file)
280
- wb = load_workbook(xlsx_file)
281
- for sheet in wb:
282
- content += f"Sheet: {sheet.title}\n"
283
- for row in sheet.iter_rows(values_only=True):
284
- content += (
285
- "\t".join(
286
- str(cell) if cell is not None else "" for cell in row
 
 
 
 
 
 
 
 
 
 
 
 
287
  )
288
- + "\n"
289
- )
290
- content += "\n"
291
  case _:
292
  logger.error(
293
  f"Unsupported file type: {file_path.name} (extension {ext})"
 
16
 
17
  from lightrag import LightRAG
18
  from lightrag.base import DocProcessingStatus, DocStatus
19
+ from lightrag.api.utils_api import (
20
+ get_api_key_dependency,
21
+ global_args,
22
+ get_auth_dependency,
23
+ )
24
 
25
  router = APIRouter(
26
  prefix="/documents",
 
244
  )
245
  return False
246
  case ".pdf":
247
+ if global_args["main_args"].document_loading_engine == "DOCLING":
248
+ if not pm.is_installed("docling"): # type: ignore
249
+ pm.install("docling")
250
+ from docling.document_converter import DocumentConverter
251
+
252
+ converter = DocumentConverter()
253
+ result = converter.convert(file_path)
254
+ content = result.document.export_to_markdown()
255
+ else:
256
+ if not pm.is_installed("pypdf2"): # type: ignore
257
+ pm.install("pypdf2")
258
+ from PyPDF2 import PdfReader # type: ignore
259
+ from io import BytesIO
260
+
261
+ pdf_file = BytesIO(file)
262
+ reader = PdfReader(pdf_file)
263
+ for page in reader.pages:
264
+ content += page.extract_text() + "\n"
265
  case ".docx":
266
+ if global_args["main_args"].document_loading_engine == "DOCLING":
267
+ if not pm.is_installed("docling"): # type: ignore
268
+ pm.install("docling")
269
+ from docling.document_converter import DocumentConverter
270
+
271
+ converter = DocumentConverter()
272
+ result = converter.convert(file_path)
273
+ content = result.document.export_to_markdown()
274
+ else:
275
+ if not pm.is_installed("python-docx"): # type: ignore
276
+ pm.install("docx")
277
+ from docx import Document # type: ignore
278
+ from io import BytesIO
279
+
280
+ docx_file = BytesIO(file)
281
+ doc = Document(docx_file)
282
+ content = "\n".join(
283
+ [paragraph.text for paragraph in doc.paragraphs]
284
+ )
285
  case ".pptx":
286
+ if global_args["main_args"].document_loading_engine == "DOCLING":
287
+ if not pm.is_installed("docling"): # type: ignore
288
+ pm.install("docling")
289
+ from docling.document_converter import DocumentConverter
290
+
291
+ converter = DocumentConverter()
292
+ result = converter.convert(file_path)
293
+ content = result.document.export_to_markdown()
294
+ else:
295
+ if not pm.is_installed("python-pptx"): # type: ignore
296
+ pm.install("pptx")
297
+ from pptx import Presentation # type: ignore
298
+ from io import BytesIO
299
+
300
+ pptx_file = BytesIO(file)
301
+ prs = Presentation(pptx_file)
302
+ for slide in prs.slides:
303
+ for shape in slide.shapes:
304
+ if hasattr(shape, "text"):
305
+ content += shape.text + "\n"
306
  case ".xlsx":
307
+ if global_args["main_args"].document_loading_engine == "DOCLING":
308
+ if not pm.is_installed("docling"): # type: ignore
309
+ pm.install("docling")
310
+ from docling.document_converter import DocumentConverter
311
+
312
+ converter = DocumentConverter()
313
+ result = converter.convert(file_path)
314
+ content = result.document.export_to_markdown()
315
+ else:
316
+ if not pm.is_installed("openpyxl"): # type: ignore
317
+ pm.install("openpyxl")
318
+ from openpyxl import load_workbook # type: ignore
319
+ from io import BytesIO
320
+
321
+ xlsx_file = BytesIO(file)
322
+ wb = load_workbook(xlsx_file)
323
+ for sheet in wb:
324
+ content += f"Sheet: {sheet.title}\n"
325
+ for row in sheet.iter_rows(values_only=True):
326
+ content += (
327
+ "\t".join(
328
+ str(cell) if cell is not None else ""
329
+ for cell in row
330
+ )
331
+ + "\n"
332
  )
333
+ content += "\n"
 
 
334
  case _:
335
  logger.error(
336
  f"Unsupported file type: {file_path.name} (extension {ext})"
lightrag/api/routers/ollama_api.py CHANGED
@@ -11,7 +11,7 @@ import asyncio
11
  from ascii_colors import trace_exception
12
  from lightrag import LightRAG, QueryParam
13
  from lightrag.utils import encode_string_by_tiktoken
14
- from ..utils_api import ollama_server_infos
15
 
16
 
17
  # query mode according to query prefix (bypass is not LightRAG query mode)
 
11
  from ascii_colors import trace_exception
12
  from lightrag import LightRAG, QueryParam
13
  from lightrag.utils import encode_string_by_tiktoken
14
+ from lightrag.api.utils_api import ollama_server_infos
15
 
16
 
17
  # query mode according to query prefix (bypass is not LightRAG query mode)
lightrag/api/utils_api.py CHANGED
@@ -18,6 +18,8 @@ from .auth import auth_handler
18
  # Load environment variables
19
  load_dotenv(override=True)
20
 
 
 
21
 
22
  class OllamaServerInfos:
23
  # Constants for emulated Ollama model information
@@ -365,8 +367,12 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:
365
  "ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool
366
  )
367
 
 
 
 
368
  ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name
369
 
 
370
  return args
371
 
372
 
 
18
  # Load environment variables
19
  load_dotenv(override=True)
20
 
21
+ global_args = {"main_args": None}
22
+
23
 
24
  class OllamaServerInfos:
25
  # Constants for emulated Ollama model information
 
367
  "ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool
368
  )
369
 
370
+ # Select Document loading tool (DOCLING, DEFAULT)
371
+ args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")
372
+
373
  ollama_server_infos.LIGHTRAG_MODEL = args.simulated_model_name
374
 
375
+ global_args["main_args"] = args
376
  return args
377
 
378
 
lightrag/lightrag.py CHANGED
@@ -2084,6 +2084,7 @@ class LightRAG:
2084
  cast(StorageNameSpace, storage_inst).index_done_callback()
2085
  for storage_inst in [ # type: ignore
2086
  self.entities_vdb,
 
2087
  self.chunk_entity_relation_graph,
2088
  ]
2089
  ]
 
2084
  cast(StorageNameSpace, storage_inst).index_done_callback()
2085
  for storage_inst in [ # type: ignore
2086
  self.entities_vdb,
2087
+ self.relationships_vdb,
2088
  self.chunk_entity_relation_graph,
2089
  ]
2090
  ]