Merge pull request #1145 from danielaskdd/optimize-doc-scan
Optimize parallel file processing in folder scan and fix deadlock problem in Gunicorn multi-worker mode
- lightrag/api/auth.py +3 -0
- lightrag/api/gunicorn_config.py +3 -1
- lightrag/api/lightrag_server.py +1 -1
- lightrag/api/routers/document_routes.py +31 -11
- lightrag/api/run_with_gunicorn.py +2 -2
- lightrag/api/utils_api.py +12 -7
- lightrag/kg/shared_storage.py +93 -2
lightrag/api/auth.py

@@ -3,6 +3,9 @@ from datetime import datetime, timedelta
 import jwt
 from fastapi import HTTPException, status
 from pydantic import BaseModel
+from dotenv import load_dotenv
+
+load_dotenv()


 class TokenPayload(BaseModel):
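Note: load_dotenv() with no arguments searches for a .env file automatically and, by default, does not override variables already set in the environment. A minimal sketch of what the new module-level call provides (the TOKEN_SECRET name is illustrative, not taken from this diff):

import os
from dotenv import load_dotenv

load_dotenv()  # pick up the per-folder .env before reading settings
token_secret = os.getenv("TOKEN_SECRET", "change-me")  # hypothetical variable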
lightrag/api/gunicorn_config.py

@@ -29,7 +29,9 @@ preload_app = True
 worker_class = "uvicorn.workers.UvicornWorker"

 # Other Gunicorn configurations
-timeout = int(os.getenv("TIMEOUT", 150))
+timeout = int(
+    os.getenv("TIMEOUT", 150 * 2)
+)  # Default 150s *2 to match run_with_gunicorn.py
 keepalive = int(os.getenv("KEEPALIVE", 5))  # Default 5s

 # Logging configuration
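For reference, a hypothetical .env exercising these settings (the variable names come from the diff; the values are only examples):

# .env (illustrative)
TIMEOUT=300     # Gunicorn worker timeout in seconds; defaults to 150 * 2 when unset
KEEPALIVE=5     # seconds to keep idle keep-alive connections open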
lightrag/api/lightrag_server.py

@@ -49,7 +49,7 @@ from .auth import auth_handler
 # Load environment variables
 # Updated to use the .env that is inside the current folder
 # This update allows the user to put a different.env file for each lightrag folder
-load_dotenv(...)
+load_dotenv()

 # Initialize config parser
 config = configparser.ConfigParser()
lightrag/api/routers/document_routes.py

@@ -405,7 +405,7 @@ async def pipeline_index_file(rag: LightRAG, file_path: Path):


 async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
-    """Index multiple files
+    """Index multiple files sequentially to avoid high CPU load

     Args:
         rag: LightRAG instance

@@ -416,12 +416,12 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
     try:
         enqueued = False

-        # Create enqueue tasks for all files
-        tasks = []
-        for file_path in file_paths:
-            tasks.append(pipeline_enqueue_file(rag, file_path))
-        enqueued = any(await asyncio.gather(*tasks))
+        # Process files sequentially
+        for file_path in file_paths:
+            if await pipeline_enqueue_file(rag, file_path):
+                enqueued = True

+        # Process the queue only if at least one file was successfully enqueued
         if enqueued:
             await rag.apipeline_process_enqueue_documents()
     except Exception as e:

@@ -472,14 +472,34 @@ async def run_scanning_process(rag: LightRAG, doc_manager: DocumentManager):
         total_files = len(new_files)
         logger.info(f"Found {total_files} new files to index.")

-        for file_path in new_files:
-            try:
-                await pipeline_index_file(rag, file_path)
-            except Exception as e:
-                logger.error(f"Error indexing file {file_path}: {str(e)}")
+        if not new_files:
+            return
+
+        # Get MAX_PARALLEL_INSERT from global_args
+        max_parallel = global_args["max_parallel_insert"]
+        # Calculate batch size as 2 * MAX_PARALLEL_INSERT
+        batch_size = 2 * max_parallel
+
+        # Process files in batches
+        for i in range(0, total_files, batch_size):
+            batch_files = new_files[i : i + batch_size]
+            batch_num = i // batch_size + 1
+            total_batches = (total_files + batch_size - 1) // batch_size
+
+            logger.info(
+                f"Processing batch {batch_num}/{total_batches} with {len(batch_files)} files"
+            )
+            await pipeline_index_files(rag, batch_files)
+
+            # Log progress
+            processed = min(i + batch_size, total_files)
+            logger.info(
+                f"Processed {processed}/{total_files} files ({processed/total_files*100:.1f}%)"
+            )

     except Exception as e:
         logger.error(f"Error during scanning process: {str(e)}")
+        logger.error(traceback.format_exc())


 def create_document_routes(
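The batch arithmetic in run_scanning_process is plain ceiling division; a standalone sketch with illustrative numbers:

# Sketch of the batching math above: with the default MAX_PARALLEL_INSERT=2,
# the scanner hands pipeline_index_files batches of 2 * 2 = 4 files.
new_files = [f"doc_{n}.txt" for n in range(10)]  # illustrative file list
total_files = len(new_files)
max_parallel = 2                   # MAX_PARALLEL_INSERT default
batch_size = 2 * max_parallel      # -> 4

total_batches = (total_files + batch_size - 1) // batch_size  # ceil(10 / 4) = 3
for i in range(0, total_files, batch_size):
    batch = new_files[i : i + batch_size]
    print(f"batch {i // batch_size + 1}/{total_batches}: {len(batch)} files")
# batch 1/3: 4 files, batch 2/3: 4 files, batch 3/3: 2 files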
lightrag/api/run_with_gunicorn.py

@@ -13,7 +13,7 @@ from dotenv import load_dotenv

 # Updated to use the .env that is inside the current folder
 # This update allows the user to put a different.env file for each lightrag folder
-load_dotenv(...)
+load_dotenv()


 def check_and_install_dependencies():

@@ -140,7 +140,7 @@ def main():

     # Timeout configuration prioritizes command line arguments
     gunicorn_config.timeout = (
-        args.timeout if args.timeout else int(os.getenv("TIMEOUT", 150))
+        args.timeout * 2 if args.timeout else int(os.getenv("TIMEOUT", 150 * 2))
     )

     # Keepalive configuration
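Taken in isolation, the timeout precedence now works as below (a sketch; effective_timeout is a made-up helper name, and the doubling of the CLI value mirrors the "150s *2" comment in gunicorn_config.py):

import os

def effective_timeout(cli_timeout):
    # CLI argument takes priority; otherwise fall back to the TIMEOUT env var,
    # which itself defaults to 150 * 2 seconds.
    return cli_timeout * 2 if cli_timeout else int(os.getenv("TIMEOUT", 150 * 2))

os.environ.pop("TIMEOUT", None)  # demo against a clean environment
print(effective_timeout(None))   # 300 (env default)
print(effective_timeout(100))    # 200 (CLI value wins)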
lightrag/api/utils_api.py

@@ -16,7 +16,7 @@ from starlette.status import HTTP_403_FORBIDDEN
 from .auth import auth_handler

 # Load environment variables
-load_dotenv(...)
+load_dotenv()

 global_args = {"main_args": None}

@@ -365,6 +365,9 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:
         "LIGHTRAG_VECTOR_STORAGE", DefaultRAGStorageConfig.VECTOR_STORAGE
     )

+    # Get MAX_PARALLEL_INSERT from environment
+    global_args["max_parallel_insert"] = get_env_value("MAX_PARALLEL_INSERT", 2, int)
+
     # Handle openai-ollama special case
     if args.llm_binding == "openai-ollama":
         args.llm_binding = "openai"

@@ -441,8 +444,8 @@ def display_splash_screen(args: argparse.Namespace) -> None:
     ASCIIColors.yellow(f"{args.log_level}")
     ASCIIColors.white("    ├─ Verbose Debug: ", end="")
     ASCIIColors.yellow(f"{args.verbose}")
-    ASCIIColors.white("    ├─ Timeout: ", end="")
-    ASCIIColors.yellow(f"{args.timeout if args.timeout else 'None (infinite)'}")
+    ASCIIColors.white("    ├─ History Turns: ", end="")
+    ASCIIColors.yellow(f"{args.history_turns}")
     ASCIIColors.white("    └─ API Key: ", end="")
     ASCIIColors.yellow("Set" if args.key else "Not Set")

@@ -459,8 +462,10 @@ def display_splash_screen(args: argparse.Namespace) -> None:
     ASCIIColors.yellow(f"{args.llm_binding}")
     ASCIIColors.white("    ├─ Host: ", end="")
     ASCIIColors.yellow(f"{args.llm_binding_host}")
-    ASCIIColors.white("    └─ Model: ", end="")
+    ASCIIColors.white("    ├─ Model: ", end="")
     ASCIIColors.yellow(f"{args.llm_model}")
+    ASCIIColors.white("    └─ Timeout: ", end="")
+    ASCIIColors.yellow(f"{args.timeout if args.timeout else 'None (infinite)'}")

     # Embedding Configuration
     ASCIIColors.magenta("\n📊 Embedding Configuration:")

@@ -475,8 +480,10 @@ def display_splash_screen(args: argparse.Namespace) -> None:

     # RAG Configuration
     ASCIIColors.magenta("\n⚙️ RAG Configuration:")
-    ASCIIColors.white("    ├─ Max Async: ", end="")
+    ASCIIColors.white("    ├─ Max Async for LLM: ", end="")
     ASCIIColors.yellow(f"{args.max_async}")
+    ASCIIColors.white("    ├─ Max Parallel Insert: ", end="")
+    ASCIIColors.yellow(f"{global_args['max_parallel_insert']}")
     ASCIIColors.white("    ├─ Max Tokens: ", end="")
     ASCIIColors.yellow(f"{args.max_tokens}")
     ASCIIColors.white("    ├─ Max Embed Tokens: ", end="")

@@ -485,8 +492,6 @@ def display_splash_screen(args: argparse.Namespace) -> None:
     ASCIIColors.yellow(f"{args.chunk_size}")
     ASCIIColors.white("    ├─ Chunk Overlap Size: ", end="")
     ASCIIColors.yellow(f"{args.chunk_overlap_size}")
-    ASCIIColors.white("    ├─ History Turns: ", end="")
-    ASCIIColors.yellow(f"{args.history_turns}")
     ASCIIColors.white("    ├─ Cosine Threshold: ", end="")
     ASCIIColors.yellow(f"{args.cosine_threshold}")
     ASCIIColors.white("    ├─ Top-K: ", end="")
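get_env_value is the helper already used throughout utils_api.py; a minimal re-implementation of the semantics this new call relies on (for illustration only, not the project's actual code):

import os
from typing import Callable, TypeVar

T = TypeVar("T")

def get_env_value(name: str, default: T, cast: Callable[[str], T] = str) -> T:
    # Read an environment variable, cast it, and fall back to the default.
    raw = os.getenv(name)
    return cast(raw) if raw is not None else default

max_parallel_insert = get_env_value("MAX_PARALLEL_INSERT", 2, int)  # -> 2 if unset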
lightrag/kg/shared_storage.py

@@ -41,6 +41,9 @@ _pipeline_status_lock: Optional[LockType] = None
 _graph_db_lock: Optional[LockType] = None
 _data_init_lock: Optional[LockType] = None

+# async locks for coroutine synchronization in multiprocess mode
+_async_locks: Optional[Dict[str, asyncio.Lock]] = None
+

 class UnifiedLock(Generic[T]):
     """Provide a unified lock interface type for asyncio.Lock and multiprocessing.Lock"""

@@ -51,12 +54,14 @@ class UnifiedLock(Generic[T]):
         is_async: bool,
         name: str = "unnamed",
         enable_logging: bool = True,
+        async_lock: Optional[asyncio.Lock] = None,
     ):
         self._lock = lock
         self._is_async = is_async
         self._pid = os.getpid()  # for debug only
         self._name = name  # for debug only
         self._enable_logging = enable_logging  # for debug only
+        self._async_lock = async_lock  # auxiliary lock for coroutine synchronization

     async def __aenter__(self) -> "UnifiedLock[T]":
         try:

@@ -64,16 +69,39 @@ class UnifiedLock(Generic[T]):
                 f"== Lock == Process {self._pid}: Acquiring lock '{self._name}' (async={self._is_async})",
                 enable_output=self._enable_logging,
             )
+
+            # If in multiprocess mode and async lock exists, acquire it first
+            if not self._is_async and self._async_lock is not None:
+                direct_log(
+                    f"== Lock == Process {self._pid}: Acquiring async lock for '{self._name}'",
+                    enable_output=self._enable_logging,
+                )
+                await self._async_lock.acquire()
+                direct_log(
+                    f"== Lock == Process {self._pid}: Async lock for '{self._name}' acquired",
+                    enable_output=self._enable_logging,
+                )
+
+            # Then acquire the main lock
             if self._is_async:
                 await self._lock.acquire()
             else:
                 self._lock.acquire()
+
             direct_log(
                 f"== Lock == Process {self._pid}: Lock '{self._name}' acquired (async={self._is_async})",
                 enable_output=self._enable_logging,
             )
             return self
         except Exception as e:
+            # If main lock acquisition fails, release the async lock if it was acquired
+            if (
+                not self._is_async
+                and self._async_lock is not None
+                and self._async_lock.locked()
+            ):
+                self._async_lock.release()
+
             direct_log(
                 f"== Lock == Process {self._pid}: Failed to acquire lock '{self._name}': {e}",
                 level="ERROR",

@@ -82,15 +110,29 @@ class UnifiedLock(Generic[T]):
             raise

     async def __aexit__(self, exc_type, exc_val, exc_tb):
+        main_lock_released = False
         try:
             direct_log(
                 f"== Lock == Process {self._pid}: Releasing lock '{self._name}' (async={self._is_async})",
                 enable_output=self._enable_logging,
             )
+
+            # Release main lock first
             if self._is_async:
                 self._lock.release()
             else:
                 self._lock.release()
+
+            main_lock_released = True
+
+            # Then release async lock if in multiprocess mode
+            if not self._is_async and self._async_lock is not None:
+                direct_log(
+                    f"== Lock == Process {self._pid}: Releasing async lock for '{self._name}'",
+                    enable_output=self._enable_logging,
+                )
+                self._async_lock.release()
+
             direct_log(
                 f"== Lock == Process {self._pid}: Lock '{self._name}' released (async={self._is_async})",
                 enable_output=self._enable_logging,

@@ -101,6 +143,31 @@ class UnifiedLock(Generic[T]):
                 level="ERROR",
                 enable_output=self._enable_logging,
             )
+
+            # If main lock release failed but async lock hasn't been released, try to release it
+            if (
+                not main_lock_released
+                and not self._is_async
+                and self._async_lock is not None
+            ):
+                try:
+                    direct_log(
+                        f"== Lock == Process {self._pid}: Attempting to release async lock after main lock failure",
+                        level="WARNING",
+                        enable_output=self._enable_logging,
+                    )
+                    self._async_lock.release()
+                    direct_log(
+                        f"== Lock == Process {self._pid}: Successfully released async lock after main lock failure",
+                        enable_output=self._enable_logging,
+                    )
+                except Exception as inner_e:
+                    direct_log(
+                        f"== Lock == Process {self._pid}: Failed to release async lock after main lock failure: {inner_e}",
+                        level="ERROR",
+                        enable_output=self._enable_logging,
+                    )
+
             raise

     def __enter__(self) -> "UnifiedLock[T]":

@@ -151,51 +218,61 @@ class UnifiedLock(Generic[T]):

 def get_internal_lock(enable_logging: bool = False) -> UnifiedLock:
     """return unified storage lock for data consistency"""
+    async_lock = _async_locks.get("internal_lock") if is_multiprocess else None
     return UnifiedLock(
         lock=_internal_lock,
         is_async=not is_multiprocess,
         name="internal_lock",
         enable_logging=enable_logging,
+        async_lock=async_lock,
     )


 def get_storage_lock(enable_logging: bool = False) -> UnifiedLock:
     """return unified storage lock for data consistency"""
+    async_lock = _async_locks.get("storage_lock") if is_multiprocess else None
     return UnifiedLock(
         lock=_storage_lock,
         is_async=not is_multiprocess,
         name="storage_lock",
         enable_logging=enable_logging,
+        async_lock=async_lock,
     )


 def get_pipeline_status_lock(enable_logging: bool = False) -> UnifiedLock:
     """return unified storage lock for data consistency"""
+    async_lock = _async_locks.get("pipeline_status_lock") if is_multiprocess else None
     return UnifiedLock(
         lock=_pipeline_status_lock,
         is_async=not is_multiprocess,
         name="pipeline_status_lock",
         enable_logging=enable_logging,
+        async_lock=async_lock,
     )


 def get_graph_db_lock(enable_logging: bool = False) -> UnifiedLock:
     """return unified graph database lock for ensuring atomic operations"""
+    async_lock = _async_locks.get("graph_db_lock") if is_multiprocess else None
     return UnifiedLock(
         lock=_graph_db_lock,
         is_async=not is_multiprocess,
         name="graph_db_lock",
         enable_logging=enable_logging,
+        async_lock=async_lock,
     )


 def get_data_init_lock(enable_logging: bool = False) -> UnifiedLock:
     """return unified data initialization lock for ensuring atomic data initialization"""
+    async_lock = _async_locks.get("data_init_lock") if is_multiprocess else None
     return UnifiedLock(
         lock=_data_init_lock,
         is_async=not is_multiprocess,
         name="data_init_lock",
         enable_logging=enable_logging,
+        async_lock=async_lock,
     )


@@ -229,7 +306,8 @@ def initialize_share_data(workers: int = 1):
         _shared_dicts, \
         _init_flags, \
         _initialized, \
-        _update_flags
+        _update_flags, \
+        _async_locks

     # Check if already initialized
     if _initialized:

@@ -251,6 +329,16 @@ def initialize_share_data(workers: int = 1):
         _shared_dicts = _manager.dict()
         _init_flags = _manager.dict()
         _update_flags = _manager.dict()
+
+        # Initialize async locks for multiprocess mode
+        _async_locks = {
+            "internal_lock": asyncio.Lock(),
+            "storage_lock": asyncio.Lock(),
+            "pipeline_status_lock": asyncio.Lock(),
+            "graph_db_lock": asyncio.Lock(),
+            "data_init_lock": asyncio.Lock(),
+        }
+
         direct_log(
             f"Process {os.getpid()} Shared-Data created for Multiple Process (workers={workers})"
         )

@@ -264,6 +352,7 @@ def initialize_share_data(workers: int = 1):
         _shared_dicts = {}
         _init_flags = {}
         _update_flags = {}
+        _async_locks = None  # No need for async locks in single process mode
        direct_log(f"Process {os.getpid()} Shared-Data created for Single Process")

     # Mark as initialized

@@ -458,7 +547,8 @@ def finalize_share_data():
         _shared_dicts, \
         _init_flags, \
         _initialized, \
-        _update_flags
+        _update_flags, \
+        _async_locks

     # Check if already initialized
     if not _initialized:

@@ -523,5 +613,6 @@ def finalize_share_data():
     _graph_db_lock = None
     _data_init_lock = None
     _update_flags = None
+    _async_locks = None

     direct_log(f"Process {os.getpid()} storage data finalization complete")
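On the deadlock itself: multiprocessing.Lock.acquire() is a blocking call, so when two coroutines in the same Gunicorn worker contend for one cross-process lock, the second acquire() freezes the worker's entire event loop while the coroutine that holds the lock can never be scheduled to release it. Acquiring a per-process asyncio.Lock first means at most one coroutine per worker ever reaches the blocking call. A self-contained sketch of the pattern (simplified from UnifiedLock; names are illustrative):

import asyncio
import multiprocessing

mp_lock = multiprocessing.Lock()  # shared across worker processes
aio_lock = asyncio.Lock()         # per-process, protects the event loop

async def critical_section(task_id: int) -> None:
    # Coroutines in this process queue up here without blocking the loop...
    async with aio_lock:
        # ...so only one coroutine at a time reaches the blocking acquire.
        mp_lock.acquire()
        try:
            await asyncio.sleep(0.1)  # simulated shared-storage work
            print(f"task {task_id} done")
        finally:
            mp_lock.release()

async def main() -> None:
    # Without aio_lock, task 2's mp_lock.acquire() would block the event loop
    # while task 1 still holds mp_lock, and neither could ever finish.
    await asyncio.gather(critical_section(1), critical_section(2))

asyncio.run(main())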