Merge pull request #1036 from danielaskdd/neo4j-add-min-degree
Refactoring Neo4j implementation and fixing storage init problem for Gunicorn
- lightrag/api/lightrag_server.py +21 -21
- lightrag/api/utils_api.py +8 -1
- lightrag/kg/json_doc_status_impl.py +34 -17
- lightrag/kg/json_kv_impl.py +58 -16
- lightrag/kg/neo4j_impl.py +656 -328
- lightrag/kg/shared_storage.py +189 -36
- lightrag/lightrag.py +30 -90
- lightrag/operate.py +47 -26
- lightrag/utils.py +42 -14
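The API-side half of this change moves the ENABLE_LLM_CACHE_FOR_EXTRACT toggle out of lightrag_server.py and into parse_args() in utils_api.py, so the flag travels on args and is handed to LightRAG as enable_llm_cache_for_entity_extract. A minimal sketch of that flow, with env_bool as a hypothetical stand-in for the get_env_value(..., bool) helper:

    import os

    def env_bool(name: str, default: bool = False) -> bool:
        # Hypothetical parser: treat the usual truthy strings as True, otherwise fall back.
        raw = os.getenv(name)
        if raw is None:
            return default
        return raw.strip().lower() in ("true", "1", "yes")

    # Parsed once in parse_args(), then passed into LightRAG(...) by create_app():
    enable_llm_cache_for_extract = env_bool("ENABLE_LLM_CACHE_FOR_EXTRACT", False)
    print(f"LLM cache for entity extraction: {enable_llm_cache_for_extract}")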
lightrag/api/lightrag_server.py
CHANGED
@@ -50,9 +50,6 @@ from .auth import auth_handler
 # This update allows the user to put a different.env file for each lightrag folder
 load_dotenv(".env", override=True)

-# Read entity extraction cache config
-enable_llm_cache = os.getenv("ENABLE_LLM_CACHE_FOR_EXTRACT", "false").lower() == "true"
-
 # Initialize config parser
 config = configparser.ConfigParser()
 config.read("config.ini")

@@ -144,23 +141,25 @@ def create_app(args):
        try:
            # Initialize database connections
            await rag.initialize_storages()
-            await initialize_pipeline_status()

+            await initialize_pipeline_status()
+            pipeline_status = await get_namespace_data("pipeline_status")
+
+            should_start_autoscan = False
+            async with get_pipeline_status_lock():
+                # Auto scan documents if enabled
+                if args.auto_scan_at_startup:
+                    if not pipeline_status.get("autoscanned", False):
+                        pipeline_status["autoscanned"] = True
+                        should_start_autoscan = True
+
+            # Only run auto scan when no other process started it first
+            if should_start_autoscan:
+                # Create background task
+                task = asyncio.create_task(run_scanning_process(rag, doc_manager))
+                app.state.background_tasks.add(task)
+                task.add_done_callback(app.state.background_tasks.discard)
+                logger.info(f"Process {os.getpid()} auto scan task started at startup.")

            ASCIIColors.green("\nServer is ready to accept connections! 🚀\n")

@@ -326,7 +325,7 @@ def create_app(args):
            vector_db_storage_cls_kwargs={
                "cosine_better_than_threshold": args.cosine_threshold
            },
-            enable_llm_cache_for_entity_extract=
+            enable_llm_cache_for_entity_extract=args.enable_llm_cache_for_extract,
            embedding_cache_config={
                "enabled": True,
                "similarity_threshold": 0.95,

@@ -355,7 +354,7 @@ def create_app(args):
            vector_db_storage_cls_kwargs={
                "cosine_better_than_threshold": args.cosine_threshold
            },
-            enable_llm_cache_for_entity_extract=
+            enable_llm_cache_for_entity_extract=args.enable_llm_cache_for_extract,
            embedding_cache_config={
                "enabled": True,
                "similarity_threshold": 0.95,

@@ -419,6 +418,7 @@ def create_app(args):
                "doc_status_storage": args.doc_status_storage,
                "graph_storage": args.graph_storage,
                "vector_storage": args.vector_storage,
+                "enable_llm_cache_for_extract": args.enable_llm_cache_for_extract,
            },
            "update_status": update_status,
        }
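Under Gunicorn every worker runs this startup block, so the document auto-scan used to be launched once per worker. The new code claims the job by flipping an autoscanned flag in the shared pipeline-status namespace while holding the pipeline status lock, so only the first worker starts the background scan. A toy single-process sketch of that claim pattern (plain asyncio objects standing in for the cross-process shared-storage primitives):

    import asyncio

    pipeline_status = {}            # stand-in for the shared "pipeline_status" namespace
    pipeline_lock = asyncio.Lock()  # stand-in for get_pipeline_status_lock()

    async def claim_autoscan() -> bool:
        """Return True only for the first caller; later callers see the flag already set."""
        async with pipeline_lock:
            if not pipeline_status.get("autoscanned", False):
                pipeline_status["autoscanned"] = True
                return True
        return False

    async def main():
        claims = await asyncio.gather(*(claim_autoscan() for _ in range(4)))
        print(claims)  # exactly one True, e.g. [True, False, False, False]

    asyncio.run(main())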
lightrag/api/utils_api.py
CHANGED
@@ -362,6 +362,11 @@ def parse_args(is_uvicorn_mode: bool = False) -> argparse.Namespace:
    args.chunk_size = get_env_value("CHUNK_SIZE", 1200, int)
    args.chunk_overlap_size = get_env_value("CHUNK_OVERLAP_SIZE", 100, int)

+    # Inject LLM cache configuration
+    args.enable_llm_cache_for_extract = get_env_value(
+        "ENABLE_LLM_CACHE_FOR_EXTRACT", False, bool
+    )
+
    # Select Document loading tool (DOCLING, DEFAULT)
    args.document_loading_engine = get_env_value("DOCUMENT_LOADING_ENGINE", "DEFAULT")

@@ -457,8 +462,10 @@ def display_splash_screen(args: argparse.Namespace) -> None:
    ASCIIColors.yellow(f"{args.history_turns}")
    ASCIIColors.white(" ├─ Cosine Threshold: ", end="")
    ASCIIColors.yellow(f"{args.cosine_threshold}")
-    ASCIIColors.white("
+    ASCIIColors.white(" ├─ Top-K: ", end="")
    ASCIIColors.yellow(f"{args.top_k}")
+    ASCIIColors.white(" └─ LLM Cache for Extraction Enabled: ", end="")
+    ASCIIColors.yellow(f"{args.enable_llm_cache_for_extract}")

    # System Configuration
    ASCIIColors.magenta("\n💾 Storage Configuration:")
lightrag/kg/json_doc_status_impl.py
CHANGED
@@ -15,6 +15,10 @@ from lightrag.utils import (
from .shared_storage import (
    get_namespace_data,
    get_storage_lock,
+    get_data_init_lock,
+    get_update_flag,
+    set_all_update_flags,
+    clear_all_update_flags,
    try_initialize_namespace,
)

@@ -27,21 +31,25 @@ class JsonDocStatusStorage(DocStatusStorage):
    def __post_init__(self):
        working_dir = self.global_config["working_dir"]
        self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
-        self._storage_lock = get_storage_lock()
        self._data = None
+        self._storage_lock = None
+        self.storage_updated = None

    async def initialize(self):
        """Initialize storage data"""
+        self._storage_lock = get_storage_lock()
+        self.storage_updated = await get_update_flag(self.namespace)
+        async with get_data_init_lock():
+            # check need_init must before get_namespace_data
+            need_init = await try_initialize_namespace(self.namespace)
+            self._data = await get_namespace_data(self.namespace)
+            if need_init:
+                loaded_data = load_json(self._file_name) or {}
+                async with self._storage_lock:
+                    self._data.update(loaded_data)
+                logger.info(
+                    f"Process {os.getpid()} doc status load {self.namespace} with {len(loaded_data)} records"
+                )

    async def filter_keys(self, keys: set[str]) -> set[str]:
        """Return keys that should be processed (not in storage or not successfully processed)"""

@@ -87,18 +95,24 @@ class JsonDocStatusStorage(DocStatusStorage):

    async def index_done_callback(self) -> None:
        async with self._storage_lock:
+            if self.storage_updated.value:
+                data_dict = (
+                    dict(self._data) if hasattr(self._data, "_getvalue") else self._data
+                )
+                logger.info(
+                    f"Process {os.getpid()} doc status writting {len(data_dict)} records to {self.namespace}"
+                )
+                write_json(data_dict, self._file_name)
+                await clear_all_update_flags(self.namespace)

    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
-        logger.info(f"Inserting {len(data)} to {self.namespace}")
        if not data:
            return
+        logger.info(f"Inserting {len(data)} records to {self.namespace}")
        async with self._storage_lock:
            self._data.update(data)
+            await set_all_update_flags(self.namespace)
+
        await self.index_done_callback()

    async def get_by_id(self, id: str) -> Union[dict[str, Any], None]:

@@ -109,9 +123,12 @@ class JsonDocStatusStorage(DocStatusStorage):
        async with self._storage_lock:
            for doc_id in doc_ids:
                self._data.pop(doc_id, None)
+            await set_all_update_flags(self.namespace)
        await self.index_done_callback()

    async def drop(self) -> None:
        """Drop the storage"""
        async with self._storage_lock:
            self._data.clear()
+            await set_all_update_flags(self.namespace)
+        await self.index_done_callback()
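JsonDocStatusStorage now defers the actual write_json until the shared data has really changed: upsert, delete and drop set the namespace's update flag, and index_done_callback writes the file and clears the flags only when that flag is set (JsonKVStorage in the next file follows the same pattern). A toy single-process sketch of the upsert → flag → conditional-flush cycle, with a plain attribute standing in for the multiprocessing-aware flag in shared_storage:

    import json

    class ToyDocStatusStore:
        def __init__(self, path: str):
            self.path = path
            self.data = {}
            self.updated = False        # stand-in for the per-namespace update flag

        def upsert(self, items: dict) -> None:
            self.data.update(items)
            self.updated = True         # set_all_update_flags(namespace)
            self.flush()

        def flush(self) -> None:        # stand-in for index_done_callback()
            if not self.updated:        # nothing changed (or another writer already flushed)
                return
            with open(self.path, "w") as f:
                json.dump(self.data, f)
            self.updated = False        # clear_all_update_flags(namespace)

    store = ToyDocStatusStore("kv_store_doc_status_demo.json")
    store.flush()                       # no-op: nothing has changed yet
    store.upsert({"doc-1": {"status": "processed"}})  # sets the flag and writes once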
lightrag/kg/json_kv_impl.py
CHANGED
@@ -13,6 +13,10 @@ from lightrag.utils import (
from .shared_storage import (
    get_namespace_data,
    get_storage_lock,
+    get_data_init_lock,
+    get_update_flag,
+    set_all_update_flags,
+    clear_all_update_flags,
    try_initialize_namespace,
)

@@ -23,26 +27,63 @@ class JsonKVStorage(BaseKVStorage):
    def __post_init__(self):
        working_dir = self.global_config["working_dir"]
        self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
-        self._storage_lock = get_storage_lock()
        self._data = None
+        self._storage_lock = None
+        self.storage_updated = None

    async def initialize(self):
        """Initialize storage data"""
+        self._storage_lock = get_storage_lock()
+        self.storage_updated = await get_update_flag(self.namespace)
+        async with get_data_init_lock():
+            # check need_init must before get_namespace_data
+            need_init = await try_initialize_namespace(self.namespace)
+            self._data = await get_namespace_data(self.namespace)
+            if need_init:
+                loaded_data = load_json(self._file_name) or {}
+                async with self._storage_lock:
+                    self._data.update(loaded_data)
+
+                # Calculate data count based on namespace
+                if self.namespace.endswith("cache"):
+                    # For cache namespaces, sum the cache entries across all cache types
+                    data_count = sum(
+                        len(first_level_dict)
+                        for first_level_dict in loaded_data.values()
+                        if isinstance(first_level_dict, dict)
+                    )
+                else:
+                    # For non-cache namespaces, use the original count method
+                    data_count = len(loaded_data)
+
+                logger.info(
+                    f"Process {os.getpid()} KV load {self.namespace} with {data_count} records"
+                )

    async def index_done_callback(self) -> None:
        async with self._storage_lock:
+            if self.storage_updated.value:
+                data_dict = (
+                    dict(self._data) if hasattr(self._data, "_getvalue") else self._data
+                )
+
+                # Calculate data count based on namespace
+                if self.namespace.endswith("cache"):
+                    # For cache namespaces, sum the cache entries across all cache types
+                    data_count = sum(
+                        len(first_level_dict)
+                        for first_level_dict in data_dict.values()
+                        if isinstance(first_level_dict, dict)
+                    )
+                else:
+                    # For non-cache namespaces, use the original count method
+                    data_count = len(data_dict)
+
+                logger.info(
+                    f"Process {os.getpid()} KV writting {data_count} records to {self.namespace}"
+                )
+                write_json(data_dict, self._file_name)
+                await clear_all_update_flags(self.namespace)

    async def get_all(self) -> dict[str, Any]:
        """Get all data from storage

@@ -73,15 +114,16 @@ class JsonKVStorage(BaseKVStorage):
        return set(keys) - set(self._data.keys())

    async def upsert(self, data: dict[str, dict[str, Any]]) -> None:
-        logger.info(f"Inserting {len(data)} to {self.namespace}")
        if not data:
            return
+        logger.info(f"Inserting {len(data)} records to {self.namespace}")
        async with self._storage_lock:
+            self._data.update(data)
+            await set_all_update_flags(self.namespace)

    async def delete(self, ids: list[str]) -> None:
        async with self._storage_lock:
            for doc_id in ids:
                self._data.pop(doc_id, None)
+            await set_all_update_flags(self.namespace)
        await self.index_done_callback()
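For namespaces ending in "cache" the KV file is a two-level dict (cache bucket → entries keyed by hash), so the new log lines count the inner entries rather than the top-level buckets. The same count can be checked in isolation (the store shape here is an assumption for illustration):

    # Assumed shape of an LLM-cache style store: bucket -> {hash: entry}
    cache_store = {
        "default": {"hash-a": {"return": "..."}, "hash-b": {"return": "..."}},
        "global": {"hash-c": {"return": "..."}},
    }

    data_count = sum(
        len(bucket) for bucket in cache_store.values() if isinstance(bucket, dict)
    )
    print(data_count)  # 3 entries, not 2 buckets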
lightrag/kg/neo4j_impl.py
CHANGED
@@ -3,7 +3,7 @@ import inspect
import os
import re
from dataclasses import dataclass
-from typing import Any,
+from typing import Any, final, Optional
import numpy as np
import configparser

@@ -15,6 +15,7 @@ from tenacity import (
    retry_if_exception_type,
)

+import logging
from ..utils import logger
from ..base import BaseGraphStorage
from ..types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge

@@ -37,6 +38,9 @@ config.read("config.ini", "utf-8")
# Get maximum number of graph nodes from environment variable, default is 1000
MAX_GRAPH_NODES = int(os.getenv("MAX_GRAPH_NODES", 1000))

+# Set neo4j logger level to ERROR to suppress warning logs
+logging.getLogger("neo4j").setLevel(logging.ERROR)
+

@final
@dataclass

@@ -60,19 +64,25 @@ class Neo4JStorage(BaseGraphStorage):
        MAX_CONNECTION_POOL_SIZE = int(
            os.environ.get(
                "NEO4J_MAX_CONNECTION_POOL_SIZE",
-                config.get("neo4j", "connection_pool_size", fallback=
+                config.get("neo4j", "connection_pool_size", fallback=50),
            )
        )
        CONNECTION_TIMEOUT = float(
            os.environ.get(
                "NEO4J_CONNECTION_TIMEOUT",
-                config.get("neo4j", "connection_timeout", fallback=
+                config.get("neo4j", "connection_timeout", fallback=30.0),
            ),
        )
        CONNECTION_ACQUISITION_TIMEOUT = float(
            os.environ.get(
                "NEO4J_CONNECTION_ACQUISITION_TIMEOUT",
-                config.get("neo4j", "connection_acquisition_timeout", fallback=
+                config.get("neo4j", "connection_acquisition_timeout", fallback=30.0),
+            ),
+        )
+        MAX_TRANSACTION_RETRY_TIME = float(
+            os.environ.get(
+                "NEO4J_MAX_TRANSACTION_RETRY_TIME",
+                config.get("neo4j", "max_transaction_retry_time", fallback=30.0),
            ),
        )
        DATABASE = os.environ.get(

@@ -85,6 +95,7 @@ class Neo4JStorage(BaseGraphStorage):
            max_connection_pool_size=MAX_CONNECTION_POOL_SIZE,
            connection_timeout=CONNECTION_TIMEOUT,
            connection_acquisition_timeout=CONNECTION_ACQUISITION_TIMEOUT,
+            max_transaction_retry_time=MAX_TRANSACTION_RETRY_TIME,
        )

        # Try to connect to the database
@@ -152,65 +163,103 @@ class Neo4JStorage(BaseGraphStorage):
        }

    async def close(self):
+        """Close the Neo4j driver and release all resources"""
        if self._driver:
            await self._driver.close()
            self._driver = None

    async def __aexit__(self, exc_type, exc, tb):
+        """Ensure driver is closed when context manager exits"""
+        await self.close()

    async def index_done_callback(self) -> None:
        # Noe4J handles persistence automatically
        pass

+    def _ensure_label(self, label: str) -> str:
+        """Ensure a label is valid
+
+        Args:
+            label: The label to validate
+
+        Returns:
+            str: The cleaned label
+
+        Raises:
+            ValueError: If label is empty after cleaning
+        """
        clean_label = label.strip('"')
+        if not clean_label:
+            raise ValueError("Neo4j: Label cannot be empty")
        return clean_label

    async def has_node(self, node_id: str) -> bool:
+        """
+        Check if a node with the given label exists in the database
+
+        Args:
+            node_id: Label of the node to check
+
+        Returns:
+            bool: True if node exists, False otherwise
+
+        Raises:
+            ValueError: If node_id is invalid
+            Exception: If there is an error executing the query
+        """
+        entity_name_label = self._ensure_label(node_id)
+        async with self._driver.session(
+            database=self._DATABASE, default_access_mode="READ"
+        ) as session:
+            try:
+                query = f"MATCH (n:`{entity_name_label}`) RETURN count(n) > 0 AS node_exists"
+                result = await session.run(query)
+                single_result = await result.single()
+                await result.consume()  # Ensure result is fully consumed
+                return single_result["node_exists"]
+            except Exception as e:
+                logger.error(
+                    f"Error checking node existence for {entity_name_label}: {str(e)}"
+                )
+                await result.consume()  # Ensure results are consumed even on error
+                raise

    async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
+        """
+        Check if an edge exists between two nodes
+
+        Args:
+            source_node_id: Label of the source node
+            target_node_id: Label of the target node
+
+        Returns:
+            bool: True if edge exists, False otherwise
+
+        Raises:
+            ValueError: If either node_id is invalid
+            Exception: If there is an error executing the query
+        """
+        entity_name_label_source = self._ensure_label(source_node_id)
+        entity_name_label_target = self._ensure_label(target_node_id)
+
+        async with self._driver.session(
+            database=self._DATABASE, default_access_mode="READ"
+        ) as session:
+            try:
+                query = (
+                    f"MATCH (a:`{entity_name_label_source}`)-[r]-(b:`{entity_name_label_target}`) "
+                    "RETURN COUNT(r) > 0 AS edgeExists"
+                )
+                result = await session.run(query)
+                single_result = await result.single()
+                await result.consume()  # Ensure result is fully consumed
+                return single_result["edgeExists"]
+            except Exception as e:
+                logger.error(
+                    f"Error checking edge existence between {entity_name_label_source} and {entity_name_label_target}: {str(e)}"
+                )
+                await result.consume()  # Ensure results are consumed even on error
+                raise

    async def get_node(self, node_id: str) -> dict[str, str] | None:
        """Get node by its label identifier.

@@ -221,43 +270,108 @@ class Neo4JStorage(BaseGraphStorage):
        Returns:
            dict: Node properties if found
            None: If node not found
+
+        Raises:
+            ValueError: If node_id is invalid
+            Exception: If there is an error executing the query
        """
+        entity_name_label = self._ensure_label(node_id)
+        async with self._driver.session(
+            database=self._DATABASE, default_access_mode="READ"
+        ) as session:
+            try:
+                query = f"MATCH (n:`{entity_name_label}` {{entity_id: $entity_id}}) RETURN n"
+                result = await session.run(query, entity_id=entity_name_label)
+                try:
+                    records = await result.fetch(
+                        2
+                    )  # Get 2 records for duplication check
+
+                    if len(records) > 1:
+                        logger.warning(
+                            f"Multiple nodes found with label '{entity_name_label}'. Using first node."
+                        )
+                    if records:
+                        node = records[0]["n"]
+                        node_dict = dict(node)
+                        logger.debug(
+                            f"{inspect.currentframe().f_code.co_name}: query: {query}, result: {node_dict}"
+                        )
+                        return node_dict
+                    return None
+                finally:
+                    await result.consume()  # Ensure result is fully consumed
+            except Exception as e:
+                logger.error(f"Error getting node for {entity_name_label}: {str(e)}")
+                raise

    async def node_degree(self, node_id: str) -> int:
+        """Get the degree (number of relationships) of a node with the given label.
+        If multiple nodes have the same label, returns the degree of the first node.
+        If no node is found, returns 0.
+
+        Args:
+            node_id: The label of the node
+
+        Returns:
+            int: The number of relationships the node has, or 0 if no node found
+
+        Raises:
+            ValueError: If node_id is invalid
+            Exception: If there is an error executing the query
+        """
+        entity_name_label = self._ensure_label(node_id)
+
+        async with self._driver.session(
+            database=self._DATABASE, default_access_mode="READ"
+        ) as session:
+            try:
+                query = f"""
+                    MATCH (n:`{entity_name_label}`)
+                    OPTIONAL MATCH (n)-[r]-()
+                    RETURN n, COUNT(r) AS degree
+                """
+                result = await session.run(query)
+                try:
+                    records = await result.fetch(100)
+
+                    if not records:
+                        logger.warning(
+                            f"No node found with label '{entity_name_label}'"
+                        )
+                        return 0
+
+                    if len(records) > 1:
+                        logger.warning(
+                            f"Multiple nodes ({len(records)}) found with label '{entity_name_label}', using first node's degree"
+                        )
+
+                    degree = records[0]["degree"]
+                    logger.debug(
+                        f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{degree}"
+                    )
+                    return degree
+                finally:
+                    await result.consume()  # Ensure result is fully consumed
+            except Exception as e:
+                logger.error(
+                    f"Error getting node degree for {entity_name_label}: {str(e)}"
                )
+                raise

    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
+        """Get the total degree (sum of relationships) of two nodes.
+
+        Args:
+            src_id: Label of the source node
+            tgt_id: Label of the target node
+
+        Returns:
+            int: Sum of the degrees of both nodes
+        """
+        entity_name_label_source = self._ensure_label(src_id)
+        entity_name_label_target = self._ensure_label(tgt_id)
+
        src_degree = await self.node_degree(entity_name_label_source)
        trg_degree = await self.node_degree(entity_name_label_target)

@@ -266,116 +380,152 @@ class Neo4JStorage(BaseGraphStorage):
        trg_degree = 0 if trg_degree is None else trg_degree

        degrees = int(src_degree) + int(trg_degree)
        return degrees

    async def get_edge(
        self, source_node_id: str, target_node_id: str
    ) -> dict[str, str] | None:
+        """Get edge properties between two nodes.
+
+        Args:
+            source_node_id: Label of the source node
+            target_node_id: Label of the target node
+
+        Returns:
+            dict: Edge properties if found, default properties if not found or on error
+
+        Raises:
+            ValueError: If either node_id is invalid
+            Exception: If there is an error executing the query
+        """
        try:
+            entity_name_label_source = self._ensure_label(source_node_id)
+            entity_name_label_target = self._ensure_label(target_node_id)

+            async with self._driver.session(
+                database=self._DATABASE, default_access_mode="READ"
+            ) as session:
                query = f"""
+                MATCH (start:`{entity_name_label_source}`)-[r]-(end:`{entity_name_label_target}`)
                RETURN properties(r) as edge_properties
                """

                result = await session.run(query)
+                try:
+                    records = await result.fetch(2)

+                    if len(records) > 1:
+                        logger.warning(
+                            f"Multiple edges found between '{entity_name_label_source}' and '{entity_name_label_target}'. Using first edge."
                        )
+                    if records:
+                        try:
+                            edge_result = dict(records[0]["edge_properties"])
+                            logger.debug(f"Result: {edge_result}")
+                            # Ensure required keys exist with defaults
+                            required_keys = {
+                                "weight": 0.0,
+                                "source_id": None,
+                                "description": None,
+                                "keywords": None,
+                            }
+                            for key, default_value in required_keys.items():
+                                if key not in edge_result:
+                                    edge_result[key] = default_value
+                                    logger.warning(
+                                        f"Edge between {entity_name_label_source} and {entity_name_label_target} "
+                                        f"missing {key}, using default: {default_value}"
+                                    )
+
+                            logger.debug(
+                                f"{inspect.currentframe().f_code.co_name}:query:{query}:result:{edge_result}"
+                            )
+                            return edge_result
+                        except (KeyError, TypeError, ValueError) as e:
+                            logger.error(
+                                f"Error processing edge properties between {entity_name_label_source} "
+                                f"and {entity_name_label_target}: {str(e)}"
+                            )
+                            # Return default edge properties on error
+                            return {
+                                "weight": 0.0,
+                                "source_id": None,
+                                "description": None,
+                                "keywords": None,
+                            }
+
+                    logger.debug(
+                        f"{inspect.currentframe().f_code.co_name}: No edge found between {entity_name_label_source} and {entity_name_label_target}"
+                    )
+                    # Return default edge properties when no edge found
+                    return {
+                        "weight": 0.0,
+                        "source_id": None,
+                        "description": None,
+                        "keywords": None,
+                    }
+                finally:
+                    await result.consume()  # Ensure result is fully consumed

        except Exception as e:
            logger.error(
                f"Error in get_edge between {source_node_id} and {target_node_id}: {str(e)}"
            )
+            raise

    async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None:
+        """Retrieves all edges (relationships) for a particular node identified by its label.

+        Args:
+            source_node_id: Label of the node to get edges for
+
+        Returns:
+            list[tuple[str, str]]: List of (source_label, target_label) tuples representing edges
+            None: If no edges found
+
+        Raises:
+            ValueError: If source_node_id is invalid
+            Exception: If there is an error executing the query
        """
+        try:
+            node_label = self._ensure_label(source_node_id)

+            query = f"""MATCH (n:`{node_label}`)
+                    OPTIONAL MATCH (n)-[r]-(connected)
+                    RETURN n, r, connected"""

+            async with self._driver.session(
+                database=self._DATABASE, default_access_mode="READ"
+            ) as session:
+                try:
+                    results = await session.run(query)
+                    edges = []
+
+                    async for record in results:
+                        source_node = record["n"]
+                        connected_node = record["connected"]
+
+                        source_label = (
+                            list(source_node.labels)[0] if source_node.labels else None
+                        )
+                        target_label = (
+                            list(connected_node.labels)[0]
+                            if connected_node and connected_node.labels
+                            else None
+                        )
+
+                        if source_label and target_label:
+                            edges.append((source_label, target_label))
+
+                    await results.consume()  # Ensure results are consumed
+                    return edges
+                except Exception as e:
+                    logger.error(f"Error getting edges for node {node_label}: {str(e)}")
+                    await results.consume()  # Ensure results are consumed even on error
+                    raise
+        except Exception as e:
+            logger.error(f"Error in get_node_edges for {source_node_id}: {str(e)}")
+            raise

    @retry(
        stop=stop_after_attempt(3),

@@ -397,26 +547,88 @@ class Neo4JStorage(BaseGraphStorage):
            node_id: The unique identifier for the node (used as label)
            node_data: Dictionary of node properties
        """
+        label = self._ensure_label(node_id)
        properties = node_data
+        if "entity_id" not in properties:
+            raise ValueError("Neo4j: node properties must contain an 'entity_id' field")

        try:
            async with self._driver.session(database=self._DATABASE) as session:
+
+                async def execute_upsert(tx: AsyncManagedTransaction):
+                    query = f"""
+                    MERGE (n:`{label}` {{entity_id: $properties.entity_id}})
+                    SET n += $properties
+                    """
+                    result = await tx.run(query, properties=properties)
+                    logger.debug(
+                        f"Upserted node with label '{label}' and properties: {properties}"
+                    )
+                    await result.consume()  # Ensure result is fully consumed
+
+                await session.execute_write(execute_upsert)
        except Exception as e:
            logger.error(f"Error during upsert: {str(e)}")
            raise

+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=4, max=10),
+        retry=retry_if_exception_type(
+            (
+                neo4jExceptions.ServiceUnavailable,
+                neo4jExceptions.TransientError,
+                neo4jExceptions.WriteServiceUnavailable,
+                neo4jExceptions.ClientError,
+            )
+        ),
+    )
+    async def _get_unique_node_entity_id(self, node_label: str) -> str:
+        """
+        Get the entity_id of a node with the given label, ensuring the node is unique.
+
+        Args:
+            node_label (str): Label of the node to check
+
+        Returns:
+            str: The entity_id of the unique node
+
+        Raises:
+            ValueError: If no node with the given label exists or if multiple nodes have the same label
+        """
+        async with self._driver.session(
+            database=self._DATABASE, default_access_mode="READ"
+        ) as session:
+            query = f"""
+                MATCH (n:`{node_label}`)
+                RETURN n, count(n) as node_count
+            """
+            result = await session.run(query)
+            try:
+                records = await result.fetch(
+                    2
+                )  # We only need to know if there are 0, 1, or >1 nodes
+
+                if not records or records[0]["node_count"] == 0:
+                    raise ValueError(
+                        f"Neo4j: node with label '{node_label}' does not exist"
+                    )
+
+                if records[0]["node_count"] > 1:
+                    raise ValueError(
+                        f"Neo4j: multiple nodes found with label '{node_label}', cannot determine unique node"
+                    )
+
+                node = records[0]["n"]
+                if "entity_id" not in node:
+                    raise ValueError(
+                        f"Neo4j: node with label '{node_label}' does not have an entity_id property"
+                    )
+
+                return node["entity_id"]
+            finally:
+                await result.consume()  # Ensure result is fully consumed
+
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10),

@@ -434,34 +646,55 @@ class Neo4JStorage(BaseGraphStorage):
    ) -> None:
        """
        Upsert an edge and its properties between two nodes identified by their labels.
+        Ensures both source and target nodes exist and are unique before creating the edge.
+        Uses entity_id property to uniquely identify nodes.

        Args:
            source_node_id (str): Label of the source node (used as identifier)
            target_node_id (str): Label of the target node (used as identifier)
            edge_data (dict): Dictionary of properties to set on the edge
+
+        Raises:
+            ValueError: If either source or target node does not exist or is not unique
        """
+        source_label = self._ensure_label(source_node_id)
+        target_label = self._ensure_label(target_node_id)
        edge_properties = edge_data

+        # Get entity_ids for source and target nodes, ensuring they are unique
+        source_entity_id = await self._get_unique_node_entity_id(source_label)
+        target_entity_id = await self._get_unique_node_entity_id(target_label)

        try:
            async with self._driver.session(database=self._DATABASE) as session:
+
+                async def execute_upsert(tx: AsyncManagedTransaction):
+                    query = f"""
+                    MATCH (source:`{source_label}` {{entity_id: $source_entity_id}})
+                    WITH source
+                    MATCH (target:`{target_label}` {{entity_id: $target_entity_id}})
+                    MERGE (source)-[r:DIRECTED]-(target)
+                    SET r += $properties
+                    RETURN r, source, target
+                    """
+                    result = await tx.run(
+                        query,
+                        source_entity_id=source_entity_id,
+                        target_entity_id=target_entity_id,
+                        properties=edge_properties,
+                    )
+                    try:
+                        records = await result.fetch(100)
+                        if records:
+                            logger.debug(
+                                f"Upserted edge from '{source_label}' (entity_id: {source_entity_id}) "
+                                f"to '{target_label}' (entity_id: {target_entity_id}) "
+                                f"with properties: {edge_properties}"
+                            )
+                    finally:
+                        await result.consume()  # Ensure result is consumed
+
+                await session.execute_write(execute_upsert)
        except Exception as e:
            logger.error(f"Error during edge upsert: {str(e)}")
            raise
|
|
|
703 |
print("Implemented but never called.")
|
704 |
|
705 |
async def get_knowledge_graph(
|
706 |
+
self,
|
707 |
+
node_label: str,
|
708 |
+
max_depth: int = 3,
|
709 |
+
min_degree: int = 0,
|
710 |
+
inclusive: bool = False,
|
711 |
) -> KnowledgeGraph:
|
712 |
"""
|
713 |
Retrieve a connected subgraph of nodes where the label includes the specified `node_label`.
|
714 |
Maximum number of nodes is constrained by the environment variable `MAX_GRAPH_NODES` (default: 1000).
|
715 |
When reducing the number of nodes, the prioritization criteria are as follows:
|
716 |
+
1. min_degree does not affect nodes directly connected to the matching nodes
|
717 |
+
2. Label matching nodes take precedence
|
718 |
+
3. Followed by nodes directly connected to the matching nodes
|
719 |
+
4. Finally, the degree of the nodes
|
720 |
|
721 |
Args:
|
722 |
+
node_label: Label of the starting node
|
723 |
+
max_depth: Maximum depth of the subgraph
|
724 |
+
min_degree: Minimum degree of nodes to include. Defaults to 0
|
725 |
+
inclusive: Do an inclusive search if true
|
726 |
Returns:
|
727 |
KnowledgeGraph: Complete connected subgraph for specified node
|
728 |
"""
|
729 |
label = node_label.strip('"')
|
|
|
|
|
730 |
result = KnowledgeGraph()
|
731 |
seen_nodes = set()
|
732 |
seen_edges = set()
|
733 |
|
734 |
+
async with self._driver.session(
|
735 |
+
database=self._DATABASE, default_access_mode="READ"
|
736 |
+
) as session:
|
737 |
try:
|
738 |
if label == "*":
|
739 |
main_query = """
|
740 |
MATCH (n)
|
741 |
OPTIONAL MATCH (n)-[r]-()
|
742 |
WITH n, count(r) AS degree
|
743 |
+
WHERE degree >= $min_degree
|
744 |
ORDER BY degree DESC
|
745 |
LIMIT $max_nodes
|
746 |
+
WITH collect({node: n}) AS filtered_nodes
|
747 |
+
UNWIND filtered_nodes AS node_info
|
748 |
+
WITH collect(node_info.node) AS kept_nodes, filtered_nodes
|
749 |
+
MATCH (a)-[r]-(b)
|
750 |
+
WHERE a IN kept_nodes AND b IN kept_nodes
|
751 |
+
RETURN filtered_nodes AS node_info,
|
752 |
+
collect(DISTINCT r) AS relationships
|
753 |
"""
|
754 |
result_set = await session.run(
|
755 |
+
main_query,
|
756 |
+
{"max_nodes": MAX_GRAPH_NODES, "min_degree": min_degree},
|
757 |
)
|
758 |
|
759 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
760 |
# Main query uses partial matching
|
761 |
+
main_query = """
|
762 |
MATCH (start)
|
763 |
+
WHERE any(label IN labels(start) WHERE
|
764 |
+
CASE
|
765 |
+
WHEN $inclusive THEN label CONTAINS $label
|
766 |
+
ELSE label = $label
|
767 |
+
END
|
768 |
+
)
|
769 |
WITH start
|
770 |
+
CALL apoc.path.subgraphAll(start, {
|
771 |
+
relationshipFilter: '',
|
772 |
minLevel: 0,
|
773 |
+
maxLevel: $max_depth,
|
774 |
bfs: true
|
775 |
+
})
|
776 |
YIELD nodes, relationships
|
777 |
WITH start, nodes, relationships
|
778 |
UNWIND nodes AS node
|
779 |
OPTIONAL MATCH (node)-[r]-()
|
780 |
+
WITH node, count(r) AS degree, start, nodes, relationships
|
781 |
+
WHERE node = start OR EXISTS((start)--(node)) OR degree >= $min_degree
|
782 |
+
ORDER BY
|
783 |
+
CASE
|
784 |
+
WHEN node = start THEN 3
|
785 |
+
WHEN EXISTS((start)--(node)) THEN 2
|
786 |
+
ELSE 1
|
787 |
+
END DESC,
|
788 |
+
degree DESC
|
789 |
LIMIT $max_nodes
|
790 |
+
WITH collect({node: node}) AS filtered_nodes
|
791 |
+
UNWIND filtered_nodes AS node_info
|
792 |
+
WITH collect(node_info.node) AS kept_nodes, filtered_nodes
|
793 |
+
MATCH (a)-[r]-(b)
|
794 |
+
WHERE a IN kept_nodes AND b IN kept_nodes
|
795 |
+
RETURN filtered_nodes AS node_info,
|
796 |
+
collect(DISTINCT r) AS relationships
|
797 |
"""
|
798 |
result_set = await session.run(
|
799 |
+
main_query,
|
800 |
+
{
|
801 |
+
"max_nodes": MAX_GRAPH_NODES,
|
802 |
+
"label": label,
|
803 |
+
"inclusive": inclusive,
|
804 |
+
"max_depth": max_depth,
|
805 |
+
"min_degree": min_degree,
|
806 |
+
},
|
807 |
)
|
808 |
|
809 |
+
try:
|
810 |
+
record = await result_set.single()
|
811 |
+
|
812 |
+
if record:
|
813 |
+
# Handle nodes (compatible with multi-label cases)
|
814 |
+
for node_info in record["node_info"]:
|
815 |
+
node = node_info["node"]
|
816 |
+
node_id = node.id
|
817 |
+
if node_id not in seen_nodes:
|
818 |
+
result.nodes.append(
|
819 |
+
KnowledgeGraphNode(
|
820 |
+
id=f"{node_id}",
|
821 |
+
labels=list(node.labels),
|
822 |
+
properties=dict(node),
|
823 |
+
)
|
824 |
)
|
825 |
+
seen_nodes.add(node_id)
|
826 |
+
|
827 |
+
# Handle relationships (including direction information)
|
828 |
+
for rel in record["relationships"]:
|
829 |
+
edge_id = rel.id
|
830 |
+
if edge_id not in seen_edges:
|
831 |
+
start = rel.start_node
|
832 |
+
end = rel.end_node
|
833 |
+
result.edges.append(
|
834 |
+
KnowledgeGraphEdge(
|
835 |
+
id=f"{edge_id}",
|
836 |
+
type=rel.type,
|
837 |
+
source=f"{start.id}",
|
838 |
+
target=f"{end.id}",
|
839 |
+
properties=dict(rel),
|
840 |
+
)
|
841 |
)
|
842 |
+
seen_edges.add(edge_id)
|
|
|
843 |
|
844 |
+
logger.info(
|
845 |
+
f"Process {os.getpid()} graph query return: {len(result.nodes)} nodes, {len(result.edges)} edges"
|
846 |
+
)
|
847 |
+
finally:
|
848 |
+
await result_set.consume() # Ensure result set is consumed
|
849 |
|
850 |
except neo4jExceptions.ClientError as e:
|
851 |
+
logger.warning(f"APOC plugin error: {str(e)}")
|
852 |
+
if label != "*":
|
853 |
+
logger.warning(
|
854 |
+
"Neo4j: falling back to basic Cypher recursive search..."
|
855 |
+
)
|
856 |
+
if inclusive:
|
857 |
+
logger.warning(
|
858 |
+
"Neo4j: inclusive search mode is not supported in recursive query, using exact matching"
|
859 |
+
)
|
860 |
+
return await self._robust_fallback(label, max_depth, min_degree)
|
861 |
|
862 |
return result
|
863 |
|
864 |
async def _robust_fallback(
|
865 |
+
self, label: str, max_depth: int, min_degree: int = 0
|
866 |
+
) -> KnowledgeGraph:
|
867 |
+
"""
|
868 |
+
Fallback implementation when APOC plugin is not available or incompatible.
|
869 |
+
This method implements the same functionality as get_knowledge_graph but uses
|
870 |
+
only basic Cypher queries and recursive traversal instead of APOC procedures.
|
871 |
+
"""
|
872 |
+
result = KnowledgeGraph()
|
873 |
visited_nodes = set()
|
874 |
visited_edges = set()
|
875 |
|
876 |
+
async def traverse(
|
877 |
+
node: KnowledgeGraphNode,
|
878 |
+
edge: Optional[KnowledgeGraphEdge],
|
879 |
+
current_depth: int,
|
880 |
+
):
|
881 |
+
# Check traversal limits
|
882 |
if current_depth > max_depth:
|
883 |
+
logger.debug(f"Reached max depth: {max_depth}")
|
884 |
return
|
885 |
+
if len(visited_nodes) >= MAX_GRAPH_NODES:
|
886 |
+
logger.debug(f"Reached max nodes limit: {MAX_GRAPH_NODES}")
|
|
|
|
|
887 |
return
|
888 |
|
889 |
+
# Check if node already visited
|
890 |
+
if node.id in visited_nodes:
|
891 |
return
|
|
|
892 |
|
893 |
+
# Get all edges and target nodes
|
894 |
+
async with self._driver.session(
|
895 |
+
database=self._DATABASE, default_access_mode="READ"
|
896 |
+
) as session:
|
897 |
+
query = """
|
898 |
+
MATCH (a)-[r]-(b)
|
899 |
+
WHERE id(a) = toInteger($node_id)
|
900 |
+
WITH r, b, id(r) as edge_id, id(b) as target_id
|
901 |
+
RETURN r, b, edge_id, target_id
|
902 |
+
"""
|
903 |
+
results = await session.run(query, {"node_id": node.id})
|
904 |
+
|
905 |
+
# Get all records and release database connection
|
906 |
+
records = await results.fetch(
|
907 |
+
1000
|
908 |
+
) # Max neighbour nodes we can handled
|
909 |
+
await results.consume() # Ensure results are consumed
|
910 |
+
|
911 |
+
# Nodes not connected to start node need to check degree
|
912 |
+
if current_depth > 1 and len(records) < min_degree:
|
913 |
+
return
|
914 |
+
|
915 |
+
# Add current node to result
|
916 |
+
result.nodes.append(node)
|
917 |
+
visited_nodes.add(node.id)
|
918 |
+
|
919 |
+
# Add edge to result if it exists and not already added
|
920 |
+
if edge and edge.id not in visited_edges:
|
921 |
+
result.edges.append(edge)
|
922 |
+
visited_edges.add(edge.id)
|
923 |
+
|
924 |
+
# Prepare nodes and edges for recursive processing
|
925 |
+
nodes_to_process = []
|
926 |
+
for record in records:
|
927 |
+
rel = record["r"]
|
928 |
+
edge_id = str(record["edge_id"])
|
929 |
+
if edge_id not in visited_edges:
|
930 |
+
b_node = record["b"]
|
931 |
+
target_id = str(record["target_id"])
|
932 |
+
|
933 |
+
if b_node.labels: # Only process if target node has labels
|
934 |
+
# Create KnowledgeGraphNode for target
|
935 |
+
target_node = KnowledgeGraphNode(
|
936 |
+
id=f"{target_id}",
|
937 |
+
labels=list(b_node.labels),
|
938 |
+
properties=dict(b_node),
|
939 |
+
)
|
940 |
+
|
941 |
+
# Create KnowledgeGraphEdge
|
942 |
+
target_edge = KnowledgeGraphEdge(
|
943 |
+
id=f"{edge_id}",
|
944 |
+
type=rel.type,
|
945 |
+
source=f"{node.id}",
|
946 |
+
target=f"{target_id}",
|
947 |
+
properties=dict(rel),
|
948 |
+
)
|
949 |
+
|
950 |
+
nodes_to_process.append((target_node, target_edge))
|
951 |
+
else:
|
952 |
+
logger.warning(
|
953 |
+
f"Skipping edge {edge_id} due to missing labels on target node"
|
954 |
+
)
|
955 |
+
|
956 |
+
# Process nodes after releasing database connection
|
957 |
+
for target_node, target_edge in nodes_to_process:
|
958 |
+
await traverse(target_node, target_edge, current_depth + 1)
|
959 |
|
960 |
+
# Get the starting node's data
|
961 |
+
async with self._driver.session(
|
962 |
+
database=self._DATABASE, default_access_mode="READ"
|
963 |
+
) as session:
|
964 |
query = f"""
|
965 |
+
MATCH (n:`{label}`)
|
966 |
+
RETURN id(n) as node_id, n
|
|
|
|
|
967 |
"""
|
968 |
+
node_result = await session.run(query)
|
969 |
+
try:
|
970 |
+
node_record = await node_result.single()
|
971 |
+
if not node_record:
|
972 |
+
return result
|
973 |
+
|
974 |
+
# Create initial KnowledgeGraphNode
|
975 |
+
start_node = KnowledgeGraphNode(
|
976 |
+
id=f"{node_record['node_id']}",
|
977 |
+
labels=list(node_record["n"].labels),
|
978 |
+
properties=dict(node_record["n"]),
|
979 |
+
)
|
980 |
+
finally:
|
981 |
+
await node_result.consume() # Ensure results are consumed
|
982 |
+
|
983 |
+
# Start traversal with the initial node
|
984 |
+
await traverse(start_node, None, 0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
985 |
|
|
|
986 |
return result
|
987 |
|
988 |
async def get_all_labels(self) -> list[str]:
|
|
|
991 |
Returns:
|
992 |
["Person", "Company", ...] # Alphabetically sorted label list
|
993 |
"""
|
994 |
+
async with self._driver.session(
|
995 |
+
database=self._DATABASE, default_access_mode="READ"
|
996 |
+
) as session:
|
997 |
# Method 1: Direct metadata query (Available for Neo4j 4.3+)
|
998 |
# query = "CALL db.labels() YIELD label RETURN label"
|
999 |
|
|
|
1005 |
RETURN DISTINCT label
|
1006 |
ORDER BY label
|
1007 |
"""
|
|
|
1008 |
result = await session.run(query)
|
1009 |
labels = []
|
1010 |
+
try:
|
1011 |
+
async for record in result:
|
1012 |
+
labels.append(record["label"])
|
1013 |
+
finally:
|
1014 |
+
await (
|
1015 |
+
result.consume()
|
1016 |
+
) # Ensure results are consumed even if processing fails
|
1017 |
return labels
|
1018 |
|
1019 |
@retry(
|
|
|
1034 |
Args:
|
1035 |
node_id: The label of the node to delete
|
1036 |
"""
|
1037 |
+
label = self._ensure_label(node_id)
|
1038 |
|
1039 |
async def _do_delete(tx: AsyncManagedTransaction):
|
1040 |
query = f"""
|
1041 |
MATCH (n:`{label}`)
|
1042 |
DETACH DELETE n
|
1043 |
"""
|
1044 |
+
result = await tx.run(query)
|
1045 |
logger.debug(f"Deleted node with label '{label}'")
|
1046 |
+
await result.consume() # Ensure result is fully consumed
|
1047 |
|
1048 |
try:
|
1049 |
async with self._driver.session(database=self._DATABASE) as session:
|
|
|
1092 |
edges: List of edges to be deleted, each edge is a (source, target) tuple
|
1093 |
"""
|
1094 |
for source, target in edges:
|
1095 |
+
source_label = self._ensure_label(source)
|
1096 |
+
target_label = self._ensure_label(target)
|
1097 |
|
1098 |
async def _do_delete_edge(tx: AsyncManagedTransaction):
|
1099 |
query = f"""
|
1100 |
+
MATCH (source:`{source_label}`)-[r]-(target:`{target_label}`)
|
1101 |
DELETE r
|
1102 |
"""
|
1103 |
+
result = await tx.run(query)
|
1104 |
logger.debug(f"Deleted edge from '{source_label}' to '{target_label}'")
|
1105 |
+
await result.consume() # Ensure result is fully consumed
|
1106 |
|
1107 |
try:
|
1108 |
async with self._driver.session(database=self._DATABASE) as session:
|
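
For review context, a minimal usage sketch of the new `min_degree` and `inclusive` parameters. This is illustrative only and not part of the diff; the `Neo4JStorage` class name follows the existing implementation in this module, and the label values and instance wiring are assumed.

    from lightrag.kg.neo4j_impl import Neo4JStorage

    async def preview_subgraph(storage: Neo4JStorage) -> None:
        # Exact label match; nodes more than one hop from a match must have degree >= 2.
        kg = await storage.get_knowledge_graph(
            node_label="Person", max_depth=3, min_degree=2, inclusive=False
        )
        # Substring label match (e.g. "Person", "PersonOfInterest") when inclusive=True.
        kg_fuzzy = await storage.get_knowledge_graph(
            node_label="Person", max_depth=2, inclusive=True
        )
        print(len(kg.nodes), len(kg.edges), len(kg_fuzzy.nodes))

Either way, the result is capped at MAX_GRAPH_NODES, with matching nodes and their direct neighbours kept ahead of higher-degree nodes further out.
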
lightrag/kg/shared_storage.py
CHANGED
@@ -7,12 +7,18 @@ from typing import Any, Dict, Optional, Union, TypeVar, Generic
 
 
 # Define a direct print function for critical logs that must be visible in all processes
-def direct_log(message, level="INFO"):
+def direct_log(message, level="INFO", enable_output: bool = True):
     """
     Log a message directly to stderr to ensure visibility in all processes,
     including the Gunicorn master process.
+
+    Args:
+        message: The message to log
+        level: Log level (default: "INFO")
+        enable_output: Whether to actually output the log (default: True)
     """
+    if enable_output:
+        print(f"{level}: {message}", file=sys.stderr, flush=True)
 
 
 T = TypeVar("T")
@@ -32,55 +38,165 @@ _update_flags: Optional[Dict[str, bool]] = None  # namespace -> updated
 _storage_lock: Optional[LockType] = None
 _internal_lock: Optional[LockType] = None
 _pipeline_status_lock: Optional[LockType] = None
+_graph_db_lock: Optional[LockType] = None
+_data_init_lock: Optional[LockType] = None
 
 
 class UnifiedLock(Generic[T]):
     """Provide a unified lock interface type for asyncio.Lock and multiprocessing.Lock"""
 
-    def __init__(
+    def __init__(
+        self,
+        lock: Union[ProcessLock, asyncio.Lock],
+        is_async: bool,
+        name: str = "unnamed",
+        enable_logging: bool = True,
+    ):
         self._lock = lock
         self._is_async = is_async
+        self._pid = os.getpid()  # for debug only
+        self._name = name  # for debug only
+        self._enable_logging = enable_logging  # for debug only
 
     async def __aenter__(self) -> "UnifiedLock[T]":
+        try:
+            direct_log(
+                f"== Lock == Process {self._pid}: Acquiring lock '{self._name}' (async={self._is_async})",
+                enable_output=self._enable_logging,
+            )
+            if self._is_async:
+                await self._lock.acquire()
+            else:
+                self._lock.acquire()
+            direct_log(
+                f"== Lock == Process {self._pid}: Lock '{self._name}' acquired (async={self._is_async})",
+                enable_output=self._enable_logging,
+            )
+            return self
+        except Exception as e:
+            direct_log(
+                f"== Lock == Process {self._pid}: Failed to acquire lock '{self._name}': {e}",
+                level="ERROR",
+                enable_output=self._enable_logging,
+            )
+            raise
 
     async def __aexit__(self, exc_type, exc_val, exc_tb):
+        try:
+            direct_log(
+                f"== Lock == Process {self._pid}: Releasing lock '{self._name}' (async={self._is_async})",
+                enable_output=self._enable_logging,
+            )
+            if self._is_async:
+                self._lock.release()
+            else:
+                self._lock.release()
+            direct_log(
+                f"== Lock == Process {self._pid}: Lock '{self._name}' released (async={self._is_async})",
+                enable_output=self._enable_logging,
+            )
+        except Exception as e:
+            direct_log(
+                f"== Lock == Process {self._pid}: Failed to release lock '{self._name}': {e}",
+                level="ERROR",
+                enable_output=self._enable_logging,
+            )
+            raise
 
     def __enter__(self) -> "UnifiedLock[T]":
         """For backward compatibility"""
+        try:
+            if self._is_async:
+                raise RuntimeError("Use 'async with' for shared_storage lock")
+            direct_log(
+                f"== Lock == Process {self._pid}: Acquiring lock '{self._name}' (sync)",
+                enable_output=self._enable_logging,
+            )
+            self._lock.acquire()
+            direct_log(
+                f"== Lock == Process {self._pid}: Lock '{self._name}' acquired (sync)",
+                enable_output=self._enable_logging,
+            )
+            return self
+        except Exception as e:
+            direct_log(
+                f"== Lock == Process {self._pid}: Failed to acquire lock '{self._name}' (sync): {e}",
+                level="ERROR",
+                enable_output=self._enable_logging,
+            )
+            raise
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """For backward compatibility"""
+        try:
+            if self._is_async:
+                raise RuntimeError("Use 'async with' for shared_storage lock")
+            direct_log(
+                f"== Lock == Process {self._pid}: Releasing lock '{self._name}' (sync)",
+                enable_output=self._enable_logging,
+            )
+            self._lock.release()
+            direct_log(
+                f"== Lock == Process {self._pid}: Lock '{self._name}' released (sync)",
+                enable_output=self._enable_logging,
+            )
+        except Exception as e:
+            direct_log(
+                f"== Lock == Process {self._pid}: Failed to release lock '{self._name}' (sync): {e}",
+                level="ERROR",
+                enable_output=self._enable_logging,
+            )
+            raise
 
 
-def get_internal_lock() -> UnifiedLock:
+def get_internal_lock(enable_logging: bool = False) -> UnifiedLock:
     """return unified storage lock for data consistency"""
-    return UnifiedLock(
+    return UnifiedLock(
+        lock=_internal_lock,
+        is_async=not is_multiprocess,
+        name="internal_lock",
+        enable_logging=enable_logging,
+    )
 
 
-def get_storage_lock() -> UnifiedLock:
+def get_storage_lock(enable_logging: bool = False) -> UnifiedLock:
     """return unified storage lock for data consistency"""
-    return UnifiedLock(
+    return UnifiedLock(
+        lock=_storage_lock,
+        is_async=not is_multiprocess,
+        name="storage_lock",
+        enable_logging=enable_logging,
+    )
 
 
-def get_pipeline_status_lock() -> UnifiedLock:
+def get_pipeline_status_lock(enable_logging: bool = False) -> UnifiedLock:
     """return unified storage lock for data consistency"""
-    return UnifiedLock(
+    return UnifiedLock(
+        lock=_pipeline_status_lock,
+        is_async=not is_multiprocess,
+        name="pipeline_status_lock",
+        enable_logging=enable_logging,
+    )
+
+
+def get_graph_db_lock(enable_logging: bool = False) -> UnifiedLock:
+    """return unified graph database lock for ensuring atomic operations"""
+    return UnifiedLock(
+        lock=_graph_db_lock,
+        is_async=not is_multiprocess,
+        name="graph_db_lock",
+        enable_logging=enable_logging,
+    )
+
+
+def get_data_init_lock(enable_logging: bool = False) -> UnifiedLock:
+    """return unified data initialization lock for ensuring atomic data initialization"""
+    return UnifiedLock(
+        lock=_data_init_lock,
+        is_async=not is_multiprocess,
+        name="data_init_lock",
+        enable_logging=enable_logging,
+    )
 
 
 def initialize_share_data(workers: int = 1):
@@ -108,6 +224,8 @@ def initialize_share_data(workers: int = 1):
         _storage_lock, \
         _internal_lock, \
         _pipeline_status_lock, \
+        _graph_db_lock, \
+        _data_init_lock, \
         _shared_dicts, \
         _init_flags, \
         _initialized, \
@@ -120,14 +238,16 @@
         )
         return
 
-    _manager = Manager()
     _workers = workers
 
     if workers > 1:
         is_multiprocess = True
+        _manager = Manager()
         _internal_lock = _manager.Lock()
         _storage_lock = _manager.Lock()
         _pipeline_status_lock = _manager.Lock()
+        _graph_db_lock = _manager.Lock()
+        _data_init_lock = _manager.Lock()
         _shared_dicts = _manager.dict()
         _init_flags = _manager.dict()
         _update_flags = _manager.dict()
@@ -139,6 +259,8 @@
         _internal_lock = asyncio.Lock()
         _storage_lock = asyncio.Lock()
         _pipeline_status_lock = asyncio.Lock()
+        _graph_db_lock = asyncio.Lock()
+        _data_init_lock = asyncio.Lock()
         _shared_dicts = {}
         _init_flags = {}
         _update_flags = {}
@@ -164,6 +286,7 @@ async def initialize_pipeline_status():
         history_messages = _manager.list() if is_multiprocess else []
         pipeline_namespace.update(
             {
+                "autoscanned": False,  # Auto-scan started
                 "busy": False,  # Control concurrent processes
                 "job_name": "Default Job",  # Current job name (indexing files/indexing texts)
                 "job_start": None,  # Job start time
@@ -200,7 +323,12 @@ async def get_update_flag(namespace: str):
         if is_multiprocess and _manager is not None:
             new_update_flag = _manager.Value("b", False)
         else:
+            # Create a simple mutable object to store boolean value for compatibility with mutiprocess
+            class MutableBoolean:
+                def __init__(self, initial_value=False):
+                    self.value = initial_value
+
+            new_update_flag = MutableBoolean(False)
 
         _update_flags[namespace].append(new_update_flag)
         return new_update_flag
@@ -220,7 +348,26 @@ async def set_all_update_flags(namespace: str):
             if is_multiprocess:
                 _update_flags[namespace][i].value = True
             else:
+                # Use .value attribute instead of direct assignment
+                _update_flags[namespace][i].value = True
+
+
+async def clear_all_update_flags(namespace: str):
+    """Clear all update flag of namespace indicating all workers need to reload data from files"""
+    global _update_flags
+    if _update_flags is None:
+        raise ValueError("Try to create namespace before Shared-Data is initialized")
+
+    async with get_internal_lock():
+        if namespace not in _update_flags:
+            raise ValueError(f"Namespace {namespace} not found in update flags")
+        # Update flags for both modes
+        for i in range(len(_update_flags[namespace])):
+            if is_multiprocess:
+                _update_flags[namespace][i].value = False
+            else:
+                # Use .value attribute instead of direct assignment
+                _update_flags[namespace][i].value = False
 
 
 async def get_all_update_flags_status() -> Dict[str, list]:
@@ -247,7 +394,7 @@ async def get_all_update_flags_status() -> Dict[str, list]:
     return result
 
 
-def try_initialize_namespace(namespace: str) -> bool:
+async def try_initialize_namespace(namespace: str) -> bool:
     """
     Returns True if the current worker(process) gets initialization permission for loading data later.
     The worker does not get the permission is prohibited to load data from files.
@@ -257,15 +404,17 @@
     if _init_flags is None:
         raise ValueError("Try to create nanmespace before Shared-Data is initialized")
 
-    direct_log(
-        f"Process {os.getpid()} storage namespace already initialized: [{namespace}]"
-    )
+    async with get_internal_lock():
+        if namespace not in _init_flags:
+            _init_flags[namespace] = True
+            direct_log(
+                f"Process {os.getpid()} ready to initialize storage namespace: [{namespace}]"
+            )
+            return True
+        direct_log(
+            f"Process {os.getpid()} storage namespace already initialized: [{namespace}]"
+        )
+
     return False
 
@@ -304,6 +453,8 @@ def finalize_share_data():
         _storage_lock, \
         _internal_lock, \
         _pipeline_status_lock, \
+        _graph_db_lock, \
+        _data_init_lock, \
         _shared_dicts, \
         _init_flags, \
         _initialized, \
@@ -369,6 +520,8 @@ def finalize_share_data():
     _storage_lock = None
     _internal_lock = None
     _pipeline_status_lock = None
+    _graph_db_lock = None
+    _data_init_lock = None
     _update_flags = None
 
     direct_log(f"Process {os.getpid()} storage data finalization complete")
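
A minimal sketch of how the two new lock getters are meant to be used, based on this diff. The callback names are placeholders, and it is assumed that initialize_share_data() has already run in the current process (otherwise the module-level lock objects are still None):

    from lightrag.kg.shared_storage import (
        get_data_init_lock,
        get_graph_db_lock,
    )

    async def guarded_graph_update(do_update) -> None:
        # Each call returns a fresh UnifiedLock wrapper, but it wraps the same
        # module-level lock, so writers are serialized; with a multiprocessing
        # Manager lock this holds across Gunicorn workers as well.
        async with get_graph_db_lock(enable_logging=False):
            await do_update()

    async def guarded_data_init(load_data) -> None:
        # Same pattern for one-time data loading at startup.
        async with get_data_init_lock():
            await load_data()
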
lightrag/lightrag.py
CHANGED
@@ -354,6 +354,9 @@ class LightRAG:
             namespace=make_namespace(
                 self.namespace_prefix, NameSpace.KV_STORE_LLM_RESPONSE_CACHE
             ),
+            global_config=asdict(
+                self
+            ),  # Add global_config to ensure cache works properly
             embedding_func=self.embedding_func,
         )
 
@@ -404,18 +407,8 @@
             embedding_func=None,
         )
 
-        ):
-            hashing_kv = self.llm_response_cache
-        else:
-            hashing_kv = self.key_string_value_json_storage_cls(  # type: ignore
-                namespace=make_namespace(
-                    self.namespace_prefix, NameSpace.KV_STORE_LLM_RESPONSE_CACHE
-                ),
-                global_config=asdict(self),
-                embedding_func=self.embedding_func,
-            )
+        # Directly use llm_response_cache, don't create a new object
+        hashing_kv = self.llm_response_cache
 
         self.llm_model_func = limit_async_func_call(self.llm_model_max_async)(
             partial(
@@ -590,6 +583,7 @@
             split_by_character, split_by_character_only
         )
 
+    # TODO: deprecated, use insert instead
     def insert_custom_chunks(
         self,
         full_text: str,
@@ -601,6 +595,7 @@
             self.ainsert_custom_chunks(full_text, text_chunks, doc_id)
         )
 
+    # TODO: deprecated, use ainsert instead
     async def ainsert_custom_chunks(
         self, full_text: str, text_chunks: list[str], doc_id: str | None = None
     ) -> None:
@@ -892,7 +887,9 @@
                 self.chunks_vdb.upsert(chunks)
             )
             entity_relation_task = asyncio.create_task(
-                self._process_entity_relation_graph(
+                self._process_entity_relation_graph(
+                    chunks, pipeline_status, pipeline_status_lock
+                )
             )
             full_docs_task = asyncio.create_task(
                 self.full_docs.upsert(
@@ -1007,21 +1004,27 @@
             pipeline_status["latest_message"] = log_message
             pipeline_status["history_messages"].append(log_message)
 
-    async def _process_entity_relation_graph(
+    async def _process_entity_relation_graph(
+        self, chunk: dict[str, Any], pipeline_status=None, pipeline_status_lock=None
+    ) -> None:
         try:
             await extract_entities(
                 chunk,
                 knowledge_graph_inst=self.chunk_entity_relation_graph,
                 entity_vdb=self.entities_vdb,
                 relationships_vdb=self.relationships_vdb,
-                llm_response_cache=self.llm_response_cache,
                 global_config=asdict(self),
+                pipeline_status=pipeline_status,
+                pipeline_status_lock=pipeline_status_lock,
+                llm_response_cache=self.llm_response_cache,
             )
         except Exception as e:
             logger.error("Failed to extract entities and relationships")
             raise e
 
-    async def _insert_done(
+    async def _insert_done(
+        self, pipeline_status=None, pipeline_status_lock=None
+    ) -> None:
         tasks = [
             cast(StorageNameSpace, storage_inst).index_done_callback()
             for storage_inst in [  # type: ignore
@@ -1040,12 +1043,10 @@
         log_message = "All Insert done"
         logger.info(log_message)
 
-        pipeline_status["latest_message"] = log_message
-        pipeline_status["history_messages"].append(log_message)
+        if pipeline_status is not None and pipeline_status_lock is not None:
+            async with pipeline_status_lock:
+                pipeline_status["latest_message"] = log_message
+                pipeline_status["history_messages"].append(log_message)
 
     def insert_custom_kg(
         self, custom_kg: dict[str, Any], full_doc_id: str = None
@@ -1260,16 +1261,7 @@
                 self.text_chunks,
                 param,
                 asdict(self),
-                hashing_kv=self.llm_response_cache
-                if self.llm_response_cache
-                and hasattr(self.llm_response_cache, "global_config")
-                else self.key_string_value_json_storage_cls(
-                    namespace=make_namespace(
-                        self.namespace_prefix, NameSpace.KV_STORE_LLM_RESPONSE_CACHE
-                    ),
-                    global_config=asdict(self),
-                    embedding_func=self.embedding_func,
-                ),
+                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
                 system_prompt=system_prompt,
             )
         elif param.mode == "naive":
@@ -1279,16 +1271,7 @@
                 self.text_chunks,
                 param,
                 asdict(self),
-                hashing_kv=self.llm_response_cache
-                if self.llm_response_cache
-                and hasattr(self.llm_response_cache, "global_config")
-                else self.key_string_value_json_storage_cls(
-                    namespace=make_namespace(
-                        self.namespace_prefix, NameSpace.KV_STORE_LLM_RESPONSE_CACHE
-                    ),
-                    global_config=asdict(self),
-                    embedding_func=self.embedding_func,
-                ),
+                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
                 system_prompt=system_prompt,
            )
         elif param.mode == "mix":
@@ -1301,16 +1284,7 @@
                 self.text_chunks,
                 param,
                 asdict(self),
-                hashing_kv=self.llm_response_cache
-                if self.llm_response_cache
-                and hasattr(self.llm_response_cache, "global_config")
-                else self.key_string_value_json_storage_cls(
-                    namespace=make_namespace(
-                        self.namespace_prefix, NameSpace.KV_STORE_LLM_RESPONSE_CACHE
-                    ),
-                    global_config=asdict(self),
-                    embedding_func=self.embedding_func,
-                ),
+                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
                 system_prompt=system_prompt,
             )
         else:
@@ -1344,14 +1318,7 @@
             text=query,
             param=param,
             global_config=asdict(self),
-            hashing_kv=self.llm_response_cache
-            or self.key_string_value_json_storage_cls(
-                namespace=make_namespace(
-                    self.namespace_prefix, NameSpace.KV_STORE_LLM_RESPONSE_CACHE
-                ),
-                global_config=asdict(self),
-                embedding_func=self.embedding_func,
-            ),
+            hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
         )
 
         param.hl_keywords = hl_keywords
@@ -1375,16 +1342,7 @@
                 self.text_chunks,
                 param,
                 asdict(self),
-                hashing_kv=self.llm_response_cache
-                if self.llm_response_cache
-                and hasattr(self.llm_response_cache, "global_config")
-                else self.key_string_value_json_storage_cls(
-                    namespace=make_namespace(
-                        self.namespace_prefix, NameSpace.KV_STORE_LLM_RESPONSE_CACHE
-                    ),
-                    global_config=asdict(self),
-                    embedding_func=self.embedding_func,
-                ),
+                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
             )
         elif param.mode == "naive":
             response = await naive_query(
@@ -1393,16 +1351,7 @@
                 self.text_chunks,
                 param,
                 asdict(self),
-                hashing_kv=self.llm_response_cache
-                if self.llm_response_cache
-                and hasattr(self.llm_response_cache, "global_config")
-                else self.key_string_value_json_storage_cls(
-                    namespace=make_namespace(
-                        self.namespace_prefix, NameSpace.KV_STORE_LLM_RESPONSE_CACHE
-                    ),
-                    global_config=asdict(self),
-                    embedding_func=self.embedding_func,
-                ),
+                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
             )
         elif param.mode == "mix":
             response = await mix_kg_vector_query(
@@ -1414,16 +1363,7 @@
                 self.text_chunks,
                 param,
                 asdict(self),
-                hashing_kv=self.llm_response_cache
-                if self.llm_response_cache
-                and hasattr(self.llm_response_cache, "global_config")
-                else self.key_string_value_json_storage_cls(
-                    namespace=make_namespace(
-                        self.namespace_prefix, NameSpace.KV_STORE_LLM_RESPONSE_CACHE
-                    ),
-                    global_config=asdict(self),
-                    embedding_func=self.embedding_func,
-                ),
+                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
             )
         else:
             raise ValueError(f"Unknown mode {param.mode}")
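
A sketch of the new explicit pipeline-status plumbing introduced above. This is a hypothetical caller, shown only to illustrate how the two new arguments travel together; `_process_entity_relation_graph` is an internal method and the wrapper function here is not part of the diff:

    from lightrag.kg.shared_storage import (
        get_namespace_data,
        get_pipeline_status_lock,
    )

    async def extract_with_status(rag, chunks: dict) -> str:
        # Fetch the shared status dict and its matching lock once, then pass
        # both down; the lock is only taken when a status message is written.
        pipeline_status = await get_namespace_data("pipeline_status")
        pipeline_status_lock = get_pipeline_status_lock()
        await rag._process_entity_relation_graph(
            chunks, pipeline_status, pipeline_status_lock
        )
        async with pipeline_status_lock:
            return pipeline_status["latest_message"]
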
lightrag/operate.py
CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
 import asyncio
 import json
 import re
+import os
 from typing import Any, AsyncIterator
 from collections import Counter, defaultdict
 
@@ -220,6 +221,7 @@ async def _merge_nodes_then_upsert(
         entity_name, description, global_config
     )
     node_data = dict(
+        entity_id=entity_name,
         entity_type=entity_type,
         description=description,
         source_id=source_id,
@@ -301,6 +303,7 @@ async def _merge_edges_then_upsert(
             await knowledge_graph_inst.upsert_node(
                 need_insert_id,
                 node_data={
+                    "entity_id": need_insert_id,
                     "source_id": source_id,
                     "description": description,
                     "entity_type": "UNKNOWN",
@@ -337,11 +340,10 @@ async def extract_entities(
     entity_vdb: BaseVectorStorage,
     relationships_vdb: BaseVectorStorage,
     global_config: dict[str, str],
+    pipeline_status: dict = None,
+    pipeline_status_lock=None,
     llm_response_cache: BaseKVStorage | None = None,
 ) -> None:
-    from lightrag.kg.shared_storage import get_namespace_data
-
-    pipeline_status = await get_namespace_data("pipeline_status")
     use_llm_func: callable = global_config["llm_model_func"]
     entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"]
     enable_llm_cache_for_entity_extract: bool = global_config[
@@ -400,6 +402,7 @@
         else:
             _prompt = input_text
 
+        # TODO: add cache_type="extract"
         arg_hash = compute_args_hash(_prompt)
         cached_return, _1, _2, _3 = await handle_cache(
             llm_response_cache,
@@ -407,7 +410,6 @@
             _prompt,
             "default",
             cache_type="extract",
-            force_llm_cache=True,
         )
         if cached_return:
             logger.debug(f"Found cache for {arg_hash}")
@@ -504,8 +506,10 @@
         relations_count = len(maybe_edges)
         log_message = f" Chunk {processed_chunks}/{total_chunks}: extracted {entities_count} entities and {relations_count} relationships (deduplicated)"
         logger.info(log_message)
+        if pipeline_status is not None:
+            async with pipeline_status_lock:
+                pipeline_status["latest_message"] = log_message
+                pipeline_status["history_messages"].append(log_message)
         return dict(maybe_nodes), dict(maybe_edges)
 
     tasks = [_process_single_content(c) for c in ordered_chunks]
@@ -519,42 +523,58 @@
     for k, v in m_edges.items():
         maybe_edges[tuple(sorted(k))].extend(v)
 
-    all_entities_data = await asyncio.gather(
-        *[
-            _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
-            for k, v in maybe_nodes.items()
-        ]
-    )
+    from .kg.shared_storage import get_graph_db_lock
+
+    graph_db_lock = get_graph_db_lock(enable_logging=False)
+
+    # Ensure that nodes and edges are merged and upserted atomically
+    async with graph_db_lock:
+        all_entities_data = await asyncio.gather(
+            *[
+                _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
+                for k, v in maybe_nodes.items()
+            ]
+        )
+
+        all_relationships_data = await asyncio.gather(
+            *[
+                _merge_edges_then_upsert(
+                    k[0], k[1], v, knowledge_graph_inst, global_config
+                )
+                for k, v in maybe_edges.items()
+            ]
+        )
 
     if not (all_entities_data or all_relationships_data):
         log_message = "Didn't extract any entities and relationships."
         logger.info(log_message)
+        if pipeline_status is not None:
+            async with pipeline_status_lock:
+                pipeline_status["latest_message"] = log_message
+                pipeline_status["history_messages"].append(log_message)
         return
 
     if not all_entities_data:
         log_message = "Didn't extract any entities"
         logger.info(log_message)
+        if pipeline_status is not None:
+            async with pipeline_status_lock:
+                pipeline_status["latest_message"] = log_message
+                pipeline_status["history_messages"].append(log_message)
     if not all_relationships_data:
         log_message = "Didn't extract any relationships"
         logger.info(log_message)
+        if pipeline_status is not None:
+            async with pipeline_status_lock:
+                pipeline_status["latest_message"] = log_message
+                pipeline_status["history_messages"].append(log_message)
 
     log_message = f"Extracted {len(all_entities_data)} entities and {len(all_relationships_data)} relationships (deduplicated)"
     logger.info(log_message)
+    if pipeline_status is not None:
+        async with pipeline_status_lock:
+            pipeline_status["latest_message"] = log_message
+            pipeline_status["history_messages"].append(log_message)
     verbose_debug(
         f"New entities:{all_entities_data}, relationships:{all_relationships_data}"
     )
@@ -1017,6 +1037,7 @@ async def _build_query_context(
     text_chunks_db: BaseKVStorage,
     query_param: QueryParam,
 ):
+    logger.info(f"Process {os.getpid()} buidling query context...")
     if query_param.mode == "local":
         entities_context, relations_context, text_units_context = await _get_node_data(
             ll_keywords,
CHANGED
@@ -633,15 +633,15 @@ async def handle_cache(
|
|
633 |
prompt,
|
634 |
mode="default",
|
635 |
cache_type=None,
|
636 |
-
force_llm_cache=False,
|
637 |
):
|
638 |
"""Generic cache handling function"""
|
639 |
-
if hashing_kv is None
|
640 |
-
force_llm_cache or hashing_kv.global_config.get("enable_llm_cache")
|
641 |
-
):
|
642 |
return None, None, None, None
|
643 |
|
644 |
-
if mode != "default":
|
|
|
|
|
|
|
645 |
# Get embedding cache configuration
|
646 |
embedding_cache_config = hashing_kv.global_config.get(
|
647 |
"embedding_cache_config",
|
@@ -651,8 +651,7 @@ async def handle_cache(
|
|
651 |
use_llm_check = embedding_cache_config.get("use_llm_check", False)
|
652 |
|
653 |
quantized = min_val = max_val = None
|
654 |
-
if is_embedding_cache_enabled:
|
655 |
-
# Use embedding cache
|
656 |
current_embedding = await hashing_kv.embedding_func([prompt])
|
657 |
llm_model_func = hashing_kv.global_config.get("llm_model_func")
|
658 |
quantized, min_val, max_val = quantize_embedding(current_embedding[0])
|
@@ -667,24 +666,29 @@ async def handle_cache(
|
|
667 |
cache_type=cache_type,
|
668 |
)
|
669 |
if best_cached_response is not None:
|
670 |
-
logger.
|
671 |
return best_cached_response, None, None, None
|
672 |
else:
|
673 |
# if caching keyword embedding is enabled, return the quantized embedding for saving it latter
|
674 |
-
logger.
|
675 |
return None, quantized, min_val, max_val
|
676 |
|
677 |
-
#
|
678 |
-
|
|
|
|
|
|
|
|
|
|
|
679 |
if exists_func(hashing_kv, "get_by_mode_and_id"):
|
680 |
mode_cache = await hashing_kv.get_by_mode_and_id(mode, args_hash) or {}
|
681 |
else:
|
682 |
mode_cache = await hashing_kv.get_by_id(mode) or {}
|
683 |
if args_hash in mode_cache:
|
684 |
-
logger.
|
685 |
return mode_cache[args_hash]["return"], None, None, None
|
686 |
|
687 |
-
logger.
|
688 |
return None, None, None, None
|
689 |
|
690 |
|
@@ -701,9 +705,22 @@ class CacheData:
|
|
701 |
|
702 |
|
703 |
async def save_to_cache(hashing_kv, cache_data: CacheData):
|
704 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
705 |
return
|
706 |
|
|
|
|
|
|
|
|
|
|
|
|
|
707 |
if exists_func(hashing_kv, "get_by_mode_and_id"):
|
708 |
mode_cache = (
|
709 |
await hashing_kv.get_by_mode_and_id(cache_data.mode, cache_data.args_hash)
|
@@ -712,6 +729,16 @@ async def save_to_cache(hashing_kv, cache_data: CacheData):
|
|
712 |
else:
|
713 |
mode_cache = await hashing_kv.get_by_id(cache_data.mode) or {}
|
714 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
715 |
mode_cache[cache_data.args_hash] = {
|
716 |
"return": cache_data.content,
|
717 |
"cache_type": cache_data.cache_type,
|
@@ -726,6 +753,7 @@ async def save_to_cache(hashing_kv, cache_data: CacheData):
|
|
726 |
"original_prompt": cache_data.prompt,
|
727 |
}
|
728 |
|
|
|
729 |
await hashing_kv.upsert({cache_data.mode: mode_cache})
|
730 |
|
731 |
|
|
|
633 |
prompt,
|
634 |
mode="default",
|
635 |
cache_type=None,
|
|
|
636 |
):
|
637 |
"""Generic cache handling function"""
|
638 |
+
if hashing_kv is None:
|
|
|
|
|
639 |
return None, None, None, None
|
640 |
|
641 |
+
if mode != "default": # handle cache for all type of query
|
642 |
+
if not hashing_kv.global_config.get("enable_llm_cache"):
|
643 |
+
return None, None, None, None
|
644 |
+
|
645 |
# Get embedding cache configuration
|
646 |
embedding_cache_config = hashing_kv.global_config.get(
|
647 |
"embedding_cache_config",
|
|
|
651 |
use_llm_check = embedding_cache_config.get("use_llm_check", False)
|
652 |
|
653 |
quantized = min_val = max_val = None
|
654 |
+
if is_embedding_cache_enabled: # Use embedding simularity to match cache
|
|
|
655 |
current_embedding = await hashing_kv.embedding_func([prompt])
|
656 |
llm_model_func = hashing_kv.global_config.get("llm_model_func")
|
657 |
quantized, min_val, max_val = quantize_embedding(current_embedding[0])
|
|
|
666 |
cache_type=cache_type,
|
667 |
)
|
668 |
if best_cached_response is not None:
|
669 |
+
logger.debug(f"Embedding cached hit(mode:{mode} type:{cache_type})")
|
670 |
return best_cached_response, None, None, None
|
671 |
else:
|
672 |
# if caching keyword embedding is enabled, return the quantized embedding for saving it latter
|
673 |
+
logger.debug(f"Embedding cached missed(mode:{mode} type:{cache_type})")
|
674 |
return None, quantized, min_val, max_val
|
675 |
|
676 |
+
else: # handle cache for entity extraction
|
677 |
+
if not hashing_kv.global_config.get("enable_llm_cache_for_entity_extract"):
|
678 |
+
return None, None, None, None
|
679 |
+
|
680 |
+
# Here is the conditions of code reaching this point:
|
681 |
+
# 1. All query mode: enable_llm_cache is True and embedding simularity is not enabled
|
682 |
+
# 2. Entity extract: enable_llm_cache_for_entity_extract is True
|
683 |
if exists_func(hashing_kv, "get_by_mode_and_id"):
|
684 |
mode_cache = await hashing_kv.get_by_mode_and_id(mode, args_hash) or {}
|
685 |
else:
|
686 |
mode_cache = await hashing_kv.get_by_id(mode) or {}
|
687 |
if args_hash in mode_cache:
|
688 |
+
logger.debug(f"Non-embedding cached hit(mode:{mode} type:{cache_type})")
|
689 |
return mode_cache[args_hash]["return"], None, None, None
|
690 |
|
691 |
+
logger.debug(f"Non-embedding cached missed(mode:{mode} type:{cache_type})")
|
692 |
return None, None, None, None
|
693 |
|
694 |
|
|
|
705 |
|
706 |
|
707 |
async def save_to_cache(hashing_kv, cache_data: CacheData):
|
708 |
+
"""Save data to cache, with improved handling for streaming responses and duplicate content.
|
709 |
+
|
710 |
+
Args:
|
711 |
+
hashing_kv: The key-value storage for caching
|
712 |
+
cache_data: The cache data to save
|
713 |
+
"""
|
714 |
+
# Skip if storage is None or content is a streaming response
|
715 |
+
if hashing_kv is None or not cache_data.content:
|
716 |
return
|
717 |
|
718 |
+
# If content is a streaming response, don't cache it
|
719 |
+
if hasattr(cache_data.content, "__aiter__"):
|
720 |
+
logger.debug("Streaming response detected, skipping cache")
|
721 |
+
return
|
722 |
+
|
723 |
+
# Get existing cache data
|
724 |
if exists_func(hashing_kv, "get_by_mode_and_id"):
|
725 |
mode_cache = (
|
726 |
await hashing_kv.get_by_mode_and_id(cache_data.mode, cache_data.args_hash)
|
|
|
729 |
else:
|
730 |
mode_cache = await hashing_kv.get_by_id(cache_data.mode) or {}
|
731 |
|
732 |
+
# Check if we already have identical content cached
|
733 |
+
if cache_data.args_hash in mode_cache:
|
734 |
+
existing_content = mode_cache[cache_data.args_hash].get("return")
|
735 |
+
if existing_content == cache_data.content:
|
736 |
+
logger.info(
|
737 |
+
f"Cache content unchanged for {cache_data.args_hash}, skipping update"
|
738 |
+
)
|
739 |
+
return
|
740 |
+
|
741 |
+
# Update cache with new content
|
742 |
mode_cache[cache_data.args_hash] = {
|
743 |
"return": cache_data.content,
|
744 |
"cache_type": cache_data.cache_type,
|
|
|
753 |
"original_prompt": cache_data.prompt,
|
754 |
}
|
755 |
|
756 |
+
# Only upsert if there's actual new content
|
757 |
await hashing_kv.upsert({cache_data.mode: mode_cache})
|
758 |
|
759 |
|
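
For context, the intended pairing of `handle_cache` and `save_to_cache` around an LLM call roughly looks like the sketch below. It is illustrative only: `llm_func` and `prompt` are placeholders, and the `CacheData` field names are assumed from how this module populates the cache entry:

    from lightrag.utils import (
        CacheData,
        compute_args_hash,
        handle_cache,
        save_to_cache,
    )

    async def cached_llm_call(hashing_kv, llm_func, prompt: str) -> str:
        # Look up the cache first; handle_cache decides internally whether
        # enable_llm_cache / enable_llm_cache_for_entity_extract allow caching.
        args_hash = compute_args_hash(prompt)
        cached, quantized, min_val, max_val = await handle_cache(
            hashing_kv, args_hash, prompt, "default", cache_type="extract"
        )
        if cached is not None:
            return cached

        content = await llm_func(prompt)
        await save_to_cache(
            hashing_kv,
            CacheData(
                args_hash=args_hash,
                content=content,
                prompt=prompt,
                quantized=quantized,
                min_val=min_val,
                max_val=max_val,
                mode="default",
                cache_type="extract",
            ),
        )
        return content
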