choizhang commited on
Commit
48b5396
·
2 Parent(s): 3659583 0a1b5ba

Merge branch 'main' into edit-node

Browse files
lightrag/api/README-zh.md CHANGED
@@ -102,6 +102,10 @@ lightrag-gunicorn --workers 4
102
  - `--log-level`:日志级别(默认:INFO)
103
  - --input-dir:指定要扫描文档的目录(默认:./input)
104
 
 
 
 
 
105
  ### 启动时自动扫描
106
 
107
  当使用 `--auto-scan-at-startup` 参数启动任何服务器时,系统将自动:
 
102
  - `--log-level`:日志级别(默认:INFO)
103
  - --input-dir:指定要扫描文档的目录(默认:./input)
104
 
105
+ > **要求将.env文件置于启动目录中是经过特意设计的**。 这样做的目的是支持用户同时启动多个LightRAG实例,并为不同实例配置不同的.env文件。
106
+
107
+ > **修改.env文件后,您需要重新打开终端以使新设置生效**。 这是因为每次启动时,LightRAG Server会将.env文件中的环境变量加载至系统环境变量,且系统环境变量的设置具有更高优先级。
108
+
109
  ### 启动时自动扫描
110
 
111
  当使用 `--auto-scan-at-startup` 参数启动任何服务器时,系统将自动:
lightrag/api/README.md CHANGED
@@ -106,6 +106,8 @@ Here are some commonly used startup parameters:
106
 
107
  > The requirement for the .env file to be in the startup directory is intentionally designed this way. The purpose is to support users in launching multiple LightRAG instances simultaneously, allowing different .env files for different instances.
108
 
 
 
109
  ### Auto scan on startup
110
 
111
  When starting any of the servers with the `--auto-scan-at-startup` parameter, the system will automatically:
 
106
 
107
  > The requirement for the .env file to be in the startup directory is intentionally designed this way. The purpose is to support users in launching multiple LightRAG instances simultaneously, allowing different .env files for different instances.
108
 
109
+ > **After changing the .env file, you need to open a new terminal to make the new settings take effect.** This is because the LightRAG Server will load the environment variables from .env into the system environment variables each time it starts, and LightRAG Server will prioritize the settings in the system environment variables.
110
+
111
  ### Auto scan on startup
112
 
113
  When starting any of the servers with the `--auto-scan-at-startup` parameter, the system will automatically:
lightrag/api/__init__.py CHANGED
@@ -1 +1 @@
1
- __api_version__ = "0146"
 
1
+ __api_version__ = "0148"
lightrag/api/routers/document_routes.py CHANGED
@@ -499,7 +499,10 @@ async def pipeline_enqueue_file(rag: LightRAG, file_path: Path) -> bool:
499
  content = result.document.export_to_markdown()
500
  else:
501
  if not pm.is_installed("python-docx"): # type: ignore
502
- pm.install("docx")
 
 
 
503
  from docx import Document # type: ignore
504
  from io import BytesIO
505
 
 
499
  content = result.document.export_to_markdown()
500
  else:
501
  if not pm.is_installed("python-docx"): # type: ignore
502
+ try:
503
+ pm.install("python-docx")
504
+ except Exception:
505
+ pm.install("docx")
506
  from docx import Document # type: ignore
507
  from io import BytesIO
508
 
lightrag/api/routers/ollama_api.py CHANGED
@@ -308,7 +308,7 @@ class OllamaAPI:
308
  "Cache-Control": "no-cache",
309
  "Connection": "keep-alive",
310
  "Content-Type": "application/x-ndjson",
311
- "X-Accel-Buffering": "no", # 确保在Nginx代理时正确处理流式响应
312
  },
313
  )
314
  else:
 
308
  "Cache-Control": "no-cache",
309
  "Connection": "keep-alive",
310
  "Content-Type": "application/x-ndjson",
311
+ "X-Accel-Buffering": "no", # Ensure proper handling of streaming responses in Nginx proxy
312
  },
313
  )
314
  else:
lightrag/api/routers/query_routes.py CHANGED
@@ -22,7 +22,7 @@ class QueryRequest(BaseModel):
22
  description="The query text",
23
  )
24
 
25
- mode: Literal["local", "global", "hybrid", "naive", "mix"] = Field(
26
  default="hybrid",
27
  description="Query mode",
28
  )
 
22
  description="The query text",
23
  )
24
 
25
+ mode: Literal["local", "global", "hybrid", "naive", "mix", "bypass"] = Field(
26
  default="hybrid",
27
  description="Query mode",
28
  )
lightrag/api/webui/assets/{index-DSVCuARS.js → index-CkwV8nfm.js} RENAMED
Binary files a/lightrag/api/webui/assets/index-DSVCuARS.js and b/lightrag/api/webui/assets/index-CkwV8nfm.js differ
 
lightrag/api/webui/index.html CHANGED
Binary files a/lightrag/api/webui/index.html and b/lightrag/api/webui/index.html differ
 
lightrag/base.py CHANGED
@@ -12,7 +12,6 @@ from typing import (
12
  TypeVar,
13
  Callable,
14
  )
15
- import numpy as np
16
  from .utils import EmbeddingFunc
17
  from .types import KnowledgeGraph
18
 
@@ -36,7 +35,7 @@ T = TypeVar("T")
36
  class QueryParam:
37
  """Configuration parameters for query execution in LightRAG."""
38
 
39
- mode: Literal["local", "global", "hybrid", "naive", "mix"] = "global"
40
  """Specifies the retrieval mode:
41
  - "local": Focuses on context-dependent information.
42
  - "global": Utilizes global knowledge.
@@ -281,63 +280,164 @@ class BaseGraphStorage(StorageNameSpace, ABC):
281
 
282
  @abstractmethod
283
  async def has_node(self, node_id: str) -> bool:
284
- """Check if an edge exists in the graph."""
 
 
 
 
 
 
 
285
 
286
  @abstractmethod
287
  async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
288
- """Get the degree of a node."""
 
 
 
 
 
 
 
 
289
 
290
  @abstractmethod
291
  async def node_degree(self, node_id: str) -> int:
292
- """Get the degree of an edge."""
 
 
 
 
 
 
 
293
 
294
  @abstractmethod
295
  async def edge_degree(self, src_id: str, tgt_id: str) -> int:
296
- """Get a node by its id."""
 
 
 
 
 
 
 
 
297
 
298
  @abstractmethod
299
  async def get_node(self, node_id: str) -> dict[str, str] | None:
300
- """Get node by its label identifier, return only node properties"""
 
 
 
 
 
 
 
301
 
302
  @abstractmethod
303
  async def get_edge(
304
  self, source_node_id: str, target_node_id: str
305
  ) -> dict[str, str] | None:
306
- """Get edge properties between two nodes"""
 
 
 
 
 
 
 
 
307
 
308
  @abstractmethod
309
  async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None:
310
- """Upsert a node into the graph."""
 
 
 
 
 
 
 
 
311
 
312
  @abstractmethod
313
  async def upsert_node(self, node_id: str, node_data: dict[str, str]) -> None:
314
- """Upsert an edge into the graph."""
 
 
 
 
 
 
 
 
 
 
315
 
316
  @abstractmethod
317
  async def upsert_edge(
318
  self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
319
  ) -> None:
320
- """Delete a node from the graph.
321
 
322
  Importance notes for in-memory storage:
323
  1. Changes will be persisted to disk during the next index_done_callback
324
  2. Only one process should updating the storage at a time before index_done_callback,
325
  KG-storage-log should be used to avoid data corruption
 
 
 
 
 
326
  """
327
 
328
  @abstractmethod
329
  async def delete_node(self, node_id: str) -> None:
330
- """Embed nodes using an algorithm."""
 
 
 
 
 
 
 
 
 
331
 
332
  @abstractmethod
333
- async def embed_nodes(
334
- self, algorithm: str
335
- ) -> tuple[np.ndarray[Any, Any], list[str]]:
336
- """Get all labels in the graph."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
338
  @abstractmethod
339
  async def get_all_labels(self) -> list[str]:
340
- """Get a knowledge graph of a node."""
 
 
 
 
341
 
342
  @abstractmethod
343
  async def get_knowledge_graph(
 
12
  TypeVar,
13
  Callable,
14
  )
 
15
  from .utils import EmbeddingFunc
16
  from .types import KnowledgeGraph
17
 
 
35
  class QueryParam:
36
  """Configuration parameters for query execution in LightRAG."""
37
 
38
+ mode: Literal["local", "global", "hybrid", "naive", "mix", "bypass"] = "global"
39
  """Specifies the retrieval mode:
40
  - "local": Focuses on context-dependent information.
41
  - "global": Utilizes global knowledge.
 
280
 
281
  @abstractmethod
282
  async def has_node(self, node_id: str) -> bool:
283
+ """Check if a node exists in the graph.
284
+
285
+ Args:
286
+ node_id: The ID of the node to check
287
+
288
+ Returns:
289
+ True if the node exists, False otherwise
290
+ """
291
 
292
  @abstractmethod
293
  async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
294
+ """Check if an edge exists between two nodes.
295
+
296
+ Args:
297
+ source_node_id: The ID of the source node
298
+ target_node_id: The ID of the target node
299
+
300
+ Returns:
301
+ True if the edge exists, False otherwise
302
+ """
303
 
304
  @abstractmethod
305
  async def node_degree(self, node_id: str) -> int:
306
+ """Get the degree (number of connected edges) of a node.
307
+
308
+ Args:
309
+ node_id: The ID of the node
310
+
311
+ Returns:
312
+ The number of edges connected to the node
313
+ """
314
 
315
  @abstractmethod
316
  async def edge_degree(self, src_id: str, tgt_id: str) -> int:
317
+ """Get the total degree of an edge (sum of degrees of its source and target nodes).
318
+
319
+ Args:
320
+ src_id: The ID of the source node
321
+ tgt_id: The ID of the target node
322
+
323
+ Returns:
324
+ The sum of the degrees of the source and target nodes
325
+ """
326
 
327
  @abstractmethod
328
  async def get_node(self, node_id: str) -> dict[str, str] | None:
329
+ """Get node by its ID, returning only node properties.
330
+
331
+ Args:
332
+ node_id: The ID of the node to retrieve
333
+
334
+ Returns:
335
+ A dictionary of node properties if found, None otherwise
336
+ """
337
 
338
  @abstractmethod
339
  async def get_edge(
340
  self, source_node_id: str, target_node_id: str
341
  ) -> dict[str, str] | None:
342
+ """Get edge properties between two nodes.
343
+
344
+ Args:
345
+ source_node_id: The ID of the source node
346
+ target_node_id: The ID of the target node
347
+
348
+ Returns:
349
+ A dictionary of edge properties if found, None otherwise
350
+ """
351
 
352
  @abstractmethod
353
  async def get_node_edges(self, source_node_id: str) -> list[tuple[str, str]] | None:
354
+ """Get all edges connected to a node.
355
+
356
+ Args:
357
+ source_node_id: The ID of the node to get edges for
358
+
359
+ Returns:
360
+ A list of (source_id, target_id) tuples representing edges,
361
+ or None if the node doesn't exist
362
+ """
363
 
364
  @abstractmethod
365
  async def upsert_node(self, node_id: str, node_data: dict[str, str]) -> None:
366
+ """Insert a new node or update an existing node in the graph.
367
+
368
+ Important notes for in-memory storage:
369
+ 1. Changes will be persisted to disk during the next index_done_callback
370
+ 2. Only one process should update the storage at a time before index_done_callback,
371
+ KG-storage-log should be used to avoid data corruption
372
+
373
+ Args:
374
+ node_id: The ID of the node to insert or update
375
+ node_data: A dictionary of node properties
376
+ """
377
 
378
  @abstractmethod
379
  async def upsert_edge(
380
  self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
381
  ) -> None:
382
+ """Insert a new edge or update an existing edge in the graph.
383
 
384
  Importance notes for in-memory storage:
385
  1. Changes will be persisted to disk during the next index_done_callback
386
  2. Only one process should updating the storage at a time before index_done_callback,
387
  KG-storage-log should be used to avoid data corruption
388
+
389
+ Args:
390
+ source_node_id: The ID of the source node
391
+ target_node_id: The ID of the target node
392
+ edge_data: A dictionary of edge properties
393
  """
394
 
395
  @abstractmethod
396
  async def delete_node(self, node_id: str) -> None:
397
+ """Delete a node from the graph.
398
+
399
+ Important notes for in-memory storage:
400
+ 1. Changes will be persisted to disk during the next index_done_callback
401
+ 2. Only one process should update the storage at a time before index_done_callback,
402
+ KG-storage-log should be used to avoid data corruption
403
+
404
+ Args:
405
+ node_id: The ID of the node to delete
406
+ """
407
 
408
  @abstractmethod
409
+ async def remove_nodes(self, nodes: list[str]):
410
+ """Delete multiple nodes
411
+
412
+ Important notes:
413
+ 1. Changes will be persisted to disk during the next index_done_callback
414
+ 2. Only one process should update the storage at a time before index_done_callback,
415
+ KG-storage-log should be used to avoid data corruption
416
+
417
+ Args:
418
+ nodes: List of node IDs to be deleted
419
+ """
420
+
421
+ @abstractmethod
422
+ async def remove_edges(self, edges: list[tuple[str, str]]):
423
+ """Delete multiple edges
424
+
425
+ Important notes:
426
+ 1. Changes will be persisted to disk during the next index_done_callback
427
+ 2. Only one process should update the storage at a time before index_done_callback,
428
+ KG-storage-log should be used to avoid data corruption
429
+
430
+ Args:
431
+ edges: List of edges to be deleted, each edge is a (source, target) tuple
432
+ """
433
 
434
  @abstractmethod
435
  async def get_all_labels(self) -> list[str]:
436
+ """Get all labels in the graph.
437
+
438
+ Returns:
439
+ A list of all node labels in the graph, sorted alphabetically
440
+ """
441
 
442
  @abstractmethod
443
  async def get_knowledge_graph(
lightrag/kg/age_impl.py CHANGED
@@ -6,7 +6,6 @@ import sys
6
  from contextlib import asynccontextmanager
7
  from dataclasses import dataclass
8
  from typing import Any, Dict, List, NamedTuple, Optional, Union, final
9
- import numpy as np
10
  import pipmaster as pm
11
  from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
12
 
@@ -89,11 +88,6 @@ class AGEStorage(BaseGraphStorage):
89
 
90
  return None
91
 
92
- def __post_init__(self):
93
- self._node_embed_algorithms = {
94
- "node2vec": self._node2vec_embed,
95
- }
96
-
97
  async def close(self):
98
  if self._driver:
99
  await self._driver.close()
@@ -593,9 +587,6 @@ class AGEStorage(BaseGraphStorage):
593
  logger.error("Error during edge upsert: {%s}", e)
594
  raise
595
 
596
- async def _node2vec_embed(self):
597
- print("Implemented but never called.")
598
-
599
  @asynccontextmanager
600
  async def _get_pool_connection(self, timeout: Optional[float] = None):
601
  """Workaround for a psycopg_pool bug"""
@@ -668,21 +659,6 @@ class AGEStorage(BaseGraphStorage):
668
  logger.error(f"Error during edge deletion: {str(e)}")
669
  raise
670
 
671
- async def embed_nodes(
672
- self, algorithm: str
673
- ) -> tuple[np.ndarray[Any, Any], list[str]]:
674
- """Embed nodes using the specified algorithm
675
-
676
- Args:
677
- algorithm: Name of the embedding algorithm
678
-
679
- Returns:
680
- tuple: (embedding matrix, list of node identifiers)
681
- """
682
- if algorithm not in self._node_embed_algorithms:
683
- raise ValueError(f"Node embedding algorithm {algorithm} not supported")
684
- return await self._node_embed_algorithms[algorithm]()
685
-
686
  async def get_all_labels(self) -> list[str]:
687
  """Get all node labels in the database
688
 
 
6
  from contextlib import asynccontextmanager
7
  from dataclasses import dataclass
8
  from typing import Any, Dict, List, NamedTuple, Optional, Union, final
 
9
  import pipmaster as pm
10
  from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
11
 
 
88
 
89
  return None
90
 
 
 
 
 
 
91
  async def close(self):
92
  if self._driver:
93
  await self._driver.close()
 
587
  logger.error("Error during edge upsert: {%s}", e)
588
  raise
589
 
 
 
 
590
  @asynccontextmanager
591
  async def _get_pool_connection(self, timeout: Optional[float] = None):
592
  """Workaround for a psycopg_pool bug"""
 
659
  logger.error(f"Error during edge deletion: {str(e)}")
660
  raise
661
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  async def get_all_labels(self) -> list[str]:
663
  """Get all node labels in the database
664
 
lightrag/kg/gremlin_impl.py CHANGED
@@ -6,9 +6,6 @@ import pipmaster as pm
6
  from dataclasses import dataclass
7
  from typing import Any, Dict, List, final
8
 
9
- import numpy as np
10
-
11
-
12
  from tenacity import (
13
  retry,
14
  retry_if_exception_type,
@@ -72,11 +69,6 @@ class GremlinStorage(BaseGraphStorage):
72
  transport_factory=lambda: AiohttpTransport(call_from_event_loop=True),
73
  )
74
 
75
- def __post_init__(self):
76
- self._node_embed_algorithms = {
77
- "node2vec": self._node2vec_embed,
78
- }
79
-
80
  async def close(self):
81
  if self._driver:
82
  self._driver.close()
@@ -392,9 +384,6 @@ class GremlinStorage(BaseGraphStorage):
392
  logger.error("Error during edge upsert: {%s}", e)
393
  raise
394
 
395
- async def _node2vec_embed(self):
396
- print("Implemented but never called.")
397
-
398
  async def delete_node(self, node_id: str) -> None:
399
  """Delete a node with the specified entity_name
400
 
@@ -419,27 +408,6 @@ class GremlinStorage(BaseGraphStorage):
419
  logger.error(f"Error during node deletion: {str(e)}")
420
  raise
421
 
422
- async def embed_nodes(
423
- self, algorithm: str
424
- ) -> tuple[np.ndarray[Any, Any], list[str]]:
425
- """
426
- Embed nodes using the specified algorithm.
427
- Currently, only node2vec is supported but never called.
428
-
429
- Args:
430
- algorithm: The name of the embedding algorithm to use
431
-
432
- Returns:
433
- A tuple of (embeddings, node_ids)
434
-
435
- Raises:
436
- NotImplementedError: If the specified algorithm is not supported
437
- ValueError: If the algorithm is not supported
438
- """
439
- if algorithm not in self._node_embed_algorithms:
440
- raise ValueError(f"Node embedding algorithm {algorithm} not supported")
441
- return await self._node_embed_algorithms[algorithm]()
442
-
443
  async def get_all_labels(self) -> list[str]:
444
  """
445
  Get all node entity_names in the graph
 
6
  from dataclasses import dataclass
7
  from typing import Any, Dict, List, final
8
 
 
 
 
9
  from tenacity import (
10
  retry,
11
  retry_if_exception_type,
 
69
  transport_factory=lambda: AiohttpTransport(call_from_event_loop=True),
70
  )
71
 
 
 
 
 
 
72
  async def close(self):
73
  if self._driver:
74
  self._driver.close()
 
384
  logger.error("Error during edge upsert: {%s}", e)
385
  raise
386
 
 
 
 
387
  async def delete_node(self, node_id: str) -> None:
388
  """Delete a node with the specified entity_name
389
 
 
408
  logger.error(f"Error during node deletion: {str(e)}")
409
  raise
410
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
411
  async def get_all_labels(self) -> list[str]:
412
  """
413
  Get all node entity_names in the graph
lightrag/kg/mongo_impl.py CHANGED
@@ -663,20 +663,6 @@ class MongoGraphStorage(BaseGraphStorage):
663
  # Remove the node doc
664
  await self.collection.delete_one({"_id": node_id})
665
 
666
- #
667
- # -------------------------------------------------------------------------
668
- # EMBEDDINGS (NOT IMPLEMENTED)
669
- # -------------------------------------------------------------------------
670
- #
671
-
672
- async def embed_nodes(
673
- self, algorithm: str
674
- ) -> tuple[np.ndarray[Any, Any], list[str]]:
675
- """
676
- Placeholder for demonstration, raises NotImplementedError.
677
- """
678
- raise NotImplementedError("Node embedding is not used in lightrag.")
679
-
680
  #
681
  # -------------------------------------------------------------------------
682
  # QUERY
 
663
  # Remove the node doc
664
  await self.collection.delete_one({"_id": node_id})
665
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
666
  #
667
  # -------------------------------------------------------------------------
668
  # QUERY
lightrag/kg/neo4j_impl.py CHANGED
@@ -2,8 +2,7 @@ import inspect
2
  import os
3
  import re
4
  from dataclasses import dataclass
5
- from typing import Any, final
6
- import numpy as np
7
  import configparser
8
 
9
 
@@ -51,11 +50,6 @@ class Neo4JStorage(BaseGraphStorage):
51
  )
52
  self._driver = None
53
 
54
- def __post_init__(self):
55
- self._node_embed_algorithms = {
56
- "node2vec": self._node2vec_embed,
57
- }
58
-
59
  async def initialize(self):
60
  URI = os.environ.get("NEO4J_URI", config.get("neo4j", "uri", fallback=None))
61
  USERNAME = os.environ.get(
@@ -635,9 +629,6 @@ class Neo4JStorage(BaseGraphStorage):
635
  logger.error(f"Error during edge upsert: {str(e)}")
636
  raise
637
 
638
- async def _node2vec_embed(self):
639
- print("Implemented but never called.")
640
-
641
  async def get_knowledge_graph(
642
  self,
643
  node_label: str,
@@ -1126,11 +1117,6 @@ class Neo4JStorage(BaseGraphStorage):
1126
  logger.error(f"Error during edge deletion: {str(e)}")
1127
  raise
1128
 
1129
- async def embed_nodes(
1130
- self, algorithm: str
1131
- ) -> tuple[np.ndarray[Any, Any], list[str]]:
1132
- raise NotImplementedError
1133
-
1134
  async def drop(self) -> dict[str, str]:
1135
  """Drop all data from storage and clean up resources
1136
 
 
2
  import os
3
  import re
4
  from dataclasses import dataclass
5
+ from typing import final
 
6
  import configparser
7
 
8
 
 
50
  )
51
  self._driver = None
52
 
 
 
 
 
 
53
  async def initialize(self):
54
  URI = os.environ.get("NEO4J_URI", config.get("neo4j", "uri", fallback=None))
55
  USERNAME = os.environ.get(
 
629
  logger.error(f"Error during edge upsert: {str(e)}")
630
  raise
631
 
 
 
 
632
  async def get_knowledge_graph(
633
  self,
634
  node_label: str,
 
1117
  logger.error(f"Error during edge deletion: {str(e)}")
1118
  raise
1119
 
 
 
 
 
 
1120
  async def drop(self) -> dict[str, str]:
1121
  """Drop all data from storage and clean up resources
1122
 
lightrag/kg/networkx_impl.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  from dataclasses import dataclass
3
- from typing import Any, final
4
- import numpy as np
5
 
6
  from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
7
  from lightrag.utils import logger
@@ -16,7 +15,6 @@ if not pm.is_installed("graspologic"):
16
  pm.install("graspologic")
17
 
18
  import networkx as nx
19
- from graspologic import embed
20
  from .shared_storage import (
21
  get_storage_lock,
22
  get_update_flag,
@@ -42,40 +40,6 @@ class NetworkXStorage(BaseGraphStorage):
42
  )
43
  nx.write_graphml(graph, file_name)
44
 
45
- # TODO:deprecated, remove later
46
- @staticmethod
47
- def _stabilize_graph(graph: nx.Graph) -> nx.Graph:
48
- """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
49
- Ensure an undirected graph with the same relationships will always be read the same way.
50
- """
51
- fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph()
52
-
53
- sorted_nodes = graph.nodes(data=True)
54
- sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0])
55
-
56
- fixed_graph.add_nodes_from(sorted_nodes)
57
- edges = list(graph.edges(data=True))
58
-
59
- if not graph.is_directed():
60
-
61
- def _sort_source_target(edge):
62
- source, target, edge_data = edge
63
- if source > target:
64
- temp = source
65
- source = target
66
- target = temp
67
- return source, target, edge_data
68
-
69
- edges = [_sort_source_target(edge) for edge in edges]
70
-
71
- def _get_edge_key(source: Any, target: Any) -> str:
72
- return f"{source} -> {target}"
73
-
74
- edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1]))
75
-
76
- fixed_graph.add_edges_from(edges)
77
- return fixed_graph
78
-
79
  def __post_init__(self):
80
  self._graphml_xml_file = os.path.join(
81
  self.global_config["working_dir"], f"graph_{self.namespace}.graphml"
@@ -94,10 +58,6 @@ class NetworkXStorage(BaseGraphStorage):
94
  logger.info("Created new empty graph")
95
  self._graph = preloaded_graph or nx.Graph()
96
 
97
- self._node_embed_algorithms = {
98
- "node2vec": self._node2vec_embed,
99
- }
100
-
101
  async def initialize(self):
102
  """Initialize storage data"""
103
  # Get the update flag for cross-process update notification
@@ -191,24 +151,6 @@ class NetworkXStorage(BaseGraphStorage):
191
  else:
192
  logger.warning(f"Node {node_id} not found in the graph for deletion.")
193
 
194
- # TODO: NOT USED
195
- async def embed_nodes(
196
- self, algorithm: str
197
- ) -> tuple[np.ndarray[Any, Any], list[str]]:
198
- if algorithm not in self._node_embed_algorithms:
199
- raise ValueError(f"Node embedding algorithm {algorithm} not supported")
200
- return await self._node_embed_algorithms[algorithm]()
201
-
202
- # TODO: NOT USED
203
- async def _node2vec_embed(self):
204
- graph = await self._get_graph()
205
- embeddings, nodes = embed.node2vec_embed(
206
- graph,
207
- **self.global_config["node2vec_params"],
208
- )
209
- nodes_ids = [graph.nodes[node_id]["id"] for node_id in nodes]
210
- return embeddings, nodes_ids
211
-
212
  async def remove_nodes(self, nodes: list[str]):
213
  """Delete multiple nodes
214
 
 
1
  import os
2
  from dataclasses import dataclass
3
+ from typing import final
 
4
 
5
  from lightrag.types import KnowledgeGraph, KnowledgeGraphNode, KnowledgeGraphEdge
6
  from lightrag.utils import logger
 
15
  pm.install("graspologic")
16
 
17
  import networkx as nx
 
18
  from .shared_storage import (
19
  get_storage_lock,
20
  get_update_flag,
 
40
  )
41
  nx.write_graphml(graph, file_name)
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  def __post_init__(self):
44
  self._graphml_xml_file = os.path.join(
45
  self.global_config["working_dir"], f"graph_{self.namespace}.graphml"
 
58
  logger.info("Created new empty graph")
59
  self._graph = preloaded_graph or nx.Graph()
60
 
 
 
 
 
61
  async def initialize(self):
62
  """Initialize storage data"""
63
  # Get the update flag for cross-process update notification
 
151
  else:
152
  logger.warning(f"Node {node_id} not found in the graph for deletion.")
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  async def remove_nodes(self, nodes: list[str]):
155
  """Delete multiple nodes
156
 
lightrag/kg/postgres_impl.py CHANGED
@@ -1021,9 +1021,6 @@ class PGGraphQueryException(Exception):
1021
  class PGGraphStorage(BaseGraphStorage):
1022
  def __post_init__(self):
1023
  self.graph_name = self.namespace or os.environ.get("AGE_GRAPH_NAME", "lightrag")
1024
- self._node_embed_algorithms = {
1025
- "node2vec": self._node2vec_embed,
1026
- }
1027
  self.db: PostgreSQLDB | None = None
1028
 
1029
  async def initialize(self):
@@ -1396,9 +1393,6 @@ class PGGraphStorage(BaseGraphStorage):
1396
  )
1397
  raise
1398
 
1399
- async def _node2vec_embed(self):
1400
- print("Implemented but never called.")
1401
-
1402
  async def delete_node(self, node_id: str) -> None:
1403
  """
1404
  Delete a node from the graph.
@@ -1485,24 +1479,6 @@ class PGGraphStorage(BaseGraphStorage):
1485
  labels = [result["label"] for result in results]
1486
  return labels
1487
 
1488
- async def embed_nodes(
1489
- self, algorithm: str
1490
- ) -> tuple[np.ndarray[Any, Any], list[str]]:
1491
- """
1492
- Generate node embeddings using the specified algorithm.
1493
-
1494
- Args:
1495
- algorithm (str): The name of the embedding algorithm to use.
1496
-
1497
- Returns:
1498
- tuple[np.ndarray[Any, Any], list[str]]: A tuple containing the embeddings and the corresponding node IDs.
1499
- """
1500
- if algorithm not in self._node_embed_algorithms:
1501
- raise ValueError(f"Unsupported embedding algorithm: {algorithm}")
1502
-
1503
- embed_func = self._node_embed_algorithms[algorithm]
1504
- return await embed_func()
1505
-
1506
  async def get_knowledge_graph(
1507
  self,
1508
  node_label: str,
 
1021
  class PGGraphStorage(BaseGraphStorage):
1022
  def __post_init__(self):
1023
  self.graph_name = self.namespace or os.environ.get("AGE_GRAPH_NAME", "lightrag")
 
 
 
1024
  self.db: PostgreSQLDB | None = None
1025
 
1026
  async def initialize(self):
 
1393
  )
1394
  raise
1395
 
 
 
 
1396
  async def delete_node(self, node_id: str) -> None:
1397
  """
1398
  Delete a node from the graph.
 
1479
  labels = [result["label"] for result in results]
1480
  return labels
1481
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1482
  async def get_knowledge_graph(
1483
  self,
1484
  node_label: str,
lightrag/kg/tidb_impl.py CHANGED
@@ -800,13 +800,6 @@ class TiDBGraphStorage(BaseGraphStorage):
800
  }
801
  await self.db.execute(merge_sql, data)
802
 
803
- async def embed_nodes(
804
- self, algorithm: str
805
- ) -> tuple[np.ndarray[Any, Any], list[str]]:
806
- if algorithm not in self._node_embed_algorithms:
807
- raise ValueError(f"Node embedding algorithm {algorithm} not supported")
808
- return await self._node_embed_algorithms[algorithm]()
809
-
810
  # Query
811
 
812
  async def has_node(self, node_id: str) -> bool:
 
800
  }
801
  await self.db.execute(merge_sql, data)
802
 
 
 
 
 
 
 
 
803
  # Query
804
 
805
  async def has_node(self, node_id: str) -> bool:
lightrag/lightrag.py CHANGED
@@ -155,31 +155,6 @@ class LightRAG:
155
  Defaults to `chunking_by_token_size` if not specified.
156
  """
157
 
158
- # Node embedding
159
- # ---
160
-
161
- node_embedding_algorithm: str = field(default="node2vec")
162
- """Algorithm used for node embedding in knowledge graphs."""
163
-
164
- node2vec_params: dict[str, int] = field(
165
- default_factory=lambda: {
166
- "dimensions": 1536,
167
- "num_walks": 10,
168
- "walk_length": 40,
169
- "window_size": 2,
170
- "iterations": 3,
171
- "random_seed": 3,
172
- }
173
- )
174
- """Configuration for the node2vec embedding algorithm:
175
- - dimensions: Number of dimensions for embeddings.
176
- - num_walks: Number of random walks per node.
177
- - walk_length: Number of steps per random walk.
178
- - window_size: Context window size for training.
179
- - iterations: Number of iterations for training.
180
- - random_seed: Seed value for reproducibility.
181
- """
182
-
183
  # Embedding
184
  # ---
185
 
@@ -904,8 +879,10 @@ class LightRAG:
904
 
905
  async with pipeline_status_lock:
906
  log_message = f"Processing file: {file_path}"
 
907
  pipeline_status["history_messages"].append(log_message)
908
  log_message = f"Processing d-id: {doc_id}"
 
909
  pipeline_status["latest_message"] = log_message
910
  pipeline_status["history_messages"].append(log_message)
911
 
@@ -1381,6 +1358,16 @@ class LightRAG:
1381
  hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
1382
  system_prompt=system_prompt,
1383
  )
 
 
 
 
 
 
 
 
 
 
1384
  else:
1385
  raise ValueError(f"Unknown mode {param.mode}")
1386
  await self._query_done()
 
155
  Defaults to `chunking_by_token_size` if not specified.
156
  """
157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  # Embedding
159
  # ---
160
 
 
879
 
880
  async with pipeline_status_lock:
881
  log_message = f"Processing file: {file_path}"
882
+ logger.info(log_message)
883
  pipeline_status["history_messages"].append(log_message)
884
  log_message = f"Processing d-id: {doc_id}"
885
+ logger.info(log_message)
886
  pipeline_status["latest_message"] = log_message
887
  pipeline_status["history_messages"].append(log_message)
888
 
 
1358
  hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
1359
  system_prompt=system_prompt,
1360
  )
1361
+ elif param.mode == "bypass":
1362
+ # Bypass mode: directly use LLM without knowledge retrieval
1363
+ use_llm_func = param.model_func or global_config["llm_model_func"]
1364
+ param.stream = True if param.stream is None else param.stream
1365
+ response = await use_llm_func(
1366
+ query.strip(),
1367
+ system_prompt=system_prompt,
1368
+ history_messages=param.conversation_history,
1369
+ stream=param.stream,
1370
+ )
1371
  else:
1372
  raise ValueError(f"Unknown mode {param.mode}")
1373
  await self._query_done()
lightrag/operate.py CHANGED
@@ -16,6 +16,7 @@ from .utils import (
16
  encode_string_by_tiktoken,
17
  is_float_regex,
18
  list_of_list_to_csv,
 
19
  pack_user_ass_to_openai_messages,
20
  split_string_by_multi_markers,
21
  truncate_list_by_token_size,
@@ -163,6 +164,9 @@ async def _handle_single_entity_extraction(
163
  )
164
  return None
165
 
 
 
 
166
  # Clean and validate entity type
167
  entity_type = clean_str(record_attributes[2]).strip('"')
168
  if not entity_type.strip() or entity_type.startswith('("'):
@@ -172,7 +176,9 @@ async def _handle_single_entity_extraction(
172
  return None
173
 
174
  # Clean and validate description
175
- entity_description = clean_str(record_attributes[3]).strip('"')
 
 
176
  if not entity_description.strip():
177
  logger.warning(
178
  f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
@@ -196,13 +202,20 @@ async def _handle_single_relationship_extraction(
196
  if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
197
  return None
198
  # add this record as edge
199
- source = clean_str(record_attributes[1]).strip('"')
200
- target = clean_str(record_attributes[2]).strip('"')
201
- edge_description = clean_str(record_attributes[3]).strip('"')
202
- edge_keywords = clean_str(record_attributes[4]).strip('"')
 
 
 
 
 
 
 
203
  edge_source_id = chunk_key
204
  weight = (
205
- float(record_attributes[-1].strip('"'))
206
  if is_float_regex(record_attributes[-1])
207
  else 1.0
208
  )
@@ -642,7 +655,7 @@ async def extract_entities(
642
  processed_chunks += 1
643
  entities_count = len(maybe_nodes)
644
  relations_count = len(maybe_edges)
645
- log_message = f"Chk {processed_chunks}/{total_chunks}: extracted {entities_count} Ent + {relations_count} Rel (deduplicated)"
646
  logger.info(log_message)
647
  if pipeline_status is not None:
648
  async with pipeline_status_lock:
 
16
  encode_string_by_tiktoken,
17
  is_float_regex,
18
  list_of_list_to_csv,
19
+ normalize_extracted_info,
20
  pack_user_ass_to_openai_messages,
21
  split_string_by_multi_markers,
22
  truncate_list_by_token_size,
 
164
  )
165
  return None
166
 
167
+ # Normalize entity name
168
+ entity_name = normalize_extracted_info(entity_name, is_entity=True)
169
+
170
  # Clean and validate entity type
171
  entity_type = clean_str(record_attributes[2]).strip('"')
172
  if not entity_type.strip() or entity_type.startswith('("'):
 
176
  return None
177
 
178
  # Clean and validate description
179
+ entity_description = clean_str(record_attributes[3])
180
+ entity_description = normalize_extracted_info(entity_description)
181
+
182
  if not entity_description.strip():
183
  logger.warning(
184
  f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
 
202
  if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
203
  return None
204
  # add this record as edge
205
+ source = clean_str(record_attributes[1])
206
+ target = clean_str(record_attributes[2])
207
+
208
+ # Normalize source and target entity names
209
+ source = normalize_extracted_info(source, is_entity=True)
210
+ target = normalize_extracted_info(target, is_entity=True)
211
+
212
+ edge_description = clean_str(record_attributes[3])
213
+ edge_description = normalize_extracted_info(edge_description)
214
+
215
+ edge_keywords = clean_str(record_attributes[4]).strip('"').strip("'")
216
  edge_source_id = chunk_key
217
  weight = (
218
+ float(record_attributes[-1].strip('"').strip("'"))
219
  if is_float_regex(record_attributes[-1])
220
  else 1.0
221
  )
 
655
  processed_chunks += 1
656
  entities_count = len(maybe_nodes)
657
  relations_count = len(maybe_edges)
658
+ log_message = f"Chk {processed_chunks}/{total_chunks}: extracted {entities_count} Ent + {relations_count} Rel"
659
  logger.info(log_message)
660
  if pipeline_status is not None:
661
  async with pipeline_status_lock:
lightrag/utils.py CHANGED
@@ -1006,6 +1006,50 @@ def get_content_summary(content: str, max_length: int = 250) -> str:
1006
  return content[:max_length] + "..."
1007
 
1008
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
  def clean_text(text: str) -> str:
1010
  """Clean text by removing null bytes (0x00) and whitespace
1011
 
 
1006
  return content[:max_length] + "..."
1007
 
1008
 
1009
+ def normalize_extracted_info(name: str, is_entity=False) -> str:
1010
+ """Normalize entity/relation names and description with the following rules:
1011
+ 1. Remove spaces between Chinese characters
1012
+ 2. Remove spaces between Chinese characters and English letters/numbers
1013
+ 3. Preserve spaces within English text and numbers
1014
+ 4. Replace Chinese parentheses with English parentheses
1015
+ 5. Replace Chinese dash with English dash
1016
+
1017
+ Args:
1018
+ name: Entity name to normalize
1019
+
1020
+ Returns:
1021
+ Normalized entity name
1022
+ """
1023
+ # Replace Chinese parentheses with English parentheses
1024
+ name = name.replace("(", "(").replace(")", ")")
1025
+
1026
+ # Replace Chinese dash with English dash
1027
+ name = name.replace("—", "-").replace("-", "-")
1028
+
1029
+ # Use regex to remove spaces between Chinese characters
1030
+ # Regex explanation:
1031
+ # (?<=[\u4e00-\u9fa5]): Positive lookbehind for Chinese character
1032
+ # \s+: One or more whitespace characters
1033
+ # (?=[\u4e00-\u9fa5]): Positive lookahead for Chinese character
1034
+ name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[\u4e00-\u9fa5])", "", name)
1035
+
1036
+ # Remove spaces between Chinese and English/numbers
1037
+ name = re.sub(r"(?<=[\u4e00-\u9fa5])\s+(?=[a-zA-Z0-9])", "", name)
1038
+ name = re.sub(r"(?<=[a-zA-Z0-9])\s+(?=[\u4e00-\u9fa5])", "", name)
1039
+
1040
+ # Remove English quotation marks from the beginning and end
1041
+ name = name.strip('"').strip("'")
1042
+
1043
+ if is_entity:
1044
+ # remove Chinese quotes
1045
+ name = name.replace("“", "").replace("”", "").replace("‘", "").replace("’", "")
1046
+ # remove English queotes in and around chinese
1047
+ name = re.sub(r"['\"]+(?=[\u4e00-\u9fa5])", "", name)
1048
+ name = re.sub(r"(?<=[\u4e00-\u9fa5])['\"]+", "", name)
1049
+
1050
+ return name
1051
+
1052
+
1053
  def clean_text(text: str) -> str:
1054
  """Clean text by removing null bytes (0x00) and whitespace
1055
 
lightrag_webui/src/api/lightrag.ts CHANGED
@@ -65,8 +65,9 @@ export type LightragDocumentsScanProgress = {
65
  * - "global": Utilizes global knowledge.
66
  * - "hybrid": Combines local and global retrieval methods.
67
  * - "mix": Integrates knowledge graph and vector retrieval.
 
68
  */
69
- export type QueryMode = 'naive' | 'local' | 'global' | 'hybrid' | 'mix'
70
 
71
  export type Message = {
72
  role: 'user' | 'assistant' | 'system'
 
65
  * - "global": Utilizes global knowledge.
66
  * - "hybrid": Combines local and global retrieval methods.
67
  * - "mix": Integrates knowledge graph and vector retrieval.
68
+ * - "bypass": Bypasses knowledge retrieval and directly uses the LLM.
69
  */
70
+ export type QueryMode = 'naive' | 'local' | 'global' | 'hybrid' | 'mix' | 'bypass'
71
 
72
  export type Message = {
73
  role: 'user' | 'assistant' | 'system'
lightrag_webui/src/components/retrieval/QuerySettings.tsx CHANGED
@@ -55,6 +55,7 @@ export default function QuerySettings() {
55
  <SelectItem value="global">{t('retrievePanel.querySettings.queryModeOptions.global')}</SelectItem>
56
  <SelectItem value="hybrid">{t('retrievePanel.querySettings.queryModeOptions.hybrid')}</SelectItem>
57
  <SelectItem value="mix">{t('retrievePanel.querySettings.queryModeOptions.mix')}</SelectItem>
 
58
  </SelectGroup>
59
  </SelectContent>
60
  </Select>
 
55
  <SelectItem value="global">{t('retrievePanel.querySettings.queryModeOptions.global')}</SelectItem>
56
  <SelectItem value="hybrid">{t('retrievePanel.querySettings.queryModeOptions.hybrid')}</SelectItem>
57
  <SelectItem value="mix">{t('retrievePanel.querySettings.queryModeOptions.mix')}</SelectItem>
58
+ <SelectItem value="bypass">{t('retrievePanel.querySettings.queryModeOptions.bypass')}</SelectItem>
59
  </SelectGroup>
60
  </SelectContent>
61
  </Select>
lightrag_webui/src/locales/ar.json CHANGED
@@ -306,13 +306,14 @@
306
  "parametersTitle": "المعلمات",
307
  "parametersDescription": "تكوين معلمات الاستعلام الخاص بك",
308
  "queryMode": "وضع الاستعلام",
309
- "queryModeTooltip": "حدد استراتيجية الاسترجاع:\n• ساذج: بحث أساسي بدون تقنيات متقدمة\n• محلي: استرجاع معلومات يعتمد على السياق\n• عالمي: يستخدم قاعدة المعرفة العالمية\n• مختلط: يجمع بين الاسترجاع المحلي والعالمي\n• مزيج: يدمج شبكة المعرفة مع الاسترجاع المتجهي",
310
  "queryModeOptions": {
311
  "naive": "ساذج",
312
  "local": "محلي",
313
  "global": "عالمي",
314
  "hybrid": "مختلط",
315
- "mix": "مزيج"
 
316
  },
317
  "responseFormat": "تنسيق الرد",
318
  "responseFormatTooltip": "يحدد تنسيق الرد. أمثلة:\n• فقرات متعددة\n• فقرة واحدة\n• نقاط نقطية",
 
306
  "parametersTitle": "المعلمات",
307
  "parametersDescription": "تكوين معلمات الاستعلام الخاص بك",
308
  "queryMode": "وضع الاستعلام",
309
+ "queryModeTooltip": "حدد استراتيجية الاسترجاع:\n• ساذج: بحث أساسي بدون تقنيات متقدمة\n• محلي: استرجاع معلومات يعتمد على السياق\n• عالمي: يستخدم قاعدة المعرفة العالمية\n• مختلط: يجمع بين الاسترجاع المحلي والعالمي\n• مزيج: يدمج شبكة المعرفة مع الاسترجاع المتجهي\n• تجاوز: يمرر الاستعلام مباشرة إلى LLM بدون استرجاع",
310
  "queryModeOptions": {
311
  "naive": "ساذج",
312
  "local": "محلي",
313
  "global": "عالمي",
314
  "hybrid": "مختلط",
315
+ "mix": "مزيج",
316
+ "bypass": "تجاوز"
317
  },
318
  "responseFormat": "تنسيق الرد",
319
  "responseFormatTooltip": "يحدد تنسيق الرد. أمثلة:\n• فقرات متعددة\n• فقرة واحدة\n• نقاط نقطية",
lightrag_webui/src/locales/en.json CHANGED
@@ -305,13 +305,14 @@
305
  "parametersTitle": "Parameters",
306
  "parametersDescription": "Configure your query parameters",
307
  "queryMode": "Query Mode",
308
- "queryModeTooltip": "Select the retrieval strategy:\n• Naive: Basic search without advanced techniques\n• Local: Context-dependent information retrieval\n• Global: Utilizes global knowledge base\n• Hybrid: Combines local and global retrieval\n• Mix: Integrates knowledge graph with vector retrieval",
309
  "queryModeOptions": {
310
  "naive": "Naive",
311
  "local": "Local",
312
  "global": "Global",
313
  "hybrid": "Hybrid",
314
- "mix": "Mix"
 
315
  },
316
  "responseFormat": "Response Format",
317
  "responseFormatTooltip": "Defines the response format. Examples:\n• Multiple Paragraphs\n• Single Paragraph\n• Bullet Points",
 
305
  "parametersTitle": "Parameters",
306
  "parametersDescription": "Configure your query parameters",
307
  "queryMode": "Query Mode",
308
+ "queryModeTooltip": "Select the retrieval strategy:\n• Naive: Basic search without advanced techniques\n• Local: Context-dependent information retrieval\n• Global: Utilizes global knowledge base\n• Hybrid: Combines local and global retrieval\n• Mix: Integrates knowledge graph with vector retrieval\n• Bypass: Passes query directly to LLM without retrieval",
309
  "queryModeOptions": {
310
  "naive": "Naive",
311
  "local": "Local",
312
  "global": "Global",
313
  "hybrid": "Hybrid",
314
+ "mix": "Mix",
315
+ "bypass": "Bypass"
316
  },
317
  "responseFormat": "Response Format",
318
  "responseFormatTooltip": "Defines the response format. Examples:\n• Multiple Paragraphs\n• Single Paragraph\n• Bullet Points",
lightrag_webui/src/locales/fr.json CHANGED
@@ -306,13 +306,14 @@
306
  "parametersTitle": "Paramètres",
307
  "parametersDescription": "Configurez vos paramètres de requête",
308
  "queryMode": "Mode de requête",
309
- "queryModeTooltip": "Sélectionnez la stratégie de récupération :\n• Naïf : Recherche de base sans techniques avancées\n• Local : Récupération d'informations dépendante du contexte\n• Global : Utilise une base de connaissances globale\n• Hybride : Combine récupération locale et globale\n• Mixte : Intègre le graphe de connaissances avec la récupération vectorielle",
310
  "queryModeOptions": {
311
  "naive": "Naïf",
312
  "local": "Local",
313
  "global": "Global",
314
  "hybrid": "Hybride",
315
- "mix": "Mixte"
 
316
  },
317
  "responseFormat": "Format de réponse",
318
  "responseFormatTooltip": "Définit le format de la réponse. Exemples :\n• Plusieurs paragraphes\n• Paragraphe unique\n• Points à puces",
 
306
  "parametersTitle": "Paramètres",
307
  "parametersDescription": "Configurez vos paramètres de requête",
308
  "queryMode": "Mode de requête",
309
+ "queryModeTooltip": "Sélectionnez la stratégie de récupération :\n• Naïf : Recherche de base sans techniques avancées\n• Local : Récupération d'informations dépendante du contexte\n• Global : Utilise une base de connaissances globale\n• Hybride : Combine récupération locale et globale\n• Mixte : Intègre le graphe de connaissances avec la récupération vectorielle\n• Bypass : Transmet directement la requête au LLM sans récupération",
310
  "queryModeOptions": {
311
  "naive": "Naïf",
312
  "local": "Local",
313
  "global": "Global",
314
  "hybrid": "Hybride",
315
+ "mix": "Mixte",
316
+ "bypass": "Bypass"
317
  },
318
  "responseFormat": "Format de réponse",
319
  "responseFormatTooltip": "Définit le format de la réponse. Exemples :\n• Plusieurs paragraphes\n• Paragraphe unique\n• Points à puces",
lightrag_webui/src/locales/zh.json CHANGED
@@ -306,13 +306,14 @@
306
  "parametersTitle": "参数",
307
  "parametersDescription": "配置查询参数",
308
  "queryMode": "查询模式",
309
- "queryModeTooltip": "选择检索策略:\n• Naive:基础搜索,无高级技术\n• Local:上下文相关信息检索\n• Global:利用全局知识库\n• Hybrid:结合本地和全局检索\n• Mix:整合知识图谱和向量检索",
310
  "queryModeOptions": {
311
  "naive": "Naive",
312
  "local": "Local",
313
  "global": "Global",
314
  "hybrid": "Hybrid",
315
- "mix": "Mix"
 
316
  },
317
  "responseFormat": "响应格式",
318
  "responseFormatTooltip": "定义响应格式。例如:\n• 多段落\n• 单段落\n• 要点",
 
306
  "parametersTitle": "参数",
307
  "parametersDescription": "配置查询参数",
308
  "queryMode": "查询模式",
309
+ "queryModeTooltip": "选择检索策略:\n• Naive:基础搜索,无高级技术\n• Local:上下文相关信息检索\n• Global:利用全局知识库\n• Hybrid:结合本地和全局检索\n• Mix:整合知识图谱和向量检索\n• Bypass:直接传递查询到LLM,不进行检索",
310
  "queryModeOptions": {
311
  "naive": "Naive",
312
  "local": "Local",
313
  "global": "Global",
314
  "hybrid": "Hybrid",
315
+ "mix": "Mix",
316
+ "bypass": "Bypass"
317
  },
318
  "responseFormat": "响应格式",
319
  "responseFormatTooltip": "定义响应格式。例如:\n• 多段落\n• 单段落\n• 要点",