Commit 3f54db9 · yangdx committed · 2 parent(s): c27f760 4eec805

Merge branch 'main' into fix-neo4j-duplicate-nodes
README.md CHANGED
@@ -176,6 +176,8 @@ class QueryParam:
     """Maximum number of tokens allocated for relationship descriptions in global retrieval."""
     max_token_for_local_context: int = 4000
     """Maximum number of tokens allocated for entity descriptions in local retrieval."""
+    ids: list[str] | None = None  # ONLY SUPPORTED FOR PG VECTOR DBs
+    """List of ids to filter the RAG."""
     ...
 ```
 
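A minimal usage sketch of the new `ids` filter, assuming a `LightRAG` instance already configured with the PostgreSQL vector storage and the usual LLM/embedding functions (configuration is elided here); the query text and document IDs are illustrative:

```python
from lightrag import LightRAG, QueryParam

rag = LightRAG(working_dir="./rag_storage")  # assumes PGVectorStorage + model funcs configured elsewhere

# Restrict retrieval to two previously inserted documents (PG vector DBs only).
response = rag.query(
    "What does the contract say about termination?",
    param=QueryParam(mode="hybrid", ids=["doc-1", "doc-2"]),  # illustrative doc IDs
)
print(response)
```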
lightrag/__init__.py CHANGED
@@ -1,5 +1,5 @@
 from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
 
-__version__ = "1.2.4"
+__version__ = "1.2.5"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/LightRAG"
lightrag/base.py CHANGED
@@ -81,6 +81,9 @@ class QueryParam:
     history_turns: int = 3
     """Number of complete conversation turns (user-assistant pairs) to consider in the response context."""
 
+    ids: list[str] | None = None
+    """List of ids to filter the results."""
+
 
 @dataclass
 class StorageNameSpace(ABC):
@@ -107,7 +110,9 @@ class BaseVectorStorage(StorageNameSpace, ABC):
     meta_fields: set[str] = field(default_factory=set)
 
     @abstractmethod
-    async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
+    async def query(
+        self, query: str, top_k: int, ids: list[str] | None = None
+    ) -> list[dict[str, Any]]:
         """Query the vector storage and retrieve top_k results."""
 
     @abstractmethod
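For third-party storage backends, the widened abstract signature means implementations should at least accept the optional `ids` argument, even if they cannot filter by it. A hedged sketch (the class and the `_similarity_search` helper are hypothetical; only the signature comes from this diff):

```python
from typing import Any

from lightrag.base import BaseVectorStorage


class InMemoryVectorStorage(BaseVectorStorage):
    async def query(
        self, query: str, top_k: int, ids: list[str] | None = None
    ) -> list[dict[str, Any]]:
        # Hypothetical nearest-neighbour helper; real backends do their own search here.
        results = await self._similarity_search(query, top_k * 2)
        if ids is not None:
            # Backends without native id filtering can post-filter (or simply ignore ids).
            results = [r for r in results if r.get("id") in ids]
        return results[:top_k]
```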
lightrag/kg/postgres_impl.py CHANGED
@@ -438,6 +438,8 @@ class PGVectorStorage(BaseVectorStorage):
             "entity_name": item["entity_name"],
             "content": item["content"],
             "content_vector": json.dumps(item["__vector__"].tolist()),
+            "chunk_id": item["source_id"],
+            # TODO: add document_id
         }
         return upsert_sql, data
 
@@ -450,6 +452,8 @@ class PGVectorStorage(BaseVectorStorage):
             "target_id": item["tgt_id"],
             "content": item["content"],
             "content_vector": json.dumps(item["__vector__"].tolist()),
+            "chunk_id": item["source_id"],
+            # TODO: add document_id
         }
         return upsert_sql, data
 
@@ -492,13 +496,20 @@ class PGVectorStorage(BaseVectorStorage):
         await self.db.execute(upsert_sql, data)
 
     #################### query method ###############
-    async def query(self, query: str, top_k: int) -> list[dict[str, Any]]:
+    async def query(
+        self, query: str, top_k: int, ids: list[str] | None = None
+    ) -> list[dict[str, Any]]:
         embeddings = await self.embedding_func([query])
         embedding = embeddings[0]
         embedding_string = ",".join(map(str, embedding))
 
+        if ids:
+            formatted_ids = ",".join(f"'{id}'" for id in ids)
+        else:
+            formatted_ids = "NULL"
+
         sql = SQL_TEMPLATES[self.base_namespace].format(
-            embedding_string=embedding_string
+            embedding_string=embedding_string, doc_ids=formatted_ids
         )
         params = {
             "workspace": self.db.workspace,
@@ -1491,6 +1502,7 @@ TABLES = {
                     content_vector VECTOR,
                     create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                     update_time TIMESTAMP,
+                    chunk_id VARCHAR(255) NULL,
                     CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id)
                     )"""
     },
@@ -1504,6 +1516,7 @@ TABLES = {
                    content_vector VECTOR,
                    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    update_time TIMESTAMP,
+                   chunk_id VARCHAR(255) NULL,
                    CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id)
                    )"""
    },
@@ -1586,8 +1599,9 @@ SQL_TEMPLATES = {
                                       content_vector=EXCLUDED.content_vector,
                                       update_time = CURRENT_TIMESTAMP
                                      """,
-    "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content, content_vector)
-                        VALUES ($1, $2, $3, $4, $5)
+    "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
+                        content_vector, chunk_id)
+                        VALUES ($1, $2, $3, $4, $5, $6)
                         ON CONFLICT (workspace,id) DO UPDATE
                         SET entity_name=EXCLUDED.entity_name,
                             content=EXCLUDED.content,
@@ -1595,8 +1609,8 @@ SQL_TEMPLATES = {
                            update_time=CURRENT_TIMESTAMP
                        """,
    "upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id,
-                      target_id, content, content_vector)
-                      VALUES ($1, $2, $3, $4, $5, $6)
+                      target_id, content, content_vector, chunk_id)
+                      VALUES ($1, $2, $3, $4, $5, $6, $7)
                       ON CONFLICT (workspace,id) DO UPDATE
                       SET source_id=EXCLUDED.source_id,
                           target_id=EXCLUDED.target_id,
@@ -1604,21 +1618,21 @@ SQL_TEMPLATES = {
                          content_vector=EXCLUDED.content_vector, update_time = CURRENT_TIMESTAMP
                      """,
    # SQL for VectorStorage
-    "entities": """SELECT entity_name FROM
-        (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
-        FROM LIGHTRAG_VDB_ENTITY where workspace=$1)
-        WHERE distance>$2 ORDER BY distance DESC LIMIT $3
-    """,
-    "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM
-        (SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
-        FROM LIGHTRAG_VDB_RELATION where workspace=$1)
-        WHERE distance>$2 ORDER BY distance DESC LIMIT $3
-    """,
-    "chunks": """SELECT id FROM
-        (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
-        FROM LIGHTRAG_DOC_CHUNKS where workspace=$1)
-        WHERE distance>$2 ORDER BY distance DESC LIMIT $3
-    """,
+    # "entities": """SELECT entity_name FROM
+    #     (SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+    #     FROM LIGHTRAG_VDB_ENTITY where workspace=$1)
+    #     WHERE distance>$2 ORDER BY distance DESC LIMIT $3
+    # """,
+    # "relationships": """SELECT source_id as src_id, target_id as tgt_id FROM
+    #     (SELECT id, source_id,target_id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+    #     FROM LIGHTRAG_VDB_RELATION where workspace=$1)
+    #     WHERE distance>$2 ORDER BY distance DESC LIMIT $3
+    # """,
+    # "chunks": """SELECT id FROM
+    #     (SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+    #     FROM LIGHTRAG_DOC_CHUNKS where workspace=$1)
+    #     WHERE distance>$2 ORDER BY distance DESC LIMIT $3
+    # """,
    # DROP tables
    "drop_all": """
        DROP TABLE IF EXISTS LIGHTRAG_DOC_FULL CASCADE;
@@ -1642,4 +1656,55 @@ SQL_TEMPLATES = {
    "drop_vdb_relation": """
        DROP TABLE IF EXISTS LIGHTRAG_VDB_RELATION CASCADE;
       """,
+    "relationships": """
+    WITH relevant_chunks AS (
+        SELECT id as chunk_id
+        FROM LIGHTRAG_DOC_CHUNKS
+        WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
+    )
+    SELECT source_id as src_id, target_id as tgt_id
+    FROM (
+        SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
+        FROM LIGHTRAG_VDB_RELATION r
+        WHERE r.workspace=$1
+        AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
+    ) filtered
+    WHERE distance>$2
+    ORDER BY distance DESC
+    LIMIT $3
+    """,
+    "entities": """
+        WITH relevant_chunks AS (
+            SELECT id as chunk_id
+            FROM LIGHTRAG_DOC_CHUNKS
+            WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
+        )
+        SELECT entity_name FROM
+            (
+                SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+                FROM LIGHTRAG_VDB_ENTITY
+                where workspace=$1
+                AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
+            )
+        WHERE distance>$2
+        ORDER BY distance DESC
+        LIMIT $3
+    """,
+    "chunks": """
+        WITH relevant_chunks AS (
+            SELECT id as chunk_id
+            FROM LIGHTRAG_DOC_CHUNKS
+            WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
+        )
+        SELECT id FROM
+            (
+                SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
+                FROM LIGHTRAG_DOC_CHUNKS
+                where workspace=$1
+                AND id IN (SELECT chunk_id FROM relevant_chunks)
+            )
+        WHERE distance>$2
+        ORDER BY distance DESC
+        LIMIT $3
+    """,
 }
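The new templates are filled with plain string formatting rather than bound parameters, so the shape of `formatted_ids` matters. A small sketch of what the `query()` change above produces (values are illustrative):

```python
ids = ["doc-1", "doc-2"]
formatted_ids = ",".join(f"'{id}'" for id in ids) if ids else "NULL"
print(formatted_ids)  # 'doc-1','doc-2'

fragment = "full_doc_id = ANY(ARRAY[{doc_ids}])"
print(fragment.format(doc_ids=formatted_ids))  # full_doc_id = ANY(ARRAY['doc-1','doc-2'])
# With no ids, doc_ids renders as NULL, so the "{doc_ids} IS NULL" branch of the
# WHERE clause is true and the document filter is effectively disabled.
```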
lightrag/lightrag.py CHANGED
@@ -30,11 +30,10 @@ from .namespace import NameSpace, make_namespace
 from .operate import (
     chunking_by_token_size,
     extract_entities,
-    extract_keywords_only,
     kg_query,
-    kg_query_with_keywords,
     mix_kg_vector_query,
     naive_query,
+    query_with_keywords,
 )
 from .prompt import GRAPH_FIELD_SEP, PROMPTS
 from .utils import (
@@ -45,6 +44,9 @@ from .utils import (
     encode_string_by_tiktoken,
     lazy_external_import,
     limit_async_func_call,
+    get_content_summary,
+    clean_text,
+    check_storage_env_vars,
     logger,
 )
 from .types import KnowledgeGraph
@@ -309,7 +311,7 @@ class LightRAG:
             # Verify storage implementation compatibility
             verify_storage_implementation(storage_type, storage_name)
             # Check environment variables
-            # self.check_storage_env_vars(storage_name)
+            check_storage_env_vars(storage_name)
 
         # Ensure vector_db_storage_cls_kwargs has required fields
         self.vector_db_storage_cls_kwargs = {
@@ -536,11 +538,6 @@ class LightRAG:
         storage_class = lazy_external_import(import_path, storage_name)
         return storage_class
 
-    @staticmethod
-    def clean_text(text: str) -> str:
-        """Clean text by removing null bytes (0x00) and whitespace"""
-        return text.strip().replace("\x00", "")
-
     def insert(
         self,
         input: str | list[str],
@@ -602,8 +599,8 @@ class LightRAG:
         update_storage = False
         try:
             # Clean input texts
-            full_text = self.clean_text(full_text)
-            text_chunks = [self.clean_text(chunk) for chunk in text_chunks]
+            full_text = clean_text(full_text)
+            text_chunks = [clean_text(chunk) for chunk in text_chunks]
 
             # Process cleaned texts
             if doc_id is None:
@@ -682,7 +679,7 @@ class LightRAG:
             contents = {id_: doc for id_, doc in zip(ids, input)}
         else:
             # Clean input text and remove duplicates
-            input = list(set(self.clean_text(doc) for doc in input))
+            input = list(set(clean_text(doc) for doc in input))
             # Generate contents dict of MD5 hash IDs and documents
             contents = {compute_mdhash_id(doc, prefix="doc-"): doc for doc in input}
 
@@ -698,7 +695,7 @@ class LightRAG:
         new_docs: dict[str, Any] = {
             id_: {
                 "content": content,
-                "content_summary": self._get_content_summary(content),
+                "content_summary": get_content_summary(content),
                 "content_length": len(content),
                 "status": DocStatus.PENDING,
                 "created_at": datetime.now().isoformat(),
@@ -1063,7 +1060,7 @@ class LightRAG:
             all_chunks_data: dict[str, dict[str, str]] = {}
             chunk_to_source_map: dict[str, str] = {}
             for chunk_data in custom_kg.get("chunks", []):
-                chunk_content = self.clean_text(chunk_data["content"])
+                chunk_content = clean_text(chunk_data["content"])
                 source_id = chunk_data["source_id"]
                 tokens = len(
                     encode_string_by_tiktoken(
@@ -1296,8 +1293,17 @@ class LightRAG:
         self, query: str, prompt: str, param: QueryParam = QueryParam()
     ):
         """
-        1. Extract keywords from the 'query' using new function in operate.py.
-        2. Then run the standard aquery() flow with the final prompt (formatted_question).
+        Query with separate keyword extraction step.
+
+        This method extracts keywords from the query first, then uses them for the query.
+
+        Args:
+            query: User query
+            prompt: Additional prompt for the query
+            param: Query parameters
+
+        Returns:
+            Query response
         """
         loop = always_get_an_event_loop()
         return loop.run_until_complete(
@@ -1308,66 +1314,29 @@ class LightRAG:
         self, query: str, prompt: str, param: QueryParam = QueryParam()
     ) -> str | AsyncIterator[str]:
         """
-        1. Calls extract_keywords_only to get HL/LL keywords from 'query'.
-        2. Then calls kg_query(...) or naive_query(...), etc. as the main query, while also injecting the newly extracted keywords if needed.
+        Async version of query_with_separate_keyword_extraction.
+
+        Args:
+            query: User query
+            prompt: Additional prompt for the query
+            param: Query parameters
+
+        Returns:
+            Query response or async iterator
         """
-        # ---------------------
-        # STEP 1: Keyword Extraction
-        # ---------------------
-        hl_keywords, ll_keywords = await extract_keywords_only(
-            text=query,
+        response = await query_with_keywords(
+            query=query,
+            prompt=prompt,
             param=param,
+            knowledge_graph_inst=self.chunk_entity_relation_graph,
+            entities_vdb=self.entities_vdb,
+            relationships_vdb=self.relationships_vdb,
+            chunks_vdb=self.chunks_vdb,
+            text_chunks_db=self.text_chunks,
             global_config=asdict(self),
-            hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
+            hashing_kv=self.llm_response_cache,
         )
 
-        param.hl_keywords = hl_keywords
-        param.ll_keywords = ll_keywords
-
-        # ---------------------
-        # STEP 2: Final Query Logic
-        # ---------------------
-
-        # Create a new string with the prompt and the keywords
-        ll_keywords_str = ", ".join(ll_keywords)
-        hl_keywords_str = ", ".join(hl_keywords)
-        formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}"
-
-        if param.mode in ["local", "global", "hybrid"]:
-            response = await kg_query_with_keywords(
-                formatted_question,
-                self.chunk_entity_relation_graph,
-                self.entities_vdb,
-                self.relationships_vdb,
-                self.text_chunks,
-                param,
-                asdict(self),
-                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
-            )
-        elif param.mode == "naive":
-            response = await naive_query(
-                formatted_question,
-                self.chunks_vdb,
-                self.text_chunks,
-                param,
-                asdict(self),
-                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
-            )
-        elif param.mode == "mix":
-            response = await mix_kg_vector_query(
-                formatted_question,
-                self.chunk_entity_relation_graph,
-                self.entities_vdb,
-                self.relationships_vdb,
-                self.chunks_vdb,
-                self.text_chunks,
-                param,
-                asdict(self),
-                hashing_kv=self.llm_response_cache,  # Directly use llm_response_cache
-            )
-        else:
-            raise ValueError(f"Unknown mode {param.mode}")
-
         await self._query_done()
         return response
 
@@ -1465,21 +1434,6 @@ class LightRAG:
             ]
         )
 
-    def _get_content_summary(self, content: str, max_length: int = 100) -> str:
-        """Get summary of document content
-
-        Args:
-            content: Original document content
-            max_length: Maximum length of summary
-
-        Returns:
-            Truncated content with ellipsis if needed
-        """
-        content = content.strip()
-        if len(content) <= max_length:
-            return content
-        return content[:max_length] + "..."
-
     async def get_processing_status(self) -> dict[str, int]:
         """Get current document processing status counts
 
@@ -2622,6 +2576,12 @@ class LightRAG:
 
             # 9. Delete source entities
             for entity_name in source_entities:
+                if entity_name == target_entity:
+                    logger.info(
+                        f"Skipping deletion of '{entity_name}' as it's also the target entity"
+                    )
+                    continue
+
                 # Delete entity node from knowledge graph
                 await self.chunk_entity_relation_graph.delete_node(entity_name)
 
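A hedged usage sketch of the refactored keyword-extraction path; `LightRAG` construction (LLM and embedding functions) is assumed to be configured elsewhere and the texts are illustrative:

```python
from lightrag import LightRAG, QueryParam

rag = LightRAG(working_dir="./rag_storage")  # model/embedding setup omitted

answer = rag.query_with_separate_keyword_extraction(
    query="How are payment terms defined?",
    prompt="Answer concisely and cite the relevant clauses.",
    param=QueryParam(mode="hybrid"),
)
print(answer)
```

Internally this now delegates to `operate.query_with_keywords`, which performs the keyword extraction and mode dispatch that previously lived inline in `aquery_with_separate_keyword_extraction`.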
lightrag/llm/azure_openai.py CHANGED
@@ -55,6 +55,7 @@ async def azure_openai_complete_if_cache(
 
     openai_async_client = AsyncAzureOpenAI(
         azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+        azure_deployment=model,
         api_key=os.getenv("AZURE_OPENAI_API_KEY"),
         api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
     )
@@ -136,6 +137,7 @@ async def azure_openai_embed(
 
     openai_async_client = AsyncAzureOpenAI(
         azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
+        azure_deployment=model,
         api_key=os.getenv("AZURE_OPENAI_API_KEY"),
         api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
     )
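With `azure_deployment=model`, the deployment name now comes from the `model` argument passed to these helpers, while the endpoint, key, and API version still come from the environment. A sketch of the expected environment (all values are placeholders, and the API version shown is only an example):

```python
import os

os.environ["AZURE_OPENAI_ENDPOINT"] = "https://<your-resource>.openai.azure.com"
os.environ["AZURE_OPENAI_API_KEY"] = "<your-key>"
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-02-15-preview"  # example value
```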
lightrag/operate.py CHANGED
@@ -141,18 +141,36 @@ async def _handle_single_entity_extraction(
 ):
     if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
         return None
-    # add this record as a node in the G
+
+    # Clean and validate entity name
     entity_name = clean_str(record_attributes[1]).strip('"')
     if not entity_name.strip():
+        logger.warning(
+            f"Entity extraction error: empty entity name in: {record_attributes}"
+        )
         return None
+
+    # Clean and validate entity type
     entity_type = clean_str(record_attributes[2]).strip('"')
+    if not entity_type.strip() or entity_type.startswith('("'):
+        logger.warning(
+            f"Entity extraction error: invalid entity type in: {record_attributes}"
+        )
+        return None
+
+    # Clean and validate description
     entity_description = clean_str(record_attributes[3]).strip('"')
-    entity_source_id = chunk_key
+    if not entity_description.strip():
+        logger.warning(
+            f"Entity extraction error: empty description for entity '{entity_name}' of type '{entity_type}'"
+        )
+        return None
+
     return dict(
         entity_name=entity_name,
         entity_type=entity_type,
         description=entity_description,
-        source_id=entity_source_id,
+        source_id=chunk_key,
         metadata={"created_at": time.time()},
     )
 
@@ -438,30 +456,91 @@ async def extract_entities(
         else:
             return await use_llm_func(input_text)
 
+    async def _process_extraction_result(result: str, chunk_key: str):
+        """Process a single extraction result (either initial or gleaning)
+        Args:
+            result (str): The extraction result to process
+            chunk_key (str): The chunk key for source tracking
+        Returns:
+            tuple: (nodes_dict, edges_dict) containing the extracted entities and relationships
+        """
+        maybe_nodes = defaultdict(list)
+        maybe_edges = defaultdict(list)
+
+        records = split_string_by_multi_markers(
+            result,
+            [context_base["record_delimiter"], context_base["completion_delimiter"]],
+        )
+
+        for record in records:
+            record = re.search(r"\((.*)\)", record)
+            if record is None:
+                continue
+            record = record.group(1)
+            record_attributes = split_string_by_multi_markers(
+                record, [context_base["tuple_delimiter"]]
+            )
+
+            if_entities = await _handle_single_entity_extraction(
+                record_attributes, chunk_key
+            )
+            if if_entities is not None:
+                maybe_nodes[if_entities["entity_name"]].append(if_entities)
+                continue
+
+            if_relation = await _handle_single_relationship_extraction(
+                record_attributes, chunk_key
+            )
+            if if_relation is not None:
+                maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
+                    if_relation
+                )
+
+        return maybe_nodes, maybe_edges
+
     async def _process_single_content(chunk_key_dp: tuple[str, TextChunkSchema]):
-        """ "Prpocess a single chunk
+        """Process a single chunk
         Args:
             chunk_key_dp (tuple[str, TextChunkSchema]):
-                ("chunck-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int})
+                ("chunk-xxxxxx", {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int})
         """
         nonlocal processed_chunks
         chunk_key = chunk_key_dp[0]
         chunk_dp = chunk_key_dp[1]
         content = chunk_dp["content"]
-        # hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)
+
+        # Get initial extraction
         hint_prompt = entity_extract_prompt.format(
             **context_base, input_text="{input_text}"
         ).format(**context_base, input_text=content)
 
         final_result = await _user_llm_func_with_cache(hint_prompt)
         history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
+
+        # Process initial extraction
+        maybe_nodes, maybe_edges = await _process_extraction_result(
+            final_result, chunk_key
+        )
+
+        # Process additional gleaning results
        for now_glean_index in range(entity_extract_max_gleaning):
            glean_result = await _user_llm_func_with_cache(
                continue_prompt, history_messages=history
            )
 
            history += pack_user_ass_to_openai_messages(continue_prompt, glean_result)
-            final_result += glean_result
+
+            # Process gleaning result separately
+            glean_nodes, glean_edges = await _process_extraction_result(
+                glean_result, chunk_key
+            )
+
+            # Merge results
+            for entity_name, entities in glean_nodes.items():
+                maybe_nodes[entity_name].extend(entities)
+            for edge_key, edges in glean_edges.items():
+                maybe_edges[edge_key].extend(edges)
+
            if now_glean_index == entity_extract_max_gleaning - 1:
                break
 
@@ -472,35 +551,6 @@ async def extract_entities(
            if if_loop_result != "yes":
                break
 
-        records = split_string_by_multi_markers(
-            final_result,
-            [context_base["record_delimiter"], context_base["completion_delimiter"]],
-        )
-
-        maybe_nodes = defaultdict(list)
-        maybe_edges = defaultdict(list)
-        for record in records:
-            record = re.search(r"\((.*)\)", record)
-            if record is None:
-                continue
-            record = record.group(1)
-            record_attributes = split_string_by_multi_markers(
-                record, [context_base["tuple_delimiter"]]
-            )
-            if_entities = await _handle_single_entity_extraction(
-                record_attributes, chunk_key
-            )
-            if if_entities is not None:
-                maybe_nodes[if_entities["entity_name"]].append(if_entities)
-                continue
-
-            if_relation = await _handle_single_relationship_extraction(
-                record_attributes, chunk_key
-            )
-            if if_relation is not None:
-                maybe_edges[(if_relation["src_id"], if_relation["tgt_id"])].append(
-                    if_relation
-                )
-
        processed_chunks += 1
        entities_count = len(maybe_nodes)
        relations_count = len(maybe_edges)
@@ -912,7 +962,10 @@ async def mix_kg_vector_query(
    try:
        # Reduce top_k for vector search in hybrid mode since we have structured information from KG
        mix_topk = min(10, query_param.top_k)
-        results = await chunks_vdb.query(augmented_query, top_k=mix_topk)
+        # TODO: add ids to the query
+        results = await chunks_vdb.query(
+            augmented_query, top_k=mix_topk, ids=query_param.ids
+        )
        if not results:
            return None
 
@@ -1121,7 +1174,11 @@ async def _get_node_data(
    logger.info(
        f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
    )
-    results = await entities_vdb.query(query, top_k=query_param.top_k)
+
+    results = await entities_vdb.query(
+        query, top_k=query_param.top_k, ids=query_param.ids
+    )
+
    if not len(results):
        return "", "", ""
    # get entity information
@@ -1374,7 +1431,10 @@ async def _get_edge_data(
    logger.info(
        f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
    )
-    results = await relationships_vdb.query(keywords, top_k=query_param.top_k)
+
+    results = await relationships_vdb.query(
+        keywords, top_k=query_param.top_k, ids=query_param.ids
+    )
 
    if not len(results):
        return "", "", ""
@@ -1623,7 +1683,9 @@ async def naive_query(
    if cached_response is not None:
        return cached_response
 
-    results = await chunks_vdb.query(query, top_k=query_param.top_k)
+    results = await chunks_vdb.query(
+        query, top_k=query_param.top_k, ids=query_param.ids
+    )
    if not len(results):
        return PROMPTS["fail_response"]
 
@@ -1854,3 +1916,90 @@ async def kg_query_with_keywords(
    )
 
    return response
+
+
+async def query_with_keywords(
+    query: str,
+    prompt: str,
+    param: QueryParam,
+    knowledge_graph_inst: BaseGraphStorage,
+    entities_vdb: BaseVectorStorage,
+    relationships_vdb: BaseVectorStorage,
+    chunks_vdb: BaseVectorStorage,
+    text_chunks_db: BaseKVStorage,
+    global_config: dict[str, str],
+    hashing_kv: BaseKVStorage | None = None,
+) -> str | AsyncIterator[str]:
+    """
+    Extract keywords from the query and then use them for retrieving information.
+
+    1. Extracts high-level and low-level keywords from the query
+    2. Formats the query with the extracted keywords and prompt
+    3. Uses the appropriate query method based on param.mode
+
+    Args:
+        query: The user's query
+        prompt: Additional prompt to prepend to the query
+        param: Query parameters
+        knowledge_graph_inst: Knowledge graph storage
+        entities_vdb: Entities vector database
+        relationships_vdb: Relationships vector database
+        chunks_vdb: Document chunks vector database
+        text_chunks_db: Text chunks storage
+        global_config: Global configuration
+        hashing_kv: Cache storage
+
+    Returns:
+        Query response or async iterator
+    """
+    # Extract keywords
+    hl_keywords, ll_keywords = await extract_keywords_only(
+        text=query,
+        param=param,
+        global_config=global_config,
+        hashing_kv=hashing_kv,
+    )
+
+    param.hl_keywords = hl_keywords
+    param.ll_keywords = ll_keywords
+
+    # Create a new string with the prompt and the keywords
+    ll_keywords_str = ", ".join(ll_keywords)
+    hl_keywords_str = ", ".join(hl_keywords)
+    formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}"
+
+    # Use appropriate query method based on mode
+    if param.mode in ["local", "global", "hybrid"]:
+        return await kg_query_with_keywords(
+            formatted_question,
+            knowledge_graph_inst,
+            entities_vdb,
+            relationships_vdb,
+            text_chunks_db,
+            param,
+            global_config,
+            hashing_kv=hashing_kv,
+        )
+    elif param.mode == "naive":
+        return await naive_query(
+            formatted_question,
+            chunks_vdb,
+            text_chunks_db,
+            param,
+            global_config,
+            hashing_kv=hashing_kv,
+        )
+    elif param.mode == "mix":
+        return await mix_kg_vector_query(
+            formatted_question,
+            knowledge_graph_inst,
+            entities_vdb,
+            relationships_vdb,
+            chunks_vdb,
+            text_chunks_db,
+            param,
+            global_config,
+            hashing_kv=hashing_kv,
+        )
+    else:
+        raise ValueError(f"Unknown mode {param.mode}")
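The new `_process_extraction_result` helper expects the delimiter-separated record format the extraction prompt asks the LLM to emit. A self-contained sketch of that parsing, using assumed delimiter values ("##", "<|>", "<|COMPLETE|>") in place of the runtime `context_base` entries:

```python
import re

sample = '("entity"<|>"ACME Corp"<|>"organization"<|>"A fictional supplier.")##<|COMPLETE|>'

# Split on the (assumed) record and completion delimiters, then unpack each record.
records = [r for r in re.split(r"##|<\|COMPLETE\|>", sample) if r.strip()]
for record in records:
    match = re.search(r"\((.*)\)", record)
    if match:
        attributes = match.group(1).split("<|>")
        print(attributes)
        # ['"entity"', '"ACME Corp"', '"organization"', '"A fictional supplier."']
```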
lightrag/prompt.py CHANGED
@@ -236,7 +236,7 @@ Given the query and conversation history, list both high-level and low-level key
 ---Instructions---
 
 - Consider both the current query and relevant conversation history when extracting keywords
-- Output the keywords in JSON format
+- Output the keywords in JSON format, it will be parsed by a JSON parser, do not add any extra content in output
 - The JSON should have two keys:
   - "high_level_keywords" for overarching concepts or themes
   - "low_level_keywords" for specific entities or details
lightrag/utils.py CHANGED
@@ -890,3 +890,52 @@ def lazy_external_import(module_name: str, class_name: str) -> Callable[..., Any
         return cls(*args, **kwargs)
 
     return import_class
+
+
+def get_content_summary(content: str, max_length: int = 100) -> str:
+    """Get summary of document content
+
+    Args:
+        content: Original document content
+        max_length: Maximum length of summary
+
+    Returns:
+        Truncated content with ellipsis if needed
+    """
+    content = content.strip()
+    if len(content) <= max_length:
+        return content
+    return content[:max_length] + "..."
+
+
+def clean_text(text: str) -> str:
+    """Clean text by removing null bytes (0x00) and whitespace
+
+    Args:
+        text: Input text to clean
+
+    Returns:
+        Cleaned text
+    """
+    return text.strip().replace("\x00", "")
+
+
+def check_storage_env_vars(storage_name: str) -> None:
+    """Check if all required environment variables for storage implementation exist
+
+    Args:
+        storage_name: Storage implementation name
+
+    Raises:
+        ValueError: If required environment variables are missing
+    """
+    from lightrag.kg import STORAGE_ENV_REQUIREMENTS
+
+    required_vars = STORAGE_ENV_REQUIREMENTS.get(storage_name, [])
+    missing_vars = [var for var in required_vars if var not in os.environ]
+
+    if missing_vars:
+        raise ValueError(
+            f"Storage implementation '{storage_name}' requires the following "
+            f"environment variables: {', '.join(missing_vars)}"
+        )
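A quick usage sketch of the helpers that moved into `lightrag.utils`; the storage name passed to `check_storage_env_vars` and the missing-variable outcome depend on `STORAGE_ENV_REQUIREMENTS` and are only illustrative here:

```python
from lightrag.utils import clean_text, get_content_summary, check_storage_env_vars

print(clean_text("  contract text with a stray null byte\x00  "))  # stripped and de-nulled
print(get_content_summary("x" * 250))  # first 100 characters followed by "..."

# Raises ValueError when the backend's required environment variables are missing.
check_storage_env_vars("PGVectorStorage")
```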