LarFii committed on
Commit 5289003 · 1 Parent(s): 0bee9e3

fix linting errors

Files changed (2)
  1. lightrag/lightrag.py +92 -53
  2. lightrag/storage.py +13 -6
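The changes below are mechanical lint/format fixes: logger calls and list comprehensions that exceed the line-length limit are wrapped, single-quoted keys become double-quoted, and whitespace-only blank lines are cleaned up. A minimal sketch of the dominant pattern, using illustrative values (the literal value of GRAPH_FIELD_SEP is assumed here):

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
GRAPH_FIELD_SEP = "<SEP>"  # separator constant; value assumed for illustration

node = "ENTITY_A"
data = {"source_id": "chunk-1<SEP>chunk-2"}
new_source_id = "chunk-2"

# Before (flagged by the linter: line too long, single-quoted key):
#   logger.debug(f"Entity {node} will be updated with new source_id: {new_source_id}")
#   sources = set(data['source_id'].split(GRAPH_FIELD_SEP))

# After (the style applied throughout this commit):
logger.debug(
    f"Entity {node} will be updated with new source_id: {new_source_id}"
)
sources = set(data["source_id"].split(GRAPH_FIELD_SEP))
print(sources)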
lightrag/lightrag.py CHANGED
@@ -731,30 +731,34 @@ class LightRAG:
             if not doc_status:
                 logger.warning(f"Document {doc_id} not found")
                 return
-
+
             logger.debug(f"Starting deletion for document {doc_id}")
-
+
             # 2. Get all related chunks
-            chunks = await self.text_chunks.filter(lambda x: x.get("full_doc_id") == doc_id)
+            chunks = await self.text_chunks.filter(
+                lambda x: x.get("full_doc_id") == doc_id
+            )
             chunk_ids = list(chunks.keys())
             logger.debug(f"Found {len(chunk_ids)} chunks to delete")
-
+
             # 3. Before deleting, check the related entities and relationships for these chunks
             for chunk_id in chunk_ids:
                 # Check entities
                 entities = [
-                    dp for dp in self.entities_vdb.client_storage["data"]
+                    dp
+                    for dp in self.entities_vdb.client_storage["data"]
                     if dp.get("source_id") == chunk_id
                 ]
                 logger.debug(f"Chunk {chunk_id} has {len(entities)} related entities")
-
+
                 # Check relationships
                 relations = [
-                    dp for dp in self.relationships_vdb.client_storage["data"]
+                    dp
+                    for dp in self.relationships_vdb.client_storage["data"]
                    if dp.get("source_id") == chunk_id
                 ]
                 logger.debug(f"Chunk {chunk_id} has {len(relations)} related relations")
-
+
                 # Continue with the original deletion process...

             # 4. Delete chunks from vector database
@@ -775,31 +779,39 @@ class LightRAG:

             # Process entities
             for node, data in nodes:
-                if 'source_id' in data:
+                if "source_id" in data:
                     # Split source_id using GRAPH_FIELD_SEP
-                    sources = set(data['source_id'].split(GRAPH_FIELD_SEP))
+                    sources = set(data["source_id"].split(GRAPH_FIELD_SEP))
                     sources.difference_update(chunk_ids)
                     if not sources:
                         entities_to_delete.add(node)
-                        logger.debug(f"Entity {node} marked for deletion - no remaining sources")
+                        logger.debug(
+                            f"Entity {node} marked for deletion - no remaining sources"
+                        )
                     else:
                         new_source_id = GRAPH_FIELD_SEP.join(sources)
                         entities_to_update[node] = new_source_id
-                        logger.debug(f"Entity {node} will be updated with new source_id: {new_source_id}")
+                        logger.debug(
+                            f"Entity {node} will be updated with new source_id: {new_source_id}"
+                        )

             # Process relationships
             for src, tgt, data in edges:
-                if 'source_id' in data:
+                if "source_id" in data:
                     # Split source_id using GRAPH_FIELD_SEP
-                    sources = set(data['source_id'].split(GRAPH_FIELD_SEP))
+                    sources = set(data["source_id"].split(GRAPH_FIELD_SEP))
                     sources.difference_update(chunk_ids)
                     if not sources:
                         relationships_to_delete.add((src, tgt))
-                        logger.debug(f"Relationship {src}-{tgt} marked for deletion - no remaining sources")
+                        logger.debug(
+                            f"Relationship {src}-{tgt} marked for deletion - no remaining sources"
+                        )
                     else:
                         new_source_id = GRAPH_FIELD_SEP.join(sources)
                         relationships_to_update[(src, tgt)] = new_source_id
-                        logger.debug(f"Relationship {src}-{tgt} will be updated with new source_id: {new_source_id}")
+                        logger.debug(
+                            f"Relationship {src}-{tgt} will be updated with new source_id: {new_source_id}"
+                        )

             # Delete entities
             if entities_to_delete:
@@ -812,9 +824,11 @@ class LightRAG:
             # Update entities
             for entity, new_source_id in entities_to_update.items():
                 node_data = self.chunk_entity_relation_graph._graph.nodes[entity]
-                node_data['source_id'] = new_source_id
+                node_data["source_id"] = new_source_id
                 await self.chunk_entity_relation_graph.upsert_node(entity, node_data)
-                logger.debug(f"Updated entity {entity} with new source_id: {new_source_id}")
+                logger.debug(
+                    f"Updated entity {entity} with new source_id: {new_source_id}"
+                )

             # Delete relationships
             if relationships_to_delete:
@@ -823,15 +837,21 @@ class LightRAG:
                     rel_id_1 = compute_mdhash_id(tgt + src, prefix="rel-")
                     await self.relationships_vdb.delete([rel_id_0, rel_id_1])
                     logger.debug(f"Deleted relationship {src}-{tgt} from vector DB")
-                self.chunk_entity_relation_graph.remove_edges(list(relationships_to_delete))
-                logger.debug(f"Deleted {len(relationships_to_delete)} relationships from graph")
+                self.chunk_entity_relation_graph.remove_edges(
+                    list(relationships_to_delete)
+                )
+                logger.debug(
+                    f"Deleted {len(relationships_to_delete)} relationships from graph"
+                )

             # Update relationships
             for (src, tgt), new_source_id in relationships_to_update.items():
                 edge_data = self.chunk_entity_relation_graph._graph.edges[src, tgt]
-                edge_data['source_id'] = new_source_id
+                edge_data["source_id"] = new_source_id
                 await self.chunk_entity_relation_graph.upsert_edge(src, tgt, edge_data)
-                logger.debug(f"Updated relationship {src}-{tgt} with new source_id: {new_source_id}")
+                logger.debug(
+                    f"Updated relationship {src}-{tgt} with new source_id: {new_source_id}"
+                )

             # 6. Delete original document and status
             await self.full_docs.delete([doc_id])
@@ -851,31 +871,39 @@ class LightRAG:
                 # Verify if the document has been deleted
                 if await self.full_docs.get_by_id(doc_id):
                     logger.error(f"Document {doc_id} still exists in full_docs")
-
+
                 # Verify if chunks have been deleted
                 remaining_chunks = await self.text_chunks.filter(
                     lambda x: x.get("full_doc_id") == doc_id
                 )
                 if remaining_chunks:
                     logger.error(f"Found {len(remaining_chunks)} remaining chunks")
-
+
                 # Verify entities and relationships
                 for chunk_id in chunk_ids:
                     # Check entities
                     entities_with_chunk = [
-                        dp for dp in self.entities_vdb.client_storage["data"]
-                        if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
+                        dp
+                        for dp in self.entities_vdb.client_storage["data"]
+                        if chunk_id
+                        in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
                     ]
                     if entities_with_chunk:
-                        logger.error(f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}")
-
+                        logger.error(
+                            f"Found {len(entities_with_chunk)} entities still referencing chunk {chunk_id}"
+                        )
+
                     # Check relationships
                     relations_with_chunk = [
-                        dp for dp in self.relationships_vdb.client_storage["data"]
-                        if chunk_id in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
+                        dp
+                        for dp in self.relationships_vdb.client_storage["data"]
+                        if chunk_id
+                        in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
                     ]
                     if relations_with_chunk:
-                        logger.error(f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}")
+                        logger.error(
+                            f"Found {len(relations_with_chunk)} relations still referencing chunk {chunk_id}"
+                        )

             await verify_deletion()

@@ -886,13 +914,15 @@ class LightRAG:
         """Synchronous version of adelete"""
         return asyncio.run(self.adelete_by_doc_id(doc_id))

-    async def get_entity_info(self, entity_name: str, include_vector_data: bool = False):
+    async def get_entity_info(
+        self, entity_name: str, include_vector_data: bool = False
+    ):
         """Get detailed information of an entity
-
+
         Args:
             entity_name: Entity name (no need for quotes)
             include_vector_data: Whether to include data from the vector database
-
+
         Returns:
             dict: A dictionary containing entity information, including:
                 - entity_name: Entity name
@@ -901,47 +931,50 @@ class LightRAG:
                 - vector_data: (optional) Data from the vector database
         """
         entity_name = f'"{entity_name.upper()}"'
-
+
         # Get information from the graph
         node_data = await self.chunk_entity_relation_graph.get_node(entity_name)
-        source_id = node_data.get('source_id') if node_data else None
-
+        source_id = node_data.get("source_id") if node_data else None
+
         result = {
             "entity_name": entity_name,
             "source_id": source_id,
             "graph_data": node_data,
         }
-
+
         # Optional: Get vector database information
         if include_vector_data:
             entity_id = compute_mdhash_id(entity_name, prefix="ent-")
             vector_data = self.entities_vdb._client.get([entity_id])
             result["vector_data"] = vector_data[0] if vector_data else None
-
+
         return result

     def get_entity_info_sync(self, entity_name: str, include_vector_data: bool = False):
         """Synchronous version of getting entity information
-
+
         Args:
             entity_name: Entity name (no need for quotes)
             include_vector_data: Whether to include data from the vector database
         """
         try:
             import tracemalloc
+
             tracemalloc.start()
             return asyncio.run(self.get_entity_info(entity_name, include_vector_data))
         finally:
             tracemalloc.stop()

-    async def get_relation_info(self, src_entity: str, tgt_entity: str, include_vector_data: bool = False):
+    async def get_relation_info(
+        self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
+    ):
         """Get detailed information of a relationship
-
+
         Args:
             src_entity: Source entity name (no need for quotes)
             tgt_entity: Target entity name (no need for quotes)
             include_vector_data: Whether to include data from the vector database
-
+
         Returns:
             dict: A dictionary containing relationship information, including:
                 - src_entity: Source entity name
@@ -952,29 +985,33 @@ class LightRAG:
         """
         src_entity = f'"{src_entity.upper()}"'
         tgt_entity = f'"{tgt_entity.upper()}"'
-
+
         # Get information from the graph
-        edge_data = await self.chunk_entity_relation_graph.get_edge(src_entity, tgt_entity)
-        source_id = edge_data.get('source_id') if edge_data else None
-
+        edge_data = await self.chunk_entity_relation_graph.get_edge(
+            src_entity, tgt_entity
+        )
+        source_id = edge_data.get("source_id") if edge_data else None
+
         result = {
             "src_entity": src_entity,
             "tgt_entity": tgt_entity,
             "source_id": source_id,
             "graph_data": edge_data,
         }
-
+
         # Optional: Get vector database information
         if include_vector_data:
             rel_id = compute_mdhash_id(src_entity + tgt_entity, prefix="rel-")
             vector_data = self.relationships_vdb._client.get([rel_id])
             result["vector_data"] = vector_data[0] if vector_data else None
-
+
         return result

-    def get_relation_info_sync(self, src_entity: str, tgt_entity: str, include_vector_data: bool = False):
+    def get_relation_info_sync(
+        self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
+    ):
         """Synchronous version of getting relationship information
-
+
         Args:
             src_entity: Source entity name (no need for quotes)
             tgt_entity: Target entity name (no need for quotes)
@@ -982,8 +1019,10 @@ class LightRAG:
         """
         try:
             import tracemalloc
+
             tracemalloc.start()
-            return asyncio.run(self.get_relation_info(src_entity, tgt_entity, include_vector_data))
+            return asyncio.run(
+                self.get_relation_info(src_entity, tgt_entity, include_vector_data)
+            )
         finally:
             tracemalloc.stop()
-
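For orientation, a hedged usage sketch of the LightRAG methods touched above; the constructor argument, entity names, and the document id are illustrative placeholders, not values taken from this commit:

import asyncio
from lightrag import LightRAG

rag = LightRAG(working_dir="./rag_storage")  # assumed minimal setup

# Delete a document together with its chunks and any graph entries
# whose only remaining source was one of those chunks.
asyncio.run(rag.adelete_by_doc_id("doc-xxxxxxxx"))  # placeholder doc id

# Inspect an entity / relationship; names are upper-cased and quoted internally,
# so plain names can be passed in.
entity_info = rag.get_entity_info_sync("Tesla", include_vector_data=True)
relation_info = rag.get_relation_info_sync("Tesla", "Elon Musk")
print(entity_info["source_id"], relation_info["graph_data"])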
lightrag/storage.py CHANGED
@@ -188,14 +188,18 @@ class NanoVectorDBStorage(BaseVectorStorage):
         """
         try:
             self._client.delete(ids)
-            logger.info(f"Successfully deleted {len(ids)} vectors from {self.namespace}")
+            logger.info(
+                f"Successfully deleted {len(ids)} vectors from {self.namespace}"
+            )
         except Exception as e:
             logger.error(f"Error while deleting vectors from {self.namespace}: {e}")

     async def delete_entity(self, entity_name: str):
         try:
             entity_id = compute_mdhash_id(entity_name, prefix="ent-")
-            logger.debug(f"Attempting to delete entity {entity_name} with ID {entity_id}")
+            logger.debug(
+                f"Attempting to delete entity {entity_name} with ID {entity_id}"
+            )
             # Check if the entity exists
             if self._client.get([entity_id]):
                 await self.delete([entity_id])
@@ -208,15 +212,18 @@ class NanoVectorDBStorage(BaseVectorStorage):
     async def delete_entity_relation(self, entity_name: str):
         try:
             relations = [
-                dp for dp in self.client_storage["data"]
+                dp
+                for dp in self.client_storage["data"]
                 if dp["src_id"] == entity_name or dp["tgt_id"] == entity_name
             ]
             logger.debug(f"Found {len(relations)} relations for entity {entity_name}")
             ids_to_delete = [relation["__id__"] for relation in relations]
-
+
             if ids_to_delete:
                 await self.delete(ids_to_delete)
-                logger.debug(f"Deleted {len(ids_to_delete)} relations for {entity_name}")
+                logger.debug(
+                    f"Deleted {len(ids_to_delete)} relations for {entity_name}"
+                )
             else:
                 logger.debug(f"No relations found for entity {entity_name}")
         except Exception as e:
@@ -446,4 +453,4 @@ class JsonDocStatusStorage(DocStatusStorage):
         """Delete document status by IDs"""
         for doc_id in doc_ids:
             self._data.pop(doc_id, None)
-        await self.index_done_callback()
+        await self.index_done_callback()
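The NanoVectorDBStorage helpers above are normally driven by LightRAG's own deletion path; below is a small sketch of calling them through an existing instance. The quoted, upper-cased name format mirrors what lightrag.py uses and is assumed to be what the vector stores expect:

import asyncio

async def drop_entity(rag, name: str) -> None:
    entity_name = f'"{name.upper()}"'  # same normalization as in lightrag.py above
    # Remove the entity vector, then any relation vectors that reference it.
    await rag.entities_vdb.delete_entity(entity_name)
    await rag.relationships_vdb.delete_entity_relation(entity_name)

# asyncio.run(drop_entity(rag, "Tesla"))  # 'rag' is a LightRAG instance as in the earlier sketch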