Merge pull request #1337 from danielaskdd/main
Browse files
Only merge new entities/edges during gleaning stage
- lightrag/lightrag.py +7 -0
- lightrag/operate.py +10 -4
lightrag/lightrag.py
CHANGED
@@ -902,6 +902,13 @@ class LightRAG:
|
|
902 |
# Get file path from status document
|
903 |
file_path = getattr(status_doc, "file_path", "unknown_source")
|
904 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
905 |
# Generate chunks from document
|
906 |
chunks: dict[str, Any] = {
|
907 |
compute_mdhash_id(dp["content"], prefix="chunk-"): {
|
|
|
902 |
# Get file path from status document
|
903 |
file_path = getattr(status_doc, "file_path", "unknown_source")
|
904 |
|
905 |
+
async with pipeline_status_lock:
|
906 |
+
log_message = f"Processing file: {file_path}"
|
907 |
+
pipeline_status["history_messages"].append(log_message)
|
908 |
+
log_message = f"Processing d-id: {doc_id}"
|
909 |
+
pipeline_status["latest_message"] = log_message
|
910 |
+
pipeline_status["history_messages"].append(log_message)
|
911 |
+
|
912 |
# Generate chunks from document
|
913 |
chunks: dict[str, Any] = {
|
914 |
compute_mdhash_id(dp["content"], prefix="chunk-"): {
|
lightrag/operate.py
CHANGED
@@ -613,11 +613,17 @@ async def extract_entities(
|
|
613 |
glean_result, chunk_key, file_path
|
614 |
)
|
615 |
|
616 |
-
# Merge results
|
617 |
for entity_name, entities in glean_nodes.items():
|
618 |
-
|
|
|
|
|
|
|
619 |
for edge_key, edges in glean_edges.items():
|
620 |
-
|
|
|
|
|
|
|
621 |
|
622 |
if now_glean_index == entity_extract_max_gleaning - 1:
|
623 |
break
|
@@ -636,7 +642,7 @@ async def extract_entities(
|
|
636 |
processed_chunks += 1
|
637 |
entities_count = len(maybe_nodes)
|
638 |
relations_count = len(maybe_edges)
|
639 |
-
log_message = f"
|
640 |
logger.info(log_message)
|
641 |
if pipeline_status is not None:
|
642 |
async with pipeline_status_lock:
|
|
|
613 |
glean_result, chunk_key, file_path
|
614 |
)
|
615 |
|
616 |
+
# Merge results - only add entities and edges with new names
|
617 |
for entity_name, entities in glean_nodes.items():
|
618 |
+
if (
|
619 |
+
entity_name not in maybe_nodes
|
620 |
+
): # Only accept entities with new name in gleaning stage
|
621 |
+
maybe_nodes[entity_name].extend(entities)
|
622 |
for edge_key, edges in glean_edges.items():
|
623 |
+
if (
|
624 |
+
edge_key not in maybe_edges
|
625 |
+
): # Only accept edges with new name in gleaning stage
|
626 |
+
maybe_edges[edge_key].extend(edges)
|
627 |
|
628 |
if now_glean_index == entity_extract_max_gleaning - 1:
|
629 |
break
|
|
|
642 |
processed_chunks += 1
|
643 |
entities_count = len(maybe_nodes)
|
644 |
relations_count = len(maybe_edges)
|
645 |
+
log_message = f"Chk {processed_chunks}/{total_chunks}: extracted {entities_count} Ent + {relations_count} Rel (deduplicated)"
|
646 |
logger.info(log_message)
|
647 |
if pipeline_status is not None:
|
648 |
async with pipeline_status_lock:
|