Larfii
committed on
Commit
·
d3c3d88
1
Parent(s):
31f9f35
Add a progress bar
Browse files
- lightrag/lightrag.py +4 -1
- lightrag/operate.py +42 -17
- lightrag/storage.py +11 -3
lightrag/lightrag.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import asyncio
|
2 |
import os
|
|
|
3 |
from dataclasses import asdict, dataclass, field
|
4 |
from datetime import datetime
|
5 |
from functools import partial
|
@@ -243,7 +244,9 @@ class LightRAG:
|
|
243 |
logger.info(f"[New Docs] inserting {len(new_docs)} docs")
|
244 |
|
245 |
inserting_chunks = {}
|
246 |
-
for doc_key, doc in
|
|
|
|
|
247 |
chunks = {
|
248 |
compute_mdhash_id(dp["content"], prefix="chunk-"): {
|
249 |
**dp,
|
|
|
1 |
import asyncio
|
2 |
import os
|
3 |
+
from tqdm.asyncio import tqdm as tqdm_async
|
4 |
from dataclasses import asdict, dataclass, field
|
5 |
from datetime import datetime
|
6 |
from functools import partial
|
|
|
244 |
logger.info(f"[New Docs] inserting {len(new_docs)} docs")
|
245 |
|
246 |
inserting_chunks = {}
|
247 |
+
for doc_key, doc in tqdm_async(
|
248 |
+
new_docs.items(), desc="Chunking documents", unit="doc"
|
249 |
+
):
|
250 |
chunks = {
|
251 |
compute_mdhash_id(dp["content"], prefix="chunk-"): {
|
252 |
**dp,
|
lightrag/operate.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
import re
|
|
|
4 |
from typing import Union
|
5 |
from collections import Counter, defaultdict
|
6 |
import warnings
|
@@ -329,11 +330,15 @@ async def extract_entities(
|
|
329 |
)
|
330 |
return dict(maybe_nodes), dict(maybe_edges)
|
331 |
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
|
|
|
|
|
|
|
|
337 |
maybe_nodes = defaultdict(list)
|
338 |
maybe_edges = defaultdict(list)
|
339 |
for m_nodes, m_edges in results:
|
@@ -341,18 +346,38 @@ async def extract_entities(
|
|
341 |
maybe_nodes[k].extend(v)
|
342 |
for k, v in m_edges.items():
|
343 |
maybe_edges[tuple(sorted(k))].extend(v)
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
if not len(all_entities_data):
|
357 |
logger.warning("Didn't extract any entities, maybe your LLM is not working")
|
358 |
return None
|
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
import re
|
4 |
+
from tqdm.asyncio import tqdm as tqdm_async
|
5 |
from typing import Union
|
6 |
from collections import Counter, defaultdict
|
7 |
import warnings
|
|
|
330 |
)
|
331 |
return dict(maybe_nodes), dict(maybe_edges)
|
332 |
|
333 |
+
results = []
|
334 |
+
for result in tqdm_async(
|
335 |
+
asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]),
|
336 |
+
total=len(ordered_chunks),
|
337 |
+
desc="Extracting entities from chunks",
|
338 |
+
unit="chunk",
|
339 |
+
):
|
340 |
+
results.append(await result)
|
341 |
+
|
342 |
maybe_nodes = defaultdict(list)
|
343 |
maybe_edges = defaultdict(list)
|
344 |
for m_nodes, m_edges in results:
|
|
|
346 |
maybe_nodes[k].extend(v)
|
347 |
for k, v in m_edges.items():
|
348 |
maybe_edges[tuple(sorted(k))].extend(v)
|
349 |
+
logger.info("Inserting entities into storage...")
|
350 |
+
all_entities_data = []
|
351 |
+
for result in tqdm_async(
|
352 |
+
asyncio.as_completed(
|
353 |
+
[
|
354 |
+
_merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
|
355 |
+
for k, v in maybe_nodes.items()
|
356 |
+
]
|
357 |
+
),
|
358 |
+
total=len(maybe_nodes),
|
359 |
+
desc="Inserting entities",
|
360 |
+
unit="entity",
|
361 |
+
):
|
362 |
+
all_entities_data.append(await result)
|
363 |
+
|
364 |
+
logger.info("Inserting relationships into storage...")
|
365 |
+
all_relationships_data = []
|
366 |
+
for result in tqdm_async(
|
367 |
+
asyncio.as_completed(
|
368 |
+
[
|
369 |
+
_merge_edges_then_upsert(
|
370 |
+
k[0], k[1], v, knowledge_graph_inst, global_config
|
371 |
+
)
|
372 |
+
for k, v in maybe_edges.items()
|
373 |
+
]
|
374 |
+
),
|
375 |
+
total=len(maybe_edges),
|
376 |
+
desc="Inserting relationships",
|
377 |
+
unit="relationship",
|
378 |
+
):
|
379 |
+
all_relationships_data.append(await result)
|
380 |
+
|
381 |
if not len(all_entities_data):
|
382 |
logger.warning("Didn't extract any entities, maybe your LLM is not working")
|
383 |
return None
|
lightrag/storage.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import asyncio
|
2 |
import html
|
3 |
import os
|
|
|
4 |
from dataclasses import dataclass
|
5 |
from typing import Any, Union, cast
|
6 |
import networkx as nx
|
@@ -95,9 +96,16 @@ class NanoVectorDBStorage(BaseVectorStorage):
|
|
95 |
contents[i : i + self._max_batch_size]
|
96 |
for i in range(0, len(contents), self._max_batch_size)
|
97 |
]
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
embeddings = np.concatenate(embeddings_list)
|
102 |
for i, d in enumerate(list_data):
|
103 |
d["__vector__"] = embeddings[i]
|
|
|
1 |
import asyncio
|
2 |
import html
|
3 |
import os
|
4 |
+
from tqdm.asyncio import tqdm as tqdm_async
|
5 |
from dataclasses import dataclass
|
6 |
from typing import Any, Union, cast
|
7 |
import networkx as nx
|
|
|
96 |
contents[i : i + self._max_batch_size]
|
97 |
for i in range(0, len(contents), self._max_batch_size)
|
98 |
]
|
99 |
+
embedding_tasks = [self.embedding_func(batch) for batch in batches]
|
100 |
+
embeddings_list = []
|
101 |
+
for f in tqdm_async(
|
102 |
+
asyncio.as_completed(embedding_tasks),
|
103 |
+
total=len(embedding_tasks),
|
104 |
+
desc="Generating embeddings",
|
105 |
+
unit="batch",
|
106 |
+
):
|
107 |
+
embeddings = await f
|
108 |
+
embeddings_list.append(embeddings)
|
109 |
embeddings = np.concatenate(embeddings_list)
|
110 |
for i, d in enumerate(list_data):
|
111 |
d["__vector__"] = embeddings[i]
|