Merge branch 'HKUDS:main' into main
Browse files- README.md +45 -1
- examples/insert_custom_kg.py +108 -0
- lightrag/__init__.py +1 -1
- lightrag/lightrag.py +106 -1
- lightrag/operate.py +42 -17
- lightrag/storage.py +11 -3
README.md
CHANGED
@@ -26,7 +26,8 @@ This repository hosts the code of LightRAG. The structure of this code is based
|
|
26 |
</div>
|
27 |
|
28 |
## 🎉 News
|
29 |
-
- [x] [2024.11.
|
|
|
30 |
- [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py).
|
31 |
- [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity).
|
32 |
- [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge.
|
@@ -327,6 +328,49 @@ with open("./newText.txt") as f:
|
|
327 |
rag.insert(f.read())
|
328 |
```
|
329 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
330 |
### Delete Entity
|
331 |
|
332 |
```python
|
|
|
26 |
</div>
|
27 |
|
28 |
## 🎉 News
|
29 |
+
- [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise.
|
30 |
+
- [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author.
|
31 |
- [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py).
|
32 |
- [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity).
|
33 |
- [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge.
|
|
|
328 |
rag.insert(f.read())
|
329 |
```
|
330 |
|
331 |
+
### Insert Custom KG
|
332 |
+
|
333 |
+
```python
|
334 |
+
rag = LightRAG(
|
335 |
+
working_dir=WORKING_DIR,
|
336 |
+
llm_model_func=llm_model_func,
|
337 |
+
embedding_func=EmbeddingFunc(
|
338 |
+
embedding_dim=embedding_dimension,
|
339 |
+
max_token_size=8192,
|
340 |
+
func=embedding_func,
|
341 |
+
),
|
342 |
+
)
|
343 |
+
|
344 |
+
custom_kg = {
|
345 |
+
"entities": [
|
346 |
+
{
|
347 |
+
"entity_name": "CompanyA",
|
348 |
+
"entity_type": "Organization",
|
349 |
+
"description": "A major technology company",
|
350 |
+
"source_id": "Source1"
|
351 |
+
},
|
352 |
+
{
|
353 |
+
"entity_name": "ProductX",
|
354 |
+
"entity_type": "Product",
|
355 |
+
"description": "A popular product developed by CompanyA",
|
356 |
+
"source_id": "Source1"
|
357 |
+
}
|
358 |
+
],
|
359 |
+
"relationships": [
|
360 |
+
{
|
361 |
+
"src_id": "CompanyA",
|
362 |
+
"tgt_id": "ProductX",
|
363 |
+
"description": "CompanyA develops ProductX",
|
364 |
+
"keywords": "develop, produce",
|
365 |
+
"weight": 1.0,
|
366 |
+
"source_id": "Source1"
|
367 |
+
}
|
368 |
+
]
|
369 |
+
}
|
370 |
+
|
371 |
+
rag.insert_custom_kg(custom_kg)
|
372 |
+
```
|
373 |
+
|
374 |
### Delete Entity
|
375 |
|
376 |
```python
|
examples/insert_custom_kg.py
ADDED
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from lightrag import LightRAG, QueryParam
|
3 |
+
from lightrag.llm import gpt_4o_mini_complete
|
4 |
+
#########
|
5 |
+
# Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
|
6 |
+
# import nest_asyncio
|
7 |
+
# nest_asyncio.apply()
|
8 |
+
#########
|
9 |
+
|
10 |
+
WORKING_DIR = "./custom_kg"
|
11 |
+
|
12 |
+
if not os.path.exists(WORKING_DIR):
|
13 |
+
os.mkdir(WORKING_DIR)
|
14 |
+
|
15 |
+
rag = LightRAG(
|
16 |
+
working_dir=WORKING_DIR,
|
17 |
+
llm_model_func=gpt_4o_mini_complete, # Use gpt_4o_mini_complete LLM model
|
18 |
+
# llm_model_func=gpt_4o_complete # Optionally, use a stronger model
|
19 |
+
)
|
20 |
+
|
21 |
+
custom_kg = {
|
22 |
+
"entities": [
|
23 |
+
{
|
24 |
+
"entity_name": "CompanyA",
|
25 |
+
"entity_type": "Organization",
|
26 |
+
"description": "A major technology company",
|
27 |
+
"source_id": "Source1"
|
28 |
+
},
|
29 |
+
{
|
30 |
+
"entity_name": "ProductX",
|
31 |
+
"entity_type": "Product",
|
32 |
+
"description": "A popular product developed by CompanyA",
|
33 |
+
"source_id": "Source1"
|
34 |
+
},
|
35 |
+
{
|
36 |
+
"entity_name": "PersonA",
|
37 |
+
"entity_type": "Person",
|
38 |
+
"description": "A renowned researcher in AI",
|
39 |
+
"source_id": "Source2"
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"entity_name": "UniversityB",
|
43 |
+
"entity_type": "Organization",
|
44 |
+
"description": "A leading university specializing in technology and sciences",
|
45 |
+
"source_id": "Source2"
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"entity_name": "CityC",
|
49 |
+
"entity_type": "Location",
|
50 |
+
"description": "A large metropolitan city known for its culture and economy",
|
51 |
+
"source_id": "Source3"
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"entity_name": "EventY",
|
55 |
+
"entity_type": "Event",
|
56 |
+
"description": "An annual technology conference held in CityC",
|
57 |
+
"source_id": "Source3"
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"entity_name": "CompanyD",
|
61 |
+
"entity_type": "Organization",
|
62 |
+
"description": "A financial services company specializing in insurance",
|
63 |
+
"source_id": "Source4"
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"entity_name": "ServiceZ",
|
67 |
+
"entity_type": "Service",
|
68 |
+
"description": "An insurance product offered by CompanyD",
|
69 |
+
"source_id": "Source4"
|
70 |
+
}
|
71 |
+
],
|
72 |
+
"relationships": [
|
73 |
+
{
|
74 |
+
"src_id": "CompanyA",
|
75 |
+
"tgt_id": "ProductX",
|
76 |
+
"description": "CompanyA develops ProductX",
|
77 |
+
"keywords": "develop, produce",
|
78 |
+
"weight": 1.0,
|
79 |
+
"source_id": "Source1"
|
80 |
+
},
|
81 |
+
{
|
82 |
+
"src_id": "PersonA",
|
83 |
+
"tgt_id": "UniversityB",
|
84 |
+
"description": "PersonA works at UniversityB",
|
85 |
+
"keywords": "employment, affiliation",
|
86 |
+
"weight": 0.9,
|
87 |
+
"source_id": "Source2"
|
88 |
+
},
|
89 |
+
{
|
90 |
+
"src_id": "CityC",
|
91 |
+
"tgt_id": "EventY",
|
92 |
+
"description": "EventY is hosted in CityC",
|
93 |
+
"keywords": "host, location",
|
94 |
+
"weight": 0.8,
|
95 |
+
"source_id": "Source3"
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"src_id": "CompanyD",
|
99 |
+
"tgt_id": "ServiceZ",
|
100 |
+
"description": "CompanyD provides ServiceZ",
|
101 |
+
"keywords": "provide, offer",
|
102 |
+
"weight": 1.0,
|
103 |
+
"source_id": "Source4"
|
104 |
+
}
|
105 |
+
]
|
106 |
+
}
|
107 |
+
|
108 |
+
rag.insert_custom_kg(custom_kg)
|
lightrag/__init__.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
|
2 |
|
3 |
-
__version__ = "1.0.
|
4 |
__author__ = "Zirui Guo"
|
5 |
__url__ = "https://github.com/HKUDS/LightRAG"
|
|
|
1 |
from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
|
2 |
|
3 |
+
__version__ = "1.0.2"
|
4 |
__author__ = "Zirui Guo"
|
5 |
__url__ = "https://github.com/HKUDS/LightRAG"
|
lightrag/lightrag.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import asyncio
|
2 |
import os
|
|
|
3 |
from dataclasses import asdict, dataclass, field
|
4 |
from datetime import datetime
|
5 |
from functools import partial
|
@@ -242,7 +243,9 @@ class LightRAG:
|
|
242 |
logger.info(f"[New Docs] inserting {len(new_docs)} docs")
|
243 |
|
244 |
inserting_chunks = {}
|
245 |
-
for doc_key, doc in
|
|
|
|
|
246 |
chunks = {
|
247 |
compute_mdhash_id(dp["content"], prefix="chunk-"): {
|
248 |
**dp,
|
@@ -304,6 +307,108 @@ class LightRAG:
|
|
304 |
tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback())
|
305 |
await asyncio.gather(*tasks)
|
306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
307 |
def query(self, query: str, param: QueryParam = QueryParam()):
|
308 |
loop = always_get_an_event_loop()
|
309 |
return loop.run_until_complete(self.aquery(query, param))
|
|
|
1 |
import asyncio
|
2 |
import os
|
3 |
+
from tqdm.asyncio import tqdm as tqdm_async
|
4 |
from dataclasses import asdict, dataclass, field
|
5 |
from datetime import datetime
|
6 |
from functools import partial
|
|
|
243 |
logger.info(f"[New Docs] inserting {len(new_docs)} docs")
|
244 |
|
245 |
inserting_chunks = {}
|
246 |
+
for doc_key, doc in tqdm_async(
|
247 |
+
new_docs.items(), desc="Chunking documents", unit="doc"
|
248 |
+
):
|
249 |
chunks = {
|
250 |
compute_mdhash_id(dp["content"], prefix="chunk-"): {
|
251 |
**dp,
|
|
|
307 |
tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback())
|
308 |
await asyncio.gather(*tasks)
|
309 |
|
310 |
+
def insert_custom_kg(self, custom_kg: dict):
|
311 |
+
loop = always_get_an_event_loop()
|
312 |
+
return loop.run_until_complete(self.ainsert_custom_kg(custom_kg))
|
313 |
+
|
314 |
+
async def ainsert_custom_kg(self, custom_kg: dict):
|
315 |
+
update_storage = False
|
316 |
+
try:
|
317 |
+
# Insert entities into knowledge graph
|
318 |
+
all_entities_data = []
|
319 |
+
for entity_data in custom_kg.get("entities", []):
|
320 |
+
entity_name = f'"{entity_data["entity_name"].upper()}"'
|
321 |
+
entity_type = entity_data.get("entity_type", "UNKNOWN")
|
322 |
+
description = entity_data.get("description", "No description provided")
|
323 |
+
source_id = entity_data["source_id"]
|
324 |
+
|
325 |
+
# Prepare node data
|
326 |
+
node_data = {
|
327 |
+
"entity_type": entity_type,
|
328 |
+
"description": description,
|
329 |
+
"source_id": source_id,
|
330 |
+
}
|
331 |
+
# Insert node data into the knowledge graph
|
332 |
+
await self.chunk_entity_relation_graph.upsert_node(
|
333 |
+
entity_name, node_data=node_data
|
334 |
+
)
|
335 |
+
node_data["entity_name"] = entity_name
|
336 |
+
all_entities_data.append(node_data)
|
337 |
+
update_storage = True
|
338 |
+
|
339 |
+
# Insert relationships into knowledge graph
|
340 |
+
all_relationships_data = []
|
341 |
+
for relationship_data in custom_kg.get("relationships", []):
|
342 |
+
src_id = f'"{relationship_data["src_id"].upper()}"'
|
343 |
+
tgt_id = f'"{relationship_data["tgt_id"].upper()}"'
|
344 |
+
description = relationship_data["description"]
|
345 |
+
keywords = relationship_data["keywords"]
|
346 |
+
weight = relationship_data.get("weight", 1.0)
|
347 |
+
source_id = relationship_data["source_id"]
|
348 |
+
|
349 |
+
# Check if nodes exist in the knowledge graph
|
350 |
+
for need_insert_id in [src_id, tgt_id]:
|
351 |
+
if not (
|
352 |
+
await self.chunk_entity_relation_graph.has_node(need_insert_id)
|
353 |
+
):
|
354 |
+
await self.chunk_entity_relation_graph.upsert_node(
|
355 |
+
need_insert_id,
|
356 |
+
node_data={
|
357 |
+
"source_id": source_id,
|
358 |
+
"description": "UNKNOWN",
|
359 |
+
"entity_type": "UNKNOWN",
|
360 |
+
},
|
361 |
+
)
|
362 |
+
|
363 |
+
# Insert edge into the knowledge graph
|
364 |
+
await self.chunk_entity_relation_graph.upsert_edge(
|
365 |
+
src_id,
|
366 |
+
tgt_id,
|
367 |
+
edge_data={
|
368 |
+
"weight": weight,
|
369 |
+
"description": description,
|
370 |
+
"keywords": keywords,
|
371 |
+
"source_id": source_id,
|
372 |
+
},
|
373 |
+
)
|
374 |
+
edge_data = {
|
375 |
+
"src_id": src_id,
|
376 |
+
"tgt_id": tgt_id,
|
377 |
+
"description": description,
|
378 |
+
"keywords": keywords,
|
379 |
+
}
|
380 |
+
all_relationships_data.append(edge_data)
|
381 |
+
update_storage = True
|
382 |
+
|
383 |
+
# Insert entities into vector storage if needed
|
384 |
+
if self.entities_vdb is not None:
|
385 |
+
data_for_vdb = {
|
386 |
+
compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
|
387 |
+
"content": dp["entity_name"] + dp["description"],
|
388 |
+
"entity_name": dp["entity_name"],
|
389 |
+
}
|
390 |
+
for dp in all_entities_data
|
391 |
+
}
|
392 |
+
await self.entities_vdb.upsert(data_for_vdb)
|
393 |
+
|
394 |
+
# Insert relationships into vector storage if needed
|
395 |
+
if self.relationships_vdb is not None:
|
396 |
+
data_for_vdb = {
|
397 |
+
compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): {
|
398 |
+
"src_id": dp["src_id"],
|
399 |
+
"tgt_id": dp["tgt_id"],
|
400 |
+
"content": dp["keywords"]
|
401 |
+
+ dp["src_id"]
|
402 |
+
+ dp["tgt_id"]
|
403 |
+
+ dp["description"],
|
404 |
+
}
|
405 |
+
for dp in all_relationships_data
|
406 |
+
}
|
407 |
+
await self.relationships_vdb.upsert(data_for_vdb)
|
408 |
+
finally:
|
409 |
+
if update_storage:
|
410 |
+
await self._insert_done()
|
411 |
+
|
412 |
def query(self, query: str, param: QueryParam = QueryParam()):
|
413 |
loop = always_get_an_event_loop()
|
414 |
return loop.run_until_complete(self.aquery(query, param))
|
lightrag/operate.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
import re
|
|
|
4 |
from typing import Union
|
5 |
from collections import Counter, defaultdict
|
6 |
import warnings
|
@@ -342,11 +343,15 @@ async def extract_entities(
|
|
342 |
)
|
343 |
return dict(maybe_nodes), dict(maybe_edges)
|
344 |
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
|
|
|
|
|
|
|
|
350 |
maybe_nodes = defaultdict(list)
|
351 |
maybe_edges = defaultdict(list)
|
352 |
for m_nodes, m_edges in results:
|
@@ -354,18 +359,38 @@ async def extract_entities(
|
|
354 |
maybe_nodes[k].extend(v)
|
355 |
for k, v in m_edges.items():
|
356 |
maybe_edges[tuple(sorted(k))].extend(v)
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
369 |
if not len(all_entities_data):
|
370 |
logger.warning("Didn't extract any entities, maybe your LLM is not working")
|
371 |
return None
|
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
import re
|
4 |
+
from tqdm.asyncio import tqdm as tqdm_async
|
5 |
from typing import Union
|
6 |
from collections import Counter, defaultdict
|
7 |
import warnings
|
|
|
343 |
)
|
344 |
return dict(maybe_nodes), dict(maybe_edges)
|
345 |
|
346 |
+
results = []
|
347 |
+
for result in tqdm_async(
|
348 |
+
asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]),
|
349 |
+
total=len(ordered_chunks),
|
350 |
+
desc="Extracting entities from chunks",
|
351 |
+
unit="chunk",
|
352 |
+
):
|
353 |
+
results.append(await result)
|
354 |
+
|
355 |
maybe_nodes = defaultdict(list)
|
356 |
maybe_edges = defaultdict(list)
|
357 |
for m_nodes, m_edges in results:
|
|
|
359 |
maybe_nodes[k].extend(v)
|
360 |
for k, v in m_edges.items():
|
361 |
maybe_edges[tuple(sorted(k))].extend(v)
|
362 |
+
logger.info("Inserting entities into storage...")
|
363 |
+
all_entities_data = []
|
364 |
+
for result in tqdm_async(
|
365 |
+
asyncio.as_completed(
|
366 |
+
[
|
367 |
+
_merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
|
368 |
+
for k, v in maybe_nodes.items()
|
369 |
+
]
|
370 |
+
),
|
371 |
+
total=len(maybe_nodes),
|
372 |
+
desc="Inserting entities",
|
373 |
+
unit="entity",
|
374 |
+
):
|
375 |
+
all_entities_data.append(await result)
|
376 |
+
|
377 |
+
logger.info("Inserting relationships into storage...")
|
378 |
+
all_relationships_data = []
|
379 |
+
for result in tqdm_async(
|
380 |
+
asyncio.as_completed(
|
381 |
+
[
|
382 |
+
_merge_edges_then_upsert(
|
383 |
+
k[0], k[1], v, knowledge_graph_inst, global_config
|
384 |
+
)
|
385 |
+
for k, v in maybe_edges.items()
|
386 |
+
]
|
387 |
+
),
|
388 |
+
total=len(maybe_edges),
|
389 |
+
desc="Inserting relationships",
|
390 |
+
unit="relationship",
|
391 |
+
):
|
392 |
+
all_relationships_data.append(await result)
|
393 |
+
|
394 |
if not len(all_entities_data):
|
395 |
logger.warning("Didn't extract any entities, maybe your LLM is not working")
|
396 |
return None
|
lightrag/storage.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import asyncio
|
2 |
import html
|
3 |
import os
|
|
|
4 |
from dataclasses import dataclass
|
5 |
from typing import Any, Union, cast
|
6 |
import networkx as nx
|
@@ -95,9 +96,16 @@ class NanoVectorDBStorage(BaseVectorStorage):
|
|
95 |
contents[i : i + self._max_batch_size]
|
96 |
for i in range(0, len(contents), self._max_batch_size)
|
97 |
]
|
98 |
-
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
embeddings = np.concatenate(embeddings_list)
|
102 |
for i, d in enumerate(list_data):
|
103 |
d["__vector__"] = embeddings[i]
|
|
|
1 |
import asyncio
|
2 |
import html
|
3 |
import os
|
4 |
+
from tqdm.asyncio import tqdm as tqdm_async
|
5 |
from dataclasses import dataclass
|
6 |
from typing import Any, Union, cast
|
7 |
import networkx as nx
|
|
|
96 |
contents[i : i + self._max_batch_size]
|
97 |
for i in range(0, len(contents), self._max_batch_size)
|
98 |
]
|
99 |
+
embedding_tasks = [self.embedding_func(batch) for batch in batches]
|
100 |
+
embeddings_list = []
|
101 |
+
for f in tqdm_async(
|
102 |
+
asyncio.as_completed(embedding_tasks),
|
103 |
+
total=len(embedding_tasks),
|
104 |
+
desc="Generating embeddings",
|
105 |
+
unit="batch",
|
106 |
+
):
|
107 |
+
embeddings = await f
|
108 |
+
embeddings_list.append(embeddings)
|
109 |
embeddings = np.concatenate(embeddings_list)
|
110 |
for i, d in enumerate(list_data):
|
111 |
d["__vector__"] = embeddings[i]
|