jin committed
Commit 55feb16 · 2 parents: 0923c61 6d693be

Merge branch 'HKUDS:main' into main
README.md CHANGED
@@ -26,7 +26,8 @@ This repository hosts the code of LightRAG. The structure of this code is based
  </div>

  ## 🎉 News
- - [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author!
+ - [x] [2024.11.25]🎯📢LightRAG now supports seamless integration of [custom knowledge graphs](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#insert-custom-kg), empowering users to enhance the system with their own domain expertise.
+ - [x] [2024.11.19]🎯📢A comprehensive guide to LightRAG is now available on [LearnOpenCV](https://learnopencv.com/lightrag). Many thanks to the blog author.
  - [x] [2024.11.12]🎯📢LightRAG now supports [Oracle Database 23ai for all storage types (KV, vector, and graph)](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_oracle_demo.py).
  - [x] [2024.11.11]🎯📢LightRAG now supports [deleting entities by their names](https://github.com/HKUDS/LightRAG?tab=readme-ov-file#delete-entity).
  - [x] [2024.11.09]🎯📢Introducing the [LightRAG Gui](https://lightrag-gui.streamlit.app), which allows you to insert, query, visualize, and download LightRAG knowledge.
@@ -327,6 +328,49 @@ with open("./newText.txt") as f:
      rag.insert(f.read())
  ```

+ ### Insert Custom KG
+
+ ```python
+ rag = LightRAG(
+     working_dir=WORKING_DIR,
+     llm_model_func=llm_model_func,
+     embedding_func=EmbeddingFunc(
+         embedding_dim=embedding_dimension,
+         max_token_size=8192,
+         func=embedding_func,
+     ),
+ )
+
+ custom_kg = {
+     "entities": [
+         {
+             "entity_name": "CompanyA",
+             "entity_type": "Organization",
+             "description": "A major technology company",
+             "source_id": "Source1"
+         },
+         {
+             "entity_name": "ProductX",
+             "entity_type": "Product",
+             "description": "A popular product developed by CompanyA",
+             "source_id": "Source1"
+         }
+     ],
+     "relationships": [
+         {
+             "src_id": "CompanyA",
+             "tgt_id": "ProductX",
+             "description": "CompanyA develops ProductX",
+             "keywords": "develop, produce",
+             "weight": 1.0,
+             "source_id": "Source1"
+         }
+     ]
+ }
+
+ rag.insert_custom_kg(custom_kg)
+ ```
+
  ### Delete Entity

  ```python
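Once inserted, the custom KG is queried like any other LightRAG index. A minimal sketch (not part of the commit), assuming the `rag` instance configured in the README snippet above and the standard `QueryParam` API:

```python
from lightrag import QueryParam

# Hybrid mode combines entity-level (local) and relationship-level (global)
# retrieval; the question targets the relationship inserted above.
print(rag.query("What does CompanyA develop?", param=QueryParam(mode="hybrid")))
```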
examples/insert_custom_kg.py ADDED
@@ -0,0 +1,108 @@
+ import os
+ from lightrag import LightRAG, QueryParam
+ from lightrag.llm import gpt_4o_mini_complete
+ #########
+ # Uncomment the below two lines if running in a jupyter notebook to handle the async nature of rag.insert()
+ # import nest_asyncio
+ # nest_asyncio.apply()
+ #########
+
+ WORKING_DIR = "./custom_kg"
+
+ if not os.path.exists(WORKING_DIR):
+     os.mkdir(WORKING_DIR)
+
+ rag = LightRAG(
+     working_dir=WORKING_DIR,
+     llm_model_func=gpt_4o_mini_complete,  # Use gpt_4o_mini_complete LLM model
+     # llm_model_func=gpt_4o_complete  # Optionally, use a stronger model
+ )
+
+ custom_kg = {
+     "entities": [
+         {
+             "entity_name": "CompanyA",
+             "entity_type": "Organization",
+             "description": "A major technology company",
+             "source_id": "Source1"
+         },
+         {
+             "entity_name": "ProductX",
+             "entity_type": "Product",
+             "description": "A popular product developed by CompanyA",
+             "source_id": "Source1"
+         },
+         {
+             "entity_name": "PersonA",
+             "entity_type": "Person",
+             "description": "A renowned researcher in AI",
+             "source_id": "Source2"
+         },
+         {
+             "entity_name": "UniversityB",
+             "entity_type": "Organization",
+             "description": "A leading university specializing in technology and sciences",
+             "source_id": "Source2"
+         },
+         {
+             "entity_name": "CityC",
+             "entity_type": "Location",
+             "description": "A large metropolitan city known for its culture and economy",
+             "source_id": "Source3"
+         },
+         {
+             "entity_name": "EventY",
+             "entity_type": "Event",
+             "description": "An annual technology conference held in CityC",
+             "source_id": "Source3"
+         },
+         {
+             "entity_name": "CompanyD",
+             "entity_type": "Organization",
+             "description": "A financial services company specializing in insurance",
+             "source_id": "Source4"
+         },
+         {
+             "entity_name": "ServiceZ",
+             "entity_type": "Service",
+             "description": "An insurance product offered by CompanyD",
+             "source_id": "Source4"
+         }
+     ],
+     "relationships": [
+         {
+             "src_id": "CompanyA",
+             "tgt_id": "ProductX",
+             "description": "CompanyA develops ProductX",
+             "keywords": "develop, produce",
+             "weight": 1.0,
+             "source_id": "Source1"
+         },
+         {
+             "src_id": "PersonA",
+             "tgt_id": "UniversityB",
+             "description": "PersonA works at UniversityB",
+             "keywords": "employment, affiliation",
+             "weight": 0.9,
+             "source_id": "Source2"
+         },
+         {
+             "src_id": "CityC",
+             "tgt_id": "EventY",
+             "description": "EventY is hosted in CityC",
+             "keywords": "host, location",
+             "weight": 0.8,
+             "source_id": "Source3"
+         },
+         {
+             "src_id": "CompanyD",
+             "tgt_id": "ServiceZ",
+             "description": "CompanyD provides ServiceZ",
+             "keywords": "provide, offer",
+             "weight": 1.0,
+             "source_id": "Source4"
+         }
+     ]
+ }
+
+ rag.insert_custom_kg(custom_kg)
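Worth noting: `ainsert_custom_kg` (see the `lightrag/lightrag.py` diff below) stores entity names upper-cased and wrapped in double quotes, so direct graph lookups after this script must use the normalized form. A small hedged check, assuming the `rag` instance from the script above:

```python
import asyncio

async def check_inserted():
    # Entities are stored under '"COMPANYA"', not 'CompanyA' (see the
    # f'"{...upper()}"' normalization in ainsert_custom_kg below).
    assert await rag.chunk_entity_relation_graph.has_node('"COMPANYA"')
    assert not await rag.chunk_entity_relation_graph.has_node("CompanyA")

asyncio.run(check_inserted())
```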
lightrag/__init__.py CHANGED
@@ -1,5 +1,5 @@
  from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam

- __version__ = "1.0.1"
+ __version__ = "1.0.2"
  __author__ = "Zirui Guo"
  __url__ = "https://github.com/HKUDS/LightRAG"
lightrag/lightrag.py CHANGED
@@ -1,5 +1,6 @@
  import asyncio
  import os
+ from tqdm.asyncio import tqdm as tqdm_async
  from dataclasses import asdict, dataclass, field
  from datetime import datetime
  from functools import partial
@@ -242,7 +243,9 @@ class LightRAG:
          logger.info(f"[New Docs] inserting {len(new_docs)} docs")

          inserting_chunks = {}
-         for doc_key, doc in new_docs.items():
+         for doc_key, doc in tqdm_async(
+             new_docs.items(), desc="Chunking documents", unit="doc"
+         ):
              chunks = {
                  compute_mdhash_id(dp["content"], prefix="chunk-"): {
                      **dp,
@@ -304,6 +307,108 @@
          tasks.append(cast(StorageNameSpace, storage_inst).index_done_callback())
          await asyncio.gather(*tasks)

+     def insert_custom_kg(self, custom_kg: dict):
+         loop = always_get_an_event_loop()
+         return loop.run_until_complete(self.ainsert_custom_kg(custom_kg))
+
+     async def ainsert_custom_kg(self, custom_kg: dict):
+         update_storage = False
+         try:
+             # Insert entities into knowledge graph
+             all_entities_data = []
+             for entity_data in custom_kg.get("entities", []):
+                 entity_name = f'"{entity_data["entity_name"].upper()}"'
+                 entity_type = entity_data.get("entity_type", "UNKNOWN")
+                 description = entity_data.get("description", "No description provided")
+                 source_id = entity_data["source_id"]
+
+                 # Prepare node data
+                 node_data = {
+                     "entity_type": entity_type,
+                     "description": description,
+                     "source_id": source_id,
+                 }
+                 # Insert node data into the knowledge graph
+                 await self.chunk_entity_relation_graph.upsert_node(
+                     entity_name, node_data=node_data
+                 )
+                 node_data["entity_name"] = entity_name
+                 all_entities_data.append(node_data)
+                 update_storage = True
+
+             # Insert relationships into knowledge graph
+             all_relationships_data = []
+             for relationship_data in custom_kg.get("relationships", []):
+                 src_id = f'"{relationship_data["src_id"].upper()}"'
+                 tgt_id = f'"{relationship_data["tgt_id"].upper()}"'
+                 description = relationship_data["description"]
+                 keywords = relationship_data["keywords"]
+                 weight = relationship_data.get("weight", 1.0)
+                 source_id = relationship_data["source_id"]
+
+                 # Check if nodes exist in the knowledge graph
+                 for need_insert_id in [src_id, tgt_id]:
+                     if not (
+                         await self.chunk_entity_relation_graph.has_node(need_insert_id)
+                     ):
+                         await self.chunk_entity_relation_graph.upsert_node(
+                             need_insert_id,
+                             node_data={
+                                 "source_id": source_id,
+                                 "description": "UNKNOWN",
+                                 "entity_type": "UNKNOWN",
+                             },
+                         )
+
+                 # Insert edge into the knowledge graph
+                 await self.chunk_entity_relation_graph.upsert_edge(
+                     src_id,
+                     tgt_id,
+                     edge_data={
+                         "weight": weight,
+                         "description": description,
+                         "keywords": keywords,
+                         "source_id": source_id,
+                     },
+                 )
+                 edge_data = {
+                     "src_id": src_id,
+                     "tgt_id": tgt_id,
+                     "description": description,
+                     "keywords": keywords,
+                 }
+                 all_relationships_data.append(edge_data)
+                 update_storage = True
+
+             # Insert entities into vector storage if needed
+             if self.entities_vdb is not None:
+                 data_for_vdb = {
+                     compute_mdhash_id(dp["entity_name"], prefix="ent-"): {
+                         "content": dp["entity_name"] + dp["description"],
+                         "entity_name": dp["entity_name"],
+                     }
+                     for dp in all_entities_data
+                 }
+                 await self.entities_vdb.upsert(data_for_vdb)
+
+             # Insert relationships into vector storage if needed
+             if self.relationships_vdb is not None:
+                 data_for_vdb = {
+                     compute_mdhash_id(dp["src_id"] + dp["tgt_id"], prefix="rel-"): {
+                         "src_id": dp["src_id"],
+                         "tgt_id": dp["tgt_id"],
+                         "content": dp["keywords"]
+                         + dp["src_id"]
+                         + dp["tgt_id"]
+                         + dp["description"],
+                     }
+                     for dp in all_relationships_data
+                 }
+                 await self.relationships_vdb.upsert(data_for_vdb)
+         finally:
+             if update_storage:
+                 await self._insert_done()
+
      def query(self, query: str, param: QueryParam = QueryParam()):
          loop = always_get_an_event_loop()
          return loop.run_until_complete(self.aquery(query, param))
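The new `insert_custom_kg` follows the same sync-over-async wrapper pattern as `insert` and `query`: obtain (or create) an event loop and block on the async counterpart. A self-contained sketch of that pattern; `always_get_an_event_loop` here is a simplified stand-in for the helper in `lightrag.utils`, and `ainsert_custom_kg` is a stub, not the real method:

```python
import asyncio

def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
    # Reuse the current loop if one exists; otherwise create and register one.
    try:
        return asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        return loop

async def ainsert_custom_kg(custom_kg: dict) -> int:
    # Stub standing in for the real async upsert logic; reports a count.
    await asyncio.sleep(0)
    return len(custom_kg.get("entities", []))

def insert_custom_kg(custom_kg: dict) -> int:
    # Synchronous facade: block the caller until the async work finishes.
    loop = always_get_an_event_loop()
    return loop.run_until_complete(ainsert_custom_kg(custom_kg))

print(insert_custom_kg({"entities": [{"entity_name": "CompanyA"}]}))  # -> 1
```

This keeps the public API synchronous for script users while the storage backends stay async; notebook users still need `nest_asyncio`, as the example script's comments note.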
lightrag/operate.py CHANGED
@@ -1,6 +1,7 @@
  import asyncio
  import json
  import re
+ from tqdm.asyncio import tqdm as tqdm_async
  from typing import Union
  from collections import Counter, defaultdict
  import warnings
@@ -342,11 +343,15 @@ async def extract_entities(
          )
          return dict(maybe_nodes), dict(maybe_edges)

-     # use_llm_func is wrapped in ascynio.Semaphore, limiting max_async callings
-     results = await asyncio.gather(
-         *[_process_single_content(c) for c in ordered_chunks]
-     )
-     print()  # clear the progress bar
+     results = []
+     for result in tqdm_async(
+         asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]),
+         total=len(ordered_chunks),
+         desc="Extracting entities from chunks",
+         unit="chunk",
+     ):
+         results.append(await result)
+
      maybe_nodes = defaultdict(list)
      maybe_edges = defaultdict(list)
      for m_nodes, m_edges in results:
@@ -354,18 +359,38 @@
              maybe_nodes[k].extend(v)
          for k, v in m_edges.items():
              maybe_edges[tuple(sorted(k))].extend(v)
-     all_entities_data = await asyncio.gather(
-         *[
-             _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
-             for k, v in maybe_nodes.items()
-         ]
-     )
-     all_relationships_data = await asyncio.gather(
-         *[
-             _merge_edges_then_upsert(k[0], k[1], v, knowledge_graph_inst, global_config)
-             for k, v in maybe_edges.items()
-         ]
-     )
+     logger.info("Inserting entities into storage...")
+     all_entities_data = []
+     for result in tqdm_async(
+         asyncio.as_completed(
+             [
+                 _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
+                 for k, v in maybe_nodes.items()
+             ]
+         ),
+         total=len(maybe_nodes),
+         desc="Inserting entities",
+         unit="entity",
+     ):
+         all_entities_data.append(await result)
+
+     logger.info("Inserting relationships into storage...")
+     all_relationships_data = []
+     for result in tqdm_async(
+         asyncio.as_completed(
+             [
+                 _merge_edges_then_upsert(
+                     k[0], k[1], v, knowledge_graph_inst, global_config
+                 )
+                 for k, v in maybe_edges.items()
+             ]
+         ),
+         total=len(maybe_edges),
+         desc="Inserting relationships",
+         unit="relationship",
+     ):
+         all_relationships_data.append(await result)
+
      if not len(all_entities_data):
          logger.warning("Didn't extract any entities, maybe your LLM is not working")
          return None
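The `gather` → `as_completed` change trades result ordering for live progress: `asyncio.as_completed` yields futures as they finish, which `tqdm` can count one by one, and here that is safe because the results only feed order-insensitive `defaultdict` aggregation. A minimal runnable sketch of the pattern (illustrative names; `process` simulates an LLM call):

```python
import asyncio
import random
from tqdm.asyncio import tqdm as tqdm_async

async def process(chunk: str) -> str:
    await asyncio.sleep(random.random() * 0.1)  # simulate per-chunk LLM latency
    return chunk.upper()

async def main() -> list[str]:
    chunks = ["alpha", "beta", "gamma"]
    results = []
    # as_completed yields in *completion* order, so results may be shuffled
    # relative to `chunks`; acceptable when downstream merging is order-free.
    for fut in tqdm_async(
        asyncio.as_completed([process(c) for c in chunks]),
        total=len(chunks),
        desc="Extracting",
        unit="chunk",
    ):
        results.append(await fut)
    return results

print(asyncio.run(main()))
```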
lightrag/storage.py CHANGED
@@ -1,6 +1,7 @@
  import asyncio
  import html
  import os
+ from tqdm.asyncio import tqdm as tqdm_async
  from dataclasses import dataclass
  from typing import Any, Union, cast
  import networkx as nx
@@ -95,9 +96,16 @@ class NanoVectorDBStorage(BaseVectorStorage):
              contents[i : i + self._max_batch_size]
              for i in range(0, len(contents), self._max_batch_size)
          ]
-         embeddings_list = await asyncio.gather(
-             *[self.embedding_func(batch) for batch in batches]
-         )
+         embedding_tasks = [self.embedding_func(batch) for batch in batches]
+         embeddings_list = []
+         for f in tqdm_async(
+             asyncio.as_completed(embedding_tasks),
+             total=len(embedding_tasks),
+             desc="Generating embeddings",
+             unit="batch",
+         ):
+             embeddings = await f
+             embeddings_list.append(embeddings)
          embeddings = np.concatenate(embeddings_list)
          for i, d in enumerate(list_data):
              d["__vector__"] = embeddings[i]