zrguo
committed on
Commit
·
beb4bc2
1
Parent(s):
a2e05db
Update insert_custom_kg
Browse files- README.md +60 -53
- lightrag/lightrag.py +15 -4
- lightrag/operate.py +1 -0
- lightrag/utils_graph.py +7 -2
README.md
CHANGED
@@ -900,59 +900,66 @@ All operations are available in both synchronous and asynchronous versions. The
|
|
900 |
|
901 |
```python
|
902 |
custom_kg = {
|
903 |
-
|
904 |
-
|
905 |
-
|
906 |
-
|
907 |
-
|
908 |
-
|
909 |
-
|
910 |
-
|
911 |
-
|
912 |
-
|
913 |
-
|
914 |
-
|
915 |
-
|
916 |
-
|
917 |
-
|
918 |
-
|
919 |
-
|
920 |
-
|
921 |
-
|
922 |
-
|
923 |
-
|
924 |
-
|
925 |
-
|
926 |
-
|
927 |
-
|
928 |
-
|
929 |
-
|
930 |
-
|
931 |
-
|
932 |
-
|
933 |
-
|
934 |
-
|
935 |
-
|
936 |
-
|
937 |
-
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
-
|
942 |
-
|
943 |
-
|
944 |
-
|
945 |
-
|
946 |
-
|
947 |
-
|
948 |
-
|
949 |
-
|
950 |
-
|
951 |
-
|
952 |
-
|
953 |
-
|
954 |
-
|
955 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
956 |
|
957 |
rag.insert_custom_kg(custom_kg)
|
958 |
```
|
|
|
900 |
|
901 |
```python
|
902 |
custom_kg = {
|
903 |
+
"chunks": [
|
904 |
+
{
|
905 |
+
"content": "Alice and Bob are collaborating on quantum computing research.",
|
906 |
+
"source_id": "doc-1",
|
907 |
+
"file_path": "test_file",
|
908 |
+
}
|
909 |
+
],
|
910 |
+
"entities": [
|
911 |
+
{
|
912 |
+
"entity_name": "Alice",
|
913 |
+
"entity_type": "person",
|
914 |
+
"description": "Alice is a researcher specializing in quantum physics.",
|
915 |
+
"source_id": "doc-1",
|
916 |
+
"file_path": "test_file"
|
917 |
+
},
|
918 |
+
{
|
919 |
+
"entity_name": "Bob",
|
920 |
+
"entity_type": "person",
|
921 |
+
"description": "Bob is a mathematician.",
|
922 |
+
"source_id": "doc-1",
|
923 |
+
"file_path": "test_file"
|
924 |
+
},
|
925 |
+
{
|
926 |
+
"entity_name": "Quantum Computing",
|
927 |
+
"entity_type": "technology",
|
928 |
+
"description": "Quantum computing utilizes quantum mechanical phenomena for computation.",
|
929 |
+
"source_id": "doc-1",
|
930 |
+
"file_path": "test_file"
|
931 |
+
}
|
932 |
+
],
|
933 |
+
"relationships": [
|
934 |
+
{
|
935 |
+
"src_id": "Alice",
|
936 |
+
"tgt_id": "Bob",
|
937 |
+
"description": "Alice and Bob are research partners.",
|
938 |
+
"keywords": "collaboration research",
|
939 |
+
"weight": 1.0,
|
940 |
+
"source_id": "doc-1",
|
941 |
+
"file_path": "test_file"
|
942 |
+
},
|
943 |
+
{
|
944 |
+
"src_id": "Alice",
|
945 |
+
"tgt_id": "Quantum Computing",
|
946 |
+
"description": "Alice conducts research on quantum computing.",
|
947 |
+
"keywords": "research expertise",
|
948 |
+
"weight": 1.0,
|
949 |
+
"source_id": "doc-1",
|
950 |
+
"file_path": "test_file"
|
951 |
+
},
|
952 |
+
{
|
953 |
+
"src_id": "Bob",
|
954 |
+
"tgt_id": "Quantum Computing",
|
955 |
+
"description": "Bob researches quantum computing.",
|
956 |
+
"keywords": "research application",
|
957 |
+
"weight": 1.0,
|
958 |
+
"source_id": "doc-1",
|
959 |
+
"file_path": "test_file"
|
960 |
+
}
|
961 |
+
]
|
962 |
+
}
|
963 |
|
964 |
rag.insert_custom_kg(custom_kg)
|
965 |
```
|
lightrag/lightrag.py
CHANGED
@@ -4,6 +4,7 @@ import traceback
|
|
4 |
import asyncio
|
5 |
import configparser
|
6 |
import os
|
|
|
7 |
import warnings
|
8 |
from dataclasses import asdict, dataclass, field
|
9 |
from datetime import datetime, timezone
|
@@ -1235,7 +1236,6 @@ class LightRAG:
|
|
1235 |
self,
|
1236 |
custom_kg: dict[str, Any],
|
1237 |
full_doc_id: str = None,
|
1238 |
-
file_path: str = "custom_kg",
|
1239 |
) -> None:
|
1240 |
update_storage = False
|
1241 |
try:
|
@@ -1245,6 +1245,7 @@ class LightRAG:
|
|
1245 |
for chunk_data in custom_kg.get("chunks", []):
|
1246 |
chunk_content = clean_text(chunk_data["content"])
|
1247 |
source_id = chunk_data["source_id"]
|
|
|
1248 |
tokens = len(self.tokenizer.encode(chunk_content))
|
1249 |
chunk_order_index = (
|
1250 |
0
|
@@ -1261,7 +1262,7 @@ class LightRAG:
|
|
1261 |
"full_doc_id": full_doc_id
|
1262 |
if full_doc_id is not None
|
1263 |
else source_id,
|
1264 |
-
"file_path": file_path,
|
1265 |
"status": DocStatus.PROCESSED,
|
1266 |
}
|
1267 |
all_chunks_data[chunk_id] = chunk_entry
|
@@ -1282,6 +1283,7 @@ class LightRAG:
|
|
1282 |
description = entity_data.get("description", "No description provided")
|
1283 |
source_chunk_id = entity_data.get("source_id", "UNKNOWN")
|
1284 |
source_id = chunk_to_source_map.get(source_chunk_id, "UNKNOWN")
|
|
|
1285 |
|
1286 |
# Log if source_id is UNKNOWN
|
1287 |
if source_id == "UNKNOWN":
|
@@ -1296,6 +1298,7 @@ class LightRAG:
|
|
1296 |
"description": description,
|
1297 |
"source_id": source_id,
|
1298 |
"file_path": file_path,
|
|
|
1299 |
}
|
1300 |
# Insert node data into the knowledge graph
|
1301 |
await self.chunk_entity_relation_graph.upsert_node(
|
@@ -1315,6 +1318,7 @@ class LightRAG:
|
|
1315 |
weight = relationship_data.get("weight", 1.0)
|
1316 |
source_chunk_id = relationship_data.get("source_id", "UNKNOWN")
|
1317 |
source_id = chunk_to_source_map.get(source_chunk_id, "UNKNOWN")
|
|
|
1318 |
|
1319 |
# Log if source_id is UNKNOWN
|
1320 |
if source_id == "UNKNOWN":
|
@@ -1334,6 +1338,8 @@ class LightRAG:
|
|
1334 |
"source_id": source_id,
|
1335 |
"description": "UNKNOWN",
|
1336 |
"entity_type": "UNKNOWN",
|
|
|
|
|
1337 |
},
|
1338 |
)
|
1339 |
|
@@ -1346,8 +1352,11 @@ class LightRAG:
|
|
1346 |
"description": description,
|
1347 |
"keywords": keywords,
|
1348 |
"source_id": source_id,
|
|
|
|
|
1349 |
},
|
1350 |
)
|
|
|
1351 |
edge_data: dict[str, str] = {
|
1352 |
"src_id": src_id,
|
1353 |
"tgt_id": tgt_id,
|
@@ -1355,6 +1364,8 @@ class LightRAG:
|
|
1355 |
"keywords": keywords,
|
1356 |
"source_id": source_id,
|
1357 |
"weight": weight,
|
|
|
|
|
1358 |
}
|
1359 |
all_relationships_data.append(edge_data)
|
1360 |
update_storage = True
|
@@ -1367,7 +1378,7 @@ class LightRAG:
|
|
1367 |
"source_id": dp["source_id"],
|
1368 |
"description": dp["description"],
|
1369 |
"entity_type": dp["entity_type"],
|
1370 |
-
"file_path": file_path,
|
1371 |
}
|
1372 |
for dp in all_entities_data
|
1373 |
}
|
@@ -1383,7 +1394,7 @@ class LightRAG:
|
|
1383 |
"keywords": dp["keywords"],
|
1384 |
"description": dp["description"],
|
1385 |
"weight": dp["weight"],
|
1386 |
-
"file_path": file_path,
|
1387 |
}
|
1388 |
for dp in all_relationships_data
|
1389 |
}
|
|
|
4 |
import asyncio
|
5 |
import configparser
|
6 |
import os
|
7 |
+
import time
|
8 |
import warnings
|
9 |
from dataclasses import asdict, dataclass, field
|
10 |
from datetime import datetime, timezone
|
|
|
1236 |
self,
|
1237 |
custom_kg: dict[str, Any],
|
1238 |
full_doc_id: str = None,
|
|
|
1239 |
) -> None:
|
1240 |
update_storage = False
|
1241 |
try:
|
|
|
1245 |
for chunk_data in custom_kg.get("chunks", []):
|
1246 |
chunk_content = clean_text(chunk_data["content"])
|
1247 |
source_id = chunk_data["source_id"]
|
1248 |
+
file_path = chunk_data.get("file_path", "custom_kg")
|
1249 |
tokens = len(self.tokenizer.encode(chunk_content))
|
1250 |
chunk_order_index = (
|
1251 |
0
|
|
|
1262 |
"full_doc_id": full_doc_id
|
1263 |
if full_doc_id is not None
|
1264 |
else source_id,
|
1265 |
+
"file_path": file_path,
|
1266 |
"status": DocStatus.PROCESSED,
|
1267 |
}
|
1268 |
all_chunks_data[chunk_id] = chunk_entry
|
|
|
1283 |
description = entity_data.get("description", "No description provided")
|
1284 |
source_chunk_id = entity_data.get("source_id", "UNKNOWN")
|
1285 |
source_id = chunk_to_source_map.get(source_chunk_id, "UNKNOWN")
|
1286 |
+
file_path = entity_data.get("file_path", "custom_kg")
|
1287 |
|
1288 |
# Log if source_id is UNKNOWN
|
1289 |
if source_id == "UNKNOWN":
|
|
|
1298 |
"description": description,
|
1299 |
"source_id": source_id,
|
1300 |
"file_path": file_path,
|
1301 |
+
"created_at": int(time.time()),
|
1302 |
}
|
1303 |
# Insert node data into the knowledge graph
|
1304 |
await self.chunk_entity_relation_graph.upsert_node(
|
|
|
1318 |
weight = relationship_data.get("weight", 1.0)
|
1319 |
source_chunk_id = relationship_data.get("source_id", "UNKNOWN")
|
1320 |
source_id = chunk_to_source_map.get(source_chunk_id, "UNKNOWN")
|
1321 |
+
file_path = relationship_data.get("file_path", "custom_kg")
|
1322 |
|
1323 |
# Log if source_id is UNKNOWN
|
1324 |
if source_id == "UNKNOWN":
|
|
|
1338 |
"source_id": source_id,
|
1339 |
"description": "UNKNOWN",
|
1340 |
"entity_type": "UNKNOWN",
|
1341 |
+
"file_path": file_path,
|
1342 |
+
"created_at": int(time.time()),
|
1343 |
},
|
1344 |
)
|
1345 |
|
|
|
1352 |
"description": description,
|
1353 |
"keywords": keywords,
|
1354 |
"source_id": source_id,
|
1355 |
+
"file_path": file_path,
|
1356 |
+
"created_at": int(time.time()),
|
1357 |
},
|
1358 |
)
|
1359 |
+
|
1360 |
edge_data: dict[str, str] = {
|
1361 |
"src_id": src_id,
|
1362 |
"tgt_id": tgt_id,
|
|
|
1364 |
"keywords": keywords,
|
1365 |
"source_id": source_id,
|
1366 |
"weight": weight,
|
1367 |
+
"file_path": file_path,
|
1368 |
+
"created_at": int(time.time()),
|
1369 |
}
|
1370 |
all_relationships_data.append(edge_data)
|
1371 |
update_storage = True
|
|
|
1378 |
"source_id": dp["source_id"],
|
1379 |
"description": dp["description"],
|
1380 |
"entity_type": dp["entity_type"],
|
1381 |
+
"file_path": dp.get("file_path", "custom_kg"),
|
1382 |
}
|
1383 |
for dp in all_entities_data
|
1384 |
}
|
|
|
1394 |
"keywords": dp["keywords"],
|
1395 |
"description": dp["description"],
|
1396 |
"weight": dp["weight"],
|
1397 |
+
"file_path": dp.get("file_path", "custom_kg"),
|
1398 |
}
|
1399 |
for dp in all_relationships_data
|
1400 |
}
|
lightrag/operate.py
CHANGED
@@ -496,6 +496,7 @@ async def _merge_edges_then_upsert(
|
|
496 |
keywords=keywords,
|
497 |
source_id=source_id,
|
498 |
file_path=file_path,
|
|
|
499 |
)
|
500 |
|
501 |
return edge_data
|
|
|
496 |
keywords=keywords,
|
497 |
source_id=source_id,
|
498 |
file_path=file_path,
|
499 |
+
created_at=int(time.time()),
|
500 |
)
|
501 |
|
502 |
return edge_data
|
lightrag/utils_graph.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
from __future__ import annotations
|
2 |
|
|
|
3 |
import asyncio
|
4 |
from typing import Any, cast
|
5 |
|
@@ -479,7 +480,9 @@ async def acreate_entity(
|
|
479 |
"entity_id": entity_name,
|
480 |
"entity_type": entity_data.get("entity_type", "UNKNOWN"),
|
481 |
"description": entity_data.get("description", ""),
|
482 |
-
"source_id": entity_data.get("source_id", "
|
|
|
|
|
483 |
}
|
484 |
|
485 |
# Add entity to knowledge graph
|
@@ -575,8 +578,10 @@ async def acreate_relation(
|
|
575 |
edge_data = {
|
576 |
"description": relation_data.get("description", ""),
|
577 |
"keywords": relation_data.get("keywords", ""),
|
578 |
-
"source_id": relation_data.get("source_id", "
|
579 |
"weight": float(relation_data.get("weight", 1.0)),
|
|
|
|
|
580 |
}
|
581 |
|
582 |
# Add relation to knowledge graph
|
|
|
1 |
from __future__ import annotations
|
2 |
|
3 |
+
import time
|
4 |
import asyncio
|
5 |
from typing import Any, cast
|
6 |
|
|
|
480 |
"entity_id": entity_name,
|
481 |
"entity_type": entity_data.get("entity_type", "UNKNOWN"),
|
482 |
"description": entity_data.get("description", ""),
|
483 |
+
"source_id": entity_data.get("source_id", "manual_creation"),
|
484 |
+
"file_path": entity_data.get("file_path", "manual_creation"),
|
485 |
+
"created_at": int(time.time()),
|
486 |
}
|
487 |
|
488 |
# Add entity to knowledge graph
|
|
|
578 |
edge_data = {
|
579 |
"description": relation_data.get("description", ""),
|
580 |
"keywords": relation_data.get("keywords", ""),
|
581 |
+
"source_id": relation_data.get("source_id", "manual_creation"),
|
582 |
"weight": float(relation_data.get("weight", 1.0)),
|
583 |
+
"file_path": relation_data.get("file_path", "manual_creation"),
|
584 |
+
"created_at": int(time.time()),
|
585 |
}
|
586 |
|
587 |
# Add relation to knowledge graph
|