feat: Add temporal support for knowledge graph relationships
- Add timestamp support for relationships and vector data, recording when each piece of knowledge was acquired
- Refine the hybrid query strategy to weigh both semantic relevance and temporal order
- Strengthen the prompt templates to guide the LLM to consider time when handling conflicting information
- lightrag/operate.py +35 -21
- lightrag/storage.py +5 -5
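The prompt-template change itself is not part of the two files below. As a rough illustration only, guidance of the kind described in the last bullet might read like this; the wording is hypothetical, not the project's actual template text:

    # Hypothetical sketch of the temporal guidance described above; not the
    # actual template text, which is outside the two files in this diff.
    TEMPORAL_GUIDANCE = (
        "Each relationship carries a created_at timestamp recording when the "
        "knowledge was acquired. When sources conflict, weigh both the semantic "
        "content and these timestamps instead of defaulting to the newest entry."
    )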
lightrag/operate.py
CHANGED
@@ -129,9 +129,7 @@ async def _handle_single_relationship_extraction(
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
-        metadata={
-            "created_at": time.time()
-        }
+        metadata={"created_at": time.time()},
     )
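The hunk above stamps every freshly extracted relationship with an epoch timestamp from time.time(). A minimal round-trip sketch, with illustrative field values:

    import time

    # Minimal sketch of the record shape set up above; field values are illustrative.
    edge = dict(
        description="Company A acquired Company B",
        keywords="acquisition",
        source_id="chunk-1",
        metadata={"created_at": time.time()},  # float seconds since the epoch
    )

    # The raw float is only converted to a readable form at query time:
    ts = edge["metadata"]["created_at"]
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ts)))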
@@ -451,7 +449,7 @@ async def extract_entities(
             + dp["description"],
             "metadata": {
                 "created_at": dp.get("metadata", {}).get("created_at", time.time())
-            }
+            },
         }
         for dp in all_relationships_data
     }
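When relationship data is prepared for vector storage, the nested get above preserves the timestamp recorded at extraction and only falls back to the current time if none exists. A sketch of that lookup in isolation:

    import time

    def resolve_created_at(dp: dict) -> float:
        # Same chain as above: prefer the timestamp captured at extraction,
        # fall back to "now" for records that predate timestamp support.
        return dp.get("metadata", {}).get("created_at", time.time())

    print(resolve_created_at({"metadata": {"created_at": 1700000000.0}}))  # 1700000000.0
    print(resolve_created_at({}))  # falls back to the current epoch time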
@@ -740,11 +738,20 @@ async def _get_node_data(
     entities_context = list_of_list_to_csv(entites_section_list)
 
     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank"]
+        [
+            "id",
+            "source",
+            "target",
+            "description",
+            "keywords",
+            "weight",
+            "rank",
+            "created_at",
+        ]
     ]
     for i, e in enumerate(use_relations):
-        created_at = e.get("created_at", "…")
-        # …
+        created_at = e.get("created_at", "UNKNOWN")
+        # Convert timestamp to readable format
         if isinstance(created_at, (int, float)):
             created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
@@ -756,7 +763,7 @@ async def _get_node_data(
             e["keywords"],
             e["weight"],
             e["rank"],
-            created_at
+            created_at,
         ]
     )
     relations_context = list_of_list_to_csv(relations_section_list)
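Both _get_node_data hunks rely on the same conversion: numeric epoch values become human-readable strings, while non-numeric fallbacks pass through unchanged. Isolated, with a fixed epoch for reproducibility:

    import time

    def to_readable(created_at):
        # Mirrors the guard above: only int/float epochs are formatted; the
        # string fallback ("UNKNOWN") is left as-is for the CSV context.
        if isinstance(created_at, (int, float)):
            return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
        return created_at

    print(to_readable(1700000000))  # e.g. 2023-11-14 22:13:20, depending on local timezone
    print(to_readable("UNKNOWN"))   # UNKNOWN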
@@ -894,8 +901,6 @@ async def _get_edge_data(
     if not len(results):
         return "", "", ""
 
-    # Fetch the complete relationship information from KV storage
-    edge_ids = [r["id"] for r in results]
     edge_datas = await asyncio.gather(
         *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results]
     )
@@ -907,11 +912,11 @@ async def _get_edge_data(
     )
     edge_datas = [
         {
-            "src_id": k["src_id"],
-            "tgt_id": k["tgt_id"],
-            "rank": d,
+            "src_id": k["src_id"],
+            "tgt_id": k["tgt_id"],
+            "rank": d,
             "created_at": k.get("__created_at__", None),  # Get time metadata from KV storage
-            **v
+            **v,
         }
         for k, v, d in zip(results, edge_datas, edge_degree)
         if v is not None
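In _get_edge_data the timestamp now rides along from the vector store: each hit's reserved __created_at__ field is merged with the edge payload fetched from the graph. A self-contained sketch with toy stand-ins for results, edge_datas, and edge_degree:

    # Toy stand-ins for the vector-store hits and graph lookups used above.
    results = [{"src_id": "A", "tgt_id": "B", "__created_at__": 1700000000.0}]
    payloads = [{"description": "A acquired B", "keywords": "acquisition", "weight": 1.0}]
    degrees = [3]

    edge_datas = [
        {
            "src_id": k["src_id"],
            "tgt_id": k["tgt_id"],
            "rank": d,
            "created_at": k.get("__created_at__", None),  # time metadata from the KV store
            **v,
        }
        for k, v, d in zip(results, payloads, degrees)
        if v is not None  # skip hits whose edge no longer exists in the graph
    ]
    print(edge_datas[0]["created_at"])  # 1700000000.0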
@@ -936,11 +941,20 @@ async def _get_edge_data(
     )
 
     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank"]
+        [
+            "id",
+            "source",
+            "target",
+            "description",
+            "keywords",
+            "weight",
+            "rank",
+            "created_at",
+        ]
    ]
     for i, e in enumerate(edge_datas):
-        created_at = e.get("created_at", "…")
-        # …
+        created_at = e.get("created_at", "Unknown")
+        # Convert timestamp to readable format
         if isinstance(created_at, (int, float)):
             created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
@@ -952,7 +966,7 @@ async def _get_edge_data(
             e["keywords"],
             e["weight"],
             e["rank"],
-            created_at
+            created_at,
         ]
     )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -1287,10 +1301,10 @@ async def mix_kg_vector_query(
     valid_chunks = []
     for chunk, result in zip(chunks, results):
         if chunk is not None and "content" in chunk:
-            # …
+            # Merge chunk content and time metadata
             chunk_with_time = {
                 "content": chunk["content"],
-                "created_at": result.get("created_at", None)
+                "created_at": result.get("created_at", None),
             }
             valid_chunks.append(chunk_with_time)
@@ -1306,7 +1320,7 @@ async def mix_kg_vector_query(
     if not maybe_trun_chunks:
         return None
 
-    # …
+    # Include time information in content
     formatted_chunks = []
     for c in maybe_trun_chunks:
         chunk_text = c["content"]
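The mix_kg_vector_query hunks carry created_at from each vector hit onto its chunk and then fold it into the text handed to the LLM. The exact formatting code is cut off in this diff; one plausible shape, with the banner wording purely illustrative:

    import time

    def format_chunk(c: dict) -> str:
        # Assumed formatting; the real loop body is truncated above.
        created_at = c.get("created_at")
        if isinstance(created_at, (int, float)):
            stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
            return f"[Created at: {stamp}]\n{c['content']}"
        return c["content"]

    print(format_chunk({"content": "B was acquired in 2021.", "created_at": 1700000000.0}))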
lightrag/storage.py
CHANGED
@@ -88,7 +88,7 @@ class NanoVectorDBStorage(BaseVectorStorage):
         if not len(data):
             logger.warning("You insert an empty data to vector DB")
             return []
-
+
         current_time = time.time()
         list_data = [
             {
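On the write path, NanoVectorDBStorage.upsert captures one current_time per batch; the list_data construction that stamps it onto each record is truncated above, but the pattern it sets up looks roughly like this, with the field layout assumed from the query-side hunk below:

    import time

    current_time = time.time()
    data = {"rel-1": {"content": "A acquired B"}}  # toy upsert input

    list_data = [
        {
            "__id__": k,
            "__created_at__": current_time,  # reserved key, read back at query time
            **v,
        }
        for k, v in data.items()
    ]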
@@ -137,11 +137,11 @@ class NanoVectorDBStorage(BaseVectorStorage):
         )
         results = [
             {
-                **dp,
-                "id": dp["__id__"],
+                **dp,
+                "id": dp["__id__"],
                 "distance": dp["__metrics__"],
-                "created_at": dp.get("__created_at__")
-            }
+                "created_at": dp.get("__created_at__"),
+            }
             for dp in results
         ]
         return results
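On the read path, every query hit now surfaces created_at (None for vectors written before this change). A caller could then break ties on recency, for example:

    # Hypothetical consumer of the query results shaped above.
    hits = [
        {"id": "rel-1", "distance": 0.12, "created_at": 1700000000.0},
        {"id": "rel-2", "distance": 0.12, "created_at": None},
    ]
    # Equal distances: prefer the more recently acquired relationship.
    hits.sort(key=lambda h: (h["distance"], -(h["created_at"] or 0.0)))
    print([h["id"] for h in hits])  # ['rel-1', 'rel-2']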