Magicyuan committed
Commit 3057c9c · 1 Parent(s): 99287ad

feat: enhance temporal support for knowledge graph relationships


- Add timestamp support for relationships and vector data, recording when each piece of knowledge was acquired (see the sketch after this list)
- Optimize the hybrid query strategy to weigh both semantic relevance and temporal order
- Enhance the prompt templates to guide the LLM to take time into account when handling conflicting information
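
A minimal sketch of that timestamp flow, as standalone Python (the record shape and helper name are illustrative assumptions, not LightRAG's exact API): an epoch timestamp is attached when a relationship is extracted, and it is converted to a readable string when the relation context is built for the LLM.

import time

# Illustrative relationship record; the real records carry more fields
# (description, keywords, weight, rank, source_id, ...).
relationship = {
    "src_id": "EntityA",
    "tgt_id": "EntityB",
    "metadata": {"created_at": time.time()},  # epoch seconds at extraction time
}

def readable_created_at(record: dict) -> str:
    # Mirror the diff's conversion: numeric timestamps become a readable
    # string, anything else falls back to the stored value or "UNKNOWN".
    created_at = record.get("metadata", {}).get("created_at", "UNKNOWN")
    if isinstance(created_at, (int, float)):
        created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
    return created_at

print(readable_created_at(relationship))  # e.g. 2025-01-01 09:30:00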

Files changed (2)
  1. lightrag/operate.py +35 -21
  2. lightrag/storage.py +5 -5
lightrag/operate.py CHANGED
@@ -129,9 +129,7 @@ async def _handle_single_relationship_extraction(
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
-        metadata={
-            "created_at": time.time()
-        }
+        metadata={"created_at": time.time()},
     )


@@ -451,7 +449,7 @@ async def extract_entities(
             + dp["description"],
             "metadata": {
                 "created_at": dp.get("metadata", {}).get("created_at", time.time())
-            }
+            },
         }
         for dp in all_relationships_data
     }
@@ -740,11 +738,20 @@ async def _get_node_data(
     entities_context = list_of_list_to_csv(entites_section_list)

     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank", "created_at"]
+        [
+            "id",
+            "source",
+            "target",
+            "description",
+            "keywords",
+            "weight",
+            "rank",
+            "created_at",
+        ]
     ]
     for i, e in enumerate(use_relations):
-        created_at = e.get("created_at", "未知")
-        # 转换时间戳为可读格式
+        created_at = e.get("created_at", "UNKNOWN")
+        # Convert timestamp to readable format
         if isinstance(created_at, (int, float)):
             created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
@@ -756,7 +763,7 @@ async def _get_node_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
-                created_at
+                created_at,
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -894,8 +901,6 @@ async def _get_edge_data(
     if not len(results):
         return "", "", ""

-    # 从 KV 存储中获取完整的关系信息
-    edge_ids = [r["id"] for r in results]
     edge_datas = await asyncio.gather(
         *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results]
     )
@@ -907,11 +912,11 @@ async def _get_edge_data(
     )
     edge_datas = [
         {
-            "src_id": k["src_id"],
-            "tgt_id": k["tgt_id"],
-            "rank": d,
+            "src_id": k["src_id"],
+            "tgt_id": k["tgt_id"],
+            "rank": d,
             "created_at": k.get("__created_at__", None),  # 从 KV 存储中获取时间元数据
-            **v
+            **v,
         }
         for k, v, d in zip(results, edge_datas, edge_degree)
         if v is not None
@@ -936,11 +941,20 @@ async def _get_edge_data(
     )

     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank", "created_at"]
+        [
+            "id",
+            "source",
+            "target",
+            "description",
+            "keywords",
+            "weight",
+            "rank",
+            "created_at",
+        ]
     ]
     for i, e in enumerate(edge_datas):
-        created_at = e.get("created_at", "未知")
-        # 转换时间戳为可读格式
+        created_at = e.get("created_at", "Unknown")
+        # Convert timestamp to readable format
         if isinstance(created_at, (int, float)):
             created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
@@ -952,7 +966,7 @@ async def _get_edge_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
-                created_at
+                created_at,
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -1287,10 +1301,10 @@ async def mix_kg_vector_query(
     valid_chunks = []
     for chunk, result in zip(chunks, results):
         if chunk is not None and "content" in chunk:
-            # 合并 chunk 内容和时间元数据
+            # Merge chunk content and time metadata
            chunk_with_time = {
                 "content": chunk["content"],
-                "created_at": result.get("created_at", None)
+                "created_at": result.get("created_at", None),
             }
             valid_chunks.append(chunk_with_time)

@@ -1306,7 +1320,7 @@ async def mix_kg_vector_query(
     if not maybe_trun_chunks:
         return None

-    # 在内容中包含时间信息
+    # Include time information in content
     formatted_chunks = []
     for c in maybe_trun_chunks:
         chunk_text = c["content"]
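
For orientation, a hedged sketch of the mix_kg_vector_query side of this change, as standalone Python: each retrieved chunk is paired with the created_at returned by the vector store, and the timestamp is then surfaced in the text handed to the LLM. The input data and the "[Created at: ...]" prefix below are assumptions for illustration; the diff only shows that time information is included, not the exact prompt format.

import time

# Hypothetical vector-store hits and the chunks they resolve to.
results = [{"id": "chunk-1", "created_at": 1735689600.0}]
chunks = [{"content": "ACME acquired Widget Co."}]

valid_chunks = []
for chunk, result in zip(chunks, results):
    if chunk is not None and "content" in chunk:
        # Merge chunk content and time metadata, as in the diff.
        valid_chunks.append(
            {"content": chunk["content"], "created_at": result.get("created_at", None)}
        )

# Surface the timestamp in the prompt text (prefix format is an assumption).
formatted_chunks = []
for c in valid_chunks:
    chunk_text = c["content"]
    if isinstance(c.get("created_at"), (int, float)):
        stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(c["created_at"]))
        chunk_text = f"[Created at: {stamp}] {chunk_text}"
    formatted_chunks.append(chunk_text)

print(formatted_chunks[0])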
lightrag/storage.py CHANGED
@@ -88,7 +88,7 @@ class NanoVectorDBStorage(BaseVectorStorage):
         if not len(data):
             logger.warning("You insert an empty data to vector DB")
             return []
-
+
         current_time = time.time()
         list_data = [
             {
@@ -137,11 +137,11 @@ class NanoVectorDBStorage(BaseVectorStorage):
         )
         results = [
             {
-                **dp,
-                "id": dp["__id__"],
+                **dp,
+                "id": dp["__id__"],
                 "distance": dp["__metrics__"],
-                "created_at": dp.get("__created_at__")
-            }
+                "created_at": dp.get("__created_at__"),
+            }
             for dp in results
         ]
         return results
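
For context, a hedged sketch of the storage-side behaviour these hunks sit in, using a toy in-memory list rather than NanoVectorDB's real API: upsert tags every stored record with __created_at__, and query exposes that value under the public created_at key, which is what the operate.py changes above read.

import time

_store: list[dict] = []  # toy stand-in for the vector database

def upsert(data: dict[str, dict]) -> list[str]:
    # Tag every stored record with the insertion time, as the diff does.
    current_time = time.time()
    for key, value in data.items():
        _store.append({"__id__": key, "__created_at__": current_time, **value})
    return list(data.keys())

def query() -> list[dict]:
    # Map the internal __created_at__ field to the public "created_at" key.
    return [
        {
            **dp,
            "id": dp["__id__"],
            "created_at": dp.get("__created_at__"),
        }
        for dp in _store
    ]

upsert({"rel-1": {"content": "EntityA -> EntityB"}})
print(query()[0]["created_at"])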