Magicyuan committed
Commit 99287ad · 1 Parent(s): 7564948

feat: add temporal support for knowledge graph relationships


- Add timestamp support to relationships and vector data, recording when each piece of knowledge was acquired (see the sketch below)
- Improve the hybrid query strategy to consider both semantic relevance and temporal order
- Extend the prompt templates to guide the LLM to weigh temporal factors when handling conflicting information
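
The core pattern is small enough to show inline. A minimal sketch (the record shape and values here are hypothetical, not the actual LightRAG data model): stamp each record with an epoch timestamp when it is written, then render it human-readable when building the LLM-facing context:

import time

# Stamp a record at write time (hypothetical record shape)
record = {"src_id": "A", "tgt_id": "B", "description": "A funds B"}
record["__created_at__"] = time.time()

# Render the epoch timestamp for display, tolerating records that predate the change
created_at = record.get("__created_at__", "Unknown")
if isinstance(created_at, (int, float)):
    created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
print(created_at)  # e.g. 2024-01-05 12:34:56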

Files changed (3)
  1. lightrag/operate.py +46 -7
  2. lightrag/prompt.py +19 -2
  3. lightrag/storage.py +11 -1
lightrag/operate.py CHANGED
@@ -30,6 +30,7 @@ from .base import (
     QueryParam,
 )
 from .prompt import GRAPH_FIELD_SEP, PROMPTS
+import time
 
 
 def chunking_by_token_size(
@@ -128,6 +129,9 @@ async def _handle_single_relationship_extraction(
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
+        metadata={
+            "created_at": time.time()
+        }
     )
 
 
@@ -445,6 +449,9 @@ async def extract_entities(
             + dp["src_id"]
             + dp["tgt_id"]
             + dp["description"],
+            "metadata": {
+                "created_at": dp.get("metadata", {}).get("created_at", time.time())
+            }
         }
         for dp in all_relationships_data
     }
@@ -733,9 +740,13 @@ async def _get_node_data(
     entities_context = list_of_list_to_csv(entites_section_list)
 
     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank"]
+        ["id", "source", "target", "description", "keywords", "weight", "rank", "created_at"]
     ]
     for i, e in enumerate(use_relations):
+        created_at = e.get("created_at", "Unknown")
+        # Convert the epoch timestamp to a human-readable format
+        if isinstance(created_at, (int, float)):
+            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
             [
                 i,
@@ -745,6 +756,7 @@ async def _get_node_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
+                created_at
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -882,6 +894,8 @@ async def _get_edge_data(
     if not len(results):
         return "", "", ""
 
+    # Fetch the complete relationship information from the KV store
+    edge_ids = [r["id"] for r in results]
     edge_datas = await asyncio.gather(
         *[knowledge_graph_inst.get_edge(r["src_id"], r["tgt_id"]) for r in results]
     )
@@ -892,7 +906,13 @@ async def _get_edge_data(
         *[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results]
     )
     edge_datas = [
-        {"src_id": k["src_id"], "tgt_id": k["tgt_id"], "rank": d, **v}
+        {
+            "src_id": k["src_id"],
+            "tgt_id": k["tgt_id"],
+            "rank": d,
+            "created_at": k.get("__created_at__", None),  # time metadata from the KV store
+            **v
+        }
         for k, v, d in zip(results, edge_datas, edge_degree)
         if v is not None
     ]
@@ -916,9 +936,13 @@ async def _get_edge_data(
     )
 
     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank"]
+        ["id", "source", "target", "description", "keywords", "weight", "rank", "created_at"]
     ]
     for i, e in enumerate(edge_datas):
+        created_at = e.get("created_at", "Unknown")
+        # Convert the epoch timestamp to a human-readable format
+        if isinstance(created_at, (int, float)):
+            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
             [
                 i,
@@ -928,6 +952,7 @@ async def _get_edge_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
+                created_at
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -1259,9 +1284,15 @@ async def mix_kg_vector_query(
             chunks_ids = [r["id"] for r in results]
             chunks = await text_chunks_db.get_by_ids(chunks_ids)
 
-            valid_chunks = [
-                chunk for chunk in chunks if chunk is not None and "content" in chunk
-            ]
+            valid_chunks = []
+            for chunk, result in zip(chunks, results):
+                if chunk is not None and "content" in chunk:
+                    # Merge the chunk content with its time metadata
+                    chunk_with_time = {
+                        "content": chunk["content"],
+                        "created_at": result.get("created_at", None)
+                    }
+                    valid_chunks.append(chunk_with_time)
 
             if not valid_chunks:
                 return None
@@ -1275,7 +1306,15 @@ async def mix_kg_vector_query(
             if not maybe_trun_chunks:
                 return None
 
-            return "\n--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
+            # Include the creation time in the chunk content
+            formatted_chunks = []
+            for c in maybe_trun_chunks:
+                chunk_text = c["content"]
+                if c["created_at"]:
+                    chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}"
+                formatted_chunks.append(chunk_text)
+
+            return "\n--New Chunk--\n".join(formatted_chunks)
         except Exception as e:
             logger.error(f"Error in get_vector_context: {e}")
             return None
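
For reference, the chunk-formatting logic at the end of this hunk can be exercised standalone; a minimal sketch with a hypothetical chunk dict and timestamp:

import time

chunk = {"content": "Acme acquired Beta Corp.", "created_at": 1700000000.0}  # hypothetical values
chunk_text = chunk["content"]
if chunk["created_at"]:
    # Prepend the acquisition-time header only when a timestamp is present
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(chunk["created_at"]))
    chunk_text = f"[Created at: {stamp}]\n{chunk_text}"
print(chunk_text)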
lightrag/prompt.py CHANGED
@@ -164,6 +164,12 @@ Generate a response of the target length and format that responds to the user's
 If you don't know the answer, just say so. Do not make anything up.
 Do not include information where the supporting evidence for it is not provided.
 
+When handling relationships with timestamps:
+1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting relationships, consider both the semantic content and the timestamp
+3. Don't automatically prefer the most recently created relationships - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
+
 ---Target response length and format---
 
 {response_type}
@@ -172,8 +178,7 @@ Do not include information where the supporting evidence for it is not provided.
 
 {context_data}
 
-Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
-"""
+Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown."""
 
 PROMPTS["keywords_extraction"] = """---Role---
 
@@ -250,6 +255,12 @@ Generate a response of the target length and format that responds to the user's
 If you don't know the answer, just say so. Do not make anything up.
 Do not include information where the supporting evidence for it is not provided.
 
+When handling content with timestamps:
+1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting information, consider both the content and the timestamp
+3. Don't automatically prefer the most recent content - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
+
 ---Target response length and format---
 
 {response_type}
@@ -293,6 +304,12 @@ You are a professional assistant responsible for answering questions based on kn
 
 Generate a concise response that summarizes relevant points from the provided information. If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided.
 
+When handling information with timestamps:
+1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting information, consider both the content/relationship and the timestamp
+3. Don't automatically prefer the most recent information - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
+
 ---Data Sources---
 
 1. Knowledge Graph Data:
lightrag/storage.py CHANGED
@@ -7,6 +7,7 @@ from typing import Any, Union, cast, Dict
 import networkx as nx
 import numpy as np
 from nano_vectordb import NanoVectorDB
+import time
 
 from .utils import (
     logger,
@@ -87,9 +88,12 @@ class NanoVectorDBStorage(BaseVectorStorage):
         if not len(data):
             logger.warning("You insert an empty data to vector DB")
             return []
+
+        current_time = time.time()
         list_data = [
             {
                 "__id__": k,
+                "__created_at__": current_time,
                 **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
             }
             for k, v in data.items()
@@ -132,7 +136,13 @@ class NanoVectorDBStorage(BaseVectorStorage):
             better_than_threshold=self.cosine_better_than_threshold,
         )
         results = [
-            {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results
+            {
+                **dp,
+                "id": dp["__id__"],
+                "distance": dp["__metrics__"],
+                "created_at": dp.get("__created_at__")
+            }
+            for dp in results
         ]
         return results
148