zrguo committed
Commit 3c5ab1e · unverified · 2 Parent(s): 0b317b1 2d9f120

Merge pull request #525 from magicyuan876/main

Files changed (3)
  1. lightrag/operate.py +60 -7
  2. lightrag/prompt.py +19 -2
  3. lightrag/storage.py +11 -1
lightrag/operate.py CHANGED
@@ -30,6 +30,7 @@ from .base import (
     QueryParam,
 )
 from .prompt import GRAPH_FIELD_SEP, PROMPTS
+import time
 
 
 def chunking_by_token_size(
@@ -128,6 +129,7 @@ async def _handle_single_relationship_extraction(
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
+        metadata={"created_at": time.time()},
     )
 
 
@@ -445,6 +447,9 @@ async def extract_entities(
             + dp["src_id"]
             + dp["tgt_id"]
             + dp["description"],
+            "metadata": {
+                "created_at": dp.get("metadata", {}).get("created_at", time.time())
+            },
         }
         for dp in all_relationships_data
     }
@@ -733,9 +738,22 @@ async def _get_node_data(
     entities_context = list_of_list_to_csv(entites_section_list)
 
     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank"]
+        [
+            "id",
+            "source",
+            "target",
+            "description",
+            "keywords",
+            "weight",
+            "rank",
+            "created_at",
+        ]
     ]
     for i, e in enumerate(use_relations):
+        created_at = e.get("created_at", "UNKNOWN")
+        # Convert timestamp to readable format
+        if isinstance(created_at, (int, float)):
+            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
             [
                 i,
@@ -745,6 +763,7 @@ async def _get_node_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
+                created_at,
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -892,7 +911,13 @@ async def _get_edge_data(
         *[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results]
     )
     edge_datas = [
-        {"src_id": k["src_id"], "tgt_id": k["tgt_id"], "rank": d, **v}
+        {
+            "src_id": k["src_id"],
+            "tgt_id": k["tgt_id"],
+            "rank": d,
+            "created_at": k.get("__created_at__", None),  # fetch time metadata from the KV store
+            **v,
+        }
         for k, v, d in zip(results, edge_datas, edge_degree)
         if v is not None
     ]
@@ -916,9 +941,22 @@ async def _get_edge_data(
     )
 
     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank"]
+        [
+            "id",
+            "source",
+            "target",
+            "description",
+            "keywords",
+            "weight",
+            "rank",
+            "created_at",
+        ]
     ]
     for i, e in enumerate(edge_datas):
+        created_at = e.get("created_at", "Unknown")
+        # Convert timestamp to readable format
+        if isinstance(created_at, (int, float)):
+            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
             [
                 i,
@@ -928,6 +966,7 @@ async def _get_edge_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
+                created_at,
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -1259,9 +1298,15 @@ async def mix_kg_vector_query(
             chunks_ids = [r["id"] for r in results]
             chunks = await text_chunks_db.get_by_ids(chunks_ids)
 
-            valid_chunks = [
-                chunk for chunk in chunks if chunk is not None and "content" in chunk
-            ]
+            valid_chunks = []
+            for chunk, result in zip(chunks, results):
+                if chunk is not None and "content" in chunk:
+                    # Merge chunk content and time metadata
+                    chunk_with_time = {
+                        "content": chunk["content"],
+                        "created_at": result.get("created_at", None),
+                    }
+                    valid_chunks.append(chunk_with_time)
 
             if not valid_chunks:
                 return None
@@ -1275,7 +1320,15 @@ async def mix_kg_vector_query(
             if not maybe_trun_chunks:
                 return None
 
-            return "\n--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
+            # Include time information in content
+            formatted_chunks = []
+            for c in maybe_trun_chunks:
+                chunk_text = c["content"]
+                if c["created_at"]:
+                    chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}"
+                formatted_chunks.append(chunk_text)
+
+            return "\n--New Chunk--\n".join(formatted_chunks)
         except Exception as e:
             logger.error(f"Error in get_vector_context: {e}")
             return None
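
The created_at handling added to _get_node_data and _get_edge_data is deliberately defensive: the value may be missing on records written before this change, or arrive as a raw Unix timestamp from time.time(), so it is normalized before landing in the CSV context. A minimal standalone sketch of that normalization; format_created_at and the sample record are illustrative names, not from the repo:

    import time

    def format_created_at(e: dict, default: str = "UNKNOWN") -> str:
        # created_at may be absent (pre-existing records) or a raw
        # Unix timestamp produced by time.time() at extraction time.
        created_at = e.get("created_at", default)
        if isinstance(created_at, (int, float)):
            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
        return created_at

    # Hypothetical edge record shaped like the ones _get_edge_data builds.
    edge = {"src_id": "A", "tgt_id": "B", "rank": 3, "created_at": 1700000000.0}
    print(format_created_at(edge))             # e.g. "2023-11-14 22:13:20" (local time)
    print(format_created_at({"src_id": "A"}))  # "UNKNOWN"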
lightrag/prompt.py CHANGED
@@ -164,6 +164,12 @@ Generate a response of the target length and format that responds to the user's
 If you don't know the answer, just say so. Do not make anything up.
 Do not include information where the supporting evidence for it is not provided.
 
+When handling relationships with timestamps:
+1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting relationships, consider both the semantic content and the timestamp
+3. Don't automatically prefer the most recently created relationships - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
+
 ---Target response length and format---
 
 {response_type}
@@ -172,8 +178,7 @@ Do not include information where the supporting evidence for it is not provided.
 
 {context_data}
 
-Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
-"""
+Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown."""
 
 PROMPTS["keywords_extraction"] = """---Role---
 
@@ -250,6 +255,12 @@ Generate a response of the target length and format that responds to the user's
 If you don't know the answer, just say so. Do not make anything up.
 Do not include information where the supporting evidence for it is not provided.
 
+When handling content with timestamps:
+1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting information, consider both the content and the timestamp
+3. Don't automatically prefer the most recent content - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
+
 ---Target response length and format---
 
 {response_type}
@@ -293,6 +304,12 @@ You are a professional assistant responsible for answering questions based on kn
 
 Generate a concise response that summarizes relevant points from the provided information. If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided.
 
+When handling information with timestamps:
+1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting information, consider both the content/relationship and the timestamp
+3. Don't automatically prefer the most recent information - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
+
 ---Data Sources---
 
 1. Knowledge Graph Data:
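
The templates above are plain Python format strings stored in the PROMPTS dict, so the new timestamp rules ride along in the system prompt every time a template is rendered. A rough sketch of that rendering; the "rag_response" key, the heading above {context_data}, and the format arguments here are illustrative, not lifted from this diff:

    PROMPTS = {}
    PROMPTS["rag_response"] = """Do not include information where the supporting evidence for it is not provided.

    When handling relationships with timestamps:
    1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge

    ---Target response length and format---

    {response_type}

    ---Data tables---

    {context_data}
    """

    # The created_at column produced by _get_node_data / _get_edge_data
    # arrives inside context_data, already converted to a readable string.
    sys_prompt = PROMPTS["rag_response"].format(
        response_type="Multiple Paragraphs",
        context_data=(
            "id,source,target,description,keywords,weight,rank,created_at\n"
            "0,Alice,Bob,collaborators,research,0.9,4,2024-01-01 09:30:00"
        ),
    )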
lightrag/storage.py CHANGED
@@ -7,6 +7,7 @@ from typing import Any, Union, cast, Dict
 import networkx as nx
 import numpy as np
 from nano_vectordb import NanoVectorDB
+import time
 
 from .utils import (
     logger,
@@ -87,9 +88,12 @@ class NanoVectorDBStorage(BaseVectorStorage):
         if not len(data):
             logger.warning("You insert an empty data to vector DB")
             return []
+
+        current_time = time.time()
         list_data = [
             {
                 "__id__": k,
+                "__created_at__": current_time,
                 **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
             }
             for k, v in data.items()
@@ -132,7 +136,13 @@ class NanoVectorDBStorage(BaseVectorStorage):
             better_than_threshold=self.cosine_better_than_threshold,
         )
         results = [
-            {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results
+            {
+                **dp,
+                "id": dp["__id__"],
+                "distance": dp["__metrics__"],
+                "created_at": dp.get("__created_at__"),
+            }
+            for dp in results
         ]
         return results
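
Taken together, the storage change stamps every vector record at upsert time and surfaces the stamp at query time, which is what _get_edge_data reads via k.get("__created_at__") and what mix_kg_vector_query forwards into the chunk text. A minimal sketch of the pattern with a plain dict standing in for nano_vectordb (upsert/query below are hypothetical stand-ins, not the storage API):

    import time

    store: dict[str, dict] = {}

    def upsert(data: dict[str, dict]) -> None:
        current_time = time.time()
        for k, v in data.items():
            # Stamp each record at insert time, mirroring __created_at__.
            store[k] = {"__id__": k, "__created_at__": current_time, **v}

    def query(ids: list[str]) -> list[dict]:
        # Surface the internal stamp under the public "created_at" key.
        return [
            {**dp, "id": dp["__id__"], "created_at": dp.get("__created_at__")}
            for dp in (store[i] for i in ids)
        ]

    upsert({"rel-1": {"src_id": "A", "tgt_id": "B"}})
    print(query(["rel-1"])[0]["created_at"])  # Unix timestamp set at upsert

One consequence worth noting: every upsert re-stamps current_time, so re-inserting an existing id refreshes its timestamp, and created_at effectively records the most recent write rather than first creation.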