童石渊 committed on
Commit
6ef0fd5
·
1 Parent(s): 85f512e

chunk split retry

Browse files
Files changed (3) hide show
  1. lightrag/lightrag.py +18 -16
  2. lightrag/operate.py +128 -119
  3. test.ipynb +740 -0
lightrag/lightrag.py CHANGED
@@ -268,7 +268,7 @@ class LightRAG:
268
  self.llm_model_func,
269
  hashing_kv=self.llm_response_cache
270
  if self.llm_response_cache
271
- and hasattr(self.llm_response_cache, "global_config")
272
  else self.key_string_value_json_storage_cls(
273
  namespace="llm_response_cache",
274
  global_config=asdict(self),
@@ -316,7 +316,9 @@ class LightRAG:
316
 
317
  def insert(self, string_or_strings, split_by_character=None):
318
  loop = always_get_an_event_loop()
319
- return loop.run_until_complete(self.ainsert(string_or_strings, split_by_character))
 
 
320
 
321
  async def ainsert(self, string_or_strings, split_by_character):
322
  """Insert documents with checkpoint support
@@ -357,10 +359,10 @@ class LightRAG:
357
  # Process documents in batches
358
  batch_size = self.addon_params.get("insert_batch_size", 10)
359
  for i in range(0, len(new_docs), batch_size):
360
- batch_docs = dict(list(new_docs.items())[i: i + batch_size])
361
 
362
  for doc_id, doc in tqdm_async(
363
- batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}"
364
  ):
365
  try:
366
  # Update status to processing
@@ -548,7 +550,7 @@ class LightRAG:
548
  # Check if nodes exist in the knowledge graph
549
  for need_insert_id in [src_id, tgt_id]:
550
  if not (
551
- await self.chunk_entity_relation_graph.has_node(need_insert_id)
552
  ):
553
  await self.chunk_entity_relation_graph.upsert_node(
554
  need_insert_id,
@@ -597,9 +599,9 @@ class LightRAG:
597
  "src_id": dp["src_id"],
598
  "tgt_id": dp["tgt_id"],
599
  "content": dp["keywords"]
600
- + dp["src_id"]
601
- + dp["tgt_id"]
602
- + dp["description"],
603
  }
604
  for dp in all_relationships_data
605
  }
@@ -624,7 +626,7 @@ class LightRAG:
624
  asdict(self),
625
  hashing_kv=self.llm_response_cache
626
  if self.llm_response_cache
627
- and hasattr(self.llm_response_cache, "global_config")
628
  else self.key_string_value_json_storage_cls(
629
  namespace="llm_response_cache",
630
  global_config=asdict(self),
@@ -640,7 +642,7 @@ class LightRAG:
640
  asdict(self),
641
  hashing_kv=self.llm_response_cache
642
  if self.llm_response_cache
643
- and hasattr(self.llm_response_cache, "global_config")
644
  else self.key_string_value_json_storage_cls(
645
  namespace="llm_response_cache",
646
  global_config=asdict(self),
@@ -659,7 +661,7 @@ class LightRAG:
659
  asdict(self),
660
  hashing_kv=self.llm_response_cache
661
  if self.llm_response_cache
662
- and hasattr(self.llm_response_cache, "global_config")
663
  else self.key_string_value_json_storage_cls(
664
  namespace="llm_response_cache",
665
  global_config=asdict(self),
@@ -900,7 +902,7 @@ class LightRAG:
900
  dp
901
  for dp in self.entities_vdb.client_storage["data"]
902
  if chunk_id
903
- in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
904
  ]
905
  if entities_with_chunk:
906
  logger.error(
@@ -912,7 +914,7 @@ class LightRAG:
912
  dp
913
  for dp in self.relationships_vdb.client_storage["data"]
914
  if chunk_id
915
- in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
916
  ]
917
  if relations_with_chunk:
918
  logger.error(
@@ -929,7 +931,7 @@ class LightRAG:
929
  return asyncio.run(self.adelete_by_doc_id(doc_id))
930
 
931
  async def get_entity_info(
932
- self, entity_name: str, include_vector_data: bool = False
933
  ):
934
  """Get detailed information of an entity
935
 
@@ -980,7 +982,7 @@ class LightRAG:
980
  tracemalloc.stop()
981
 
982
  async def get_relation_info(
983
- self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
984
  ):
985
  """Get detailed information of a relationship
986
 
@@ -1022,7 +1024,7 @@ class LightRAG:
1022
  return result
1023
 
1024
  def get_relation_info_sync(
1025
- self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
1026
  ):
1027
  """Synchronous version of getting relationship information
1028
 
 
268
  self.llm_model_func,
269
  hashing_kv=self.llm_response_cache
270
  if self.llm_response_cache
271
+ and hasattr(self.llm_response_cache, "global_config")
272
  else self.key_string_value_json_storage_cls(
273
  namespace="llm_response_cache",
274
  global_config=asdict(self),
 
316
 
317
  def insert(self, string_or_strings, split_by_character=None):
318
  loop = always_get_an_event_loop()
319
+ return loop.run_until_complete(
320
+ self.ainsert(string_or_strings, split_by_character)
321
+ )
322
 
323
  async def ainsert(self, string_or_strings, split_by_character):
324
  """Insert documents with checkpoint support
 
359
  # Process documents in batches
360
  batch_size = self.addon_params.get("insert_batch_size", 10)
361
  for i in range(0, len(new_docs), batch_size):
362
+ batch_docs = dict(list(new_docs.items())[i : i + batch_size])
363
 
364
  for doc_id, doc in tqdm_async(
365
+ batch_docs.items(), desc=f"Processing batch {i // batch_size + 1}"
366
  ):
367
  try:
368
  # Update status to processing
 
550
  # Check if nodes exist in the knowledge graph
551
  for need_insert_id in [src_id, tgt_id]:
552
  if not (
553
+ await self.chunk_entity_relation_graph.has_node(need_insert_id)
554
  ):
555
  await self.chunk_entity_relation_graph.upsert_node(
556
  need_insert_id,
 
599
  "src_id": dp["src_id"],
600
  "tgt_id": dp["tgt_id"],
601
  "content": dp["keywords"]
602
+ + dp["src_id"]
603
+ + dp["tgt_id"]
604
+ + dp["description"],
605
  }
606
  for dp in all_relationships_data
607
  }
 
626
  asdict(self),
627
  hashing_kv=self.llm_response_cache
628
  if self.llm_response_cache
629
+ and hasattr(self.llm_response_cache, "global_config")
630
  else self.key_string_value_json_storage_cls(
631
  namespace="llm_response_cache",
632
  global_config=asdict(self),
 
642
  asdict(self),
643
  hashing_kv=self.llm_response_cache
644
  if self.llm_response_cache
645
+ and hasattr(self.llm_response_cache, "global_config")
646
  else self.key_string_value_json_storage_cls(
647
  namespace="llm_response_cache",
648
  global_config=asdict(self),
 
661
  asdict(self),
662
  hashing_kv=self.llm_response_cache
663
  if self.llm_response_cache
664
+ and hasattr(self.llm_response_cache, "global_config")
665
  else self.key_string_value_json_storage_cls(
666
  namespace="llm_response_cache",
667
  global_config=asdict(self),
 
902
  dp
903
  for dp in self.entities_vdb.client_storage["data"]
904
  if chunk_id
905
+ in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
906
  ]
907
  if entities_with_chunk:
908
  logger.error(
 
914
  dp
915
  for dp in self.relationships_vdb.client_storage["data"]
916
  if chunk_id
917
+ in (dp.get("source_id") or "").split(GRAPH_FIELD_SEP)
918
  ]
919
  if relations_with_chunk:
920
  logger.error(
 
931
  return asyncio.run(self.adelete_by_doc_id(doc_id))
932
 
933
  async def get_entity_info(
934
+ self, entity_name: str, include_vector_data: bool = False
935
  ):
936
  """Get detailed information of an entity
937
 
 
982
  tracemalloc.stop()
983
 
984
  async def get_relation_info(
985
+ self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
986
  ):
987
  """Get detailed information of a relationship
988
 
 
1024
  return result
1025
 
1026
  def get_relation_info_sync(
1027
+ self, src_entity: str, tgt_entity: str, include_vector_data: bool = False
1028
  ):
1029
  """Synchronous version of getting relationship information
1030
 
lightrag/operate.py CHANGED
@@ -34,7 +34,11 @@ import time
34
 
35
 
36
  def chunking_by_token_size(
37
- content: str, split_by_character=None, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o"
 
 
 
 
38
  ):
39
  tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
40
  results = []
@@ -44,11 +48,16 @@ def chunking_by_token_size(
44
  for chunk in raw_chunks:
45
  _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model)
46
  if len(_tokens) > max_token_size:
47
- for start in range(0, len(_tokens), max_token_size - overlap_token_size):
 
 
48
  chunk_content = decode_tokens_by_tiktoken(
49
- _tokens[start: start + max_token_size], model_name=tiktoken_model
 
 
 
 
50
  )
51
- new_chunks.append((min(max_token_size, len(_tokens) - start), chunk_content))
52
  else:
53
  new_chunks.append((len(_tokens), chunk))
54
  for index, (_len, chunk) in enumerate(new_chunks):
@@ -61,10 +70,10 @@ def chunking_by_token_size(
61
  )
62
  else:
63
  for index, start in enumerate(
64
- range(0, len(tokens), max_token_size - overlap_token_size)
65
  ):
66
  chunk_content = decode_tokens_by_tiktoken(
67
- tokens[start: start + max_token_size], model_name=tiktoken_model
68
  )
69
  results.append(
70
  {
@@ -77,9 +86,9 @@ def chunking_by_token_size(
77
 
78
 
79
  async def _handle_entity_relation_summary(
80
- entity_or_relation_name: str,
81
- description: str,
82
- global_config: dict,
83
  ) -> str:
84
  use_llm_func: callable = global_config["llm_model_func"]
85
  llm_max_tokens = global_config["llm_model_max_token_size"]
@@ -108,8 +117,8 @@ async def _handle_entity_relation_summary(
108
 
109
 
110
  async def _handle_single_entity_extraction(
111
- record_attributes: list[str],
112
- chunk_key: str,
113
  ):
114
  if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
115
  return None
@@ -129,8 +138,8 @@ async def _handle_single_entity_extraction(
129
 
130
 
131
  async def _handle_single_relationship_extraction(
132
- record_attributes: list[str],
133
- chunk_key: str,
134
  ):
135
  if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
136
  return None
@@ -156,10 +165,10 @@ async def _handle_single_relationship_extraction(
156
 
157
 
158
  async def _merge_nodes_then_upsert(
159
- entity_name: str,
160
- nodes_data: list[dict],
161
- knowledge_graph_inst: BaseGraphStorage,
162
- global_config: dict,
163
  ):
164
  already_entity_types = []
165
  already_source_ids = []
@@ -203,11 +212,11 @@ async def _merge_nodes_then_upsert(
203
 
204
 
205
  async def _merge_edges_then_upsert(
206
- src_id: str,
207
- tgt_id: str,
208
- edges_data: list[dict],
209
- knowledge_graph_inst: BaseGraphStorage,
210
- global_config: dict,
211
  ):
212
  already_weights = []
213
  already_source_ids = []
@@ -270,12 +279,12 @@ async def _merge_edges_then_upsert(
270
 
271
 
272
  async def extract_entities(
273
- chunks: dict[str, TextChunkSchema],
274
- knowledge_graph_inst: BaseGraphStorage,
275
- entity_vdb: BaseVectorStorage,
276
- relationships_vdb: BaseVectorStorage,
277
- global_config: dict,
278
- llm_response_cache: BaseKVStorage = None,
279
  ) -> Union[BaseGraphStorage, None]:
280
  use_llm_func: callable = global_config["llm_model_func"]
281
  entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"]
@@ -327,13 +336,13 @@ async def extract_entities(
327
  already_relations = 0
328
 
329
  async def _user_llm_func_with_cache(
330
- input_text: str, history_messages: list[dict[str, str]] = None
331
  ) -> str:
332
  if enable_llm_cache_for_entity_extract and llm_response_cache:
333
  need_to_restore = False
334
  if (
335
- global_config["embedding_cache_config"]
336
- and global_config["embedding_cache_config"]["enabled"]
337
  ):
338
  new_config = global_config.copy()
339
  new_config["embedding_cache_config"] = None
@@ -435,7 +444,7 @@ async def extract_entities(
435
  already_relations += len(maybe_edges)
436
  now_ticks = PROMPTS["process_tickers"][
437
  already_processed % len(PROMPTS["process_tickers"])
438
- ]
439
  print(
440
  f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
441
  end="",
@@ -445,10 +454,10 @@ async def extract_entities(
445
 
446
  results = []
447
  for result in tqdm_async(
448
- asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]),
449
- total=len(ordered_chunks),
450
- desc="Extracting entities from chunks",
451
- unit="chunk",
452
  ):
453
  results.append(await result)
454
 
@@ -462,32 +471,32 @@ async def extract_entities(
462
  logger.info("Inserting entities into storage...")
463
  all_entities_data = []
464
  for result in tqdm_async(
465
- asyncio.as_completed(
466
- [
467
- _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
468
- for k, v in maybe_nodes.items()
469
- ]
470
- ),
471
- total=len(maybe_nodes),
472
- desc="Inserting entities",
473
- unit="entity",
474
  ):
475
  all_entities_data.append(await result)
476
 
477
  logger.info("Inserting relationships into storage...")
478
  all_relationships_data = []
479
  for result in tqdm_async(
480
- asyncio.as_completed(
481
- [
482
- _merge_edges_then_upsert(
483
- k[0], k[1], v, knowledge_graph_inst, global_config
484
- )
485
- for k, v in maybe_edges.items()
486
- ]
487
- ),
488
- total=len(maybe_edges),
489
- desc="Inserting relationships",
490
- unit="relationship",
491
  ):
492
  all_relationships_data.append(await result)
493
 
@@ -518,9 +527,9 @@ async def extract_entities(
518
  "src_id": dp["src_id"],
519
  "tgt_id": dp["tgt_id"],
520
  "content": dp["keywords"]
521
- + dp["src_id"]
522
- + dp["tgt_id"]
523
- + dp["description"],
524
  "metadata": {
525
  "created_at": dp.get("metadata", {}).get("created_at", time.time())
526
  },
@@ -533,14 +542,14 @@ async def extract_entities(
533
 
534
 
535
  async def kg_query(
536
- query,
537
- knowledge_graph_inst: BaseGraphStorage,
538
- entities_vdb: BaseVectorStorage,
539
- relationships_vdb: BaseVectorStorage,
540
- text_chunks_db: BaseKVStorage[TextChunkSchema],
541
- query_param: QueryParam,
542
- global_config: dict,
543
- hashing_kv: BaseKVStorage = None,
544
  ) -> str:
545
  # Handle cache
546
  use_model_func = global_config["llm_model_func"]
@@ -660,12 +669,12 @@ async def kg_query(
660
 
661
 
662
  async def _build_query_context(
663
- query: list,
664
- knowledge_graph_inst: BaseGraphStorage,
665
- entities_vdb: BaseVectorStorage,
666
- relationships_vdb: BaseVectorStorage,
667
- text_chunks_db: BaseKVStorage[TextChunkSchema],
668
- query_param: QueryParam,
669
  ):
670
  # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", ""
671
  # hl_entities_context, hl_relations_context, hl_text_units_context = "", "", ""
@@ -718,9 +727,9 @@ async def _build_query_context(
718
  query_param,
719
  )
720
  if (
721
- hl_entities_context == ""
722
- and hl_relations_context == ""
723
- and hl_text_units_context == ""
724
  ):
725
  logger.warn("No high level context found. Switching to local mode.")
726
  query_param.mode = "local"
@@ -759,11 +768,11 @@ async def _build_query_context(
759
 
760
 
761
  async def _get_node_data(
762
- query,
763
- knowledge_graph_inst: BaseGraphStorage,
764
- entities_vdb: BaseVectorStorage,
765
- text_chunks_db: BaseKVStorage[TextChunkSchema],
766
- query_param: QueryParam,
767
  ):
768
  # get similar entities
769
  results = await entities_vdb.query(query, top_k=query_param.top_k)
@@ -850,10 +859,10 @@ async def _get_node_data(
850
 
851
 
852
  async def _find_most_related_text_unit_from_entities(
853
- node_datas: list[dict],
854
- query_param: QueryParam,
855
- text_chunks_db: BaseKVStorage[TextChunkSchema],
856
- knowledge_graph_inst: BaseGraphStorage,
857
  ):
858
  text_units = [
859
  split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP])
@@ -893,8 +902,8 @@ async def _find_most_related_text_unit_from_entities(
893
  if this_edges:
894
  for e in this_edges:
895
  if (
896
- e[1] in all_one_hop_text_units_lookup
897
- and c_id in all_one_hop_text_units_lookup[e[1]]
898
  ):
899
  all_text_units_lookup[c_id]["relation_counts"] += 1
900
 
@@ -924,9 +933,9 @@ async def _find_most_related_text_unit_from_entities(
924
 
925
 
926
  async def _find_most_related_edges_from_entities(
927
- node_datas: list[dict],
928
- query_param: QueryParam,
929
- knowledge_graph_inst: BaseGraphStorage,
930
  ):
931
  all_related_edges = await asyncio.gather(
932
  *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas]
@@ -964,11 +973,11 @@ async def _find_most_related_edges_from_entities(
964
 
965
 
966
  async def _get_edge_data(
967
- keywords,
968
- knowledge_graph_inst: BaseGraphStorage,
969
- relationships_vdb: BaseVectorStorage,
970
- text_chunks_db: BaseKVStorage[TextChunkSchema],
971
- query_param: QueryParam,
972
  ):
973
  results = await relationships_vdb.query(keywords, top_k=query_param.top_k)
974
 
@@ -1066,9 +1075,9 @@ async def _get_edge_data(
1066
 
1067
 
1068
  async def _find_most_related_entities_from_relationships(
1069
- edge_datas: list[dict],
1070
- query_param: QueryParam,
1071
- knowledge_graph_inst: BaseGraphStorage,
1072
  ):
1073
  entity_names = []
1074
  seen = set()
@@ -1103,10 +1112,10 @@ async def _find_most_related_entities_from_relationships(
1103
 
1104
 
1105
  async def _find_related_text_unit_from_relationships(
1106
- edge_datas: list[dict],
1107
- query_param: QueryParam,
1108
- text_chunks_db: BaseKVStorage[TextChunkSchema],
1109
- knowledge_graph_inst: BaseGraphStorage,
1110
  ):
1111
  text_units = [
1112
  split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP])
@@ -1172,12 +1181,12 @@ def combine_contexts(entities, relationships, sources):
1172
 
1173
 
1174
  async def naive_query(
1175
- query,
1176
- chunks_vdb: BaseVectorStorage,
1177
- text_chunks_db: BaseKVStorage[TextChunkSchema],
1178
- query_param: QueryParam,
1179
- global_config: dict,
1180
- hashing_kv: BaseKVStorage = None,
1181
  ):
1182
  # Handle cache
1183
  use_model_func = global_config["llm_model_func"]
@@ -1235,7 +1244,7 @@ async def naive_query(
1235
 
1236
  if len(response) > len(sys_prompt):
1237
  response = (
1238
- response[len(sys_prompt):]
1239
  .replace(sys_prompt, "")
1240
  .replace("user", "")
1241
  .replace("model", "")
@@ -1263,15 +1272,15 @@ async def naive_query(
1263
 
1264
 
1265
  async def mix_kg_vector_query(
1266
- query,
1267
- knowledge_graph_inst: BaseGraphStorage,
1268
- entities_vdb: BaseVectorStorage,
1269
- relationships_vdb: BaseVectorStorage,
1270
- chunks_vdb: BaseVectorStorage,
1271
- text_chunks_db: BaseKVStorage[TextChunkSchema],
1272
- query_param: QueryParam,
1273
- global_config: dict,
1274
- hashing_kv: BaseKVStorage = None,
1275
  ) -> str:
1276
  """
1277
  Hybrid retrieval implementation combining knowledge graph and vector search.
@@ -1296,7 +1305,7 @@ async def mix_kg_vector_query(
1296
  # Reuse keyword extraction logic from kg_query
1297
  example_number = global_config["addon_params"].get("example_number", None)
1298
  if example_number and example_number < len(
1299
- PROMPTS["keywords_extraction_examples"]
1300
  ):
1301
  examples = "\n".join(
1302
  PROMPTS["keywords_extraction_examples"][: int(example_number)]
 
34
 
35
 
36
  def chunking_by_token_size(
37
+ content: str,
38
+ split_by_character=None,
39
+ overlap_token_size=128,
40
+ max_token_size=1024,
41
+ tiktoken_model="gpt-4o",
42
  ):
43
  tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
44
  results = []
 
48
  for chunk in raw_chunks:
49
  _tokens = encode_string_by_tiktoken(chunk, model_name=tiktoken_model)
50
  if len(_tokens) > max_token_size:
51
+ for start in range(
52
+ 0, len(_tokens), max_token_size - overlap_token_size
53
+ ):
54
  chunk_content = decode_tokens_by_tiktoken(
55
+ _tokens[start : start + max_token_size],
56
+ model_name=tiktoken_model,
57
+ )
58
+ new_chunks.append(
59
+ (min(max_token_size, len(_tokens) - start), chunk_content)
60
  )
 
61
  else:
62
  new_chunks.append((len(_tokens), chunk))
63
  for index, (_len, chunk) in enumerate(new_chunks):
 
70
  )
71
  else:
72
  for index, start in enumerate(
73
+ range(0, len(tokens), max_token_size - overlap_token_size)
74
  ):
75
  chunk_content = decode_tokens_by_tiktoken(
76
+ tokens[start : start + max_token_size], model_name=tiktoken_model
77
  )
78
  results.append(
79
  {
 
86
 
87
 
88
  async def _handle_entity_relation_summary(
89
+ entity_or_relation_name: str,
90
+ description: str,
91
+ global_config: dict,
92
  ) -> str:
93
  use_llm_func: callable = global_config["llm_model_func"]
94
  llm_max_tokens = global_config["llm_model_max_token_size"]
 
117
 
118
 
119
  async def _handle_single_entity_extraction(
120
+ record_attributes: list[str],
121
+ chunk_key: str,
122
  ):
123
  if len(record_attributes) < 4 or record_attributes[0] != '"entity"':
124
  return None
 
138
 
139
 
140
  async def _handle_single_relationship_extraction(
141
+ record_attributes: list[str],
142
+ chunk_key: str,
143
  ):
144
  if len(record_attributes) < 5 or record_attributes[0] != '"relationship"':
145
  return None
 
165
 
166
 
167
  async def _merge_nodes_then_upsert(
168
+ entity_name: str,
169
+ nodes_data: list[dict],
170
+ knowledge_graph_inst: BaseGraphStorage,
171
+ global_config: dict,
172
  ):
173
  already_entity_types = []
174
  already_source_ids = []
 
212
 
213
 
214
  async def _merge_edges_then_upsert(
215
+ src_id: str,
216
+ tgt_id: str,
217
+ edges_data: list[dict],
218
+ knowledge_graph_inst: BaseGraphStorage,
219
+ global_config: dict,
220
  ):
221
  already_weights = []
222
  already_source_ids = []
 
279
 
280
 
281
  async def extract_entities(
282
+ chunks: dict[str, TextChunkSchema],
283
+ knowledge_graph_inst: BaseGraphStorage,
284
+ entity_vdb: BaseVectorStorage,
285
+ relationships_vdb: BaseVectorStorage,
286
+ global_config: dict,
287
+ llm_response_cache: BaseKVStorage = None,
288
  ) -> Union[BaseGraphStorage, None]:
289
  use_llm_func: callable = global_config["llm_model_func"]
290
  entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"]
 
336
  already_relations = 0
337
 
338
  async def _user_llm_func_with_cache(
339
+ input_text: str, history_messages: list[dict[str, str]] = None
340
  ) -> str:
341
  if enable_llm_cache_for_entity_extract and llm_response_cache:
342
  need_to_restore = False
343
  if (
344
+ global_config["embedding_cache_config"]
345
+ and global_config["embedding_cache_config"]["enabled"]
346
  ):
347
  new_config = global_config.copy()
348
  new_config["embedding_cache_config"] = None
 
444
  already_relations += len(maybe_edges)
445
  now_ticks = PROMPTS["process_tickers"][
446
  already_processed % len(PROMPTS["process_tickers"])
447
+ ]
448
  print(
449
  f"{now_ticks} Processed {already_processed} chunks, {already_entities} entities(duplicated), {already_relations} relations(duplicated)\r",
450
  end="",
 
454
 
455
  results = []
456
  for result in tqdm_async(
457
+ asyncio.as_completed([_process_single_content(c) for c in ordered_chunks]),
458
+ total=len(ordered_chunks),
459
+ desc="Extracting entities from chunks",
460
+ unit="chunk",
461
  ):
462
  results.append(await result)
463
 
 
471
  logger.info("Inserting entities into storage...")
472
  all_entities_data = []
473
  for result in tqdm_async(
474
+ asyncio.as_completed(
475
+ [
476
+ _merge_nodes_then_upsert(k, v, knowledge_graph_inst, global_config)
477
+ for k, v in maybe_nodes.items()
478
+ ]
479
+ ),
480
+ total=len(maybe_nodes),
481
+ desc="Inserting entities",
482
+ unit="entity",
483
  ):
484
  all_entities_data.append(await result)
485
 
486
  logger.info("Inserting relationships into storage...")
487
  all_relationships_data = []
488
  for result in tqdm_async(
489
+ asyncio.as_completed(
490
+ [
491
+ _merge_edges_then_upsert(
492
+ k[0], k[1], v, knowledge_graph_inst, global_config
493
+ )
494
+ for k, v in maybe_edges.items()
495
+ ]
496
+ ),
497
+ total=len(maybe_edges),
498
+ desc="Inserting relationships",
499
+ unit="relationship",
500
  ):
501
  all_relationships_data.append(await result)
502
 
 
527
  "src_id": dp["src_id"],
528
  "tgt_id": dp["tgt_id"],
529
  "content": dp["keywords"]
530
+ + dp["src_id"]
531
+ + dp["tgt_id"]
532
+ + dp["description"],
533
  "metadata": {
534
  "created_at": dp.get("metadata", {}).get("created_at", time.time())
535
  },
 
542
 
543
 
544
  async def kg_query(
545
+ query,
546
+ knowledge_graph_inst: BaseGraphStorage,
547
+ entities_vdb: BaseVectorStorage,
548
+ relationships_vdb: BaseVectorStorage,
549
+ text_chunks_db: BaseKVStorage[TextChunkSchema],
550
+ query_param: QueryParam,
551
+ global_config: dict,
552
+ hashing_kv: BaseKVStorage = None,
553
  ) -> str:
554
  # Handle cache
555
  use_model_func = global_config["llm_model_func"]
 
669
 
670
 
671
  async def _build_query_context(
672
+ query: list,
673
+ knowledge_graph_inst: BaseGraphStorage,
674
+ entities_vdb: BaseVectorStorage,
675
+ relationships_vdb: BaseVectorStorage,
676
+ text_chunks_db: BaseKVStorage[TextChunkSchema],
677
+ query_param: QueryParam,
678
  ):
679
  # ll_entities_context, ll_relations_context, ll_text_units_context = "", "", ""
680
  # hl_entities_context, hl_relations_context, hl_text_units_context = "", "", ""
 
727
  query_param,
728
  )
729
  if (
730
+ hl_entities_context == ""
731
+ and hl_relations_context == ""
732
+ and hl_text_units_context == ""
733
  ):
734
  logger.warn("No high level context found. Switching to local mode.")
735
  query_param.mode = "local"
 
768
 
769
 
770
  async def _get_node_data(
771
+ query,
772
+ knowledge_graph_inst: BaseGraphStorage,
773
+ entities_vdb: BaseVectorStorage,
774
+ text_chunks_db: BaseKVStorage[TextChunkSchema],
775
+ query_param: QueryParam,
776
  ):
777
  # get similar entities
778
  results = await entities_vdb.query(query, top_k=query_param.top_k)
 
859
 
860
 
861
  async def _find_most_related_text_unit_from_entities(
862
+ node_datas: list[dict],
863
+ query_param: QueryParam,
864
+ text_chunks_db: BaseKVStorage[TextChunkSchema],
865
+ knowledge_graph_inst: BaseGraphStorage,
866
  ):
867
  text_units = [
868
  split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP])
 
902
  if this_edges:
903
  for e in this_edges:
904
  if (
905
+ e[1] in all_one_hop_text_units_lookup
906
+ and c_id in all_one_hop_text_units_lookup[e[1]]
907
  ):
908
  all_text_units_lookup[c_id]["relation_counts"] += 1
909
 
 
933
 
934
 
935
  async def _find_most_related_edges_from_entities(
936
+ node_datas: list[dict],
937
+ query_param: QueryParam,
938
+ knowledge_graph_inst: BaseGraphStorage,
939
  ):
940
  all_related_edges = await asyncio.gather(
941
  *[knowledge_graph_inst.get_node_edges(dp["entity_name"]) for dp in node_datas]
 
973
 
974
 
975
  async def _get_edge_data(
976
+ keywords,
977
+ knowledge_graph_inst: BaseGraphStorage,
978
+ relationships_vdb: BaseVectorStorage,
979
+ text_chunks_db: BaseKVStorage[TextChunkSchema],
980
+ query_param: QueryParam,
981
  ):
982
  results = await relationships_vdb.query(keywords, top_k=query_param.top_k)
983
 
 
1075
 
1076
 
1077
  async def _find_most_related_entities_from_relationships(
1078
+ edge_datas: list[dict],
1079
+ query_param: QueryParam,
1080
+ knowledge_graph_inst: BaseGraphStorage,
1081
  ):
1082
  entity_names = []
1083
  seen = set()
 
1112
 
1113
 
1114
  async def _find_related_text_unit_from_relationships(
1115
+ edge_datas: list[dict],
1116
+ query_param: QueryParam,
1117
+ text_chunks_db: BaseKVStorage[TextChunkSchema],
1118
+ knowledge_graph_inst: BaseGraphStorage,
1119
  ):
1120
  text_units = [
1121
  split_string_by_multi_markers(dp["source_id"], [GRAPH_FIELD_SEP])
 
1181
 
1182
 
1183
  async def naive_query(
1184
+ query,
1185
+ chunks_vdb: BaseVectorStorage,
1186
+ text_chunks_db: BaseKVStorage[TextChunkSchema],
1187
+ query_param: QueryParam,
1188
+ global_config: dict,
1189
+ hashing_kv: BaseKVStorage = None,
1190
  ):
1191
  # Handle cache
1192
  use_model_func = global_config["llm_model_func"]
 
1244
 
1245
  if len(response) > len(sys_prompt):
1246
  response = (
1247
+ response[len(sys_prompt) :]
1248
  .replace(sys_prompt, "")
1249
  .replace("user", "")
1250
  .replace("model", "")
 
1272
 
1273
 
1274
  async def mix_kg_vector_query(
1275
+ query,
1276
+ knowledge_graph_inst: BaseGraphStorage,
1277
+ entities_vdb: BaseVectorStorage,
1278
+ relationships_vdb: BaseVectorStorage,
1279
+ chunks_vdb: BaseVectorStorage,
1280
+ text_chunks_db: BaseKVStorage[TextChunkSchema],
1281
+ query_param: QueryParam,
1282
+ global_config: dict,
1283
+ hashing_kv: BaseKVStorage = None,
1284
  ) -> str:
1285
  """
1286
  Hybrid retrieval implementation combining knowledge graph and vector search.
 
1305
  # Reuse keyword extraction logic from kg_query
1306
  example_number = global_config["addon_params"].get("example_number", None)
1307
  if example_number and example_number < len(
1308
+ PROMPTS["keywords_extraction_examples"]
1309
  ):
1310
  examples = "\n".join(
1311
  PROMPTS["keywords_extraction_examples"][: int(example_number)]
test.ipynb ADDED
@@ -0,0 +1,740 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "4b5690db12e34685",
7
+ "metadata": {
8
+ "ExecuteTime": {
9
+ "end_time": "2025-01-07T05:38:34.174205Z",
10
+ "start_time": "2025-01-07T05:38:29.978194Z"
11
+ }
12
+ },
13
+ "outputs": [],
14
+ "source": [
15
+ "import os\n",
16
+ "import logging\n",
17
+ "import numpy as np\n",
18
+ "from lightrag import LightRAG, QueryParam\n",
19
+ "from lightrag.llm import openai_complete_if_cache, openai_embedding\n",
20
+ "from lightrag.utils import EmbeddingFunc\n",
21
+ "import nest_asyncio"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 2,
27
+ "id": "8c8ee7c061bf9159",
28
+ "metadata": {
29
+ "ExecuteTime": {
30
+ "end_time": "2025-01-07T05:38:37.440083Z",
31
+ "start_time": "2025-01-07T05:38:37.437666Z"
32
+ }
33
+ },
34
+ "outputs": [],
35
+ "source": [
36
+ "nest_asyncio.apply()\n",
37
+ "WORKING_DIR = \"../llm_rag/paper_db/R000088_test2\"\n",
38
+ "logging.basicConfig(format=\"%(levelname)s:%(message)s\", level=logging.INFO)\n",
39
+ "if not os.path.exists(WORKING_DIR):\n",
40
+ " os.mkdir(WORKING_DIR)\n",
41
+ "os.environ[\"doubao_api\"] = \"6b890250-0cf6-4eb1-aa82-9c9d711398a7\""
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 3,
47
+ "id": "a5009d16e0851dca",
48
+ "metadata": {
49
+ "ExecuteTime": {
50
+ "end_time": "2025-01-07T05:38:42.594315Z",
51
+ "start_time": "2025-01-07T05:38:42.590800Z"
52
+ }
53
+ },
54
+ "outputs": [],
55
+ "source": [
56
+ "async def llm_model_func(\n",
57
+ " prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs\n",
58
+ ") -> str:\n",
59
+ " return await openai_complete_if_cache(\n",
60
+ " \"ep-20241218114828-2tlww\",\n",
61
+ " prompt,\n",
62
+ " system_prompt=system_prompt,\n",
63
+ " history_messages=history_messages,\n",
64
+ " api_key=os.getenv(\"doubao_api\"),\n",
65
+ " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
66
+ " **kwargs,\n",
67
+ " )\n",
68
+ "\n",
69
+ "\n",
70
+ "async def embedding_func(texts: list[str]) -> np.ndarray:\n",
71
+ " return await openai_embedding(\n",
72
+ " texts,\n",
73
+ " model=\"ep-20241231173413-pgjmk\",\n",
74
+ " api_key=os.getenv(\"doubao_api\"),\n",
75
+ " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
76
+ " )"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 4,
82
+ "id": "397fcad24ce4d0ed",
83
+ "metadata": {
84
+ "ExecuteTime": {
85
+ "end_time": "2025-01-07T05:38:44.016901Z",
86
+ "start_time": "2025-01-07T05:38:44.006291Z"
87
+ }
88
+ },
89
+ "outputs": [
90
+ {
91
+ "name": "stderr",
92
+ "output_type": "stream",
93
+ "text": [
94
+ "INFO:lightrag:Logger initialized for working directory: ../llm_rag/paper_db/R000088_test2\n",
95
+ "INFO:lightrag:Load KV llm_response_cache with 0 data\n",
96
+ "INFO:lightrag:Load KV full_docs with 0 data\n",
97
+ "INFO:lightrag:Load KV text_chunks with 0 data\n",
98
+ "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_entities.json'} 0 data\n",
99
+ "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_relationships.json'} 0 data\n",
100
+ "INFO:nano-vectordb:Init {'embedding_dim': 4096, 'metric': 'cosine', 'storage_file': '../llm_rag/paper_db/R000088_test2/vdb_chunks.json'} 0 data\n",
101
+ "INFO:lightrag:Loaded document status storage with 0 records\n"
102
+ ]
103
+ }
104
+ ],
105
+ "source": [
106
+ "rag = LightRAG(\n",
107
+ " working_dir=WORKING_DIR,\n",
108
+ " llm_model_func=llm_model_func,\n",
109
+ " embedding_func=EmbeddingFunc(\n",
110
+ " embedding_dim=4096, max_token_size=8192, func=embedding_func\n",
111
+ " ),\n",
112
+ ")"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": 5,
118
+ "id": "1dc3603677f7484d",
119
+ "metadata": {
120
+ "ExecuteTime": {
121
+ "end_time": "2025-01-07T05:38:47.509111Z",
122
+ "start_time": "2025-01-07T05:38:47.501997Z"
123
+ }
124
+ },
125
+ "outputs": [],
126
+ "source": [
127
+ "with open(\n",
128
+ " \"../llm_rag/example/R000088/auto/R000088_full_txt.md\", \"r\", encoding=\"utf-8\"\n",
129
+ ") as f:\n",
130
+ " content = f.read()\n",
131
+ "\n",
132
+ "\n",
133
+ "async def embedding_func(texts: list[str]) -> np.ndarray:\n",
134
+ " return await openai_embedding(\n",
135
+ " texts,\n",
136
+ " model=\"ep-20241231173413-pgjmk\",\n",
137
+ " api_key=os.getenv(\"doubao_api\"),\n",
138
+ " base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
139
+ " )\n",
140
+ "\n",
141
+ "\n",
142
+ "async def get_embedding_dim():\n",
143
+ " test_text = [\"This is a test sentence.\"]\n",
144
+ " embedding = await embedding_func(test_text)\n",
145
+ " embedding_dim = embedding.shape[1]\n",
146
+ " return embedding_dim"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 6,
152
+ "id": "6844202606acfbe5",
153
+ "metadata": {
154
+ "ExecuteTime": {
155
+ "end_time": "2025-01-07T05:38:50.666764Z",
156
+ "start_time": "2025-01-07T05:38:50.247712Z"
157
+ }
158
+ },
159
+ "outputs": [
160
+ {
161
+ "name": "stderr",
162
+ "output_type": "stream",
163
+ "text": [
164
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n"
165
+ ]
166
+ }
167
+ ],
168
+ "source": [
169
+ "embedding_dimension = await get_embedding_dim()"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": 7,
175
+ "id": "d6273839d9681403",
176
+ "metadata": {
177
+ "ExecuteTime": {
178
+ "end_time": "2025-01-07T05:42:33.085507Z",
179
+ "start_time": "2025-01-07T05:38:56.789348Z"
180
+ }
181
+ },
182
+ "outputs": [
183
+ {
184
+ "name": "stderr",
185
+ "output_type": "stream",
186
+ "text": [
187
+ "INFO:lightrag:Processing 1 new unique documents\n",
188
+ "Processing batch 1: 0%| | 0/1 [00:00<?, ?it/s]INFO:lightrag:Inserting 22 vectors to chunks\n",
189
+ "\n",
190
+ "Generating embeddings: 0%| | 0/1 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
191
+ "\n",
192
+ "Generating embeddings: 100%|██████████| 1/1 [00:03<00:00, 3.85s/batch]\u001b[A\n",
193
+ "\n",
194
+ "Extracting entities from chunks: 0%| | 0/22 [00:00<?, ?chunk/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
195
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
196
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
197
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
198
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
199
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
200
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
201
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
202
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
203
+ ]
204
+ },
205
+ {
206
+ "name": "stdout",
207
+ "output_type": "stream",
208
+ "text": [
209
+ "⠙ Processed 1 chunks, 7 entities(duplicated), 6 relations(duplicated)\r"
210
+ ]
211
+ },
212
+ {
213
+ "name": "stderr",
214
+ "output_type": "stream",
215
+ "text": [
216
+ "\n",
217
+ "Extracting entities from chunks: 5%|▍ | 1/22 [00:23<08:21, 23.90s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
218
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
219
+ ]
220
+ },
221
+ {
222
+ "name": "stdout",
223
+ "output_type": "stream",
224
+ "text": [
225
+ "⠹ Processed 2 chunks, 12 entities(duplicated), 15 relations(duplicated)\r"
226
+ ]
227
+ },
228
+ {
229
+ "name": "stderr",
230
+ "output_type": "stream",
231
+ "text": [
232
+ "\n",
233
+ "Extracting entities from chunks: 9%|▉ | 2/22 [00:26<03:50, 11.51s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
234
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
235
+ ]
236
+ },
237
+ {
238
+ "name": "stdout",
239
+ "output_type": "stream",
240
+ "text": [
241
+ "⠸ Processed 3 chunks, 20 entities(duplicated), 22 relations(duplicated)\r"
242
+ ]
243
+ },
244
+ {
245
+ "name": "stderr",
246
+ "output_type": "stream",
247
+ "text": [
248
+ "\n",
249
+ "Extracting entities from chunks: 14%|█▎ | 3/22 [00:34<03:08, 9.93s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
250
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
251
+ ]
252
+ },
253
+ {
254
+ "name": "stdout",
255
+ "output_type": "stream",
256
+ "text": [
257
+ "⠼ Processed 4 chunks, 30 entities(duplicated), 30 relations(duplicated)\r"
258
+ ]
259
+ },
260
+ {
261
+ "name": "stderr",
262
+ "output_type": "stream",
263
+ "text": [
264
+ "\n",
265
+ "Extracting entities from chunks: 18%|█▊ | 4/22 [00:37<02:09, 7.21s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
266
+ ]
267
+ },
268
+ {
269
+ "name": "stdout",
270
+ "output_type": "stream",
271
+ "text": [
272
+ "⠴ Processed 5 chunks, 39 entities(duplicated), 39 relations(duplicated)\r"
273
+ ]
274
+ },
275
+ {
276
+ "name": "stderr",
277
+ "output_type": "stream",
278
+ "text": [
279
+ "\n",
280
+ "Extracting entities from chunks: 23%|██▎ | 5/22 [00:38<01:19, 4.70s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
281
+ ]
282
+ },
283
+ {
284
+ "name": "stdout",
285
+ "output_type": "stream",
286
+ "text": [
287
+ "⠦ Processed 6 chunks, 39 entities(duplicated), 39 relations(duplicated)\r"
288
+ ]
289
+ },
290
+ {
291
+ "name": "stderr",
292
+ "output_type": "stream",
293
+ "text": [
294
+ "\n",
295
+ "Extracting entities from chunks: 27%|██▋ | 6/22 [00:38<00:53, 3.32s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
296
+ ]
297
+ },
298
+ {
299
+ "name": "stdout",
300
+ "output_type": "stream",
301
+ "text": [
302
+ "⠧ Processed 7 chunks, 47 entities(duplicated), 50 relations(duplicated)\r"
303
+ ]
304
+ },
305
+ {
306
+ "name": "stderr",
307
+ "output_type": "stream",
308
+ "text": [
309
+ "\n",
310
+ "Extracting entities from chunks: 32%|███▏ | 7/22 [00:39<00:39, 2.65s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
311
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
312
+ ]
313
+ },
314
+ {
315
+ "name": "stdout",
316
+ "output_type": "stream",
317
+ "text": [
318
+ "⠇ Processed 8 chunks, 56 entities(duplicated), 58 relations(duplicated)\r"
319
+ ]
320
+ },
321
+ {
322
+ "name": "stderr",
323
+ "output_type": "stream",
324
+ "text": [
325
+ "\n",
326
+ "Extracting entities from chunks: 36%|███▋ | 8/22 [00:40<00:29, 2.13s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
327
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
328
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
329
+ ]
330
+ },
331
+ {
332
+ "name": "stdout",
333
+ "output_type": "stream",
334
+ "text": [
335
+ "⠏ Processed 9 chunks, 63 entities(duplicated), 69 relations(duplicated)\r"
336
+ ]
337
+ },
338
+ {
339
+ "name": "stderr",
340
+ "output_type": "stream",
341
+ "text": [
342
+ "\n",
343
+ "Extracting entities from chunks: 41%|████ | 9/22 [00:47<00:43, 3.38s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
344
+ ]
345
+ },
346
+ {
347
+ "name": "stdout",
348
+ "output_type": "stream",
349
+ "text": [
350
+ "⠋ Processed 10 chunks, 81 entities(duplicated), 81 relations(duplicated)\r"
351
+ ]
352
+ },
353
+ {
354
+ "name": "stderr",
355
+ "output_type": "stream",
356
+ "text": [
357
+ "\n",
358
+ "Extracting entities from chunks: 45%|████▌ | 10/22 [00:48<00:32, 2.73s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
359
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
360
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
361
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
362
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
363
+ ]
364
+ },
365
+ {
366
+ "name": "stdout",
367
+ "output_type": "stream",
368
+ "text": [
369
+ "⠙ Processed 11 chunks, 92 entities(duplicated), 89 relations(duplicated)\r"
370
+ ]
371
+ },
372
+ {
373
+ "name": "stderr",
374
+ "output_type": "stream",
375
+ "text": [
376
+ "\n",
377
+ "Extracting entities from chunks: 50%|█████ | 11/22 [01:01<01:05, 5.99s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
378
+ ]
379
+ },
380
+ {
381
+ "name": "stdout",
382
+ "output_type": "stream",
383
+ "text": [
384
+ "⠹ Processed 12 chunks, 107 entities(duplicated), 107 relations(duplicated)\r"
385
+ ]
386
+ },
387
+ {
388
+ "name": "stderr",
389
+ "output_type": "stream",
390
+ "text": [
391
+ "\n",
392
+ "Extracting entities from chunks: 55%|█████▍ | 12/22 [01:10<01:09, 6.94s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
393
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
394
+ ]
395
+ },
396
+ {
397
+ "name": "stdout",
398
+ "output_type": "stream",
399
+ "text": [
400
+ "⠸ Processed 13 chunks, 127 entities(duplicated), 126 relations(duplicated)\r"
401
+ ]
402
+ },
403
+ {
404
+ "name": "stderr",
405
+ "output_type": "stream",
406
+ "text": [
407
+ "\n",
408
+ "Extracting entities from chunks: 59%|█████▉ | 13/22 [01:16<00:59, 6.59s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
409
+ ]
410
+ },
411
+ {
412
+ "name": "stdout",
413
+ "output_type": "stream",
414
+ "text": [
415
+ "⠼ Processed 14 chunks, 151 entities(duplicated), 137 relations(duplicated)\r"
416
+ ]
417
+ },
418
+ {
419
+ "name": "stderr",
420
+ "output_type": "stream",
421
+ "text": [
422
+ "\n",
423
+ "Extracting entities from chunks: 64%|██████▎ | 14/22 [01:16<00:37, 4.68s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
424
+ ]
425
+ },
426
+ {
427
+ "name": "stdout",
428
+ "output_type": "stream",
429
+ "text": [
430
+ "⠴ Processed 15 chunks, 161 entities(duplicated), 144 relations(duplicated)\r"
431
+ ]
432
+ },
433
+ {
434
+ "name": "stderr",
435
+ "output_type": "stream",
436
+ "text": [
437
+ "\n",
438
+ "Extracting entities from chunks: 68%|██████▊ | 15/22 [01:17<00:23, 3.31s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
439
+ ]
440
+ },
441
+ {
442
+ "name": "stdout",
443
+ "output_type": "stream",
444
+ "text": [
445
+ "⠦ Processed 16 chunks, 176 entities(duplicated), 154 relations(duplicated)\r"
446
+ ]
447
+ },
448
+ {
449
+ "name": "stderr",
450
+ "output_type": "stream",
451
+ "text": [
452
+ "\n",
453
+ "Extracting entities from chunks: 73%|███████▎ | 16/22 [01:19<00:18, 3.04s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
454
+ ]
455
+ },
456
+ {
457
+ "name": "stdout",
458
+ "output_type": "stream",
459
+ "text": [
460
+ "⠧ Processed 17 chunks, 189 entities(duplicated), 162 relations(duplicated)\r"
461
+ ]
462
+ },
463
+ {
464
+ "name": "stderr",
465
+ "output_type": "stream",
466
+ "text": [
467
+ "\n",
468
+ "Extracting entities from chunks: 77%|███████▋ | 17/22 [01:21<00:13, 2.80s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
469
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
470
+ ]
471
+ },
472
+ {
473
+ "name": "stdout",
474
+ "output_type": "stream",
475
+ "text": [
476
+ "⠇ Processed 18 chunks, 207 entities(duplicated), 186 relations(duplicated)\r"
477
+ ]
478
+ },
479
+ {
480
+ "name": "stderr",
481
+ "output_type": "stream",
482
+ "text": [
483
+ "\n",
484
+ "Extracting entities from chunks: 82%|████████▏ | 18/22 [01:38<00:28, 7.06s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
485
+ ]
486
+ },
487
+ {
488
+ "name": "stdout",
489
+ "output_type": "stream",
490
+ "text": [
491
+ "⠏ Processed 19 chunks, 222 entities(duplicated), 200 relations(duplicated)\r"
492
+ ]
493
+ },
494
+ {
495
+ "name": "stderr",
496
+ "output_type": "stream",
497
+ "text": [
498
+ "\n",
499
+ "Extracting entities from chunks: 86%|████████▋ | 19/22 [01:44<00:19, 6.61s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
500
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
501
+ ]
502
+ },
503
+ {
504
+ "name": "stdout",
505
+ "output_type": "stream",
506
+ "text": [
507
+ "⠋ Processed 20 chunks, 310 entities(duplicated), 219 relations(duplicated)\r"
508
+ ]
509
+ },
510
+ {
511
+ "name": "stderr",
512
+ "output_type": "stream",
513
+ "text": [
514
+ "\n",
515
+ "Extracting entities from chunks: 91%|█████████ | 20/22 [02:12<00:26, 13.19s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
516
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
517
+ ]
518
+ },
519
+ {
520
+ "name": "stdout",
521
+ "output_type": "stream",
522
+ "text": [
523
+ "⠙ Processed 21 chunks, 345 entities(duplicated), 263 relations(duplicated)\r"
524
+ ]
525
+ },
526
+ {
527
+ "name": "stderr",
528
+ "output_type": "stream",
529
+ "text": [
530
+ "\n",
531
+ "Extracting entities from chunks: 95%|█████████▌| 21/22 [02:32<00:15, 15.15s/chunk]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
532
+ ]
533
+ },
534
+ {
535
+ "name": "stdout",
536
+ "output_type": "stream",
537
+ "text": [
538
+ "⠹ Processed 22 chunks, 417 entities(duplicated), 285 relations(duplicated)\r"
539
+ ]
540
+ },
541
+ {
542
+ "name": "stderr",
543
+ "output_type": "stream",
544
+ "text": [
545
+ "\n",
546
+ "Extracting entities from chunks: 100%|██████████| 22/22 [03:21<00:00, 9.18s/chunk]\u001b[A\n",
547
+ "INFO:lightrag:Inserting entities into storage...\n",
548
+ "\n",
549
+ "Inserting entities: 100%|██████████| 327/327 [00:00<00:00, 13446.31entity/s]\n",
550
+ "INFO:lightrag:Inserting relationships into storage...\n",
551
+ "\n",
552
+ "Inserting relationships: 100%|██████████| 272/272 [00:00<00:00, 16740.29relationship/s]\n",
553
+ "INFO:lightrag:Inserting 327 vectors to entities\n",
554
+ "\n",
555
+ "Generating embeddings: 0%| | 0/11 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
556
+ "\n",
557
+ "Generating embeddings: 9%|▉ | 1/11 [00:00<00:09, 1.02batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
558
+ "\n",
559
+ "Generating embeddings: 18%|█▊ | 2/11 [00:02<00:09, 1.07s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
560
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
561
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
562
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
563
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
564
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
565
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
566
+ "\n",
567
+ "Generating embeddings: 27%|██▋ | 3/11 [00:02<00:06, 1.33batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
568
+ "\n",
569
+ "Generating embeddings: 36%|███▋ | 4/11 [00:02<00:04, 1.67batch/s]\u001b[A\n",
570
+ "Generating embeddings: 45%|████▌ | 5/11 [00:03<00:03, 1.93batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
571
+ "\n",
572
+ "Generating embeddings: 55%|█████▍ | 6/11 [00:03<00:02, 2.15batch/s]\u001b[A\n",
573
+ "Generating embeddings: 64%|██████▎ | 7/11 [00:03<00:01, 2.33batch/s]\u001b[A\n",
574
+ "Generating embeddings: 73%|███████▎ | 8/11 [00:04<00:01, 2.46batch/s]\u001b[A\n",
575
+ "Generating embeddings: 82%|████████▏ | 9/11 [00:04<00:00, 2.55batch/s]\u001b[A\n",
576
+ "Generating embeddings: 91%|█████████ | 10/11 [00:05<00:00, 2.64batch/s]\u001b[A\n",
577
+ "Generating embeddings: 100%|██████████| 11/11 [00:05<00:00, 2.04batch/s]\u001b[A\n",
578
+ "INFO:lightrag:Inserting 272 vectors to relationships\n",
579
+ "\n",
580
+ "Generating embeddings: 0%| | 0/9 [00:00<?, ?batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
581
+ "\n",
582
+ "Generating embeddings: 11%|█ | 1/9 [00:01<00:11, 1.39s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
583
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
584
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
585
+ "\n",
586
+ "Generating embeddings: 22%|██▏ | 2/9 [00:02<00:07, 1.01s/batch]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
587
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
588
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
589
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
590
+ "\n",
591
+ "Generating embeddings: 33%|███▎ | 3/9 [00:02<00:04, 1.40batch/s]\u001b[AINFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
592
+ "\n",
593
+ "Generating embeddings: 44%|████▍ | 4/9 [00:02<00:02, 1.74batch/s]\u001b[A\n",
594
+ "Generating embeddings: 56%|█████▌ | 5/9 [00:03<00:01, 2.01batch/s]\u001b[A\n",
595
+ "Generating embeddings: 67%|██████▋ | 6/9 [00:03<00:01, 2.23batch/s]\u001b[A\n",
596
+ "Generating embeddings: 78%|███████▊ | 7/9 [00:03<00:00, 2.39batch/s]\u001b[A\n",
597
+ "Generating embeddings: 89%|████████▉ | 8/9 [00:04<00:00, 2.52batch/s]\u001b[A\n",
598
+ "Generating embeddings: 100%|██████████| 9/9 [00:04<00:00, 1.93batch/s]\u001b[A\n",
599
+ "INFO:lightrag:Writing graph with 331 nodes, 272 edges\n",
600
+ "Processing batch 1: 100%|██████████| 1/1 [03:36<00:00, 216.27s/it]\n"
601
+ ]
602
+ }
603
+ ],
604
+ "source": [
605
+ "# rag.insert(content)\n",
606
+ "rag.insert(content, split_by_character=\"\\n#\")"
607
+ ]
608
+ },
609
+ {
610
+ "cell_type": "code",
611
+ "execution_count": 8,
612
+ "id": "c4f9ae517151a01d",
613
+ "metadata": {
614
+ "ExecuteTime": {
615
+ "end_time": "2025-01-07T05:42:50.044809Z",
616
+ "start_time": "2025-01-07T05:42:50.041256Z"
617
+ }
618
+ },
619
+ "outputs": [],
620
+ "source": [
621
+ "prompt1 = \"\"\"\n",
622
+ "你是一名经验丰富的论文分析科学家,你的任务是对一篇英文学术研究论文进行关键信息提取并深入分析。\n",
623
+ "\n",
624
+ "请按照以下步骤进行分析:\n",
625
+ "1. 对于论文的分析对象相关问题:\n",
626
+ " - 仔细查找论文中的研究队列相关信息,确定分析对象来自哪些研究队列。\n",
627
+ " - 查看如果来自多个队列,文中是单独分析还是联合分析。\n",
628
+ " - 找出这些队列的名称。\n",
629
+ " - 确定这些队列开展的国家有哪些(注意:“澳门”记为“中国澳门”,“香港”记为“中国香港”,“台湾”记为“中国台湾”,其余采用国家回答)。\n",
630
+ " - 明确队列研究对象的性别分布(“男性”、“女性”或“全体”)。\n",
631
+ " - 查找队列收集结束时,研究对象年龄分布(平均值/中位值、标准差或范围),若信息缺失则根据年龄推理规则进行推理:当论文只提供了队列开展时对象的年龄,应根据队列结束时间推算最终年龄范围。例如:1989建立队列时年龄为25 - 42岁,随访至2011年结束,则推算年龄范围为47 - 64岁。\n",
632
+ " - 确定队列研究时间线,即哪一年开始收集信息/建立队列,哪一年结束,若信息缺失则根据队列时间线推理规则进行推理:如论文只提供了建立队列时间为1995,进行了10年的随访,则推算队列结束时间为2005年。\n",
633
+ " - 找出队列结束时实际参与研究人数是多少。\n",
634
+ "首先在<分析>标签中,针对每个问题详细分析你的思考过程。然后在<回答>标签中给出所有问题的最终答案。\"\"\""
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": 9,
640
+ "id": "7a6491385b050095",
641
+ "metadata": {
642
+ "ExecuteTime": {
643
+ "end_time": "2025-01-07T05:43:24.751628Z",
644
+ "start_time": "2025-01-07T05:42:50.865679Z"
645
+ }
646
+ },
647
+ "outputs": [
648
+ {
649
+ "name": "stderr",
650
+ "output_type": "stream",
651
+ "text": [
652
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n",
653
+ "INFO:lightrag:kw_prompt result:\n"
654
+ ]
655
+ },
656
+ {
657
+ "name": "stdout",
658
+ "output_type": "stream",
659
+ "text": [
660
+ "{\n",
661
+ " \"high_level_keywords\": [\"英文学术研究论文分析\", \"关键信息提取\", \"深入分析\"],\n",
662
+ " \"low_level_keywords\": [\"研究队列\", \"队列名称\", \"队列开展国家\", \"性别分布\", \"年龄分布\", \"队列研究时间线\", \"实际参与研究人数\"]\n",
663
+ "}\n"
664
+ ]
665
+ },
666
+ {
667
+ "name": "stderr",
668
+ "output_type": "stream",
669
+ "text": [
670
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
671
+ "INFO:lightrag:Local query uses 60 entites, 38 relations, 6 text units\n",
672
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/embeddings \"HTTP/1.1 200 OK\"\n",
673
+ "INFO:lightrag:Global query uses 72 entites, 60 relations, 4 text units\n",
674
+ "INFO:httpx:HTTP Request: POST https://ark.cn-beijing.volces.com/api/v3/chat/completions \"HTTP/1.1 200 OK\"\n"
675
+ ]
676
+ },
677
+ {
678
+ "name": "stdout",
679
+ "output_type": "stream",
680
+ "text": [
681
+ "<分析>\n",
682
+ "- **分析对象来自哪些研究队列及是单独分析还是联合分析**:\n",
683
+ " 通过查找论文内容,发现文中提到“This is a combined analysis of data from 2 randomized, double-blind, placebo-controlled clinical trials (Norwegian Vitamin [NORVIT] trial15 and Western Norway B Vitamin Intervention Trial [WENBIT]16)”,明确是对两个队列的数据进行联合分析,队列名称分别为“Norwegian Vitamin (NORVIT) trial”和“Western Norway B Vitamin Intervention Trial (WENBIT)”。\n",
684
+ "- **队列开展的国家**:\n",
685
+ " 文中多次提及研究在挪威进行,如“combined analyses and extended follow-up of 2 vitamin B intervention trials among patients with ischemic heart disease in Norway”,所以确定研究开展的国家是挪威。\n",
686
+ "- **队列研究对象的性别分布**:\n",
687
+ " 从“Mean (SD) age was 62.3 (11.0) years and 23.5% of participants were women”可知,研究对象包含男性和女性,即全体。\n",
688
+ "- **队列收集结束时研究对象年龄分布**:\n",
689
+ " 已知“Mean (SD) age was 62.3 (11.0) years”是基线时年龄信息,“Median (interquartile range) duration of extended follow-up through December 31, 2007, was 78 (61 - 90) months”,由于随访的中位时间是78个月(约6.5年),所以可推算队列收集结束时研究对象年龄均值约为62.3 + 6.5 = 68.8岁(标准差仍为11.0年)。\n",
690
+ "- **队列研究时间线**:\n",
691
+ " 根据“2 randomized, double-blind, placebo-controlled clinical trials (Norwegian Vitamin [NORVIT] trial15 and Western Norway B Vitamin Intervention Trial [WENBIT]16) conducted between 1998 and 2005, and an observational posttrial follow-up through December 31, 2007”可知,队列开始收集信息时间为1998年,结束时间为2007年12月31日。\n",
692
+ "- **队列结束时实际参与研究人数**:\n",
693
+ " 由“A total of 6837 individuals were included in the combined analyses, of whom 6261 (91.6%) participated in posttrial follow-up”可知,队列结束时实际参与研究人数为6261人。\n",
694
+ "</分析>\n",
695
+ "\n",
696
+ "<回答>\n",
697
+ "- 分析对象来自“Norwegian Vitamin (NORVIT) trial”和“Western Norway B Vitamin Intervention Trial (WENBIT)”两个研究队列,文中是对这两个队列的数据进行联合分析。\n",
698
+ "- 队列开展的国家是挪威。\n",
699
+ "- 队列研究对象的性别分布为全体。\n",
700
+ "- 队列收集结束时,研究对象年龄分布均值约为68.8岁,标准差为11.0年。\n",
701
+ "- 队列研究时间线为1998年开始收集信息/建立队列,2007年12月31日结束。\n",
702
+ "- 队列结束时实际参与研究人数是6261人。\n"
703
+ ]
704
+ }
705
+ ],
706
+ "source": [
707
+ "print(rag.query(prompt1, param=QueryParam(mode=\"hybrid\")))"
708
+ ]
709
+ },
710
+ {
711
+ "cell_type": "code",
712
+ "execution_count": null,
713
+ "id": "fef9d06983da47af",
714
+ "metadata": {},
715
+ "outputs": [],
716
+ "source": []
717
+ }
718
+ ],
719
+ "metadata": {
720
+ "kernelspec": {
721
+ "display_name": "Python 3",
722
+ "language": "python",
723
+ "name": "python3"
724
+ },
725
+ "language_info": {
726
+ "codemirror_mode": {
727
+ "name": "ipython",
728
+ "version": 2
729
+ },
730
+ "file_extension": ".py",
731
+ "mimetype": "text/x-python",
732
+ "name": "python",
733
+ "nbconvert_exporter": "python",
734
+ "pygments_lexer": "ipython2",
735
+ "version": "2.7.6"
736
+ }
737
+ },
738
+ "nbformat": 4,
739
+ "nbformat_minor": 5
740
+ }