Daniel.y committed on
Commit
3364c72
·
unverified ·
2 Parent(s): ea69de9 3511697

Merge pull request #1545 from danielaskdd/change-naive-context-to-json

Browse files
lightrag/api/__init__.py CHANGED
@@ -1 +1 @@
1
- __api_version__ = "0168"
 
1
+ __api_version__ = "0169"
lightrag/api/routers/query_routes.py CHANGED
@@ -67,16 +67,6 @@ class QueryRequest(BaseModel):
67
  description="Maximum number of tokens allocated for entity descriptions in local retrieval.",
68
  )
69
 
70
- hl_keywords: Optional[List[str]] = Field(
71
- default=None,
72
- description="List of high-level keywords to prioritize in retrieval.",
73
- )
74
-
75
- ll_keywords: Optional[List[str]] = Field(
76
- default=None,
77
- description="List of low-level keywords to refine retrieval focus.",
78
- )
79
-
80
  conversation_history: Optional[List[Dict[str, Any]]] = Field(
81
  default=None,
82
  description="Stores past conversation history to maintain context. Format: [{'role': 'user/assistant', 'content': 'message'}].",
@@ -93,20 +83,6 @@ class QueryRequest(BaseModel):
93
  def query_strip_after(cls, query: str) -> str:
94
  return query.strip()
95
 
96
- @field_validator("hl_keywords", mode="after")
97
- @classmethod
98
- def hl_keywords_strip_after(cls, hl_keywords: List[str] | None) -> List[str] | None:
99
- if hl_keywords is None:
100
- return None
101
- return [keyword.strip() for keyword in hl_keywords]
102
-
103
- @field_validator("ll_keywords", mode="after")
104
- @classmethod
105
- def ll_keywords_strip_after(cls, ll_keywords: List[str] | None) -> List[str] | None:
106
- if ll_keywords is None:
107
- return None
108
- return [keyword.strip() for keyword in ll_keywords]
109
-
110
  @field_validator("conversation_history", mode="after")
111
  @classmethod
112
  def conversation_history_role_check(
 
67
  description="Maximum number of tokens allocated for entity descriptions in local retrieval.",
68
  )
69
 
 
 
 
 
 
 
 
 
 
 
70
  conversation_history: Optional[List[Dict[str, Any]]] = Field(
71
  default=None,
72
  description="Stores past conversation history to maintain context. Format: [{'role': 'user/assistant', 'content': 'message'}].",
 
83
  def query_strip_after(cls, query: str) -> str:
84
  return query.strip()
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  @field_validator("conversation_history", mode="after")
87
  @classmethod
88
  def conversation_history_role_check(
lightrag/lightrag.py CHANGED
@@ -53,7 +53,6 @@ from .operate import (
53
  extract_entities,
54
  merge_nodes_and_edges,
55
  kg_query,
56
- mix_kg_vector_query,
57
  naive_query,
58
  query_with_keywords,
59
  )
@@ -1437,8 +1436,10 @@ class LightRAG:
1437
  """
1438
  # If a custom model is provided in param, temporarily update global config
1439
  global_config = asdict(self)
 
 
1440
 
1441
- if param.mode in ["local", "global", "hybrid"]:
1442
  response = await kg_query(
1443
  query.strip(),
1444
  self.chunk_entity_relation_graph,
@@ -1447,30 +1448,17 @@ class LightRAG:
1447
  self.text_chunks,
1448
  param,
1449
  global_config,
1450
- hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
1451
  system_prompt=system_prompt,
 
1452
  )
1453
  elif param.mode == "naive":
1454
  response = await naive_query(
1455
  query.strip(),
1456
  self.chunks_vdb,
1457
- self.text_chunks,
1458
- param,
1459
- global_config,
1460
- hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
1461
- system_prompt=system_prompt,
1462
- )
1463
- elif param.mode == "mix":
1464
- response = await mix_kg_vector_query(
1465
- query.strip(),
1466
- self.chunk_entity_relation_graph,
1467
- self.entities_vdb,
1468
- self.relationships_vdb,
1469
- self.chunks_vdb,
1470
- self.text_chunks,
1471
  param,
1472
  global_config,
1473
- hashing_kv=self.llm_response_cache, # Directly use llm_response_cache
1474
  system_prompt=system_prompt,
1475
  )
1476
  elif param.mode == "bypass":
 
53
  extract_entities,
54
  merge_nodes_and_edges,
55
  kg_query,
 
56
  naive_query,
57
  query_with_keywords,
58
  )
 
1436
  """
1437
  # If a custom model is provided in param, temporarily update global config
1438
  global_config = asdict(self)
1439
+ # Save original query for vector search
1440
+ param.original_query = query
1441
 
1442
+ if param.mode in ["local", "global", "hybrid", "mix"]:
1443
  response = await kg_query(
1444
  query.strip(),
1445
  self.chunk_entity_relation_graph,
 
1448
  self.text_chunks,
1449
  param,
1450
  global_config,
1451
+ hashing_kv=self.llm_response_cache,
1452
  system_prompt=system_prompt,
1453
+ chunks_vdb=self.chunks_vdb,
1454
  )
1455
  elif param.mode == "naive":
1456
  response = await naive_query(
1457
  query.strip(),
1458
  self.chunks_vdb,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1459
  param,
1460
  global_config,
1461
+ hashing_kv=self.llm_response_cache,
1462
  system_prompt=system_prompt,
1463
  )
1464
  elif param.mode == "bypass":
lightrag/operate.py CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
2
  from functools import partial
3
 
4
  import asyncio
5
- import traceback
6
  import json
7
  import re
8
  import os
@@ -26,7 +25,6 @@ from .utils import (
26
  CacheData,
27
  get_conversation_turns,
28
  use_llm_func_with_cache,
29
- list_of_list_to_json,
30
  )
31
  from .base import (
32
  BaseGraphStorage,
@@ -859,6 +857,7 @@ async def kg_query(
859
  global_config: dict[str, str],
860
  hashing_kv: BaseKVStorage | None = None,
861
  system_prompt: str | None = None,
 
862
  ) -> str | AsyncIterator[str]:
863
  if query_param.model_func:
864
  use_model_func = query_param.model_func
@@ -911,6 +910,7 @@ async def kg_query(
911
  relationships_vdb,
912
  text_chunks_db,
913
  query_param,
 
914
  )
915
 
916
  if query_param.only_need_context:
@@ -1110,169 +1110,85 @@ async def extract_keywords_only(
1110
  return hl_keywords, ll_keywords
1111
 
1112
 
1113
- async def mix_kg_vector_query(
1114
  query: str,
1115
- knowledge_graph_inst: BaseGraphStorage,
1116
- entities_vdb: BaseVectorStorage,
1117
- relationships_vdb: BaseVectorStorage,
1118
  chunks_vdb: BaseVectorStorage,
1119
- text_chunks_db: BaseKVStorage,
1120
  query_param: QueryParam,
1121
- global_config: dict[str, str],
1122
- hashing_kv: BaseKVStorage | None = None,
1123
- system_prompt: str | None = None,
1124
- ) -> str | AsyncIterator[str]:
1125
- """
1126
- Hybrid retrieval implementation combining knowledge graph and vector search.
1127
-
1128
- This function performs a hybrid search by:
1129
- 1. Extracting semantic information from knowledge graph
1130
- 2. Retrieving relevant text chunks through vector similarity
1131
- 3. Combining both results for comprehensive answer generation
1132
  """
1133
- # get tokenizer
1134
- tokenizer: Tokenizer = global_config["tokenizer"]
1135
 
1136
- if query_param.model_func:
1137
- use_model_func = query_param.model_func
1138
- else:
1139
- use_model_func = global_config["llm_model_func"]
1140
- # Apply higher priority (5) to query relation LLM function
1141
- use_model_func = partial(use_model_func, _priority=5)
1142
 
1143
- # 1. Cache handling
1144
- args_hash = compute_args_hash("mix", query, cache_type="query")
1145
- cached_response, quantized, min_val, max_val = await handle_cache(
1146
- hashing_kv, args_hash, query, "mix", cache_type="query"
1147
- )
1148
- if cached_response is not None:
1149
- return cached_response
1150
 
1151
- # Process conversation history
1152
- history_context = ""
1153
- if query_param.conversation_history:
1154
- history_context = get_conversation_turns(
1155
- query_param.conversation_history, query_param.history_turns
 
 
1156
  )
 
 
1157
 
1158
- # 2. Execute knowledge graph and vector searches in parallel
1159
- async def get_kg_context():
1160
- try:
1161
- hl_keywords, ll_keywords = await get_keywords_from_query(
1162
- query, query_param, global_config, hashing_kv
1163
- )
1164
-
1165
- if not hl_keywords and not ll_keywords:
1166
- logger.warning("Both high-level and low-level keywords are empty")
1167
- return None
1168
-
1169
- # Convert keyword lists to strings
1170
- ll_keywords_str = ", ".join(ll_keywords) if ll_keywords else ""
1171
- hl_keywords_str = ", ".join(hl_keywords) if hl_keywords else ""
1172
-
1173
- # Set query mode based on available keywords
1174
- if not ll_keywords_str and not hl_keywords_str:
1175
- return None
1176
- elif not ll_keywords_str:
1177
- query_param.mode = "global"
1178
- elif not hl_keywords_str:
1179
- query_param.mode = "local"
1180
- else:
1181
- query_param.mode = "hybrid"
1182
-
1183
- # Build knowledge graph context
1184
- context = await _build_query_context(
1185
- ll_keywords_str,
1186
- hl_keywords_str,
1187
- knowledge_graph_inst,
1188
- entities_vdb,
1189
- relationships_vdb,
1190
- text_chunks_db,
1191
- query_param,
1192
- )
1193
-
1194
- return context
1195
-
1196
- except Exception as e:
1197
- logger.error(f"Error in get_kg_context: {str(e)}")
1198
- traceback.print_exc()
1199
- return None
1200
-
1201
- # 3. Execute both retrievals in parallel
1202
- kg_context, vector_context = await asyncio.gather(
1203
- get_kg_context(), _get_vector_context(query, chunks_vdb, query_param, tokenizer)
1204
- )
1205
-
1206
- # 4. Merge contexts
1207
- if kg_context is None and vector_context is None:
1208
- return PROMPTS["fail_response"]
1209
 
1210
- if query_param.only_need_context:
1211
- context_str = f"""
1212
- \r\n\r\n=====Knowledge Graph Context=====\r\n\r\n
1213
- {kg_context if kg_context else "No relevant knowledge graph information found"}
1214
-
1215
- \r\n\r\n=====Vector Context=====\r\n\r\n
1216
- {vector_context if vector_context else "No relevant text information found"}
1217
- """.strip()
1218
- return context_str
1219
-
1220
- # 5. Construct hybrid prompt
1221
- sys_prompt = (
1222
- system_prompt if system_prompt else PROMPTS["mix_rag_response"]
1223
- ).format(
1224
- kg_context=kg_context
1225
- if kg_context
1226
- else "No relevant knowledge graph information found",
1227
- vector_context=vector_context
1228
- if vector_context
1229
- else "No relevant text information found",
1230
- response_type=query_param.response_type,
1231
- history=history_context,
1232
- )
1233
 
1234
- if query_param.only_need_prompt:
1235
- return sys_prompt
 
 
 
 
1236
 
1237
- len_of_prompts = len(tokenizer.encode(query + sys_prompt))
1238
- logger.debug(f"[mix_kg_vector_query]Prompt Tokens: {len_of_prompts}")
 
 
 
 
1239
 
1240
- # 6. Generate response
1241
- response = await use_model_func(
1242
- query,
1243
- system_prompt=sys_prompt,
1244
- stream=query_param.stream,
1245
- )
1246
 
1247
- # Clean up response content
1248
- if isinstance(response, str) and len(response) > len(sys_prompt):
1249
- response = (
1250
- response.replace(sys_prompt, "")
1251
- .replace("user", "")
1252
- .replace("model", "")
1253
- .replace(query, "")
1254
- .replace("<system>", "")
1255
- .replace("</system>", "")
1256
- .strip()
1257
- )
1258
 
1259
- if hashing_kv.global_config.get("enable_llm_cache"):
1260
- # 7. Save cache - Only cache after collecting complete response
1261
- await save_to_cache(
1262
- hashing_kv,
1263
- CacheData(
1264
- args_hash=args_hash,
1265
- content=response,
1266
- prompt=query,
1267
- quantized=quantized,
1268
- min_val=min_val,
1269
- max_val=max_val,
1270
- mode="mix",
1271
- cache_type="query",
1272
- ),
1273
  )
1274
 
1275
- return response
 
 
 
1276
 
1277
 
1278
  async def _build_query_context(
@@ -1283,8 +1199,11 @@ async def _build_query_context(
1283
  relationships_vdb: BaseVectorStorage,
1284
  text_chunks_db: BaseKVStorage,
1285
  query_param: QueryParam,
 
1286
  ):
1287
- logger.info(f"Process {os.getpid()} buidling query context...")
 
 
1288
  if query_param.mode == "local":
1289
  entities_context, relations_context, text_units_context = await _get_node_data(
1290
  ll_keywords,
@@ -1301,7 +1220,7 @@ async def _build_query_context(
1301
  text_chunks_db,
1302
  query_param,
1303
  )
1304
- else: # hybrid mode
1305
  ll_data = await _get_node_data(
1306
  ll_keywords,
1307
  knowledge_graph_inst,
@@ -1329,10 +1248,43 @@ async def _build_query_context(
1329
  hl_text_units_context,
1330
  ) = hl_data
1331
 
1332
- entities_context, relations_context, text_units_context = combine_contexts(
1333
- [hl_entities_context, ll_entities_context],
1334
- [hl_relations_context, ll_relations_context],
1335
- [hl_text_units_context, ll_text_units_context],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1336
  )
1337
  # not necessary to use LLM to generate a response
1338
  if not entities_context and not relations_context:
@@ -1440,17 +1392,7 @@ async def _get_node_data(
1440
  )
1441
 
1442
  # build prompt
1443
- entites_section_list = [
1444
- [
1445
- "id",
1446
- "entity",
1447
- "type",
1448
- "description",
1449
- "rank",
1450
- "created_at",
1451
- "file_path",
1452
- ]
1453
- ]
1454
  for i, n in enumerate(node_datas):
1455
  created_at = n.get("created_at", "UNKNOWN")
1456
  if isinstance(created_at, (int, float)):
@@ -1459,32 +1401,19 @@ async def _get_node_data(
1459
  # Get file path from node data
1460
  file_path = n.get("file_path", "unknown_source")
1461
 
1462
- entites_section_list.append(
1463
- [
1464
- i,
1465
- n["entity_name"],
1466
- n.get("entity_type", "UNKNOWN"),
1467
- n.get("description", "UNKNOWN"),
1468
- n["rank"],
1469
- created_at,
1470
- file_path,
1471
- ]
1472
  )
1473
- entities_context = list_of_list_to_json(entites_section_list)
1474
-
1475
- relations_section_list = [
1476
- [
1477
- "id",
1478
- "source",
1479
- "target",
1480
- "description",
1481
- "keywords",
1482
- "weight",
1483
- "rank",
1484
- "created_at",
1485
- "file_path",
1486
- ]
1487
- ]
1488
  for i, e in enumerate(use_relations):
1489
  created_at = e.get("created_at", "UNKNOWN")
1490
  # Convert timestamp to readable format
@@ -1494,27 +1423,29 @@ async def _get_node_data(
1494
  # Get file path from edge data
1495
  file_path = e.get("file_path", "unknown_source")
1496
 
1497
- relations_section_list.append(
1498
- [
1499
- i,
1500
- e["src_tgt"][0],
1501
- e["src_tgt"][1],
1502
- e["description"],
1503
- e["keywords"],
1504
- e["weight"],
1505
- e["rank"],
1506
- created_at,
1507
- file_path,
1508
- ]
1509
  )
1510
- relations_context = list_of_list_to_json(relations_section_list)
1511
 
1512
- text_units_section_list = [["id", "content", "file_path"]]
1513
  for i, t in enumerate(use_text_units):
1514
- text_units_section_list.append(
1515
- [i, t["content"], t.get("file_path", "unknown_source")]
 
 
 
 
1516
  )
1517
- text_units_context = list_of_list_to_json(text_units_section_list)
1518
  return entities_context, relations_context, text_units_context
1519
 
1520
 
@@ -1757,19 +1688,7 @@ async def _get_edge_data(
1757
  f"Global query uses {len(use_entities)} entites, {len(edge_datas)} relations, {len(use_text_units)} chunks"
1758
  )
1759
 
1760
- relations_section_list = [
1761
- [
1762
- "id",
1763
- "source",
1764
- "target",
1765
- "description",
1766
- "keywords",
1767
- "weight",
1768
- "rank",
1769
- "created_at",
1770
- "file_path",
1771
- ]
1772
- ]
1773
  for i, e in enumerate(edge_datas):
1774
  created_at = e.get("created_at", "UNKNOWN")
1775
  # Convert timestamp to readable format
@@ -1779,24 +1698,21 @@ async def _get_edge_data(
1779
  # Get file path from edge data
1780
  file_path = e.get("file_path", "unknown_source")
1781
 
1782
- relations_section_list.append(
1783
- [
1784
- i,
1785
- e["src_id"],
1786
- e["tgt_id"],
1787
- e["description"],
1788
- e["keywords"],
1789
- e["weight"],
1790
- e["rank"],
1791
- created_at,
1792
- file_path,
1793
- ]
1794
  )
1795
- relations_context = list_of_list_to_json(relations_section_list)
1796
 
1797
- entites_section_list = [
1798
- ["id", "entity", "type", "description", "rank", "created_at", "file_path"]
1799
- ]
1800
  for i, n in enumerate(use_entities):
1801
  created_at = n.get("created_at", "UNKNOWN")
1802
  # Convert timestamp to readable format
@@ -1806,23 +1722,27 @@ async def _get_edge_data(
1806
  # Get file path from node data
1807
  file_path = n.get("file_path", "unknown_source")
1808
 
1809
- entites_section_list.append(
1810
- [
1811
- i,
1812
- n["entity_name"],
1813
- n.get("entity_type", "UNKNOWN"),
1814
- n.get("description", "UNKNOWN"),
1815
- n["rank"],
1816
- created_at,
1817
- file_path,
1818
- ]
1819
  )
1820
- entities_context = list_of_list_to_json(entites_section_list)
1821
 
1822
- text_units_section_list = [["id", "content", "file_path"]]
1823
  for i, t in enumerate(use_text_units):
1824
- text_units_section_list.append([i, t["content"], t.get("file_path", "unknown")])
1825
- text_units_context = list_of_list_to_json(text_units_section_list)
 
 
 
 
 
1826
  return entities_context, relations_context, text_units_context
1827
 
1828
 
@@ -1938,29 +1858,9 @@ async def _find_related_text_unit_from_relationships(
1938
  return all_text_units
1939
 
1940
 
1941
- def combine_contexts(entities, relationships, sources):
1942
- # Function to extract entities, relationships, and sources from context strings
1943
- hl_entities, ll_entities = entities[0], entities[1]
1944
- hl_relationships, ll_relationships = relationships[0], relationships[1]
1945
- hl_sources, ll_sources = sources[0], sources[1]
1946
- # Combine and deduplicate the entities
1947
- combined_entities = process_combine_contexts(hl_entities, ll_entities)
1948
-
1949
- # Combine and deduplicate the relationships
1950
- combined_relationships = process_combine_contexts(
1951
- hl_relationships, ll_relationships
1952
- )
1953
-
1954
- # Combine and deduplicate the sources
1955
- combined_sources = process_combine_contexts(hl_sources, ll_sources)
1956
-
1957
- return combined_entities, combined_relationships, combined_sources
1958
-
1959
-
1960
  async def naive_query(
1961
  query: str,
1962
  chunks_vdb: BaseVectorStorage,
1963
- text_chunks_db: BaseKVStorage,
1964
  query_param: QueryParam,
1965
  global_config: dict[str, str],
1966
  hashing_kv: BaseKVStorage | None = None,
@@ -1982,14 +1882,24 @@ async def naive_query(
1982
  return cached_response
1983
 
1984
  tokenizer: Tokenizer = global_config["tokenizer"]
1985
- section = await _get_vector_context(query, chunks_vdb, query_param, tokenizer)
1986
 
1987
- if section is None:
 
 
 
 
1988
  return PROMPTS["fail_response"]
1989
 
 
1990
  if query_param.only_need_context:
1991
- return section
 
 
 
 
 
1992
 
 
1993
  # Process conversation history
1994
  history_context = ""
1995
  if query_param.conversation_history:
@@ -1999,7 +1909,7 @@ async def naive_query(
1999
 
2000
  sys_prompt_temp = system_prompt if system_prompt else PROMPTS["naive_rag_response"]
2001
  sys_prompt = sys_prompt_temp.format(
2002
- content_data=section,
2003
  response_type=query_param.response_type,
2004
  history=history_context,
2005
  )
@@ -2056,6 +1966,9 @@ async def kg_query_with_keywords(
2056
  query_param: QueryParam,
2057
  global_config: dict[str, str],
2058
  hashing_kv: BaseKVStorage | None = None,
 
 
 
2059
  ) -> str | AsyncIterator[str]:
2060
  """
2061
  Refactored kg_query that does NOT extract keywords by itself.
@@ -2069,9 +1982,6 @@ async def kg_query_with_keywords(
2069
  # Apply higher priority (5) to query relation LLM function
2070
  use_model_func = partial(use_model_func, _priority=5)
2071
 
2072
- # ---------------------------
2073
- # 1) Handle potential cache for query results
2074
- # ---------------------------
2075
  args_hash = compute_args_hash(query_param.mode, query, cache_type="query")
2076
  cached_response, quantized, min_val, max_val = await handle_cache(
2077
  hashing_kv, args_hash, query, query_param.mode, cache_type="query"
@@ -2079,14 +1989,6 @@ async def kg_query_with_keywords(
2079
  if cached_response is not None:
2080
  return cached_response
2081
 
2082
- # ---------------------------
2083
- # 2) RETRIEVE KEYWORDS FROM query_param
2084
- # ---------------------------
2085
-
2086
- # If these fields don't exist, default to empty lists/strings.
2087
- hl_keywords = getattr(query_param, "hl_keywords", []) or []
2088
- ll_keywords = getattr(query_param, "ll_keywords", []) or []
2089
-
2090
  # If neither has any keywords, you could handle that logic here.
2091
  if not hl_keywords and not ll_keywords:
2092
  logger.warning(
@@ -2100,25 +2002,9 @@ async def kg_query_with_keywords(
2100
  logger.warning("high_level_keywords is empty, switching to local mode.")
2101
  query_param.mode = "local"
2102
 
2103
- # Flatten low-level and high-level keywords if needed
2104
- ll_keywords_flat = (
2105
- [item for sublist in ll_keywords for item in sublist]
2106
- if any(isinstance(i, list) for i in ll_keywords)
2107
- else ll_keywords
2108
- )
2109
- hl_keywords_flat = (
2110
- [item for sublist in hl_keywords for item in sublist]
2111
- if any(isinstance(i, list) for i in hl_keywords)
2112
- else hl_keywords
2113
- )
2114
-
2115
- # Join the flattened lists
2116
- ll_keywords_str = ", ".join(ll_keywords_flat) if ll_keywords_flat else ""
2117
- hl_keywords_str = ", ".join(hl_keywords_flat) if hl_keywords_flat else ""
2118
 
2119
- # ---------------------------
2120
- # 3) BUILD CONTEXT
2121
- # ---------------------------
2122
  context = await _build_query_context(
2123
  ll_keywords_str,
2124
  hl_keywords_str,
@@ -2127,18 +2013,14 @@ async def kg_query_with_keywords(
2127
  relationships_vdb,
2128
  text_chunks_db,
2129
  query_param,
 
2130
  )
2131
  if not context:
2132
  return PROMPTS["fail_response"]
2133
 
2134
- # If only context is needed, return it
2135
  if query_param.only_need_context:
2136
  return context
2137
 
2138
- # ---------------------------
2139
- # 4) BUILD THE SYSTEM PROMPT + CALL LLM
2140
- # ---------------------------
2141
-
2142
  # Process conversation history
2143
  history_context = ""
2144
  if query_param.conversation_history:
@@ -2180,7 +2062,6 @@ async def kg_query_with_keywords(
2180
  )
2181
 
2182
  if hashing_kv.global_config.get("enable_llm_cache"):
2183
- # 7. Save cache - 只有在收集完整响应后才缓存
2184
  await save_to_cache(
2185
  hashing_kv,
2186
  CacheData(
@@ -2198,85 +2079,6 @@ async def kg_query_with_keywords(
2198
  return response
2199
 
2200
 
2201
- async def _get_vector_context(
2202
- query: str,
2203
- chunks_vdb: BaseVectorStorage,
2204
- query_param: QueryParam,
2205
- tokenizer: Tokenizer,
2206
- ) -> str | None:
2207
- """
2208
- Retrieve vector context from the vector database.
2209
-
2210
- This function performs vector search to find relevant text chunks for a query,
2211
- formats them with file path and creation time information, and truncates
2212
- the results to fit within token limits.
2213
-
2214
- Args:
2215
- query: The query string to search for
2216
- chunks_vdb: Vector database containing document chunks
2217
- query_param: Query parameters including top_k and ids
2218
- tokenizer: Tokenizer for counting tokens
2219
-
2220
- Returns:
2221
- Formatted string containing relevant text chunks, or None if no results found
2222
- """
2223
- try:
2224
- # Reduce top_k for vector search in hybrid mode since we have structured information from KG
2225
- mix_topk = (
2226
- min(10, query_param.top_k)
2227
- if hasattr(query_param, "mode") and query_param.mode == "mix"
2228
- else query_param.top_k
2229
- )
2230
- results = await chunks_vdb.query(query, top_k=mix_topk, ids=query_param.ids)
2231
- if not results:
2232
- return None
2233
-
2234
- valid_chunks = []
2235
- for result in results:
2236
- if "content" in result:
2237
- # Directly use content from chunks_vdb.query result
2238
- chunk_with_time = {
2239
- "content": result["content"],
2240
- "created_at": result.get("created_at", None),
2241
- "file_path": result.get("file_path", None),
2242
- }
2243
- valid_chunks.append(chunk_with_time)
2244
-
2245
- if not valid_chunks:
2246
- return None
2247
-
2248
- maybe_trun_chunks = truncate_list_by_token_size(
2249
- valid_chunks,
2250
- key=lambda x: x["content"],
2251
- max_token_size=query_param.max_token_for_text_unit,
2252
- tokenizer=tokenizer,
2253
- )
2254
-
2255
- logger.debug(
2256
- f"Truncate chunks from {len(valid_chunks)} to {len(maybe_trun_chunks)} (max tokens:{query_param.max_token_for_text_unit})"
2257
- )
2258
- logger.info(f"Vector query: {len(maybe_trun_chunks)} chunks, top_k: {mix_topk}")
2259
-
2260
- if not maybe_trun_chunks:
2261
- return None
2262
-
2263
- # Include time information in content
2264
- formatted_chunks = []
2265
- for c in maybe_trun_chunks:
2266
- chunk_text = "File path: " + c["file_path"] + "\r\n\r\n" + c["content"]
2267
- if c["created_at"]:
2268
- chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\r\n\r\n{chunk_text}"
2269
- formatted_chunks.append(chunk_text)
2270
-
2271
- logger.debug(
2272
- f"Truncate chunks from {len(valid_chunks)} to {len(formatted_chunks)} (max tokens:{query_param.max_token_for_text_unit})"
2273
- )
2274
- return "\r\n\r\n--New Chunk--\r\n\r\n".join(formatted_chunks)
2275
- except Exception as e:
2276
- logger.error(f"Error in _get_vector_context: {e}")
2277
- return None
2278
-
2279
-
2280
  async def query_with_keywords(
2281
  query: str,
2282
  prompt: str,
@@ -2320,12 +2122,15 @@ async def query_with_keywords(
2320
  )
2321
 
2322
  # Create a new string with the prompt and the keywords
2323
- ll_keywords_str = ", ".join(ll_keywords)
2324
- hl_keywords_str = ", ".join(hl_keywords)
2325
- formatted_question = f"{prompt}\n\n### Keywords:\nHigh-level: {hl_keywords_str}\nLow-level: {ll_keywords_str}\n\n### Query:\n{query}"
 
 
 
2326
 
2327
  # Use appropriate query method based on mode
2328
- if param.mode in ["local", "global", "hybrid"]:
2329
  return await kg_query_with_keywords(
2330
  formatted_question,
2331
  knowledge_graph_inst,
@@ -2335,6 +2140,9 @@ async def query_with_keywords(
2335
  param,
2336
  global_config,
2337
  hashing_kv=hashing_kv,
 
 
 
2338
  )
2339
  elif param.mode == "naive":
2340
  return await naive_query(
@@ -2345,17 +2153,5 @@ async def query_with_keywords(
2345
  global_config,
2346
  hashing_kv=hashing_kv,
2347
  )
2348
- elif param.mode == "mix":
2349
- return await mix_kg_vector_query(
2350
- formatted_question,
2351
- knowledge_graph_inst,
2352
- entities_vdb,
2353
- relationships_vdb,
2354
- chunks_vdb,
2355
- text_chunks_db,
2356
- param,
2357
- global_config,
2358
- hashing_kv=hashing_kv,
2359
- )
2360
  else:
2361
  raise ValueError(f"Unknown mode {param.mode}")
 
2
  from functools import partial
3
 
4
  import asyncio
 
5
  import json
6
  import re
7
  import os
 
25
  CacheData,
26
  get_conversation_turns,
27
  use_llm_func_with_cache,
 
28
  )
29
  from .base import (
30
  BaseGraphStorage,
 
857
  global_config: dict[str, str],
858
  hashing_kv: BaseKVStorage | None = None,
859
  system_prompt: str | None = None,
860
+ chunks_vdb: BaseVectorStorage = None,
861
  ) -> str | AsyncIterator[str]:
862
  if query_param.model_func:
863
  use_model_func = query_param.model_func
 
910
  relationships_vdb,
911
  text_chunks_db,
912
  query_param,
913
+ chunks_vdb,
914
  )
915
 
916
  if query_param.only_need_context:
 
1110
  return hl_keywords, ll_keywords
1111
 
1112
 
1113
+ async def _get_vector_context(
1114
  query: str,
 
 
 
1115
  chunks_vdb: BaseVectorStorage,
 
1116
  query_param: QueryParam,
1117
+ tokenizer: Tokenizer,
1118
+ ) -> tuple[list, list, list] | None:
 
 
 
 
 
 
 
 
 
1119
  """
1120
+ Retrieve vector context from the vector database.
 
1121
 
1122
+ This function performs vector search to find relevant text chunks for a query,
1123
+ formats them with file path and creation time information.
 
 
 
 
1124
 
1125
+ Args:
1126
+ query: The query string to search for
1127
+ chunks_vdb: Vector database containing document chunks
1128
+ query_param: Query parameters including top_k and ids
1129
+ tokenizer: Tokenizer for counting tokens
 
 
1130
 
1131
+ Returns:
1132
+ Tuple (empty_entities, empty_relations, text_units) for combine_contexts,
1133
+ compatible with _get_edge_data and _get_node_data format
1134
+ """
1135
+ try:
1136
+ results = await chunks_vdb.query(
1137
+ query, top_k=query_param.top_k, ids=query_param.ids
1138
  )
1139
+ if not results:
1140
+ return [], [], []
1141
 
1142
+ valid_chunks = []
1143
+ for result in results:
1144
+ if "content" in result:
1145
+ # Directly use content from chunks_vdb.query result
1146
+ chunk_with_time = {
1147
+ "content": result["content"],
1148
+ "created_at": result.get("created_at", None),
1149
+ "file_path": result.get("file_path", "unknown_source"),
1150
+ }
1151
+ valid_chunks.append(chunk_with_time)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1152
 
1153
+ if not valid_chunks:
1154
+ return [], [], []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1155
 
1156
+ maybe_trun_chunks = truncate_list_by_token_size(
1157
+ valid_chunks,
1158
+ key=lambda x: x["content"],
1159
+ max_token_size=query_param.max_token_for_text_unit,
1160
+ tokenizer=tokenizer,
1161
+ )
1162
 
1163
+ logger.debug(
1164
+ f"Truncate chunks from {len(valid_chunks)} to {len(maybe_trun_chunks)} (max tokens:{query_param.max_token_for_text_unit})"
1165
+ )
1166
+ logger.info(
1167
+ f"Vector query: {len(maybe_trun_chunks)} chunks, top_k: {query_param.top_k}"
1168
+ )
1169
 
1170
+ if not maybe_trun_chunks:
1171
+ return [], [], []
 
 
 
 
1172
 
1173
+ # Create empty entities and relations contexts
1174
+ entities_context = []
1175
+ relations_context = []
 
 
 
 
 
 
 
 
1176
 
1177
+ # Create text_units_context directly as a list of dictionaries
1178
+ text_units_context = []
1179
+ for i, chunk in enumerate(maybe_trun_chunks):
1180
+ text_units_context.append(
1181
+ {
1182
+ "id": i + 1,
1183
+ "content": chunk["content"],
1184
+ "file_path": chunk["file_path"],
1185
+ }
 
 
 
 
 
1186
  )
1187
 
1188
+ return entities_context, relations_context, text_units_context
1189
+ except Exception as e:
1190
+ logger.error(f"Error in _get_vector_context: {e}")
1191
+ return [], [], []
1192
 
1193
 
1194
  async def _build_query_context(
 
1199
  relationships_vdb: BaseVectorStorage,
1200
  text_chunks_db: BaseKVStorage,
1201
  query_param: QueryParam,
1202
+ chunks_vdb: BaseVectorStorage = None, # Add chunks_vdb parameter for mix mode
1203
  ):
1204
+ logger.info(f"Process {os.getpid()} building query context...")
1205
+
1206
+ # Handle local and global modes as before
1207
  if query_param.mode == "local":
1208
  entities_context, relations_context, text_units_context = await _get_node_data(
1209
  ll_keywords,
 
1220
  text_chunks_db,
1221
  query_param,
1222
  )
1223
+ else: # hybrid or mix mode
1224
  ll_data = await _get_node_data(
1225
  ll_keywords,
1226
  knowledge_graph_inst,
 
1248
  hl_text_units_context,
1249
  ) = hl_data
1250
 
1251
+ # Initialize vector data with empty lists
1252
+ vector_entities_context, vector_relations_context, vector_text_units_context = (
1253
+ [],
1254
+ [],
1255
+ [],
1256
+ )
1257
+
1258
+ # Only get vector data if in mix mode
1259
+ if query_param.mode == "mix" and hasattr(query_param, "original_query"):
1260
+ # Get tokenizer from text_chunks_db
1261
+ tokenizer = text_chunks_db.global_config.get("tokenizer")
1262
+
1263
+ # Get vector context in triple format
1264
+ vector_data = await _get_vector_context(
1265
+ query_param.original_query, # We need to pass the original query
1266
+ chunks_vdb,
1267
+ query_param,
1268
+ tokenizer,
1269
+ )
1270
+
1271
+ # If vector_data is not None, unpack it
1272
+ if vector_data is not None:
1273
+ (
1274
+ vector_entities_context,
1275
+ vector_relations_context,
1276
+ vector_text_units_context,
1277
+ ) = vector_data
1278
+
1279
+ # Combine and deduplicate the entities, relationships, and sources
1280
+ entities_context = process_combine_contexts(
1281
+ hl_entities_context, ll_entities_context, vector_entities_context
1282
+ )
1283
+ relations_context = process_combine_contexts(
1284
+ hl_relations_context, ll_relations_context, vector_relations_context
1285
+ )
1286
+ text_units_context = process_combine_contexts(
1287
+ hl_text_units_context, ll_text_units_context, vector_text_units_context
1288
  )
1289
  # not necessary to use LLM to generate a response
1290
  if not entities_context and not relations_context:
 
1392
  )
1393
 
1394
  # build prompt
1395
+ entities_context = []
 
 
 
 
 
 
 
 
 
 
1396
  for i, n in enumerate(node_datas):
1397
  created_at = n.get("created_at", "UNKNOWN")
1398
  if isinstance(created_at, (int, float)):
 
1401
  # Get file path from node data
1402
  file_path = n.get("file_path", "unknown_source")
1403
 
1404
+ entities_context.append(
1405
+ {
1406
+ "id": i + 1,
1407
+ "entity": n["entity_name"],
1408
+ "type": n.get("entity_type", "UNKNOWN"),
1409
+ "description": n.get("description", "UNKNOWN"),
1410
+ "rank": n["rank"],
1411
+ "created_at": created_at,
1412
+ "file_path": file_path,
1413
+ }
1414
  )
1415
+
1416
+ relations_context = []
 
 
 
 
 
 
 
 
 
 
 
 
 
1417
  for i, e in enumerate(use_relations):
1418
  created_at = e.get("created_at", "UNKNOWN")
1419
  # Convert timestamp to readable format
 
1423
  # Get file path from edge data
1424
  file_path = e.get("file_path", "unknown_source")
1425
 
1426
+ relations_context.append(
1427
+ {
1428
+ "id": i + 1,
1429
+ "entity1": e["src_tgt"][0],
1430
+ "entity2": e["src_tgt"][1],
1431
+ "description": e["description"],
1432
+ "keywords": e["keywords"],
1433
+ "weight": e["weight"],
1434
+ "rank": e["rank"],
1435
+ "created_at": created_at,
1436
+ "file_path": file_path,
1437
+ }
1438
  )
 
1439
 
1440
+ text_units_context = []
1441
  for i, t in enumerate(use_text_units):
1442
+ text_units_context.append(
1443
+ {
1444
+ "id": i + 1,
1445
+ "content": t["content"],
1446
+ "file_path": t.get("file_path", "unknown_source"),
1447
+ }
1448
  )
 
1449
  return entities_context, relations_context, text_units_context
1450
 
1451
 
 
1688
  f"Global query uses {len(use_entities)} entites, {len(edge_datas)} relations, {len(use_text_units)} chunks"
1689
  )
1690
 
1691
+ relations_context = []
 
 
 
 
 
 
 
 
 
 
 
 
1692
  for i, e in enumerate(edge_datas):
1693
  created_at = e.get("created_at", "UNKNOWN")
1694
  # Convert timestamp to readable format
 
1698
  # Get file path from edge data
1699
  file_path = e.get("file_path", "unknown_source")
1700
 
1701
+ relations_context.append(
1702
+ {
1703
+ "id": i + 1,
1704
+ "entity1": e["src_id"],
1705
+ "entity2": e["tgt_id"],
1706
+ "description": e["description"],
1707
+ "keywords": e["keywords"],
1708
+ "weight": e["weight"],
1709
+ "rank": e["rank"],
1710
+ "created_at": created_at,
1711
+ "file_path": file_path,
1712
+ }
1713
  )
 
1714
 
1715
+ entities_context = []
 
 
1716
  for i, n in enumerate(use_entities):
1717
  created_at = n.get("created_at", "UNKNOWN")
1718
  # Convert timestamp to readable format
 
1722
  # Get file path from node data
1723
  file_path = n.get("file_path", "unknown_source")
1724
 
1725
+ entities_context.append(
1726
+ {
1727
+ "id": i + 1,
1728
+ "entity": n["entity_name"],
1729
+ "type": n.get("entity_type", "UNKNOWN"),
1730
+ "description": n.get("description", "UNKNOWN"),
1731
+ "rank": n["rank"],
1732
+ "created_at": created_at,
1733
+ "file_path": file_path,
1734
+ }
1735
  )
 
1736
 
1737
+ text_units_context = []
1738
  for i, t in enumerate(use_text_units):
1739
+ text_units_context.append(
1740
+ {
1741
+ "id": i + 1,
1742
+ "content": t["content"],
1743
+ "file_path": t.get("file_path", "unknown"),
1744
+ }
1745
+ )
1746
  return entities_context, relations_context, text_units_context
1747
 
1748
 
 
1858
  return all_text_units
1859
 
1860
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1861
  async def naive_query(
1862
  query: str,
1863
  chunks_vdb: BaseVectorStorage,
 
1864
  query_param: QueryParam,
1865
  global_config: dict[str, str],
1866
  hashing_kv: BaseKVStorage | None = None,
 
1882
  return cached_response
1883
 
1884
  tokenizer: Tokenizer = global_config["tokenizer"]
 
1885
 
1886
+ _, _, text_units_context = await _get_vector_context(
1887
+ query, chunks_vdb, query_param, tokenizer
1888
+ )
1889
+
1890
+ if text_units_context is None or len(text_units_context) == 0:
1891
  return PROMPTS["fail_response"]
1892
 
1893
+ text_units_str = json.dumps(text_units_context, ensure_ascii=False)
1894
  if query_param.only_need_context:
1895
+ return f"""
1896
+ ---Document Chunks---
1897
+
1898
+ ```json
1899
+ {text_units_str}
1900
+ ```
1901
 
1902
+ """
1903
  # Process conversation history
1904
  history_context = ""
1905
  if query_param.conversation_history:
 
1909
 
1910
  sys_prompt_temp = system_prompt if system_prompt else PROMPTS["naive_rag_response"]
1911
  sys_prompt = sys_prompt_temp.format(
1912
+ content_data=text_units_str,
1913
  response_type=query_param.response_type,
1914
  history=history_context,
1915
  )
 
1966
  query_param: QueryParam,
1967
  global_config: dict[str, str],
1968
  hashing_kv: BaseKVStorage | None = None,
1969
+ ll_keywords: list[str] = [],
1970
+ hl_keywords: list[str] = [],
1971
+ chunks_vdb: BaseVectorStorage | None = None,
1972
  ) -> str | AsyncIterator[str]:
1973
  """
1974
  Refactored kg_query that does NOT extract keywords by itself.
 
1982
  # Apply higher priority (5) to query relation LLM function
1983
  use_model_func = partial(use_model_func, _priority=5)
1984
 
 
 
 
1985
  args_hash = compute_args_hash(query_param.mode, query, cache_type="query")
1986
  cached_response, quantized, min_val, max_val = await handle_cache(
1987
  hashing_kv, args_hash, query, query_param.mode, cache_type="query"
 
1989
  if cached_response is not None:
1990
  return cached_response
1991
 
 
 
 
 
 
 
 
 
1992
  # If neither has any keywords, you could handle that logic here.
1993
  if not hl_keywords and not ll_keywords:
1994
  logger.warning(
 
2002
  logger.warning("high_level_keywords is empty, switching to local mode.")
2003
  query_param.mode = "local"
2004
 
2005
+ ll_keywords_str = ", ".join(ll_keywords) if ll_keywords else ""
2006
+ hl_keywords_str = ", ".join(hl_keywords) if hl_keywords else ""
 
 
 
 
 
 
 
 
 
 
 
 
 
2007
 
 
 
 
2008
  context = await _build_query_context(
2009
  ll_keywords_str,
2010
  hl_keywords_str,
 
2013
  relationships_vdb,
2014
  text_chunks_db,
2015
  query_param,
2016
+ chunks_vdb=chunks_vdb,
2017
  )
2018
  if not context:
2019
  return PROMPTS["fail_response"]
2020
 
 
2021
  if query_param.only_need_context:
2022
  return context
2023
 
 
 
 
 
2024
  # Process conversation history
2025
  history_context = ""
2026
  if query_param.conversation_history:
 
2062
  )
2063
 
2064
  if hashing_kv.global_config.get("enable_llm_cache"):
 
2065
  await save_to_cache(
2066
  hashing_kv,
2067
  CacheData(
 
2079
  return response
2080
 
2081
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2082
  async def query_with_keywords(
2083
  query: str,
2084
  prompt: str,
 
2122
  )
2123
 
2124
  # Create a new string with the prompt and the keywords
2125
+ keywords_str = ", ".join(ll_keywords + hl_keywords)
2126
+ formatted_question = (
2127
+ f"{prompt}\n\n### Keywords\n\n{keywords_str}\n\n### Query\n\n{query}"
2128
+ )
2129
+
2130
+ param.original_query = query
2131
 
2132
  # Use appropriate query method based on mode
2133
+ if param.mode in ["local", "global", "hybrid", "mix"]:
2134
  return await kg_query_with_keywords(
2135
  formatted_question,
2136
  knowledge_graph_inst,
 
2140
  param,
2141
  global_config,
2142
  hashing_kv=hashing_kv,
2143
+ hl_keywords=hl_keywords,
2144
+ ll_keywords=ll_keywords,
2145
+ chunks_vdb=chunks_vdb,
2146
  )
2147
  elif param.mode == "naive":
2148
  return await naive_query(
 
2153
  global_config,
2154
  hashing_kv=hashing_kv,
2155
  )
 
 
 
 
 
 
 
 
 
 
 
 
2156
  else:
2157
  raise ValueError(f"Unknown mode {param.mode}")
lightrag/prompt.py CHANGED
@@ -311,7 +311,7 @@ When handling content with timestamps:
311
  ---Conversation History---
312
  {history}
313
 
314
- ---Document Chunks---
315
  {content_data}
316
 
317
  ---Response Rules---
@@ -320,7 +320,7 @@ When handling content with timestamps:
320
  - Use markdown formatting with appropriate section headings
321
  - Please respond in the same language as the user's question.
322
  - Ensure the response maintains continuity with the conversation history.
323
- - List up to 5 most important reference sources at the end under "References" section. Clearly indicating whether each source is from Knowledge Graph (KG) or Vector Data (DC), and include the file path if available, in the following format: [KG/DC] file_path
324
  - If you don't know the answer, just say so.
325
  - Do not include information not provided by the Document Chunks."""
326
 
@@ -347,41 +347,3 @@ Similarity score criteria:
347
  0.5: Partially related and answer needs modification to be used
348
  Return only a number between 0-1, without any additional content.
349
  """
350
-
351
- PROMPTS["mix_rag_response"] = """---Role---
352
-
353
- You are a helpful assistant responding to user query about Data Sources provided below.
354
-
355
-
356
- ---Goal---
357
-
358
- Generate a concise response based on Data Sources and follow Response Rules, considering both the conversation history and the current query. Data sources contain two parts: Knowledge Graph(KG) and Document Chunks(DC). Summarize all information in the provided Data Sources, and incorporating general knowledge relevant to the Data Sources. Do not include information not provided by Data Sources.
359
-
360
- When handling information with timestamps:
361
- 1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge
362
- 2. When encountering conflicting information, consider both the content/relationship and the timestamp
363
- 3. Don't automatically prefer the most recent information - use judgment based on the context
364
- 4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
365
-
366
- ---Conversation History---
367
- {history}
368
-
369
- ---Data Sources---
370
-
371
- 1. From Knowledge Graph(KG):
372
- {kg_context}
373
-
374
- 2. From Document Chunks(DC):
375
- {vector_context}
376
-
377
- ---Response Rules---
378
-
379
- - Target format and length: {response_type}
380
- - Use markdown formatting with appropriate section headings
381
- - Please respond in the same language as the user's question.
382
- - Ensure the response maintains continuity with the conversation history.
383
- - Organize answer in sections focusing on one main point or aspect of the answer
384
- - Use clear and descriptive section titles that reflect the content
385
- - List up to 5 most important reference sources at the end under "References" section. Clearly indicating whether each source is from Knowledge Graph (KG) or Vector Data (DC), and include the file path if available, in the following format: [KG/DC] file_path
386
- - If you don't know the answer, just say so. Do not make anything up.
387
- - Do not include information not provided by the Data Sources."""
 
311
  ---Conversation History---
312
  {history}
313
 
314
+ ---Document Chunks(DC)---
315
  {content_data}
316
 
317
  ---Response Rules---
 
320
  - Use markdown formatting with appropriate section headings
321
  - Please respond in the same language as the user's question.
322
  - Ensure the response maintains continuity with the conversation history.
323
+ - List up to 5 most important reference sources at the end under "References" section. Clearly indicating each source from Document Chunks(DC), and include the file path if available, in the following format: [DC] file_path
324
  - If you don't know the answer, just say so.
325
  - Do not include information not provided by the Document Chunks."""
326
 
 
347
  0.5: Partially related and answer needs modification to be used
348
  Return only a number between 0-1, without any additional content.
349
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
lightrag/utils.py CHANGED
@@ -719,26 +719,6 @@ def truncate_list_by_token_size(
719
  return list_data
720
 
721
 
722
- def list_of_list_to_json(data: list[list[str]]) -> list[dict[str, str]]:
723
- if not data or len(data) <= 1:
724
- return []
725
-
726
- header = data[0]
727
- result = []
728
-
729
- for row in data[1:]:
730
- if len(row) >= 2:
731
- item = {}
732
- for i, field_name in enumerate(header):
733
- if i < len(row):
734
- item[field_name] = str(row[i])
735
- else:
736
- item[field_name] = ""
737
- result.append(item)
738
-
739
- return result
740
-
741
-
742
  def save_data_to_file(data, file_name):
743
  with open(file_name, "w", encoding="utf-8") as f:
744
  json.dump(data, f, ensure_ascii=False, indent=4)
@@ -804,21 +784,33 @@ def xml_to_json(xml_file):
804
  return None
805
 
806
 
807
- def process_combine_contexts(
808
- hl_context: list[dict[str, str]], ll_context: list[dict[str, str]]
809
- ):
 
 
 
 
 
 
 
810
  seen_content = {}
811
  combined_data = []
812
 
813
- for item in hl_context + ll_context:
814
- content_dict = {k: v for k, v in item.items() if k != "id"}
815
- content_key = tuple(sorted(content_dict.items()))
816
- if content_key not in seen_content:
817
- seen_content[content_key] = item
818
- combined_data.append(item)
819
-
 
 
 
 
 
820
  for i, item in enumerate(combined_data):
821
- item["id"] = str(i)
822
 
823
  return combined_data
824
 
 
719
  return list_data
720
 
721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
722
  def save_data_to_file(data, file_name):
723
  with open(file_name, "w", encoding="utf-8") as f:
724
  json.dump(data, f, ensure_ascii=False, indent=4)
 
784
  return None
785
 
786
 
787
def process_combine_contexts(*context_lists) -> list[dict]:
    """
    Combine multiple context lists and remove duplicate content.

    Two items are duplicates when every field other than "id" is equal.
    The first occurrence wins and keeps its position; later ones are dropped.
    The result is then renumbered with string IDs starting at "1".

    The input lists and the dicts inside them are left untouched: each kept
    item is shallow-copied before its "id" is rewritten, so callers can keep
    using their original contexts after the call.

    Args:
        *context_lists: Any number of context lists (lists of dicts).
            Empty or None entries are skipped.

    Returns:
        A new list of new dicts with duplicates removed and "id" reassigned.

    Raises:
        TypeError: If a field value other than "id" is unhashable, because
            the field values form part of the deduplication key.
    """
    seen_keys = set()
    unique_items = []

    # Iterate through all input context lists
    for context_list in context_lists:
        if not context_list:  # Skip empty lists and None
            continue
        for item in context_list:
            # Field names are unique, so sorting the (name, value) pairs only
            # ever compares the names; values of different types are fine.
            content_key = tuple(
                sorted((k, v) for k, v in item.items() if k != "id")
            )
            if content_key in seen_keys:
                continue
            seen_keys.add(content_key)
            unique_items.append(item)

    # Reassign IDs on copies. Overriding an existing "id" in the merge keeps
    # its original position, so serialized field order does not change.
    return [
        {**item, "id": str(position)}
        for position, item in enumerate(unique_items, start=1)
    ]
816
 
lightrag_webui/src/api/lightrag.ts CHANGED
@@ -94,10 +94,6 @@ export type QueryRequest = {
94
  max_token_for_global_context?: number
95
  /** Maximum number of tokens allocated for entity descriptions in local retrieval. */
96
  max_token_for_local_context?: number
97
- /** List of high-level keywords to prioritize in retrieval. */
98
- hl_keywords?: string[]
99
- /** List of low-level keywords to refine retrieval focus. */
100
- ll_keywords?: string[]
101
  /**
102
  * Stores past conversation history to maintain context.
103
  * Format: [{"role": "user/assistant", "content": "message"}].
 
94
  max_token_for_global_context?: number
95
  /** Maximum number of tokens allocated for entity descriptions in local retrieval. */
96
  max_token_for_local_context?: number
 
 
 
 
97
  /**
98
  * Stores past conversation history to maintain context.
99
  * Format: [{"role": "user/assistant", "content": "message"}].
lightrag_webui/src/components/retrieval/QuerySettings.tsx CHANGED
@@ -1,7 +1,6 @@
1
  import { useCallback } from 'react'
2
  import { QueryMode, QueryRequest } from '@/api/lightrag'
3
  // Removed unused import for Text component
4
- import Input from '@/components/ui/Input'
5
  import Checkbox from '@/components/ui/Checkbox'
6
  import NumberInput from '@/components/ui/NumberInput'
7
  import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/Card'
@@ -242,71 +241,6 @@ export default function QuerySettings() {
242
  </div>
243
  </>
244
 
245
- {/* Keywords */}
246
- <>
247
- <>
248
- <TooltipProvider>
249
- <Tooltip>
250
- <TooltipTrigger asChild>
251
- <label htmlFor="hl_keywords" className="ml-1 cursor-help">
252
- {t('retrievePanel.querySettings.hlKeywords')}
253
- </label>
254
- </TooltipTrigger>
255
- <TooltipContent side="left">
256
- <p>{t('retrievePanel.querySettings.hlKeywordsTooltip')}</p>
257
- </TooltipContent>
258
- </Tooltip>
259
- </TooltipProvider>
260
- <div>
261
- {/* Removed sr-only label */}
262
- <Input
263
- id="hl_keywords"
264
- type="text"
265
- value={querySettings.hl_keywords?.join(', ')}
266
- onChange={(e) => {
267
- const keywords = e.target.value
268
- .split(',')
269
- .map((k) => k.trim())
270
- .filter((k) => k !== '')
271
- handleChange('hl_keywords', keywords)
272
- }}
273
- placeholder={t('retrievePanel.querySettings.hlkeywordsPlaceHolder')}
274
- />
275
- </div>
276
- </>
277
-
278
- <>
279
- <TooltipProvider>
280
- <Tooltip>
281
- <TooltipTrigger asChild>
282
- <label htmlFor="ll_keywords" className="ml-1 cursor-help">
283
- {t('retrievePanel.querySettings.llKeywords')}
284
- </label>
285
- </TooltipTrigger>
286
- <TooltipContent side="left">
287
- <p>{t('retrievePanel.querySettings.llKeywordsTooltip')}</p>
288
- </TooltipContent>
289
- </Tooltip>
290
- </TooltipProvider>
291
- <div>
292
- {/* Removed sr-only label */}
293
- <Input
294
- id="ll_keywords"
295
- type="text"
296
- value={querySettings.ll_keywords?.join(', ')}
297
- onChange={(e) => {
298
- const keywords = e.target.value
299
- .split(',')
300
- .map((k) => k.trim())
301
- .filter((k) => k !== '')
302
- handleChange('ll_keywords', keywords)
303
- }}
304
- placeholder={t('retrievePanel.querySettings.hlkeywordsPlaceHolder')}
305
- />
306
- </div>
307
- </>
308
- </>
309
-
310
  {/* Toggle Options */}
311
  <>
312
  <div className="flex items-center gap-2">
 
1
  import { useCallback } from 'react'
2
  import { QueryMode, QueryRequest } from '@/api/lightrag'
3
  // Removed unused import for Text component
 
4
  import Checkbox from '@/components/ui/Checkbox'
5
  import NumberInput from '@/components/ui/NumberInput'
6
  import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/Card'
 
241
  </div>
242
  </>
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  {/* Toggle Options */}
245
  <>
246
  <div className="flex items-center gap-2">