Magicyuan committed on
Commit 18ef39c · 1 Parent(s): 6c876fd

Fix bug where args_hash was computed only when the regular cache was used, so it was never computed when the embedding cache was enabled

Files changed (2)
  1. README.md +5 -1
  2. lightrag/llm.py +42 -6
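The bug in a nutshell: before this commit, `args_hash` was assigned only inside the regular-cache branch, so with the embedding cache enabled the variable was never bound and later references to it failed. A toy sketch of the pattern (illustrative names only, not the actual lightrag code):

```python
# Before: args_hash is bound only on the regular-cache path.
def before_fix(embedding_cache_enabled: bool) -> str:
    if embedding_cache_enabled:
        pass  # embedding-cache lookup; args_hash is never assigned here
    else:
        args_hash = "hash(model, messages)"  # only bound on this path
    return args_hash  # UnboundLocalError when embedding_cache_enabled is True


# After: compute the hash up front, before branching on the cache type.
def after_fix(embedding_cache_enabled: bool) -> str:
    args_hash = "hash(model, messages)"  # always bound
    if embedding_cache_enabled:
        pass  # embedding-cache lookup can still fall back to the regular cache
    return args_hash
```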
README.md CHANGED
@@ -596,7 +596,11 @@ if __name__ == "__main__":
 | **enable\_llm\_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` |
 | **addon\_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese"}`: sets example limit and output language | `example_number: all examples, language: English` |
 | **convert\_response\_to\_json\_func** | `callable` | Not used | `convert_response_to_json` |
-| **embedding\_cache\_config** | `dict` | Configuration for embedding cache. Includes `enabled` (bool) to toggle cache and `similarity_threshold` (float) for cache retrieval | `{"enabled": False, "similarity_threshold": 0.95}` |
+| **embedding\_cache\_config** | `dict` | Configuration for question-answer caching. Contains two parameters:
+- `enabled`: Boolean value to enable/disable caching functionality. When enabled, questions and answers will be cached.
+- `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM.
+
+Default: `{"enabled": False, "similarity_threshold": 0.95}` | `{"enabled": False, "similarity_threshold": 0.95}` |
 
 ## API Server Implementation
 
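For reference, a minimal usage sketch of the `embedding_cache_config` parameter documented above. The surrounding arguments (`working_dir`, `llm_model_func`, `gpt_4o_mini_complete`) follow typical LightRAG examples and are assumptions, not part of this diff:

```python
from lightrag import LightRAG
from lightrag.llm import gpt_4o_mini_complete

# Enable question-answer caching: a cached answer is reused whenever a new
# question's similarity to a cached question exceeds the threshold.
rag = LightRAG(
    working_dir="./rag_storage",
    llm_model_func=gpt_4o_mini_complete,
    embedding_cache_config={"enabled": True, "similarity_threshold": 0.95},
)
```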
lightrag/llm.py CHANGED
@@ -66,7 +66,11 @@ async def openai_complete_if_cache(
         messages.append({"role": "system", "content": system_prompt})
     messages.extend(history_messages)
     messages.append({"role": "user", "content": prompt})
+
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
@@ -86,7 +90,6 @@ async def openai_complete_if_cache(
                 return best_cached_response
         else:
             # Use regular cache
-            args_hash = compute_args_hash(model, messages)
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]
@@ -159,7 +162,12 @@ async def azure_openai_complete_if_cache(
     messages.extend(history_messages)
     if prompt is not None:
         messages.append({"role": "user", "content": prompt})
+
+    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
@@ -178,7 +186,7 @@ async def azure_openai_complete_if_cache(
             if best_cached_response is not None:
                 return best_cached_response
         else:
-            args_hash = compute_args_hash(model, messages)
+            # Use regular cache
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]
@@ -271,6 +279,9 @@ async def bedrock_complete_if_cache(
 
     hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
@@ -290,7 +301,6 @@ async def bedrock_complete_if_cache(
                 return best_cached_response
         else:
             # Use regular cache
-            args_hash = compute_args_hash(model, messages)
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]
@@ -343,6 +353,11 @@ def initialize_hf_model(model_name):
     return hf_model, hf_tokenizer
 
 
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
+)
 async def hf_model_if_cache(
     model,
     prompt,
@@ -359,7 +374,11 @@ async def hf_model_if_cache(
     messages.extend(history_messages)
     messages.append({"role": "user", "content": prompt})
 
+    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
@@ -379,7 +398,6 @@ async def hf_model_if_cache(
                 return best_cached_response
         else:
             # Use regular cache
-            args_hash = compute_args_hash(model, messages)
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]
@@ -448,6 +466,11 @@ async def hf_model_if_cache(
     return response_text
 
 
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
+)
 async def ollama_model_if_cache(
     model,
     prompt,
@@ -468,7 +491,12 @@ async def ollama_model_if_cache(
     hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     messages.extend(history_messages)
     messages.append({"role": "user", "content": prompt})
+
+    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
@@ -488,7 +516,6 @@ async def ollama_model_if_cache(
                 return best_cached_response
         else:
             # Use regular cache
-            args_hash = compute_args_hash(model, messages)
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]
@@ -542,6 +569,11 @@ def initialize_lmdeploy_pipeline(
     return lmdeploy_pipe
 
 
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
+)
 async def lmdeploy_model_if_cache(
     model,
     prompt,
@@ -620,7 +652,12 @@ async def lmdeploy_model_if_cache(
     hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     messages.extend(history_messages)
     messages.append({"role": "user", "content": prompt})
+
+    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
@@ -640,7 +677,6 @@ async def lmdeploy_model_if_cache(
                 return best_cached_response
         else:
             # Use regular cache
-            args_hash = compute_args_hash(model, messages)
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]
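Putting the repeated hunks together, every patched `*_complete_if_cache` / `*_model_if_cache` function now follows the same lookup order. Below is a condensed sketch of that flow; it assumes `compute_args_hash` is imported from `lightrag.utils` as in `llm.py`, and `lookup_by_embedding_similarity` is a hypothetical stand-in for the embedding-cache lookup, not an actual lightrag function:

```python
from lightrag.utils import compute_args_hash  # import path as assumed in llm.py


async def cached_llm_lookup(hashing_kv, model, messages, lookup_by_embedding_similarity):
    """Condensed sketch of the post-fix cache path shared by the patched functions."""
    if hashing_kv is None:
        return None  # caching disabled entirely

    # The fix: args_hash is computed as soon as caching is in play,
    # so it is bound no matter which cache branch runs below.
    args_hash = compute_args_hash(model, messages)

    embedding_cache_config = hashing_kv.global_config.get(
        "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}
    )
    if embedding_cache_config["enabled"]:
        # Embedding cache: return a stored answer for a sufficiently similar question.
        best_cached_response = await lookup_by_embedding_similarity(
            hashing_kv, messages, embedding_cache_config["similarity_threshold"]
        )
        if best_cached_response is not None:
            return best_cached_response
    else:
        # Regular cache: exact match on the hash of (model, messages).
        if_cache_return = await hashing_kv.get_by_id(args_hash)
        if if_cache_return is not None:
            return if_cache_return["return"]
    return None  # cache miss: the caller goes on to query the LLM
```

The commit also wraps `hf_model_if_cache`, `ollama_model_if_cache` and `lmdeploy_model_if_cache` in a tenacity-style `@retry` decorator (3 attempts, exponential backoff between 4 and 10 seconds, retrying on `RateLimitError`, `APIConnectionError` and `Timeout`), as shown in the three `@retry` hunks above.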