Fix bug where `args_hash` was only computed when using the regular cache, so it was never computed when the embedding cache was used
- README.md +5 -1
- lightrag/llm.py +42 -6
README.md

@@ -596,7 +596,11 @@ if __name__ == "__main__":
 | **enable_llm_cache** | `bool` | If `TRUE`, stores LLM results in cache; repeated prompts return cached responses | `TRUE` |
 | **addon_params** | `dict` | Additional parameters, e.g., `{"example_number": 1, "language": "Simplified Chinese"}`: sets example limit and output language | `example_number: all examples, language: English` |
 | **convert_response_to_json_func** | `callable` | Not used | `convert_response_to_json` |
-| **embedding_cache_config** | `dict` | Configuration for …
+| **embedding_cache_config** | `dict` | Configuration for question-answer caching. Contains two parameters:
+- `enabled`: Boolean value to enable/disable caching functionality. When enabled, questions and answers will be cached.
+- `similarity_threshold`: Float value (0-1), similarity threshold. When a new question's similarity with a cached question exceeds this threshold, the cached answer will be returned directly without calling the LLM.
+
+Default: `{"enabled": False, "similarity_threshold": 0.95}` | `{"enabled": False, "similarity_threshold": 0.95}` |
 
 ## API Server Implementation
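For reference, `embedding_cache_config` is read from the LightRAG global config, so it would be set when constructing `LightRAG`. Below is a minimal, hypothetical sketch of using the parameters documented above; the working directory value and the reliance on default LLM/embedding functions are illustrative assumptions, not part of this change:

```python
from lightrag import LightRAG

# Sketch only: enable the question-answer cache described in the table above.
# working_dir is a placeholder; LLM and embedding functions are left at their defaults.
rag = LightRAG(
    working_dir="./rag_storage",
    embedding_cache_config={
        "enabled": True,               # cache questions and answers
        "similarity_threshold": 0.95,  # reuse a cached answer above this similarity
    },
)
```

When `enabled` is `False` (the default), lookups fall back to the regular hash-keyed LLM cache, as shown in the `lightrag/llm.py` diff below.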
lightrag/llm.py

@@ -66,7 +66,11 @@ async def openai_complete_if_cache(
     messages.append({"role": "system", "content": system_prompt})
     messages.extend(history_messages)
     messages.append({"role": "user", "content": prompt})
+
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}

@@ -86,7 +90,6 @@ async def openai_complete_if_cache(
                 return best_cached_response
         else:
             # Use regular cache
-            args_hash = compute_args_hash(model, messages)
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]

@@ -159,7 +162,12 @@ async def azure_openai_complete_if_cache(
     messages.extend(history_messages)
     if prompt is not None:
         messages.append({"role": "user", "content": prompt})
+
+    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}

@@ -178,7 +186,7 @@ async def azure_openai_complete_if_cache(
             if best_cached_response is not None:
                 return best_cached_response
         else:
-            args_hash = compute_args_hash(model, messages)
+            # Use regular cache
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]

@@ -271,6 +279,9 @@ async def bedrock_complete_if_cache(
 
     hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}

@@ -290,7 +301,6 @@ async def bedrock_complete_if_cache(
                 return best_cached_response
         else:
             # Use regular cache
-            args_hash = compute_args_hash(model, messages)
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]

@@ -343,6 +353,11 @@ def initialize_hf_model(model_name):
     return hf_model, hf_tokenizer
 
 
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
+)
 async def hf_model_if_cache(
     model,
     prompt,

@@ -359,7 +374,11 @@ async def hf_model_if_cache(
     messages.extend(history_messages)
     messages.append({"role": "user", "content": prompt})
 
+    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}

@@ -379,7 +398,6 @@ async def hf_model_if_cache(
                 return best_cached_response
         else:
             # Use regular cache
-            args_hash = compute_args_hash(model, messages)
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]

@@ -448,6 +466,11 @@ async def hf_model_if_cache(
     return response_text
 
 
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
+)
 async def ollama_model_if_cache(
     model,
     prompt,

@@ -468,7 +491,12 @@ async def ollama_model_if_cache(
     hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     messages.extend(history_messages)
     messages.append({"role": "user", "content": prompt})
+
+    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}

@@ -488,7 +516,6 @@ async def ollama_model_if_cache(
                 return best_cached_response
         else:
             # Use regular cache
-            args_hash = compute_args_hash(model, messages)
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]

@@ -542,6 +569,11 @@ def initialize_lmdeploy_pipeline(
     return lmdeploy_pipe
 
 
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=4, max=10),
+    retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
+)
 async def lmdeploy_model_if_cache(
     model,
     prompt,

@@ -620,7 +652,12 @@ async def lmdeploy_model_if_cache(
     hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     messages.extend(history_messages)
     messages.append({"role": "user", "content": prompt})
+
+    hashing_kv: BaseKVStorage = kwargs.pop("hashing_kv", None)
     if hashing_kv is not None:
+        # Calculate args_hash only when using cache
+        args_hash = compute_args_hash(model, messages)
+
         # Get embedding cache configuration
         embedding_cache_config = hashing_kv.global_config.get(
             "embedding_cache_config", {"enabled": False, "similarity_threshold": 0.95}

@@ -640,7 +677,6 @@ async def lmdeploy_model_if_cache(
                 return best_cached_response
         else:
             # Use regular cache
-            args_hash = compute_args_hash(model, messages)
             if_cache_return = await hashing_kv.get_by_id(args_hash)
             if if_cache_return is not None:
                 return if_cache_return["return"]