cleanup

lightrag/lightrag.py  CHANGED  (+78 -49)
@@ -231,23 +231,16 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
 class LightRAG:
     """LightRAG: Simple and Fast Retrieval-Augmented Generation."""

+    # Directory
+    # ---
+
     working_dir: str = field(
         default=f"./lightrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}"
     )
     """Directory where cache and temporary files are stored."""

-    embedding_cache_config: dict[str, Any] = field(
-        default={
-            "enabled": False,
-            "similarity_threshold": 0.95,
-            "use_llm_check": False,
-        }
-    )
-    """Configuration for embedding cache.
-    - enabled: If True, enables caching to avoid redundant computations.
-    - similarity_threshold: Minimum similarity score to use cached embeddings.
-    - use_llm_check: If True, validates cached embeddings using an LLM.
-    """
+    # Storage
+    # ---

     kv_storage: str = field(default="JsonKVStorage")
     """Storage backend for key-value data."""

@@ -262,13 +255,27 @@
     """Storage type for tracking document processing statuses."""

     # Logging
+    # ---
+
     log_level: int = field(default=logger.level)
     """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING')."""

     log_dir: str = field(default=os.getcwd())
     """Directory where logs are stored. Defaults to the current working directory."""

+    # Entity extraction
+    # ---
+
+    entity_extract_max_gleaning: int = field(default=1)
+    """Maximum number of entity extraction attempts for ambiguous content."""
+
+    entity_summary_to_max_tokens: int = field(
+        default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))
+    )
+
     # Text chunking
+    # ---
+
     chunk_token_size: int = field(default=int(os.getenv("CHUNK_SIZE", 1200)))
     """Maximum number of tokens per text chunk when splitting documents."""

@@ -280,16 +287,41 @@
     tiktoken_model_name: str = field(default="gpt-4o-mini")
     """Model name used for tokenization when chunking text."""

-    # Entity extraction
-    entity_extract_max_gleaning: int = field(default=1)
-    """Maximum number of entity extraction attempts for ambiguous content."""
-
-    entity_summary_to_max_tokens: int = field(
-        default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))
-    )
     """Maximum number of tokens used for summarizing extracted entities."""

+    chunking_func: Callable[
+        [
+            str,
+            str | None,
+            bool,
+            int,
+            int,
+            str,
+        ],
+        list[dict[str, Any]],
+    ] = field(default_factory=lambda: chunking_by_token_size)
+    """
+    Custom chunking function for splitting text into chunks before processing.
+
+    The function should take the following parameters:
+
+    - `content`: The text to be split into chunks.
+    - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
+    - `split_by_character_only`: If True, the text is split only on the specified character.
+    - `chunk_token_size`: The maximum number of tokens per chunk.
+    - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
+    - `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
+
+    The function should return a list of dictionaries, where each dictionary contains the following keys:
+    - `tokens`: The number of tokens in the chunk.
+    - `content`: The text content of the chunk.
+
+    Defaults to `chunking_by_token_size` if not specified.
+    """
+
     # Node embedding
+    # ---
+
     node_embedding_algorithm: str = field(default="node2vec")
     """Algorithm used for node embedding in knowledge graphs."""

@@ -312,6 +344,9 @@
     - random_seed: Seed value for reproducibility.
     """

+    # Embedding
+    # ---
+
     embedding_func: EmbeddingFunc | None = field(default=None)
     """Function for computing text embeddings. Must be set before use."""

@@ -321,7 +356,22 @@
     embedding_func_max_async: int = field(default=16)
     """Maximum number of concurrent embedding function calls."""

+    embedding_cache_config: dict[str, Any] = field(
+        default={
+            "enabled": False,
+            "similarity_threshold": 0.95,
+            "use_llm_check": False,
+        }
+    )
+    """Configuration for embedding cache.
+    - enabled: If True, enables caching to avoid redundant computations.
+    - similarity_threshold: Minimum similarity score to use cached embeddings.
+    - use_llm_check: If True, validates cached embeddings using an LLM.
+    """
+
     # LLM Configuration
+    # ---
+
     llm_model_func: Callable[..., object] | None = field(default=None)
     """Function for interacting with the large language model (LLM). Must be set before use."""

@@ -338,6 +388,8 @@
     """Additional keyword arguments passed to the LLM model function."""

     # Storage
+    # ---
+
     vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict)
     """Additional parameters for vector database storage."""

@@ -351,15 +403,22 @@
     """If True, enables caching for entity extraction steps to reduce LLM costs."""

     # Extensions
+    # ---
+
     max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20)))
     """Maximum number of parallel insert operations."""

     addon_params: dict[str, Any] = field(default_factory=dict)

     # Storages Management
+    # ---
+
     auto_manage_storages_states: bool = field(default=True)
     """If True, lightrag will automatically calls initialize_storages and finalize_storages at the appropriate times."""

+    # Storages Management
+    # ---
+
     convert_response_to_json_func: Callable[[str], dict[str, Any]] = field(
         default_factory=lambda: convert_response_to_json
     )

@@ -369,36 +428,6 @@
     The default function is :func:`.utils.convert_response_to_json`.
     """

-    chunking_func: Callable[
-        [
-            str,
-            str | None,
-            bool,
-            int,
-            int,
-            str,
-        ],
-        list[dict[str, Any]],
-    ] = field(default_factory=lambda: chunking_by_token_size)
-    """
-    Custom chunking function for splitting text into chunks before processing.
-
-    The function should take the following parameters:
-
-    - `content`: The text to be split into chunks.
-    - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
-    - `split_by_character_only`: If True, the text is split only on the specified character.
-    - `chunk_token_size`: The maximum number of tokens per chunk.
-    - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
-    - `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
-
-    The function should return a list of dictionaries, where each dictionary contains the following keys:
-    - `tokens`: The number of tokens in the chunk.
-    - `content`: The text content of the chunk.
-
-    Defaults to `chunking_by_token_size` if not specified.
-    """
-
     def verify_storage_implementation(
         self, storage_type: str, storage_name: str
     ) -> None:
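The docstring added for `chunking_func` pins down the expected call signature and return shape. Below is a minimal sketch of a conforming custom chunker; the paragraph-based strategy is hypothetical and only illustrates the contract, while the library's default remains `chunking_by_token_size`.

from __future__ import annotations

from typing import Any

import tiktoken


def chunk_by_paragraph(
    content: str,
    split_by_character: str | None = None,
    split_by_character_only: bool = False,
    chunk_token_size: int = 1200,
    chunk_overlap_token_size: int = 100,
    tiktoken_model_name: str = "gpt-4o-mini",
) -> list[dict[str, Any]]:
    # Sketch only: packs whole paragraphs into chunks of up to chunk_token_size
    # tokens and ignores chunk_overlap_token_size for brevity.
    encoder = tiktoken.encoding_for_model(tiktoken_model_name)
    separator = split_by_character or "\n\n"

    if split_by_character_only and split_by_character:
        # Split strictly on the requested character, one chunk per piece.
        return [
            {"tokens": len(encoder.encode(piece)), "content": piece}
            for piece in content.split(split_by_character)
            if piece.strip()
        ]

    chunks: list[dict[str, Any]] = []
    buffer = ""
    for paragraph in content.split(separator):
        candidate = f"{buffer}{separator}{paragraph}" if buffer else paragraph
        if buffer and len(encoder.encode(candidate)) > chunk_token_size:
            chunks.append({"tokens": len(encoder.encode(buffer)), "content": buffer})
            buffer = paragraph
        else:
            buffer = candidate
    if buffer:
        chunks.append({"tokens": len(encoder.encode(buffer)), "content": buffer})
    return chunks

Such a function would be supplied as `chunking_func=chunk_by_paragraph` when constructing `LightRAG`.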
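With `embedding_cache_config` now grouped under the embedding fields, an illustrative override of its three documented keys looks like the snippet below. The constructor keywords mirror the dataclass fields in this diff; the stub LLM and embedding functions, and the `EmbeddingFunc` wrapper from `lightrag.utils`, are stand-ins so the sketch is self-contained rather than a recommended setup.

import numpy as np

from lightrag import LightRAG
from lightrag.utils import EmbeddingFunc


async def stub_llm(prompt: str, **kwargs) -> str:
    # Stand-in; bind a real LLM function in practice.
    return "ok"


async def stub_embed(texts: list[str]) -> np.ndarray:
    # Stand-in; returns random vectors of the declared dimension.
    return np.random.rand(len(texts), 384)


rag = LightRAG(
    working_dir="./lightrag_cache_demo",
    llm_model_func=stub_llm,
    embedding_func=EmbeddingFunc(embedding_dim=384, max_token_size=8192, func=stub_embed),
    embedding_cache_config={
        "enabled": True,               # reuse cached embeddings when possible
        "similarity_threshold": 0.90,  # treat near-duplicate queries as cache hits
        "use_llm_check": False,        # skip the extra LLM validation pass
    },
)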
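`convert_response_to_json_func` accepts any `Callable[[str], dict[str, Any]]`; the default is `.utils.convert_response_to_json`. A hypothetical, stricter replacement that pulls the first JSON object out of the raw LLM response and refuses to return anything unparsed:

import json
import re
from typing import Any


def strict_json_converter(response: str) -> dict[str, Any]:
    # Grab the outermost {...} span from the raw response and parse it;
    # fail loudly instead of returning a partially recovered dict.
    match = re.search(r"\{.*\}", response, re.DOTALL)
    if match is None:
        raise ValueError("no JSON object found in LLM response")
    return json.loads(match.group(0))

It would be supplied as `convert_response_to_json_func=strict_json_converter`.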
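Several of the defaults above come from environment variables (`CHUNK_SIZE`, `MAX_TOKEN_SUMMARY`, `MAX_PARALLEL_INSERT`). Because they are read inside `field(default=...)` expressions, they are evaluated once when the class body runs, i.e. at import time, so they only take effect if set before `lightrag` is imported. A sketch with arbitrary example values:

import os

# Must be set before lightrag is imported, because the dataclass field
# defaults call os.getenv() when the class definition executes.
os.environ["CHUNK_SIZE"] = "800"           # feeds chunk_token_size
os.environ["MAX_TOKEN_SUMMARY"] = "300"    # feeds entity_summary_to_max_tokens
os.environ["MAX_PARALLEL_INSERT"] = "4"    # feeds max_parallel_insert

from lightrag import LightRAG  # noqa: E402  (imported after the env vars are set)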
|