YanSte committed
Commit ce79589 · 1 Parent(s): 63750e8
Files changed (1):
  1. lightrag/lightrag.py +78 -49
lightrag/lightrag.py CHANGED
@@ -231,23 +231,16 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
 class LightRAG:
     """LightRAG: Simple and Fast Retrieval-Augmented Generation."""
 
+    # Directory
+    # ---
+
     working_dir: str = field(
         default=f"./lightrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}"
     )
     """Directory where cache and temporary files are stored."""
 
-    embedding_cache_config: dict[str, Any] = field(
-        default={
-            "enabled": False,
-            "similarity_threshold": 0.95,
-            "use_llm_check": False,
-        }
-    )
-    """Configuration for embedding cache.
-    - enabled: If True, enables caching to avoid redundant computations.
-    - similarity_threshold: Minimum similarity score to use cached embeddings.
-    - use_llm_check: If True, validates cached embeddings using an LLM.
-    """
+    # Storage
+    # ---
 
     kv_storage: str = field(default="JsonKVStorage")
     """Storage backend for key-value data."""
@@ -262,13 +255,27 @@ class LightRAG:
     """Storage type for tracking document processing statuses."""
 
     # Logging
+    # ---
+
     log_level: int = field(default=logger.level)
     """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING')."""
 
     log_dir: str = field(default=os.getcwd())
     """Directory where logs are stored. Defaults to the current working directory."""
 
+    # Entity extraction
+    # ---
+
+    entity_extract_max_gleaning: int = field(default=1)
+    """Maximum number of entity extraction attempts for ambiguous content."""
+
+    entity_summary_to_max_tokens: int = field(
+        default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))
+    )
+
     # Text chunking
+    # ---
+
     chunk_token_size: int = field(default=int(os.getenv("CHUNK_SIZE", 1200)))
     """Maximum number of tokens per text chunk when splitting documents."""
 
@@ -280,16 +287,41 @@ class LightRAG:
     tiktoken_model_name: str = field(default="gpt-4o-mini")
     """Model name used for tokenization when chunking text."""
 
-    # Entity extraction
-    entity_extract_max_gleaning: int = field(default=1)
-    """Maximum number of entity extraction attempts for ambiguous content."""
-
-    entity_summary_to_max_tokens: int = field(
-        default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))
-    )
     """Maximum number of tokens used for summarizing extracted entities."""
 
+    chunking_func: Callable[
+        [
+            str,
+            str | None,
+            bool,
+            int,
+            int,
+            str,
+        ],
+        list[dict[str, Any]],
+    ] = field(default_factory=lambda: chunking_by_token_size)
+    """
+    Custom chunking function for splitting text into chunks before processing.
+
+    The function should take the following parameters:
+
+    - `content`: The text to be split into chunks.
+    - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
+    - `split_by_character_only`: If True, the text is split only on the specified character.
+    - `chunk_token_size`: The maximum number of tokens per chunk.
+    - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
+    - `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
+
+    The function should return a list of dictionaries, where each dictionary contains the following keys:
+    - `tokens`: The number of tokens in the chunk.
+    - `content`: The text content of the chunk.
+
+    Defaults to `chunking_by_token_size` if not specified.
+    """
+
     # Node embedding
+    # ---
+
     node_embedding_algorithm: str = field(default="node2vec")
     """Algorithm used for node embedding in knowledge graphs."""
 
@@ -312,6 +344,9 @@ class LightRAG:
     - random_seed: Seed value for reproducibility.
     """
 
+    # Embedding
+    # ---
+
     embedding_func: EmbeddingFunc | None = field(default=None)
     """Function for computing text embeddings. Must be set before use."""
 
@@ -321,7 +356,22 @@ class LightRAG:
     embedding_func_max_async: int = field(default=16)
     """Maximum number of concurrent embedding function calls."""
 
+    embedding_cache_config: dict[str, Any] = field(
+        default={
+            "enabled": False,
+            "similarity_threshold": 0.95,
+            "use_llm_check": False,
+        }
+    )
+    """Configuration for embedding cache.
+    - enabled: If True, enables caching to avoid redundant computations.
+    - similarity_threshold: Minimum similarity score to use cached embeddings.
+    - use_llm_check: If True, validates cached embeddings using an LLM.
+    """
+
     # LLM Configuration
+    # ---
+
     llm_model_func: Callable[..., object] | None = field(default=None)
     """Function for interacting with the large language model (LLM). Must be set before use."""
 
@@ -338,6 +388,8 @@ class LightRAG:
     """Additional keyword arguments passed to the LLM model function."""
 
     # Storage
+    # ---
+
     vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict)
     """Additional parameters for vector database storage."""
 
@@ -351,15 +403,22 @@ class LightRAG:
     """If True, enables caching for entity extraction steps to reduce LLM costs."""
 
     # Extensions
+    # ---
+
     max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20)))
     """Maximum number of parallel insert operations."""
 
     addon_params: dict[str, Any] = field(default_factory=dict)
 
     # Storages Management
+    # ---
+
     auto_manage_storages_states: bool = field(default=True)
     """If True, LightRAG automatically calls initialize_storages and finalize_storages at the appropriate times."""
 
+    # Storages Management
+    # ---
+
     convert_response_to_json_func: Callable[[str], dict[str, Any]] = field(
         default_factory=lambda: convert_response_to_json
     )
@@ -369,36 +428,6 @@ class LightRAG:
     The default function is :func:`.utils.convert_response_to_json`.
     """
 
-    chunking_func: Callable[
-        [
-            str,
-            str | None,
-            bool,
-            int,
-            int,
-            str,
-        ],
-        list[dict[str, Any]],
-    ] = field(default_factory=lambda: chunking_by_token_size)
-    """
-    Custom chunking function for splitting text into chunks before processing.
-
-    The function should take the following parameters:
-
-    - `content`: The text to be split into chunks.
-    - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
-    - `split_by_character_only`: If True, the text is split only on the specified character.
-    - `chunk_token_size`: The maximum number of tokens per chunk.
-    - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
-    - `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
-
-    The function should return a list of dictionaries, where each dictionary contains the following keys:
-    - `tokens`: The number of tokens in the chunk.
-    - `content`: The text content of the chunk.
-
-    Defaults to `chunking_by_token_size` if not specified.
-    """
-
     def verify_storage_implementation(
         self, storage_type: str, storage_name: str
     ) -> None:
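Both llm_model_func and embedding_func default to None and must be set before use. A minimal wiring sketch, assuming the EmbeddingFunc wrapper from lightrag.utils with its usual embedding_dim/max_token_size/func fields (both model functions are hypothetical stand-ins):

from lightrag import LightRAG
from lightrag.utils import EmbeddingFunc


async def my_llm(prompt: str, **kwargs) -> str:
    ...  # call your LLM of choice here


async def my_embed(texts: list[str]):
    ...  # return one embedding vector per input text


rag = LightRAG(
    working_dir="./lightrag_cache",
    llm_model_func=my_llm,
    embedding_func=EmbeddingFunc(
        embedding_dim=768,    # assumption: match your embedding model
        max_token_size=8192,  # assumption
        func=my_embed,
    ),
)

With auto_manage_storages_states=True (the default), initialize_storages and finalize_storages are then called automatically at the appropriate times.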
 