cleanup

lightrag/lightrag.py  CHANGED  (+78 -49)
@@ -231,23 +231,16 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
 class LightRAG:
     """LightRAG: Simple and Fast Retrieval-Augmented Generation."""

+    # Directory
+    # ---
+
     working_dir: str = field(
         default=f"./lightrag_cache_{datetime.now().strftime('%Y-%m-%d-%H:%M:%S')}"
     )
     """Directory where cache and temporary files are stored."""

-    embedding_cache_config: dict[str, Any] = field(
-        default={
-            "enabled": False,
-            "similarity_threshold": 0.95,
-            "use_llm_check": False,
-        }
-    )
-    """Configuration for embedding cache.
-    - enabled: If True, enables caching to avoid redundant computations.
-    - similarity_threshold: Minimum similarity score to use cached embeddings.
-    - use_llm_check: If True, validates cached embeddings using an LLM.
-    """
+    # Storage
+    # ---

     kv_storage: str = field(default="JsonKVStorage")
     """Storage backend for key-value data."""

@@ -262,13 +255,27 @@
     """Storage type for tracking document processing statuses."""

     # Logging
+    # ---
+
     log_level: int = field(default=logger.level)
     """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING')."""

     log_dir: str = field(default=os.getcwd())
     """Directory where logs are stored. Defaults to the current working directory."""

+    # Entity extraction
+    # ---
+
+    entity_extract_max_gleaning: int = field(default=1)
+    """Maximum number of entity extraction attempts for ambiguous content."""
+
+    entity_summary_to_max_tokens: int = field(
+        default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))
+    )
+
     # Text chunking
+    # ---
+
     chunk_token_size: int = field(default=int(os.getenv("CHUNK_SIZE", 1200)))
     """Maximum number of tokens per text chunk when splitting documents."""

@@ -280,16 +287,41 @@
     tiktoken_model_name: str = field(default="gpt-4o-mini")
     """Model name used for tokenization when chunking text."""

-    # Entity extraction
-    entity_extract_max_gleaning: int = field(default=1)
-    """Maximum number of entity extraction attempts for ambiguous content."""
-
-    entity_summary_to_max_tokens: int = field(
-        default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))
-    )
     """Maximum number of tokens used for summarizing extracted entities."""

+    chunking_func: Callable[
+        [
+            str,
+            str | None,
+            bool,
+            int,
+            int,
+            str,
+        ],
+        list[dict[str, Any]],
+    ] = field(default_factory=lambda: chunking_by_token_size)
+    """
+    Custom chunking function for splitting text into chunks before processing.
+
+    The function should take the following parameters:
+
+    - `content`: The text to be split into chunks.
+    - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
+    - `split_by_character_only`: If True, the text is split only on the specified character.
+    - `chunk_token_size`: The maximum number of tokens per chunk.
+    - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
+    - `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
+
+    The function should return a list of dictionaries, where each dictionary contains the following keys:
+    - `tokens`: The number of tokens in the chunk.
+    - `content`: The text content of the chunk.
+
+    Defaults to `chunking_by_token_size` if not specified.
+    """
+
     # Node embedding
+    # ---
+
     node_embedding_algorithm: str = field(default="node2vec")
     """Algorithm used for node embedding in knowledge graphs."""

@@ -312,6 +344,9 @@
     - random_seed: Seed value for reproducibility.
     """

+    # Embedding
+    # ---
+
     embedding_func: EmbeddingFunc | None = field(default=None)
     """Function for computing text embeddings. Must be set before use."""

@@ -321,7 +356,22 @@
     embedding_func_max_async: int = field(default=16)
     """Maximum number of concurrent embedding function calls."""

+    embedding_cache_config: dict[str, Any] = field(
+        default={
+            "enabled": False,
+            "similarity_threshold": 0.95,
+            "use_llm_check": False,
+        }
+    )
+    """Configuration for embedding cache.
+    - enabled: If True, enables caching to avoid redundant computations.
+    - similarity_threshold: Minimum similarity score to use cached embeddings.
+    - use_llm_check: If True, validates cached embeddings using an LLM.
+    """
+
     # LLM Configuration
+    # ---
+
     llm_model_func: Callable[..., object] | None = field(default=None)
     """Function for interacting with the large language model (LLM). Must be set before use."""

@@ -338,6 +388,8 @@
     """Additional keyword arguments passed to the LLM model function."""

     # Storage
+    # ---
+
     vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict)
     """Additional parameters for vector database storage."""

@@ -351,15 +403,22 @@
     """If True, enables caching for entity extraction steps to reduce LLM costs."""

     # Extensions
+    # ---
+
     max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20)))
     """Maximum number of parallel insert operations."""

     addon_params: dict[str, Any] = field(default_factory=dict)

     # Storages Management
+    # ---
+
     auto_manage_storages_states: bool = field(default=True)
     """If True, lightrag will automatically calls initialize_storages and finalize_storages at the appropriate times."""

+    # Storages Management
+    # ---
+
     convert_response_to_json_func: Callable[[str], dict[str, Any]] = field(
         default_factory=lambda: convert_response_to_json
     )

@@ -369,36 +428,6 @@
     The default function is :func:`.utils.convert_response_to_json`.
     """

-    chunking_func: Callable[
-        [
-            str,
-            str | None,
-            bool,
-            int,
-            int,
-            str,
-        ],
-        list[dict[str, Any]],
-    ] = field(default_factory=lambda: chunking_by_token_size)
-    """
-    Custom chunking function for splitting text into chunks before processing.
-
-    The function should take the following parameters:
-
-    - `content`: The text to be split into chunks.
-    - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
-    - `split_by_character_only`: If True, the text is split only on the specified character.
-    - `chunk_token_size`: The maximum number of tokens per chunk.
-    - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
-    - `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
-
-    The function should return a list of dictionaries, where each dictionary contains the following keys:
-    - `tokens`: The number of tokens in the chunk.
-    - `content`: The text content of the chunk.
-
-    Defaults to `chunking_by_token_size` if not specified.
-    """
-
     def verify_storage_implementation(
         self, storage_type: str, storage_name: str
     ) -> None:
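The docstring added for `chunking_func` pins down the expected call signature and return shape. Below is a minimal sketch of a conforming custom chunker; the paragraph-based strategy is hypothetical and only illustrates the contract, while the library's default remains `chunking_by_token_size`.

from __future__ import annotations

from typing import Any

import tiktoken


def chunk_by_paragraph(
    content: str,
    split_by_character: str | None = None,
    split_by_character_only: bool = False,
    chunk_token_size: int = 1200,
    chunk_overlap_token_size: int = 100,
    tiktoken_model_name: str = "gpt-4o-mini",
) -> list[dict[str, Any]]:
    # Sketch only: packs whole paragraphs into chunks of up to chunk_token_size
    # tokens and ignores chunk_overlap_token_size for brevity.
    encoder = tiktoken.encoding_for_model(tiktoken_model_name)
    separator = split_by_character or "\n\n"

    if split_by_character_only and split_by_character:
        # Split strictly on the requested character, one chunk per piece.
        return [
            {"tokens": len(encoder.encode(piece)), "content": piece}
            for piece in content.split(split_by_character)
            if piece.strip()
        ]

    chunks: list[dict[str, Any]] = []
    buffer = ""
    for paragraph in content.split(separator):
        candidate = f"{buffer}{separator}{paragraph}" if buffer else paragraph
        if buffer and len(encoder.encode(candidate)) > chunk_token_size:
            chunks.append({"tokens": len(encoder.encode(buffer)), "content": buffer})
            buffer = paragraph
        else:
            buffer = candidate
    if buffer:
        chunks.append({"tokens": len(encoder.encode(buffer)), "content": buffer})
    return chunks

Such a function would be supplied as `chunking_func=chunk_by_paragraph` when constructing `LightRAG`.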
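With `embedding_cache_config` now grouped under the embedding fields, an illustrative override of its three documented keys looks like the snippet below. The constructor keywords mirror the dataclass fields in this diff; the stub LLM and embedding functions, and the `EmbeddingFunc` wrapper from `lightrag.utils`, are stand-ins so the sketch is self-contained rather than a recommended setup.

import numpy as np

from lightrag import LightRAG
from lightrag.utils import EmbeddingFunc


async def stub_llm(prompt: str, **kwargs) -> str:
    # Stand-in; bind a real LLM function in practice.
    return "ok"


async def stub_embed(texts: list[str]) -> np.ndarray:
    # Stand-in; returns random vectors of the declared dimension.
    return np.random.rand(len(texts), 384)


rag = LightRAG(
    working_dir="./lightrag_cache_demo",
    llm_model_func=stub_llm,
    embedding_func=EmbeddingFunc(embedding_dim=384, max_token_size=8192, func=stub_embed),
    embedding_cache_config={
        "enabled": True,               # reuse cached embeddings when possible
        "similarity_threshold": 0.90,  # treat near-duplicate queries as cache hits
        "use_llm_check": False,        # skip the extra LLM validation pass
    },
)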
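`convert_response_to_json_func` accepts any `Callable[[str], dict[str, Any]]`; the default is `.utils.convert_response_to_json`. A hypothetical, stricter replacement that pulls the first JSON object out of the raw LLM response and refuses to return anything unparsed:

import json
import re
from typing import Any


def strict_json_converter(response: str) -> dict[str, Any]:
    # Grab the outermost {...} span from the raw response and parse it;
    # fail loudly instead of returning a partially recovered dict.
    match = re.search(r"\{.*\}", response, re.DOTALL)
    if match is None:
        raise ValueError("no JSON object found in LLM response")
    return json.loads(match.group(0))

It would be supplied as `convert_response_to_json_func=strict_json_converter`.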
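Several of the defaults above come from environment variables (`CHUNK_SIZE`, `MAX_TOKEN_SUMMARY`, `MAX_PARALLEL_INSERT`). Because they are read inside `field(default=...)` expressions, they are evaluated once when the class body runs, i.e. at import time, so they only take effect if set before `lightrag` is imported. A sketch with arbitrary example values:

import os

# Must be set before lightrag is imported, because the dataclass field
# defaults call os.getenv() when the class definition executes.
os.environ["CHUNK_SIZE"] = "800"           # feeds chunk_token_size
os.environ["MAX_TOKEN_SUMMARY"] = "300"    # feeds entity_summary_to_max_tokens
os.environ["MAX_PARALLEL_INSERT"] = "4"    # feeds max_parallel_insert

from lightrag import LightRAG  # noqa: E402  (imported after the env vars are set)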
|