improved docs

lightrag/lightrag.py (+75 -18)
@@ -109,38 +109,65 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
 
 @dataclass
 class LightRAG:
+    """LightRAG: Simple and Fast Retrieval-Augmented Generation."""
+
     working_dir: str = field(
         default_factory=lambda: f'./lightrag_cache_{datetime.now().strftime("%Y-%m-%d-%H:%M:%S")}'
     )
-
-    embedding_cache_config: dict[str, Any] = field(
+    """Directory where cache and temporary files are stored."""
+
+    embedding_cache_config: dict[str, Any] = field(
         default_factory=lambda: {
             "enabled": False,
             "similarity_threshold": 0.95,
             "use_llm_check": False,
         }
     )
+    """Configuration for embedding cache.
+    - enabled: If True, enables caching to avoid redundant computations.
+    - similarity_threshold: Minimum similarity score to use cached embeddings.
+    - use_llm_check: If True, validates cached embeddings using an LLM.
+    """
+
     kv_storage: str = field(default="JsonKVStorage")
+    """Storage backend for key-value data."""
+
     vector_storage: str = field(default="NanoVectorDBStorage")
+    """Storage backend for vector embeddings."""
+
     graph_storage: str = field(default="NetworkXStorage")
+    """Storage backend for knowledge graphs."""
 
-    #
+    # Logging
     current_log_level = logger.level
-    log_level:
+    log_level: int = field(default=current_log_level)
+    """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING')."""
+
     log_dir: str = field(default=os.getcwd())
+    """Directory where logs are stored. Defaults to the current working directory."""
 
-    #
+    # Text chunking
     chunk_token_size: int = 1200
+    """Maximum number of tokens per text chunk when splitting documents."""
+
     chunk_overlap_token_size: int = 100
+    """Number of overlapping tokens between consecutive text chunks to preserve context."""
+
     tiktoken_model_name: str = "gpt-4o-mini"
+    """Model name used for tokenization when chunking text."""
 
-    #
+    # Entity extraction
     entity_extract_max_gleaning: int = 1
+    """Maximum number of entity extraction attempts for ambiguous content."""
+
     entity_summary_to_max_tokens: int = 500
+    """Maximum number of tokens used for summarizing extracted entities."""
 
-    #
+    # Node embedding
     node_embedding_algorithm: str = "node2vec"
-    node2vec_params: dict[str, int] = field(
+    """Algorithm used for node embedding in knowledge graphs."""
+
+    node2vec_params: dict[str, int] = field(
         default_factory=lambda: {
             "dimensions": 1536,
             "num_walks": 10,
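Everything in this first hunk is an ordinary dataclass field, so the documented defaults can be overridden at construction time. Below is a minimal sketch of doing so; `my_llm` and `my_embed` are hypothetical stand-ins (the two required callables they fill, `llm_model_func` and `embedding_func`, are documented in the next hunk), and the override values are arbitrary.

```python
# Hedged sketch: overriding the documented defaults at construction time.
# `my_llm` and `my_embed` are hypothetical stand-ins for real model calls;
# the EmbeddingFunc annotation suggests the project usually wraps the
# embedding callable, but a bare coroutine is shown here for brevity.
import numpy as np
from lightrag import LightRAG

async def my_llm(prompt: str, **kwargs) -> str:
    return "stub response"  # replace with a real LLM call

async def my_embed(texts: list[str]) -> np.ndarray:
    # One embedding row per input text; 1536 matches the node2vec
    # "dimensions" default shown above.
    return np.zeros((len(texts), 1536))

rag = LightRAG(
    working_dir="./my_rag_cache",       # overrides the timestamped default
    llm_model_func=my_llm,              # required: defaults to None
    embedding_func=my_embed,            # required: defaults to None
    embedding_cache_config={
        "enabled": True,                # turn the embedding cache on
        "similarity_threshold": 0.90,   # looser reuse of cached vectors
        "use_llm_check": False,
    },
    chunk_token_size=1024,              # smaller chunks than the 1200 default
    chunk_overlap_token_size=100,
)
```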
@@ -150,26 +177,56 @@
             "random_seed": 3,
         }
     )
+    """Configuration for the node2vec embedding algorithm:
+    - dimensions: Number of dimensions for embeddings.
+    - num_walks: Number of random walks per node.
+    - walk_length: Number of steps per random walk.
+    - window_size: Context window size for training.
+    - iterations: Number of iterations for training.
+    - random_seed: Seed value for reproducibility.
+    """
+
+    embedding_func: EmbeddingFunc = None
+    """Function for computing text embeddings. Must be set before use."""
 
-    # embedding_func: EmbeddingFunc = field(default_factory=lambda:hf_embedding)
-    embedding_func: EmbeddingFunc = None  # This must be set (we do want to separate llm from the corte, so no more default initialization)
     embedding_batch_num: int = 32
+    """Batch size for embedding computations."""
+
     embedding_func_max_async: int = 16
+    """Maximum number of concurrent embedding function calls."""
+
+    # LLM Configuration
+    llm_model_func: callable = None
+    """Function for interacting with the large language model (LLM). Must be set before use."""
+
+    llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct"
+    """Name of the LLM model used for generating responses."""
 
-    # LLM
-    llm_model_func: callable = None  # This must be set (we do want to separate llm from the corte, so no more default initialization)
-    llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct"  # 'meta-llama/Llama-3.2-1B'#'google/gemma-2-2b-it'
     llm_model_max_token_size: int = int(os.getenv("MAX_TOKENS", "32768"))
+    """Maximum number of tokens allowed per LLM response."""
+
     llm_model_max_async: int = int(os.getenv("MAX_ASYNC", "16"))
-    llm_model_kwargs: dict = field(default_factory=dict)
+    """Maximum number of concurrent LLM calls."""
+
+    llm_model_kwargs: dict[str, Any] = field(default_factory=dict)
+    """Additional keyword arguments passed to the LLM model function."""
+
+    # Storage
+    vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict)
+    """Additional parameters for vector database storage."""
 
-    # storage
-    vector_db_storage_cls_kwargs: dict = field(default_factory=dict)
     namespace_prefix: str = field(default="")
+    """Prefix for namespacing stored data across different environments."""
 
     enable_llm_cache: bool = True
-
+    """Enables caching for LLM responses to avoid redundant computations."""
+
     enable_llm_cache_for_entity_extract: bool = True
+    """If True, enables caching for entity extraction steps to reduce LLM costs."""
+
+    # Extensions
+    addon_params: dict[str, Any] = field(default_factory=dict)
+    """Dictionary for additional parameters and extensions."""
 
     # extension
     addon_params: dict[str, Any] = field(default_factory=dict)
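One subtlety in this hunk: `llm_model_max_token_size` and `llm_model_max_async` read the `MAX_TOKENS` and `MAX_ASYNC` environment variables when the class body executes, i.e. when `lightrag` is first imported. A sketch of an ordering that respects this; the values and the `"temperature"` kwarg are arbitrary illustrations, not defaults from the source.

```python
# The MAX_TOKENS / MAX_ASYNC defaults are captured at import time, so set
# the environment variables before importing lightrag. Values are arbitrary.
import os

os.environ.setdefault("MAX_TOKENS", "8192")
os.environ.setdefault("MAX_ASYNC", "4")

from lightrag import LightRAG  # import only after the env vars are in place

rag = LightRAG(
    llm_model_name="meta-llama/Llama-3.2-1B-Instruct",
    llm_model_kwargs={"temperature": 0.2},  # forwarded to llm_model_func;
                                            # "temperature" is a hypothetical key
    namespace_prefix="staging_",            # keeps this dataset's storage separate
    enable_llm_cache=True,
    enable_llm_cache_for_entity_extract=True,
    # embedding_func / llm_model_func omitted for brevity; both are required,
    # as in the previous sketch.
)
```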
@@ -177,8 +234,8 @@
         convert_response_to_json
     )
 
-    # Add new field for document status storage type
     doc_status_storage: str = field(default="JsonDocStatusStorage")
+    """Storage type for tracking document processing statuses."""
 
     # Custom Chunking Function
     chunking_func: Callable[
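A closing note on the style of this commit: the `"""..."""` strings placed after each field are attribute docstrings in the sense of PEP 257. IDEs and documentation generators read them from the source, but Python discards them at runtime, so they cannot be introspected. The fields themselves can, as this small sketch shows:

```python
# Attribute docstrings are not attached to the fields at runtime, but the
# field names and defaults documented in this commit are introspectable
# through the standard dataclass machinery.
import dataclasses
from lightrag import LightRAG

for f in dataclasses.fields(LightRAG):
    if f.default is not dataclasses.MISSING:
        default = f.default
    elif f.default_factory is not dataclasses.MISSING:
        default = f.default_factory()  # e.g. the embedding_cache_config dict
    else:
        default = "<no default>"
    print(f"{f.name}: {default!r}")
```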