YanSte committed
Commit f3ca081 · 1 Parent(s): 128cb78

improved docs

Files changed (1)
  1. lightrag/lightrag.py +75 -18
lightrag/lightrag.py CHANGED
@@ -109,38 +109,65 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
 
 @dataclass
 class LightRAG:
+    """LightRAG: Simple and Fast Retrieval-Augmented Generation."""
+
     working_dir: str = field(
         default_factory=lambda: f'./lightrag_cache_{datetime.now().strftime("%Y-%m-%d-%H:%M:%S")}'
     )
-    # Default not to use embedding cache
-    embedding_cache_config: dict = field(
+    """Directory where cache and temporary files are stored."""
+
+    embedding_cache_config: dict[str, Any] = field(
         default_factory=lambda: {
             "enabled": False,
             "similarity_threshold": 0.95,
             "use_llm_check": False,
         }
     )
+    """Configuration for embedding cache.
+    - enabled: If True, enables caching to avoid redundant computations.
+    - similarity_threshold: Minimum similarity score to use cached embeddings.
+    - use_llm_check: If True, validates cached embeddings using an LLM.
+    """
+
     kv_storage: str = field(default="JsonKVStorage")
+    """Storage backend for key-value data."""
+
     vector_storage: str = field(default="NanoVectorDBStorage")
+    """Storage backend for vector embeddings."""
+
     graph_storage: str = field(default="NetworkXStorage")
+    """Storage backend for knowledge graphs."""
 
-    # logging
+    # Logging
     current_log_level = logger.level
-    log_level: str = field(default=current_log_level)
+    log_level: int = field(default=current_log_level)
+    """Logging level for the system (e.g., 'DEBUG', 'INFO', 'WARNING')."""
+
     log_dir: str = field(default=os.getcwd())
+    """Directory where logs are stored. Defaults to the current working directory."""
 
-    # text chunking
+    # Text chunking
     chunk_token_size: int = 1200
+    """Maximum number of tokens per text chunk when splitting documents."""
+
     chunk_overlap_token_size: int = 100
+    """Number of overlapping tokens between consecutive text chunks to preserve context."""
+
     tiktoken_model_name: str = "gpt-4o-mini"
+    """Model name used for tokenization when chunking text."""
 
-    # entity extraction
+    # Entity extraction
     entity_extract_max_gleaning: int = 1
+    """Maximum number of entity extraction attempts for ambiguous content."""
+
     entity_summary_to_max_tokens: int = 500
+    """Maximum number of tokens used for summarizing extracted entities."""
 
-    # node embedding
+    # Node embedding
     node_embedding_algorithm: str = "node2vec"
-    node2vec_params: dict = field(
+    """Algorithm used for node embedding in knowledge graphs."""
+
+    node2vec_params: dict[str, int] = field(
         default_factory=lambda: {
             "dimensions": 1536,
             "num_walks": 10,
@@ -150,26 +177,56 @@ class LightRAG:
             "random_seed": 3,
         }
     )
+    """Configuration for the node2vec embedding algorithm:
+    - dimensions: Number of dimensions for embeddings.
+    - num_walks: Number of random walks per node.
+    - walk_length: Number of steps per random walk.
+    - window_size: Context window size for training.
+    - iterations: Number of iterations for training.
+    - random_seed: Seed value for reproducibility.
+    """
+
+    embedding_func: EmbeddingFunc = None
+    """Function for computing text embeddings. Must be set before use."""
 
-    # embedding_func: EmbeddingFunc = field(default_factory=lambda:hf_embedding)
-    embedding_func: EmbeddingFunc = None  # This must be set (we do want to separate llm from the corte, so no more default initialization)
     embedding_batch_num: int = 32
+    """Batch size for embedding computations."""
+
     embedding_func_max_async: int = 16
+    """Maximum number of concurrent embedding function calls."""
+
+    # LLM Configuration
+    llm_model_func: callable = None
+    """Function for interacting with the large language model (LLM). Must be set before use."""
+
+    llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct"
+    """Name of the LLM model used for generating responses."""
 
-    # LLM
-    llm_model_func: callable = None  # This must be set (we do want to separate llm from the corte, so no more default initialization)
-    llm_model_name: str = "meta-llama/Llama-3.2-1B-Instruct"  # 'meta-llama/Llama-3.2-1B'#'google/gemma-2-2b-it'
     llm_model_max_token_size: int = int(os.getenv("MAX_TOKENS", "32768"))
+    """Maximum number of tokens allowed per LLM response."""
+
     llm_model_max_async: int = int(os.getenv("MAX_ASYNC", "16"))
-    llm_model_kwargs: dict = field(default_factory=dict)
+    """Maximum number of concurrent LLM calls."""
+
+    llm_model_kwargs: dict[str, Any] = field(default_factory=dict)
+    """Additional keyword arguments passed to the LLM model function."""
+
+    # Storage
+    vector_db_storage_cls_kwargs: dict[str, Any] = field(default_factory=dict)
+    """Additional parameters for vector database storage."""
 
-    # storage
-    vector_db_storage_cls_kwargs: dict = field(default_factory=dict)
     namespace_prefix: str = field(default="")
+    """Prefix for namespacing stored data across different environments."""
 
     enable_llm_cache: bool = True
-    # Sometimes there are some reason the LLM failed at Extracting Entities, and we want to continue without LLM cost, we can use this flag
+    """Enables caching for LLM responses to avoid redundant computations."""
+
     enable_llm_cache_for_entity_extract: bool = True
+    """If True, enables caching for entity extraction steps to reduce LLM costs."""
+
+    # Extensions
+    addon_params: dict[str, Any] = field(default_factory=dict)
+    """Dictionary for additional parameters and extensions."""
 
     # extension
     addon_params: dict[str, Any] = field(default_factory=dict)
@@ -177,8 +234,8 @@ class LightRAG:
        convert_response_to_json
    )
 
-    # Add new field for document status storage type
     doc_status_storage: str = field(default="JsonDocStatusStorage")
+    """Storage type for tracking document processing statuses."""
 
    # Custom Chunking Function
    chunking_func: Callable[
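
For context, here is a minimal usage sketch of the fields this commit documents. It is not part of the commit: the async embed/complete helpers below are hypothetical placeholders for your own providers, and depending on the surrounding LightRAG code the embedding callable may need to be wrapped in EmbeddingFunc (with an embedding dimension and token limit) rather than passed bare. As the new docstrings note, embedding_func and llm_model_func have no defaults and must be supplied by the caller.

from lightrag import LightRAG

# Hypothetical provider adapters -- replace with calls to your embedding / LLM backend.
async def my_embed(texts: list[str]) -> list[list[float]]:
    ...  # return one embedding vector per input text

async def my_llm_complete(prompt: str, **kwargs) -> str:
    ...  # return the LLM's completion for the prompt

rag = LightRAG(
    working_dir="./lightrag_cache",          # cache and temporary files
    embedding_func=my_embed,                 # required: no default (see docstring above)
    llm_model_func=my_llm_complete,          # required: no default (see docstring above)
    embedding_cache_config={
        "enabled": True,                     # reuse embeddings for near-identical inputs
        "similarity_threshold": 0.95,        # minimum similarity to accept a cached embedding
        "use_llm_check": False,              # optionally validate cache hits with the LLM
    },
    enable_llm_cache_for_entity_extract=True,  # avoid repeat LLM calls during entity extraction
)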