YanSte committed
Commit 63750e8 · 1 parent: 38386d1

added docs and fields

lightrag/kg/postgres_impl.py CHANGED
@@ -54,9 +54,7 @@ class PostgreSQLDB:
         self.pool: Pool | None = None
 
         if self.user is None or self.password is None or self.database is None:
-            raise ValueError(
-                "Missing database user, password, or database"
-            )
+            raise ValueError("Missing database user, password, or database")
 
     async def initdb(self):
         try:
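Note: the hunk above only reflows the raise onto one line; the behavior is unchanged, and construction still fails fast when any of the three connection settings is missing. A minimal sketch of that behavior, assuming a constructor that reads user, password, and database from a plain config dict (the dict keys and class name here are illustrative, not the library's actual API):

class PostgreSQLDBSketch:
    """Stand-in mirroring the validation in postgres_impl.py."""

    def __init__(self, config: dict):
        # Hypothetical config keys; the real class may source these differently.
        self.user = config.get("user")
        self.password = config.get("password")
        self.database = config.get("database")
        self.pool = None  # the connection pool (Pool in the real class) is created later, in initdb()

        if self.user is None or self.password is None or self.database is None:
            raise ValueError("Missing database user, password, or database")

try:
    PostgreSQLDBSketch({"user": "rag", "password": "secret"})  # "database" missing
except ValueError as exc:
    print(exc)  # Missing database user, password, or database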
lightrag/lightrag.py CHANGED
@@ -225,6 +225,7 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
     asyncio.set_event_loop(new_loop)
     return new_loop
 
+
 @final
 @dataclass
 class LightRAG:
@@ -271,7 +272,9 @@ class LightRAG:
     chunk_token_size: int = field(default=int(os.getenv("CHUNK_SIZE", 1200)))
     """Maximum number of tokens per text chunk when splitting documents."""
 
-    chunk_overlap_token_size: int = field(default=int(os.getenv("CHUNK_OVERLAP_SIZE", 100)))
+    chunk_overlap_token_size: int = field(
+        default=int(os.getenv("CHUNK_OVERLAP_SIZE", 100))
+    )
     """Number of overlapping tokens between consecutive text chunks to preserve context."""
 
     tiktoken_model_name: str = field(default="gpt-4o-mini")
@@ -281,11 +284,13 @@ class LightRAG:
     entity_extract_max_gleaning: int = field(default=1)
     """Maximum number of entity extraction attempts for ambiguous content."""
 
-    entity_summary_to_max_tokens: int = field(default=int(os.getenv("MAX_TOKEN_SUMMARY", 500)))
+    entity_summary_to_max_tokens: int = field(
+        default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))
+    )
     """Maximum number of tokens used for summarizing extracted entities."""
 
     # Node embedding
-    node_embedding_algorithm: str = field(default="node2vec")
+    node_embedding_algorithm: str = field(default="node2vec")
     """Algorithm used for node embedding in knowledge graphs."""
 
     node2vec_params: dict[str, int] = field(
@@ -348,19 +353,22 @@ class LightRAG:
     # Extensions
     max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20)))
     """Maximum number of parallel insert operations."""
-
+
     addon_params: dict[str, Any] = field(default_factory=dict)
 
     # Storages Management
     auto_manage_storages_states: bool = field(default=True)
     """If True, lightrag will automatically calls initialize_storages and finalize_storages at the appropriate times."""
 
-    """Dictionary for additional parameters and extensions."""
-    convert_response_to_json_func: Callable[[str], dict[str, Any]] = (
-        convert_response_to_json
+    convert_response_to_json_func: Callable[[str], dict[str, Any]] = field(
+        default_factory=lambda: convert_response_to_json
     )
+    """
+    Custom function for converting LLM responses to JSON format.
+
+    The default function is :func:`.utils.convert_response_to_json`.
+    """
 
-    # Custom Chunking Function
     chunking_func: Callable[
         [
             str,
@@ -371,7 +379,25 @@ class LightRAG:
             str,
         ],
         list[dict[str, Any]],
-    ] = chunking_by_token_size
+    ] = field(default_factory=lambda: chunking_by_token_size)
+    """
+    Custom chunking function for splitting text into chunks before processing.
+
+    The function should take the following parameters:
+
+    - `content`: The text to be split into chunks.
+    - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
+    - `split_by_character_only`: If True, the text is split only on the specified character.
+    - `chunk_token_size`: The maximum number of tokens per chunk.
+    - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
+    - `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
+
+    The function should return a list of dictionaries, where each dictionary contains the following keys:
+    - `tokens`: The number of tokens in the chunk.
+    - `content`: The text content of the chunk.
+
+    Defaults to `chunking_by_token_size` if not specified.
+    """
 
     def verify_storage_implementation(
         self, storage_type: str, storage_name: str
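Note: several of these defaults are read from environment variables at class-definition time — int(os.getenv("CHUNK_SIZE", 1200)) runs once, when the module is imported — so the variables must be set before importing lightrag; changing them afterwards does not affect the already-defined class. A self-contained sketch of the same pattern (the ChunkingConfig class is illustrative, not part of the library):

import os
from dataclasses import dataclass, field

# Must be set before the class body below executes; in LightRAG's case,
# before the lightrag module is imported.
os.environ.setdefault("CHUNK_OVERLAP_SIZE", "100")

@dataclass
class ChunkingConfig:
    # Mirrors the field pattern in LightRAG: the env lookup and the int()
    # conversion happen once, while the class body is evaluated.
    chunk_token_size: int = field(default=int(os.getenv("CHUNK_SIZE", 1200)))
    chunk_overlap_token_size: int = field(
        default=int(os.getenv("CHUNK_OVERLAP_SIZE", 100))
    )

cfg = ChunkingConfig()
print(cfg.chunk_token_size, cfg.chunk_overlap_token_size)  # 1200 100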
 
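The new docstring fully specifies the chunking_func contract, which makes custom chunkers straightforward. A hedged sketch of a conforming implementation — the whitespace token count and the splitter are illustrative stand-ins for the real chunking_by_token_size, and the toy ignores split_by_character_only and the token budgets for brevity:

from typing import Any

def naive_character_chunks(
    content: str,
    split_by_character: str | None,
    split_by_character_only: bool,
    chunk_token_size: int,
    chunk_overlap_token_size: int,
    tiktoken_model_name: str,
) -> list[dict[str, Any]]:
    """Toy chunker honoring the documented contract: split on the given
    character (falling back to blank lines) and return `tokens`/`content`
    dicts, using a rough whitespace count instead of tiktoken."""
    separator = split_by_character or "\n\n"
    pieces = [p.strip() for p in content.split(separator) if p.strip()]
    return [{"tokens": len(p.split()), "content": p} for p in pieces]

print(naive_character_chunks(
    "first paragraph\n\nsecond one", None, False, 1200, 100, "gpt-4o-mini"
))
# [{'tokens': 2, 'content': 'first paragraph'}, {'tokens': 2, 'content': 'second one'}]

# Hypothetical wiring; the other LightRAG arguments your version requires are omitted:
# rag = LightRAG(chunking_func=naive_character_chunks, ...)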