added docs and fields
- lightrag/kg/postgres_impl.py +1 -3
- lightrag/lightrag.py +35 -9
lightrag/kg/postgres_impl.py

```diff
@@ -54,9 +54,7 @@ class PostgreSQLDB:
         self.pool: Pool | None = None

         if self.user is None or self.password is None or self.database is None:
-            raise ValueError(
-                "Missing database user, password, or database"
-            )
+            raise ValueError("Missing database user, password, or database")

     async def initdb(self):
         try:
```
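Functionally this is a no-op: construction still fails fast, before any connection pool exists, when credentials are missing. A minimal sketch of the same guard, assuming a plain config dict (the real PostgreSQLDB constructor may take its settings differently):

```python
import os


class PostgreSQLDBSketch:
    """Illustrative stand-in for PostgreSQLDB's credential guard."""

    def __init__(self, config: dict):
        self.user = config.get("user")
        self.password = config.get("password")
        self.database = config.get("database")
        self.pool = None  # the asyncpg pool is only created later, in initdb()

        # Fail fast: refuse to build a half-configured client.
        if self.user is None or self.password is None or self.database is None:
            raise ValueError("Missing database user, password, or database")


try:
    PostgreSQLDBSketch({"user": os.getenv("POSTGRES_USER"), "password": "secret"})
except ValueError as exc:
    print(exc)  # Missing database user, password, or database
```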
lightrag/lightrag.py
```diff
@@ -225,6 +225,7 @@ def always_get_an_event_loop() -> asyncio.AbstractEventLoop:
     asyncio.set_event_loop(new_loop)
     return new_loop

+
 @final
 @dataclass
 class LightRAG:
```
```diff
@@ -271,7 +272,9 @@ class LightRAG:
     chunk_token_size: int = field(default=int(os.getenv("CHUNK_SIZE", 1200)))
     """Maximum number of tokens per text chunk when splitting documents."""

-    chunk_overlap_token_size: int = field(default=int(os.getenv("CHUNK_OVERLAP_SIZE", 100)))
+    chunk_overlap_token_size: int = field(
+        default=int(os.getenv("CHUNK_OVERLAP_SIZE", 100))
+    )
     """Number of overlapping tokens between consecutive text chunks to preserve context."""

     tiktoken_model_name: str = field(default="gpt-4o-mini")
```
```diff
@@ -281,11 +284,13 @@ class LightRAG:
     entity_extract_max_gleaning: int = field(default=1)
     """Maximum number of entity extraction attempts for ambiguous content."""

-    entity_summary_to_max_tokens: int = field(default=int(os.getenv("MAX_TOKEN_SUMMARY", 500)))
+    entity_summary_to_max_tokens: int = field(
+        default=int(os.getenv("MAX_TOKEN_SUMMARY", 500))
+    )
     """Maximum number of tokens used for summarizing extracted entities."""

     # Node embedding
-    node_embedding_algorithm: str = "node2vec"
+    node_embedding_algorithm: str = field(default="node2vec")
     """Algorithm used for node embedding in knowledge graphs."""

     node2vec_params: dict[str, int] = field(
```
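Worth noting about these fields: the `os.getenv(...)` calls inside `field(default=...)` run once, when the class body executes at import time, not per instance. Environment overrides therefore only take effect if set before `lightrag` is imported; after that, pass the value explicitly. A sketch (assuming a bare `LightRAG(working_dir=...)` construction is viable in your setup):

```python
import os

# These must be set *before* importing lightrag: the defaults are
# captured once, when the module is imported.
os.environ["CHUNK_OVERLAP_SIZE"] = "200"
os.environ["MAX_TOKEN_SUMMARY"] = "400"

from lightrag import LightRAG  # noqa: E402 (deliberately after env setup)

rag = LightRAG(working_dir="./rag_storage")
print(rag.chunk_overlap_token_size)      # 200
print(rag.entity_summary_to_max_tokens)  # 400

# Changing the environment now would be too late; override per instance instead:
rag = LightRAG(working_dir="./rag_storage", chunk_overlap_token_size=150)
```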
```diff
@@ -348,19 +353,22 @@ class LightRAG:
     # Extensions
     max_parallel_insert: int = field(default=int(os.getenv("MAX_PARALLEL_INSERT", 20)))
     """Maximum number of parallel insert operations."""
-
+
     addon_params: dict[str, Any] = field(default_factory=dict)

     # Storages Management
     auto_manage_storages_states: bool = field(default=True)
     """If True, lightrag will automatically calls initialize_storages and finalize_storages at the appropriate times."""

-
-    convert_response_to_json_func: Callable[[str], dict[str, Any]] = (
-        convert_response_to_json
+    convert_response_to_json_func: Callable[[str], dict[str, Any]] = field(
+        default_factory=lambda: convert_response_to_json
     )
+    """
+    Custom function for converting LLM responses to JSON format.
+
+    The default function is :func:`.utils.convert_response_to_json`.
+    """

-    # Custom Chunking Function
     chunking_func: Callable[
         [
             str,
```
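With the converter now a documented dataclass field with a `default_factory`, it can be swapped per instance like any other field. A sketch of a stricter drop-in replacement (the regex heuristic is purely illustrative; the shipped default is `convert_response_to_json` from `lightrag.utils`):

```python
import json
import re
from typing import Any


def strict_json(response: str) -> dict[str, Any]:
    """Parse the first {...} block in the response, ignoring markdown fences."""
    match = re.search(r"\{.*\}", response, re.DOTALL)
    if match is None:
        raise ValueError("No JSON object found in LLM response")
    return json.loads(match.group(0))


# rag = LightRAG(working_dir="./rag_storage",
#                convert_response_to_json_func=strict_json)
```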
```diff
@@ -371,7 +379,25 @@ class LightRAG:
             str,
         ],
         list[dict[str, Any]],
-    ] = chunking_by_token_size
+    ] = field(default_factory=lambda: chunking_by_token_size)
+    """
+    Custom chunking function for splitting text into chunks before processing.
+
+    The function should take the following parameters:
+
+    - `content`: The text to be split into chunks.
+    - `split_by_character`: The character to split the text on. If None, the text is split into chunks of `chunk_token_size` tokens.
+    - `split_by_character_only`: If True, the text is split only on the specified character.
+    - `chunk_token_size`: The maximum number of tokens per chunk.
+    - `chunk_overlap_token_size`: The number of overlapping tokens between consecutive chunks.
+    - `tiktoken_model_name`: The name of the tiktoken model to use for tokenization.
+
+    The function should return a list of dictionaries, where each dictionary contains the following keys:
+    - `tokens`: The number of tokens in the chunk.
+    - `content`: The text content of the chunk.
+
+    Defaults to `chunking_by_token_size` if not specified.
+    """

     def verify_storage_implementation(
         self, storage_type: str, storage_name: str
```
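The new docstring fully specifies the pluggable chunking contract, so a custom chunker can be written directly against it. Below is a sketch of a paragraph-based chunker with that signature; the parameter order follows the docstring (the shipped `chunking_by_token_size` may use different keyword names), `chunk_by_paragraph` is a hypothetical name, and it ignores `split_by_character` and the overlap setting for brevity:

```python
from typing import Any

import tiktoken


def chunk_by_paragraph(
    content: str,
    split_by_character: str | None,
    split_by_character_only: bool,
    chunk_token_size: int,
    chunk_overlap_token_size: int,
    tiktoken_model_name: str,
) -> list[dict[str, Any]]:
    """Split on blank lines, greedily packing paragraphs up to chunk_token_size tokens."""
    enc = tiktoken.encoding_for_model(tiktoken_model_name)
    paragraphs = [p.strip() for p in content.split("\n\n") if p.strip()]

    chunks: list[dict[str, Any]] = []
    current: list[str] = []
    current_tokens = 0
    for para in paragraphs:
        n = len(enc.encode(para))
        if current and current_tokens + n > chunk_token_size:
            # Flush the current chunk before it would overflow the budget.
            text = "\n\n".join(current)
            chunks.append({"tokens": len(enc.encode(text)), "content": text})
            current, current_tokens = [], 0
        current.append(para)
        current_tokens += n
    if current:
        text = "\n\n".join(current)
        chunks.append({"tokens": len(enc.encode(text)), "content": text})
    return chunks


# rag = LightRAG(working_dir="./rag_storage", chunking_func=chunk_by_paragraph)
```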