Merge pull request #560 from AdiKalra/main
Browse files
- lightrag/lightrag.py +6 -1
- lightrag/operate.py +1 -0
lightrag/lightrag.py
CHANGED
@@ -187,6 +187,10 @@ class LightRAG:
|
|
187 |
# Add new field for document status storage type
|
188 |
doc_status_storage: str = field(default="JsonDocStatusStorage")
|
189 |
|
|
|
|
|
|
|
|
|
190 |
def __post_init__(self):
|
191 |
log_file = os.path.join("lightrag.log")
|
192 |
set_logger(log_file)
|
@@ -388,13 +392,14 @@ class LightRAG:
|
|
388 |
**dp,
|
389 |
"full_doc_id": doc_id,
|
390 |
}
|
391 | - for dp in chunking_by_token_size(
|
392 |
doc["content"],
|
393 |
split_by_character=split_by_character,
|
394 |
split_by_character_only=split_by_character_only,
|
395 |
overlap_token_size=self.chunk_overlap_token_size,
|
396 |
max_token_size=self.chunk_token_size,
|
397 |
tiktoken_model=self.tiktoken_model_name,
|
|
|
398 |
)
|
399 |
}
|
400 |
|
|
|
187 |
# Add new field for document status storage type
|
188 |
doc_status_storage: str = field(default="JsonDocStatusStorage")
|
189 |
|
190 | + # Custom Chunking Function
191 | + chunking_func: callable = chunking_by_token_size
192 | + chunking_func_kwargs: dict = field(default_factory=dict)
193 | +
+
|
194 |
def __post_init__(self):
|
195 |
log_file = os.path.join("lightrag.log")
|
196 |
set_logger(log_file)
|
|
|
392 |
**dp,
|
393 |
"full_doc_id": doc_id,
|
394 |
}
|
395 | + for dp in self.chunking_func(
|
396 |
doc["content"],
|
397 |
split_by_character=split_by_character,
|
398 |
split_by_character_only=split_by_character_only,
|
399 |
overlap_token_size=self.chunk_overlap_token_size,
|
400 |
max_token_size=self.chunk_token_size,
|
401 |
tiktoken_model=self.tiktoken_model_name,
|
402 | + **self.chunking_func_kwargs,
|
403 |
)
|
404 |
}
|
405 |
|
lightrag/operate.py
CHANGED
@@ -39,6 +39,7 @@ def chunking_by_token_size(
|
|
39 |
overlap_token_size=128,
|
40 |
max_token_size=1024,
|
41 |
tiktoken_model="gpt-4o",
|
|
|
42 |
):
|
43 |
tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
|
44 |
results = []
|
|
|
39 |
overlap_token_size=128,
|
40 |
max_token_size=1024,
|
41 |
tiktoken_model="gpt-4o",
|
42 | + **kwargs,
|
43 |
):
|
44 |
tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
|
45 |
results = []
|