zrguo committed on
Commit
27408d4
·
unverified ·
2 Parent(s): b922b45 cded49e

Merge pull request #560 from AdiKalra/main

Browse files
Files changed (2) hide show
  1. lightrag/lightrag.py +6 -1
  2. lightrag/operate.py +1 -0
lightrag/lightrag.py CHANGED
@@ -187,6 +187,10 @@ class LightRAG:
187
  # Add new field for document status storage type
188
  doc_status_storage: str = field(default="JsonDocStatusStorage")
189
 
 
 
 
 
190
  def __post_init__(self):
191
  log_file = os.path.join("lightrag.log")
192
  set_logger(log_file)
@@ -388,13 +392,14 @@ class LightRAG:
388
  **dp,
389
  "full_doc_id": doc_id,
390
  }
391
- for dp in chunking_by_token_size(
392
  doc["content"],
393
  split_by_character=split_by_character,
394
  split_by_character_only=split_by_character_only,
395
  overlap_token_size=self.chunk_overlap_token_size,
396
  max_token_size=self.chunk_token_size,
397
  tiktoken_model=self.tiktoken_model_name,
 
398
  )
399
  }
400
 
 
187
  # Add new field for document status storage type
188
  doc_status_storage: str = field(default="JsonDocStatusStorage")
189
 
190
+ # Custom Chunking Function
191
+ chunking_func: callable = chunking_by_token_size
192
+ chunking_func_kwargs: dict = field(default_factory=dict)
193
+
194
  def __post_init__(self):
195
  log_file = os.path.join("lightrag.log")
196
  set_logger(log_file)
 
392
  **dp,
393
  "full_doc_id": doc_id,
394
  }
395
+ for dp in self.chunking_func(
396
  doc["content"],
397
  split_by_character=split_by_character,
398
  split_by_character_only=split_by_character_only,
399
  overlap_token_size=self.chunk_overlap_token_size,
400
  max_token_size=self.chunk_token_size,
401
  tiktoken_model=self.tiktoken_model_name,
402
+ **self.chunking_func_kwargs,
403
  )
404
  }
405
 
lightrag/operate.py CHANGED
@@ -39,6 +39,7 @@ def chunking_by_token_size(
39
  overlap_token_size=128,
40
  max_token_size=1024,
41
  tiktoken_model="gpt-4o",
 
42
  ):
43
  tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
44
  results = []
 
39
  overlap_token_size=128,
40
  max_token_size=1024,
41
  tiktoken_model="gpt-4o",
42
+ **kwargs,
43
  ):
44
  tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
45
  results = []