cleaned type
Browse files- lightrag/lightrag.py +15 -6
- lightrag/operate.py +5 -6
- lightrag/utils.py +1 -1
lightrag/lightrag.py
CHANGED
@@ -4,7 +4,7 @@ from tqdm.asyncio import tqdm as tqdm_async
|
|
4 |
from dataclasses import asdict, dataclass, field
|
5 |
from datetime import datetime
|
6 |
from functools import partial
|
7 |
-
from typing import Any, Type, Union, cast
|
8 |
import traceback
|
9 |
from .operate import (
|
10 |
chunking_by_token_size,
|
@@ -177,13 +177,24 @@ class LightRAG:
|
|
177 |
|
178 |
# extension
|
179 |
addon_params: dict[str, Any] = field(default_factory=dict)
|
180 |
-
convert_response_to_json_func: callable = convert_response_to_json
|
181 |
|
182 |
# Add new field for document status storage type
|
183 |
doc_status_storage: str = field(default="JsonDocStatusStorage")
|
184 |
|
185 |
# Custom Chunking Function
|
186 |
-
chunking_func: callable = chunking_by_token_size
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
chunking_func_kwargs: dict = field(default_factory=dict)
|
188 |
|
189 |
def __post_init__(self):
|
@@ -538,9 +549,7 @@ class LightRAG:
|
|
538 |
return
|
539 |
|
540 |
full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys)
|
541 |
-
new_docs = {}
|
542 |
-
if full_docs_ids:
|
543 |
-
new_docs = {doc["id"]: doc for doc in full_docs_ids or []}
|
544 |
|
545 |
if not new_docs:
|
546 |
logger.info("All documents have been processed or are duplicates")
|
|
|
4 |
from dataclasses import asdict, dataclass, field
|
5 |
from datetime import datetime
|
6 |
from functools import partial
|
7 |
+
from typing import Any, Callable, Optional, Type, Union, cast
|
8 |
import traceback
|
9 |
from .operate import (
|
10 |
chunking_by_token_size,
|
|
|
177 |
|
178 |
# extension
|
179 |
addon_params: dict[str, Any] = field(default_factory=dict)
|
180 |
+
convert_response_to_json_func: Callable[[str], dict[str, Any]] = convert_response_to_json
|
181 |
|
182 |
# Add new field for document status storage type
|
183 |
doc_status_storage: str = field(default="JsonDocStatusStorage")
|
184 |
|
185 |
# Custom Chunking Function
|
186 |
+
chunking_func: Callable[
|
187 |
+
[
|
188 |
+
str,
|
189 |
+
Optional[str],
|
190 |
+
bool,
|
191 |
+
int,
|
192 |
+
int,
|
193 |
+
str,
|
194 |
+
],
|
195 |
+
list[dict[str, Any]],
|
196 |
+
] = chunking_by_token_size
|
197 |
+
|
198 |
chunking_func_kwargs: dict = field(default_factory=dict)
|
199 |
|
200 |
def __post_init__(self):
|
|
|
549 |
return
|
550 |
|
551 |
full_docs_ids = await self.full_docs.get_by_ids(to_process_doc_keys)
|
552 |
+
new_docs = {doc["id"]: doc for doc in full_docs_ids or []}
|
|
|
|
|
553 |
|
554 |
if not new_docs:
|
555 |
logger.info("All documents have been processed or are duplicates")
|
lightrag/operate.py
CHANGED
@@ -36,12 +36,11 @@ import time
|
|
36 |
|
37 |
def chunking_by_token_size(
|
38 |
content: str,
|
39 |
-
split_by_character=None,
|
40 |
-
split_by_character_only=False,
|
41 |
-
overlap_token_size=128,
|
42 |
-
max_token_size=1024,
|
43 |
-
tiktoken_model="gpt-4o",
|
44 |
-
**kwargs,
|
45 |
) -> list[dict[str, Any]]:
|
46 |
tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
|
47 |
results: list[dict[str, Any]] = []
|
|
|
36 |
|
37 |
def chunking_by_token_size(
|
38 |
content: str,
|
39 |
+
split_by_character: Union[str, None] = None,
|
40 |
+
split_by_character_only: bool = False,
|
41 |
+
overlap_token_size: int = 128,
|
42 |
+
max_token_size: int = 1024,
|
43 |
+
tiktoken_model: str = "gpt-4o",
|
|
|
44 |
) -> list[dict[str, Any]]:
|
45 |
tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
|
46 |
results: list[dict[str, Any]] = []
|
lightrag/utils.py
CHANGED
@@ -98,7 +98,7 @@ def locate_json_string_body_from_string(content: str) -> Union[str, None]:
|
|
98 |
return None
|
99 |
|
100 |
|
101 |
-
def convert_response_to_json(response: str) -> dict:
|
102 |
json_str = locate_json_string_body_from_string(response)
|
103 |
assert json_str is not None, f"Unable to parse JSON from response: {response}"
|
104 |
try:
|
|
|
98 |
return None
|
99 |
|
100 |
|
101 |
+
def convert_response_to_json(response: str) -> dict[str, Any]:
|
102 |
json_str = locate_json_string_body_from_string(response)
|
103 |
assert json_str is not None, f"Unable to parse JSON from response: {response}"
|
104 |
try:
|