yangdx committed on
Commit
109988a
·
1 Parent(s): 648da0e

Add Unicode collation to sort Chinese file names correctly during document scanning

Browse files
lightrag/api/requirements.txt CHANGED
@@ -16,6 +16,7 @@ python-dotenv
16
  python-jose[cryptography]
17
  python-multipart
18
  pytz
 
19
  tenacity
20
  tiktoken
21
  uvicorn
 
16
  python-jose[cryptography]
17
  python-multipart
18
  pytz
19
+ pyuca
20
  tenacity
21
  tiktoken
22
  uvicorn
lightrag/api/routers/document_routes.py CHANGED
@@ -3,6 +3,7 @@ This module contains all document-related routes for the LightRAG API.
3
  """
4
 
5
  import asyncio
 
6
  from lightrag.utils import logger
7
  import aiofiles
8
  import shutil
@@ -614,8 +615,12 @@ async def pipeline_index_files(rag: LightRAG, file_paths: List[Path]):
614
  try:
615
  enqueued = False
616
 
 
 
 
 
617
  # Process files sequentially
618
- for file_path in file_paths:
619
  if await pipeline_enqueue_file(rag, file_path):
620
  enqueued = True
621
 
 
3
  """
4
 
5
  import asyncio
6
+ from pyuca import Collator
7
  from lightrag.utils import logger
8
  import aiofiles
9
  import shutil
 
615
  try:
616
  enqueued = False
617
 
618
+ # Create Collator for Unicode sorting
619
+ collator = Collator()
620
+ sorted_file_paths = sorted(file_paths, key=lambda p: collator.sort_key(str(p)))
621
+
622
  # Process files sequentially
623
+ for file_path in sorted_file_paths:
624
  if await pipeline_enqueue_file(rag, file_path):
625
  enqueued = True
626
 
requirements.txt CHANGED
@@ -11,6 +11,9 @@ pipmaster
11
  pydantic
12
  python-dotenv
13
 
 
 
 
14
  setuptools
15
  tenacity
16
 
 
11
  pydantic
12
  python-dotenv
13
 
14
+ # Unicode Collation Algorithm for proper Chinese sorting
15
+ pyuca
16
+
17
  setuptools
18
  tenacity
19