zrguo committed
Commit 3fede48 · unverified · 2 Parent(s): b90ac1b 6faaceb

Merge pull request #892 from PiochU19/main

add support of providing ids for documents insert

Files changed (2)
  1. README.md +14 -0
  2. lightrag/lightrag.py +50 -21
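For orientation before the diffs: a minimal usage sketch of the new `ids` parameter, assuming `rag` is an already-configured `LightRAG` instance (the texts and IDs are illustrative). The two `ValueError` cases mirror the validation added in `lightrag/lightrag.py` below.

```python
# Hypothetical, already-configured instance; construction arguments omitted.
# rag = LightRAG(working_dir="./rag_storage", ...)

# Documents keyed by caller-chosen IDs instead of generated MD5 hashes
rag.insert(["TEXT1", "TEXT2"], ids=["ID_FOR_TEXT1", "ID_FOR_TEXT2"])

# Mismatched lengths are rejected by the new validation step
try:
    rag.insert(["TEXT1", "TEXT2"], ids=["ONLY_ONE_ID"])
except ValueError as exc:
    print(exc)  # Number of IDs must match the number of documents

# Duplicate IDs are rejected as well
try:
    rag.insert(["TEXT1", "TEXT2"], ids=["SAME_ID", "SAME_ID"])
except ValueError as exc:
    print(exc)  # IDs must be unique
```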
README.md CHANGED
@@ -545,6 +545,20 @@ The `insert_batch_size` parameter in `addon_params` controls how many documents
 
 </details>
 
+<details>
+  <summary><b>Insert with ID</b></summary>
+
+If you want to provide your own IDs for your documents, the number of documents and the number of IDs must be the same.
+
+```python
+# Insert a single text, and provide an ID for it
+rag.insert("TEXT1", ids=["ID_FOR_TEXT1"])
+
+# Insert multiple texts, and provide IDs for them
+rag.insert(["TEXT1", "TEXT2",...], ids=["ID_FOR_TEXT1", "ID_FOR_TEXT2"])
+```
+
+</details>
+
 <details>
   <summary><b>Incremental Insert</b></summary>
lightrag/lightrag.py CHANGED
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
 import asyncio
-import os
 import configparser
+import os
 from dataclasses import asdict, dataclass, field
 from datetime import datetime
 from functools import partial
@@ -41,11 +41,11 @@ from .utils import (
     always_get_an_event_loop,
     compute_mdhash_id,
     convert_response_to_json,
+    encode_string_by_tiktoken,
     lazy_external_import,
     limit_async_func_call,
     logger,
     set_logger,
-    encode_string_by_tiktoken,
 )
 from .types import KnowledgeGraph
 
@@ -479,6 +479,7 @@ class LightRAG:
         input: str | list[str],
         split_by_character: str | None = None,
         split_by_character_only: bool = False,
+        ids: list[str] | None = None,
     ) -> None:
         """Sync Insert documents with checkpoint support
 
@@ -487,10 +488,11 @@
             split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
             split_by_character_only: if split_by_character_only is True, split the string by character only, when
             split_by_character is None, this parameter is ignored.
+            ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated
         """
         loop = always_get_an_event_loop()
         loop.run_until_complete(
-            self.ainsert(input, split_by_character, split_by_character_only)
+            self.ainsert(input, split_by_character, split_by_character_only, ids)
         )
 
     async def ainsert(
@@ -498,6 +500,7 @@
         input: str | list[str],
         split_by_character: str | None = None,
         split_by_character_only: bool = False,
+        ids: list[str] | None = None,
     ) -> None:
         """Async Insert documents with checkpoint support
 
@@ -506,8 +509,9 @@
             split_by_character: if split_by_character is not None, split the string by character, if chunk longer than
             split_by_character_only: if split_by_character_only is True, split the string by character only, when
             split_by_character is None, this parameter is ignored.
+            ids: list of unique document IDs, if not provided, MD5 hash IDs will be generated
         """
-        await self.apipeline_enqueue_documents(input)
+        await self.apipeline_enqueue_documents(input, ids)
         await self.apipeline_process_enqueue_documents(
             split_by_character, split_by_character_only
         )
 
@@ -564,24 +568,51 @@
         if update_storage:
             await self._insert_done()
 
-    async def apipeline_enqueue_documents(self, input: str | list[str]) -> None:
+    async def apipeline_enqueue_documents(
+        self, input: str | list[str], ids: list[str] | None
+    ) -> None:
         """
         Pipeline for Processing Documents
 
-        1. Remove duplicate contents from the list
-        2. Generate document IDs and initial status
-        3. Filter out already processed documents
-        4. Enqueue document in status
+        1. Validate ids if provided or generate MD5 hash IDs
+        2. Remove duplicate contents
+        3. Generate document initial status
+        4. Filter out already processed documents
+        5. Enqueue document in status
         """
         if isinstance(input, str):
             input = [input]
 
-        # 1. Remove duplicate contents from the list
-        unique_contents = list(set(doc.strip() for doc in input))
+        # 1. Validate ids if provided or generate MD5 hash IDs
+        if ids is not None:
+            # Check if the number of IDs matches the number of documents
+            if len(ids) != len(input):
+                raise ValueError("Number of IDs must match the number of documents")
+
+            # Check if IDs are unique
+            if len(ids) != len(set(ids)):
+                raise ValueError("IDs must be unique")
+
+            # Generate contents dict of IDs provided by user and documents
+            contents = {id_: doc.strip() for id_, doc in zip(ids, input)}
+        else:
+            # Generate contents dict of MD5 hash IDs and documents
+            contents = {
+                compute_mdhash_id(doc.strip(), prefix="doc-"): doc.strip()
+                for doc in input
+            }
+
+        # 2. Remove duplicate contents
+        unique_contents = {
+            id_: content
+            for content, id_ in {
+                content: id_ for id_, content in contents.items()
+            }.items()
+        }
 
-        # 2. Generate document IDs and initial status
+        # 3. Generate document initial status
         new_docs: dict[str, Any] = {
-            compute_mdhash_id(content, prefix="doc-"): {
+            id_: {
                 "content": content,
                 "content_summary": self._get_content_summary(content),
                 "content_length": len(content),
@@ -589,10 +620,10 @@
                 "created_at": datetime.now().isoformat(),
                 "updated_at": datetime.now().isoformat(),
             }
-            for content in unique_contents
+            for id_, content in unique_contents.items()
         }
 
-        # 3. Filter out already processed documents
+        # 4. Filter out already processed documents
         # Get docs ids
         all_new_doc_ids = set(new_docs.keys())
         # Exclude IDs of documents that are already in progress
@@ -604,7 +635,7 @@
             logger.info("No new unique documents were found.")
             return
 
-        # 4. Store status document
+        # 5. Store status document
         await self.doc_status.upsert(new_docs)
         logger.info(f"Stored {len(new_docs)} new unique documents")
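The step-2 deduplication above is a double dict inversion: `contents` maps ID → stripped content, the inner comprehension flips it to content → ID (so a later entry overwrites an earlier one with identical text), and the outer comprehension flips it back. A self-contained sketch with made-up IDs and texts:

```python
# ID -> stripped content, as built in step 1 (sample data)
contents = {
    "ID_A": "TEXT1",
    "ID_B": "TEXT2",
    "ID_C": "TEXT1",  # duplicate of ID_A's content
}

# Same expression as in apipeline_enqueue_documents
unique_contents = {
    id_: content
    for content, id_ in {content: id_ for id_, content in contents.items()}.items()
}

print(unique_contents)
# {'ID_C': 'TEXT1', 'ID_B': 'TEXT2'} -- one entry per distinct content,
# and the duplicate text keeps only the last ID that referenced it.
```

So when two user-supplied IDs arrive with identical text, only one document is enqueued and the earlier ID is silently dropped.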
 
 
@@ -661,8 +692,6 @@
             # 4. iterate over batch
             for doc_id_processing_status in docs_batch:
                 doc_id, status_doc = doc_id_processing_status
-                # Update status in processing
-                doc_status_id = compute_mdhash_id(status_doc.content, prefix="doc-")
                 # Generate chunks from document
                 chunks: dict[str, Any] = {
                     compute_mdhash_id(dp["content"], prefix="chunk-"): {
@@ -682,7 +711,7 @@
                 tasks = [
                     self.doc_status.upsert(
                         {
-                            doc_status_id: {
+                            doc_id: {
                                 "status": DocStatus.PROCESSING,
                                 "updated_at": datetime.now().isoformat(),
                                 "content": status_doc.content,
@@ -703,7 +732,7 @@
                     await asyncio.gather(*tasks)
                     await self.doc_status.upsert(
                         {
-                            doc_status_id: {
+                            doc_id: {
                                 "status": DocStatus.PROCESSED,
                                 "chunks_count": len(chunks),
                                 "content": status_doc.content,
@@ -718,7 +747,7 @@
                     logger.error(f"Failed to process document {doc_id}: {str(e)}")
                     await self.doc_status.upsert(
                         {
-                            doc_status_id: {
+                            doc_id: {
                                 "status": DocStatus.FAILED,
                                 "error": str(e),
                                 "content": status_doc.content,