cleaned docs
Changed files:
- examples/lightrag_oracle_demo.py  +2 -3
- lightrag/lightrag.py  +29 -24
examples/lightrag_oracle_demo.py

@@ -121,9 +121,8 @@ async def main():
         texts = [x for x in all_text.split("\n") if x]

         # New mode use pipeline
-        await rag.
-        await rag.
-        await rag.apipeline_process_extract_graph()
+        await rag.apipeline_enqueue_documents(texts)
+        await rag.apipeline_process_enqueue_documents()

         # Old method use ainsert
         # await rag.ainsert(texts)
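The demo's new flow is a two-step call: enqueue the documents first, then run the processing pass over everything pending. A minimal sketch of that calling pattern, assuming an already-configured `LightRAG` instance; the `index_documents` wrapper and the commented-out driver lines are illustrative, only the two `apipeline_*` calls come from the diff:

import asyncio

from lightrag import LightRAG


async def index_documents(rag: LightRAG, texts: list[str]) -> None:
    # Step 1: register the texts so they are tracked as pending documents.
    await rag.apipeline_enqueue_documents(texts)
    # Step 2: chunk, extract entities/relations, and update document status
    # for every document that is still pending or previously failed.
    await rag.apipeline_process_enqueue_documents()


# Hypothetical driver; a real setup also needs working_dir, an LLM func, etc.
# rag = LightRAG(...)
# asyncio.run(index_documents(rag, ["doc one", "doc two"]))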
lightrag/lightrag.py

@@ -395,7 +395,9 @@ class LightRAG:
             split_by_character is None, this parameter is ignored.
         """
         await self.apipeline_process_documents(string_or_strings)
-        await self.apipeline_process_enqueue_documents(
+        await self.apipeline_process_enqueue_documents(
+            split_by_character, split_by_character_only
+        )

     def insert_custom_chunks(self, full_text: str, text_chunks: list[str]):
         loop = always_get_an_event_loop()

@@ -511,23 +513,23 @@ class LightRAG:
     async def _get_pending_documents(self) -> list[str]:
         """Fetch all pending and failed documents."""
         to_process_doc_keys: list[str] = []
-
+
         # Fetch failed documents
         failed_docs = await self.doc_status.get_by_status(status=DocStatus.FAILED)
         if failed_docs:
             to_process_doc_keys.extend([doc["id"] for doc in failed_docs])
-
+
         # Fetch pending documents
         pending_docs = await self.doc_status.get_by_status(status=DocStatus.PENDING)
         if pending_docs:
             to_process_doc_keys.extend([doc["id"] for doc in pending_docs])
-
+
         if not to_process_doc_keys:
             logger.info("All documents have been processed or are duplicates")
             return []
-
+
         return to_process_doc_keys
-
+
     async def apipeline_process_enqueue_documents(
         self,
         split_by_character: str | None = None,

@@ -535,36 +537,39 @@
     ) -> None:
         """
         Process pending documents by splitting them into chunks, processing
-        each chunk for entity and relation extraction, and updating the
+        each chunk for entity and relation extraction, and updating the
         document status.
-
+
         1. Get all pending and failed documents
         2. Split document content into chunks
         3. Process each chunk for entity and relation extraction
         4. Update the document status
-        """
+        """
         # 1. get all pending and failed documents
         pending_doc_ids = await self._get_pending_documents()
-
+
         if not pending_doc_ids:
             logger.info("All documents have been processed or are duplicates")
             return
-
+
         # Get allready processed documents (text chunks and full docs)
-        text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(
+        text_chunks_processed_doc_ids = await self.text_chunks.filter_keys(
+            pending_doc_ids
+        )
         full_docs_processed_doc_ids = await self.full_docs.filter_keys(pending_doc_ids)

         # 2. split docs into chunks, insert chunks, update doc status
         batch_size = self.addon_params.get("insert_batch_size", 10)
         batch_docs_list = [
-            pending_doc_ids[i : i + batch_size]
+            pending_doc_ids[i : i + batch_size]
+            for i in range(0, len(pending_doc_ids), batch_size)
         ]
-
+
         # 3. iterate over batches
         tasks: dict[str, list[Coroutine[Any, Any, None]]] = {}
         for batch_idx, doc_ids in tqdm_async(
             enumerate(batch_docs_list),
-            desc=
+            desc="Process Batches",
         ):
             # 4. iterate over batch
             for doc_id in tqdm_async(

@@ -580,7 +585,7 @@
                             "updated_at": datetime.now().isoformat(),
                             "content_summary": status_doc["content_summary"],
                             "content_length": status_doc["content_length"],
-                            "created_at": status_doc["created_at"],
+                            "created_at": status_doc["created_at"],
                         }
                     }
                 )

@@ -599,22 +604,24 @@
                         self.tiktoken_model_name,
                     )
                 }
-
-                # Ensure chunk insertion and graph processing happen sequentially, not in parallel
+
+                # Ensure chunk insertion and graph processing happen sequentially, not in parallel
                 await self._process_entity_relation_graph(chunks)
                 await self.chunks_vdb.upsert(chunks)

                 # Check if document already processed the doc
                 if doc_id not in full_docs_processed_doc_ids:
                     tasks[doc_id].append(
-                        self.full_docs.upsert(
+                        self.full_docs.upsert(
+                            {doc_id: {"content": status_doc["content"]}}
+                        )
                     )
-
+
                 # Check if chunks already processed the doc
                 if doc_id not in text_chunks_processed_doc_ids:
                     tasks[doc_id].append(self.text_chunks.upsert(chunks))

-        # Process document (text chunks and full docs) in parallel
+        # Process document (text chunks and full docs) in parallel
         for doc_id, task in tasks.items():
             try:
                 await asyncio.gather(*task)

@@ -630,9 +637,7 @@
                 await self._insert_done()

             except Exception as e:
-                logger.error(
-                    f"Failed to process document {doc_id}: {str(e)}"
-                )
+                logger.error(f"Failed to process document {doc_id}: {str(e)}")
                 await self.doc_status.upsert(
                     {
                         doc_id: {