Merge branch 'main' of github.com:lcjqyml/LightRAG
- env.example +2 -0
- lightrag/__init__.py +1 -1
- lightrag/kg/postgres_impl.py +28 -6
- lightrag/lightrag.py +33 -41
- lightrag/llm/hf.py +16 -1
- lightrag/operate.py +10 -36
- lightrag/utils.py +33 -25
env.example
CHANGED
@@ -73,6 +73,8 @@ LLM_BINDING_HOST=http://localhost:11434
 ### Embedding Configuration (Use valid host. For local services installed with docker, you can use host.docker.internal)
 EMBEDDING_MODEL=bge-m3:latest
 EMBEDDING_DIM=1024
+EMBEDDING_BATCH_NUM=32
+EMBEDDING_FUNC_MAX_ASYNC=16
 # EMBEDDING_BINDING_API_KEY=your_api_key
 ### ollama example
 EMBEDDING_BINDING=ollama
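These two new variables feed the LightRAG dataclass defaults introduced in lightrag/lightrag.py below. A rough sketch of how the env-driven defaults behave (the load_dotenv call and file name are illustrative, not part of this change):

import os
from dotenv import load_dotenv  # python-dotenv; any mechanism that exports the variables works

load_dotenv(".env")  # copy env.example to .env first; the path is illustrative
batch_num = int(os.getenv("EMBEDDING_BATCH_NUM", 32))       # falls back to 32 if unset
max_async = int(os.getenv("EMBEDDING_FUNC_MAX_ASYNC", 16))  # falls back to 16 if unset
print(batch_num, max_async)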
lightrag/__init__.py
CHANGED
@@ -1,5 +1,5 @@
 from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam
 
-__version__ = "1.2.
+__version__ = "1.2.7"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/LightRAG"
lightrag/kg/postgres_impl.py
CHANGED
@@ -747,8 +747,30 @@ class PGDocStatusStorage(DocStatusStorage):
         )
 
     async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
-        """Get doc_chunks data by
-
+        """Get doc_chunks data by multiple IDs."""
+        if not ids:
+            return []
+
+        sql = "SELECT * FROM LIGHTRAG_DOC_STATUS WHERE workspace=$1 AND id = ANY($2)"
+        params = {"workspace": self.db.workspace, "ids": ids}
+
+        results = await self.db.query(sql, params, True)
+
+        if not results:
+            return []
+        return [
+            {
+                "content": row["content"],
+                "content_length": row["content_length"],
+                "content_summary": row["content_summary"],
+                "status": row["status"],
+                "chunks_count": row["chunks_count"],
+                "created_at": row["created_at"],
+                "updated_at": row["updated_at"],
+                "file_path": row["file_path"],
+            }
+            for row in results
+        ]
 
     async def get_status_counts(self) -> dict[str, int]:
         """Get counts of documents in each status"""
@@ -1570,7 +1592,7 @@ TABLES = {
            content_vector VECTOR,
            create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            update_time TIMESTAMP,
-
+            chunk_ids VARCHAR(255)[] NULL,
            file_path TEXT NULL,
            CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id)
            )"""
@@ -1585,7 +1607,7 @@ TABLES = {
            content_vector VECTOR,
            create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            update_time TIMESTAMP,
-
+            chunk_ids VARCHAR(255)[] NULL,
            file_path TEXT NULL,
            CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id)
            )"""
@@ -1673,7 +1695,7 @@ SQL_TEMPLATES = {
    """,
    "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
                      content_vector, chunk_ids, file_path)
-                      VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7
+                      VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7)
                      ON CONFLICT (workspace,id) DO UPDATE
                      SET entity_name=EXCLUDED.entity_name,
                      content=EXCLUDED.content,
@@ -1684,7 +1706,7 @@ SQL_TEMPLATES = {
    """,
    "upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id,
                      target_id, content, content_vector, chunk_ids, file_path)
-                      VALUES ($1, $2, $3, $4, $5, $6, $7::varchar[], $8
+                      VALUES ($1, $2, $3, $4, $5, $6, $7::varchar[], $8)
                      ON CONFLICT (workspace,id) DO UPDATE
                      SET source_id=EXCLUDED.source_id,
                      target_id=EXCLUDED.target_id,
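For context on the new get_by_ids query above: binding the whole ID list as a single array parameter and comparing with = ANY($2) avoids building an IN (...) clause by hand. A minimal standalone sketch with asyncpg (the connection string, workspace value, and table contents are assumptions for illustration; the diff itself goes through LightRAG's own PostgreSQLDB.query wrapper):

import asyncio
import asyncpg

async def fetch_doc_status(dsn: str, workspace: str, ids: list[str]) -> list[dict]:
    conn = await asyncpg.connect(dsn)
    try:
        # $2 receives a Python list; = ANY(...) matches any id contained in it
        rows = await conn.fetch(
            "SELECT * FROM LIGHTRAG_DOC_STATUS WHERE workspace=$1 AND id = ANY($2)",
            workspace,
            ids,
        )
        return [dict(r) for r in rows]
    finally:
        await conn.close()

# asyncio.run(fetch_doc_status("postgresql://user:pass@localhost/lightrag", "default", ["doc-1", "doc-2"]))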
lightrag/lightrag.py
CHANGED
@@ -183,10 +183,10 @@ class LightRAG:
     embedding_func: EmbeddingFunc | None = field(default=None)
     """Function for computing text embeddings. Must be set before use."""
 
-    embedding_batch_num: int = field(default=32)
+    embedding_batch_num: int = field(default=int(os.getenv("EMBEDDING_BATCH_NUM", 32)))
     """Batch size for embedding computations."""
 
-    embedding_func_max_async: int = field(default=16)
+    embedding_func_max_async: int = field(default=int(os.getenv("EMBEDDING_FUNC_MAX_ASYNC", 16)))
     """Maximum number of concurrent embedding function calls."""
 
     embedding_cache_config: dict[str, Any] = field(
@@ -1947,6 +1947,8 @@ class LightRAG:
 
             # 2. Update entity information in the graph
             new_node_data = {**node_data, **updated_data}
+            new_node_data["entity_id"] = new_entity_name
+
             if "entity_name" in new_node_data:
                 del new_node_data[
                     "entity_name"
@@ -1963,7 +1965,7 @@
 
             # Store relationships that need to be updated
             relations_to_update = []
-
+            relations_to_delete = []
             # Get all edges related to the original entity
             edges = await self.chunk_entity_relation_graph.get_node_edges(
                 entity_name
@@ -1975,6 +1977,12 @@
                         source, target
                     )
                     if edge_data:
+                        relations_to_delete.append(
+                            compute_mdhash_id(source + target, prefix="rel-")
+                        )
+                        relations_to_delete.append(
+                            compute_mdhash_id(target + source, prefix="rel-")
+                        )
                         if source == entity_name:
                             await self.chunk_entity_relation_graph.upsert_edge(
                                 new_entity_name, target, edge_data
@@ -2000,6 +2008,12 @@
                 f"Deleted old entity '{entity_name}' and its vector embedding from database"
             )
 
+            # Delete old relation records from vector database
+            await self.relationships_vdb.delete(relations_to_delete)
+            logger.info(
+                f"Deleted {len(relations_to_delete)} relation records for entity '{entity_name}' from vector database"
+            )
+
             # Update relationship vector representations
             for src, tgt, edge_data in relations_to_update:
                 description = edge_data.get("description", "")
@@ -2498,39 +2512,21 @@
             # 4. Get all relationships of the source entities
             all_relations = []
             for entity_name in source_entities:
-                # Get all relationships
-
+                # Get all relationships of the source entities
+                edges = await self.chunk_entity_relation_graph.get_node_edges(
                     entity_name
                 )
-                if
-                    for src, tgt in
+                if edges:
+                    for src, tgt in edges:
                         # Ensure src is the current entity
                         if src == entity_name:
                             edge_data = await self.chunk_entity_relation_graph.get_edge(
                                 src, tgt
                             )
-                            all_relations.append((
-
-                # Get all relationships where this entity is the target
-                incoming_edges = []
-                all_labels = await self.chunk_entity_relation_graph.get_all_labels()
-                for label in all_labels:
-                    if label == entity_name:
-                        continue
-                    node_edges = await self.chunk_entity_relation_graph.get_node_edges(
-                        label
-                    )
-                    for src, tgt in node_edges or []:
-                        if tgt == entity_name:
-                            incoming_edges.append((src, tgt))
-
-                for src, tgt in incoming_edges:
-                    edge_data = await self.chunk_entity_relation_graph.get_edge(
-                        src, tgt
-                    )
-                    all_relations.append(("incoming", src, tgt, edge_data))
+                            all_relations.append((src, tgt, edge_data))
 
             # 5. Create or update the target entity
+            merged_entity_data["entity_id"] = target_entity
             if not target_exists:
                 await self.chunk_entity_relation_graph.upsert_node(
                     target_entity, merged_entity_data
@@ -2544,8 +2540,11 @@
 
             # 6. Recreate all relationships, pointing to the target entity
             relation_updates = {}  # Track relationships that need to be merged
+            relations_to_delete = []
 
-            for
+            for src, tgt, edge_data in all_relations:
+                relations_to_delete.append(compute_mdhash_id(src + tgt, prefix="rel-"))
+                relations_to_delete.append(compute_mdhash_id(tgt + src, prefix="rel-"))
                 new_src = target_entity if src in source_entities else src
                 new_tgt = target_entity if tgt in source_entities else tgt
 
@@ -2590,6 +2589,12 @@
                     f"Created or updated relationship: {rel_data['src']} -> {rel_data['tgt']}"
                 )
 
+            # Delete relationships records from vector database
+            await self.relationships_vdb.delete(relations_to_delete)
+            logger.info(
+                f"Deleted {len(relations_to_delete)} relation records for entity '{entity_name}' from vector database"
+            )
+
             # 7. Update entity vector representation
             description = merged_entity_data.get("description", "")
             source_id = merged_entity_data.get("source_id", "")
@@ -2652,19 +2657,6 @@
                 entity_id = compute_mdhash_id(entity_name, prefix="ent-")
                 await self.entities_vdb.delete([entity_id])
 
-                # Also ensure any relationships specific to this entity are deleted from vector DB
-                # This is a safety check, as these should have been transformed to the target entity already
-                entity_relation_prefix = compute_mdhash_id(entity_name, prefix="rel-")
-                relations_with_entity = await self.relationships_vdb.search_by_prefix(
-                    entity_relation_prefix
-                )
-                if relations_with_entity:
-                    relation_ids = [r["id"] for r in relations_with_entity]
-                    await self.relationships_vdb.delete(relation_ids)
-                    logger.info(
-                        f"Deleted {len(relation_ids)} relation records for entity '{entity_name}' from vector database"
-                    )
-
             logger.info(
                 f"Deleted source entity '{entity_name}' and its vector embedding from database"
             )
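A note on the relations_to_delete bookkeeping above: the vector-store ID of a relationship is derived from the concatenated entity names, and the stored order may not match the direction of the graph edge, so both orderings are hashed and deleted. A small sketch of the idea, assuming compute_mdhash_id is the MD5-based helper from lightrag.utils:

from hashlib import md5

def compute_mdhash_id(content: str, prefix: str = "") -> str:
    # same shape as lightrag.utils.compute_mdhash_id: prefix + MD5 hex digest of the content
    return prefix + md5(content.encode()).hexdigest()

src, tgt = "Alice", "Bob"  # illustrative entity names
candidate_ids = [
    compute_mdhash_id(src + tgt, prefix="rel-"),  # id if the record was stored as (src, tgt)
    compute_mdhash_id(tgt + src, prefix="rel-"),  # id if the record was stored as (tgt, src)
]
# await relationships_vdb.delete(candidate_ids)  # deleting both covers either direction
print(candidate_ids)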
lightrag/llm/hf.py
CHANGED
@@ -138,16 +138,31 @@ async def hf_model_complete(
 
 
 async def hf_embed(texts: list[str], tokenizer, embed_model) -> np.ndarray:
-
+    # Detect the appropriate device
+    if torch.cuda.is_available():
+        device = next(embed_model.parameters()).device  # Use CUDA if available
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")  # Use MPS for Apple Silicon
+    else:
+        device = torch.device("cpu")  # Fallback to CPU
+
+    # Move the model to the detected device
+    embed_model = embed_model.to(device)
+
+    # Tokenize the input texts and move them to the same device
     encoded_texts = tokenizer(
         texts, return_tensors="pt", padding=True, truncation=True
     ).to(device)
+
+    # Perform inference
     with torch.no_grad():
         outputs = embed_model(
             input_ids=encoded_texts["input_ids"],
             attention_mask=encoded_texts["attention_mask"],
         )
         embeddings = outputs.last_hidden_state.mean(dim=1)
+
+    # Convert embeddings to NumPy
     if embeddings.dtype == torch.bfloat16:
         return embeddings.detach().to(torch.float32).cpu().numpy()
     else:
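A quick usage sketch for the updated hf_embed: it now selects CUDA, MPS, or CPU on its own and moves the model there, so callers only pass a tokenizer and model (the checkpoint name below is illustrative):

import asyncio
from transformers import AutoModel, AutoTokenizer
from lightrag.llm.hf import hf_embed

model_name = "sentence-transformers/all-MiniLM-L6-v2"  # any Hugging Face encoder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

vectors = asyncio.run(hf_embed(["hello world", "LightRAG"], tokenizer, model))
print(vectors.shape)  # (2, hidden_size)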
lightrag/operate.py
CHANGED
@@ -172,7 +172,7 @@ async def _handle_single_entity_extraction(
        entity_type=entity_type,
        description=entity_description,
        source_id=chunk_key,
-
+        file_path=file_path,
    )
 
 
@@ -201,7 +201,7 @@ async def _handle_single_relationship_extraction(
        description=edge_description,
        keywords=edge_keywords,
        source_id=edge_source_id,
-
+        file_path=file_path,
    )
 
 
@@ -224,9 +224,7 @@ async def _merge_nodes_then_upsert(
            split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP])
        )
        already_file_paths.extend(
-            split_string_by_multi_markers(
-                already_node["metadata"]["file_path"], [GRAPH_FIELD_SEP]
-            )
+            split_string_by_multi_markers(already_node["file_path"], [GRAPH_FIELD_SEP])
        )
        already_description.append(already_node["description"])
 
@@ -244,7 +242,7 @@
        set([dp["source_id"] for dp in nodes_data] + already_source_ids)
    )
    file_path = GRAPH_FIELD_SEP.join(
-        set([dp["
+        set([dp["file_path"] for dp in nodes_data] + already_file_paths)
    )
 
    logger.debug(f"file_path: {file_path}")
@@ -298,7 +296,7 @@ async def _merge_edges_then_upsert(
        if already_edge.get("file_path") is not None:
            already_file_paths.extend(
                split_string_by_multi_markers(
-                    already_edge["
+                    already_edge["file_path"], [GRAPH_FIELD_SEP]
                )
            )
 
@@ -340,11 +338,7 @@
    )
    file_path = GRAPH_FIELD_SEP.join(
        set(
-            [
-                dp["metadata"]["file_path"]
-                for dp in edges_data
-                if dp.get("metadata", {}).get("file_path")
-            ]
+            [dp["file_path"] for dp in edges_data if dp.get("file_path")]
            + already_file_paths
        )
    )
@@ -679,10 +673,6 @@ async def extract_entities(
            "content": f"{dp['entity_name']}\n{dp['description']}",
            "source_id": dp["source_id"],
            "file_path": dp.get("file_path", "unknown_source"),
-            "metadata": {
-                "created_at": dp.get("created_at", time.time()),
-                "file_path": dp.get("file_path", "unknown_source"),
-            },
        }
        for dp in all_entities_data
    }
@@ -697,10 +687,6 @@ async def extract_entities(
            "content": f"{dp['src_id']}\t{dp['tgt_id']}\n{dp['keywords']}\n{dp['description']}",
            "source_id": dp["source_id"],
            "file_path": dp.get("file_path", "unknown_source"),
-            "metadata": {
-                "created_at": dp.get("created_at", time.time()),
-                "file_path": dp.get("file_path", "unknown_source"),
-            },
        }
        for dp in all_relationships_data
    }
@@ -1285,11 +1271,8 @@ async def _get_node_data(
        if isinstance(created_at, (int, float)):
            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
 
-        # Get file path from
+        # Get file path from node data
        file_path = n.get("file_path", "unknown_source")
-        if not file_path or file_path == "unknown_source":
-            # Try to get from metadata
-            file_path = n.get("metadata", {}).get("file_path", "unknown_source")
 
        entites_section_list.append(
            [
@@ -1323,11 +1306,8 @@ async def _get_node_data(
        if isinstance(created_at, (int, float)):
            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
 
-        # Get file path from
+        # Get file path from edge data
        file_path = e.get("file_path", "unknown_source")
-        if not file_path or file_path == "unknown_source":
-            # Try to get from metadata
-            file_path = e.get("metadata", {}).get("file_path", "unknown_source")
 
        relations_section_list.append(
            [
@@ -1564,11 +1544,8 @@ async def _get_edge_data(
        if isinstance(created_at, (int, float)):
            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
 
-        # Get file path from
+        # Get file path from edge data
        file_path = e.get("file_path", "unknown_source")
-        if not file_path or file_path == "unknown_source":
-            # Try to get from metadata
-            file_path = e.get("metadata", {}).get("file_path", "unknown_source")
 
        relations_section_list.append(
            [
@@ -1594,11 +1571,8 @@ async def _get_edge_data(
        if isinstance(created_at, (int, float)):
            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
 
-        # Get file path from
+        # Get file path from node data
        file_path = n.get("file_path", "unknown_source")
-        if not file_path or file_path == "unknown_source":
-            # Try to get from metadata
-            file_path = n.get("metadata", {}).get("file_path", "unknown_source")
 
        entites_section_list.append(
            [
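The common thread across these hunks is that file_path now lives directly on entity and relationship payloads instead of being duplicated inside a nested metadata dict, so readers no longer need a metadata fallback. A before/after sketch of the payload shape (field values are placeholders):

# Before: file_path duplicated under a nested "metadata" block
old_payload = {
    "content": "Alice\nA researcher at ...",
    "source_id": "chunk-123",
    "file_path": "docs/paper.pdf",
    "metadata": {"created_at": 1700000000.0, "file_path": "docs/paper.pdf"},
}

# After: the flat field is the single source of truth
new_payload = {
    "content": "Alice\nA researcher at ...",
    "source_id": "chunk-123",
    "file_path": "docs/paper.pdf",
}
file_path = new_payload.get("file_path", "unknown_source")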
lightrag/utils.py
CHANGED
@@ -109,15 +109,17 @@ def setup_logger(
     logger_name: str,
     level: str = "INFO",
     add_filter: bool = False,
-    log_file_path: str = None,
+    log_file_path: str | None = None,
+    enable_file_logging: bool = True,
 ):
-    """Set up a logger with console and file handlers
+    """Set up a logger with console and optionally file handlers
 
     Args:
         logger_name: Name of the logger to set up
         level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
         add_filter: Whether to add LightragPathFilter to the logger
-        log_file_path: Path to the log file. If None,
+        log_file_path: Path to the log file. If None and file logging is enabled, defaults to lightrag.log in LOG_DIR or cwd
+        enable_file_logging: Whether to enable logging to a file (defaults to True)
     """
     # Configure formatters
     detailed_formatter = logging.Formatter(
@@ -125,18 +127,6 @@ def setup_logger(
     )
     simple_formatter = logging.Formatter("%(levelname)s: %(message)s")
 
-    # Get log file path
-    if log_file_path is None:
-        log_dir = os.getenv("LOG_DIR", os.getcwd())
-        log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
-
-    # Ensure log directory exists
-    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
-
-    # Get log file max size and backup count from environment variables
-    log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
-    log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
-
     logger_instance = logging.getLogger(logger_name)
     logger_instance.setLevel(level)
     logger_instance.handlers = []  # Clear existing handlers
@@ -148,16 +138,34 @@ def setup_logger(
     console_handler.setLevel(level)
     logger_instance.addHandler(console_handler)
 
-    # Add file handler
-
+    # Add file handler by default unless explicitly disabled
+    if enable_file_logging:
+        # Get log file path
+        if log_file_path is None:
+            log_dir = os.getenv("LOG_DIR", os.getcwd())
+            log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
+
+        # Ensure log directory exists
+        os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
+
+        # Get log file max size and backup count from environment variables
+        log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
+        log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
+
+        try:
+            # Add file handler
+            file_handler = logging.handlers.RotatingFileHandler(
+                filename=log_file_path,
+                maxBytes=log_max_bytes,
+                backupCount=log_backup_count,
+                encoding="utf-8",
+            )
+            file_handler.setFormatter(detailed_formatter)
+            file_handler.setLevel(level)
+            logger_instance.addHandler(file_handler)
+        except PermissionError as e:
+            logger.warning(f"Could not create log file at {log_file_path}: {str(e)}")
+            logger.warning("Continuing with console logging only")
 
     # Add path filter if requested
     if add_filter:
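A short usage sketch for the extended setup_logger signature; the logger name and LOG_DIR value below are illustrative:

import os
from lightrag.utils import setup_logger

# Console-only logging, e.g. in containers with a read-only filesystem
setup_logger("lightrag", level="INFO", enable_file_logging=False)

# Rotating file logging; size and backup count come from LOG_MAX_BYTES / LOG_BACKUP_COUNT
os.environ["LOG_DIR"] = "/tmp/lightrag-logs"
setup_logger("lightrag", level="DEBUG", add_filter=True)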