Milin committed
Commit d95261a · 2 Parent(s): 1ca22c5 54deb16

Merge branch 'main' of github.com:lcjqyml/LightRAG
env.example CHANGED
@@ -73,6 +73,8 @@ LLM_BINDING_HOST=http://localhost:11434
 ### Embedding Configuration (Use valid host. For local services installed with docker, you can use host.docker.internal)
 EMBEDDING_MODEL=bge-m3:latest
 EMBEDDING_DIM=1024
+EMBEDDING_BATCH_NUM=32
+EMBEDDING_FUNC_MAX_ASYNC=16
 # EMBEDDING_BINDING_API_KEY=your_api_key
 ### ollama example
 EMBEDDING_BINDING=ollama
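Note: on the Python side these two variables are read with int(os.getenv(...)) and fall back to the same defaults (see the lightrag.py change below). A minimal illustrative sketch:

import os

# Defaults match the values shipped in env.example (32 and 16).
embedding_batch_num = int(os.getenv("EMBEDDING_BATCH_NUM", 32))
embedding_func_max_async = int(os.getenv("EMBEDDING_FUNC_MAX_ASYNC", 16))
print(embedding_batch_num, embedding_func_max_async)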
lightrag/__init__.py CHANGED
@@ -1,5 +1,5 @@
 from .lightrag import LightRAG as LightRAG, QueryParam as QueryParam

-__version__ = "1.2.6"
+__version__ = "1.2.7"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/LightRAG"
lightrag/kg/postgres_impl.py CHANGED
@@ -747,8 +747,30 @@ class PGDocStatusStorage(DocStatusStorage):
         )

     async def get_by_ids(self, ids: list[str]) -> list[dict[str, Any]]:
-        """Get doc_chunks data by id"""
-        raise NotImplementedError
+        """Get doc_chunks data by multiple IDs."""
+        if not ids:
+            return []
+
+        sql = "SELECT * FROM LIGHTRAG_DOC_STATUS WHERE workspace=$1 AND id = ANY($2)"
+        params = {"workspace": self.db.workspace, "ids": ids}
+
+        results = await self.db.query(sql, params, True)
+
+        if not results:
+            return []
+        return [
+            {
+                "content": row["content"],
+                "content_length": row["content_length"],
+                "content_summary": row["content_summary"],
+                "status": row["status"],
+                "chunks_count": row["chunks_count"],
+                "created_at": row["created_at"],
+                "updated_at": row["updated_at"],
+                "file_path": row["file_path"],
+            }
+            for row in results
+        ]

     async def get_status_counts(self) -> dict[str, int]:
         """Get counts of documents in each status"""
@@ -1570,7 +1592,7 @@ TABLES = {
                    content_vector VECTOR,
                    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    update_time TIMESTAMP,
-                   chunk_id TEXT NULL,
+                   chunk_ids VARCHAR(255)[] NULL,
                    file_path TEXT NULL,
                    CONSTRAINT LIGHTRAG_VDB_ENTITY_PK PRIMARY KEY (workspace, id)
                    )"""
@@ -1585,7 +1607,7 @@ TABLES = {
                    content_vector VECTOR,
                    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    update_time TIMESTAMP,
-                   chunk_id TEXT NULL,
+                   chunk_ids VARCHAR(255)[] NULL,
                    file_path TEXT NULL,
                    CONSTRAINT LIGHTRAG_VDB_RELATION_PK PRIMARY KEY (workspace, id)
                    )"""
@@ -1673,7 +1695,7 @@ SQL_TEMPLATES = {
    """,
    "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
                       content_vector, chunk_ids, file_path)
-                      VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7::varchar[])
+                      VALUES ($1, $2, $3, $4, $5, $6::varchar[], $7)
                       ON CONFLICT (workspace,id) DO UPDATE
                       SET entity_name=EXCLUDED.entity_name,
                       content=EXCLUDED.content,
@@ -1684,7 +1706,7 @@ SQL_TEMPLATES = {
    """,
    "upsert_relationship": """INSERT INTO LIGHTRAG_VDB_RELATION (workspace, id, source_id,
                       target_id, content, content_vector, chunk_ids, file_path)
-                      VALUES ($1, $2, $3, $4, $5, $6, $7::varchar[], $8::varchar[])
+                      VALUES ($1, $2, $3, $4, $5, $6, $7::varchar[], $8)
                       ON CONFLICT (workspace,id) DO UPDATE
                       SET source_id=EXCLUDED.source_id,
                       target_id=EXCLUDED.target_id,
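Note: a rough usage sketch for the new PGDocStatusStorage.get_by_ids; the storage instance and document IDs are placeholders, only the call shape and returned keys come from the implementation above.

# Hypothetical caller; `doc_status` is assumed to be an initialized PGDocStatusStorage.
async def print_doc_statuses(doc_status, doc_ids: list[str]) -> None:
    rows = await doc_status.get_by_ids(doc_ids)  # returns [] for an empty id list
    for row in rows:
        print(row["status"], row["chunks_count"], row["file_path"])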
lightrag/lightrag.py CHANGED
@@ -183,10 +183,10 @@ class LightRAG:
     embedding_func: EmbeddingFunc | None = field(default=None)
     """Function for computing text embeddings. Must be set before use."""

-    embedding_batch_num: int = field(default=32)
+    embedding_batch_num: int = field(default=int(os.getenv("EMBEDDING_BATCH_NUM", 32)))
     """Batch size for embedding computations."""

-    embedding_func_max_async: int = field(default=16)
+    embedding_func_max_async: int = field(default=int(os.getenv("EMBEDDING_FUNC_MAX_ASYNC", 16)))
     """Maximum number of concurrent embedding function calls."""

     embedding_cache_config: dict[str, Any] = field(
@@ -1947,6 +1947,8 @@ class LightRAG:

            # 2. Update entity information in the graph
            new_node_data = {**node_data, **updated_data}
+            new_node_data["entity_id"] = new_entity_name
+
            if "entity_name" in new_node_data:
                del new_node_data[
                    "entity_name"
@@ -1963,7 +1965,7 @@

            # Store relationships that need to be updated
            relations_to_update = []
-
+            relations_to_delete = []
            # Get all edges related to the original entity
            edges = await self.chunk_entity_relation_graph.get_node_edges(
                entity_name
@@ -1975,6 +1977,12 @@
                        source, target
                    )
                    if edge_data:
+                        relations_to_delete.append(
+                            compute_mdhash_id(source + target, prefix="rel-")
+                        )
+                        relations_to_delete.append(
+                            compute_mdhash_id(target + source, prefix="rel-")
+                        )
                        if source == entity_name:
                            await self.chunk_entity_relation_graph.upsert_edge(
                                new_entity_name, target, edge_data
@@ -2000,6 +2008,12 @@
                f"Deleted old entity '{entity_name}' and its vector embedding from database"
            )

+            # Delete old relation records from vector database
+            await self.relationships_vdb.delete(relations_to_delete)
+            logger.info(
+                f"Deleted {len(relations_to_delete)} relation records for entity '{entity_name}' from vector database"
+            )
+
            # Update relationship vector representations
            for src, tgt, edge_data in relations_to_update:
                description = edge_data.get("description", "")
@@ -2498,39 +2512,21 @@
            # 4. Get all relationships of the source entities
            all_relations = []
            for entity_name in source_entities:
-                # Get all relationships where this entity is the source
-                outgoing_edges = await self.chunk_entity_relation_graph.get_node_edges(
+                # Get all relationships of the source entities
+                edges = await self.chunk_entity_relation_graph.get_node_edges(
                    entity_name
                )
-                if outgoing_edges:
-                    for src, tgt in outgoing_edges:
+                if edges:
+                    for src, tgt in edges:
                        # Ensure src is the current entity
                        if src == entity_name:
                            edge_data = await self.chunk_entity_relation_graph.get_edge(
                                src, tgt
                            )
-                            all_relations.append(("outgoing", src, tgt, edge_data))
-
-                # Get all relationships where this entity is the target
-                incoming_edges = []
-                all_labels = await self.chunk_entity_relation_graph.get_all_labels()
-                for label in all_labels:
-                    if label == entity_name:
-                        continue
-                    node_edges = await self.chunk_entity_relation_graph.get_node_edges(
-                        label
-                    )
-                    for src, tgt in node_edges or []:
-                        if tgt == entity_name:
-                            incoming_edges.append((src, tgt))
-
-                for src, tgt in incoming_edges:
-                    edge_data = await self.chunk_entity_relation_graph.get_edge(
-                        src, tgt
-                    )
-                    all_relations.append(("incoming", src, tgt, edge_data))
+                            all_relations.append((src, tgt, edge_data))

            # 5. Create or update the target entity
+            merged_entity_data["entity_id"] = target_entity
            if not target_exists:
                await self.chunk_entity_relation_graph.upsert_node(
                    target_entity, merged_entity_data
@@ -2544,8 +2540,11 @@

            # 6. Recreate all relationships, pointing to the target entity
            relation_updates = {}  # Track relationships that need to be merged
+            relations_to_delete = []

-            for rel_type, src, tgt, edge_data in all_relations:
+            for src, tgt, edge_data in all_relations:
+                relations_to_delete.append(compute_mdhash_id(src + tgt, prefix="rel-"))
+                relations_to_delete.append(compute_mdhash_id(tgt + src, prefix="rel-"))
                new_src = target_entity if src in source_entities else src
                new_tgt = target_entity if tgt in source_entities else tgt

@@ -2590,6 +2589,12 @@
                    f"Created or updated relationship: {rel_data['src']} -> {rel_data['tgt']}"
                )

+            # Delete relationships records from vector database
+            await self.relationships_vdb.delete(relations_to_delete)
+            logger.info(
+                f"Deleted {len(relations_to_delete)} relation records for entity '{entity_name}' from vector database"
+            )
+
            # 7. Update entity vector representation
            description = merged_entity_data.get("description", "")
            source_id = merged_entity_data.get("source_id", "")
@@ -2652,19 +2657,6 @@
                entity_id = compute_mdhash_id(entity_name, prefix="ent-")
                await self.entities_vdb.delete([entity_id])

-                # Also ensure any relationships specific to this entity are deleted from vector DB
-                # This is a safety check, as these should have been transformed to the target entity already
-                entity_relation_prefix = compute_mdhash_id(entity_name, prefix="rel-")
-                relations_with_entity = await self.relationships_vdb.search_by_prefix(
-                    entity_relation_prefix
-                )
-                if relations_with_entity:
-                    relation_ids = [r["id"] for r in relations_with_entity]
-                    await self.relationships_vdb.delete(relation_ids)
-                    logger.info(
-                        f"Deleted {len(relation_ids)} relation records for entity '{entity_name}' from vector database"
-                    )
-
                logger.info(
                    f"Deleted source entity '{entity_name}' and its vector embedding from database"
                )
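Note: relation records in the vector database are keyed by a hash of the concatenated endpoint names, so both orderings are queued for deletion because a stored edge may use either direction. A small sketch, assuming compute_mdhash_id is the md5-based helper from lightrag.utils:

from hashlib import md5

def compute_mdhash_id(content: str, prefix: str = "") -> str:
    # Assumed to mirror lightrag.utils.compute_mdhash_id: prefix + md5 hex digest.
    return prefix + md5(content.encode()).hexdigest()

src, tgt = "Entity A", "Entity B"
relations_to_delete = [
    compute_mdhash_id(src + tgt, prefix="rel-"),  # edge stored as (src, tgt)
    compute_mdhash_id(tgt + src, prefix="rel-"),  # edge stored as (tgt, src)
]
print(relations_to_delete)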
lightrag/llm/hf.py CHANGED
@@ -138,16 +138,31 @@ async def hf_model_complete(


 async def hf_embed(texts: list[str], tokenizer, embed_model) -> np.ndarray:
-    device = next(embed_model.parameters()).device
+    # Detect the appropriate device
+    if torch.cuda.is_available():
+        device = next(embed_model.parameters()).device  # Use CUDA if available
+    elif torch.backends.mps.is_available():
+        device = torch.device("mps")  # Use MPS for Apple Silicon
+    else:
+        device = torch.device("cpu")  # Fallback to CPU
+
+    # Move the model to the detected device
+    embed_model = embed_model.to(device)
+
+    # Tokenize the input texts and move them to the same device
     encoded_texts = tokenizer(
         texts, return_tensors="pt", padding=True, truncation=True
     ).to(device)
+
+    # Perform inference
     with torch.no_grad():
         outputs = embed_model(
             input_ids=encoded_texts["input_ids"],
             attention_mask=encoded_texts["attention_mask"],
         )
     embeddings = outputs.last_hidden_state.mean(dim=1)
+
+    # Convert embeddings to NumPy
     if embeddings.dtype == torch.bfloat16:
         return embeddings.detach().to(torch.float32).cpu().numpy()
     else:
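Note: a hedged usage sketch for the updated hf_embed; the model name and surrounding setup are illustrative assumptions, only the hf_embed signature comes from this file.

import asyncio

from transformers import AutoModel, AutoTokenizer

from lightrag.llm.hf import hf_embed

# Any encoder-style HF model should do; this name is just an example.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
embed_model = AutoModel.from_pretrained(model_name)

# hf_embed now selects CUDA, MPS, or CPU itself and moves the model there.
embeddings = asyncio.run(hf_embed(["hello world", "LightRAG"], tokenizer, embed_model))
print(embeddings.shape)  # (2, hidden_size)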
lightrag/operate.py CHANGED
@@ -172,7 +172,7 @@ async def _handle_single_entity_extraction(
         entity_type=entity_type,
         description=entity_description,
         source_id=chunk_key,
-        metadata={"created_at": time.time(), "file_path": file_path},
+        file_path=file_path,
     )


@@ -201,7 +201,7 @@ async def _handle_single_relationship_extraction(
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
-        metadata={"created_at": time.time(), "file_path": file_path},
+        file_path=file_path,
     )


@@ -224,9 +224,7 @@ async def _merge_nodes_then_upsert(
            split_string_by_multi_markers(already_node["source_id"], [GRAPH_FIELD_SEP])
        )
        already_file_paths.extend(
-            split_string_by_multi_markers(
-                already_node["metadata"]["file_path"], [GRAPH_FIELD_SEP]
-            )
+            split_string_by_multi_markers(already_node["file_path"], [GRAPH_FIELD_SEP])
        )
        already_description.append(already_node["description"])

@@ -244,7 +242,7 @@
        set([dp["source_id"] for dp in nodes_data] + already_source_ids)
    )
    file_path = GRAPH_FIELD_SEP.join(
-        set([dp["metadata"]["file_path"] for dp in nodes_data] + already_file_paths)
+        set([dp["file_path"] for dp in nodes_data] + already_file_paths)
    )

    logger.debug(f"file_path: {file_path}")
@@ -298,7 +296,7 @@ async def _merge_edges_then_upsert(
        if already_edge.get("file_path") is not None:
            already_file_paths.extend(
                split_string_by_multi_markers(
-                    already_edge["metadata"]["file_path"], [GRAPH_FIELD_SEP]
+                    already_edge["file_path"], [GRAPH_FIELD_SEP]
                )
            )

@@ -340,11 +338,7 @@
    )
    file_path = GRAPH_FIELD_SEP.join(
        set(
-            [
-                dp["metadata"]["file_path"]
-                for dp in edges_data
-                if dp.get("metadata", {}).get("file_path")
-            ]
+            [dp["file_path"] for dp in edges_data if dp.get("file_path")]
            + already_file_paths
        )
    )
@@ -679,10 +673,6 @@ async def extract_entities(
                "content": f"{dp['entity_name']}\n{dp['description']}",
                "source_id": dp["source_id"],
                "file_path": dp.get("file_path", "unknown_source"),
-                "metadata": {
-                    "created_at": dp.get("created_at", time.time()),
-                    "file_path": dp.get("file_path", "unknown_source"),
-                },
            }
            for dp in all_entities_data
        }
@@ -697,10 +687,6 @@
                "content": f"{dp['src_id']}\t{dp['tgt_id']}\n{dp['keywords']}\n{dp['description']}",
                "source_id": dp["source_id"],
                "file_path": dp.get("file_path", "unknown_source"),
-                "metadata": {
-                    "created_at": dp.get("created_at", time.time()),
-                    "file_path": dp.get("file_path", "unknown_source"),
-                },
            }
            for dp in all_relationships_data
        }
@@ -1285,11 +1271,8 @@ async def _get_node_data(
        if isinstance(created_at, (int, float)):
            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))

-        # Get file path from metadata or directly from node data
+        # Get file path from node data
        file_path = n.get("file_path", "unknown_source")
-        if not file_path or file_path == "unknown_source":
-            # Try to get from metadata
-            file_path = n.get("metadata", {}).get("file_path", "unknown_source")

        entites_section_list.append(
            [
@@ -1323,11 +1306,8 @@
        if isinstance(created_at, (int, float)):
            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))

-        # Get file path from metadata or directly from edge data
+        # Get file path from edge data
        file_path = e.get("file_path", "unknown_source")
-        if not file_path or file_path == "unknown_source":
-            # Try to get from metadata
-            file_path = e.get("metadata", {}).get("file_path", "unknown_source")

        relations_section_list.append(
            [
@@ -1564,11 +1544,8 @@ async def _get_edge_data(
        if isinstance(created_at, (int, float)):
            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))

-        # Get file path from metadata or directly from edge data
+        # Get file path from edge data
        file_path = e.get("file_path", "unknown_source")
-        if not file_path or file_path == "unknown_source":
-            # Try to get from metadata
-            file_path = e.get("metadata", {}).get("file_path", "unknown_source")

        relations_section_list.append(
            [
@@ -1594,11 +1571,8 @@
        if isinstance(created_at, (int, float)):
            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))

-        # Get file path from metadata or directly from node data
+        # Get file path from node data
        file_path = n.get("file_path", "unknown_source")
-        if not file_path or file_path == "unknown_source":
-            # Try to get from metadata
-            file_path = n.get("metadata", {}).get("file_path", "unknown_source")

        entites_section_list.append(
            [
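Note: file_path is now carried as a flat string joined with GRAPH_FIELD_SEP instead of a nested metadata dict. A sketch of the merge pattern used in _merge_nodes_then_upsert, assuming GRAPH_FIELD_SEP is the "<SEP>" delimiter used elsewhere in the project; merge_file_paths is a hypothetical helper for illustration only:

GRAPH_FIELD_SEP = "<SEP>"  # assumption: the project-wide field separator

def merge_file_paths(already_stored: str, new_paths: list[str]) -> str:
    # Split the stored value, add the new paths, deduplicate, and re-join,
    # mirroring the GRAPH_FIELD_SEP.join(set(...)) pattern above.
    already = already_stored.split(GRAPH_FIELD_SEP) if already_stored else []
    return GRAPH_FIELD_SEP.join(set(new_paths + already))

print(merge_file_paths("a.txt<SEP>b.txt", ["b.txt", "c.txt"]))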
lightrag/utils.py CHANGED
@@ -109,15 +109,17 @@ def setup_logger(
     logger_name: str,
     level: str = "INFO",
     add_filter: bool = False,
-    log_file_path: str = None,
+    log_file_path: str | None = None,
+    enable_file_logging: bool = True,
 ):
-    """Set up a logger with console and file handlers
+    """Set up a logger with console and optionally file handlers

     Args:
         logger_name: Name of the logger to set up
         level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
         add_filter: Whether to add LightragPathFilter to the logger
-        log_file_path: Path to the log file. If None, will use current directory/lightrag.log
+        log_file_path: Path to the log file. If None and file logging is enabled, defaults to lightrag.log in LOG_DIR or cwd
+        enable_file_logging: Whether to enable logging to a file (defaults to True)
     """
     # Configure formatters
     detailed_formatter = logging.Formatter(
@@ -125,18 +127,6 @@
     )
     simple_formatter = logging.Formatter("%(levelname)s: %(message)s")

-    # Get log file path
-    if log_file_path is None:
-        log_dir = os.getenv("LOG_DIR", os.getcwd())
-        log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
-
-    # Ensure log directory exists
-    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
-
-    # Get log file max size and backup count from environment variables
-    log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
-    log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
-
     logger_instance = logging.getLogger(logger_name)
     logger_instance.setLevel(level)
     logger_instance.handlers = []  # Clear existing handlers
@@ -148,16 +138,34 @@
     console_handler.setLevel(level)
     logger_instance.addHandler(console_handler)

-    # Add file handler
-    file_handler = logging.handlers.RotatingFileHandler(
-        filename=log_file_path,
-        maxBytes=log_max_bytes,
-        backupCount=log_backup_count,
-        encoding="utf-8",
-    )
-    file_handler.setFormatter(detailed_formatter)
-    file_handler.setLevel(level)
-    logger_instance.addHandler(file_handler)
+    # Add file handler by default unless explicitly disabled
+    if enable_file_logging:
+        # Get log file path
+        if log_file_path is None:
+            log_dir = os.getenv("LOG_DIR", os.getcwd())
+            log_file_path = os.path.abspath(os.path.join(log_dir, "lightrag.log"))
+
+        # Ensure log directory exists
+        os.makedirs(os.path.dirname(log_file_path), exist_ok=True)
+
+        # Get log file max size and backup count from environment variables
+        log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
+        log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups
+
+        try:
+            # Add file handler
+            file_handler = logging.handlers.RotatingFileHandler(
+                filename=log_file_path,
+                maxBytes=log_max_bytes,
+                backupCount=log_backup_count,
+                encoding="utf-8",
+            )
+            file_handler.setFormatter(detailed_formatter)
+            file_handler.setLevel(level)
+            logger_instance.addHandler(file_handler)
+        except PermissionError as e:
+            logger.warning(f"Could not create log file at {log_file_path}: {str(e)}")
+            logger.warning("Continuing with console logging only")

     # Add path filter if requested
     if add_filter:
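Note: a short usage sketch for the extended setup_logger; the logger name is arbitrary, the keyword arguments follow the new signature above.

import logging

from lightrag.utils import setup_logger

# Console-only logging via the new flag; file logging stays on by default.
setup_logger("lightrag", level="INFO", enable_file_logging=False)
logging.getLogger("lightrag").info("console handler only, no lightrag.log written")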