samuel-z-chen committed on
Commit
5e1537b
·
1 Parent(s): c7cae5d

- Fix the upstream bug caused by using doc['status']

Browse files

- Improve the performance of Apache AGE.
- Revise the README.md for Apache AGE indexing.

Files changed (3) hide show
  1. README.md +31 -2
  2. lightrag/kg/postgres_impl.py +20 -3
  3. lightrag/lightrag.py +1 -1
README.md CHANGED
@@ -455,9 +455,38 @@ For production level scenarios you will most likely want to leverage an enterpri
455
  * If you prefer docker, please start with this image if you are a beginner to avoid hiccups (DO read the overview): https://hub.docker.com/r/shangor/postgres-for-rag
456
  * How to start? Ref to: [examples/lightrag_zhipu_postgres_demo.py](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_zhipu_postgres_demo.py)
457
  * Create index for AGE example: (Change below `dickens` to your graph name if necessary)
458
- ```
 
459
  SET search_path = ag_catalog, "$user", public;
460
- CREATE INDEX idx_entity ON dickens."Entity" USING gin (agtype_access_operator(properties, '"node_id"'));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
461
  ```
462
  * Known issue of the Apache AGE: The released versions got below issue:
463
  > You might find that the properties of the nodes/edges are empty.
 
455
  * If you prefer docker, please start with this image if you are a beginner to avoid hiccups (DO read the overview): https://hub.docker.com/r/shangor/postgres-for-rag
456
  * How to start? Ref to: [examples/lightrag_zhipu_postgres_demo.py](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_zhipu_postgres_demo.py)
457
  * Create index for AGE example: (Change below `dickens` to your graph name if necessary)
458
+ ```sql
459
+ load 'age';
460
  SET search_path = ag_catalog, "$user", public;
461
+ CREATE INDEX CONCURRENTLY entity_p_idx ON dickens."Entity" (id);
462
+ CREATE INDEX CONCURRENTLY vertex_p_idx ON dickens."_ag_label_vertex" (id);
463
+ CREATE INDEX CONCURRENTLY directed_p_idx ON dickens."DIRECTED" (id);
464
+ CREATE INDEX CONCURRENTLY directed_eid_idx ON dickens."DIRECTED" (end_id);
465
+ CREATE INDEX CONCURRENTLY directed_sid_idx ON dickens."DIRECTED" (start_id);
466
+ CREATE INDEX CONCURRENTLY directed_seid_idx ON dickens."DIRECTED" (start_id,end_id);
467
+ CREATE INDEX CONCURRENTLY edge_p_idx ON dickens."_ag_label_edge" (id);
468
+ CREATE INDEX CONCURRENTLY edge_sid_idx ON dickens."_ag_label_edge" (start_id);
469
+ CREATE INDEX CONCURRENTLY edge_eid_idx ON dickens."_ag_label_edge" (end_id);
470
+ CREATE INDEX CONCURRENTLY edge_seid_idx ON dickens."_ag_label_edge" (start_id,end_id);
471
+ create INDEX CONCURRENTLY vertex_idx_node_id ON dickens."_ag_label_vertex" (ag_catalog.agtype_access_operator(properties, '"node_id"'::agtype));
472
+ create INDEX CONCURRENTLY entity_idx_node_id ON dickens."Entity" (ag_catalog.agtype_access_operator(properties, '"node_id"'::agtype));
473
+ CREATE INDEX CONCURRENTLY entity_node_id_gin_idx ON dickens."Entity" using gin(properties);
474
+ ALTER TABLE dickens."DIRECTED" CLUSTER ON directed_sid_idx;
475
+
476
+ -- drop if necessary
477
+ drop INDEX entity_p_idx;
478
+ drop INDEX vertex_p_idx;
479
+ drop INDEX directed_p_idx;
480
+ drop INDEX directed_eid_idx;
481
+ drop INDEX directed_sid_idx;
482
+ drop INDEX directed_seid_idx;
483
+ drop INDEX edge_p_idx;
484
+ drop INDEX edge_sid_idx;
485
+ drop INDEX edge_eid_idx;
486
+ drop INDEX edge_seid_idx;
487
+ drop INDEX vertex_idx_node_id;
488
+ drop INDEX entity_idx_node_id;
489
+ drop INDEX entity_node_id_gin_idx;
490
  ```
491
  * Known issue of the Apache AGE: The released versions got below issue:
492
  > You might find that the properties of the nodes/edges are empty.
lightrag/kg/postgres_impl.py CHANGED
@@ -30,6 +30,7 @@ from ..base import (
30
  DocStatus,
31
  DocProcessingStatus,
32
  BaseGraphStorage,
 
33
  )
34
 
35
  if sys.platform.startswith("win"):
@@ -442,6 +443,22 @@ class PGDocStatusStorage(DocStatusStorage):
442
  existed = set([element["id"] for element in result])
443
  return set(data) - existed
444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  async def get_status_counts(self) -> Dict[str, int]:
446
  """Get counts of documents in each status"""
447
  sql = """SELECT status as "status", COUNT(1) as "count"
@@ -884,9 +901,9 @@ class PGGraphStorage(BaseGraphStorage):
884
 
885
  query = """SELECT * FROM cypher('%s', $$
886
  MATCH (n:Entity {node_id: "%s"})
887
- OPTIONAL MATCH (n)-[r]-(connected)
888
- RETURN n, r, connected
889
- $$) AS (n agtype, r agtype, connected agtype)""" % (
890
  self.graph_name,
891
  label,
892
  )
 
30
  DocStatus,
31
  DocProcessingStatus,
32
  BaseGraphStorage,
33
+ T,
34
  )
35
 
36
  if sys.platform.startswith("win"):
 
443
  existed = set([element["id"] for element in result])
444
  return set(data) - existed
445
 
446
+ async def get_by_id(self, id: str) -> Union[T, None]:
447
+ sql = "select * from LIGHTRAG_DOC_STATUS where workspace=$1 and id=$2"
448
+ params = {"workspace": self.db.workspace, "id": id}
449
+ result = await self.db.query(sql, params, True)
450
+ if result is None:
451
+ return None
452
+ else:
453
+ return DocProcessingStatus(
454
+ content_length=result[0]["content_length"],
455
+ content_summary=result[0]["content_summary"],
456
+ status=result[0]["status"],
457
+ chunks_count=result[0]["chunks_count"],
458
+ created_at=result[0]["created_at"],
459
+ updated_at=result[0]["updated_at"],
460
+ )
461
+
462
  async def get_status_counts(self) -> Dict[str, int]:
463
  """Get counts of documents in each status"""
464
  sql = """SELECT status as "status", COUNT(1) as "count"
 
901
 
902
  query = """SELECT * FROM cypher('%s', $$
903
  MATCH (n:Entity {node_id: "%s"})
904
+ OPTIONAL MATCH (n)-[]-(connected)
905
+ RETURN n, connected
906
+ $$) AS (n agtype, connected agtype)""" % (
907
  self.graph_name,
908
  label,
909
  )
lightrag/lightrag.py CHANGED
@@ -373,7 +373,7 @@ class LightRAG:
373
  doc_id
374
  for doc_id in new_docs.keys()
375
  if (current_doc := await self.doc_status.get_by_id(doc_id)) is None
376
- or current_doc["status"] == DocStatus.FAILED
377
  }
378
  new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
379
 
 
373
  doc_id
374
  for doc_id in new_docs.keys()
375
  if (current_doc := await self.doc_status.get_by_id(doc_id)) is None
376
+ or current_doc.status == DocStatus.FAILED
377
  }
378
  new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
379