Commit
·
5e1537b
1
Parent(s):
c7cae5d
- Fix the bug inherited from the main branch that was caused by using doc['status'].
Browse files
- Improve the performance of Apache AGE.
- Revise the README.md for Apache AGE indexing.
- README.md +31 -2
- lightrag/kg/postgres_impl.py +20 -3
- lightrag/lightrag.py +1 -1
README.md
CHANGED
@@ -455,9 +455,38 @@ For production level scenarios you will most likely want to leverage an enterpri
|
|
455 |
* If you prefer docker, please start with this image if you are a beginner to avoid hiccups (DO read the overview): https://hub.docker.com/r/shangor/postgres-for-rag
|
456 |
* How to start? Refer to: [examples/lightrag_zhipu_postgres_demo.py](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_zhipu_postgres_demo.py)
|
457 |
* Example of creating indexes for AGE (change `dickens` below to your graph name if necessary):
|
458 |
-
```
|
|
|
459 |
SET search_path = ag_catalog, "$user", public;
|
460 |
-
CREATE INDEX
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
461 |
```
|
462 |
* Known issue with Apache AGE: the released versions have the following issue:
|
463 |
> You might find that the properties of the nodes/edges are empty.
|
|
|
455 |
* If you prefer docker, please start with this image if you are a beginner to avoid hiccups (DO read the overview): https://hub.docker.com/r/shangor/postgres-for-rag
|
456 |
* How to start? Refer to: [examples/lightrag_zhipu_postgres_demo.py](https://github.com/HKUDS/LightRAG/blob/main/examples/lightrag_zhipu_postgres_demo.py)
|
457 |
* Example of creating indexes for AGE (change `dickens` below to your graph name if necessary):
|
458 |
+
```sql
|
459 |
+
load 'age';
|
460 |
SET search_path = ag_catalog, "$user", public;
|
461 |
+
CREATE INDEX CONCURRENTLY entity_p_idx ON dickens."Entity" (id);
|
462 |
+
CREATE INDEX CONCURRENTLY vertex_p_idx ON dickens."_ag_label_vertex" (id);
|
463 |
+
CREATE INDEX CONCURRENTLY directed_p_idx ON dickens."DIRECTED" (id);
|
464 |
+
CREATE INDEX CONCURRENTLY directed_eid_idx ON dickens."DIRECTED" (end_id);
|
465 |
+
CREATE INDEX CONCURRENTLY directed_sid_idx ON dickens."DIRECTED" (start_id);
|
466 |
+
CREATE INDEX CONCURRENTLY directed_seid_idx ON dickens."DIRECTED" (start_id,end_id);
|
467 |
+
CREATE INDEX CONCURRENTLY edge_p_idx ON dickens."_ag_label_edge" (id);
|
468 |
+
CREATE INDEX CONCURRENTLY edge_sid_idx ON dickens."_ag_label_edge" (start_id);
|
469 |
+
CREATE INDEX CONCURRENTLY edge_eid_idx ON dickens."_ag_label_edge" (end_id);
|
470 |
+
CREATE INDEX CONCURRENTLY edge_seid_idx ON dickens."_ag_label_edge" (start_id,end_id);
|
471 |
+
create INDEX CONCURRENTLY vertex_idx_node_id ON dickens."_ag_label_vertex" (ag_catalog.agtype_access_operator(properties, '"node_id"'::agtype));
|
472 |
+
create INDEX CONCURRENTLY entity_idx_node_id ON dickens."Entity" (ag_catalog.agtype_access_operator(properties, '"node_id"'::agtype));
|
473 |
+
CREATE INDEX CONCURRENTLY entity_node_id_gin_idx ON dickens."Entity" using gin(properties);
|
474 |
+
ALTER TABLE dickens."DIRECTED" CLUSTER ON directed_sid_idx;
|
475 |
+
|
476 |
+
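-- drop if necessary (indexes live in the graph's schema; qualify the names, e.g. dickens.entity_p_idx, if that schema is not in your search_path)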
|
477 |
+
drop INDEX entity_p_idx;
|
478 |
+
drop INDEX vertex_p_idx;
|
479 |
+
drop INDEX directed_p_idx;
|
480 |
+
drop INDEX directed_eid_idx;
|
481 |
+
drop INDEX directed_sid_idx;
|
482 |
+
drop INDEX directed_seid_idx;
|
483 |
+
drop INDEX edge_p_idx;
|
484 |
+
drop INDEX edge_sid_idx;
|
485 |
+
drop INDEX edge_eid_idx;
|
486 |
+
drop INDEX edge_seid_idx;
|
487 |
+
drop INDEX vertex_idx_node_id;
|
488 |
+
drop INDEX entity_idx_node_id;
|
489 |
+
drop INDEX entity_node_id_gin_idx;
|
490 |
```
|
491 |
* Known issue with Apache AGE: the released versions have the following issue:
|
492 |
> You might find that the properties of the nodes/edges are empty.
|
lightrag/kg/postgres_impl.py
CHANGED
@@ -30,6 +30,7 @@ from ..base import (
|
|
30 |
DocStatus,
|
31 |
DocProcessingStatus,
|
32 |
BaseGraphStorage,
|
|
|
33 |
)
|
34 |
|
35 |
if sys.platform.startswith("win"):
|
@@ -442,6 +443,22 @@ class PGDocStatusStorage(DocStatusStorage):
|
|
442 |
existed = set([element["id"] for element in result])
|
443 |
return set(data) - existed
|
444 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
445 |
async def get_status_counts(self) -> Dict[str, int]:
|
446 |
"""Get counts of documents in each status"""
|
447 |
sql = """SELECT status as "status", COUNT(1) as "count"
|
@@ -884,9 +901,9 @@ class PGGraphStorage(BaseGraphStorage):
|
|
884 |
|
885 |
query = """SELECT * FROM cypher('%s', $$
|
886 |
MATCH (n:Entity {node_id: "%s"})
|
887 |
-
OPTIONAL MATCH (n)-[
|
888 |
-
RETURN n,
|
889 |
-
$$) AS (n agtype,
|
890 |
self.graph_name,
|
891 |
label,
|
892 |
)
|
|
|
30 |
DocStatus,
|
31 |
DocProcessingStatus,
|
32 |
BaseGraphStorage,
|
33 |
+
T,
|
34 |
)
|
35 |
|
36 |
if sys.platform.startswith("win"):
|
|
|
443 |
existed = set([element["id"] for element in result])
|
444 |
return set(data) - existed
|
445 |
|
446 |
+
async def get_by_id(self, id: str) -> Union[T, None]:
    """Fetch the processing status of a single document.

    Selects the row of LIGHTRAG_DOC_STATUS that matches the storage's
    workspace (``self.db.workspace``) and the given document id.

    Args:
        id: Identifier of the document to look up.

    Returns:
        A ``DocProcessingStatus`` built from the first matching row, or
        ``None`` when the query yields no rows.
    """
    sql = "select * from LIGHTRAG_DOC_STATUS where workspace=$1 and id=$2"
    params = {"workspace": self.db.workspace, "id": id}
    # The third positional argument is True and the result is indexed as a
    # sequence of rows below -- presumably the multi-row flag; confirm
    # against the db.query signature.
    result = await self.db.query(sql, params, True)
    # A lookup that matches nothing may come back as an empty list rather
    # than None; indexing it would raise IndexError, so treat every empty
    # result as "not found".
    if not result:
        return None
    row = result[0]
    return DocProcessingStatus(
        content_length=row["content_length"],
        content_summary=row["content_summary"],
        status=row["status"],
        chunks_count=row["chunks_count"],
        created_at=row["created_at"],
        updated_at=row["updated_at"],
    )
|
461 |
+
|
462 |
async def get_status_counts(self) -> Dict[str, int]:
|
463 |
"""Get counts of documents in each status"""
|
464 |
sql = """SELECT status as "status", COUNT(1) as "count"
|
|
|
901 |
|
902 |
query = """SELECT * FROM cypher('%s', $$
|
903 |
MATCH (n:Entity {node_id: "%s"})
|
904 |
+
OPTIONAL MATCH (n)-[]-(connected)
|
905 |
+
RETURN n, connected
|
906 |
+
$$) AS (n agtype, connected agtype)""" % (
|
907 |
self.graph_name,
|
908 |
label,
|
909 |
)
|
lightrag/lightrag.py
CHANGED
@@ -373,7 +373,7 @@ class LightRAG:
|
|
373 |
doc_id
|
374 |
for doc_id in new_docs.keys()
|
375 |
if (current_doc := await self.doc_status.get_by_id(doc_id)) is None
|
376 |
-
or current_doc
|
377 |
}
|
378 |
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
|
379 |
|
|
|
373 |
doc_id
|
374 |
for doc_id in new_docs.keys()
|
375 |
if (current_doc := await self.doc_status.get_by_id(doc_id)) is None
|
376 |
+
or current_doc.status == DocStatus.FAILED
|
377 |
}
|
378 |
new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
|
379 |
|