Roy
committed on
Commit
·
ddd0cf7
1
Parent(s):
7c21f76
Refactor requirements and code formatting
Browse files
- Simplified requirements.txt by removing specific version constraints
- Added comment about extra library installation using pipmaster
- Improved code formatting in base.py, operate.py, and postgres_impl.py
- Cleaned up SQL templates and query method signatures with consistent formatting
- lightrag/base.py +4 -1
- lightrag/kg/postgres_impl.py +30 -30
- lightrag/operate.py +12 -4
- requirements.txt +2 -50
lightrag/base.py
CHANGED
@@ -108,8 +108,11 @@ class BaseVectorStorage(StorageNameSpace, ABC):
|
|
108 |
embedding_func: EmbeddingFunc
|
109 |
cosine_better_than_threshold: float = field(default=0.2)
|
110 |
meta_fields: set[str] = field(default_factory=set)
|
|
|
111 |
@abstractmethod
|
112 |
-
async def query(
|
|
|
|
|
113 |
"""Query the vector storage and retrieve top_k results."""
|
114 |
|
115 |
@abstractmethod
|
|
|
108 |
embedding_func: EmbeddingFunc
|
109 |
cosine_better_than_threshold: float = field(default=0.2)
|
110 |
meta_fields: set[str] = field(default_factory=set)
|
111 |
+
|
112 |
@abstractmethod
|
113 |
+
async def query(
|
114 |
+
self, query: str, top_k: int, ids: list[str] | None = None
|
115 |
+
) -> list[dict[str, Any]]:
|
116 |
"""Query the vector storage and retrieve top_k results."""
|
117 |
|
118 |
@abstractmethod
|
lightrag/kg/postgres_impl.py
CHANGED
@@ -439,7 +439,7 @@ class PGVectorStorage(BaseVectorStorage):
|
|
439 |
"content": item["content"],
|
440 |
"content_vector": json.dumps(item["__vector__"].tolist()),
|
441 |
"chunk_id": item["source_id"],
|
442 |
-
#TODO: add document_id
|
443 |
}
|
444 |
return upsert_sql, data
|
445 |
|
@@ -452,8 +452,8 @@ class PGVectorStorage(BaseVectorStorage):
|
|
452 |
"target_id": item["tgt_id"],
|
453 |
"content": item["content"],
|
454 |
"content_vector": json.dumps(item["__vector__"].tolist()),
|
455 |
-
"chunk_id": item["source_id"]
|
456 |
-
#TODO: add document_id
|
457 |
}
|
458 |
return upsert_sql, data
|
459 |
|
@@ -496,7 +496,9 @@ class PGVectorStorage(BaseVectorStorage):
|
|
496 |
await self.db.execute(upsert_sql, data)
|
497 |
|
498 |
#################### query method ###############
|
499 |
-
async def query(
|
|
|
|
|
500 |
embeddings = await self.embedding_func([query])
|
501 |
embedding = embeddings[0]
|
502 |
embedding_string = ",".join(map(str, embedding))
|
@@ -505,10 +507,9 @@ class PGVectorStorage(BaseVectorStorage):
|
|
505 |
formatted_ids = ",".join(f"'{id}'" for id in ids)
|
506 |
else:
|
507 |
formatted_ids = "NULL"
|
508 |
-
|
509 |
sql = SQL_TEMPLATES[self.base_namespace].format(
|
510 |
-
embedding_string=embedding_string,
|
511 |
-
doc_ids=formatted_ids
|
512 |
)
|
513 |
params = {
|
514 |
"workspace": self.db.workspace,
|
@@ -1598,7 +1599,7 @@ SQL_TEMPLATES = {
|
|
1598 |
content_vector=EXCLUDED.content_vector,
|
1599 |
update_time = CURRENT_TIMESTAMP
|
1600 |
""",
|
1601 |
-
"upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
|
1602 |
content_vector, chunk_id)
|
1603 |
VALUES ($1, $2, $3, $4, $5, $6)
|
1604 |
ON CONFLICT (workspace,id) DO UPDATE
|
@@ -1657,54 +1658,53 @@ SQL_TEMPLATES = {
|
|
1657 |
""",
|
1658 |
"relationships": """
|
1659 |
WITH relevant_chunks AS (
|
1660 |
-
SELECT id as chunk_id
|
1661 |
-
FROM LIGHTRAG_DOC_CHUNKS
|
1662 |
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
|
1663 |
)
|
1664 |
-
SELECT source_id as src_id, target_id as tgt_id
|
1665 |
FROM (
|
1666 |
SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
|
1667 |
FROM LIGHTRAG_VDB_RELATION r
|
1668 |
-
WHERE r.workspace=$1
|
1669 |
AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
|
1670 |
) filtered
|
1671 |
-
WHERE distance>$2
|
1672 |
-
ORDER BY distance DESC
|
1673 |
LIMIT $3
|
1674 |
""",
|
1675 |
-
"entities":
|
1676 |
-
'''
|
1677 |
WITH relevant_chunks AS (
|
1678 |
-
SELECT id as chunk_id
|
1679 |
-
FROM LIGHTRAG_DOC_CHUNKS
|
1680 |
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
|
1681 |
)
|
1682 |
SELECT entity_name FROM
|
1683 |
(
|
1684 |
SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
|
1685 |
-
FROM LIGHTRAG_VDB_ENTITY
|
1686 |
where workspace=$1
|
1687 |
AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
|
1688 |
)
|
1689 |
-
WHERE distance>$2
|
1690 |
-
ORDER BY distance DESC
|
1691 |
LIMIT $3
|
1692 |
-
|
1693 |
-
|
1694 |
WITH relevant_chunks AS (
|
1695 |
-
SELECT id as chunk_id
|
1696 |
-
FROM LIGHTRAG_DOC_CHUNKS
|
1697 |
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
|
1698 |
)
|
1699 |
SELECT id FROM
|
1700 |
(
|
1701 |
SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
|
1702 |
-
FROM LIGHTRAG_DOC_CHUNKS
|
1703 |
where workspace=$1
|
1704 |
AND id IN (SELECT chunk_id FROM relevant_chunks)
|
1705 |
)
|
1706 |
-
WHERE distance>$2
|
1707 |
-
ORDER BY distance DESC
|
1708 |
LIMIT $3
|
1709 |
-
"""
|
1710 |
-
}
|
|
|
439 |
"content": item["content"],
|
440 |
"content_vector": json.dumps(item["__vector__"].tolist()),
|
441 |
"chunk_id": item["source_id"],
|
442 |
+
# TODO: add document_id
|
443 |
}
|
444 |
return upsert_sql, data
|
445 |
|
|
|
452 |
"target_id": item["tgt_id"],
|
453 |
"content": item["content"],
|
454 |
"content_vector": json.dumps(item["__vector__"].tolist()),
|
455 |
+
"chunk_id": item["source_id"],
|
456 |
+
# TODO: add document_id
|
457 |
}
|
458 |
return upsert_sql, data
|
459 |
|
|
|
496 |
await self.db.execute(upsert_sql, data)
|
497 |
|
498 |
#################### query method ###############
|
499 |
+
async def query(
|
500 |
+
self, query: str, top_k: int, ids: list[str] | None = None
|
501 |
+
) -> list[dict[str, Any]]:
|
502 |
embeddings = await self.embedding_func([query])
|
503 |
embedding = embeddings[0]
|
504 |
embedding_string = ",".join(map(str, embedding))
|
|
|
507 |
formatted_ids = ",".join(f"'{id}'" for id in ids)
|
508 |
else:
|
509 |
formatted_ids = "NULL"
|
510 |
+
|
511 |
sql = SQL_TEMPLATES[self.base_namespace].format(
|
512 |
+
embedding_string=embedding_string, doc_ids=formatted_ids
|
|
|
513 |
)
|
514 |
params = {
|
515 |
"workspace": self.db.workspace,
|
|
|
1599 |
content_vector=EXCLUDED.content_vector,
|
1600 |
update_time = CURRENT_TIMESTAMP
|
1601 |
""",
|
1602 |
+
"upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
|
1603 |
content_vector, chunk_id)
|
1604 |
VALUES ($1, $2, $3, $4, $5, $6)
|
1605 |
ON CONFLICT (workspace,id) DO UPDATE
|
|
|
1658 |
""",
|
1659 |
"relationships": """
|
1660 |
WITH relevant_chunks AS (
|
1661 |
+
SELECT id as chunk_id
|
1662 |
+
FROM LIGHTRAG_DOC_CHUNKS
|
1663 |
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
|
1664 |
)
|
1665 |
+
SELECT source_id as src_id, target_id as tgt_id
|
1666 |
FROM (
|
1667 |
SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
|
1668 |
FROM LIGHTRAG_VDB_RELATION r
|
1669 |
+
WHERE r.workspace=$1
|
1670 |
AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
|
1671 |
) filtered
|
1672 |
+
WHERE distance>$2
|
1673 |
+
ORDER BY distance DESC
|
1674 |
LIMIT $3
|
1675 |
""",
|
1676 |
+
"entities": """
|
|
|
1677 |
WITH relevant_chunks AS (
|
1678 |
+
SELECT id as chunk_id
|
1679 |
+
FROM LIGHTRAG_DOC_CHUNKS
|
1680 |
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
|
1681 |
)
|
1682 |
SELECT entity_name FROM
|
1683 |
(
|
1684 |
SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
|
1685 |
+
FROM LIGHTRAG_VDB_ENTITY
|
1686 |
where workspace=$1
|
1687 |
AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
|
1688 |
)
|
1689 |
+
WHERE distance>$2
|
1690 |
+
ORDER BY distance DESC
|
1691 |
LIMIT $3
|
1692 |
+
""",
|
1693 |
+
"chunks": """
|
1694 |
WITH relevant_chunks AS (
|
1695 |
+
SELECT id as chunk_id
|
1696 |
+
FROM LIGHTRAG_DOC_CHUNKS
|
1697 |
WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
|
1698 |
)
|
1699 |
SELECT id FROM
|
1700 |
(
|
1701 |
SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
|
1702 |
+
FROM LIGHTRAG_DOC_CHUNKS
|
1703 |
where workspace=$1
|
1704 |
AND id IN (SELECT chunk_id FROM relevant_chunks)
|
1705 |
)
|
1706 |
+
WHERE distance>$2
|
1707 |
+
ORDER BY distance DESC
|
1708 |
LIMIT $3
|
1709 |
+
""",
|
1710 |
+
}
|
lightrag/operate.py
CHANGED
@@ -893,7 +893,9 @@ async def mix_kg_vector_query(
|
|
893 |
# Reduce top_k for vector search in hybrid mode since we have structured information from KG
|
894 |
mix_topk = min(10, query_param.top_k)
|
895 |
# TODO: add ids to the query
|
896 |
-
results = await chunks_vdb.query(
|
|
|
|
|
897 |
if not results:
|
898 |
return None
|
899 |
|
@@ -1102,7 +1104,9 @@ async def _get_node_data(
|
|
1102 |
f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
|
1103 |
)
|
1104 |
|
1105 |
-
results = await entities_vdb.query(
|
|
|
|
|
1106 |
|
1107 |
if not len(results):
|
1108 |
return "", "", ""
|
@@ -1357,7 +1361,9 @@ async def _get_edge_data(
|
|
1357 |
f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
|
1358 |
)
|
1359 |
|
1360 |
-
results = await relationships_vdb.query(
|
|
|
|
|
1361 |
|
1362 |
if not len(results):
|
1363 |
return "", "", ""
|
@@ -1606,7 +1612,9 @@ async def naive_query(
|
|
1606 |
if cached_response is not None:
|
1607 |
return cached_response
|
1608 |
|
1609 |
-
results = await chunks_vdb.query(
|
|
|
|
|
1610 |
if not len(results):
|
1611 |
return PROMPTS["fail_response"]
|
1612 |
|
|
|
893 |
# Reduce top_k for vector search in hybrid mode since we have structured information from KG
|
894 |
mix_topk = min(10, query_param.top_k)
|
895 |
# TODO: add ids to the query
|
896 |
+
results = await chunks_vdb.query(
|
897 |
+
augmented_query, top_k=mix_topk, ids=query_param.ids
|
898 |
+
)
|
899 |
if not results:
|
900 |
return None
|
901 |
|
|
|
1104 |
f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
|
1105 |
)
|
1106 |
|
1107 |
+
results = await entities_vdb.query(
|
1108 |
+
query, top_k=query_param.top_k, ids=query_param.ids
|
1109 |
+
)
|
1110 |
|
1111 |
if not len(results):
|
1112 |
return "", "", ""
|
|
|
1361 |
f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
|
1362 |
)
|
1363 |
|
1364 |
+
results = await relationships_vdb.query(
|
1365 |
+
keywords, top_k=query_param.top_k, ids=query_param.ids
|
1366 |
+
)
|
1367 |
|
1368 |
if not len(results):
|
1369 |
return "", "", ""
|
|
|
1612 |
if cached_response is not None:
|
1613 |
return cached_response
|
1614 |
|
1615 |
+
results = await chunks_vdb.query(
|
1616 |
+
query, top_k=query_param.top_k, ids=query_param.ids
|
1617 |
+
)
|
1618 |
if not len(results):
|
1619 |
return PROMPTS["fail_response"]
|
1620 |
|
requirements.txt
CHANGED
@@ -1,53 +1,3 @@
|
|
1 |
-
aioboto3==14.1.0
|
2 |
-
aiofiles==24.1.0
|
3 |
-
aiohttp==3.11.13
|
4 |
-
ascii_colors==0.5.2
|
5 |
-
asyncpg==0.30.0
|
6 |
-
chromadb==0.6.3
|
7 |
-
community==1.0.0b1
|
8 |
-
docx==0.2.4
|
9 |
-
# faiss
|
10 |
-
fastapi==0.115.11
|
11 |
-
glm==0.4.4
|
12 |
-
graspologic==3.4.1
|
13 |
-
gunicorn==23.0.0
|
14 |
-
httpx==0.28.1
|
15 |
-
imgui_bundle==1.6.2
|
16 |
-
jsonlines==4.0.0
|
17 |
-
llama_index==0.12.22
|
18 |
-
moderngl==5.12.0
|
19 |
-
motor==3.7.0
|
20 |
-
nano_vectordb==0.0.4.3
|
21 |
-
neo4j==5.28.1
|
22 |
-
nest_asyncio==1.6.0
|
23 |
-
networkx==3.4.2
|
24 |
-
numpy
|
25 |
-
openpyxl==3.1.5
|
26 |
-
oracledb==3.0.0
|
27 |
-
Pillow==11.1.0
|
28 |
-
pipmaster==0.4.0
|
29 |
-
protobuf
|
30 |
-
psutil==7.0.0
|
31 |
-
psycopg==3.2.5
|
32 |
-
psycopg_pool==3.2.6
|
33 |
-
pydantic==2.10.6
|
34 |
-
pymilvus==2.5.4
|
35 |
-
pymongo==4.11.2
|
36 |
-
PyPDF2==3.0.1
|
37 |
-
python-dotenv==1.0.1
|
38 |
-
pyvis==0.3.2
|
39 |
-
qdrant_client==1.13.3
|
40 |
-
redis==5.2.1
|
41 |
-
Requests==2.32.3
|
42 |
-
sentence_transformers==3.4.1
|
43 |
-
setuptools==75.8.0
|
44 |
-
SQLAlchemy==2.0.38
|
45 |
-
starlette==0.46.0
|
46 |
-
tenacity==9.0.0
|
47 |
-
tiktoken==0.9.0
|
48 |
-
torch==2.6.0
|
49 |
-
transformers==4.49.0
|
50 |
-
uvicorn==0.34.0
|
51 |
aiohttp
|
52 |
configparser
|
53 |
future
|
@@ -63,3 +13,5 @@ tenacity
|
|
63 |
|
64 |
# LLM packages
|
65 |
tiktoken
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
aiohttp
|
2 |
configparser
|
3 |
future
|
|
|
13 |
|
14 |
# LLM packages
|
15 |
tiktoken
|
16 |
+
|
17 |
+
# Extra libraries are installed when needed using pipmaster
|