Roy committed on
Commit
ddd0cf7
·
1 Parent(s): 7c21f76

Refactor requirements and code formatting

Browse files

- Simplified requirements.txt by removing specific version constraints
- Added comment about extra library installation using pipmaster
- Improved code formatting in base.py, operate.py, and postgres_impl.py
- Cleaned up SQL templates and query method signatures with consistent formatting

lightrag/base.py CHANGED
@@ -108,8 +108,11 @@ class BaseVectorStorage(StorageNameSpace, ABC):
108
  embedding_func: EmbeddingFunc
109
  cosine_better_than_threshold: float = field(default=0.2)
110
  meta_fields: set[str] = field(default_factory=set)
 
111
  @abstractmethod
112
- async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]:
 
 
113
  """Query the vector storage and retrieve top_k results."""
114
 
115
  @abstractmethod
 
108
  embedding_func: EmbeddingFunc
109
  cosine_better_than_threshold: float = field(default=0.2)
110
  meta_fields: set[str] = field(default_factory=set)
111
+
112
  @abstractmethod
113
+ async def query(
114
+ self, query: str, top_k: int, ids: list[str] | None = None
115
+ ) -> list[dict[str, Any]]:
116
  """Query the vector storage and retrieve top_k results."""
117
 
118
  @abstractmethod
lightrag/kg/postgres_impl.py CHANGED
@@ -439,7 +439,7 @@ class PGVectorStorage(BaseVectorStorage):
439
  "content": item["content"],
440
  "content_vector": json.dumps(item["__vector__"].tolist()),
441
  "chunk_id": item["source_id"],
442
- #TODO: add document_id
443
  }
444
  return upsert_sql, data
445
 
@@ -452,8 +452,8 @@ class PGVectorStorage(BaseVectorStorage):
452
  "target_id": item["tgt_id"],
453
  "content": item["content"],
454
  "content_vector": json.dumps(item["__vector__"].tolist()),
455
- "chunk_id": item["source_id"]
456
- #TODO: add document_id
457
  }
458
  return upsert_sql, data
459
 
@@ -496,7 +496,9 @@ class PGVectorStorage(BaseVectorStorage):
496
  await self.db.execute(upsert_sql, data)
497
 
498
  #################### query method ###############
499
- async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]:
 
 
500
  embeddings = await self.embedding_func([query])
501
  embedding = embeddings[0]
502
  embedding_string = ",".join(map(str, embedding))
@@ -505,10 +507,9 @@ class PGVectorStorage(BaseVectorStorage):
505
  formatted_ids = ",".join(f"'{id}'" for id in ids)
506
  else:
507
  formatted_ids = "NULL"
508
-
509
  sql = SQL_TEMPLATES[self.base_namespace].format(
510
- embedding_string=embedding_string,
511
- doc_ids=formatted_ids
512
  )
513
  params = {
514
  "workspace": self.db.workspace,
@@ -1598,7 +1599,7 @@ SQL_TEMPLATES = {
1598
  content_vector=EXCLUDED.content_vector,
1599
  update_time = CURRENT_TIMESTAMP
1600
  """,
1601
- "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
1602
  content_vector, chunk_id)
1603
  VALUES ($1, $2, $3, $4, $5, $6)
1604
  ON CONFLICT (workspace,id) DO UPDATE
@@ -1657,54 +1658,53 @@ SQL_TEMPLATES = {
1657
  """,
1658
  "relationships": """
1659
  WITH relevant_chunks AS (
1660
- SELECT id as chunk_id
1661
- FROM LIGHTRAG_DOC_CHUNKS
1662
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1663
  )
1664
- SELECT source_id as src_id, target_id as tgt_id
1665
  FROM (
1666
  SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
1667
  FROM LIGHTRAG_VDB_RELATION r
1668
- WHERE r.workspace=$1
1669
  AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
1670
  ) filtered
1671
- WHERE distance>$2
1672
- ORDER BY distance DESC
1673
  LIMIT $3
1674
  """,
1675
- "entities":
1676
- '''
1677
  WITH relevant_chunks AS (
1678
- SELECT id as chunk_id
1679
- FROM LIGHTRAG_DOC_CHUNKS
1680
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1681
  )
1682
  SELECT entity_name FROM
1683
  (
1684
  SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
1685
- FROM LIGHTRAG_VDB_ENTITY
1686
  where workspace=$1
1687
  AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
1688
  )
1689
- WHERE distance>$2
1690
- ORDER BY distance DESC
1691
  LIMIT $3
1692
- ''',
1693
- 'chunks': """
1694
  WITH relevant_chunks AS (
1695
- SELECT id as chunk_id
1696
- FROM LIGHTRAG_DOC_CHUNKS
1697
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1698
  )
1699
  SELECT id FROM
1700
  (
1701
  SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
1702
- FROM LIGHTRAG_DOC_CHUNKS
1703
  where workspace=$1
1704
  AND id IN (SELECT chunk_id FROM relevant_chunks)
1705
  )
1706
- WHERE distance>$2
1707
- ORDER BY distance DESC
1708
  LIMIT $3
1709
- """
1710
- }
 
439
  "content": item["content"],
440
  "content_vector": json.dumps(item["__vector__"].tolist()),
441
  "chunk_id": item["source_id"],
442
+ # TODO: add document_id
443
  }
444
  return upsert_sql, data
445
 
 
452
  "target_id": item["tgt_id"],
453
  "content": item["content"],
454
  "content_vector": json.dumps(item["__vector__"].tolist()),
455
+ "chunk_id": item["source_id"],
456
+ # TODO: add document_id
457
  }
458
  return upsert_sql, data
459
 
 
496
  await self.db.execute(upsert_sql, data)
497
 
498
  #################### query method ###############
499
+ async def query(
500
+ self, query: str, top_k: int, ids: list[str] | None = None
501
+ ) -> list[dict[str, Any]]:
502
  embeddings = await self.embedding_func([query])
503
  embedding = embeddings[0]
504
  embedding_string = ",".join(map(str, embedding))
 
507
  formatted_ids = ",".join(f"'{id}'" for id in ids)
508
  else:
509
  formatted_ids = "NULL"
510
+
511
  sql = SQL_TEMPLATES[self.base_namespace].format(
512
+ embedding_string=embedding_string, doc_ids=formatted_ids
 
513
  )
514
  params = {
515
  "workspace": self.db.workspace,
 
1599
  content_vector=EXCLUDED.content_vector,
1600
  update_time = CURRENT_TIMESTAMP
1601
  """,
1602
+ "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
1603
  content_vector, chunk_id)
1604
  VALUES ($1, $2, $3, $4, $5, $6)
1605
  ON CONFLICT (workspace,id) DO UPDATE
 
1658
  """,
1659
  "relationships": """
1660
  WITH relevant_chunks AS (
1661
+ SELECT id as chunk_id
1662
+ FROM LIGHTRAG_DOC_CHUNKS
1663
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1664
  )
1665
+ SELECT source_id as src_id, target_id as tgt_id
1666
  FROM (
1667
  SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
1668
  FROM LIGHTRAG_VDB_RELATION r
1669
+ WHERE r.workspace=$1
1670
  AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
1671
  ) filtered
1672
+ WHERE distance>$2
1673
+ ORDER BY distance DESC
1674
  LIMIT $3
1675
  """,
1676
+ "entities": """
 
1677
  WITH relevant_chunks AS (
1678
+ SELECT id as chunk_id
1679
+ FROM LIGHTRAG_DOC_CHUNKS
1680
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1681
  )
1682
  SELECT entity_name FROM
1683
  (
1684
  SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
1685
+ FROM LIGHTRAG_VDB_ENTITY
1686
  where workspace=$1
1687
  AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
1688
  )
1689
+ WHERE distance>$2
1690
+ ORDER BY distance DESC
1691
  LIMIT $3
1692
+ """,
1693
+ "chunks": """
1694
  WITH relevant_chunks AS (
1695
+ SELECT id as chunk_id
1696
+ FROM LIGHTRAG_DOC_CHUNKS
1697
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1698
  )
1699
  SELECT id FROM
1700
  (
1701
  SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
1702
+ FROM LIGHTRAG_DOC_CHUNKS
1703
  where workspace=$1
1704
  AND id IN (SELECT chunk_id FROM relevant_chunks)
1705
  )
1706
+ WHERE distance>$2
1707
+ ORDER BY distance DESC
1708
  LIMIT $3
1709
+ """,
1710
+ }
lightrag/operate.py CHANGED
@@ -893,7 +893,9 @@ async def mix_kg_vector_query(
893
  # Reduce top_k for vector search in hybrid mode since we have structured information from KG
894
  mix_topk = min(10, query_param.top_k)
895
  # TODO: add ids to the query
896
- results = await chunks_vdb.query(augmented_query, top_k=mix_topk, ids = query_param.ids)
 
 
897
  if not results:
898
  return None
899
 
@@ -1102,7 +1104,9 @@ async def _get_node_data(
1102
  f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
1103
  )
1104
 
1105
- results = await entities_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids)
 
 
1106
 
1107
  if not len(results):
1108
  return "", "", ""
@@ -1357,7 +1361,9 @@ async def _get_edge_data(
1357
  f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
1358
  )
1359
 
1360
- results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = query_param.ids)
 
 
1361
 
1362
  if not len(results):
1363
  return "", "", ""
@@ -1606,7 +1612,9 @@ async def naive_query(
1606
  if cached_response is not None:
1607
  return cached_response
1608
 
1609
- results = await chunks_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids)
 
 
1610
  if not len(results):
1611
  return PROMPTS["fail_response"]
1612
 
 
893
  # Reduce top_k for vector search in hybrid mode since we have structured information from KG
894
  mix_topk = min(10, query_param.top_k)
895
  # TODO: add ids to the query
896
+ results = await chunks_vdb.query(
897
+ augmented_query, top_k=mix_topk, ids=query_param.ids
898
+ )
899
  if not results:
900
  return None
901
 
 
1104
  f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
1105
  )
1106
 
1107
+ results = await entities_vdb.query(
1108
+ query, top_k=query_param.top_k, ids=query_param.ids
1109
+ )
1110
 
1111
  if not len(results):
1112
  return "", "", ""
 
1361
  f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
1362
  )
1363
 
1364
+ results = await relationships_vdb.query(
1365
+ keywords, top_k=query_param.top_k, ids=query_param.ids
1366
+ )
1367
 
1368
  if not len(results):
1369
  return "", "", ""
 
1612
  if cached_response is not None:
1613
  return cached_response
1614
 
1615
+ results = await chunks_vdb.query(
1616
+ query, top_k=query_param.top_k, ids=query_param.ids
1617
+ )
1618
  if not len(results):
1619
  return PROMPTS["fail_response"]
1620
 
requirements.txt CHANGED
@@ -1,53 +1,3 @@
1
- aioboto3==14.1.0
2
- aiofiles==24.1.0
3
- aiohttp==3.11.13
4
- ascii_colors==0.5.2
5
- asyncpg==0.30.0
6
- chromadb==0.6.3
7
- community==1.0.0b1
8
- docx==0.2.4
9
- # faiss
10
- fastapi==0.115.11
11
- glm==0.4.4
12
- graspologic==3.4.1
13
- gunicorn==23.0.0
14
- httpx==0.28.1
15
- imgui_bundle==1.6.2
16
- jsonlines==4.0.0
17
- llama_index==0.12.22
18
- moderngl==5.12.0
19
- motor==3.7.0
20
- nano_vectordb==0.0.4.3
21
- neo4j==5.28.1
22
- nest_asyncio==1.6.0
23
- networkx==3.4.2
24
- numpy
25
- openpyxl==3.1.5
26
- oracledb==3.0.0
27
- Pillow==11.1.0
28
- pipmaster==0.4.0
29
- protobuf
30
- psutil==7.0.0
31
- psycopg==3.2.5
32
- psycopg_pool==3.2.6
33
- pydantic==2.10.6
34
- pymilvus==2.5.4
35
- pymongo==4.11.2
36
- PyPDF2==3.0.1
37
- python-dotenv==1.0.1
38
- pyvis==0.3.2
39
- qdrant_client==1.13.3
40
- redis==5.2.1
41
- Requests==2.32.3
42
- sentence_transformers==3.4.1
43
- setuptools==75.8.0
44
- SQLAlchemy==2.0.38
45
- starlette==0.46.0
46
- tenacity==9.0.0
47
- tiktoken==0.9.0
48
- torch==2.6.0
49
- transformers==4.49.0
50
- uvicorn==0.34.0
51
  aiohttp
52
  configparser
53
  future
@@ -63,3 +13,5 @@ tenacity
63
 
64
  # LLM packages
65
  tiktoken
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  aiohttp
2
  configparser
3
  future
 
13
 
14
  # LLM packages
15
  tiktoken
16
+
17
+ # Extra libraries are installed when needed using pipmaster