Roy committed on
Commit
ddd0cf7
·
1 Parent(s): 7c21f76

Refactor requirements and code formatting

Browse files

- Simplified requirements.txt by removing specific version constraints
- Added comment about extra library installation using pipmaster
- Improved code formatting in base.py, operate.py, and postgres_impl.py
- Cleaned up SQL templates and query method signatures with consistent formatting

lightrag/base.py CHANGED
@@ -108,8 +108,11 @@ class BaseVectorStorage(StorageNameSpace, ABC):
108
  embedding_func: EmbeddingFunc
109
  cosine_better_than_threshold: float = field(default=0.2)
110
  meta_fields: set[str] = field(default_factory=set)
 
111
  @abstractmethod
112
- async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]:
 
 
113
  """Query the vector storage and retrieve top_k results."""
114
 
115
  @abstractmethod
 
108
  embedding_func: EmbeddingFunc
109
  cosine_better_than_threshold: float = field(default=0.2)
110
  meta_fields: set[str] = field(default_factory=set)
111
+
112
  @abstractmethod
113
+ async def query(
114
+ self, query: str, top_k: int, ids: list[str] | None = None
115
+ ) -> list[dict[str, Any]]:
116
  """Query the vector storage and retrieve top_k results."""
117
 
118
  @abstractmethod
lightrag/kg/postgres_impl.py CHANGED
@@ -439,7 +439,7 @@ class PGVectorStorage(BaseVectorStorage):
439
  "content": item["content"],
440
  "content_vector": json.dumps(item["__vector__"].tolist()),
441
  "chunk_id": item["source_id"],
442
- #TODO: add document_id
443
  }
444
  return upsert_sql, data
445
 
@@ -452,8 +452,8 @@ class PGVectorStorage(BaseVectorStorage):
452
  "target_id": item["tgt_id"],
453
  "content": item["content"],
454
  "content_vector": json.dumps(item["__vector__"].tolist()),
455
- "chunk_id": item["source_id"]
456
- #TODO: add document_id
457
  }
458
  return upsert_sql, data
459
 
@@ -496,7 +496,9 @@ class PGVectorStorage(BaseVectorStorage):
496
  await self.db.execute(upsert_sql, data)
497
 
498
  #################### query method ###############
499
- async def query(self, query: str, top_k: int, ids: list[str] | None = None) -> list[dict[str, Any]]:
 
 
500
  embeddings = await self.embedding_func([query])
501
  embedding = embeddings[0]
502
  embedding_string = ",".join(map(str, embedding))
@@ -505,10 +507,9 @@ class PGVectorStorage(BaseVectorStorage):
505
  formatted_ids = ",".join(f"'{id}'" for id in ids)
506
  else:
507
  formatted_ids = "NULL"
508
-
509
  sql = SQL_TEMPLATES[self.base_namespace].format(
510
- embedding_string=embedding_string,
511
- doc_ids=formatted_ids
512
  )
513
  params = {
514
  "workspace": self.db.workspace,
@@ -1598,7 +1599,7 @@ SQL_TEMPLATES = {
1598
  content_vector=EXCLUDED.content_vector,
1599
  update_time = CURRENT_TIMESTAMP
1600
  """,
1601
- "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
1602
  content_vector, chunk_id)
1603
  VALUES ($1, $2, $3, $4, $5, $6)
1604
  ON CONFLICT (workspace,id) DO UPDATE
@@ -1657,54 +1658,53 @@ SQL_TEMPLATES = {
1657
  """,
1658
  "relationships": """
1659
  WITH relevant_chunks AS (
1660
- SELECT id as chunk_id
1661
- FROM LIGHTRAG_DOC_CHUNKS
1662
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1663
  )
1664
- SELECT source_id as src_id, target_id as tgt_id
1665
  FROM (
1666
  SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
1667
  FROM LIGHTRAG_VDB_RELATION r
1668
- WHERE r.workspace=$1
1669
  AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
1670
  ) filtered
1671
- WHERE distance>$2
1672
- ORDER BY distance DESC
1673
  LIMIT $3
1674
  """,
1675
- "entities":
1676
- '''
1677
  WITH relevant_chunks AS (
1678
- SELECT id as chunk_id
1679
- FROM LIGHTRAG_DOC_CHUNKS
1680
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1681
  )
1682
  SELECT entity_name FROM
1683
  (
1684
  SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
1685
- FROM LIGHTRAG_VDB_ENTITY
1686
  where workspace=$1
1687
  AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
1688
  )
1689
- WHERE distance>$2
1690
- ORDER BY distance DESC
1691
  LIMIT $3
1692
- ''',
1693
- 'chunks': """
1694
  WITH relevant_chunks AS (
1695
- SELECT id as chunk_id
1696
- FROM LIGHTRAG_DOC_CHUNKS
1697
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1698
  )
1699
  SELECT id FROM
1700
  (
1701
  SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
1702
- FROM LIGHTRAG_DOC_CHUNKS
1703
  where workspace=$1
1704
  AND id IN (SELECT chunk_id FROM relevant_chunks)
1705
  )
1706
- WHERE distance>$2
1707
- ORDER BY distance DESC
1708
  LIMIT $3
1709
- """
1710
- }
 
439
  "content": item["content"],
440
  "content_vector": json.dumps(item["__vector__"].tolist()),
441
  "chunk_id": item["source_id"],
442
+ # TODO: add document_id
443
  }
444
  return upsert_sql, data
445
 
 
452
  "target_id": item["tgt_id"],
453
  "content": item["content"],
454
  "content_vector": json.dumps(item["__vector__"].tolist()),
455
+ "chunk_id": item["source_id"],
456
+ # TODO: add document_id
457
  }
458
  return upsert_sql, data
459
 
 
496
  await self.db.execute(upsert_sql, data)
497
 
498
  #################### query method ###############
499
+ async def query(
500
+ self, query: str, top_k: int, ids: list[str] | None = None
501
+ ) -> list[dict[str, Any]]:
502
  embeddings = await self.embedding_func([query])
503
  embedding = embeddings[0]
504
  embedding_string = ",".join(map(str, embedding))
 
507
  formatted_ids = ",".join(f"'{id}'" for id in ids)
508
  else:
509
  formatted_ids = "NULL"
510
+
511
  sql = SQL_TEMPLATES[self.base_namespace].format(
512
+ embedding_string=embedding_string, doc_ids=formatted_ids
 
513
  )
514
  params = {
515
  "workspace": self.db.workspace,
 
1599
  content_vector=EXCLUDED.content_vector,
1600
  update_time = CURRENT_TIMESTAMP
1601
  """,
1602
+ "upsert_entity": """INSERT INTO LIGHTRAG_VDB_ENTITY (workspace, id, entity_name, content,
1603
  content_vector, chunk_id)
1604
  VALUES ($1, $2, $3, $4, $5, $6)
1605
  ON CONFLICT (workspace,id) DO UPDATE
 
1658
  """,
1659
  "relationships": """
1660
  WITH relevant_chunks AS (
1661
+ SELECT id as chunk_id
1662
+ FROM LIGHTRAG_DOC_CHUNKS
1663
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1664
  )
1665
+ SELECT source_id as src_id, target_id as tgt_id
1666
  FROM (
1667
  SELECT r.id, r.source_id, r.target_id, 1 - (r.content_vector <=> '[{embedding_string}]'::vector) as distance
1668
  FROM LIGHTRAG_VDB_RELATION r
1669
+ WHERE r.workspace=$1
1670
  AND r.chunk_id IN (SELECT chunk_id FROM relevant_chunks)
1671
  ) filtered
1672
+ WHERE distance>$2
1673
+ ORDER BY distance DESC
1674
  LIMIT $3
1675
  """,
1676
+ "entities": """
 
1677
  WITH relevant_chunks AS (
1678
+ SELECT id as chunk_id
1679
+ FROM LIGHTRAG_DOC_CHUNKS
1680
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1681
  )
1682
  SELECT entity_name FROM
1683
  (
1684
  SELECT id, entity_name, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
1685
+ FROM LIGHTRAG_VDB_ENTITY
1686
  where workspace=$1
1687
  AND chunk_id IN (SELECT chunk_id FROM relevant_chunks)
1688
  )
1689
+ WHERE distance>$2
1690
+ ORDER BY distance DESC
1691
  LIMIT $3
1692
+ """,
1693
+ "chunks": """
1694
  WITH relevant_chunks AS (
1695
+ SELECT id as chunk_id
1696
+ FROM LIGHTRAG_DOC_CHUNKS
1697
  WHERE {doc_ids} IS NULL OR full_doc_id = ANY(ARRAY[{doc_ids}])
1698
  )
1699
  SELECT id FROM
1700
  (
1701
  SELECT id, 1 - (content_vector <=> '[{embedding_string}]'::vector) as distance
1702
+ FROM LIGHTRAG_DOC_CHUNKS
1703
  where workspace=$1
1704
  AND id IN (SELECT chunk_id FROM relevant_chunks)
1705
  )
1706
+ WHERE distance>$2
1707
+ ORDER BY distance DESC
1708
  LIMIT $3
1709
+ """,
1710
+ }
lightrag/operate.py CHANGED
@@ -893,7 +893,9 @@ async def mix_kg_vector_query(
893
  # Reduce top_k for vector search in hybrid mode since we have structured information from KG
894
  mix_topk = min(10, query_param.top_k)
895
  # TODO: add ids to the query
896
- results = await chunks_vdb.query(augmented_query, top_k=mix_topk, ids = query_param.ids)
 
 
897
  if not results:
898
  return None
899
 
@@ -1102,7 +1104,9 @@ async def _get_node_data(
1102
  f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
1103
  )
1104
 
1105
- results = await entities_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids)
 
 
1106
 
1107
  if not len(results):
1108
  return "", "", ""
@@ -1357,7 +1361,9 @@ async def _get_edge_data(
1357
  f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
1358
  )
1359
 
1360
- results = await relationships_vdb.query(keywords, top_k = query_param.top_k, ids = query_param.ids)
 
 
1361
 
1362
  if not len(results):
1363
  return "", "", ""
@@ -1606,7 +1612,9 @@ async def naive_query(
1606
  if cached_response is not None:
1607
  return cached_response
1608
 
1609
- results = await chunks_vdb.query(query, top_k=query_param.top_k, ids = query_param.ids)
 
 
1610
  if not len(results):
1611
  return PROMPTS["fail_response"]
1612
 
 
893
  # Reduce top_k for vector search in hybrid mode since we have structured information from KG
894
  mix_topk = min(10, query_param.top_k)
895
  # TODO: add ids to the query
896
+ results = await chunks_vdb.query(
897
+ augmented_query, top_k=mix_topk, ids=query_param.ids
898
+ )
899
  if not results:
900
  return None
901
 
 
1104
  f"Query nodes: {query}, top_k: {query_param.top_k}, cosine: {entities_vdb.cosine_better_than_threshold}"
1105
  )
1106
 
1107
+ results = await entities_vdb.query(
1108
+ query, top_k=query_param.top_k, ids=query_param.ids
1109
+ )
1110
 
1111
  if not len(results):
1112
  return "", "", ""
 
1361
  f"Query edges: {keywords}, top_k: {query_param.top_k}, cosine: {relationships_vdb.cosine_better_than_threshold}"
1362
  )
1363
 
1364
+ results = await relationships_vdb.query(
1365
+ keywords, top_k=query_param.top_k, ids=query_param.ids
1366
+ )
1367
 
1368
  if not len(results):
1369
  return "", "", ""
 
1612
  if cached_response is not None:
1613
  return cached_response
1614
 
1615
+ results = await chunks_vdb.query(
1616
+ query, top_k=query_param.top_k, ids=query_param.ids
1617
+ )
1618
  if not len(results):
1619
  return PROMPTS["fail_response"]
1620
 
requirements.txt CHANGED
@@ -1,53 +1,3 @@
1
- aioboto3==14.1.0
2
- aiofiles==24.1.0
3
- aiohttp==3.11.13
4
- ascii_colors==0.5.2
5
- asyncpg==0.30.0
6
- chromadb==0.6.3
7
- community==1.0.0b1
8
- docx==0.2.4
9
- # faiss
10
- fastapi==0.115.11
11
- glm==0.4.4
12
- graspologic==3.4.1
13
- gunicorn==23.0.0
14
- httpx==0.28.1
15
- imgui_bundle==1.6.2
16
- jsonlines==4.0.0
17
- llama_index==0.12.22
18
- moderngl==5.12.0
19
- motor==3.7.0
20
- nano_vectordb==0.0.4.3
21
- neo4j==5.28.1
22
- nest_asyncio==1.6.0
23
- networkx==3.4.2
24
- numpy
25
- openpyxl==3.1.5
26
- oracledb==3.0.0
27
- Pillow==11.1.0
28
- pipmaster==0.4.0
29
- protobuf
30
- psutil==7.0.0
31
- psycopg==3.2.5
32
- psycopg_pool==3.2.6
33
- pydantic==2.10.6
34
- pymilvus==2.5.4
35
- pymongo==4.11.2
36
- PyPDF2==3.0.1
37
- python-dotenv==1.0.1
38
- pyvis==0.3.2
39
- qdrant_client==1.13.3
40
- redis==5.2.1
41
- Requests==2.32.3
42
- sentence_transformers==3.4.1
43
- setuptools==75.8.0
44
- SQLAlchemy==2.0.38
45
- starlette==0.46.0
46
- tenacity==9.0.0
47
- tiktoken==0.9.0
48
- torch==2.6.0
49
- transformers==4.49.0
50
- uvicorn==0.34.0
51
  aiohttp
52
  configparser
53
  future
@@ -63,3 +13,5 @@ tenacity
63
 
64
  # LLM packages
65
  tiktoken
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  aiohttp
2
  configparser
3
  future
 
13
 
14
  # LLM packages
15
  tiktoken
16
+
17
+ # Extra libraries are installed when needed using pipmaster