Merge branch 'main' into add-Memgraph-graph-db
Browse files
- README-zh.md +2 -0
- README.md +2 -0
- env.example +18 -1
- lightrag/kg/{age_impl.py → deprecated/age_impl.py} +0 -0
- lightrag/kg/faiss_impl.py +4 -10
- lightrag/operate.py +21 -0
- pyproject.toml +93 -0
- setup.py +4 -105
README-zh.md
CHANGED
@@ -757,6 +757,8 @@ async def initialize_rag():
|
|
757 |
|
758 |
<details>
|
759 |
<summary> <b>使用Faiss进行存储</b> </summary>
|
|
|
|
|
760 |
|
761 |
- 安装所需依赖:
|
762 |
|
|
|
757 |
|
758 |
<details>
|
759 |
<summary> <b>使用Faiss进行存储</b> </summary>
|
760 |
+
在使用Faiss向量数据库之前必须手工安装`faiss-cpu`或`faiss-gpu`。
|
761 |
+
|
762 |
|
763 |
- 安装所需依赖:
|
764 |
|
README.md
CHANGED
@@ -819,6 +819,8 @@ For production level scenarios you will most likely want to leverage an enterpri
|
|
819 |
|
820 |
<details>
|
821 |
<summary> <b>Using Faiss for Storage</b> </summary>
|
|
|
|
|
822 |
|
823 |
- Install the required dependencies:
|
824 |
|
|
|
819 |
|
820 |
<details>
|
821 |
<summary> <b>Using Faiss for Storage</b> </summary>
|
822 |
+
You must manually install faiss-cpu or faiss-gpu before using FAISS vector db.
|
823 |
+
Manually install `faiss-cpu` or `faiss-gpu` before using FAISS vector db.
|
824 |
|
825 |
- Install the required dependencies:
|
826 |
|
env.example
CHANGED
@@ -108,11 +108,28 @@ EMBEDDING_BINDING_HOST=http://localhost:11434
|
|
108 |
# AZURE_EMBEDDING_ENDPOINT=your_endpoint
|
109 |
# AZURE_EMBEDDING_API_KEY=your_api_key
|
110 |
|
|
|
111 |
### Data storage selection
|
|
|
|
|
112 |
# LIGHTRAG_KV_STORAGE=PGKVStorage
|
113 |
-
# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
|
114 |
# LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
# LIGHTRAG_GRAPH_STORAGE=Neo4JStorage
|
|
|
116 |
|
117 |
### PostgreSQL Configuration
|
118 |
POSTGRES_HOST=localhost
|
|
|
108 |
# AZURE_EMBEDDING_ENDPOINT=your_endpoint
|
109 |
# AZURE_EMBEDDING_API_KEY=your_api_key
|
110 |
|
111 |
+
###########################
|
112 |
### Data storage selection
|
113 |
+
###########################
|
114 |
+
### PostgreSQL
|
115 |
# LIGHTRAG_KV_STORAGE=PGKVStorage
|
|
|
116 |
# LIGHTRAG_DOC_STATUS_STORAGE=PGDocStatusStorage
|
117 |
+
# LIGHTRAG_VECTOR_STORAGE=PGVectorStorage
|
118 |
+
# LIGHTRAG_GRAPH_STORAGE=PGGraphStorage
|
119 |
+
### MongoDB
|
120 |
+
# LIGHTRAG_KV_STORAGE=MongoKVStorage
|
121 |
+
# LIGHTRAG_DOC_STATUS_STORAGE=MongoDocStatusStorage
|
122 |
+
# LIGHTRAG_VECTOR_STORAGE=MongoVectorDBStorage
|
123 |
+
# LIGHTRAG_GRAPH_STORAGE=MongoGraphStorage
|
124 |
+
### KV Storage
|
125 |
+
# LIGHTRAG_KV_STORAGE=RedisKVStorage
|
126 |
+
# LIGHTRAG_DOC_STATUS_STORAGE=RedisDocStatusStorage
|
127 |
+
### Vector Storage
|
128 |
+
# LIGHTRAG_VECTOR_STORAGE=FaissVectorDBStorage
|
129 |
+
# LIGHTRAG_VECTOR_STORAGE=MilvusVectorDBStorage
|
130 |
+
### Graph Storage
|
131 |
# LIGHTRAG_GRAPH_STORAGE=Neo4JStorage
|
132 |
+
# LIGHTRAG_GRAPH_STORAGE=MemgraphStorage
|
133 |
|
134 |
### PostgreSQL Configuration
|
135 |
POSTGRES_HOST=localhost
|
lightrag/kg/{age_impl.py → deprecated/age_impl.py}
RENAMED
File without changes
|
lightrag/kg/faiss_impl.py
CHANGED
@@ -4,9 +4,7 @@ import asyncio
|
|
4 |
from typing import Any, final
|
5 |
import json
|
6 |
import numpy as np
|
7 |
-
|
8 |
from dataclasses import dataclass
|
9 |
-
import pipmaster as pm
|
10 |
|
11 |
from lightrag.utils import logger, compute_mdhash_id
|
12 |
from lightrag.base import BaseVectorStorage
|
@@ -17,11 +15,7 @@ from .shared_storage import (
|
|
17 |
set_all_update_flags,
|
18 |
)
|
19 |
|
20 |
-
|
21 |
-
FAISS_PACKAGE = "faiss-gpu" if USE_GPU else "faiss-cpu"
|
22 |
-
if not pm.is_installed(FAISS_PACKAGE):
|
23 |
-
pm.install(FAISS_PACKAGE)
|
24 |
-
|
25 |
import faiss # type: ignore
|
26 |
|
27 |
|
@@ -165,7 +159,7 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
|
165 |
meta["__vector__"] = embeddings[i].tolist()
|
166 |
self._id_to_meta.update({fid: meta})
|
167 |
|
168 |
-
logger.
|
169 |
return [m["__id__"] for m in list_data]
|
170 |
|
171 |
async def query(
|
@@ -228,7 +222,7 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
|
228 |
2. Only one process should be updating the storage at a time before index_done_callback,
|
229 |
KG-storage-log should be used to avoid data corruption
|
230 |
"""
|
231 |
-
logger.
|
232 |
to_remove = []
|
233 |
for cid in ids:
|
234 |
fid = self._find_faiss_id_by_custom_id(cid)
|
@@ -330,7 +324,7 @@ class FaissVectorDBStorage(BaseVectorStorage):
|
|
330 |
and rebuild in-memory structures so we can query.
|
331 |
"""
|
332 |
if not os.path.exists(self._faiss_index_file):
|
333 |
-
logger.warning("No existing Faiss index file found
|
334 |
return
|
335 |
|
336 |
try:
|
|
|
4 |
from typing import Any, final
|
5 |
import json
|
6 |
import numpy as np
|
|
|
7 |
from dataclasses import dataclass
|
|
|
8 |
|
9 |
from lightrag.utils import logger, compute_mdhash_id
|
10 |
from lightrag.base import BaseVectorStorage
|
|
|
15 |
set_all_update_flags,
|
16 |
)
|
17 |
|
18 |
+
# You must manually install faiss-cpu or faiss-gpu before using FAISS vector db
|
|
|
|
|
|
|
|
|
19 |
import faiss # type: ignore
|
20 |
|
21 |
|
|
|
159 |
meta["__vector__"] = embeddings[i].tolist()
|
160 |
self._id_to_meta.update({fid: meta})
|
161 |
|
162 |
+
logger.debug(f"Upserted {len(list_data)} vectors into Faiss index.")
|
163 |
return [m["__id__"] for m in list_data]
|
164 |
|
165 |
async def query(
|
|
|
222 |
2. Only one process should be updating the storage at a time before index_done_callback,
|
223 |
KG-storage-log should be used to avoid data corruption
|
224 |
"""
|
225 |
+
logger.debug(f"Deleting {len(ids)} vectors from {self.namespace}")
|
226 |
to_remove = []
|
227 |
for cid in ids:
|
228 |
fid = self._find_faiss_id_by_custom_id(cid)
|
|
|
324 |
and rebuild in-memory structures so we can query.
|
325 |
"""
|
326 |
if not os.path.exists(self._faiss_index_file):
|
327 |
+
logger.warning(f"No existing Faiss index file found for {self.namespace}")
|
328 |
return
|
329 |
|
330 |
try:
|
lightrag/operate.py
CHANGED
@@ -168,6 +168,13 @@ async def _handle_single_entity_extraction(
|
|
168 |
# Normalize entity name
|
169 |
entity_name = normalize_extracted_info(entity_name, is_entity=True)
|
170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
# Clean and validate entity type
|
172 |
entity_type = clean_str(record_attributes[2]).strip('"')
|
173 |
if not entity_type.strip() or entity_type.startswith('("'):
|
@@ -209,6 +216,20 @@ async def _handle_single_relationship_extraction(
|
|
209 |
# Normalize source and target entity names
|
210 |
source = normalize_extracted_info(source, is_entity=True)
|
211 |
target = normalize_extracted_info(target, is_entity=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
if source == target:
|
213 |
logger.debug(
|
214 |
f"Relationship source and target are the same in: {record_attributes}"
|
|
|
168 |
# Normalize entity name
|
169 |
entity_name = normalize_extracted_info(entity_name, is_entity=True)
|
170 |
|
171 |
+
# Check if entity name became empty after normalization
|
172 |
+
if not entity_name or not entity_name.strip():
|
173 |
+
logger.warning(
|
174 |
+
f"Entity extraction error: entity name became empty after normalization. Original: '{record_attributes[1]}'"
|
175 |
+
)
|
176 |
+
return None
|
177 |
+
|
178 |
# Clean and validate entity type
|
179 |
entity_type = clean_str(record_attributes[2]).strip('"')
|
180 |
if not entity_type.strip() or entity_type.startswith('("'):
|
|
|
216 |
# Normalize source and target entity names
|
217 |
source = normalize_extracted_info(source, is_entity=True)
|
218 |
target = normalize_extracted_info(target, is_entity=True)
|
219 |
+
|
220 |
+
# Check if source or target became empty after normalization
|
221 |
+
if not source or not source.strip():
|
222 |
+
logger.warning(
|
223 |
+
f"Relationship extraction error: source entity became empty after normalization. Original: '{record_attributes[1]}'"
|
224 |
+
)
|
225 |
+
return None
|
226 |
+
|
227 |
+
if not target or not target.strip():
|
228 |
+
logger.warning(
|
229 |
+
f"Relationship extraction error: target entity became empty after normalization. Original: '{record_attributes[2]}'"
|
230 |
+
)
|
231 |
+
return None
|
232 |
+
|
233 |
if source == target:
|
234 |
logger.debug(
|
235 |
f"Relationship source and target are the same in: {record_attributes}"
|
pyproject.toml
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = ["setuptools>=64", "wheel"]
|
3 |
+
build-backend = "setuptools.build_meta"
|
4 |
+
|
5 |
+
[project]
|
6 |
+
name = "lightrag-hku"
|
7 |
+
dynamic = ["version"]
|
8 |
+
authors = [
|
9 |
+
{name = "Zirui Guo"}
|
10 |
+
]
|
11 |
+
description = "LightRAG: Simple and Fast Retrieval-Augmented Generation"
|
12 |
+
readme = "README.md"
|
13 |
+
license = {text = "MIT"}
|
14 |
+
requires-python = ">=3.9"
|
15 |
+
classifiers = [
|
16 |
+
"Development Status :: 4 - Beta",
|
17 |
+
"Programming Language :: Python :: 3",
|
18 |
+
"License :: OSI Approved :: MIT License",
|
19 |
+
"Operating System :: OS Independent",
|
20 |
+
"Intended Audience :: Developers",
|
21 |
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
22 |
+
]
|
23 |
+
dependencies = [
|
24 |
+
"aiohttp",
|
25 |
+
"configparser",
|
26 |
+
"dotenv",
|
27 |
+
"future",
|
28 |
+
"numpy",
|
29 |
+
"pandas>=2.0.0",
|
30 |
+
"pipmaster",
|
31 |
+
"pydantic",
|
32 |
+
"python-dotenv",
|
33 |
+
"pyuca",
|
34 |
+
"setuptools",
|
35 |
+
"tenacity",
|
36 |
+
"tiktoken",
|
37 |
+
"xlsxwriter>=3.1.0",
|
38 |
+
]
|
39 |
+
|
40 |
+
[project.optional-dependencies]
|
41 |
+
api = [
|
42 |
+
# Core dependencies
|
43 |
+
"aiohttp",
|
44 |
+
"configparser",
|
45 |
+
"dotenv",
|
46 |
+
"future",
|
47 |
+
"numpy",
|
48 |
+
"openai",
|
49 |
+
"pandas>=2.0.0",
|
50 |
+
"pipmaster",
|
51 |
+
"pydantic",
|
52 |
+
"python-dotenv",
|
53 |
+
"pyuca",
|
54 |
+
"setuptools",
|
55 |
+
"tenacity",
|
56 |
+
"tiktoken",
|
57 |
+
"xlsxwriter>=3.1.0",
|
58 |
+
# API-specific dependencies
|
59 |
+
"aiofiles",
|
60 |
+
"ascii_colors",
|
61 |
+
"asyncpg",
|
62 |
+
"distro",
|
63 |
+
"fastapi",
|
64 |
+
"httpcore",
|
65 |
+
"httpx",
|
66 |
+
"jiter",
|
67 |
+
"passlib[bcrypt]",
|
68 |
+
"PyJWT",
|
69 |
+
"python-jose[cryptography]",
|
70 |
+
"python-multipart",
|
71 |
+
"pytz",
|
72 |
+
"uvicorn",
|
73 |
+
]
|
74 |
+
|
75 |
+
[project.scripts]
|
76 |
+
lightrag-server = "lightrag.api.lightrag_server:main"
|
77 |
+
lightrag-gunicorn = "lightrag.api.run_with_gunicorn:main"
|
78 |
+
|
79 |
+
[project.urls]
|
80 |
+
Homepage = "https://github.com/HKUDS/LightRAG"
|
81 |
+
Documentation = "https://github.com/HKUDS/LightRAG"
|
82 |
+
Repository = "https://github.com/HKUDS/LightRAG"
|
83 |
+
"Bug Tracker" = "https://github.com/HKUDS/LightRAG/issues"
|
84 |
+
|
85 |
+
[tool.setuptools]
|
86 |
+
packages = ["lightrag"]
|
87 |
+
include-package-data = true
|
88 |
+
|
89 |
+
[tool.setuptools.dynamic]
|
90 |
+
version = {attr = "lightrag.__version__"}
|
91 |
+
|
92 |
+
[tool.setuptools.package-data]
|
93 |
+
lightrag = ["api/webui/**/*"]
|
setup.py
CHANGED
@@ -1,107 +1,6 @@
|
|
1 |
-
|
2 |
-
|
3 |
|
|
|
4 |
|
5 |
-
|
6 |
-
def read_long_description():
|
7 |
-
try:
|
8 |
-
return Path("README.md").read_text(encoding="utf-8")
|
9 |
-
except FileNotFoundError:
|
10 |
-
return "A description of LightRAG is currently unavailable."
|
11 |
-
|
12 |
-
|
13 |
-
# Retrieving metadata from __init__.py
|
14 |
-
def retrieve_metadata():
|
15 |
-
vars2find = ["__author__", "__version__", "__url__"]
|
16 |
-
vars2readme = {}
|
17 |
-
try:
|
18 |
-
with open("./lightrag/__init__.py") as f:
|
19 |
-
for line in f.readlines():
|
20 |
-
for v in vars2find:
|
21 |
-
if line.startswith(v):
|
22 |
-
line = (
|
23 |
-
line.replace(" ", "")
|
24 |
-
.replace('"', "")
|
25 |
-
.replace("'", "")
|
26 |
-
.strip()
|
27 |
-
)
|
28 |
-
vars2readme[v] = line.split("=")[1]
|
29 |
-
except FileNotFoundError:
|
30 |
-
raise FileNotFoundError("Metadata file './lightrag/__init__.py' not found.")
|
31 |
-
|
32 |
-
# Checking if all required variables are found
|
33 |
-
missing_vars = [v for v in vars2find if v not in vars2readme]
|
34 |
-
if missing_vars:
|
35 |
-
raise ValueError(
|
36 |
-
f"Missing required metadata variables in __init__.py: {missing_vars}"
|
37 |
-
)
|
38 |
-
|
39 |
-
return vars2readme
|
40 |
-
|
41 |
-
|
42 |
-
# Reading dependencies from requirements.txt
|
43 |
-
def read_requirements(file_path="requirements.txt"):
|
44 |
-
deps = []
|
45 |
-
try:
|
46 |
-
with open(file_path) as f:
|
47 |
-
deps = [
|
48 |
-
line.strip() for line in f if line.strip() and not line.startswith("#")
|
49 |
-
]
|
50 |
-
except FileNotFoundError:
|
51 |
-
print(f"Warning: '{file_path}' not found. No dependencies will be installed.")
|
52 |
-
return deps
|
53 |
-
|
54 |
-
|
55 |
-
def read_api_requirements():
|
56 |
-
return read_requirements("lightrag/api/requirements.txt")
|
57 |
-
|
58 |
-
|
59 |
-
def read_extra_requirements():
|
60 |
-
return read_requirements("lightrag/tools/lightrag_visualizer/requirements.txt")
|
61 |
-
|
62 |
-
|
63 |
-
metadata = retrieve_metadata()
|
64 |
-
long_description = read_long_description()
|
65 |
-
requirements = read_requirements()
|
66 |
-
|
67 |
-
setuptools.setup(
|
68 |
-
name="lightrag-hku",
|
69 |
-
url=metadata["__url__"],
|
70 |
-
version=metadata["__version__"],
|
71 |
-
author=metadata["__author__"],
|
72 |
-
description="LightRAG: Simple and Fast Retrieval-Augmented Generation",
|
73 |
-
long_description=long_description,
|
74 |
-
long_description_content_type="text/markdown",
|
75 |
-
packages=setuptools.find_packages(
|
76 |
-
exclude=("tests*", "docs*")
|
77 |
-
), # Automatically find packages
|
78 |
-
classifiers=[
|
79 |
-
"Development Status :: 4 - Beta",
|
80 |
-
"Programming Language :: Python :: 3",
|
81 |
-
"License :: OSI Approved :: MIT License",
|
82 |
-
"Operating System :: OS Independent",
|
83 |
-
"Intended Audience :: Developers",
|
84 |
-
"Topic :: Software Development :: Libraries :: Python Modules",
|
85 |
-
],
|
86 |
-
python_requires=">=3.9",
|
87 |
-
install_requires=requirements,
|
88 |
-
include_package_data=True, # Includes non-code files from MANIFEST.in
|
89 |
-
project_urls={ # Additional project metadata
|
90 |
-
"Documentation": metadata.get("__url__", ""),
|
91 |
-
"Source": metadata.get("__url__", ""),
|
92 |
-
"Tracker": f"{metadata.get('__url__', '')}/issues"
|
93 |
-
if metadata.get("__url__")
|
94 |
-
else "",
|
95 |
-
},
|
96 |
-
extras_require={
|
97 |
-
"api": requirements + read_api_requirements(),
|
98 |
-
"tools": read_extra_requirements(), # API requirements as optional
|
99 |
-
},
|
100 |
-
entry_points={
|
101 |
-
"console_scripts": [
|
102 |
-
"lightrag-server=lightrag.api.lightrag_server:main [api]",
|
103 |
-
"lightrag-gunicorn=lightrag.api.run_with_gunicorn:main [api]",
|
104 |
-
"lightrag-viewer=lightrag.tools.lightrag_visualizer.graph_visualizer:main [tools]",
|
105 |
-
],
|
106 |
-
},
|
107 |
-
)
|
|
|
1 |
+
# Minimal setup.py for backward compatibility
|
2 |
+
# Primary configuration is now in pyproject.toml
|
3 |
|
4 |
+
from setuptools import setup
|
5 |
|
6 |
+
setup()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|