jin commited on
Commit
c6d1ec5
·
1 Parent(s): c77f948

Optimization logic

Browse files
.gitignore CHANGED
@@ -12,3 +12,5 @@ ignore_this.txt
12
  .venv/
13
  *.ignore.*
14
  .ruff_cache/
 
 
 
12
  .venv/
13
  *.ignore.*
14
  .ruff_cache/
15
+ gui/
16
+ *.log
examples/lightrag_api_oracle_demo..py CHANGED
@@ -1,11 +1,16 @@
 
1
  from fastapi import FastAPI, HTTPException, File, UploadFile
 
2
  from contextlib import asynccontextmanager
3
  from pydantic import BaseModel
4
- from typing import Optional
 
5
 
6
- import sys
7
- import os
8
  from pathlib import Path
 
 
9
 
10
  import asyncio
11
  import nest_asyncio
@@ -13,15 +18,11 @@ from lightrag import LightRAG, QueryParam
13
  from lightrag.llm import openai_complete_if_cache, openai_embedding
14
  from lightrag.utils import EmbeddingFunc
15
  import numpy as np
 
16
 
17
  from lightrag.kg.oracle_impl import OracleDB
18
 
19
 
20
- print(os.getcwd())
21
-
22
- script_directory = Path(__file__).resolve().parent.parent
23
- sys.path.append(os.path.abspath(script_directory))
24
-
25
 
26
  # Apply nest_asyncio to solve event loop issues
27
  nest_asyncio.apply()
@@ -37,18 +38,16 @@ APIKEY = "ocigenerativeai"
37
  # Configure working directory
38
  WORKING_DIR = os.environ.get("RAG_DIR", f"{DEFAULT_RAG_DIR}")
39
  print(f"WORKING_DIR: {WORKING_DIR}")
40
- LLM_MODEL = os.environ.get("LLM_MODEL", "cohere.command-r-plus")
41
  print(f"LLM_MODEL: {LLM_MODEL}")
42
  EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "cohere.embed-multilingual-v3.0")
43
  print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}")
44
  EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 512))
45
  print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}")
46
 
47
-
48
  if not os.path.exists(WORKING_DIR):
49
  os.mkdir(WORKING_DIR)
50
-
51
-
52
  async def llm_model_func(
53
  prompt, system_prompt=None, history_messages=[], **kwargs
54
  ) -> str:
@@ -78,10 +77,10 @@ async def get_embedding_dim():
78
  embedding_dim = embedding.shape[1]
79
  return embedding_dim
80
 
81
-
82
  async def init():
 
83
  # Detect embedding dimension
84
- embedding_dimension = await get_embedding_dim()
85
  print(f"Detected embedding dimension: {embedding_dimension}")
86
  # Create Oracle DB connection
87
  # The `config` parameter is the connection configuration of Oracle DB
@@ -89,36 +88,36 @@ async def init():
89
  # We storage data in unified tables, so we need to set a `workspace` parameter to specify which docs we want to store and query
90
  # Below is an example of how to connect to Oracle Autonomous Database on Oracle Cloud
91
 
92
- oracle_db = OracleDB(
93
- config={
94
- "user": "",
95
- "password": "",
96
- "dsn": "",
97
- "config_dir": "",
98
- "wallet_location": "",
99
- "wallet_password": "",
100
- "workspace": "",
101
- } # specify which docs you want to store and query
102
- )
103
 
 
 
 
 
 
 
 
 
 
 
 
104
  # Check if Oracle DB tables exist, if not, tables will be created
105
  await oracle_db.check_tables()
106
  # Initialize LightRAG
107
- # We use Oracle DB as the KV/vector/graph storage
108
  rag = LightRAG(
109
- enable_llm_cache=False,
110
- working_dir=WORKING_DIR,
111
- chunk_token_size=512,
112
- llm_model_func=llm_model_func,
113
- embedding_func=EmbeddingFunc(
114
- embedding_dim=embedding_dimension,
115
- max_token_size=512,
116
- func=embedding_func,
117
- ),
118
- graph_storage="OracleGraphStorage",
119
- kv_storage="OracleKVStorage",
120
- vector_storage="OracleVectorDBStorage",
121
- )
122
 
123
  # Setthe KV/vector/graph storage's `db` property, so all operation will use same connection pool
124
  rag.graph_storage_cls.db = oracle_db
@@ -128,6 +127,17 @@ async def init():
128
  return rag
129
 
130
 
 
 
 
 
 
 
 
 
 
 
 
131
  # Data models
132
 
133
 
@@ -135,7 +145,10 @@ class QueryRequest(BaseModel):
135
  query: str
136
  mode: str = "hybrid"
137
  only_need_context: bool = False
 
138
 
 
 
139
 
140
  class InsertRequest(BaseModel):
141
  text: str
@@ -143,7 +156,7 @@ class InsertRequest(BaseModel):
143
 
144
  class Response(BaseModel):
145
  status: str
146
- data: Optional[str] = None
147
  message: Optional[str] = None
148
 
149
 
@@ -151,7 +164,6 @@ class Response(BaseModel):
151
 
152
  rag = None # 定义为全局对象
153
 
154
-
155
  @asynccontextmanager
156
  async def lifespan(app: FastAPI):
157
  global rag
@@ -160,24 +172,39 @@ async def lifespan(app: FastAPI):
160
  yield
161
 
162
 
163
- app = FastAPI(
164
- title="LightRAG API", description="API for RAG operations", lifespan=lifespan
165
- )
166
-
167
 
168
  @app.post("/query", response_model=Response)
169
  async def query_endpoint(request: QueryRequest):
170
- try:
171
  # loop = asyncio.get_event_loop()
172
- result = await rag.aquery(
 
 
 
 
173
  request.query,
174
  param=QueryParam(
175
- mode=request.mode, only_need_context=request.only_need_context
 
 
 
176
  ),
177
  )
178
- return Response(status="success", data=result)
179
- except Exception as e:
180
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
181
 
182
 
183
  @app.post("/insert", response_model=Response)
@@ -220,7 +247,7 @@ async def health_check():
220
  if __name__ == "__main__":
221
  import uvicorn
222
 
223
- uvicorn.run(app, host="0.0.0.0", port=8020)
224
 
225
  # Usage example
226
  # To run the server, use the following command in your terminal:
@@ -237,4 +264,4 @@ if __name__ == "__main__":
237
  # curl -X POST "http://127.0.0.1:8020/insert_file" -H "Content-Type: application/json" -d '{"file_path": "path/to/your/file.txt"}'
238
 
239
  # 4. Health check:
240
- # curl -X GET "http://127.0.0.1:8020/health"
 
1
+
2
  from fastapi import FastAPI, HTTPException, File, UploadFile
3
+ from fastapi import Query
4
  from contextlib import asynccontextmanager
5
  from pydantic import BaseModel
6
+ from typing import Optional,Any
7
+ from fastapi.responses import JSONResponse
8
 
9
+ import sys, os
10
+ print(os.getcwd())
11
  from pathlib import Path
12
+ script_directory = Path(__file__).resolve().parent.parent
13
+ sys.path.append(os.path.abspath(script_directory))
14
 
15
  import asyncio
16
  import nest_asyncio
 
18
  from lightrag.llm import openai_complete_if_cache, openai_embedding
19
  from lightrag.utils import EmbeddingFunc
20
  import numpy as np
21
+ from datetime import datetime
22
 
23
  from lightrag.kg.oracle_impl import OracleDB
24
 
25
 
 
 
 
 
 
26
 
27
  # Apply nest_asyncio to solve event loop issues
28
  nest_asyncio.apply()
 
38
  # Configure working directory
39
  WORKING_DIR = os.environ.get("RAG_DIR", f"{DEFAULT_RAG_DIR}")
40
  print(f"WORKING_DIR: {WORKING_DIR}")
41
+ LLM_MODEL = os.environ.get("LLM_MODEL", "cohere.command-r-plus-08-2024")
42
  print(f"LLM_MODEL: {LLM_MODEL}")
43
  EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "cohere.embed-multilingual-v3.0")
44
  print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}")
45
  EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 512))
46
  print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}")
47
 
 
48
  if not os.path.exists(WORKING_DIR):
49
  os.mkdir(WORKING_DIR)
50
+
 
51
  async def llm_model_func(
52
  prompt, system_prompt=None, history_messages=[], **kwargs
53
  ) -> str:
 
77
  embedding_dim = embedding.shape[1]
78
  return embedding_dim
79
 
 
80
  async def init():
81
+
82
  # Detect embedding dimension
83
+ embedding_dimension = 1024 #await get_embedding_dim()
84
  print(f"Detected embedding dimension: {embedding_dimension}")
85
  # Create Oracle DB connection
86
  # The `config` parameter is the connection configuration of Oracle DB
 
88
  # We storage data in unified tables, so we need to set a `workspace` parameter to specify which docs we want to store and query
89
  # Below is an example of how to connect to Oracle Autonomous Database on Oracle Cloud
90
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ oracle_db = OracleDB(config={
93
+ "user":"",
94
+ "password":"",
95
+ "dsn":"",
96
+ "config_dir":"path_to_config_dir",
97
+ "wallet_location":"path_to_wallet_location",
98
+ "wallet_password":"wallet_password",
99
+ "workspace":"company"
100
+ } # specify which docs you want to store and query
101
+ )
102
+
103
  # Check if Oracle DB tables exist, if not, tables will be created
104
  await oracle_db.check_tables()
105
  # Initialize LightRAG
106
+ # We use Oracle DB as the KV/vector/graph storage
107
  rag = LightRAG(
108
+ enable_llm_cache=False,
109
+ working_dir=WORKING_DIR,
110
+ chunk_token_size=512,
111
+ llm_model_func=llm_model_func,
112
+ embedding_func=EmbeddingFunc(
113
+ embedding_dim=embedding_dimension,
114
+ max_token_size=512,
115
+ func=embedding_func,
116
+ ),
117
+ graph_storage = "OracleGraphStorage",
118
+ kv_storage="OracleKVStorage",
119
+ vector_storage="OracleVectorDBStorage"
120
+ )
121
 
122
  # Setthe KV/vector/graph storage's `db` property, so all operation will use same connection pool
123
  rag.graph_storage_cls.db = oracle_db
 
127
  return rag
128
 
129
 
130
+ # Extract and Insert into LightRAG storage
131
+ #with open("./dickens/book.txt", "r", encoding="utf-8") as f:
132
+ # await rag.ainsert(f.read())
133
+
134
+ # # Perform search in different modes
135
+ # modes = ["naive", "local", "global", "hybrid"]
136
+ # for mode in modes:
137
+ # print("="*20, mode, "="*20)
138
+ # print(await rag.aquery("这篇文档是关于什么内容的?", param=QueryParam(mode=mode)))
139
+ # print("-"*100, "\n")
140
+
141
  # Data models
142
 
143
 
 
145
  query: str
146
  mode: str = "hybrid"
147
  only_need_context: bool = False
148
+ only_need_prompt: bool = False
149
 
150
+ class DataRequest(BaseModel):
151
+ limit: int = 100
152
 
153
  class InsertRequest(BaseModel):
154
  text: str
 
156
 
157
  class Response(BaseModel):
158
  status: str
159
+ data: Optional[Any] = None
160
  message: Optional[str] = None
161
 
162
 
 
164
 
165
  rag = None # 定义为全局对象
166
 
 
167
  @asynccontextmanager
168
  async def lifespan(app: FastAPI):
169
  global rag
 
172
  yield
173
 
174
 
175
+ app = FastAPI(title="LightRAG API", description="API for RAG operations",lifespan=lifespan)
 
 
 
176
 
177
  @app.post("/query", response_model=Response)
178
  async def query_endpoint(request: QueryRequest):
179
+ #try:
180
  # loop = asyncio.get_event_loop()
181
+ if request.mode == "naive":
182
+ top_k = 3
183
+ else:
184
+ top_k = 60
185
+ result = await rag.aquery(
186
  request.query,
187
  param=QueryParam(
188
+ mode=request.mode,
189
+ only_need_context=request.only_need_context,
190
+ only_need_prompt=request.only_need_prompt,
191
+ top_k=top_k
192
  ),
193
  )
194
+ return Response(status="success", data=result)
195
+ # except Exception as e:
196
+ # raise HTTPException(status_code=500, detail=str(e))
197
+
198
+
199
+ @app.get("/data", response_model=Response)
200
+ async def query_all_nodes(type: str = Query("nodes"), limit: int = Query(100)):
201
+ if type == "nodes":
202
+ result = await rag.chunk_entity_relation_graph.get_all_nodes(limit = limit)
203
+ elif type == "edges":
204
+ result = await rag.chunk_entity_relation_graph.get_all_edges(limit = limit)
205
+ elif type == "statistics":
206
+ result = await rag.chunk_entity_relation_graph.get_statistics()
207
+ return Response(status="success", data=result)
208
 
209
 
210
  @app.post("/insert", response_model=Response)
 
247
  if __name__ == "__main__":
248
  import uvicorn
249
 
250
+ uvicorn.run(app, host="127.0.0.1", port=8020)
251
 
252
  # Usage example
253
  # To run the server, use the following command in your terminal:
 
264
  # curl -X POST "http://127.0.0.1:8020/insert_file" -H "Content-Type: application/json" -d '{"file_path": "path/to/your/file.txt"}'
265
 
266
  # 4. Health check:
267
+ # curl -X GET "http://127.0.0.1:8020/health"
examples/lightrag_oracle_demo.py CHANGED
@@ -97,6 +97,8 @@ async def main():
97
  graph_storage="OracleGraphStorage",
98
  kv_storage="OracleKVStorage",
99
  vector_storage="OracleVectorDBStorage",
 
 
100
  )
101
 
102
  # Setthe KV/vector/graph storage's `db` property, so all operation will use same connection pool
 
97
  graph_storage="OracleGraphStorage",
98
  kv_storage="OracleKVStorage",
99
  vector_storage="OracleVectorDBStorage",
100
+
101
+ addon_params = {"example_number":1, "language":"Simplfied Chinese"},
102
  )
103
 
104
  # Setthe KV/vector/graph storage's `db` property, so all operation will use same connection pool
lightrag/base.py CHANGED
@@ -21,6 +21,8 @@ class QueryParam:
21
  response_type: str = "Multiple Paragraphs"
22
  # Number of top-k items to retrieve; corresponds to entities in "local" mode and relationships in "global" mode.
23
  top_k: int = 60
 
 
24
  # Number of tokens for the original chunks.
25
  max_token_for_text_unit: int = 4000
26
  # Number of tokens for the relationship descriptions
 
21
  response_type: str = "Multiple Paragraphs"
22
  # Number of top-k items to retrieve; corresponds to entities in "local" mode and relationships in "global" mode.
23
  top_k: int = 60
24
+ # Number of document chunks to retrieve.
25
+ # top_n: int = 10
26
  # Number of tokens for the original chunks.
27
  max_token_for_text_unit: int = 4000
28
  # Number of tokens for the relationship descriptions
lightrag/kg/oracle_impl.py CHANGED
@@ -333,6 +333,8 @@ class OracleGraphStorage(BaseGraphStorage):
333
  entity_type = node_data["entity_type"]
334
  description = node_data["description"]
335
  source_id = node_data["source_id"]
 
 
336
  content = entity_name + description
337
  contents = [content]
338
  batches = [
@@ -369,6 +371,8 @@ class OracleGraphStorage(BaseGraphStorage):
369
  keywords = edge_data["keywords"]
370
  description = edge_data["description"]
371
  source_chunk_id = edge_data["source_id"]
 
 
372
  content = keywords + source_name + target_name + description
373
  contents = [content]
374
  batches = [
@@ -544,6 +548,14 @@ class OracleGraphStorage(BaseGraphStorage):
544
  res = await self.db.query(sql=SQL,params=params, multirows=True)
545
  if res:
546
  return res
 
 
 
 
 
 
 
 
547
  N_T = {
548
  "full_docs": "LIGHTRAG_DOC_FULL",
549
  "text_chunks": "LIGHTRAG_DOC_CHUNKS",
@@ -715,18 +727,36 @@ SQL_TEMPLATES = {
715
  WHEN NOT MATCHED THEN
716
  INSERT(workspace,source_name,target_name,weight,keywords,description,source_chunk_id,content,content_vector)
717
  values (:workspace,:source_name,:target_name,:weight,:keywords,:description,:source_chunk_id,:content,:content_vector) """,
718
- "get_all_nodes":"""SELECT t1.name as id,t1.entity_type as label,t1.DESCRIPTION,t2.content
719
- FROM LIGHTRAG_GRAPH_NODES t1
720
- LEFT JOIN LIGHTRAG_DOC_CHUNKS t2 on t1.source_chunk_id=t2.id
721
- WHERE t1.workspace=:workspace
722
- order by t1.CREATETIME DESC
723
- fetch first :limit rows only
724
- """,
 
 
 
 
 
 
 
 
 
725
  "get_all_edges":"""SELECT t1.id,t1.keywords as label,t1.keywords, t1.source_name as source, t1.target_name as target,
726
  t1.weight,t1.DESCRIPTION,t2.content
727
  FROM LIGHTRAG_GRAPH_EDGES t1
728
  LEFT JOIN LIGHTRAG_DOC_CHUNKS t2 on t1.source_chunk_id=t2.id
729
  WHERE t1.workspace=:workspace
730
  order by t1.CREATETIME DESC
731
- fetch first :limit rows only"""
 
 
 
 
 
 
 
 
 
732
  }
 
333
  entity_type = node_data["entity_type"]
334
  description = node_data["description"]
335
  source_id = node_data["source_id"]
336
+ logger.debug(f"entity_name:{entity_name}, entity_type:{entity_type}")
337
+
338
  content = entity_name + description
339
  contents = [content]
340
  batches = [
 
371
  keywords = edge_data["keywords"]
372
  description = edge_data["description"]
373
  source_chunk_id = edge_data["source_id"]
374
+ logger.debug(f"source_name:{source_name}, target_name:{target_name}, keywords: {keywords}")
375
+
376
  content = keywords + source_name + target_name + description
377
  contents = [content]
378
  batches = [
 
548
  res = await self.db.query(sql=SQL,params=params, multirows=True)
549
  if res:
550
  return res
551
+
552
+ async def get_statistics(self):
553
+ SQL = SQL_TEMPLATES["get_statistics"]
554
+ params = {"workspace":self.db.workspace}
555
+ res = await self.db.query(sql=SQL,params=params, multirows=True)
556
+ if res:
557
+ return res
558
+
559
  N_T = {
560
  "full_docs": "LIGHTRAG_DOC_FULL",
561
  "text_chunks": "LIGHTRAG_DOC_CHUNKS",
 
727
  WHEN NOT MATCHED THEN
728
  INSERT(workspace,source_name,target_name,weight,keywords,description,source_chunk_id,content,content_vector)
729
  values (:workspace,:source_name,:target_name,:weight,:keywords,:description,:source_chunk_id,:content,:content_vector) """,
730
+ "get_all_nodes":"""WITH t0 AS (
731
+ SELECT name AS id, entity_type AS label, entity_type, description,
732
+ '["' || replace(source_chunk_id, '<SEP>', '","') || '"]' source_chunk_ids
733
+ FROM lightrag_graph_nodes
734
+ WHERE workspace = :workspace
735
+ ORDER BY createtime DESC fetch first :limit rows only
736
+ ), t1 AS (
737
+ SELECT t0.id, source_chunk_id
738
+ FROM t0, JSON_TABLE ( source_chunk_ids, '$[*]' COLUMNS ( source_chunk_id PATH '$' ) )
739
+ ), t2 AS (
740
+ SELECT t1.id, LISTAGG(t2.content, '\n') content
741
+ FROM t1 LEFT JOIN lightrag_doc_chunks t2 ON t1.source_chunk_id = t2.id
742
+ GROUP BY t1.id
743
+ )
744
+ SELECT t0.id, label, entity_type, description, t2.content
745
+ FROM t0 LEFT JOIN t2 ON t0.id = t2.id""",
746
  "get_all_edges":"""SELECT t1.id,t1.keywords as label,t1.keywords, t1.source_name as source, t1.target_name as target,
747
  t1.weight,t1.DESCRIPTION,t2.content
748
  FROM LIGHTRAG_GRAPH_EDGES t1
749
  LEFT JOIN LIGHTRAG_DOC_CHUNKS t2 on t1.source_chunk_id=t2.id
750
  WHERE t1.workspace=:workspace
751
  order by t1.CREATETIME DESC
752
+ fetch first :limit rows only""",
753
+ "get_statistics":"""select count(distinct CASE WHEN type='node' THEN id END) as nodes_count,
754
+ count(distinct CASE WHEN type='edge' THEN id END) as edges_count
755
+ FROM (
756
+ select 'node' as type, id FROM GRAPH_TABLE (lightrag_graph
757
+ MATCH (a) WHERE a.workspace=:workspace columns(a.name as id))
758
+ UNION
759
+ select 'edge' as type, TO_CHAR(id) id FROM GRAPH_TABLE (lightrag_graph
760
+ MATCH (a)-[e]->(b) WHERE e.workspace=:workspace columns(e.id))
761
+ )""",
762
  }
lightrag/lightrag.py CHANGED
@@ -12,9 +12,8 @@ from .llm import (
12
  from .operate import (
13
  chunking_by_token_size,
14
  extract_entities,
15
- local_query,
16
- global_query,
17
- hybrid_query,
18
  naive_query,
19
  )
20
 
@@ -309,28 +308,8 @@ class LightRAG:
309
  return loop.run_until_complete(self.aquery(query, param))
310
 
311
  async def aquery(self, query: str, param: QueryParam = QueryParam()):
312
- if param.mode == "local":
313
- response = await local_query(
314
- query,
315
- self.chunk_entity_relation_graph,
316
- self.entities_vdb,
317
- self.relationships_vdb,
318
- self.text_chunks,
319
- param,
320
- asdict(self),
321
- )
322
- elif param.mode == "global":
323
- response = await global_query(
324
- query,
325
- self.chunk_entity_relation_graph,
326
- self.entities_vdb,
327
- self.relationships_vdb,
328
- self.text_chunks,
329
- param,
330
- asdict(self),
331
- )
332
- elif param.mode == "hybrid":
333
- response = await hybrid_query(
334
  query,
335
  self.chunk_entity_relation_graph,
336
  self.entities_vdb,
 
12
  from .operate import (
13
  chunking_by_token_size,
14
  extract_entities,
15
+ # local_query,global_query,hybrid_query,
16
+ kg_query,
 
17
  naive_query,
18
  )
19
 
 
308
  return loop.run_until_complete(self.aquery(query, param))
309
 
310
  async def aquery(self, query: str, param: QueryParam = QueryParam()):
311
+ if param.mode in ["local", "global", "hybrid"]:
312
+ response = await kg_query(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  query,
314
  self.chunk_entity_relation_graph,
315
  self.entities_vdb,
lightrag/llm.py CHANGED
@@ -69,12 +69,15 @@ async def openai_complete_if_cache(
69
  response = await openai_async_client.chat.completions.create(
70
  model=model, messages=messages, **kwargs
71
  )
72
-
 
 
 
73
  if hashing_kv is not None:
74
  await hashing_kv.upsert(
75
  {args_hash: {"return": response.choices[0].message.content, "model": model}}
76
  )
77
- return response.choices[0].message.content
78
 
79
 
80
  @retry(
@@ -539,7 +542,7 @@ async def openai_embedding(
539
  texts: list[str],
540
  model: str = "text-embedding-3-small",
541
  base_url: str = None,
542
- api_key: str = None,
543
  ) -> np.ndarray:
544
  if api_key:
545
  os.environ["OPENAI_API_KEY"] = api_key
@@ -548,7 +551,7 @@ async def openai_embedding(
548
  AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url)
549
  )
550
  response = await openai_async_client.embeddings.create(
551
- model=model, input=texts, encoding_format="float"
552
  )
553
  return np.array([dp.embedding for dp in response.data])
554
 
 
69
  response = await openai_async_client.chat.completions.create(
70
  model=model, messages=messages, **kwargs
71
  )
72
+ content = response.choices[0].message.content
73
+ if r'\u' in content:
74
+ content = content.encode('utf-8').decode('unicode_escape')
75
+ print(content)
76
  if hashing_kv is not None:
77
  await hashing_kv.upsert(
78
  {args_hash: {"return": response.choices[0].message.content, "model": model}}
79
  )
80
+ return content
81
 
82
 
83
  @retry(
 
542
  texts: list[str],
543
  model: str = "text-embedding-3-small",
544
  base_url: str = None,
545
+ api_key: str = None
546
  ) -> np.ndarray:
547
  if api_key:
548
  os.environ["OPENAI_API_KEY"] = api_key
 
551
  AsyncOpenAI() if base_url is None else AsyncOpenAI(base_url=base_url)
552
  )
553
  response = await openai_async_client.embeddings.create(
554
+ model=model, input=texts, encoding_format="float"
555
  )
556
  return np.array([dp.embedding for dp in response.data])
557
 
lightrag/operate.py CHANGED
@@ -248,14 +248,23 @@ async def extract_entities(
248
  entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"]
249
 
250
  ordered_chunks = list(chunks.items())
251
-
 
 
 
 
 
 
 
252
  entity_extract_prompt = PROMPTS["entity_extraction"]
253
  context_base = dict(
254
  tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
255
  record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
256
  completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
257
  entity_types=",".join(PROMPTS["DEFAULT_ENTITY_TYPES"]),
258
- )
 
 
259
  continue_prompt = PROMPTS["entiti_continue_extraction"]
260
  if_loop_prompt = PROMPTS["entiti_if_loop_extraction"]
261
 
@@ -270,7 +279,6 @@ async def extract_entities(
270
  content = chunk_dp["content"]
271
  hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)
272
  final_result = await use_llm_func(hint_prompt)
273
-
274
  history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
275
  for now_glean_index in range(entity_extract_max_gleaning):
276
  glean_result = await use_llm_func(continue_prompt, history_messages=history)
@@ -388,8 +396,7 @@ async def extract_entities(
388
 
389
  return knowledge_graph_inst
390
 
391
-
392
- async def local_query(
393
  query,
394
  knowledge_graph_inst: BaseGraphStorage,
395
  entities_vdb: BaseVectorStorage,
@@ -399,43 +406,61 @@ async def local_query(
399
  global_config: dict,
400
  ) -> str:
401
  context = None
 
 
 
 
 
 
 
 
 
 
 
 
402
  use_model_func = global_config["llm_model_func"]
403
-
404
  kw_prompt_temp = PROMPTS["keywords_extraction"]
405
- kw_prompt = kw_prompt_temp.format(query=query)
406
- result = await use_model_func(kw_prompt)
407
- json_text = locate_json_string_body_from_string(result)
408
- logger.debug("local_query json_text:", json_text)
409
  try:
 
410
  keywords_data = json.loads(json_text)
411
- keywords = keywords_data.get("low_level_keywords", [])
412
- keywords = ", ".join(keywords)
413
- except json.JSONDecodeError:
414
- print(result)
415
- try:
416
- result = (
417
- result.replace(kw_prompt[:-1], "")
418
- .replace("user", "")
419
- .replace("model", "")
420
- .strip()
421
- )
422
- result = "{" + result.split("{")[1].split("}")[0] + "}"
423
-
424
- keywords_data = json.loads(result)
425
- keywords = keywords_data.get("low_level_keywords", [])
426
- keywords = ", ".join(keywords)
427
- # Handle parsing error
428
- except json.JSONDecodeError as e:
429
- print(f"JSON parsing error: {e}")
430
- return PROMPTS["fail_response"]
431
- if keywords:
432
- context = await _build_local_query_context(
 
 
 
 
433
  keywords,
434
  knowledge_graph_inst,
435
  entities_vdb,
 
436
  text_chunks_db,
437
  query_param,
438
  )
 
439
  if query_param.only_need_context:
440
  return context
441
  if context is None:
@@ -443,13 +468,13 @@ async def local_query(
443
  sys_prompt_temp = PROMPTS["rag_response"]
444
  sys_prompt = sys_prompt_temp.format(
445
  context_data=context, response_type=query_param.response_type
446
- )
447
  if query_param.only_need_prompt:
448
  return sys_prompt
449
  response = await use_model_func(
450
  query,
451
  system_prompt=sys_prompt,
452
- )
453
  if len(response) > len(sys_prompt):
454
  response = (
455
  response.replace(sys_prompt, "")
@@ -464,22 +489,87 @@ async def local_query(
464
  return response
465
 
466
 
467
- async def _build_local_query_context(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  query,
469
  knowledge_graph_inst: BaseGraphStorage,
470
  entities_vdb: BaseVectorStorage,
471
  text_chunks_db: BaseKVStorage[TextChunkSchema],
472
  query_param: QueryParam,
473
  ):
 
474
  results = await entities_vdb.query(query, top_k=query_param.top_k)
475
-
476
  if not len(results):
477
  return None
 
478
  node_datas = await asyncio.gather(
479
  *[knowledge_graph_inst.get_node(r["entity_name"]) for r in results]
480
  )
481
  if not all([n is not None for n in node_datas]):
482
  logger.warning("Some nodes are missing, maybe the storage is damaged")
 
 
483
  node_degrees = await asyncio.gather(
484
  *[knowledge_graph_inst.node_degree(r["entity_name"]) for r in results]
485
  )
@@ -488,15 +578,19 @@ async def _build_local_query_context(
488
  for k, n, d in zip(results, node_datas, node_degrees)
489
  if n is not None
490
  ] # what is this text_chunks_db doing. dont remember it in airvx. check the diagram.
 
491
  use_text_units = await _find_most_related_text_unit_from_entities(
492
  node_datas, query_param, text_chunks_db, knowledge_graph_inst
493
  )
 
494
  use_relations = await _find_most_related_edges_from_entities(
495
  node_datas, query_param, knowledge_graph_inst
496
  )
497
  logger.info(
498
  f"Local query uses {len(node_datas)} entites, {len(use_relations)} relations, {len(use_text_units)} text units"
499
- )
 
 
500
  entites_section_list = [["id", "entity", "type", "description", "rank"]]
501
  for i, n in enumerate(node_datas):
502
  entites_section_list.append(
@@ -531,20 +625,7 @@ async def _build_local_query_context(
531
  for i, t in enumerate(use_text_units):
532
  text_units_section_list.append([i, t["content"]])
533
  text_units_context = list_of_list_to_csv(text_units_section_list)
534
- return f"""
535
- -----Entities-----
536
- ```csv
537
- {entities_context}
538
- ```
539
- -----Relationships-----
540
- ```csv
541
- {relations_context}
542
- ```
543
- -----Sources-----
544
- ```csv
545
- {text_units_context}
546
- ```
547
- """
548
 
549
 
550
  async def _find_most_related_text_unit_from_entities(
@@ -659,88 +740,9 @@ async def _find_most_related_edges_from_entities(
659
  return all_edges_data
660
 
661
 
662
- async def global_query(
663
- query,
664
- knowledge_graph_inst: BaseGraphStorage,
665
- entities_vdb: BaseVectorStorage,
666
- relationships_vdb: BaseVectorStorage,
667
- text_chunks_db: BaseKVStorage[TextChunkSchema],
668
- query_param: QueryParam,
669
- global_config: dict,
670
- ) -> str:
671
- context = None
672
- use_model_func = global_config["llm_model_func"]
673
-
674
- kw_prompt_temp = PROMPTS["keywords_extraction"]
675
- kw_prompt = kw_prompt_temp.format(query=query)
676
- result = await use_model_func(kw_prompt)
677
- json_text = locate_json_string_body_from_string(result)
678
- logger.debug("global json_text:", json_text)
679
- try:
680
- keywords_data = json.loads(json_text)
681
- keywords = keywords_data.get("high_level_keywords", [])
682
- keywords = ", ".join(keywords)
683
- except json.JSONDecodeError:
684
- try:
685
- result = (
686
- result.replace(kw_prompt[:-1], "")
687
- .replace("user", "")
688
- .replace("model", "")
689
- .strip()
690
- )
691
- result = "{" + result.split("{")[1].split("}")[0] + "}"
692
-
693
- keywords_data = json.loads(result)
694
- keywords = keywords_data.get("high_level_keywords", [])
695
- keywords = ", ".join(keywords)
696
-
697
- except json.JSONDecodeError as e:
698
- # Handle parsing error
699
- print(f"JSON parsing error: {e}")
700
- return PROMPTS["fail_response"]
701
- if keywords:
702
- context = await _build_global_query_context(
703
- keywords,
704
- knowledge_graph_inst,
705
- entities_vdb,
706
- relationships_vdb,
707
- text_chunks_db,
708
- query_param,
709
- )
710
-
711
- if query_param.only_need_context:
712
- return context
713
- if context is None:
714
- return PROMPTS["fail_response"]
715
-
716
- sys_prompt_temp = PROMPTS["rag_response"]
717
- sys_prompt = sys_prompt_temp.format(
718
- context_data=context, response_type=query_param.response_type
719
- )
720
- if query_param.only_need_prompt:
721
- return sys_prompt
722
- response = await use_model_func(
723
- query,
724
- system_prompt=sys_prompt,
725
- )
726
- if len(response) > len(sys_prompt):
727
- response = (
728
- response.replace(sys_prompt, "")
729
- .replace("user", "")
730
- .replace("model", "")
731
- .replace(query, "")
732
- .replace("<system>", "")
733
- .replace("</system>", "")
734
- .strip()
735
- )
736
-
737
- return response
738
-
739
-
740
- async def _build_global_query_context(
741
  keywords,
742
  knowledge_graph_inst: BaseGraphStorage,
743
- entities_vdb: BaseVectorStorage,
744
  relationships_vdb: BaseVectorStorage,
745
  text_chunks_db: BaseKVStorage[TextChunkSchema],
746
  query_param: QueryParam,
@@ -782,6 +784,7 @@ async def _build_global_query_context(
782
  logger.info(
783
  f"Global query uses {len(use_entities)} entites, {len(edge_datas)} relations, {len(use_text_units)} text units"
784
  )
 
785
  relations_section_list = [
786
  ["id", "source", "target", "description", "keywords", "weight", "rank"]
787
  ]
@@ -816,21 +819,8 @@ async def _build_global_query_context(
816
  for i, t in enumerate(use_text_units):
817
  text_units_section_list.append([i, t["content"]])
818
  text_units_context = list_of_list_to_csv(text_units_section_list)
 
819
 
820
- return f"""
821
- -----Entities-----
822
- ```csv
823
- {entities_context}
824
- ```
825
- -----Relationships-----
826
- ```csv
827
- {relations_context}
828
- ```
829
- -----Sources-----
830
- ```csv
831
- {text_units_context}
832
- ```
833
- """
834
 
835
 
836
  async def _find_most_related_entities_from_relationships(
@@ -901,137 +891,11 @@ async def _find_related_text_unit_from_relationships(
901
  return all_text_units
902
 
903
 
904
- async def hybrid_query(
905
- query,
906
- knowledge_graph_inst: BaseGraphStorage,
907
- entities_vdb: BaseVectorStorage,
908
- relationships_vdb: BaseVectorStorage,
909
- text_chunks_db: BaseKVStorage[TextChunkSchema],
910
- query_param: QueryParam,
911
- global_config: dict,
912
- ) -> str:
913
- low_level_context = None
914
- high_level_context = None
915
- use_model_func = global_config["llm_model_func"]
916
-
917
- kw_prompt_temp = PROMPTS["keywords_extraction"]
918
- kw_prompt = kw_prompt_temp.format(query=query)
919
-
920
- result = await use_model_func(kw_prompt)
921
- json_text = locate_json_string_body_from_string(result)
922
- logger.debug("hybrid_query json_text:", json_text)
923
- try:
924
- keywords_data = json.loads(json_text)
925
- hl_keywords = keywords_data.get("high_level_keywords", [])
926
- ll_keywords = keywords_data.get("low_level_keywords", [])
927
- hl_keywords = ", ".join(hl_keywords)
928
- ll_keywords = ", ".join(ll_keywords)
929
- except json.JSONDecodeError:
930
- try:
931
- result = (
932
- result.replace(kw_prompt[:-1], "")
933
- .replace("user", "")
934
- .replace("model", "")
935
- .strip()
936
- )
937
- result = "{" + result.split("{")[1].split("}")[0] + "}"
938
- keywords_data = json.loads(result)
939
- hl_keywords = keywords_data.get("high_level_keywords", [])
940
- ll_keywords = keywords_data.get("low_level_keywords", [])
941
- hl_keywords = ", ".join(hl_keywords)
942
- ll_keywords = ", ".join(ll_keywords)
943
- # Handle parsing error
944
- except json.JSONDecodeError as e:
945
- print(f"JSON parsing error: {e}")
946
- return PROMPTS["fail_response"]
947
-
948
- if ll_keywords:
949
- low_level_context = await _build_local_query_context(
950
- ll_keywords,
951
- knowledge_graph_inst,
952
- entities_vdb,
953
- text_chunks_db,
954
- query_param,
955
- )
956
-
957
- if hl_keywords:
958
- high_level_context = await _build_global_query_context(
959
- hl_keywords,
960
- knowledge_graph_inst,
961
- entities_vdb,
962
- relationships_vdb,
963
- text_chunks_db,
964
- query_param,
965
- )
966
-
967
- context = combine_contexts(high_level_context, low_level_context)
968
-
969
- if query_param.only_need_context:
970
- return context
971
- if context is None:
972
- return PROMPTS["fail_response"]
973
-
974
- sys_prompt_temp = PROMPTS["rag_response"]
975
- sys_prompt = sys_prompt_temp.format(
976
- context_data=context, response_type=query_param.response_type
977
- )
978
- if query_param.only_need_prompt:
979
- return sys_prompt
980
- response = await use_model_func(
981
- query,
982
- system_prompt=sys_prompt,
983
- )
984
- if len(response) > len(sys_prompt):
985
- response = (
986
- response.replace(sys_prompt, "")
987
- .replace("user", "")
988
- .replace("model", "")
989
- .replace(query, "")
990
- .replace("<system>", "")
991
- .replace("</system>", "")
992
- .strip()
993
- )
994
- return response
995
-
996
-
997
- def combine_contexts(high_level_context, low_level_context):
998
  # Function to extract entities, relationships, and sources from context strings
999
-
1000
- def extract_sections(context):
1001
- entities_match = re.search(
1002
- r"-----Entities-----\s*```csv\s*(.*?)\s*```", context, re.DOTALL
1003
- )
1004
- relationships_match = re.search(
1005
- r"-----Relationships-----\s*```csv\s*(.*?)\s*```", context, re.DOTALL
1006
- )
1007
- sources_match = re.search(
1008
- r"-----Sources-----\s*```csv\s*(.*?)\s*```", context, re.DOTALL
1009
- )
1010
-
1011
- entities = entities_match.group(1) if entities_match else ""
1012
- relationships = relationships_match.group(1) if relationships_match else ""
1013
- sources = sources_match.group(1) if sources_match else ""
1014
-
1015
- return entities, relationships, sources
1016
-
1017
- # Extract sections from both contexts
1018
-
1019
- if high_level_context is None:
1020
- warnings.warn(
1021
- "High Level context is None. Return empty High entity/relationship/source"
1022
- )
1023
- hl_entities, hl_relationships, hl_sources = "", "", ""
1024
- else:
1025
- hl_entities, hl_relationships, hl_sources = extract_sections(high_level_context)
1026
-
1027
- if low_level_context is None:
1028
- warnings.warn(
1029
- "Low Level context is None. Return empty Low entity/relationship/source"
1030
- )
1031
- ll_entities, ll_relationships, ll_sources = "", "", ""
1032
- else:
1033
- ll_entities, ll_relationships, ll_sources = extract_sections(low_level_context)
1034
-
1035
  # Combine and deduplicate the entities
1036
  combined_entities = process_combine_contexts(hl_entities, ll_entities)
1037
 
@@ -1043,21 +907,7 @@ def combine_contexts(high_level_context, low_level_context):
1043
  # Combine and deduplicate the sources
1044
  combined_sources = process_combine_contexts(hl_sources, ll_sources)
1045
 
1046
- # Format the combined context
1047
- return f"""
1048
- -----Entities-----
1049
- ```csv
1050
- {combined_entities}
1051
- ```
1052
- -----Relationships-----
1053
- ```csv
1054
- {combined_relationships}
1055
- ```
1056
- -----Sources-----
1057
- ```csv
1058
- {combined_sources}
1059
- ```
1060
- """
1061
 
1062
 
1063
  async def naive_query(
@@ -1080,7 +930,7 @@ async def naive_query(
1080
  max_token_size=query_param.max_token_for_text_unit,
1081
  )
1082
  logger.info(f"Truncate {len(chunks)} to {len(maybe_trun_chunks)} chunks")
1083
- section = "--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
1084
  if query_param.only_need_context:
1085
  return section
1086
  sys_prompt_temp = PROMPTS["naive_rag_response"]
 
248
  entity_extract_max_gleaning = global_config["entity_extract_max_gleaning"]
249
 
250
  ordered_chunks = list(chunks.items())
251
+ # add language and example number params to prompt
252
+ language = global_config["addon_params"].get("language",PROMPTS["DEFAULT_LANGUAGE"])
253
+ example_number = global_config["addon_params"].get("example_number", None)
254
+ if example_number and example_number<len(PROMPTS["entity_extraction_examples"]):
255
+ examples="\n".join(PROMPTS["entity_extraction_examples"][:int(example_number)])
256
+ else:
257
+ examples="\n".join(PROMPTS["entity_extraction_examples"])
258
+
259
  entity_extract_prompt = PROMPTS["entity_extraction"]
260
  context_base = dict(
261
  tuple_delimiter=PROMPTS["DEFAULT_TUPLE_DELIMITER"],
262
  record_delimiter=PROMPTS["DEFAULT_RECORD_DELIMITER"],
263
  completion_delimiter=PROMPTS["DEFAULT_COMPLETION_DELIMITER"],
264
  entity_types=",".join(PROMPTS["DEFAULT_ENTITY_TYPES"]),
265
+ examples=examples,
266
+ language=language)
267
+
268
  continue_prompt = PROMPTS["entiti_continue_extraction"]
269
  if_loop_prompt = PROMPTS["entiti_if_loop_extraction"]
270
 
 
279
  content = chunk_dp["content"]
280
  hint_prompt = entity_extract_prompt.format(**context_base, input_text=content)
281
  final_result = await use_llm_func(hint_prompt)
 
282
  history = pack_user_ass_to_openai_messages(hint_prompt, final_result)
283
  for now_glean_index in range(entity_extract_max_gleaning):
284
  glean_result = await use_llm_func(continue_prompt, history_messages=history)
 
396
 
397
  return knowledge_graph_inst
398
 
399
+ async def kg_query(
 
400
  query,
401
  knowledge_graph_inst: BaseGraphStorage,
402
  entities_vdb: BaseVectorStorage,
 
406
  global_config: dict,
407
  ) -> str:
408
  context = None
409
+ example_number = global_config["addon_params"].get("example_number", None)
410
+ if example_number and example_number < len(PROMPTS["keywords_extraction_examples"]):
411
+ examples = "\n".join(PROMPTS["keywords_extraction_examples"][:int(example_number)])
412
+ else:
413
+ examples="\n".join(PROMPTS["keywords_extraction_examples"])
414
+
415
+ # Set mode
416
+ if query_param.mode not in ["local", "global", "hybrid"]:
417
+ logger.error(f"Unknown mode {query_param.mode} in kg_query")
418
+ return PROMPTS["fail_response"]
419
+
420
+ # LLM generate keywords
421
  use_model_func = global_config["llm_model_func"]
 
422
  kw_prompt_temp = PROMPTS["keywords_extraction"]
423
+ kw_prompt = kw_prompt_temp.format(query=query,examples=examples)
424
+ result = await use_model_func(kw_prompt)
425
+ logger.info(f"kw_prompt result:")
426
+ print(result)
427
  try:
428
+ json_text = locate_json_string_body_from_string(result)
429
  keywords_data = json.loads(json_text)
430
+ hl_keywords = keywords_data.get("high_level_keywords", [])
431
+ ll_keywords = keywords_data.get("low_level_keywords", [])
432
+
433
+ # Handle parsing error
434
+ except json.JSONDecodeError as e:
435
+ print(f"JSON parsing error: {e} {result}")
436
+ return PROMPTS["fail_response"]
437
+
438
+ # Handdle keywords missing
439
+ if hl_keywords == [] and ll_keywords == []:
440
+ logger.warning("low_level_keywords and high_level_keywords is empty")
441
+ return PROMPTS["fail_response"]
442
+ if ll_keywords == [] and query_param.mode in ["local","hybrid"]:
443
+ logger.warning("low_level_keywords is empty")
444
+ return PROMPTS["fail_response"]
445
+ else:
446
+ ll_keywords = ", ".join(ll_keywords)
447
+ if hl_keywords == [] and query_param.mode in ["global","hybrid"]:
448
+ logger.warning("high_level_keywords is empty")
449
+ return PROMPTS["fail_response"]
450
+ else:
451
+ hl_keywords = ", ".join(hl_keywords)
452
+
453
+ # Build context
454
+ keywords = [ll_keywords, hl_keywords]
455
+ context = await _build_query_context(
456
  keywords,
457
  knowledge_graph_inst,
458
  entities_vdb,
459
+ relationships_vdb,
460
  text_chunks_db,
461
  query_param,
462
  )
463
+
464
  if query_param.only_need_context:
465
  return context
466
  if context is None:
 
468
  sys_prompt_temp = PROMPTS["rag_response"]
469
  sys_prompt = sys_prompt_temp.format(
470
  context_data=context, response_type=query_param.response_type
471
+ )
472
  if query_param.only_need_prompt:
473
  return sys_prompt
474
  response = await use_model_func(
475
  query,
476
  system_prompt=sys_prompt,
477
+ )
478
  if len(response) > len(sys_prompt):
479
  response = (
480
  response.replace(sys_prompt, "")
 
489
  return response
490
 
491
 
492
+ async def _build_query_context(
493
+ query: list,
494
+ knowledge_graph_inst: BaseGraphStorage,
495
+ entities_vdb: BaseVectorStorage,
496
+ relationships_vdb: BaseVectorStorage,
497
+ text_chunks_db: BaseKVStorage[TextChunkSchema],
498
+ query_param: QueryParam,
499
+ ):
500
+ ll_kewwords, hl_keywrds = query[0], query[1]
501
+ if query_param.mode in ["local", "hybrid"]:
502
+ if ll_kewwords == "":
503
+ ll_entities_context,ll_relations_context,ll_text_units_context = "","",""
504
+ warnings.warn("Low Level context is None. Return empty Low entity/relationship/source")
505
+ query_param.mode = "global"
506
+ else:
507
+ ll_entities_context,ll_relations_context,ll_text_units_context = await _get_node_data(
508
+ ll_kewwords,
509
+ knowledge_graph_inst,
510
+ entities_vdb,
511
+ text_chunks_db,
512
+ query_param
513
+ )
514
+ if query_param.mode in ["global", "hybrid"]:
515
+ if hl_keywrds == "":
516
+ hl_entities_context,hl_relations_context,hl_text_units_context = "","",""
517
+ warnings.warn("High Level context is None. Return empty High entity/relationship/source")
518
+ query_param.mode = "local"
519
+ else:
520
+ hl_entities_context,hl_relations_context,hl_text_units_context = await _get_edge_data(
521
+ hl_keywrds,
522
+ knowledge_graph_inst,
523
+ relationships_vdb,
524
+ text_chunks_db,
525
+ query_param
526
+ )
527
+ if query_param.mode == 'hybrid':
528
+ entities_context,relations_context,text_units_context = combine_contexts(
529
+ [hl_entities_context,ll_entities_context],
530
+ [hl_relations_context,ll_relations_context],
531
+ [hl_text_units_context,ll_text_units_context]
532
+ )
533
+ elif query_param.mode == 'local':
534
+ entities_context,relations_context,text_units_context = ll_entities_context,ll_relations_context,ll_text_units_context
535
+ elif query_param.mode == 'global':
536
+ entities_context,relations_context,text_units_context = hl_entities_context,hl_relations_context,hl_text_units_context
537
+ return f"""
538
+ # -----Entities-----
539
+ # ```csv
540
+ # {entities_context}
541
+ # ```
542
+ # -----Relationships-----
543
+ # ```csv
544
+ # {relations_context}
545
+ # ```
546
+ # -----Sources-----
547
+ # ```csv
548
+ # {text_units_context}
549
+ # ```
550
+ # """
551
+
552
+
553
+
554
+ async def _get_node_data(
555
  query,
556
  knowledge_graph_inst: BaseGraphStorage,
557
  entities_vdb: BaseVectorStorage,
558
  text_chunks_db: BaseKVStorage[TextChunkSchema],
559
  query_param: QueryParam,
560
  ):
561
+ # 获取相似的实体
562
  results = await entities_vdb.query(query, top_k=query_param.top_k)
 
563
  if not len(results):
564
  return None
565
+ # 获取实体信息
566
  node_datas = await asyncio.gather(
567
  *[knowledge_graph_inst.get_node(r["entity_name"]) for r in results]
568
  )
569
  if not all([n is not None for n in node_datas]):
570
  logger.warning("Some nodes are missing, maybe the storage is damaged")
571
+
572
+ # 获取实体的度
573
  node_degrees = await asyncio.gather(
574
  *[knowledge_graph_inst.node_degree(r["entity_name"]) for r in results]
575
  )
 
578
  for k, n, d in zip(results, node_datas, node_degrees)
579
  if n is not None
580
  ] # what is this text_chunks_db doing. dont remember it in airvx. check the diagram.
581
+ # 根据实体获取文本片段
582
  use_text_units = await _find_most_related_text_unit_from_entities(
583
  node_datas, query_param, text_chunks_db, knowledge_graph_inst
584
  )
585
+ # 获取关联的边
586
  use_relations = await _find_most_related_edges_from_entities(
587
  node_datas, query_param, knowledge_graph_inst
588
  )
589
  logger.info(
590
  f"Local query uses {len(node_datas)} entites, {len(use_relations)} relations, {len(use_text_units)} text units"
591
+ )
592
+
593
+ # 构建提示词
594
  entites_section_list = [["id", "entity", "type", "description", "rank"]]
595
  for i, n in enumerate(node_datas):
596
  entites_section_list.append(
 
625
  for i, t in enumerate(use_text_units):
626
  text_units_section_list.append([i, t["content"]])
627
  text_units_context = list_of_list_to_csv(text_units_section_list)
628
+ return entities_context,relations_context,text_units_context
 
 
 
 
 
 
 
 
 
 
 
 
 
629
 
630
 
631
  async def _find_most_related_text_unit_from_entities(
 
740
  return all_edges_data
741
 
742
 
743
+ async def _get_edge_data(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
744
  keywords,
745
  knowledge_graph_inst: BaseGraphStorage,
 
746
  relationships_vdb: BaseVectorStorage,
747
  text_chunks_db: BaseKVStorage[TextChunkSchema],
748
  query_param: QueryParam,
 
784
  logger.info(
785
  f"Global query uses {len(use_entities)} entites, {len(edge_datas)} relations, {len(use_text_units)} text units"
786
  )
787
+
788
  relations_section_list = [
789
  ["id", "source", "target", "description", "keywords", "weight", "rank"]
790
  ]
 
819
  for i, t in enumerate(use_text_units):
820
  text_units_section_list.append([i, t["content"]])
821
  text_units_context = list_of_list_to_csv(text_units_section_list)
822
+ return entities_context,relations_context,text_units_context
823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
824
 
825
 
826
  async def _find_most_related_entities_from_relationships(
 
891
  return all_text_units
892
 
893
 
894
+ def combine_contexts(entities, relationships, sources):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
895
  # Function to extract entities, relationships, and sources from context strings
896
+ hl_entities, ll_entities = entities[0], entities[1]
897
+ hl_relationships, ll_relationships = relationships[0],relationships[1]
898
+ hl_sources, ll_sources = sources[0], sources[1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
899
  # Combine and deduplicate the entities
900
  combined_entities = process_combine_contexts(hl_entities, ll_entities)
901
 
 
907
  # Combine and deduplicate the sources
908
  combined_sources = process_combine_contexts(hl_sources, ll_sources)
909
 
910
+ return combined_entities, combined_relationships, combined_sources
 
 
 
 
 
 
 
 
 
 
 
 
 
 
911
 
912
 
913
  async def naive_query(
 
930
  max_token_size=query_param.max_token_for_text_unit,
931
  )
932
  logger.info(f"Truncate {len(chunks)} to {len(maybe_trun_chunks)} chunks")
933
+ section = "\n--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
934
  if query_param.only_need_context:
935
  return section
936
  sys_prompt_temp = PROMPTS["naive_rag_response"]
lightrag/prompt.py CHANGED
@@ -2,6 +2,7 @@ GRAPH_FIELD_SEP = "<SEP>"
2
 
3
  PROMPTS = {}
4
 
 
5
  PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
6
  PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
7
  PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
@@ -11,6 +12,7 @@ PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"]
11
 
12
  PROMPTS["entity_extraction"] = """-Goal-
13
  Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
 
14
 
15
  -Steps-
16
  1. Identify all entities. For each identified entity, extract the following information:
@@ -38,7 +40,19 @@ Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_
38
  ######################
39
  -Examples-
40
  ######################
41
- Example 1:
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  Entity_types: [person, technology, mission, organization, location]
44
  Text:
@@ -62,8 +76,8 @@ Output:
62
  ("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter}
63
  ("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter}
64
  ("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter}
65
- #############################
66
- Example 2:
67
 
68
  Entity_types: [person, technology, mission, organization, location]
69
  Text:
@@ -80,8 +94,8 @@ Output:
80
  ("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter}
81
  ("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter}
82
  ("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter}
83
- #############################
84
- Example 3:
85
 
86
  Entity_types: [person, role, technology, organization, event, location, concept]
87
  Text:
@@ -107,22 +121,15 @@ Output:
107
  ("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter}
108
  ("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter}
109
  ("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter}
110
- #############################
111
- -Real Data-
112
- ######################
113
- Entity_types: {entity_types}
114
- Text: {input_text}
115
- ######################
116
- Output:
117
- """
118
 
119
- PROMPTS[
120
- "summarize_entity_descriptions"
121
- ] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
122
  Given one or two entities, and a list of descriptions, all related to the same entity or group of entities.
123
  Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
124
  If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
125
  Make sure it is written in third person, and include the entity names so we the have full context.
 
126
 
127
  #######
128
  -Data-
@@ -132,14 +139,10 @@ Description List: {description_list}
132
  Output:
133
  """
134
 
135
- PROMPTS[
136
- "entiti_continue_extraction"
137
- ] = """MANY entities were missed in the last extraction. Add them below using the same format:
138
  """
139
 
140
- PROMPTS[
141
- "entiti_if_loop_extraction"
142
- ] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added.
143
  """
144
 
145
  PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question."
@@ -169,6 +172,7 @@ Add sections and commentary to the response as appropriate for the length and fo
169
  PROMPTS["keywords_extraction"] = """---Role---
170
 
171
  You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query.
 
172
 
173
  ---Goal---
174
 
@@ -184,7 +188,20 @@ Given the query, list both high-level and low-level keywords. High-level keyword
184
  ######################
185
  -Examples-
186
  ######################
187
- Example 1:
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  Query: "How does international trade influence global economic stability?"
190
  ################
@@ -193,8 +210,8 @@ Output:
193
  "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"],
194
  "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"]
195
  }}
196
- #############################
197
- Example 2:
198
 
199
  Query: "What are the environmental consequences of deforestation on biodiversity?"
200
  ################
@@ -203,8 +220,8 @@ Output:
203
  "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"],
204
  "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"]
205
  }}
206
- #############################
207
- Example 3:
208
 
209
  Query: "What is the role of education in reducing poverty?"
210
  ################
@@ -213,14 +230,9 @@ Output:
213
  "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"],
214
  "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"]
215
  }}
216
- #############################
217
- -Real Data-
218
- ######################
219
- Query: {query}
220
- ######################
221
- Output:
222
 
223
- """
224
 
225
  PROMPTS["naive_rag_response"] = """---Role---
226
 
 
2
 
3
  PROMPTS = {}
4
 
5
+ PROMPTS["DEFAULT_LANGUAGE"] = "English"
6
  PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
7
  PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
8
  PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
 
12
 
13
  PROMPTS["entity_extraction"] = """-Goal-
14
  Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
15
+ Use {language} as output language.
16
 
17
  -Steps-
18
  1. Identify all entities. For each identified entity, extract the following information:
 
40
  ######################
41
  -Examples-
42
  ######################
43
+ {examples}
44
+
45
+ #############################
46
+ -Real Data-
47
+ ######################
48
+ Entity_types: {entity_types}
49
+ Text: {input_text}
50
+ ######################
51
+ Output:
52
+ """
53
+
54
+ PROMPTS["entity_extraction_examples"] = [
55
+ """Example 1:
56
 
57
  Entity_types: [person, technology, mission, organization, location]
58
  Text:
 
76
  ("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter}
77
  ("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter}
78
  ("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter}
79
+ #############################""",
80
+ """Example 2:
81
 
82
  Entity_types: [person, technology, mission, organization, location]
83
  Text:
 
94
  ("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter}
95
  ("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter}
96
  ("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter}
97
+ #############################""",
98
+ """Example 3:
99
 
100
  Entity_types: [person, role, technology, organization, event, location, concept]
101
  Text:
 
121
  ("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter}
122
  ("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter}
123
  ("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter}
124
+ #############################"""
125
+ ]
 
 
 
 
 
 
126
 
127
+ PROMPTS["summarize_entity_descriptions"] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
 
 
128
  Given one or two entities, and a list of descriptions, all related to the same entity or group of entities.
129
  Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
130
  If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
131
  Make sure it is written in third person, and include the entity names so we the have full context.
132
+ Use Chinese as output language.
133
 
134
  #######
135
  -Data-
 
139
  Output:
140
  """
141
 
142
+ PROMPTS["entiti_continue_extraction"] = """MANY entities were missed in the last extraction. Add them below using the same format:
 
 
143
  """
144
 
145
+ PROMPTS["entiti_if_loop_extraction"] = """It appears some entities may have still been missed. Answer YES | NO if there are still entities that need to be added.
 
 
146
  """
147
 
148
  PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question."
 
172
  PROMPTS["keywords_extraction"] = """---Role---
173
 
174
  You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query.
175
+ Use Chinese as output language.
176
 
177
  ---Goal---
178
 
 
188
  ######################
189
  -Examples-
190
  ######################
191
+ {examples}
192
+
193
+ #############################
194
+ -Real Data-
195
+ ######################
196
+ Query: {query}
197
+ ######################
198
+ The `Output` should be human text, not unicode characters. Keep the same language as `Query`.
199
+ Output:
200
+
201
+ """
202
+
203
+ PROMPTS["keywords_extraction_examples"] = [
204
+ """Example 1:
205
 
206
  Query: "How does international trade influence global economic stability?"
207
  ################
 
210
  "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"],
211
  "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"]
212
  }}
213
+ #############################""",
214
+ """Example 2:
215
 
216
  Query: "What are the environmental consequences of deforestation on biodiversity?"
217
  ################
 
220
  "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"],
221
  "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"]
222
  }}
223
+ #############################""",
224
+ """Example 3:
225
 
226
  Query: "What is the role of education in reducing poverty?"
227
  ################
 
230
  "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"],
231
  "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"]
232
  }}
233
+ #############################"""
234
+ ]
 
 
 
 
235
 
 
236
 
237
  PROMPTS["naive_rag_response"] = """---Role---
238
 
lightrag/utils.py CHANGED
@@ -47,14 +47,26 @@ class EmbeddingFunc:
47
 
48
  def locate_json_string_body_from_string(content: str) -> Union[str, None]:
49
  """Locate the JSON string body from a string"""
50
- maybe_json_str = re.search(r"{.*}", content, re.DOTALL)
51
- if maybe_json_str is not None:
52
- maybe_json_str = maybe_json_str.group(0)
53
- maybe_json_str = maybe_json_str.replace("\\n", "")
54
- maybe_json_str = maybe_json_str.replace("\n", "")
55
- maybe_json_str = maybe_json_str.replace("'", '"')
56
- return maybe_json_str
57
- else:
 
 
 
 
 
 
 
 
 
 
 
 
58
  return None
59
 
60
 
 
47
 
48
  def locate_json_string_body_from_string(content: str) -> Union[str, None]:
49
  """Locate the JSON string body from a string"""
50
+ try:
51
+ maybe_json_str = re.search(r"{.*}", content, re.DOTALL)
52
+ if maybe_json_str is not None:
53
+ maybe_json_str = maybe_json_str.group(0)
54
+ maybe_json_str = maybe_json_str.replace("\\n", "")
55
+ maybe_json_str = maybe_json_str.replace("\n", "")
56
+ maybe_json_str = maybe_json_str.replace("'", '"')
57
+ json.loads(maybe_json_str)
58
+ return maybe_json_str
59
+ except:
60
+ # try:
61
+ # content = (
62
+ # content.replace(kw_prompt[:-1], "")
63
+ # .replace("user", "")
64
+ # .replace("model", "")
65
+ # .strip()
66
+ # )
67
+ # maybe_json_str = "{" + content.split("{")[1].split("}")[0] + "}"
68
+ # json.loads(maybe_json_str)
69
+
70
  return None
71
 
72