LarFii committed on
Commit
2f4b338
·
1 Parent(s): 46fd151

Fix cache bugs

Browse files
examples/lightrag_api_openai_compatible_demo.py CHANGED
@@ -24,6 +24,10 @@ EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-large")
24
  print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}")
25
  EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 8192))
26
  print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}")
 
 
 
 
27
 
28
  if not os.path.exists(WORKING_DIR):
29
  os.mkdir(WORKING_DIR)
@@ -36,10 +40,12 @@ async def llm_model_func(
36
  prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
37
  ) -> str:
38
  return await openai_complete_if_cache(
39
- LLM_MODEL,
40
- prompt,
41
  system_prompt=system_prompt,
42
  history_messages=history_messages,
 
 
43
  **kwargs,
44
  )
45
 
@@ -49,8 +55,10 @@ async def llm_model_func(
49
 
50
  async def embedding_func(texts: list[str]) -> np.ndarray:
51
  return await openai_embed(
52
- texts,
53
  model=EMBEDDING_MODEL,
 
 
54
  )
55
 
56
 
 
24
  print(f"EMBEDDING_MODEL: {EMBEDDING_MODEL}")
25
  EMBEDDING_MAX_TOKEN_SIZE = int(os.environ.get("EMBEDDING_MAX_TOKEN_SIZE", 8192))
26
  print(f"EMBEDDING_MAX_TOKEN_SIZE: {EMBEDDING_MAX_TOKEN_SIZE}")
27
+ BASE_URL = os.environ.get("BASE_URL", "https://api.openai.com/v1")
28
+ print(f"BASE_URL: {BASE_URL}")
29
+ API_KEY = os.environ.get("API_KEY", "xxxxxxxx")
29
+ print(f"API_KEY: {API_KEY}")  # NOTE(review): this prints a secret to stdout — consider masking
31
 
32
  if not os.path.exists(WORKING_DIR):
33
  os.mkdir(WORKING_DIR)
 
40
  prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
41
  ) -> str:
42
  return await openai_complete_if_cache(
43
+ model=LLM_MODEL,
44
+ prompt=prompt,
45
  system_prompt=system_prompt,
46
  history_messages=history_messages,
47
+ base_url=BASE_URL,
48
+ api_key=API_KEY,
49
  **kwargs,
50
  )
51
 
 
55
 
56
  async def embedding_func(texts: list[str]) -> np.ndarray:
57
  return await openai_embed(
58
+ texts=texts,
59
  model=EMBEDDING_MODEL,
60
+ base_url=BASE_URL,
61
+ api_key=API_KEY,
62
  )
63
 
64
 
lightrag/kg/jsondocstatus_impl.py CHANGED
@@ -109,6 +109,22 @@ class JsonDocStatusStorage(DocStatusStorage):
109
  if v["status"] == DocStatus.PENDING
110
  }
111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  async def index_done_callback(self):
113
  """Save data to file after indexing"""
114
  write_json(self._data, self._file_name)
 
109
  if v["status"] == DocStatus.PENDING
110
  }
111
 
112
+ async def get_processed_docs(self) -> dict[str, DocProcessingStatus]:
113
+ """Get all processed documents"""
114
+ return {
115
+ k: DocProcessingStatus(**v)
116
+ for k, v in self._data.items()
117
+ if v["status"] == DocStatus.PROCESSED
118
+ }
119
+
120
+ async def get_processing_docs(self) -> dict[str, DocProcessingStatus]:
121
+ """Get all processing documents"""
122
+ return {
123
+ k: DocProcessingStatus(**v)
124
+ for k, v in self._data.items()
125
+ if v["status"] == DocStatus.PROCESSING
126
+ }
127
+
128
  async def index_done_callback(self):
129
  """Save data to file after indexing"""
130
  write_json(self._data, self._file_name)
lightrag/lightrag.py CHANGED
@@ -543,7 +543,7 @@ class LightRAG:
543
  new_docs = {doc_id: new_docs[doc_id] for doc_id in unique_new_doc_ids}
544
 
545
  if not new_docs:
546
- logger.info("All documents have been processed or are duplicates")
547
  return
548
 
549
  # 4. Store status document
@@ -560,15 +560,16 @@ class LightRAG:
560
  each chunk for entity and relation extraction, and updating the
561
  document status.
562
 
563
- 1. Get all pending and failed documents
564
  2. Split document content into chunks
565
  3. Process each chunk for entity and relation extraction
566
  4. Update the document status
567
  """
568
- # 1. get all pending and failed documents
569
  to_process_docs: dict[str, DocProcessingStatus] = {}
570
 
571
- # Fetch failed documents
 
572
  failed_docs = await self.doc_status.get_failed_docs()
573
  to_process_docs.update(failed_docs)
574
  pendings_docs = await self.doc_status.get_pending_docs()
@@ -599,6 +600,7 @@ class LightRAG:
599
  doc_status_id: {
600
  "status": DocStatus.PROCESSING,
601
  "updated_at": datetime.now().isoformat(),
 
602
  "content_summary": status_doc.content_summary,
603
  "content_length": status_doc.content_length,
604
  "created_at": status_doc.created_at,
@@ -635,6 +637,10 @@ class LightRAG:
635
  doc_status_id: {
636
  "status": DocStatus.PROCESSED,
637
  "chunks_count": len(chunks),
 
 
 
 
638
  "updated_at": datetime.now().isoformat(),
639
  }
640
  }
@@ -648,6 +654,10 @@ class LightRAG:
648
  doc_status_id: {
649
  "status": DocStatus.FAILED,
650
  "error": str(e),
 
 
 
 
651
  "updated_at": datetime.now().isoformat(),
652
  }
653
  }
 
543
  new_docs = {doc_id: new_docs[doc_id] for doc_id in unique_new_doc_ids}
544
 
545
  if not new_docs:
546
+ logger.info("No new unique documents were found.")
547
  return
548
 
549
  # 4. Store status document
 
560
  each chunk for entity and relation extraction, and updating the
561
  document status.
562
 
563
+ 1. Get all pending, failed, and abnormally terminated processing documents.
564
  2. Split document content into chunks
565
  3. Process each chunk for entity and relation extraction
566
  4. Update the document status
567
  """
568
+ # 1. Get all pending, failed, and abnormally terminated processing documents.
569
  to_process_docs: dict[str, DocProcessingStatus] = {}
570
 
571
+ processing_docs = await self.doc_status.get_processing_docs()
572
+ to_process_docs.update(processing_docs)
573
  failed_docs = await self.doc_status.get_failed_docs()
574
  to_process_docs.update(failed_docs)
575
  pendings_docs = await self.doc_status.get_pending_docs()
 
600
  doc_status_id: {
601
  "status": DocStatus.PROCESSING,
602
  "updated_at": datetime.now().isoformat(),
603
+ "content": status_doc.content,
604
  "content_summary": status_doc.content_summary,
605
  "content_length": status_doc.content_length,
606
  "created_at": status_doc.created_at,
 
637
  doc_status_id: {
638
  "status": DocStatus.PROCESSED,
639
  "chunks_count": len(chunks),
640
+ "content": status_doc.content,
641
+ "content_summary": status_doc.content_summary,
642
+ "content_length": status_doc.content_length,
643
+ "created_at": status_doc.created_at,
644
  "updated_at": datetime.now().isoformat(),
645
  }
646
  }
 
654
  doc_status_id: {
655
  "status": DocStatus.FAILED,
656
  "error": str(e),
657
+ "content": status_doc.content,
658
+ "content_summary": status_doc.content_summary,
659
+ "content_length": status_doc.content_length,
660
+ "created_at": status_doc.created_at,
661
  "updated_at": datetime.now().isoformat(),
662
  }
663
  }
lightrag/llm/openai.py CHANGED
@@ -103,17 +103,17 @@ async def openai_complete_if_cache(
103
  ) -> str:
104
  if history_messages is None:
105
  history_messages = []
106
- if api_key:
107
- os.environ["OPENAI_API_KEY"] = api_key
108
 
109
  default_headers = {
110
  "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_8) LightRAG/{__api_version__}",
111
  "Content-Type": "application/json",
112
  }
113
  openai_async_client = (
114
- AsyncOpenAI(default_headers=default_headers)
115
  if base_url is None
116
- else AsyncOpenAI(base_url=base_url, default_headers=default_headers)
117
  )
118
  kwargs.pop("hashing_kv", None)
119
  kwargs.pop("keyword_extraction", None)
@@ -294,17 +294,17 @@ async def openai_embed(
294
  base_url: str = None,
295
  api_key: str = None,
296
  ) -> np.ndarray:
297
- if api_key:
298
- os.environ["OPENAI_API_KEY"] = api_key
299
 
300
  default_headers = {
301
  "User-Agent": f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_8) LightRAG/{__api_version__}",
302
  "Content-Type": "application/json",
303
  }
304
  openai_async_client = (
305
- AsyncOpenAI(default_headers=default_headers)
306
  if base_url is None
307
- else AsyncOpenAI(base_url=base_url, default_headers=default_headers)
308
  )
309
  response = await openai_async_client.embeddings.create(
310
  model=model, input=texts, encoding_format="float"
 
103
  ) -> str:
104
  if history_messages is None:
105
  history_messages = []
106
+ if not api_key:
107
+ api_key = os.environ["OPENAI_API_KEY"]
108
 
109
  default_headers = {
110
  "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_8) LightRAG/{__api_version__}",
111
  "Content-Type": "application/json",
112
  }
113
  openai_async_client = (
114
+ AsyncOpenAI(default_headers=default_headers, api_key=api_key)
115
  if base_url is None
116
+ else AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=api_key)
117
  )
118
  kwargs.pop("hashing_kv", None)
119
  kwargs.pop("keyword_extraction", None)
 
294
  base_url: str = None,
295
  api_key: str = None,
296
  ) -> np.ndarray:
297
+ if not api_key:
298
+ api_key = os.environ["OPENAI_API_KEY"]
299
 
300
  default_headers = {
301
  "User-Agent": f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_8) LightRAG/{__api_version__}",
302
  "Content-Type": "application/json",
303
  }
304
  openai_async_client = (
305
+ AsyncOpenAI(default_headers=default_headers, api_key=api_key)
306
  if base_url is None
307
+ else AsyncOpenAI(base_url=base_url, default_headers=default_headers, api_key=api_key)
308
  )
309
  response = await openai_async_client.embeddings.create(
310
  model=model, input=texts, encoding_format="float"