Spaces:

rm-lht
/

lightrag

Configuration error

App Files Files Community

zrguo committed on Dec 29, 2024

Commit

3c5ab1e

unverified ·

2 Parent(s): 0b317b1 2d9f120

Merge pull request #525 from magicyuan876/main

Browse files

Files changed (3) hide show

lightrag/operate.py +60 -7
lightrag/prompt.py +19 -2
lightrag/storage.py +11 -1

lightrag/operate.py CHANGED Viewed

@@ -30,6 +30,7 @@ from .base import (
     QueryParam,
 )
 from .prompt import GRAPH_FIELD_SEP, PROMPTS
 def chunking_by_token_size(
@@ -128,6 +129,7 @@ async def _handle_single_relationship_extraction(
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
     )
@@ -445,6 +447,9 @@ async def extract_entities(
                 + dp["src_id"]
                 + dp["tgt_id"]
                 + dp["description"],
             }
             for dp in all_relationships_data
         }
@@ -733,9 +738,22 @@ async def _get_node_data(
     entities_context = list_of_list_to_csv(entites_section_list)
     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank"]
     ]
     for i, e in enumerate(use_relations):
         relations_section_list.append(
             [
                 i,
@@ -745,6 +763,7 @@ async def _get_node_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -892,7 +911,13 @@ async def _get_edge_data(
         *[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results]
     )
     edge_datas = [
-        {"src_id": k["src_id"], "tgt_id": k["tgt_id"], "rank": d, **v}
         for k, v, d in zip(results, edge_datas, edge_degree)
         if v is not None
     ]
@@ -916,9 +941,22 @@ async def _get_edge_data(
     )
     relations_section_list = [
-        ["id", "source", "target", "description", "keywords", "weight", "rank"]
     ]
     for i, e in enumerate(edge_datas):
         relations_section_list.append(
             [
                 i,
@@ -928,6 +966,7 @@ async def _get_edge_data(
                 e["keywords"],
                 e["weight"],
                 e["rank"],
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
@@ -1259,9 +1298,15 @@ async def mix_kg_vector_query(
             chunks_ids = [r["id"] for r in results]
             chunks = await text_chunks_db.get_by_ids(chunks_ids)
-            valid_chunks = [
-                chunk for chunk in chunks if chunk is not None and "content" in chunk
-            ]
             if not valid_chunks:
                 return None
@@ -1275,7 +1320,15 @@ async def mix_kg_vector_query(
             if not maybe_trun_chunks:
                 return None
-            return "\n--New Chunk--\n".join([c["content"] for c in maybe_trun_chunks])
         except Exception as e:
             logger.error(f"Error in get_vector_context: {e}")
             return None

     QueryParam,
 )
 from .prompt import GRAPH_FIELD_SEP, PROMPTS
+import time
 def chunking_by_token_size(
         description=edge_description,
         keywords=edge_keywords,
         source_id=edge_source_id,
+        metadata={"created_at": time.time()},
     )
                 + dp["src_id"]
                 + dp["tgt_id"]
                 + dp["description"],
+                "metadata": {
+                    "created_at": dp.get("metadata", {}).get("created_at", time.time())
+                },
             }
             for dp in all_relationships_data
         }
     entities_context = list_of_list_to_csv(entites_section_list)
     relations_section_list = [
+        [
+            "id",
+            "source",
+            "target",
+            "description",
+            "keywords",
+            "weight",
+            "rank",
+            "created_at",
+        ]
     ]
     for i, e in enumerate(use_relations):
+        created_at = e.get("created_at", "UNKNOWN")
+        # Convert timestamp to readable format
+        if isinstance(created_at, (int, float)):
+            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
             [
                 i,
                 e["keywords"],
                 e["weight"],
                 e["rank"],
+                created_at,
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
         *[knowledge_graph_inst.edge_degree(r["src_id"], r["tgt_id"]) for r in results]
     )
     edge_datas = [
+        {
+            "src_id": k["src_id"],
+            "tgt_id": k["tgt_id"],
+            "rank": d,
+            "created_at": k.get("__created_at__", None),  # Get the time metadata from the KV storage
+            **v,
+        }
         for k, v, d in zip(results, edge_datas, edge_degree)
         if v is not None
     ]
     )
     relations_section_list = [
+        [
+            "id",
+            "source",
+            "target",
+            "description",
+            "keywords",
+            "weight",
+            "rank",
+            "created_at",
+        ]
     ]
     for i, e in enumerate(edge_datas):
+        created_at = e.get("created_at", "Unknown")
+        # Convert timestamp to readable format
+        if isinstance(created_at, (int, float)):
+            created_at = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(created_at))
         relations_section_list.append(
             [
                 i,
                 e["keywords"],
                 e["weight"],
                 e["rank"],
+                created_at,
             ]
         )
     relations_context = list_of_list_to_csv(relations_section_list)
             chunks_ids = [r["id"] for r in results]
             chunks = await text_chunks_db.get_by_ids(chunks_ids)
+            valid_chunks = []
+            for chunk, result in zip(chunks, results):
+                if chunk is not None and "content" in chunk:
+                    # Merge chunk content and time metadata
+                    chunk_with_time = {
+                        "content": chunk["content"],
+                        "created_at": result.get("created_at", None),
+                    }
+                    valid_chunks.append(chunk_with_time)
             if not valid_chunks:
                 return None
             if not maybe_trun_chunks:
                 return None
+            # Include time information in content
+            formatted_chunks = []
+            for c in maybe_trun_chunks:
+                chunk_text = c["content"]
+                if c["created_at"]:
+                    chunk_text = f"[Created at: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(c['created_at']))}]\n{chunk_text}"
+                formatted_chunks.append(chunk_text)
+            return "\n--New Chunk--\n".join(formatted_chunks)
         except Exception as e:
             logger.error(f"Error in get_vector_context: {e}")
             return None

lightrag/prompt.py CHANGED Viewed

@@ -164,6 +164,12 @@ Generate a response of the target length and format that responds to the user's
 If you don't know the answer, just say so. Do not make anything up.
 Do not include information where the supporting evidence for it is not provided.
 ---Target response length and format---
 {response_type}
@@ -172,8 +178,7 @@ Do not include information where the supporting evidence for it is not provided.
 {context_data}
-Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
-"""
 PROMPTS["keywords_extraction"] = """---Role---
@@ -250,6 +255,12 @@ Generate a response of the target length and format that responds to the user's
 If you don't know the answer, just say so. Do not make anything up.
 Do not include information where the supporting evidence for it is not provided.
 ---Target response length and format---
 {response_type}
@@ -293,6 +304,12 @@ You are a professional assistant responsible for answering questions based on kn
 Generate a concise response that summarizes relevant points from the provided information. If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided.
 ---Data Sources---
 1. Knowledge Graph Data:

 If you don't know the answer, just say so. Do not make anything up.
 Do not include information where the supporting evidence for it is not provided.
+When handling relationships with timestamps:
+1. Each relationship has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting relationships, consider both the semantic content and the timestamp
+3. Don't automatically prefer the most recently created relationships - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
 ---Target response length and format---
 {response_type}
 {context_data}
+Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown."""
 PROMPTS["keywords_extraction"] = """---Role---
 If you don't know the answer, just say so. Do not make anything up.
 Do not include information where the supporting evidence for it is not provided.
+When handling content with timestamps:
+1. Each piece of content has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting information, consider both the content and the timestamp
+3. Don't automatically prefer the most recent content - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
 ---Target response length and format---
 {response_type}
 Generate a concise response that summarizes relevant points from the provided information. If you don't know the answer, just say so. Do not make anything up or include information where the supporting evidence is not provided.
+When handling information with timestamps:
+1. Each piece of information (both relationships and content) has a "created_at" timestamp indicating when we acquired this knowledge
+2. When encountering conflicting information, consider both the content/relationship and the timestamp
+3. Don't automatically prefer the most recent information - use judgment based on the context
+4. For time-specific queries, prioritize temporal information in the content before considering creation timestamps
 ---Data Sources---
 1. Knowledge Graph Data:

lightrag/storage.py CHANGED Viewed

@@ -7,6 +7,7 @@ from typing import Any, Union, cast, Dict
 import networkx as nx
 import numpy as np
 from nano_vectordb import NanoVectorDB
 from .utils import (
     logger,
@@ -87,9 +88,12 @@ class NanoVectorDBStorage(BaseVectorStorage):
         if not len(data):
             logger.warning("You insert an empty data to vector DB")
             return []
         list_data = [
             {
                 "__id__": k,
                 **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
             }
             for k, v in data.items()
@@ -132,7 +136,13 @@ class NanoVectorDBStorage(BaseVectorStorage):
             better_than_threshold=self.cosine_better_than_threshold,
         )
         results = [
-            {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results
         ]
         return results

 import networkx as nx
 import numpy as np
 from nano_vectordb import NanoVectorDB
+import time
 from .utils import (
     logger,
         if not len(data):
             logger.warning("You insert an empty data to vector DB")
             return []
+        current_time = time.time()
         list_data = [
             {
                 "__id__": k,
+                "__created_at__": current_time,
                 **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
             }
             for k, v in data.items()
             better_than_threshold=self.cosine_better_than_threshold,
         )
         results = [
+            {
+                **dp,
+                "id": dp["__id__"],
+                "distance": dp["__metrics__"],
+                "created_at": dp.get("__created_at__"),
+            }
+            for dp in results
         ]
         return results