Spaces:

rm-lht
/

lightrag

Configuration error

App Files Files Community

LarFii commited on Oct 10, 2024

Commit

5c22708

1 Parent(s): 356b4e1

update

Browse files

Files changed (6) hide show

lightrag/__init__.py +1 -1
lightrag/base.py +116 -0
lightrag/prompt.py +256 -0
lightrag/storage.py +246 -0
lightrag/utils.py +165 -0
setup.py +1 -1

lightrag/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from .lightrag import LightRAG, QueryParam
-__version__ = "0.0.1"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/GraphEdit"

 from .lightrag import LightRAG, QueryParam
+__version__ = "0.0.2"
 __author__ = "Zirui Guo"
 __url__ = "https://github.com/HKUDS/GraphEdit"

lightrag/base.py ADDED Viewed

	@@ -0,0 +1,116 @@

+from dataclasses import dataclass, field
+from typing import TypedDict, Union, Literal, Generic, TypeVar
+import numpy as np
+from .utils import EmbeddingFunc
+TextChunkSchema = TypedDict(
+    "TextChunkSchema",
+    {"tokens": int, "content": str, "full_doc_id": str, "chunk_order_index": int},
+)
+T = TypeVar("T")
+@dataclass
+class QueryParam:
+    mode: Literal["local", "global", "hybird", "naive"] = "global"
+    only_need_context: bool = False
+    response_type: str = "Multiple Paragraphs"
+    top_k: int = 60
+    max_token_for_text_unit: int = 4000
+    max_token_for_global_context: int = 4000
+    max_token_for_local_context: int = 4000
+@dataclass
+class StorageNameSpace:
+    namespace: str
+    global_config: dict
+    async def index_done_callback(self):
+        """commit the storage operations after indexing"""
+        pass
+    async def query_done_callback(self):
+        """commit the storage operations after querying"""
+        pass
+@dataclass
+class BaseVectorStorage(StorageNameSpace):
+    embedding_func: EmbeddingFunc
+    meta_fields: set = field(default_factory=set)
+    async def query(self, query: str, top_k: int) -> list[dict]:
+        raise NotImplementedError
+    async def upsert(self, data: dict[str, dict]):
+        """Use 'content' field from value for embedding, use key as id.
+        If embedding_func is None, use 'embedding' field from value
+        """
+        raise NotImplementedError
+@dataclass
+class BaseKVStorage(Generic[T], StorageNameSpace):
+    async def all_keys(self) -> list[str]:
+        raise NotImplementedError
+    async def get_by_id(self, id: str) -> Union[T, None]:
+        raise NotImplementedError
+    async def get_by_ids(
+        self, ids: list[str], fields: Union[set[str], None] = None
+    ) -> list[Union[T, None]]:
+        raise NotImplementedError
+    async def filter_keys(self, data: list[str]) -> set[str]:
+        """return un-exist keys"""
+        raise NotImplementedError
+    async def upsert(self, data: dict[str, T]):
+        raise NotImplementedError
+    async def drop(self):
+        raise NotImplementedError
+@dataclass
+class BaseGraphStorage(StorageNameSpace):
+    async def has_node(self, node_id: str) -> bool:
+        raise NotImplementedError
+    async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
+        raise NotImplementedError
+    async def node_degree(self, node_id: str) -> int:
+        raise NotImplementedError
+    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
+        raise NotImplementedError
+    async def get_node(self, node_id: str) -> Union[dict, None]:
+        raise NotImplementedError
+    async def get_edge(
+        self, source_node_id: str, target_node_id: str
+    ) -> Union[dict, None]:
+        raise NotImplementedError
+    async def get_node_edges(
+        self, source_node_id: str
+    ) -> Union[list[tuple[str, str]], None]:
+        raise NotImplementedError
+    async def upsert_node(self, node_id: str, node_data: dict[str, str]):
+        raise NotImplementedError
+    async def upsert_edge(
+        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
+    ):
+        raise NotImplementedError
+    async def clustering(self, algorithm: str):
+        raise NotImplementedError
+    async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]:
+        raise NotImplementedError("Node embedding is not used in lightrag.")

lightrag/prompt.py ADDED Viewed

	@@ -0,0 +1,256 @@

+GRAPH_FIELD_SEP = "<SEP>"
+PROMPTS = {}
+PROMPTS["DEFAULT_TUPLE_DELIMITER"] = "<|>"
+PROMPTS["DEFAULT_RECORD_DELIMITER"] = "##"
+PROMPTS["DEFAULT_COMPLETION_DELIMITER"] = "<|COMPLETE|>"
+PROMPTS["process_tickers"] = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
+PROMPTS["DEFAULT_ENTITY_TYPES"] = ["organization", "person", "geo", "event"]
+PROMPTS[
+    "entity_extraction"
+] = """-Goal-
+Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
+-Steps-
+1. Identify all entities. For each identified entity, extract the following information:
+- entity_name: Name of the entity, capitalized
+- entity_type: One of the following types: [{entity_types}]
+- entity_description: Comprehensive description of the entity's attributes and activities
+Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>
+2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
+For each pair of related entities, extract the following information:
+- source_entity: name of the source entity, as identified in step 1
+- target_entity: name of the target entity, as identified in step 1
+- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
+- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
+- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
+Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)
+3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
+Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
+4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.
+5. When finished, output {completion_delimiter}
+######################
+-Examples-
+######################
+Example 1:
+Entity_types: [person, technology, mission, organization, location]
+Text:
+while Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor's authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan's shared commitment to discovery was an unspoken rebellion against Cruz's narrowing vision of control and order.
+Then Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”
+The underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor's, a wordless clash of wills softening into an uneasy truce.
+It was a small transformation, barely perceptible, but one that Alex noted with an inward nod. They had all been brought here by different paths
+################
+Output:
+("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter}
+("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter}
+("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter}
+("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter}
+("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter}
+("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor's authoritarian certainty and observes changes in Taylor's attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter}
+("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz's vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter}
+("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual respect"{tuple_delimiter}8){record_delimiter}
+("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan's commitment to discovery is in rebellion against Cruz's vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter}
+("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter}
+("content_keywords"{tuple_delimiter}"power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter}
+#############################
+Example 2:
+Entity_types: [person, technology, mission, organization, location]
+Text:
+They were no longer mere operatives; they had become guardians of a threshold, keepers of a message from a realm beyond stars and stripes. This elevation in their mission could not be shackled by regulations and established protocols—it demanded a new perspective, a new resolve.
+Tension threaded through the dialogue of beeps and static as communications with Washington buzzed in the background. The team stood, a portentous air enveloping them. It was clear that the decisions they made in the ensuing hours could redefine humanity's place in the cosmos or condemn them to ignorance and potential peril.
+Their connection to the stars solidified, the group moved to address the crystallizing warning, shifting from passive recipients to active participants. Mercer's latter instincts gained precedence— the team's mandate had evolved, no longer solely to observe and report but to interact and prepare. A metamorphosis had begun, and Operation: Dulce hummed with the newfound frequency of their daring, a tone set not by the earthly
+#############
+Output:
+("entity"{tuple_delimiter}"Washington"{tuple_delimiter}"location"{tuple_delimiter}"Washington is a location where communications are being received, indicating its importance in the decision-making process."){record_delimiter}
+("entity"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"mission"{tuple_delimiter}"Operation: Dulce is described as a mission that has evolved to interact and prepare, indicating a significant shift in objectives and activities."){record_delimiter}
+("entity"{tuple_delimiter}"The team"{tuple_delimiter}"organization"{tuple_delimiter}"The team is portrayed as a group of individuals who have transitioned from passive observers to active participants in a mission, showing a dynamic change in their role."){record_delimiter}
+("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Washington"{tuple_delimiter}"The team receives communications from Washington, which influences their decision-making process."{tuple_delimiter}"decision-making, external influence"{tuple_delimiter}7){record_delimiter}
+("relationship"{tuple_delimiter}"The team"{tuple_delimiter}"Operation: Dulce"{tuple_delimiter}"The team is directly involved in Operation: Dulce, executing its evolved objectives and activities."{tuple_delimiter}"mission evolution, active participation"{tuple_delimiter}9){completion_delimiter}
+("content_keywords"{tuple_delimiter}"mission evolution, decision-making, active participation, cosmic significance"){completion_delimiter}
+#############################
+Example 3:
+Entity_types: [person, role, technology, organization, event, location, concept]
+Text:
+their voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data.
+"It's like it's learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers' a whole new meaning."
+Alex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back."
+Together, they stood on the edge of the unknown, forging humanity's response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history.
+The encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation
+#############
+Output:
+("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter}
+("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter}
+("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter}
+("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter}
+("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial communication between humanity and an unknown intelligence."){record_delimiter}
+("entity"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity's Response is the collective action taken by Alex's team in response to a message from an unknown intelligence."){record_delimiter}
+("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter}
+("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter}
+("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity's Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity's Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter}
+("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter}
+("content_keywords"{tuple_delimiter}"first contact, control, communication, cosmic significance"){completion_delimiter}
+#############################
+-Real Data-
+######################
+Entity_types: {entity_types}
+Text: {input_text}
+######################
+Output:
+"""
+PROMPTS[
+    "summarize_entity_descriptions"
+] = """You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
+Given one or two entities, and a list of descriptions, all related to the same entity or group of entities.
+Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
+If the provided descriptions are contradictory, please resolve the contradictions and provide a single, coherent summary.
+Make sure it is written in third person, and include the entity names so we the have full context.
+#######
+-Data-
+Entities: {entity_name}
+Description List: {description_list}
+#######
+Output:
+"""
+PROMPTS[
+    "entiti_continue_extraction"
+] = """MANY entities were missed in the last extraction.  Add them below using the same format:
+"""
+PROMPTS[
+    "entiti_if_loop_extraction"
+] = """It appears some entities may have still been missed.  Answer YES | NO if there are still entities that need to be added.
+"""
+PROMPTS["fail_response"] = "Sorry, I'm not able to provide an answer to that question."
+PROMPTS[
+    "rag_response"
+] = """---Role---
+You are a helpful assistant responding to questions about data in the tables provided.
+---Goal---
+Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
+If you don't know the answer, just say so. Do not make anything up.
+Do not include information where the supporting evidence for it is not provided.
+---Target response length and format---
+{response_type}
+---Data tables---
+{context_data}
+---Goal---
+Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
+If you don't know the answer, just say so. Do not make anything up.
+Do not include information where the supporting evidence for it is not provided.
+---Target response length and format---
+{response_type}
+Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
+"""
+PROMPTS["keywords_extraction"] = """---Role---
+You are a helpful assistant tasked with identifying both high-level and low-level keywords in the user's query.
+---Goal---
+Given the query, list both high-level and low-level keywords. High-level keywords focus on overarching concepts or themes, while low-level keywords focus on specific entities, details, or concrete terms.
+---Instructions---
+- Output the keywords in JSON format.
+- The JSON should have two keys:
+  - "high_level_keywords" for overarching concepts or themes.
+  - "low_level_keywords" for specific entities or details.
+######################
+-Examples-
+######################
+Example 1:
+Query: "How does international trade influence global economic stability?"
+################
+Output:
+{{
+  "high_level_keywords": ["International trade", "Global economic stability", "Economic impact"],
+  "low_level_keywords": ["Trade agreements", "Tariffs", "Currency exchange", "Imports", "Exports"]
+}}
+#############################
+Example 2:
+Query: "What are the environmental consequences of deforestation on biodiversity?"
+################
+Output:
+{{
+  "high_level_keywords": ["Environmental consequences", "Deforestation", "Biodiversity loss"],
+  "low_level_keywords": ["Species extinction", "Habitat destruction", "Carbon emissions", "Rainforest", "Ecosystem"]
+}}
+#############################
+Example 3:
+Query: "What is the role of education in reducing poverty?"
+################
+Output:
+{{
+  "high_level_keywords": ["Education", "Poverty reduction", "Socioeconomic development"],
+  "low_level_keywords": ["School access", "Literacy rates", "Job training", "Income inequality"]
+}}
+#############################
+-Real Data-
+######################
+Query: {query}
+######################
+Output:
+"""
+PROMPTS[
+    "naive_rag_response"
+] = """You're a helpful assistant
+Below are the knowledge you know:
+{content_data}
+---
+If you don't know the answer or if the provided knowledge do not contain sufficient information to provide an answer, just say so. Do not make anything up.
+Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
+If you don't know the answer, just say so. Do not make anything up.
+Do not include information where the supporting evidence for it is not provided.
+---Target response length and format---
+{response_type}
+"""

lightrag/storage.py ADDED Viewed

	@@ -0,0 +1,246 @@

+import asyncio
+import html
+import json
+import os
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Any, Union, cast
+import pickle
+import hnswlib
+import networkx as nx
+import numpy as np
+from nano_vectordb import NanoVectorDB
+import xxhash
+from .utils import load_json, logger, write_json
+from .base import (
+    BaseGraphStorage,
+    BaseKVStorage,
+    BaseVectorStorage,
+)
+@dataclass
+class JsonKVStorage(BaseKVStorage):
+    def __post_init__(self):
+        working_dir = self.global_config["working_dir"]
+        self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
+        self._data = load_json(self._file_name) or {}
+        logger.info(f"Load KV {self.namespace} with {len(self._data)} data")
+    async def all_keys(self) -> list[str]:
+        return list(self._data.keys())
+    async def index_done_callback(self):
+        write_json(self._data, self._file_name)
+    async def get_by_id(self, id):
+        return self._data.get(id, None)
+    async def get_by_ids(self, ids, fields=None):
+        if fields is None:
+            return [self._data.get(id, None) for id in ids]
+        return [
+            (
+                {k: v for k, v in self._data[id].items() if k in fields}
+                if self._data.get(id, None)
+                else None
+            )
+            for id in ids
+        ]
+    async def filter_keys(self, data: list[str]) -> set[str]:
+        return set([s for s in data if s not in self._data])
+    async def upsert(self, data: dict[str, dict]):
+        left_data = {k: v for k, v in data.items() if k not in self._data}
+        self._data.update(left_data)
+        return left_data
+    async def drop(self):
+        self._data = {}
+@dataclass
+class NanoVectorDBStorage(BaseVectorStorage):
+    cosine_better_than_threshold: float = 0.2
+    def __post_init__(self):
+        self._client_file_name = os.path.join(
+            self.global_config["working_dir"], f"vdb_{self.namespace}.json"
+        )
+        self._max_batch_size = self.global_config["embedding_batch_num"]
+        self._client = NanoVectorDB(
+            self.embedding_func.embedding_dim, storage_file=self._client_file_name
+        )
+        self.cosine_better_than_threshold = self.global_config.get(
+            "cosine_better_than_threshold", self.cosine_better_than_threshold
+        )
+    async def upsert(self, data: dict[str, dict]):
+        logger.info(f"Inserting {len(data)} vectors to {self.namespace}")
+        if not len(data):
+            logger.warning("You insert an empty data to vector DB")
+            return []
+        list_data = [
+            {
+                "__id__": k,
+                **{k1: v1 for k1, v1 in v.items() if k1 in self.meta_fields},
+            }
+            for k, v in data.items()
+        ]
+        contents = [v["content"] for v in data.values()]
+        batches = [
+            contents[i : i + self._max_batch_size]
+            for i in range(0, len(contents), self._max_batch_size)
+        ]
+        embeddings_list = await asyncio.gather(
+            *[self.embedding_func(batch) for batch in batches]
+        )
+        embeddings = np.concatenate(embeddings_list)
+        for i, d in enumerate(list_data):
+            d["__vector__"] = embeddings[i]
+        results = self._client.upsert(datas=list_data)
+        return results
+    async def query(self, query: str, top_k=5):
+        embedding = await self.embedding_func([query])
+        embedding = embedding[0]
+        results = self._client.query(
+            query=embedding,
+            top_k=top_k,
+            better_than_threshold=self.cosine_better_than_threshold,
+        )
+        results = [
+            {**dp, "id": dp["__id__"], "distance": dp["__metrics__"]} for dp in results
+        ]
+        return results
+    async def index_done_callback(self):
+        self._client.save()
+@dataclass
+class NetworkXStorage(BaseGraphStorage):
+    @staticmethod
+    def load_nx_graph(file_name) -> nx.Graph:
+        if os.path.exists(file_name):
+            return nx.read_graphml(file_name)
+        return None
+    @staticmethod
+    def write_nx_graph(graph: nx.Graph, file_name):
+        logger.info(
+            f"Writing graph with {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges"
+        )
+        nx.write_graphml(graph, file_name)
+    @staticmethod
+    def stable_largest_connected_component(graph: nx.Graph) -> nx.Graph:
+        """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
+        Return the largest connected component of the graph, with nodes and edges sorted in a stable way.
+        """
+        from graspologic.utils import largest_connected_component
+        graph = graph.copy()
+        graph = cast(nx.Graph, largest_connected_component(graph))
+        node_mapping = {node: html.unescape(node.upper().strip()) for node in graph.nodes()}  # type: ignore
+        graph = nx.relabel_nodes(graph, node_mapping)
+        return NetworkXStorage._stabilize_graph(graph)
+    @staticmethod
+    def _stabilize_graph(graph: nx.Graph) -> nx.Graph:
+        """Refer to https://github.com/microsoft/graphrag/index/graph/utils/stable_lcc.py
+        Ensure an undirected graph with the same relationships will always be read the same way.
+        """
+        fixed_graph = nx.DiGraph() if graph.is_directed() else nx.Graph()
+        sorted_nodes = graph.nodes(data=True)
+        sorted_nodes = sorted(sorted_nodes, key=lambda x: x[0])
+        fixed_graph.add_nodes_from(sorted_nodes)
+        edges = list(graph.edges(data=True))
+        if not graph.is_directed():
+            def _sort_source_target(edge):
+                source, target, edge_data = edge
+                if source > target:
+                    temp = source
+                    source = target
+                    target = temp
+                return source, target, edge_data
+            edges = [_sort_source_target(edge) for edge in edges]
+        def _get_edge_key(source: Any, target: Any) -> str:
+            return f"{source} -> {target}"
+        edges = sorted(edges, key=lambda x: _get_edge_key(x[0], x[1]))
+        fixed_graph.add_edges_from(edges)
+        return fixed_graph
+    def __post_init__(self):
+        self._graphml_xml_file = os.path.join(
+            self.global_config["working_dir"], f"graph_{self.namespace}.graphml"
+        )
+        preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file)
+        if preloaded_graph is not None:
+            logger.info(
+                f"Loaded graph from {self._graphml_xml_file} with {preloaded_graph.number_of_nodes()} nodes, {preloaded_graph.number_of_edges()} edges"
+            )
+        self._graph = preloaded_graph or nx.Graph()
+        self._node_embed_algorithms = {
+            "node2vec": self._node2vec_embed,
+        }
+    async def index_done_callback(self):
+        NetworkXStorage.write_nx_graph(self._graph, self._graphml_xml_file)
+    async def has_node(self, node_id: str) -> bool:
+        return self._graph.has_node(node_id)
+    async def has_edge(self, source_node_id: str, target_node_id: str) -> bool:
+        return self._graph.has_edge(source_node_id, target_node_id)
+    async def get_node(self, node_id: str) -> Union[dict, None]:
+        return self._graph.nodes.get(node_id)
+    async def node_degree(self, node_id: str) -> int:
+        return self._graph.degree(node_id)
+    async def edge_degree(self, src_id: str, tgt_id: str) -> int:
+        return self._graph.degree(src_id) + self._graph.degree(tgt_id)
+    async def get_edge(
+        self, source_node_id: str, target_node_id: str
+    ) -> Union[dict, None]:
+        return self._graph.edges.get((source_node_id, target_node_id))
+    async def get_node_edges(self, source_node_id: str):
+        if self._graph.has_node(source_node_id):
+            return list(self._graph.edges(source_node_id))
+        return None
+    async def upsert_node(self, node_id: str, node_data: dict[str, str]):
+        self._graph.add_node(node_id, **node_data)
+    async def upsert_edge(
+        self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
+    ):
+        self._graph.add_edge(source_node_id, target_node_id, **edge_data)
+    async def embed_nodes(self, algorithm: str) -> tuple[np.ndarray, list[str]]:
+        if algorithm not in self._node_embed_algorithms:
+            raise ValueError(f"Node embedding algorithm {algorithm} not supported")
+        return await self._node_embed_algorithms[algorithm]()
+    async def _node2vec_embed(self):
+        from graspologic import embed
+        embeddings, nodes = embed.node2vec_embed(
+            self._graph,
+            **self.global_config["node2vec_params"],
+        )
+        nodes_ids = [self._graph.nodes[node_id]["id"] for node_id in nodes]
+        return embeddings, nodes_ids

lightrag/utils.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import asyncio
+import html
+import json
+import logging
+import os
+import re
+from dataclasses import dataclass
+from functools import wraps
+from hashlib import md5
+from typing import Any, Union
+import numpy as np
+import tiktoken
+ENCODER = None
+logger = logging.getLogger("lightrag")
+def set_logger(log_file: str):
+    logger.setLevel(logging.DEBUG)
+    file_handler = logging.FileHandler(log_file)
+    file_handler.setLevel(logging.DEBUG)
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+    file_handler.setFormatter(formatter)
+    if not logger.handlers:
+        logger.addHandler(file_handler)
+@dataclass
+class EmbeddingFunc:
+    embedding_dim: int
+    max_token_size: int
+    func: callable
+    async def __call__(self, *args, **kwargs) -> np.ndarray:
+        return await self.func(*args, **kwargs)
+def locate_json_string_body_from_string(content: str) -> Union[str, None]:
+    """Locate the JSON string body from a string"""
+    maybe_json_str = re.search(r"{.*}", content, re.DOTALL)
+    if maybe_json_str is not None:
+        return maybe_json_str.group(0)
+    else:
+        return None
+def convert_response_to_json(response: str) -> dict:
+    json_str = locate_json_string_body_from_string(response)
+    assert json_str is not None, f"Unable to parse JSON from response: {response}"
+    try:
+        data = json.loads(json_str)
+        return data
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to parse JSON: {json_str}")
+        raise e from None
+def compute_args_hash(*args):
+    return md5(str(args).encode()).hexdigest()
+def compute_mdhash_id(content, prefix: str = ""):
+    return prefix + md5(content.encode()).hexdigest()
+def limit_async_func_call(max_size: int, waitting_time: float = 0.0001):
+    """Add restriction of maximum async calling times for a async func"""
+    def final_decro(func):
+        """Not using async.Semaphore to aovid use nest-asyncio"""
+        __current_size = 0
+        @wraps(func)
+        async def wait_func(*args, **kwargs):
+            nonlocal __current_size
+            while __current_size >= max_size:
+                await asyncio.sleep(waitting_time)
+            __current_size += 1
+            result = await func(*args, **kwargs)
+            __current_size -= 1
+            return result
+        return wait_func
+    return final_decro
+def wrap_embedding_func_with_attrs(**kwargs):
+    """Wrap a function with attributes"""
+    def final_decro(func) -> EmbeddingFunc:
+        new_func = EmbeddingFunc(**kwargs, func=func)
+        return new_func
+    return final_decro
+def load_json(file_name):
+    if not os.path.exists(file_name):
+        return None
+    with open(file_name) as f:
+        return json.load(f)
+def write_json(json_obj, file_name):
+    with open(file_name, "w") as f:
+        json.dump(json_obj, f, indent=2, ensure_ascii=False)
+def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o"):
+    global ENCODER
+    if ENCODER is None:
+        ENCODER = tiktoken.encoding_for_model(model_name)
+    tokens = ENCODER.encode(content)
+    return tokens
+def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o"):
+    global ENCODER
+    if ENCODER is None:
+        ENCODER = tiktoken.encoding_for_model(model_name)
+    content = ENCODER.decode(tokens)
+    return content
+def pack_user_ass_to_openai_messages(*args: str):
+    roles = ["user", "assistant"]
+    return [
+        {"role": roles[i % 2], "content": content} for i, content in enumerate(args)
+    ]
+def split_string_by_multi_markers(content: str, markers: list[str]) -> list[str]:
+    """Split a string by multiple markers"""
+    if not markers:
+        return [content]
+    results = re.split("|".join(re.escape(marker) for marker in markers), content)
+    return [r.strip() for r in results if r.strip()]
+# Refer the utils functions of the official GraphRAG implementation:
+# https://github.com/microsoft/graphrag
+def clean_str(input: Any) -> str:
+    """Clean an input string by removing HTML escapes, control characters, and other unwanted characters."""
+    # If we get non-string input, just give it back
+    if not isinstance(input, str):
+        return input
+    result = html.unescape(input.strip())
+    # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python
+    return re.sub(r"[\x00-\x1f\x7f-\x9f]", "", result)
+def is_float_regex(value):
+    return bool(re.match(r"^[-+]?[0-9]*\.?[0-9]+$", value))
+def truncate_list_by_token_size(list_data: list, key: callable, max_token_size: int):
+    """Truncate a list of data by token size"""
+    if max_token_size <= 0:
+        return []
+    tokens = 0
+    for i, data in enumerate(list_data):
+        tokens += len(encode_string_by_tiktoken(key(data)))
+        if tokens > max_token_size:
+            return list_data[:i]
+    return list_data
+def list_of_list_to_csv(data: list[list]):
+    return "\n".join(
+        [",\t".join([str(data_dd) for data_dd in data_d]) for data_d in data]
+    )
+def save_data_to_file(data, file_name):
+    with open(file_name, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)

setup.py CHANGED Viewed

@@ -21,7 +21,7 @@ with open("./requirements.txt") as f:
         deps.append(line.strip())
 setuptools.setup(
-    name="light-rag",
     url=vars2readme["__url__"],
     version=vars2readme["__version__"],
     author=vars2readme["__author__"],

         deps.append(line.strip())
 setuptools.setup(
+    name="lightrag-hku",
     url=vars2readme["__url__"],
     version=vars2readme["__version__"],
     author=vars2readme["__author__"],