Added the NVIDIA text-embedding API and an example of using the NVIDIA API for both the LLM and text embedding
Browse files- examples/lightrag_nvidia_demo.py +159 -0
- lightrag/llm.py +40 -0
examples/lightrag_nvidia_demo.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import asyncio
|
3 |
+
from lightrag import LightRAG, QueryParam
|
4 |
+
from lightrag.llm import openai_complete_if_cache, nvidia_openai_embedding, nvidia_openai_complete
|
5 |
+
from lightrag.utils import EmbeddingFunc
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
#for custom llm_model_func
|
9 |
+
from lightrag.utils import locate_json_string_body_from_string
|
10 |
+
|
11 |
+
WORKING_DIR = "./dickens"

# Create the working directory up front. exist_ok=True replaces the former
# "check, then mkdir" sequence, which could fail if the directory appeared
# between the check and the creation.
os.makedirs(WORKING_DIR, exist_ok=True)

# API key: taken from the NVIDIA_OPENAI_API_KEY environment variable when it is
# set to a non-empty value; otherwise the placeholder below is used, which you
# can replace with your own key.
NVIDIA_OPENAI_API_KEY = os.getenv("NVIDIA_OPENAI_API_KEY") or "nvapi-xxxx"
|
19 |
+
|
20 |
+
# using pre-defined function for nvidia LLM API. OpenAI compatible
|
21 |
+
# llm_model_func = nvidia_openai_complete
|
22 |
+
|
23 |
+
# If you want to write a custom llm_model_func that calls an LLM on the NVIDIA API, as in the other examples:
|
24 |
+
async def llm_model_func(
    prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
    """Complete ``prompt`` with an NVIDIA-hosted model via ``openai_complete_if_cache``.

    Args:
        prompt: Text forwarded as the prompt.
        system_prompt: Optional system prompt, forwarded unchanged.
        history_messages: Earlier messages, forwarded unchanged. The default
            list is shared between calls; this function only passes it on.
        keyword_extraction: When truthy, the completion is passed through
            ``locate_json_string_body_from_string`` before being returned.
        **kwargs: Extra keyword arguments forwarded to
            ``openai_complete_if_cache``.

    Returns:
        The completion, or the result of
        ``locate_json_string_body_from_string`` when ``keyword_extraction``
        is truthy.
    """
    model_name = "nvidia/llama-3.1-nemotron-70b-instruct"
    endpoint = "https://integrate.api.nvidia.com/v1"

    completion = await openai_complete_if_cache(
        model_name,
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        api_key=NVIDIA_OPENAI_API_KEY,
        base_url=endpoint,
        **kwargs,
    )

    if not keyword_extraction:
        return completion
    return locate_json_string_body_from_string(completion)
|
39 |
+
|
40 |
+
#custom embedding
|
41 |
+
nvidia_embed_model = "nvidia/nv-embedqa-e5-v5"
|
42 |
+
async def indexing_embedding_func(texts: list[str]) -> np.ndarray:
    """Embed ``texts`` for indexing by calling ``nvidia_openai_embedding``.

    The request uses ``input_type="passage"`` and the module-level
    ``nvidia_embed_model`` and ``NVIDIA_OPENAI_API_KEY`` values.

    Args:
        texts: Strings to embed.

    Returns:
        Whatever ``nvidia_openai_embedding`` returns for these inputs.
    """
    # Alternative model: "nvidia/llama-3.2-nv-embedqa-1b-v1"
    request_options = {
        "model": nvidia_embed_model,  # maximum 512 tokens
        "api_key": NVIDIA_OPENAI_API_KEY,
        "base_url": "https://integrate.api.nvidia.com/v1",
        "input_type": "passage",
        # Let the server truncate inputs longer than the maximum token count.
        "trunc": "END",
        "encode": "float",
    }
    return await nvidia_openai_embedding(texts, **request_options)
|
53 |
+
|
54 |
+
async def query_embedding_func(texts: list[str]) -> np.ndarray:
    """Embed ``texts`` for querying by calling ``nvidia_openai_embedding``.

    The request uses ``input_type="query"`` and the module-level
    ``nvidia_embed_model`` and ``NVIDIA_OPENAI_API_KEY`` values.

    Args:
        texts: Strings to embed.

    Returns:
        Whatever ``nvidia_openai_embedding`` returns for these inputs.
    """
    # Alternative model: "nvidia/llama-3.2-nv-embedqa-1b-v1"
    request_options = {
        "model": nvidia_embed_model,  # maximum 512 tokens
        "api_key": NVIDIA_OPENAI_API_KEY,
        "base_url": "https://integrate.api.nvidia.com/v1",
        "input_type": "query",
        # Let the server truncate inputs longer than the maximum token count.
        "trunc": "END",
        "encode": "float",
    }
    return await nvidia_openai_embedding(texts, **request_options)
|
65 |
+
|
66 |
+
# The embedding dimension is the same for both input types (passage and query)
|
67 |
+
async def get_embedding_dim():
    """Return the embedding width by embedding one probe sentence.

    Calls ``indexing_embedding_func`` with a single sentence and returns
    ``shape[1]`` of the result.
    """
    probe = await indexing_embedding_func(["This is a test sentence."])
    return probe.shape[1]
|
72 |
+
|
73 |
+
|
74 |
+
# function test
|
75 |
+
async def test_funcs():
    """Call the LLM and passage-embedding helpers once each and print the results."""
    sample = "How are you?"

    llm_reply = await llm_model_func(sample)
    print("llm_model_func: ", llm_reply)

    vectors = await indexing_embedding_func([sample])
    print("embedding_func: ", vectors)
|
81 |
+
|
82 |
+
|
83 |
+
# asyncio.run(test_funcs())
|
84 |
+
|
85 |
+
|
86 |
+
async def main():
    """Index ``./book.txt`` with LightRAG, then run the four demo queries.

    Steps, in order: detect the embedding dimension, build a LightRAG
    instance that embeds with ``indexing_embedding_func``, insert the book
    text, build a second instance on the same working directory that embeds
    with ``query_embedding_func``, and print the answers for the naive,
    local, global and hybrid modes. Any exception raised along the way is
    caught and printed.
    """
    question = "What are the top themes in this story?"
    # (banner, query mode) pairs, executed in this order.
    searches = (
        ("==============Naive===============", "naive"),
        ("==============local===============", "local"),
        ("==============global===============", "global"),
        ("==============hybrid===============", "hybrid"),
    )

    def build_rag(embed_func, dim):
        # Both instances are configured identically except for the embedding
        # function (passage-type for indexing, query-type for retrieval).
        return LightRAG(
            working_dir=WORKING_DIR,
            llm_model_func=llm_model_func,
            # llm_model_name="meta/llama3-70b-instruct",  # uncomment if needed
            embedding_func=EmbeddingFunc(
                embedding_dim=dim,
                # Maximum token size. Per the original author's note, inputs
                # can still exceed this limit, so the embedding functions ask
                # the server to truncate (trunc). Adjusting LightRAG's
                # tokenizer to fit the NVIDIA model is future work.
                max_token_size=512,
                func=embed_func,
            ),
        )

    try:
        embedding_dimension = await get_embedding_dim()
        print(f"Detected embedding dimension: {embedding_dimension}")

        # Indexing phase: embed with the passage-type function.
        rag = build_rag(indexing_embedding_func, embedding_dimension)

        # Read the book and insert it.
        with open("./book.txt", "r", encoding="utf-8") as book:
            await rag.ainsert(book.read())

        # Query phase: redefine rag so embeddings use the query-type function.
        rag = build_rag(query_embedding_func, embedding_dimension)

        for banner, mode in searches:
            print(banner)
            answer = await rag.aquery(question, param=QueryParam(mode=mode))
            print(answer)
    except Exception as e:
        print(f"An error occurred: {e}")
|
156 |
+
|
157 |
+
|
158 |
+
if __name__ == "__main__":
|
159 |
+
asyncio.run(main())
|
lightrag/llm.py
CHANGED
@@ -502,6 +502,20 @@ async def gpt_4o_mini_complete(
|
|
502 |
**kwargs,
|
503 |
)
|
504 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
505 |
|
506 |
async def azure_openai_complete(
|
507 |
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
|
@@ -588,6 +602,32 @@ async def openai_embedding(
|
|
588 |
return np.array([dp.embedding for dp in response.data])
|
589 |
|
590 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
591 |
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8191)
|
592 |
@retry(
|
593 |
stop=stop_after_attempt(3),
|
|
|
502 |
**kwargs,
|
503 |
)
|
504 |
|
505 |
+
async def nvidia_openai_complete(
    prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
) -> str:
    """Complete ``prompt`` through the NVIDIA endpoint via ``openai_complete_if_cache``.

    Args:
        prompt: Text forwarded as the prompt.
        system_prompt: Optional system prompt, forwarded unchanged.
        history_messages: Earlier messages, forwarded unchanged. The default
            list is shared between calls; this function only passes it on.
        keyword_extraction: When truthy, the completion is passed through
            ``locate_json_string_body_from_string`` before being returned.
        **kwargs: Extra keyword arguments forwarded to
            ``openai_complete_if_cache``.

    Returns:
        The completion, or the result of
        ``locate_json_string_body_from_string`` when ``keyword_extraction``
        is truthy.
    """
    model_name = "nvidia/llama-3.1-nemotron-70b-instruct"  # context length 128k
    endpoint = "https://integrate.api.nvidia.com/v1"

    completion = await openai_complete_if_cache(
        model_name,
        prompt,
        system_prompt=system_prompt,
        history_messages=history_messages,
        base_url=endpoint,
        **kwargs,
    )

    # TODO: use JSON API
    if not keyword_extraction:
        return completion
    return locate_json_string_body_from_string(completion)
|
519 |
|
520 |
async def azure_openai_complete(
|
521 |
prompt, system_prompt=None, history_messages=[], keyword_extraction=False, **kwargs
|
|
|
602 |
return np.array([dp.embedding for dp in response.data])
|
603 |
|
604 |
|
605 |
+
@wrap_embedding_func_with_attrs(embedding_dim=2048, max_token_size=512)
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    retry=retry_if_exception_type((RateLimitError, APIConnectionError, Timeout)),
)
async def nvidia_openai_embedding(
    texts: list[str],
    # Model list: https://build.nvidia.com/nim?filters=usecase%3Ausecase_text_to_embedding
    model: str = "nvidia/llama-3.2-nv-embedqa-1b-v1",
    base_url: str = "https://integrate.api.nvidia.com/v1",
    api_key: str = None,
    input_type: str = "passage",  # "query" for retrieval, "passage" for embedding
    trunc: str = "NONE",  # "NONE", "START" or "END"
    encode: str = "float",  # "float" or "base64"
) -> np.ndarray:
    """Request embeddings for ``texts`` through an ``AsyncOpenAI`` client.

    Side effect: when ``api_key`` is truthy it is written to
    ``os.environ["OPENAI_API_KEY"]`` before the client is created, so it
    stays set for the rest of the process.

    Args:
        texts: Strings sent as the ``input`` of the embeddings request.
        model: Model name sent with the request.
        base_url: Endpoint passed to ``AsyncOpenAI``; when ``None`` the client
            is created without a ``base_url`` argument.
        api_key: Optional key; see the side effect above.
        input_type: Sent as ``input_type`` in the request's ``extra_body``.
        trunc: Sent as ``truncate`` in the request's ``extra_body``.
        encode: Sent as the request's ``encoding_format``.

    Returns:
        ``np.array`` built from the ``embedding`` of each item in the
        response's ``data`` (presumably one row per input text; confirm
        against the API).
    """
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key

    if base_url is None:
        client = AsyncOpenAI()
    else:
        client = AsyncOpenAI(base_url=base_url)

    # Parameters sent outside the standard embeddings arguments.
    nvidia_options = {"input_type": input_type, "truncate": trunc}
    response = await client.embeddings.create(
        model=model,
        input=texts,
        encoding_format=encode,
        extra_body=nvidia_options,
    )

    vectors = [item.embedding for item in response.data]
    return np.array(vectors)
|
630 |
+
|
631 |
@wrap_embedding_func_with_attrs(embedding_dim=1536, max_token_size=8191)
|
632 |
@retry(
|
633 |
stop=stop_after_attempt(3),
|