zrguo
committed on
Commit
·
604db50
1
Parent(s):
cd12e9c
Update raganything_example.py
Browse files- examples/raganything_example.py +156 -63
examples/raganything_example.py
CHANGED
@@ -11,9 +11,74 @@ This example shows how to:
|
|
11 |
import os
|
12 |
import argparse
|
13 |
import asyncio
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
15 |
-
from lightrag.utils import EmbeddingFunc
|
16 |
-
from raganything
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
|
19 |
async def process_with_rag(
|
@@ -31,15 +96,21 @@ async def process_with_rag(
|
|
31 |
output_dir: Output directory for RAG results
|
32 |
api_key: OpenAI API key
|
33 |
base_url: Optional base URL for API
|
|
|
34 |
"""
|
35 |
try:
|
36 |
-
#
|
37 |
-
|
38 |
-
working_dir=working_dir,
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
43 |
"gpt-4o-mini",
|
44 |
prompt,
|
45 |
system_prompt=system_prompt,
|
@@ -47,61 +118,64 @@ async def process_with_rag(
|
|
47 |
api_key=api_key,
|
48 |
base_url=base_url,
|
49 |
**kwargs,
|
50 |
-
),
|
51 |
-
vision_model_func=lambda prompt,
|
52 |
-
system_prompt=None,
|
53 |
-
history_messages=[],
|
54 |
-
image_data=None,
|
55 |
-
**kwargs: openai_complete_if_cache(
|
56 |
-
"gpt-4o",
|
57 |
-
"",
|
58 |
-
system_prompt=None,
|
59 |
-
history_messages=[],
|
60 |
-
messages=[
|
61 |
-
{"role": "system", "content": system_prompt}
|
62 |
-
if system_prompt
|
63 |
-
else None,
|
64 |
-
{
|
65 |
-
"role": "user",
|
66 |
-
"content": [
|
67 |
-
{"type": "text", "text": prompt},
|
68 |
-
{
|
69 |
-
"type": "image_url",
|
70 |
-
"image_url": {
|
71 |
-
"url": f"data:image/jpeg;base64,{image_data}"
|
72 |
-
},
|
73 |
-
},
|
74 |
-
],
|
75 |
-
}
|
76 |
-
if image_data
|
77 |
-
else {"role": "user", "content": prompt},
|
78 |
-
],
|
79 |
-
api_key=api_key,
|
80 |
-
base_url=base_url,
|
81 |
-
**kwargs,
|
82 |
)
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
api_key=api_key,
|
100 |
base_url=base_url,
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
),
|
103 |
)
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
# Process document
|
106 |
await rag.process_document_complete(
|
107 |
file_path=file_path, output_dir=output_dir, parse_method="auto"
|
@@ -114,14 +188,17 @@ async def process_with_rag(
|
|
114 |
"Tell me about the experimental results and data tables",
|
115 |
]
|
116 |
|
117 |
-
|
118 |
for query in queries:
|
119 |
-
|
120 |
result = await rag.query_with_multimodal(query, mode="hybrid")
|
121 |
-
|
122 |
|
123 |
except Exception as e:
|
124 |
-
|
|
|
|
|
|
|
125 |
|
126 |
|
127 |
def main():
|
@@ -135,12 +212,20 @@ def main():
|
|
135 |
"--output", "-o", default="./output", help="Output directory path"
|
136 |
)
|
137 |
parser.add_argument(
|
138 |
-
"--api-key",
|
|
|
|
|
139 |
)
|
140 |
parser.add_argument("--base-url", help="Optional base URL for API")
|
141 |
|
142 |
args = parser.parse_args()
|
143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
# Create output directory if specified
|
145 |
if args.output:
|
146 |
os.makedirs(args.output, exist_ok=True)
|
@@ -154,4 +239,12 @@ def main():
|
|
154 |
|
155 |
|
156 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
main()
|
|
|
11 |
import os
|
12 |
import argparse
|
13 |
import asyncio
|
14 |
+
import logging
|
15 |
+
import logging.config
|
16 |
+
from pathlib import Path
|
17 |
+
|
18 |
+
# Add project root directory to Python path
|
19 |
+
import sys
|
20 |
+
|
21 |
+
sys.path.append(str(Path(__file__).parent.parent))
|
22 |
+
|
23 |
from lightrag.llm.openai import openai_complete_if_cache, openai_embed
|
24 |
+
from lightrag.utils import EmbeddingFunc, logger, set_verbose_debug
|
25 |
+
from raganything import RAGAnything, RAGAnythingConfig
|
26 |
+
|
27 |
+
|
28 |
+
def configure_logging():
    """Configure console + rotating-file logging for the example script.

    Reads optional environment variables:
        LOG_DIR:          directory for the log file (default: current directory)
        LOG_MAX_BYTES:    max log file size in bytes before rotation (default 10MB)
        LOG_BACKUP_COUNT: number of rotated backups to keep (default 5)
        VERBOSE:          "true" enables verbose debug output
    """
    # Get log directory path from environment variable or use current directory
    log_dir = os.getenv("LOG_DIR", os.getcwd())
    log_file_path = os.path.abspath(os.path.join(log_dir, "raganything_example.log"))

    print(f"\nRAGAnything example log file: {log_file_path}\n")
    # FIX: the original called os.makedirs(os.path.dirname(log_dir), ...), which
    # creates the *parent* of LOG_DIR rather than the directory the log file
    # lives in, so the RotatingFileHandler fails when LOG_DIR doesn't exist.
    os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

    # Get log file max size and backup count from environment variables
    log_max_bytes = int(os.getenv("LOG_MAX_BYTES", 10485760))  # Default 10MB
    log_backup_count = int(os.getenv("LOG_BACKUP_COUNT", 5))  # Default 5 backups

    logging.config.dictConfig(
        {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {
                "default": {
                    "format": "%(levelname)s: %(message)s",
                },
                "detailed": {
                    "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                },
            },
            "handlers": {
                "console": {
                    "formatter": "default",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stderr",
                },
                "file": {
                    "formatter": "detailed",
                    "class": "logging.handlers.RotatingFileHandler",
                    "filename": log_file_path,
                    "maxBytes": log_max_bytes,
                    "backupCount": log_backup_count,
                    "encoding": "utf-8",
                },
            },
            "loggers": {
                # Route only the lightrag logger; propagate=False keeps records
                # out of the root logger so they aren't emitted twice.
                "lightrag": {
                    "handlers": ["console", "file"],
                    "level": "INFO",
                    "propagate": False,
                },
            },
        }
    )

    # Set the logger level to INFO
    logger.setLevel(logging.INFO)
    # Enable verbose debug if needed
    set_verbose_debug(os.getenv("VERBOSE", "false").lower() == "true")
|
82 |
|
83 |
|
84 |
async def process_with_rag(
|
|
|
96 |
output_dir: Output directory for RAG results
|
97 |
api_key: OpenAI API key
|
98 |
base_url: Optional base URL for API
|
99 |
+
working_dir: Working directory for RAG storage
|
100 |
"""
|
101 |
try:
|
102 |
+
# Create RAGAnything configuration
|
103 |
+
config = RAGAnythingConfig(
|
104 |
+
working_dir=working_dir or "./rag_storage",
|
105 |
+
mineru_parse_method="auto",
|
106 |
+
enable_image_processing=True,
|
107 |
+
enable_table_processing=True,
|
108 |
+
enable_equation_processing=True,
|
109 |
+
)
|
110 |
+
|
111 |
+
# Define LLM model function
|
112 |
+
def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs):
|
113 |
+
return openai_complete_if_cache(
|
114 |
"gpt-4o-mini",
|
115 |
prompt,
|
116 |
system_prompt=system_prompt,
|
|
|
118 |
api_key=api_key,
|
119 |
base_url=base_url,
|
120 |
**kwargs,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
)
|
122 |
+
|
123 |
+
# Define vision model function for image processing
|
124 |
+
def vision_model_func(
|
125 |
+
prompt, system_prompt=None, history_messages=[], image_data=None, **kwargs
|
126 |
+
):
|
127 |
+
if image_data:
|
128 |
+
return openai_complete_if_cache(
|
129 |
+
"gpt-4o",
|
130 |
+
"",
|
131 |
+
system_prompt=None,
|
132 |
+
history_messages=[],
|
133 |
+
messages=[
|
134 |
+
{"role": "system", "content": system_prompt}
|
135 |
+
if system_prompt
|
136 |
+
else None,
|
137 |
+
{
|
138 |
+
"role": "user",
|
139 |
+
"content": [
|
140 |
+
{"type": "text", "text": prompt},
|
141 |
+
{
|
142 |
+
"type": "image_url",
|
143 |
+
"image_url": {
|
144 |
+
"url": f"data:image/jpeg;base64,{image_data}"
|
145 |
+
},
|
146 |
+
},
|
147 |
+
],
|
148 |
+
}
|
149 |
+
if image_data
|
150 |
+
else {"role": "user", "content": prompt},
|
151 |
+
],
|
152 |
api_key=api_key,
|
153 |
base_url=base_url,
|
154 |
+
**kwargs,
|
155 |
+
)
|
156 |
+
else:
|
157 |
+
return llm_model_func(prompt, system_prompt, history_messages, **kwargs)
|
158 |
+
|
159 |
+
# Define embedding function
|
160 |
+
embedding_func = EmbeddingFunc(
|
161 |
+
embedding_dim=3072,
|
162 |
+
max_token_size=8192,
|
163 |
+
func=lambda texts: openai_embed(
|
164 |
+
texts,
|
165 |
+
model="text-embedding-3-large",
|
166 |
+
api_key=api_key,
|
167 |
+
base_url=base_url,
|
168 |
),
|
169 |
)
|
170 |
|
171 |
+
# Initialize RAGAnything with new dataclass structure
|
172 |
+
rag = RAGAnything(
|
173 |
+
config=config,
|
174 |
+
llm_model_func=llm_model_func,
|
175 |
+
vision_model_func=vision_model_func,
|
176 |
+
embedding_func=embedding_func,
|
177 |
+
)
|
178 |
+
|
179 |
# Process document
|
180 |
await rag.process_document_complete(
|
181 |
file_path=file_path, output_dir=output_dir, parse_method="auto"
|
|
|
188 |
"Tell me about the experimental results and data tables",
|
189 |
]
|
190 |
|
191 |
+
logger.info("\nQuerying processed document:")
|
192 |
for query in queries:
|
193 |
+
logger.info(f"\nQuery: {query}")
|
194 |
result = await rag.query_with_multimodal(query, mode="hybrid")
|
195 |
+
logger.info(f"Answer: {result}")
|
196 |
|
197 |
except Exception as e:
|
198 |
+
logger.error(f"Error processing with RAG: {str(e)}")
|
199 |
+
import traceback
|
200 |
+
|
201 |
+
logger.error(traceback.format_exc())
|
202 |
|
203 |
|
204 |
def main():
|
|
|
212 |
"--output", "-o", default="./output", help="Output directory path"
|
213 |
)
|
214 |
parser.add_argument(
|
215 |
+
"--api-key",
|
216 |
+
default=os.getenv("OPENAI_API_KEY"),
|
217 |
+
help="OpenAI API key (defaults to OPENAI_API_KEY env var)",
|
218 |
)
|
219 |
parser.add_argument("--base-url", help="Optional base URL for API")
|
220 |
|
221 |
args = parser.parse_args()
|
222 |
|
223 |
+
# Check if API key is provided
|
224 |
+
if not args.api_key:
|
225 |
+
logger.error("Error: OpenAI API key is required")
|
226 |
+
logger.error("Set OPENAI_API_KEY environment variable or use --api-key option")
|
227 |
+
return
|
228 |
+
|
229 |
# Create output directory if specified
|
230 |
if args.output:
|
231 |
os.makedirs(args.output, exist_ok=True)
|
|
|
239 |
|
240 |
|
241 |
if __name__ == "__main__":
    # Set up logging before any other work happens
    configure_logging()

    banner = "=" * 30
    print("RAGAnything Example")
    print(banner)
    print("Processing document with multimodal RAG pipeline")
    print(banner)

    main()
|