import logging
import os
from typing import Dict, List, Optional, Union

from dotenv import load_dotenv
from llama_index.core.agent.workflow import ReActAgent
from llama_index.core.schema import ImageDocument
from llama_index.core.tools import FunctionTool
from llama_index.llms.google_genai import GoogleGenAI

# Load environment variables
load_dotenv()

# Setup logging
logger = logging.getLogger(__name__)


def load_prompt_from_file(filename: str, default_prompt: str) -> str:
    """Load a prompt from a text file, falling back to a default on failure.

    Args:
        filename: Prompt file path, resolved relative to this script's directory.
        default_prompt: Fallback prompt used when the file cannot be read.

    Returns:
        The file contents, or ``default_prompt`` on any error.
    """
    # Resolve the path up front so the exception handlers can reference it
    # safely (previously it was computed inside the try block).
    prompt_path = os.path.join(os.path.dirname(__file__), filename)
    try:
        with open(prompt_path, "r") as f:
            prompt = f.read()
        logger.info(f"Successfully loaded prompt from {prompt_path}")
        return prompt
    except FileNotFoundError:
        # Fix: log the actual filename instead of the literal "(unknown)".
        logger.warning(f"Prompt file {filename} not found at {prompt_path}. Using default.")
        return default_prompt
    except Exception as e:
        logger.error(f"Error loading prompt file {filename}: {e}", exc_info=True)
        return default_prompt


# --- Core Figure Interpretation Logic (using Multi-Modal LLM) ---

def interpret_figure_with_llm(image_path: str, request: str) -> str:
    """Interprets a figure in an image based on a specific request using a multi-modal LLM.

    Args:
        image_path (str): Path to the image file containing the figure.
        request (str): The specific question or interpretation task
            (e.g., "Describe this chart", "Extract sales for Q3",
            "Identify the main trend").

    Returns:
        str: The interpretation result or an error message.
    """
    logger.info(f"Interpreting figure in image: {image_path} with request: {request}")

    # Check if image exists
    if not os.path.exists(image_path):
        logger.error(f"Image file not found: {image_path}")
        return f"Error: Image file not found at {image_path}"

    # LLM configuration (Must be a multi-modal model)
    # Ensure the selected model supports image input (e.g., gemini-1.5-pro)
    llm_model_name = os.getenv("FIGURE_INTERPRETATION_LLM_MODEL", "models/gemini-1.5-pro")
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        logger.error("GEMINI_API_KEY not found for figure interpretation LLM.")
        return "Error: GEMINI_API_KEY not set."

    try:
        # Initialize the multi-modal LLM
        llm = GoogleGenAI(api_key=gemini_api_key, model=llm_model_name)
        logger.info(f"Using figure interpretation LLM: {llm_model_name}")

        # Prepare the prompt for the multi-modal LLM.
        # The prompt needs to guide the LLM to act as the figure interpreter
        # based on the specific request.
        prompt = (
            f"You are an expert figure interpreter. Analyze the provided image containing a chart, graph, diagram, or table. "
            f"Focus *only* on the visual information present in the image. "
            f"Fulfill the following request accurately and concisely:\n\n"
            f"REQUEST: {request}\n\n"
            f"Based *only* on the image, provide the answer:"
        )

        # Load the image data (LlamaIndex integration might handle this differently
        # depending on version). Assuming a method to load image data compatible
        # with the LLM call. This might involve using ImageBlock or similar
        # structures in newer LlamaIndex versions.
        # NOTE(review): the exact API for multi-modal completion might vary;
        # this is a conceptual example — confirm against the installed version.
        from llama_index.core import SimpleDirectoryReader  # Example import

        # Load the image document
        reader = SimpleDirectoryReader(input_files=[image_path])
        image_documents = reader.load_data()

        if not image_documents or not isinstance(image_documents[0], ImageDocument):
            logger.error(f"Failed to load image as ImageDocument: {image_path}")
            return f"Error: Could not load image file {image_path} for analysis."

        # Make the multi-modal completion call
        response = llm.complete(
            prompt=prompt,
            image_documents=image_documents  # Pass the loaded image document(s)
        )

        interpretation = response.text.strip()
        logger.info("Figure interpretation successful.")
        return interpretation

    except FileNotFoundError:
        # This might be redundant due to the initial check, but good practice
        logger.error(f"Image file not found during LLM call: {image_path}")
        return f"Error: Image file not found at {image_path}"
    except ImportError as ie:
        logger.error(f"Missing library for multi-modal processing: {ie}")
        return f"Error: Missing required library for image processing ({ie})."
    except Exception as e:
        # Catch potential API errors or other issues
        logger.error(f"LLM call failed during figure interpretation: {e}", exc_info=True)
        # Check if the error suggests the model doesn't support images
        if "does not support image input" in str(e).lower():
            logger.error(f"The configured model {llm_model_name} does not support image input.")
            return f"Error: The configured LLM ({llm_model_name}) does not support image input. Please configure a multi-modal model."
        return f"Error during figure interpretation: {e}"


# --- Tool Definitions (Wrapping the core logic) ---
# These tools essentially pass the request to the core LLM function.

def describe_figure_tool_fn(image_path: str) -> str:
    """Provides a general description of the figure in the image (type, elements, topic)."""
    return interpret_figure_with_llm(image_path, "Describe this figure, including its type, main elements (axes, labels, legend), and overall topic.")


def extract_data_points_tool_fn(image_path: str, data_request: str) -> str:
    """Extracts specific data points or values from the figure in the image."""
    return interpret_figure_with_llm(image_path, f"Extract the following data points/values from the figure: {data_request}. If exact values are not clear, provide the closest estimate based on the visual.")


def identify_trends_tool_fn(image_path: str) -> str:
    """Identifies and describes trends or patterns shown in the figure in the image."""
    return interpret_figure_with_llm(image_path, "Analyze and describe the main trends or patterns shown in this figure.")


def compare_elements_tool_fn(image_path: str, comparison_request: str) -> str:
    """Compares different elements within the figure in the image."""
    return interpret_figure_with_llm(image_path, f"Compare the following elements within the figure: {comparison_request}. Be specific about the comparison based on the visual data.")


def summarize_figure_insights_tool_fn(image_path: str) -> str:
    """Summarizes the key insights or main message conveyed by the figure in the image."""
    return interpret_figure_with_llm(image_path, "Summarize the key insights or the main message conveyed by this figure.")


# --- Tool Definitions for Agent ---

describe_figure_tool = FunctionTool.from_defaults(
    fn=describe_figure_tool_fn,
    name="describe_figure",
    description="Provides a general description of the figure in the image (type, elements, topic). Input: image_path (str)."
)

extract_data_points_tool = FunctionTool.from_defaults(
    fn=extract_data_points_tool_fn,
    name="extract_data_points",
    description="Extracts specific data points/values from the figure. Input: image_path (str), data_request (str)."
)

identify_trends_tool = FunctionTool.from_defaults(
    fn=identify_trends_tool_fn,
    name="identify_trends",
    description="Identifies and describes trends/patterns in the figure. Input: image_path (str)."
)

compare_elements_tool = FunctionTool.from_defaults(
    fn=compare_elements_tool_fn,
    name="compare_elements",
    description="Compares different elements within the figure. Input: image_path (str), comparison_request (str)."
)

summarize_figure_insights_tool = FunctionTool.from_defaults(
    fn=summarize_figure_insights_tool_fn,
    name="summarize_figure_insights",
    description="Summarizes the key insights/main message of the figure. Input: image_path (str)."
)


# --- Agent Initialization ---

def initialize_figure_interpretation_agent() -> ReActAgent:
    """Initializes the Figure Interpretation Agent.

    Returns:
        A configured ``ReActAgent`` wired with the figure-interpretation tools.

    Raises:
        ValueError: If ``GEMINI_API_KEY`` is not set in the environment.
    """
    logger.info("Initializing FigureInterpretationAgent...")

    # Configuration for the agent's main LLM (can be the same multi-modal one)
    agent_llm_model = os.getenv("FIGURE_INTERPRETATION_AGENT_LLM_MODEL", "models/gemini-1.5-pro")
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        logger.error("GEMINI_API_KEY not found for FigureInterpretationAgent.")
        raise ValueError("GEMINI_API_KEY must be set for FigureInterpretationAgent")

    try:
        # Agent's LLM doesn't necessarily need to be multi-modal itself,
        # if the tools handle the multi-modal calls.
        # However, using a multi-modal one might allow more direct interaction patterns later.
        llm = GoogleGenAI(api_key=gemini_api_key, model=agent_llm_model)
        logger.info(f"Using agent LLM: {agent_llm_model}")

        # Load system prompt
        default_system_prompt = (
            "You are FigureInterpretationAgent... [Default prompt content - replace with actual]"  # Placeholder
        )
        system_prompt = load_prompt_from_file("../prompts/figure_interpretation_agent_prompt.txt", default_system_prompt)
        if system_prompt == default_system_prompt:
            logger.warning("Using default/fallback system prompt for FigureInterpretationAgent.")

        # Define available tools
        tools = [
            describe_figure_tool,
            extract_data_points_tool,
            identify_trends_tool,
            compare_elements_tool,
            summarize_figure_insights_tool
        ]

        # Define valid handoff targets
        valid_handoffs = [
            "planner_agent",   # To return results
            "research_agent",  # If context from figure needs further research
            "reasoning_agent"  # If interpretation needs logical analysis
        ]

        agent = ReActAgent(
            name="figure_interpretation_agent",
            description=(
                "Analyzes and interprets visual data representations (charts, graphs, tables) from image files. "
                "Can describe figures, extract data, identify trends, compare elements, and summarize insights."
            ),
            tools=tools,
            llm=llm,
            system_prompt=system_prompt,
            can_handoff_to=valid_handoffs,
            # Note: This agent inherently requires multi-modal input capabilities,
            # which are handled within its tools via a multi-modal LLM.
        )
        logger.info("FigureInterpretationAgent initialized successfully.")
        return agent

    except Exception as e:
        logger.error(f"Error during FigureInterpretationAgent initialization: {e}", exc_info=True)
        raise


# Example usage (for testing if run directly)
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.info("Running figure_interpretation_agent.py directly for testing...")

    # Check required keys
    required_keys = ["GEMINI_API_KEY"]
    missing_keys = [key for key in required_keys if not os.getenv(key)]
    if missing_keys:
        print(f"Error: Required environment variable(s) not set: {', '.join(missing_keys)}. Cannot run test.")
    else:
        # Check if a multi-modal model is likely configured (heuristic)
        model_name = os.getenv("FIGURE_INTERPRETATION_LLM_MODEL", "models/gemini-1.5-pro")
        if "pro" not in model_name.lower() and "vision" not in model_name.lower():
            print(f"Warning: Configured LLM {model_name} might not support image input. Tests may fail.")

        # Create a dummy image file for testing (requires Pillow)
        dummy_image_path = "dummy_figure.png"
        try:
            from PIL import Image, ImageDraw, ImageFont

            img = Image.new('RGB', (400, 200), color=(255, 255, 255))
            d = ImageDraw.Draw(img)
            # Try to load a default font, handle if not found
            try:
                font = ImageFont.truetype("arial.ttf", 15)  # Common font, might not exist
            except IOError:
                font = ImageFont.load_default()
                print("Arial font not found, using default PIL font.")
            d.text((10, 10), "Simple Bar Chart", fill=(0, 0, 0), font=font)
            d.rectangle([50, 50, 100, 150], fill=(255, 0, 0))  # Bar 1
            d.text((60, 160), "A", fill=(0, 0, 0), font=font)
            d.rectangle([150, 80, 200, 150], fill=(0, 0, 255))  # Bar 2
            d.text((160, 160), "B", fill=(0, 0, 0), font=font)
            img.save(dummy_image_path)
            print(f"Created dummy image file: {dummy_image_path}")

            # Test the tools directly
            print("\nTesting describe_figure...")
            desc = describe_figure_tool_fn(dummy_image_path)
            print(f"Description: {desc}")

            print("\nTesting extract_data_points (qualitative)...")
            extract_req = "Height of bar A vs Bar B"  # Qualitative request
            extract_res = extract_data_points_tool_fn(dummy_image_path, extract_req)
            print(f"Extraction Result: {extract_res}")

            print("\nTesting compare_elements...")
            compare_req = "Compare bar A and bar B"
            compare_res = compare_elements_tool_fn(dummy_image_path, compare_req)
            print(f"Comparison Result: {compare_res}")

            # Clean up dummy image
            os.remove(dummy_image_path)

        except ImportError:
            print("Pillow library not installed. Skipping direct tool tests that require image creation.")
            # Optionally, still try initializing the agent
            try:
                test_agent = initialize_figure_interpretation_agent()
                print("\nFigure Interpretation Agent initialized successfully (tool tests skipped).")
            except Exception as e:
                print(f"Error initializing agent: {e}")
        except Exception as e:
            print(f"Error during testing: {e}")
            if os.path.exists(dummy_image_path):
                os.remove(dummy_image_path)  # Ensure cleanup on error