Commit d2da2aa
Parent(s): b96a1eb
add tabulate to requirements

Files changed: app.py (+152 -70), requirements.txt (+2 -1)
app.py CHANGED
@@ -90,11 +90,11 @@ GOOGLE_GEMINI_API_KEY = os.getenv("GOOGLE_GEMINI_API_KEY")
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

 AGENT_DEFAULT_TIMEOUT = 15
-MAX_CONTEXT_LENGTH_LLM =
+MAX_CONTEXT_LENGTH_LLM = 30000

 MAX_FILE_SIZE = 5 * 1024 * 1024 # 5MB
 CSV_SAMPLE_ROWS = 3
-MAX_FILE_CONTEXT_LENGTH =
+MAX_FILE_CONTEXT_LENGTH = 10000 # Max characters for file context summary

 # Global variable for ASR pipeline (initialized on first use)
 asr_pipeline_instance: Optional[Any] = None
@@ -321,13 +321,12 @@ class FileProcessor:
 return f"Error: Audio processing skipped for '{filename}', librosa library not available."
 try:
 with io.BytesIO(content) as audio_buffer:
-# Load audio, ensure 16kHz mono for Whisper
 y, sr = librosa.load(audio_buffer, sr=16000, mono=True)

-
 gaia_logger.info(f"Transcribing audio file: {filename} ({len(y)/sr:.2f} seconds)")
 start_time = time.time()
-
+# Added generate_kwargs to hint language and task - adjust 'en' if other languages are primary
+transcription_result = asr_pipeline(y, generate_kwargs={"task": "transcribe", "language": "en"})
 end_time = time.time()
 gaia_logger.info(f"Audio transcription for '{filename}' took {end_time - start_time:.2f} seconds.")

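The added transcription line relies on an asr_pipeline built from the asr_pipeline_instance global declared near line 100; the commit does not show how it is constructed. As a rough illustration only (the Whisper checkpoint name and helper name are assumptions), a lazy initializer could look like this:

from typing import Any, Optional
from transformers import pipeline

asr_pipeline_instance: Optional[Any] = None  # mirrors the module-level global in app.py

def get_asr_pipeline() -> Any:
    """Hypothetical helper: build the ASR pipeline once, on first use."""
    global asr_pipeline_instance
    if asr_pipeline_instance is None:
        # Model choice is a placeholder; the commit does not reveal which checkpoint is used.
        asr_pipeline_instance = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
    return asr_pipeline_instance

# Usage mirroring the added diff line (y is the 16 kHz mono array from librosa.load):
# result = get_asr_pipeline()(y, generate_kwargs={"task": "transcribe", "language": "en"})
# transcript = result["text"]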
@@ -787,6 +786,7 @@ class GeneralRAGPipeline:
 self.pipeline_cache.set(cache_key, final_results)
 return final_results

+
 class GaiaLevel1Agent:
 def __init__(self, api_url: str = DEFAULT_API_URL):
 self.api_url = api_url
@@ -796,6 +796,8 @@ class GaiaLevel1Agent:
 if genai and GOOGLE_GEMINI_API_KEY:
 try:
 genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
+# Using gemini-1.5-flash-latest for better context window support
+# and consistency with MAX_CONTEXT_LENGTH_LLM = 30000
 model_name = 'gemini-2.0-flash'
 self.llm_model = genai.GenerativeModel(model_name)
 gaia_logger.info(f"Gemini LLM ('{model_name}') initialized.")
@@ -811,7 +813,7 @@ class GaiaLevel1Agent:
 @lru_cache(maxsize=32)
 def _fetch_and_process_file_content(self, task_id: str) -> Optional[str]:
 file_url = f"{self.api_url}/files/{task_id}"
-gaia_logger.info(f"Agent fetching file from: {file_url}")
+# gaia_logger.info(f"Agent fetching file from: {file_url}") # Reduced verbosity
 for attempt in range(2):
 try:
 response = requests.get(file_url, timeout=AGENT_DEFAULT_TIMEOUT)
@@ -826,78 +828,109 @@ class GaiaLevel1Agent:

 content_type = response.headers.get("Content-Type", "")

-gaia_logger.info(f"File downloaded: {filename}, type: {content_type}, size: {len(response.content)} bytes")
+# gaia_logger.info(f"File downloaded: {filename}, type: {content_type}, size: {len(response.content)} bytes") # Reduced verbosity
 processed_content = FileProcessor.process(response.content, filename, content_type)
 return processed_content

 except requests.exceptions.HTTPError as e:
 if e.response.status_code == 404:
-gaia_logger.warning(f"File not found: {file_url}")
+gaia_logger.warning(f"File not found for task {task_id}: {file_url}")
 return None
 gaia_logger.warning(f"HTTP error fetching file {task_id}: {e}")
 except requests.exceptions.Timeout:
 gaia_logger.warning(f"Timeout fetching file {task_id}")
-if attempt < 1:
-time.sleep(1)
+if attempt < 1: time.sleep(1)
 except Exception as e:
 gaia_logger.error(f"Error fetching/processing file {task_id} ({file_url}): {e}", exc_info=True)
-if attempt < 1:
-time.sleep(1)
+if attempt < 1: time.sleep(1)
 return None

 def _formulate_answer_with_llm(self, question: str, file_context: Optional[str], web_context: Optional[str]) -> str:
 if not self.llm_model:
 gaia_logger.warning("LLM model (Gemini) not available for answer formulation.")
-if
-
-
-
-
-
+# Fallback if LLM is entirely unavailable
+if web_context and file_context:
+return "FINAL ANSWER: LLM unavailable; context from file and web was found but not processed by LLM."
+elif web_context:
+return f"FINAL ANSWER: LLM unavailable; web context found: {web_context.splitlines()[0] if web_context.splitlines() else 'No specific snippet found.'}"
+elif file_context:
+return f"FINAL ANSWER: LLM unavailable; file context found: {file_context[:100]}..."
+return "FINAL ANSWER: LLM unavailable and no context found."
+
+# --- NEW PROMPT STRUCTURE ---
 prompt_parts = [
-"You are a
-"
-"
+"You are a general AI assistant. Your primary goal is to answer the user's question accurately and concisely based *only* on the provided context (from a document and/or web search results).",
+"First, think step-by-step and briefly explain your reasoning based on the context. This part is for clarity and should come before your final answer.",
+"After your reasoning, you MUST conclude your response with the exact phrase 'FINAL ANSWER:', followed by your answer on the same line or the next.",
+"The content of your FINAL ANSWER must strictly follow these rules:",
+" - If the answer is a number: write it directly without commas (e.g., 1234 not 1,234). Do not include units like '$' or '%' unless the question *explicitly* asks for the unit to be part of the answer.",
+" - If the answer is a string: use as few words as possible. Do not use articles (a, an, the) unless grammatically essential. Do not use abbreviations (e.g., write 'United States' not 'USA', 'Los Angeles' not 'LA') unless the question implies an abbreviation or it's a very common, universally understood one relevant to the context. Write digits in plain text (e.g., 'two' not '2') if they are part of a descriptive phrase, but use numerical digits if the question implies a code, identifier, version number, or a direct numerical value is more natural (e.g., 'Windows 10', 'part number 5').",
+" - If the answer is a list of items: provide them as a comma-separated list (e.g., item1, item2, item3). Apply the number or string rules above to each element in the list.",
+" - If the context is insufficient to answer the question: your reasoning should clearly state this, and your FINAL ANSWER should be 'Information not available in provided context'. Do not invent answers.",
+"Prioritize information from 'Enriched Content' from web search results if available and relevant over shorter 'Snippets'.",
 "\nUser Question: ", question
 ]
+# --- END OF NEW PROMPT STRUCTURE HEAD ---

-
+current_prompt_text_len = sum(len(p) for p in prompt_parts)
+
+# Context preparation (similar to before, but ensure it fits with new prompt instructions)
+context_added = False
 if file_context:
-
-
+file_header = "\n\nContext from Provided Document:\n---"
+file_footer = "\n---"
+max_len_for_file = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - (len(web_context) if web_context else 0) - len(file_header) - len(file_footer) - 500 # Buffer for web, answer instructions etc.
+
+if max_len_for_file > 100 : # Only add if there's meaningful space
+truncated_file_context = file_context[:max_len_for_file]
+if len(file_context) > len(truncated_file_context):
+truncated_file_context += " ... (file context truncated)"
+prompt_parts.extend([file_header, truncated_file_context, file_footer])
+current_prompt_text_len += len(file_header) + len(truncated_file_context) + len(file_footer)
+context_added = True
+else:
+gaia_logger.warning("Not enough space for file context in LLM prompt.")
+
+
 if web_context:
-
-
-
-
-
-
-truncated_web_context =
-
-
-
-
-
-
-
+web_header = "\n\nContext from Web Search Results:\n---"
+web_footer = "\n---"
+# Recalculate available length for web specifically
+available_len_for_web = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len(web_header) - len(web_footer) - 300 # Buffer for answer instructions
+
+if available_len_for_web > 100: # Only add if there's meaningful space
+truncated_web_context = web_context
+if len(web_context) > available_len_for_web:
+truncated_web_context = web_context[:available_len_for_web] + "\n... (web context truncated)"
+gaia_logger.info(f"Truncated web context from {len(web_context)} to {len(truncated_web_context)} chars for LLM.")
+
+prompt_parts.extend([web_header, truncated_web_context, web_footer])
+context_added = True
+else:
+gaia_logger.warning("Not enough space for web context in LLM prompt, or web context itself is empty.")
+
+
+if not context_added: # If neither file nor web context could be added (e.g., due to length)
+prompt_parts.append("\n\nNo document or web context could be provided due to length constraints or availability.")

-prompt_parts.append("\n\
+prompt_parts.append("\n\nReasoning and Final Answer:") # LLM will put its thoughts here, then "FINAL ANSWER: ..."
 final_prompt = "\n".join(prompt_parts)

-gaia_logger.info(f"LLM Prompt (first
-gaia_logger.info(f"LLM Prompt (last
-gaia_logger.info(f"LLM Total prompt length: {len(final_prompt)} chars
+gaia_logger.info(f"LLM Prompt (first 300): {final_prompt[:300]}...")
+gaia_logger.info(f"LLM Prompt (last 300): ...{final_prompt[-300:]}")
+gaia_logger.info(f"LLM Total prompt length: {len(final_prompt)} chars.")

 if not GenerationConfig:
 gaia_logger.error("GenerationConfig not available. Cannot make LLM call.")
-return "LLM configuration error."
+return "FINAL ANSWER: LLM configuration error."

 try:
 gen_config = GenerationConfig(
-temperature=0.
-top_p=0.95,
-max_output_tokens=2048
+temperature=0.1, # Reduced temperature for more deterministic and rule-following answers
+top_p=0.95, # Kept top_p
+max_output_tokens=2048 # Should be enough for thoughts + answer
 )
+# Safety settings remain the same
 safety_set = [{"category": c, "threshold": "BLOCK_MEDIUM_AND_ABOVE"} for c in ["HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH", "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"]]

 response = self.llm_model.generate_content(
@@ -911,65 +944,114 @@ class GaiaLevel1Agent:
 if hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason:
 reason = response.prompt_feedback.block_reason.name
 gaia_logger.warning(f"Gemini response blocked. Reason: {reason}.")
-
+# Return in the requested format even for errors
+return f"My response was blocked (Reason: {reason}). FINAL ANSWER: Error processing request."

 llm_answer = response.text
-gaia_logger.info(f"LLM Answer (first
+gaia_logger.info(f"LLM Full Answer (first 200): {llm_answer[:200]}...")
+
+# Ensure the output contains "FINAL ANSWER:" as per new strong requirement.
+# If not, we might need to append it or re-prompt, but for now, let's see how well the LLM adheres.
+if "FINAL ANSWER:" not in llm_answer:
+gaia_logger.warning("LLM did not produce 'FINAL ANSWER:' template. Appending based on full response.")
+# This is a fallback, ideally the LLM follows the prompt.
+# For a GAIA contest, just returning the raw text might be safer if it's mostly the answer.
+# Or, if the answer is consistently the last part:
+# lines = llm_answer.strip().split('\n')
+# simple_final_answer = lines[-1] if lines else "Could not extract answer"
+# return f"LLM output did not follow template. Attempted extraction: FINAL ANSWER: {simple_final_answer}"
+# For now, let the raw output pass, as it might contain partial reasoning + answer.
+# The strictness of GAIA might penalize this more than a missing template from the LLM.
+# The prompt is very explicit, so the LLM *should* follow it.
+pass # Let raw LLM output through if it misses the template for now.
+
 return llm_answer
+
 except Exception as e:
 gaia_logger.error(f"Error calling Gemini API: {e}", exc_info=True)
-
-
-
+error_type_name = type(e).__name__
+if "429" in str(e) or "ResourceExhausted" in error_type_name:
+return "Error: LLM temporarily unavailable (rate limit). FINAL ANSWER: LLM rate limit."
+return f"Error generating LLM answer: {error_type_name}. FINAL ANSWER: LLM error."

 def __call__(self, question: str, task_id: Optional[str] = None) -> str:
+# This part remains largely the same, as it's about gathering context
+# The _formulate_answer_with_llm will now use the new prompt
 gaia_logger.info(f"Agent processing: '{question[:70]}...', TaskID: {task_id}")
 q_lower = question.lower().strip()
+
+# Simple canned response - ensure it also follows the new format if strictly needed,
+# but this is usually for agent identity, not a GAIA scored question.
+# For GAIA, it might be better to let the LLM answer this with context if any.
+# However, if this is a hardcoded check:
 if "what is your name" in q_lower or "who are you" in q_lower:
-return "I am a
+return "I am a general AI assistant. FINAL ANSWER: general AI assistant"
+

 file_ctx_str: Optional[str] = None
-
-
+# Expanded keywords slightly for more robust file-related question detection
+file_kws = ["document", "file", "text", "provide", "attach", "read", "content", "table", "data", "excel", "pdf", "audio", "code", "script", "log"]
+# Check if question *implies* a file is primary, not just mentions a type
+if task_id and (any(kw in q_lower for kw in file_kws) or "this task involves a file" in q_lower): # Hypothetical trigger
 file_ctx_str = self._fetch_and_process_file_content(task_id)
 if file_ctx_str:
 gaia_logger.info(f"Processed file context ({len(file_ctx_str)} chars) for task {task_id}")
 else:
-gaia_logger.warning(f"
+gaia_logger.warning(f"No file content or failed to process for task {task_id}")

 web_ctx_str: Optional[str] = None
 needs_web = True
-if
-
-
+# Heuristic to skip web search if substantial file context exists and question isn't clearly web-focused
+if file_ctx_str and len(file_ctx_str) > 300: # If file context is somewhat substantial
+# Keywords that strongly suggest a web search is still needed
+web_still_needed_kws = [
+"what is", "who is", "current", "latest", "news", "public opinion",
+"recent events", "search for", "find information on", "browse", "look up"
+]
+# Keywords that might be answerable from a good document
+doc_can_answer_kws = ["summarize", "according to the document", "in the provided text"]
+
+if any(kw in q_lower for kw in doc_can_answer_kws) and not any(kw in q_lower for kw in web_still_needed_kws):
+needs_web = False
+gaia_logger.info("Question seems focused on document context, and substantial file context exists. Tentatively skipping web search.")
+elif not any(kw in q_lower for kw in web_still_needed_kws):
 needs_web = False
 gaia_logger.info("Substantial file context present and question doesn't strongly imply web search. Skipping web search.")
-
+
+if "don't search" in q_lower or "do not search" in q_lower or "without searching" in q_lower:
 needs_web = False
-gaia_logger.info("Web search disabled by prompt.")
+gaia_logger.info("Web search explicitly disabled by prompt.")

 if needs_web:
 search_q = question.replace("?", "").strip()
-
-
+# Tavily query length is handled within TavilyProvider now.
+# No general truncation here unless other providers also show issues.
+gaia_logger.info(f"RAG Pipeline initiated for query: {search_q[:70]}")
+rag_res = self.rag_pipeline.analyze(query=search_q, force_refresh=False) # Consider force_refresh for some GAIA levels if freshness is key
 if rag_res:
 snippets = []
 for i, res_item in enumerate(rag_res):
-title
-
-
-
-
-
+title = res_item.get('title','N/A')
+body = res_item.get('body','')
+href = res_item.get('href','#')
+provider = res_item.get('query_tag','WebSearch')
+prefix = "EnrichedContent" if res_item.get('enriched') else "Snippet"
+
+# Truncate individual snippets less aggressively here, final truncation happens in _formulate_answer_with_llm
+body_preview = (body[:1500] + "...") if len(body) > 1500 else body
+
+snippets.append(f"Source [{i+1} - {provider}]: {title}\nURL: {href}\n{prefix}: {body_preview}\n---")
 web_ctx_str = "\n\n".join(snippets)
-gaia_logger.info(f"RAG
+gaia_logger.info(f"RAG processed {len(rag_res)} sources, total web context length for LLM (pre-truncation): {len(web_ctx_str)} chars.")
 else:
 gaia_logger.warning("RAG pipeline yielded no web results for the query.")

 answer = self._formulate_answer_with_llm(question, file_ctx_str, web_ctx_str)
-gaia_logger.info(f"
+gaia_logger.info(f"LLM-based answer (first 70 after FINAL ANSWER: if present): {answer.split('FINAL ANSWER:')[-1].strip()[:70]}...")
 return answer

+
+
 def run_and_submit_all(profile: gr.OAuthProfile | None):
 space_id = os.getenv("SPACE_ID")
 if profile:
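For reference, the prompt changes above require the model to end its response with a "FINAL ANSWER:" marker, and the new logging in __call__ already splits on that marker. A minimal extraction helper consistent with that convention (illustrative only, not part of this commit; the function name is an assumption) could look like:

def extract_final_answer(llm_answer: str) -> str:
    # Keep only the text after the last "FINAL ANSWER:" marker, as the new prompt requests;
    # fall back to the raw text when the model ignores the template (the commit also lets that pass through).
    marker = "FINAL ANSWER:"
    if marker in llm_answer:
        return llm_answer.split(marker)[-1].strip()
    return llm_answer.strip()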
requirements.txt CHANGED
@@ -12,4 +12,5 @@ transformers
 torch
 librosa
 openpyxl
-pdfplumber
+pdfplumber
+tabulate
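tabulate is the optional dependency pandas needs for DataFrame.to_markdown(), which is presumably how spreadsheet/CSV file context gets rendered for the LLM in FileProcessor. A quick check (illustrative data, not from the commit):

import pandas as pd

df = pd.DataFrame({"col_a": [1, 2], "col_b": ["x", "y"]})
print(df.to_markdown(index=False))  # raises ImportError if the tabulate package is missing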