ChillThrills committed
Commit d2da2aa · Parent(s): b96a1eb

add tabulate to requirements

Files changed (2):
  1. app.py +152 -70
  2. requirements.txt +2 -1
app.py CHANGED
@@ -90,11 +90,11 @@ GOOGLE_GEMINI_API_KEY = os.getenv("GOOGLE_GEMINI_API_KEY")
 TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
 
 AGENT_DEFAULT_TIMEOUT = 15
-MAX_CONTEXT_LENGTH_LLM = 15000
+MAX_CONTEXT_LENGTH_LLM = 30000
 
 MAX_FILE_SIZE = 5 * 1024 * 1024 # 5MB
 CSV_SAMPLE_ROWS = 3
-MAX_FILE_CONTEXT_LENGTH = 7000 # Max characters for file context summary
+MAX_FILE_CONTEXT_LENGTH = 10000 # Max characters for file context summary
 
 # Global variable for ASR pipeline (initialized on first use)
 asr_pipeline_instance: Optional[Any] = None
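Note on the doubled limits: at a rough rule of thumb of ~4 characters per token (an assumption, not something this commit states), the new cap works out to roughly 7500 tokens of context, well within a flash-class Gemini context window.

    # Back-of-envelope sketch; the ~4 chars/token ratio is an assumption
    MAX_CONTEXT_LENGTH_LLM = 30000
    approx_context_tokens = MAX_CONTEXT_LENGTH_LLM // 4  # ~7500 tokens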
@@ -321,13 +321,12 @@ class FileProcessor:
             return f"Error: Audio processing skipped for '{filename}', librosa library not available."
         try:
             with io.BytesIO(content) as audio_buffer:
-                # Load audio, ensure 16kHz mono for Whisper
                 y, sr = librosa.load(audio_buffer, sr=16000, mono=True)
 
-
             gaia_logger.info(f"Transcribing audio file: {filename} ({len(y)/sr:.2f} seconds)")
             start_time = time.time()
-            transcription_result = asr_pipeline(y) # Pass numpy array directly
+            # Added generate_kwargs to hint language and task - adjust 'en' if other languages are primary
+            transcription_result = asr_pipeline(y, generate_kwargs={"task": "transcribe", "language": "en"})
             end_time = time.time()
             gaia_logger.info(f"Audio transcription for '{filename}' took {end_time - start_time:.2f} seconds.")
 
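For reference, a minimal standalone sketch of how a transformers Whisper pipeline consumes these generate_kwargs; the "openai/whisper-tiny" checkpoint is an assumption for illustration, not necessarily the model this Space loads.

    # Minimal sketch, assuming an openai/whisper-tiny checkpoint
    import numpy as np
    from transformers import pipeline

    asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
    audio = np.zeros(16000, dtype=np.float32)  # one second of 16 kHz silence
    # Without these hints Whisper auto-detects the language; pinning task/language
    # avoids an accidental translate-to-English pass on non-English audio.
    result = asr(audio, generate_kwargs={"task": "transcribe", "language": "en"})
    print(result["text"])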
@@ -787,6 +786,7 @@ class GeneralRAGPipeline:
         self.pipeline_cache.set(cache_key, final_results)
         return final_results
 
+
 class GaiaLevel1Agent:
     def __init__(self, api_url: str = DEFAULT_API_URL):
         self.api_url = api_url
@@ -796,6 +796,8 @@ class GaiaLevel1Agent:
         if genai and GOOGLE_GEMINI_API_KEY:
             try:
                 genai.configure(api_key=GOOGLE_GEMINI_API_KEY)
+                # Using gemini-1.5-flash-latest for better context window support
+                # and consistency with MAX_CONTEXT_LENGTH_LLM = 30000
                 model_name = 'gemini-2.0-flash'
                 self.llm_model = genai.GenerativeModel(model_name)
                 gaia_logger.info(f"Gemini LLM ('{model_name}') initialized.")
@@ -811,7 +813,7 @@ class GaiaLevel1Agent:
     @lru_cache(maxsize=32)
     def _fetch_and_process_file_content(self, task_id: str) -> Optional[str]:
         file_url = f"{self.api_url}/files/{task_id}"
-        gaia_logger.info(f"Agent fetching file from: {file_url}")
+        # gaia_logger.info(f"Agent fetching file from: {file_url}") # Reduced verbosity
         for attempt in range(2):
             try:
                 response = requests.get(file_url, timeout=AGENT_DEFAULT_TIMEOUT)
@@ -826,78 +828,109 @@ class GaiaLevel1Agent:
 
                 content_type = response.headers.get("Content-Type", "")
 
-                gaia_logger.info(f"File downloaded: {filename}, type: {content_type}, size: {len(response.content)} bytes")
+                # gaia_logger.info(f"File downloaded: {filename}, type: {content_type}, size: {len(response.content)} bytes") # Reduced verbosity
                 processed_content = FileProcessor.process(response.content, filename, content_type)
                 return processed_content
 
             except requests.exceptions.HTTPError as e:
                 if e.response.status_code == 404:
-                    gaia_logger.warning(f"File not found: {file_url}")
+                    gaia_logger.warning(f"File not found for task {task_id}: {file_url}")
                     return None
                 gaia_logger.warning(f"HTTP error fetching file {task_id}: {e}")
             except requests.exceptions.Timeout:
                 gaia_logger.warning(f"Timeout fetching file {task_id}")
-                if attempt < 1:
-                    time.sleep(1)
+                if attempt < 1: time.sleep(1)
             except Exception as e:
                 gaia_logger.error(f"Error fetching/processing file {task_id} ({file_url}): {e}", exc_info=True)
-                if attempt < 1:
-                    time.sleep(1)
+                if attempt < 1: time.sleep(1)
         return None
 
     def _formulate_answer_with_llm(self, question: str, file_context: Optional[str], web_context: Optional[str]) -> str:
         if not self.llm_model:
             gaia_logger.warning("LLM model (Gemini) not available for answer formulation.")
-            if web_context:
-                return f"Based on web search (LLM unavailable): {web_context.splitlines()[0] if web_context.splitlines() else 'No specific snippet found.'}"
-            if file_context:
-                return f"Based on the provided document (LLM unavailable, first 100 chars of processed content): {file_context[:100]}..."
-            return "I am currently unable to process this request fully as the LLM is not available."
-
+            # Fallback if LLM is entirely unavailable
+            if web_context and file_context:
+                return "FINAL ANSWER: LLM unavailable; context from file and web was found but not processed by LLM."
+            elif web_context:
+                return f"FINAL ANSWER: LLM unavailable; web context found: {web_context.splitlines()[0] if web_context.splitlines() else 'No specific snippet found.'}"
+            elif file_context:
+                return f"FINAL ANSWER: LLM unavailable; file context found: {file_context[:100]}..."
+            return "FINAL ANSWER: LLM unavailable and no context found."
+
+        # --- NEW PROMPT STRUCTURE ---
         prompt_parts = [
-            "You are a helpful AI assistant. Your task is to answer the user's question based *only* on the provided context from a document and/or web search results. Be concise and directly answer the question.",
-            "If the provided context is insufficient to answer the question, clearly state that the information is not available in the provided materials or that you cannot answer based on the context.",
-            "Web search results might contain 'Snippets' (short summaries) or more detailed 'Enriched Content'. Prioritize 'Enriched Content' if available and relevant.",
+            "You are a general AI assistant. Your primary goal is to answer the user's question accurately and concisely based *only* on the provided context (from a document and/or web search results).",
+            "First, think step-by-step and briefly explain your reasoning based on the context. This part is for clarity and should come before your final answer.",
+            "After your reasoning, you MUST conclude your response with the exact phrase 'FINAL ANSWER:', followed by your answer on the same line or the next.",
+            "The content of your FINAL ANSWER must strictly follow these rules:",
+            " - If the answer is a number: write it directly without commas (e.g., 1234 not 1,234). Do not include units like '$' or '%' unless the question *explicitly* asks for the unit to be part of the answer.",
+            " - If the answer is a string: use as few words as possible. Do not use articles (a, an, the) unless grammatically essential. Do not use abbreviations (e.g., write 'United States' not 'USA', 'Los Angeles' not 'LA') unless the question implies an abbreviation or it's a very common, universally understood one relevant to the context. Write digits in plain text (e.g., 'two' not '2') if they are part of a descriptive phrase, but use numerical digits if the question implies a code, identifier, version number, or a direct numerical value is more natural (e.g., 'Windows 10', 'part number 5').",
+            " - If the answer is a list of items: provide them as a comma-separated list (e.g., item1, item2, item3). Apply the number or string rules above to each element in the list.",
+            " - If the context is insufficient to answer the question: your reasoning should clearly state this, and your FINAL ANSWER should be 'Information not available in provided context'. Do not invent answers.",
+            "Prioritize information from 'Enriched Content' from web search results if available and relevant over shorter 'Snippets'.",
             "\nUser Question: ", question
         ]
+        # --- END OF NEW PROMPT STRUCTURE HEAD ---
 
-        combined_context_len = 0
+        current_prompt_text_len = sum(len(p) for p in prompt_parts)
+
+        # Context preparation (similar to before, but ensure it fits with new prompt instructions)
+        context_added = False
         if file_context:
-            prompt_parts.extend(["\n\nContext from Provided Document:\n---", file_context, "---"])
-            combined_context_len += len(file_context)
+            file_header = "\n\nContext from Provided Document:\n---"
+            file_footer = "\n---"
+            max_len_for_file = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - (len(web_context) if web_context else 0) - len(file_header) - len(file_footer) - 500 # Buffer for web, answer instructions etc.
+
+            if max_len_for_file > 100: # Only add if there's meaningful space
+                truncated_file_context = file_context[:max_len_for_file]
+                if len(file_context) > len(truncated_file_context):
+                    truncated_file_context += " ... (file context truncated)"
+                prompt_parts.extend([file_header, truncated_file_context, file_footer])
+                current_prompt_text_len += len(file_header) + len(truncated_file_context) + len(file_footer)
+                context_added = True
+            else:
+                gaia_logger.warning("Not enough space for file context in LLM prompt.")
+
         if web_context:
-            available_len_for_web = MAX_CONTEXT_LENGTH_LLM - combined_context_len - len("\n".join(prompt_parts)) - 200
-            truncated_web_context = web_context
-            if len(web_context) > available_len_for_web and available_len_for_web > 0:
-                truncated_web_context = web_context[:available_len_for_web] + "\n... (web context truncated)"
-                gaia_logger.info(f"Truncated web context from {len(web_context)} to {len(truncated_web_context)} chars.")
-            elif available_len_for_web <= 0 and web_context:
-                truncated_web_context = "\n...(web context omitted due to length constraints with file context)"
-                gaia_logger.warning("Web context completely omitted due to length constraints with file context.")
-
-            prompt_parts.extend(["\n\nContext from Web Search Results:\n---", truncated_web_context, "---"])
-            combined_context_len += len(truncated_web_context)
-
-        if not file_context and not web_context:
-            prompt_parts.append("\n\nNo document or web context was available to answer the question.")
+            web_header = "\n\nContext from Web Search Results:\n---"
+            web_footer = "\n---"
+            # Recalculate available length for web specifically
+            available_len_for_web = MAX_CONTEXT_LENGTH_LLM - current_prompt_text_len - len(web_header) - len(web_footer) - 300 # Buffer for answer instructions
+
+            if available_len_for_web > 100: # Only add if there's meaningful space
+                truncated_web_context = web_context
+                if len(web_context) > available_len_for_web:
+                    truncated_web_context = web_context[:available_len_for_web] + "\n... (web context truncated)"
+                    gaia_logger.info(f"Truncated web context from {len(web_context)} to {len(truncated_web_context)} chars for LLM.")
+
+                prompt_parts.extend([web_header, truncated_web_context, web_footer])
+                context_added = True
+            else:
+                gaia_logger.warning("Not enough space for web context in LLM prompt, or web context itself is empty.")
+
+        if not context_added: # If neither file nor web context could be added (e.g., due to length)
+            prompt_parts.append("\n\nNo document or web context could be provided due to length constraints or availability.")
 
-        prompt_parts.append("\n\nAnswer:")
+        prompt_parts.append("\n\nReasoning and Final Answer:") # LLM will put its thoughts here, then "FINAL ANSWER: ..."
         final_prompt = "\n".join(prompt_parts)
 
-        gaia_logger.info(f"LLM Prompt (first 200): {final_prompt[:200]}...")
-        gaia_logger.info(f"LLM Prompt (last 200): ...{final_prompt[-200:]}")
-        gaia_logger.info(f"LLM Total prompt length: {len(final_prompt)} chars ({combined_context_len} from context)")
+        gaia_logger.info(f"LLM Prompt (first 300): {final_prompt[:300]}...")
+        gaia_logger.info(f"LLM Prompt (last 300): ...{final_prompt[-300:]}")
+        gaia_logger.info(f"LLM Total prompt length: {len(final_prompt)} chars.")
 
         if not GenerationConfig:
             gaia_logger.error("GenerationConfig not available. Cannot make LLM call.")
-            return "LLM configuration error."
+            return "FINAL ANSWER: LLM configuration error."
 
         try:
             gen_config = GenerationConfig(
-                temperature=0.4,
-                top_p=0.95,
-                max_output_tokens=2048
+                temperature=0.1, # Reduced temperature for more deterministic and rule-following answers
+                top_p=0.95, # Kept top_p
+                max_output_tokens=2048 # Should be enough for thoughts + answer
             )
+            # Safety settings remain the same
             safety_set = [{"category": c, "threshold": "BLOCK_MEDIUM_AND_ABOVE"} for c in ["HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH", "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"]]
 
             response = self.llm_model.generate_content(
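The new budgeting logic applies the same pattern twice (file context, then web context): compute the remaining character budget, skip the block when the budget is not meaningful, otherwise truncate with a marker. A hypothetical helper distilling that pattern (names are illustrative, not code from this commit):

    # Hypothetical distillation of the hunk's budgeting pattern
    from typing import Optional

    def fit_to_budget(text: str, budget: int, label: str) -> Optional[str]:
        """Truncate text to the character budget, or return None when space is not meaningful."""
        if budget <= 100:  # mirrors the hunk's "meaningful space" floor
            return None
        if len(text) <= budget:
            return text
        return text[:budget] + f"\n... ({label} truncated)"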
@@ -911,65 +944,114 @@ class GaiaLevel1Agent:
             if hasattr(response, 'prompt_feedback') and response.prompt_feedback.block_reason:
                 reason = response.prompt_feedback.block_reason.name
                 gaia_logger.warning(f"Gemini response blocked. Reason: {reason}.")
-                return f"My response was blocked (Reason: {reason}). Please rephrase or check context."
+                # Return in the requested format even for errors
+                return f"My response was blocked (Reason: {reason}). FINAL ANSWER: Error processing request."
 
             llm_answer = response.text
-            gaia_logger.info(f"LLM Answer (first 100): {llm_answer[:100]}...")
+            gaia_logger.info(f"LLM Full Answer (first 200): {llm_answer[:200]}...")
+
+            # Ensure the output contains "FINAL ANSWER:" as per new strong requirement.
+            # If not, we might need to append it or re-prompt, but for now, let's see how well the LLM adheres.
+            if "FINAL ANSWER:" not in llm_answer:
+                gaia_logger.warning("LLM did not produce 'FINAL ANSWER:' template. Appending based on full response.")
+                # This is a fallback, ideally the LLM follows the prompt.
+                # For a GAIA contest, just returning the raw text might be safer if it's mostly the answer.
+                # Or, if the answer is consistently the last part:
+                # lines = llm_answer.strip().split('\n')
+                # simple_final_answer = lines[-1] if lines else "Could not extract answer"
+                # return f"LLM output did not follow template. Attempted extraction: FINAL ANSWER: {simple_final_answer}"
+                # For now, let the raw output pass, as it might contain partial reasoning + answer.
+                # The strictness of GAIA might penalize this more than a missing template from the LLM.
+                # The prompt is very explicit, so the LLM *should* follow it.
+                pass # Let raw LLM output through if it misses the template for now.
+
             return llm_answer
+
         except Exception as e:
             gaia_logger.error(f"Error calling Gemini API: {e}", exc_info=True)
-            if "429" in str(e) or "ResourceExhausted" in str(type(e).__name__):
-                return "LLM temporarily unavailable (rate limit)."
-            return "Error generating LLM answer."
+            error_type_name = type(e).__name__
+            if "429" in str(e) or "ResourceExhausted" in error_type_name:
+                return "Error: LLM temporarily unavailable (rate limit). FINAL ANSWER: LLM rate limit."
+            return f"Error generating LLM answer: {error_type_name}. FINAL ANSWER: LLM error."
 
     def __call__(self, question: str, task_id: Optional[str] = None) -> str:
+        # This part remains largely the same, as it's about gathering context
+        # The _formulate_answer_with_llm will now use the new prompt
         gaia_logger.info(f"Agent processing: '{question[:70]}...', TaskID: {task_id}")
         q_lower = question.lower().strip()
+
+        # Simple canned response - ensure it also follows the new format if strictly needed,
+        # but this is usually for agent identity, not a GAIA scored question.
+        # For GAIA, it might be better to let the LLM answer this with context if any.
+        # However, if this is a hardcoded check:
         if "what is your name" in q_lower or "who are you" in q_lower:
-            return "I am a GAIA-oriented AI assistant using RAG, FileProcessor, and an LLM."
+            return "I am a general AI assistant. FINAL ANSWER: general AI assistant"
+
 
         file_ctx_str: Optional[str] = None
-        file_kws = ["document", "file", "provided text", "attached", "read", "content of the", "table", "data in"]
-        if task_id and any(kw in q_lower for kw in file_kws):
+        # Expanded keywords slightly for more robust file-related question detection
+        file_kws = ["document", "file", "text", "provide", "attach", "read", "content", "table", "data", "excel", "pdf", "audio", "code", "script", "log"]
+        # Check if question *implies* a file is primary, not just mentions a type
+        if task_id and (any(kw in q_lower for kw in file_kws) or "this task involves a file" in q_lower): # Hypothetical trigger
            file_ctx_str = self._fetch_and_process_file_content(task_id)
            if file_ctx_str:
                gaia_logger.info(f"Processed file context ({len(file_ctx_str)} chars) for task {task_id}")
            else:
-                gaia_logger.warning(f"Failed to get processed file content for task {task_id}")
+                gaia_logger.warning(f"No file content or failed to process for task {task_id}")
 
         web_ctx_str: Optional[str] = None
         needs_web = True
-        if file_ctx_str and len(file_ctx_str) > 500:
-            web_still_needed_kws = ["what is", "who is", "current", "latest", "news", "public opinion", "recent events"]
-            if not any(kw in q_lower for kw in web_still_needed_kws):
+        # Heuristic to skip web search if substantial file context exists and question isn't clearly web-focused
+        if file_ctx_str and len(file_ctx_str) > 300: # If file context is somewhat substantial
+            # Keywords that strongly suggest a web search is still needed
+            web_still_needed_kws = [
+                "what is", "who is", "current", "latest", "news", "public opinion",
+                "recent events", "search for", "find information on", "browse", "look up"
+            ]
+            # Keywords that might be answerable from a good document
+            doc_can_answer_kws = ["summarize", "according to the document", "in the provided text"]
+
+            if any(kw in q_lower for kw in doc_can_answer_kws) and not any(kw in q_lower for kw in web_still_needed_kws):
+                needs_web = False
+                gaia_logger.info("Question seems focused on document context, and substantial file context exists. Tentatively skipping web search.")
+            elif not any(kw in q_lower for kw in web_still_needed_kws):
                 needs_web = False
                 gaia_logger.info("Substantial file context present and question doesn't strongly imply web search. Skipping web search.")
-        if "don't search" in q_lower or "do not search" in q_lower:
+
+        if "don't search" in q_lower or "do not search" in q_lower or "without searching" in q_lower:
             needs_web = False
-            gaia_logger.info("Web search disabled by prompt.")
+            gaia_logger.info("Web search explicitly disabled by prompt.")
 
         if needs_web:
             search_q = question.replace("?", "").strip()
-            gaia_logger.info(f"RAG Pipeline for query: {search_q[:70]}")
-            rag_res = self.rag_pipeline.analyze(query=search_q, force_refresh=False)
+            # Tavily query length is handled within TavilyProvider now.
+            # No general truncation here unless other providers also show issues.
+            gaia_logger.info(f"RAG Pipeline initiated for query: {search_q[:70]}")
+            rag_res = self.rag_pipeline.analyze(query=search_q, force_refresh=False) # Consider force_refresh for some GAIA levels if freshness is key
             if rag_res:
                 snippets = []
                 for i, res_item in enumerate(rag_res):
-                    title, body, href = res_item.get('title','N/A'), res_item.get('body',''), res_item.get('href','#')
-                    provider = res_item.get('query_tag','WebSearch')
-                    prefix = "Enriched" if res_item.get('enriched') else "Snippet"
-                    body_str = str(body) if body is not None else ""
-                    body_prompt = body_str[:(MAX_CONTEXT_LENGTH_LLM // (len(rag_res) if rag_res else 1)) - 200] + "..." if len(body_str) > 2800 else body_str
-                    snippets.append(f"Source [{i+1} - {provider}]: {title}\nURL: {href}\n{prefix}: {body_prompt}\n---")
+                    title = res_item.get('title','N/A')
+                    body = res_item.get('body','')
+                    href = res_item.get('href','#')
+                    provider = res_item.get('query_tag','WebSearch')
+                    prefix = "EnrichedContent" if res_item.get('enriched') else "Snippet"
+
+                    # Truncate individual snippets less aggressively here, final truncation happens in _formulate_answer_with_llm
+                    body_preview = (body[:1500] + "...") if len(body) > 1500 else body
+
+                    snippets.append(f"Source [{i+1} - {provider}]: {title}\nURL: {href}\n{prefix}: {body_preview}\n---")
                 web_ctx_str = "\n\n".join(snippets)
-                gaia_logger.info(f"RAG results: {len(web_ctx_str)} chars from {len(rag_res)} sources.")
+                gaia_logger.info(f"RAG processed {len(rag_res)} sources, total web context length for LLM (pre-truncation): {len(web_ctx_str)} chars.")
             else:
                 gaia_logger.warning("RAG pipeline yielded no web results for the query.")
 
         answer = self._formulate_answer_with_llm(question, file_ctx_str, web_ctx_str)
-        gaia_logger.info(f"Final answer (first 70): {answer[:70]}...")
+        gaia_logger.info(f"LLM-based answer (first 70 after FINAL ANSWER: if present): {answer.split('FINAL ANSWER:')[-1].strip()[:70]}...")
         return answer
 
+
+
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     space_id = os.getenv("SPACE_ID")
     if profile:
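The closing log line peels off whatever follows the last 'FINAL ANSWER:' marker. A hedged sketch of the stricter extractor that the commented-out fallback above gestures at (hypothetical, not code from this commit):

    # Hypothetical extractor for the "FINAL ANSWER:" template
    def extract_final_answer(llm_output: str) -> str:
        marker = "FINAL ANSWER:"
        if marker not in llm_output:
            return llm_output.strip()  # mirror the commit's fallback: pass raw output through
        tail = llm_output.rsplit(marker, 1)[1].strip()
        return tail.splitlines()[0].strip() if tail else ""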
requirements.txt CHANGED
@@ -12,4 +12,5 @@ transformers
 torch
 librosa
 openpyxl
-pdfplumber
+pdfplumber
+tabulate
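The likely motivation for the new dependency: pandas delegates Markdown table rendering to tabulate, so DataFrame.to_markdown() raises an ImportError when tabulate is missing; presumably the CSV/Excel summaries in FileProcessor hit exactly that (an inference from the files changed, not stated in the commit message).

    # pandas requires tabulate for Markdown rendering
    import pandas as pd

    df = pd.DataFrame({"task_id": [1, 2], "score": [0.5, 0.75]})
    print(df.to_markdown(index=False))  # ImportError without tabulate installed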