# fetch_gaia_audio.py
"""Download the first .mp3/.wav file referenced by the GAIA question payload.

The script fetches the question list from ``DEFAULT_API_URL``, looks for an
audio URL first in the attachment-style fields of each question and then in
the question text, and saves the audio under ``OUT_PATH``.  MP3 files are
converted to WAV when pydub can be imported; otherwise the raw MP3 bytes are
saved next to ``OUT_PATH`` with an ``.mp3`` extension.
"""
import os
import re
import tempfile
from collections.abc import Iterable

import requests

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
OUT_PATH = "/mnt/data/test.wav"

# Question fields that may hold a single URL or a collection of URLs.
_ATTACHMENT_FIELDS = ("attachments", "attachment", "audio")

# Whole-string match for an http(s) URL whose .mp3/.wav extension is either
# the end of the URL or is followed by a query string / fragment.
# Group 1 captures the extension without the dot.
_MEDIA_URL_RE = re.compile(
    r"^https?://.*\.(mp3|wav)(?:[?#].*)?$", re.IGNORECASE
)

# Finds an audio URL embedded in free-form question text.
_TEXT_URL_RE = re.compile(r"(https?://\S+\.(?:mp3|wav))", re.IGNORECASE)


def main() -> None:
    """Fetch the GAIA questions and download the first audio URL found.

    Attachment-style fields of every question are checked first; the question
    text is only scanned when none of them yields a media URL.  Prints a
    warning and returns without downloading when no URL is found.

    Raises:
        requests.RequestException: if fetching the questions or the audio
            fails (including non-2xx responses via ``raise_for_status``).
    """
    # 1) Fetch GAIA questions
    resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
    resp.raise_for_status()
    payload = resp.json()

    # Only dict entries can be inspected with .get(); anything else (or a
    # payload that is not a list at all) is treated as "no questions" instead
    # of crashing with AttributeError.
    if isinstance(payload, list):
        questions = [q for q in payload if isinstance(q, dict)]
    else:
        questions = []

    # 2) Try attachments field first
    for q in questions:
        for field in _ATTACHMENT_FIELDS:
            for url in _url_candidates(q.get(field)):
                if is_media_url(url):
                    return download_audio(url)

    # 3) Fallback: regex scan in question text
    for q in questions:
        text = q.get("question", "")
        if not isinstance(text, str):
            continue
        match = _TEXT_URL_RE.search(text)
        if match:
            return download_audio(match.group(1))

    print("⚠️ No .mp3/.wav URL found in GAIA payload; skipping download.")
    return


def _url_candidates(value) -> list:
    """Normalize an attachment field value to a list of candidate entries."""
    if not value:
        return []
    if isinstance(value, str):
        return [value]
    if isinstance(value, Iterable):
        return list(value)
    # Scalars such as numbers or booleans cannot contain URLs.
    return []


def is_media_url(url: str) -> bool:
    """Return True if *url* is an http(s) URL pointing at an .mp3/.wav file.

    A query string or fragment after the extension is allowed.  Non-string
    values are never media URLs, so they return False instead of raising.
    """
    if not isinstance(url, str):
        return False
    return bool(_MEDIA_URL_RE.match(url))


def _media_extension(url: str) -> str:
    """Return the lower-cased extension of *url* (with leading dot)."""
    match = _MEDIA_URL_RE.match(url)
    if match:
        # Taken from the regex so a trailing ?query/#fragment is ignored.
        return "." + match.group(1).lower()
    # Not a recognized media URL: fall back to a plain path-style split.
    return os.path.splitext(url)[1].lower()


def download_audio(url: str) -> None:
    """Download *url* and store it as WAV under ``OUT_PATH``.

    MP3 downloads are converted to WAV with pydub when it can be imported;
    otherwise the raw MP3 bytes are written to ``OUT_PATH`` with its ``.wav``
    suffix replaced by ``.mp3``.  Any other content is written to ``OUT_PATH``
    unchanged.

    Raises:
        requests.RequestException: if the download fails.
        OSError: if the output location cannot be created or written.
    """
    print(f"Downloading audio from {url}")
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    content = r.content

    # Make sure the target directory exists before any file is written.
    out_dir = os.path.dirname(OUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if _media_extension(url) == ".mp3":
        # try to convert to wav if pydub installed; only the import is
        # guarded so genuine conversion errors are not mistaken for a
        # missing dependency.
        try:
            from pydub import AudioSegment
        except ImportError:
            # fallback: write raw mp3 bytes
            mp3_out = OUT_PATH.replace(".wav", ".mp3")
            with open(mp3_out, "wb") as f:
                f.write(content)
            print(f"⚠ pydub not installed; saved MP3 to {mp3_out}")
            return

        # pydub is handed a real file; use a unique temp file and always
        # remove it so no stale MP3 is left behind.
        fd, mp3_path = tempfile.mkstemp(suffix=".mp3", dir=out_dir or None)
        try:
            with os.fdopen(fd, "wb") as f:
                f.write(content)
            audio = AudioSegment.from_mp3(mp3_path)
            audio.export(OUT_PATH, format="wav")
        finally:
            os.remove(mp3_path)
        print(f"✔ Saved WAV to {OUT_PATH}")
        return

    # if it's .wav or any other, write directly
    with open(OUT_PATH, "wb") as f:
        f.write(content)
    print(f"✔ Saved WAV to {OUT_PATH}")


if __name__ == "__main__":
    main()