# Final_Assignment_D3MI4N / fetch_gaia_audio.py
# D3MI4N's picture
# include excel reader and audio tools
# ab62c9e
# fetch_gaia_audio.py
import os
import re
import requests
# Base URL of the scoring service; main() requests "<base>/questions" from it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# Destination of the downloaded audio. download_audio() writes the WAV here and
# derives the ".mp3" fallback path from it when pydub is not installed.
OUT_PATH = "/mnt/data/test.wav"
def main():
    """Find the first .mp3/.wav URL in the GAIA question payload and download it.

    Fetches ``{DEFAULT_API_URL}/questions`` and searches the returned questions
    in two passes:

    1. the ``attachments`` / ``attachment`` / ``audio`` field of each question
       (a single string or an iterable of strings);
    2. the free text stored under ``question``, scanned with a regex.

    The first URL found is handed to ``download_audio``. If no URL is found,
    a warning is printed and nothing is downloaded.

    Returns:
        The result of ``download_audio`` for the first URL found, else ``None``.

    Raises:
        requests.HTTPError: if the questions endpoint returns an error status.
    """
    # 1) Fetch GAIA questions
    resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
    resp.raise_for_status()
    questions = resp.json()

    # 2) Try attachments field first
    for q in questions:
        for field in ("attachments", "attachment", "audio"):
            urls = q.get(field)
            if not urls:
                continue
            if isinstance(urls, str):
                urls = [urls]
            for url in urls:
                # Entries come straight from JSON and may be dicts, numbers or
                # None; only strings can be matched against the URL pattern.
                if isinstance(url, str) and is_media_url(url):
                    return download_audio(url)

    # 3) Fallback: regex scan in question text
    pattern = re.compile(r"(https?://\S+\.(?:mp3|wav))", re.IGNORECASE)
    for q in questions:
        text = q.get("question", "")
        # A JSON null (or any non-string) here would make pattern.search raise.
        if not isinstance(text, str):
            continue
        match = pattern.search(text)
        if match:
            return download_audio(match.group(1))

    print("⚠️ No .mp3/.wav URL found in GAIA payload; skipping download.")
    return
def is_media_url(url: str) -> bool:
    """Return True if *url* is an http(s) URL ending in ``.mp3`` or ``.wav``.

    The check is case-insensitive and purely textual: the string must start
    with ``http://`` or ``https://`` and end with one of the two extensions,
    so URLs carrying a query string or fragment after the extension are
    rejected. Non-string values (e.g. raw JSON objects) are never media URLs
    and yield False instead of raising.
    """
    if not isinstance(url, str):
        return False
    return bool(re.match(r"^https?://.*\.(?:mp3|wav)$", url, re.IGNORECASE))
def download_audio(url: str):
    """Download the audio at *url* and save it at (or next to) ``OUT_PATH``.

    Behaviour depends on the extension found in the URL path:

    * ``.mp3`` with pydub importable: the bytes are written to a scratch
      ``tmp.mp3`` beside ``OUT_PATH``, converted to WAV at ``OUT_PATH``, and
      the scratch file is removed afterwards.
    * ``.mp3`` without pydub: the raw MP3 bytes are saved beside ``OUT_PATH``
      with an ``.mp3`` suffix.
    * anything else: the bytes are written unchanged to ``OUT_PATH``.

    Args:
        url: Location of the audio file.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        requests.HTTPError: if the download returns an error status.
    """
    # Imported here so the function is self-contained within this file.
    from urllib.parse import urlsplit

    print(f"Downloading audio from {url}")
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    content = r.content

    # Use only the URL path so a query string or fragment
    # ("clip.mp3?token=...") does not hide the real extension.
    ext = os.path.splitext(urlsplit(url).path)[1].lower()

    # Make sure the destination directory exists before writing anything.
    out_dir = os.path.dirname(OUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if ext == ".mp3":
        # try to convert to wav if pydub installed; keep the try limited to
        # the import so unrelated errors are not mistaken for a missing pydub
        try:
            from pydub import AudioSegment
        except ImportError:
            # fallback: write raw mp3 bytes
            mp3_out = os.path.splitext(OUT_PATH)[0] + ".mp3"
            with open(mp3_out, "wb") as f:
                f.write(content)
            print(f"⚠ pydub not installed; saved MP3 to {mp3_out}")
            return

        mp3_path = os.path.join(out_dir, "tmp.mp3")
        try:
            with open(mp3_path, "wb") as f:
                f.write(content)
            audio = AudioSegment.from_mp3(mp3_path)
            audio.export(OUT_PATH, format="wav")
        finally:
            # Do not leave the scratch file behind, even if conversion fails.
            if os.path.exists(mp3_path):
                os.remove(mp3_path)
        print(f"✔ Saved WAV to {OUT_PATH}")
        return

    # if it's .wav or any other, write directly
    with open(OUT_PATH, "wb") as f:
        f.write(content)
    print(f"✔ Saved WAV to {OUT_PATH}")
# Run the fetch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()