# (Page residue from the Hugging Face Spaces file viewer -- Space status: Sleeping)
# fetch_gaia_audio.py
import os
import re
import tempfile
from urllib.parse import urlparse

import requests
# Base URL of the scoring service; main() requests "<base>/questions" from it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# Destination of the downloaded (or converted) WAV file.
OUT_PATH = "/mnt/data/test.wav"
def main():
    """Fetch the GAIA question list and download the first audio URL found.

    Each question's ``attachments`` / ``attachment`` / ``audio`` field is
    checked first for a ``.mp3``/``.wav`` URL; if none of the questions has
    one, the ``question`` text of each is scanned instead. The first URL
    found is handed to ``download_audio``. When nothing is found a notice
    is printed and no download happens.

    Raises:
        requests.HTTPError: if the ``/questions`` request returns an error
            status (via ``raise_for_status``).
    """
    # 1) Fetch GAIA questions
    resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
    resp.raise_for_status()
    questions = resp.json()

    # 2) Try attachments field first
    for q in questions:
        for field in ("attachments", "attachment", "audio"):
            urls = q.get(field)
            if not urls:
                continue
            if isinstance(urls, str):
                urls = [urls]
            for url in urls:
                # Skip non-string entries (e.g. nested objects) instead of
                # letting the regex check raise TypeError on them.
                if isinstance(url, str) and is_media_url(url):
                    return download_audio(url)

    # 3) Fallback: regex scan in question text
    pattern = re.compile(r"(https?://\S+\.(?:mp3|wav))", re.IGNORECASE)
    for q in questions:
        text = q.get("question")
        # The field may be missing or null; only strings can be scanned.
        if not isinstance(text, str):
            continue
        match = pattern.search(text)
        if match:
            return download_audio(match.group(1))

    print("⚠️ No .mp3/.wav URL found in GAIA payload; skipping download.")
    return
def is_media_url(url: str) -> bool:
    """Return True if *url* is an http(s) URL ending in ``.mp3`` or ``.wav``.

    The scheme and the extension are matched case-insensitively. The
    extension must be the very end of the URL, so a URL carrying a query
    string after the extension is not accepted. Non-string input yields
    False rather than raising.
    """
    if not isinstance(url, str):
        return False
    return re.match(r"^https?://.*\.(?:mp3|wav)$", url, re.IGNORECASE) is not None
def download_audio(url: str):
    """Download the audio file at *url* and save it at or beside ``OUT_PATH``.

    A ``.mp3`` payload is converted to WAV with pydub when pydub can be
    imported; otherwise the raw MP3 bytes are saved next to ``OUT_PATH``
    with a ``.mp3`` extension. Any other payload is written unchanged to
    ``OUT_PATH``.

    Args:
        url: HTTP(S) location of the audio file.

    Raises:
        requests.HTTPError: if the download returns an error status
            (via ``raise_for_status``).
    """
    print(f"Downloading audio from {url}")
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    content = r.content

    # Read the extension from the URL path only, so a query string or
    # fragment ("clip.mp3?token=...") does not mask it.
    ext = os.path.splitext(urlparse(url).path)[1].lower()

    # Make sure the destination directory exists before any write.
    out_dir = os.path.dirname(OUT_PATH)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if ext == ".mp3":
        # try to convert to wav if pydub installed
        try:
            from pydub import AudioSegment
        except ImportError:
            # fallback: write raw mp3 bytes
            mp3_out = os.path.splitext(OUT_PATH)[0] + ".mp3"
            with open(mp3_out, "wb") as f:
                f.write(content)
            print(f"⚠ pydub not installed; saved MP3 to {mp3_out}")
            return

        # pydub is given a file path here, so stage the bytes in a
        # temporary file and always remove it afterwards.
        fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
        try:
            with os.fdopen(fd, "wb") as f:
                f.write(content)
            audio = AudioSegment.from_mp3(mp3_path)
            audio.export(OUT_PATH, format="wav")
        finally:
            os.remove(mp3_path)
        print(f"✔ Saved WAV to {OUT_PATH}")
        return

    # if it's .wav or any other, write directly
    with open(OUT_PATH, "wb") as f:
        f.write(content)
    print(f"✔ Saved WAV to {OUT_PATH}")
# Run the fetch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()