{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/lirt/miniconda3/envs/gaia_env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "from huggingface_hub import hf_hub_download\n",
    "import json\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the validation-split metadata file from the Hugging Face Hub;\n",
    "# hf_hub_download returns the path of the downloaded file.\n",
    "metadata_file = hf_hub_download(\n",
    "    repo_id=\"gaia-benchmark/GAIA\",\n",
    "    filename=\"2023/validation/metadata.jsonl\",\n",
    "    repo_type=\"dataset\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parsed metadata records (one dict per line of the .jsonl file).\n",
    "questions_and_answers = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuild the list from scratch on every run, so re-executing this cell on its\n",
    "# own can never append a second copy of the records.\n",
    "# The encoding is given explicitly (JSON text is expected to be UTF-8) rather than\n",
    "# relying on the platform's locale default; blank lines are skipped because\n",
    "# json.loads would raise on them.\n",
    "with open(metadata_file, 'r', encoding='utf-8') as f:\n",
    "    questions_and_answers = [json.loads(line) for line in f if line.strip()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per parsed record; columns come from the record keys.\n",
    "df = pd.DataFrame(questions_and_answers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of validation questions: 165\n",
      "Columns: ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'Annotator Metadata']\n",
      "Levels: Level\n",
      "2 86\n",
      "1 53\n",
      "3 26\n",
      "Name: count, dtype: int64\n",
      "\n",
      "Sample question:\n",
      "Question: A paper about AI regulation that was originally submitted to arXiv.org in June 2022 shows a figure with three axes, where each axis has a label word at both ends. 
Which of these words is used to describe a type of society in a Physics and Society article submitted to arXiv.org on August 11, 2016?\n",
      "Answer: egalitarian\n",
      "Level: 2\n"
     ]
    }
   ],
   "source": [
    "# Summarise the dataset: row count, column names and per-level counts\n",
    "print(\n",
    "    f\"Number of validation questions: {len(df)}\",\n",
    "    f\"Columns: {df.columns.tolist()}\",\n",
    "    f\"Levels: {df['Level'].value_counts()}\",\n",
    "    sep=\"\\n\",\n",
    ")\n",
    "\n",
    "# Print the first record as an example (look the row up once, reuse it)\n",
    "first_record = df.iloc[0]\n",
    "print(\"\\nSample question:\")\n",
    "print(f\"Question: {first_record['Question']}\")\n",
    "print(f\"Answer: {first_record['Final answer']}\")\n",
    "print(f\"Level: {first_record['Level']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose Level equals 1 and renumber them from 0\n",
    "is_level1 = df['Level'] == 1\n",
    "df_level1 = df.loc[is_level1].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
| \n", " | task_id | \n", "Question | \n", "Level | \n", "Final answer | \n", "file_name | \n", "Annotator Metadata | \n", "
|---|---|---|---|---|---|---|
| 0 | \n", "e1fc63a2-da7a-432f-be78-7c4a95598703 | \n", "If Eliud Kipchoge could maintain his record-ma... | \n", "1 | \n", "17 | \n", "\n", " | {'Steps': '1. Googled Eliud Kipchoge marathon ... | \n", "
| 1 | \n", "8e867cd7-cff9-4e6c-867a-ff5ddc2550be | \n", "How many studio albums were published by Merce... | \n", "1 | \n", "3 | \n", "\n", " | {'Steps': '1. I did a search for Mercedes Sosa... | \n", "
| 2 | \n", "ec09fa32-d03f-4bf8-84b0-1f16922c3ae4 | \n", "Here's a fun riddle that I think you'll enjoy.... | \n", "1 | \n", "3 | \n", "\n", " | {'Steps': 'Step 1: Evaluate the problem statem... | \n", "
| 3 | \n", "5d0080cb-90d7-4712-bc33-848150e917d3 | \n", "What was the volume in m^3 of the fish bag tha... | \n", "1 | \n", "0.1777 | \n", "\n", " | {'Steps': '1. Searched '\"Can Hiccup Supply Eno... | \n", "
| 4 | \n", "a1e91b78-d3d8-4675-bb8d-62741b4b68a6 | \n", "In the video https://www.youtube.com/watch?v=L... | \n", "1 | \n", "3 | \n", "\n", " | {'Steps': '1. Navigate to the YouTube link.\n", "2.... | \n", "