Patrick Haller
Adding model benchmark predictions
73e4e1f
# Title string for the PISA-Bench leaderboard (identical to the paper title).
# NOTE(review): presumably rendered as the page heading — confirm in the UI code.
TITLE_TEXT = (
    "PISA-Bench: The PISA Index as a Multilingual and Multimodal Metric "
    "for the Evaluation of Vision-Language Models"
)
# Introductory blurb for the leaderboard. It consists of four paragraphs
# (what the benchmark is, what it focuses on, initial findings, and the
# purpose of the leaderboard) separated by single newlines. Each paragraph is
# assembled from adjacent string literals purely to keep source lines short;
# the resulting text contains no extra whitespace or line breaks.
INTRODUCTION_TEXT = "\n".join(
    [
        (
            "📄 PISA-Bench is a multilingual, multimodal benchmark designed to evaluate "
            "vision-language models on human-authored reasoning tasks derived from the "
            "OECD PISA assessments. "
            "Unlike many existing VLM datasets that rely on synthetic or English-only "
            "content, PISA-Bench provides 122 high-quality examples with images, "
            "questions, and answer options extracted from real PISA test material, "
            "translated and verified across six languages (EN, DE, ES, FR, IT, ZH)."
        ),
        (
            "The benchmark focuses on genuine reasoning rather than pattern matching: each "
            "item was manually curated, quality-checked, and categorized into key reasoning "
            "types such as spatial & geometric reasoning, graph & pattern analysis, "
            "quantitative reasoning, and text & diagram understanding."
        ),
        (
            "Initial results show that even strong VLMs struggle with these tasks, "
            "especially in non-English settings, highlighting persistent gaps in "
            "multilingual multimodal reasoning."
        ),
        (
            "This leaderboard tracks model performance across languages, providing a "
            "transparent and standardized evaluation for future research on multilingual VLMs."
        ),
    ]
)
# Markdown description of the benchmark task: the inputs a model receives and
# the output it must produce.
#
# Every input is written as its own "* " bullet, and a blank line closes the
# list before the "**Output:**" section. Without the blank line, Markdown
# treats the following paragraph text as a lazy continuation of the last list
# item and renders the output description inside the input list.
TASK_DESCRIPTION = """## Task type: Multimodal reasoning (image + text)
**Input:**
* Instruction (optional)
* Image
* Question
* Answer options or free-form answer prompt

**Output:**
Models must generate a concise textual answer. Evaluation is performed using an LLM-as-a-judge protocol comparing the model’s answer to the gold reference. For multiple-choice questions, the generated answer must correspond to one of the provided options.
"""
# Caption summarising the four-step dataset construction pipeline: collect,
# decompose, verify/correct, and translate.
# Chinese is written as "ZH" so the language codes agree with the list given
# in INTRODUCTION_TEXT (EN, DE, ES, FR, IT, ZH); "CH" is the country code for
# Switzerland, not a language code.
# The hard line breaks inside the string are part of the value and are kept.
DATASET_CONSTRUCTION = """Overview of the dataset construction pipeline. We (1) collect tasks from the original OECD PISA
tests, (2) decompose them into modular components (instruction, image, question, and answer options),
(3) verify, augment, and, if necessary, correct the extracted content, and (4) translate them into five target
languages (ES, DE, ZH, FR, IT) and verify translations through native speakers."""
# Short prompt that accompanies the BibTeX snippet in CITATION_BUTTON_TEXT.
CITATION_BUTTON_LABEL = (
    "Copy the following snippet "
    "to cite our paper"
)
# BibTeX entry for the PISA-Bench paper (arXiv:2510.24792).
# The entry is built from one literal per BibTeX line, joined with newlines.
# It contains no backslashes, so ordinary (non-raw) string literals produce
# exactly the same text as a raw string would.
CITATION_BUTTON_TEXT = "\n".join(
    [
        "@misc{haller2025pisabenchpisaindexmultilingual,",
        "title={PISA-Bench: The PISA Index as a Multilingual and Multimodal Metric "
        "for the Evaluation of Vision-Language Models},",
        "author={Patrick Haller and Fabio Barth and Jonas Golde and Georg Rehm and Alan Akbik},",
        "year={2025},",
        "eprint={2510.24792},",
        "archivePrefix={arXiv},",
        "primaryClass={cs.CV},",
        "url={https://arxiv.org/abs/2510.24792},",
        "}",
    ]
)