@inproceedings{takizawa-etal-2025-mcqformatbench,
title = "{MCQF}ormat{B}ench: Robustness Tests for Multiple-Choice Questions",
author = "Takizawa, Hiroo and
Sugawara, Saku and
Aizawa, Akiko",
editor = "Arviv, Ofir and
Clinciu, Miruna and
Dhole, Kaustubh and
Dror, Rotem and
Gehrmann, Sebastian and
Habba, Eliya and
Itzhak, Itay and
Mille, Simon and
Perlitz, Yotam and
Santus, Enrico and
Sedoc, Jo{\~a}o and
Shmueli Scheuer, Michal and
Stanovsky, Gabriel and
Tafjord, Oyvind",
booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
month = jul,
year = "2025",
address = "Vienna, Austria and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.gem-1.69/",
pages = "824--846",
ISBN = "979-8-89176-261-9",
abstract = "Multiple-choice questions (MCQs) are often used to evaluate large language models (LLMs). They measure LLMs' general common sense and reasoning abilities, as well as their knowledge in specific domains such as law and medicine. However, the robustness of LLMs to various question formats in MCQs has not been thoroughly evaluated. While there are studies on the sensitivity of LLMs to input variations, research into their responsiveness to different question formats is still limited. In this study, we propose a method to construct tasks to comprehensively evaluate the robustness against format changes of MCQs by decomposing the answering process into several steps. Using this dataset, we evaluate nine LLMs, such as Llama3-70B and Mixtral-8x7B. We find the lack of robustness to differences in the format of MCQs. It is crucial to consider whether the format of MCQs influences their evaluation scores when assessing LLMs using MCQ datasets."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="takizawa-etal-2025-mcqformatbench">
<titleInfo>
<title>MCQFormatBench: Robustness Tests for Multiple-Choice Questions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hiroo</namePart>
<namePart type="family">Takizawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Saku</namePart>
<namePart type="family">Sugawara</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akiko</namePart>
<namePart type="family">Aizawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ofir</namePart>
<namePart type="family">Arviv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miruna</namePart>
<namePart type="family">Clinciu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eliya</namePart>
<namePart type="family">Habba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Itay</namePart>
<namePart type="family">Itzhak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yotam</namePart>
<namePart type="family">Perlitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oyvind</namePart>
<namePart type="family">Tafjord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria and virtual meeting</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-261-9</identifier>
</relatedItem>
<abstract>Multiple-choice questions (MCQs) are often used to evaluate large language models (LLMs). They measure LLMs’ general common sense and reasoning abilities, as well as their knowledge in specific domains such as law and medicine. However, the robustness of LLMs to various question formats in MCQs has not been thoroughly evaluated. While there are studies on the sensitivity of LLMs to input variations, research into their responsiveness to different question formats is still limited. In this study, we propose a method to construct tasks to comprehensively evaluate the robustness against format changes of MCQs by decomposing the answering process into several steps. Using this dataset, we evaluate nine LLMs, such as Llama3-70B and Mixtral-8x7B. We find a lack of robustness to differences in the format of MCQs. It is crucial to consider whether the format of MCQs influences their evaluation scores when assessing LLMs using MCQ datasets.</abstract>
<identifier type="citekey">takizawa-etal-2025-mcqformatbench</identifier>
<location>
<url>https://aclanthology.org/2025.gem-1.69/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>824</start>
<end>846</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MCQFormatBench: Robustness Tests for Multiple-Choice Questions
%A Takizawa, Hiroo
%A Sugawara, Saku
%A Aizawa, Akiko
%Y Arviv, Ofir
%Y Clinciu, Miruna
%Y Dhole, Kaustubh
%Y Dror, Rotem
%Y Gehrmann, Sebastian
%Y Habba, Eliya
%Y Itzhak, Itay
%Y Mille, Simon
%Y Perlitz, Yotam
%Y Santus, Enrico
%Y Sedoc, João
%Y Shmueli Scheuer, Michal
%Y Stanovsky, Gabriel
%Y Tafjord, Oyvind
%S Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria and virtual meeting
%@ 979-8-89176-261-9
%F takizawa-etal-2025-mcqformatbench
%X Multiple-choice questions (MCQs) are often used to evaluate large language models (LLMs). They measure LLMs’ general common sense and reasoning abilities, as well as their knowledge in specific domains such as law and medicine. However, the robustness of LLMs to various question formats in MCQs has not been thoroughly evaluated. While there are studies on the sensitivity of LLMs to input variations, research into their responsiveness to different question formats is still limited. In this study, we propose a method to construct tasks to comprehensively evaluate the robustness against format changes of MCQs by decomposing the answering process into several steps. Using this dataset, we evaluate nine LLMs, such as Llama3-70B and Mixtral-8x7B. We find a lack of robustness to differences in the format of MCQs. It is crucial to consider whether the format of MCQs influences their evaluation scores when assessing LLMs using MCQ datasets.
%U https://aclanthology.org/2025.gem-1.69/
%P 824-846
Markdown (Informal)
[MCQFormatBench: Robustness Tests for Multiple-Choice Questions](https://aclanthology.org/2025.gem-1.69/) (Takizawa et al., GEM 2025)
ACL
Hiroo Takizawa, Saku Sugawara, and Akiko Aizawa. 2025. MCQFormatBench: Robustness Tests for Multiple-Choice Questions. In Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²), pages 824–846, Vienna, Austria and virtual meeting. Association for Computational Linguistics.