@inproceedings{tam-etal-2025-none,
title = "None of the Above, Less of the Right Parallel Patterns in Human and {LLM} Performance on Multi-Choice Questions Answering",
author = "Tam, Zhi Rui and
Wu, Cheng-Kuang and
Lin, Chieh-Yen and
Chen, Yun-Nung",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1031/",
doi = "10.18653/v1/2025.findings-acl.1031",
pages = "20112--20134",
ISBN = "979-8-89176-256-5",
abstract = "Multiple-choice exam questions with ``None of the above'' (NA) options have been extensively studied in educational testing, in which existing research suggests that they better assess true knowledge. However, their impact on Large Language Models (LLMs) evaluation remains underexplored. Through systematic experiments with 28 LLMs on the MMLU benchmark, we examine how NA options affect model performance and confidence calibration. Our analysis reveals that NA options, when used as the correct answer, lead to a consistent 30-50{\%} performance drop across models regardless of scale{--}suggesting that LLMs lack the meta-cognitive ability to systematically evaluate and reject all given options when none are correct. This degradation shows strong domain dependence, with minimal impact on mathematical reasoning (14.6{\%} drop) but severe effects on tasks requiring uncertainty handling like business ethics (48.1{\%} drop). Our results highlight important implications for benchmark design and raise questions about LLMs' ability to handle uncertainty in real-world applications."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="tam-etal-2025-none">
<titleInfo>
<title>None of the Above, Less of the Right Parallel Patterns in Human and LLM Performance on Multi-Choice Questions Answering</title>
</titleInfo>
<name type="personal">
<namePart type="given">Zhi</namePart>
<namePart type="given">Rui</namePart>
<namePart type="family">Tam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cheng-Kuang</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chieh-Yen</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Multiple-choice exam questions with “None of the above” (NA) options have been extensively studied in educational testing, in which existing research suggests that they better assess true knowledge. However, their impact on Large Language Models (LLMs) evaluation remains underexplored. Through systematic experiments with 28 LLMs on the MMLU benchmark, we examine how NA options affect model performance and confidence calibration. Our analysis reveals that NA options, when used as the correct answer, lead to a consistent 30-50% performance drop across models regardless of scale–suggesting that LLMs lack the meta-cognitive ability to systematically evaluate and reject all given options when none are correct. This degradation shows strong domain dependence, with minimal impact on mathematical reasoning (14.6% drop) but severe effects on tasks requiring uncertainty handling like business ethics (48.1% drop). Our results highlight important implications for benchmark design and raise questions about LLMs’ ability to handle uncertainty in real-world applications.</abstract>
<identifier type="citekey">tam-etal-2025-none</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1031</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1031/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>20112</start>
<end>20134</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T None of the Above, Less of the Right Parallel Patterns in Human and LLM Performance on Multi-Choice Questions Answering
%A Tam, Zhi Rui
%A Wu, Cheng-Kuang
%A Lin, Chieh-Yen
%A Chen, Yun-Nung
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F tam-etal-2025-none
%X Multiple-choice exam questions with “None of the above” (NA) options have been extensively studied in educational testing, in which existing research suggests that they better assess true knowledge. However, their impact on Large Language Models (LLMs) evaluation remains underexplored. Through systematic experiments with 28 LLMs on the MMLU benchmark, we examine how NA options affect model performance and confidence calibration. Our analysis reveals that NA options, when used as the correct answer, lead to a consistent 30-50% performance drop across models regardless of scale–suggesting that LLMs lack the meta-cognitive ability to systematically evaluate and reject all given options when none are correct. This degradation shows strong domain dependence, with minimal impact on mathematical reasoning (14.6% drop) but severe effects on tasks requiring uncertainty handling like business ethics (48.1% drop). Our results highlight important implications for benchmark design and raise questions about LLMs’ ability to handle uncertainty in real-world applications.
%R 10.18653/v1/2025.findings-acl.1031
%U https://aclanthology.org/2025.findings-acl.1031/
%U https://doi.org/10.18653/v1/2025.findings-acl.1031
%P 20112-20134
Markdown (Informal)
[None of the Above, Less of the Right Parallel Patterns in Human and LLM Performance on Multi-Choice Questions Answering](https://aclanthology.org/2025.findings-acl.1031/) (Tam et al., Findings 2025)
ACL