@inproceedings{palta-etal-2024-plausibly,
title = "Plausibly Problematic Questions in Multiple-Choice Benchmarks for Commonsense Reasoning",
author = "Palta, Shramay and
Balepur, Nishant and
Rankel, Peter and
Wiegreffe, Sarah and
Carpuat, Marine and
Rudinger, Rachel",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.198/",
doi = "10.18653/v1/2024.findings-emnlp.198",
pages = "3451--3473",
abstract = "Questions involving commonsense reasoning about everyday situations often admit many possible or plausible answers. In contrast, multiple-choice question (MCQ) benchmarks for commonsense reasoning require a hard selection of a single correct answer, which, in principle, should represent the most plausible answer choice. On 250 MCQ items sampled from two commonsense reasoning benchmarks, we collect 5,000 independent plausibility judgments on answer choices. We find that for over 20{\%} of the sampled MCQS, the answer choice rated most plausible does not match the benchmark gold answers; upon manual inspection, we confirm that this subset exhibits higher rates of problems like ambiguity or semantic mismatch between question and answer choices. Experiments with LLMs reveal low accuracyand high variation in performance on the subset, suggesting our plausibility criterion may be helpful in identifying more reliable benchmark items for commonsense evaluation."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="palta-etal-2024-plausibly">
    <titleInfo>
      <title>Plausibly Problematic Questions in Multiple-Choice Benchmarks for Commonsense Reasoning</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Shramay</namePart>
      <namePart type="family">Palta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Nishant</namePart>
      <namePart type="family">Balepur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Peter</namePart>
      <namePart type="family">Rankel</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sarah</namePart>
      <namePart type="family">Wiegreffe</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Marine</namePart>
      <namePart type="family">Carpuat</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rachel</namePart>
      <namePart type="family">Rudinger</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yaser</namePart>
        <namePart type="family">Al-Onaizan</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mohit</namePart>
        <namePart type="family">Bansal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yun-Nung</namePart>
        <namePart type="family">Chen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Miami, Florida, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Questions involving commonsense reasoning about everyday situations often admit many possible or plausible answers. In contrast, multiple-choice question (MCQ) benchmarks for commonsense reasoning require a hard selection of a single correct answer, which, in principle, should represent the most plausible answer choice. On 250 MCQ items sampled from two commonsense reasoning benchmarks, we collect 5,000 independent plausibility judgments on answer choices. We find that for over 20% of the sampled MCQs, the answer choice rated most plausible does not match the benchmark gold answers; upon manual inspection, we confirm that this subset exhibits higher rates of problems like ambiguity or semantic mismatch between question and answer choices. Experiments with LLMs reveal low accuracy and high variation in performance on the subset, suggesting our plausibility criterion may be helpful in identifying more reliable benchmark items for commonsense evaluation.</abstract>
<identifier type="citekey">palta-etal-2024-plausibly</identifier>
<identifier type="doi">10.18653/v1/2024.findings-emnlp.198</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.198/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>3451</start>
<end>3473</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Plausibly Problematic Questions in Multiple-Choice Benchmarks for Commonsense Reasoning
%A Palta, Shramay
%A Balepur, Nishant
%A Rankel, Peter
%A Wiegreffe, Sarah
%A Carpuat, Marine
%A Rudinger, Rachel
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F palta-etal-2024-plausibly
%X Questions involving commonsense reasoning about everyday situations often admit many possible or plausible answers. In contrast, multiple-choice question (MCQ) benchmarks for commonsense reasoning require a hard selection of a single correct answer, which, in principle, should represent the most plausible answer choice. On 250 MCQ items sampled from two commonsense reasoning benchmarks, we collect 5,000 independent plausibility judgments on answer choices. We find that for over 20% of the sampled MCQs, the answer choice rated most plausible does not match the benchmark gold answers; upon manual inspection, we confirm that this subset exhibits higher rates of problems like ambiguity or semantic mismatch between question and answer choices. Experiments with LLMs reveal low accuracy and high variation in performance on the subset, suggesting our plausibility criterion may be helpful in identifying more reliable benchmark items for commonsense evaluation.
%R 10.18653/v1/2024.findings-emnlp.198
%U https://aclanthology.org/2024.findings-emnlp.198/
%U https://doi.org/10.18653/v1/2024.findings-emnlp.198
%P 3451-3473
Markdown (Informal)
[Plausibly Problematic Questions in Multiple-Choice Benchmarks for Commonsense Reasoning](https://aclanthology.org/2024.findings-emnlp.198/) (Palta et al., Findings 2024)
ACL
Shramay Palta, Nishant Balepur, Peter Rankel, Sarah Wiegreffe, Marine Carpuat, and Rachel Rudinger. 2024. Plausibly Problematic Questions in Multiple-Choice Benchmarks for Commonsense Reasoning. In Findings of the Association for Computational Linguistics: EMNLP 2024, pages 3451–3473, Miami, Florida, USA. Association for Computational Linguistics.