@inproceedings{raina-etal-2023-assessing,
    title = "Assessing Distractors in Multiple-Choice Tests",
    author = "Raina, Vatsal and
      Liusie, Adian and
      Gales, Mark",
    editor = {Deutsch, Daniel and
      Dror, Rotem and
      Eger, Steffen and
      Gao, Yang and
      Leiter, Christoph and
      Opitz, Juri and
      R{\"u}ckl{\'e}, Andreas},
    booktitle = "Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems",
    month = nov,
    year = "2023",
    address = "Bali, Indonesia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.eval4nlp-1.2/",
    doi = "10.18653/v1/2023.eval4nlp-1.2",
    pages = "12--22",
    abstract = "Multiple-choice tests are a common approach for assessing candidates' comprehension skills. Standard multiple-choice reading comprehension exams require candidates to select the correct answer option from a discrete set based on a question in relation to a contextual passage. For appropriate assessment, the distractor answer options must by definition be incorrect but plausible and diverse. However, generating good quality distractors satisfying these criteria is a challenging task for content creators. We propose automated assessment metrics for the quality of distractors in multiple-choice reading comprehension tests. Specifically, we define quality in terms of the incorrectness, plausibility and diversity of the distractor options. We assess incorrectness using the classification ability of a binary multiple-choice reading comprehension system. Plausibility is assessed by considering the distractor confidence - the probability mass associated with the distractor options for a standard multi-class multiple-choice reading comprehension system. Diversity is assessed by pairwise comparison of an embedding-based equivalence metric between the distractors of a question. To further validate the plausibility metric we compare against candidate distributions over multiple-choice questions and agreement with a ChatGPT model's interpretation of distractor plausibility and diversity."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="raina-etal-2023-assessing">
    <titleInfo>
      <title>Assessing Distractors in Multiple-Choice Tests</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Vatsal</namePart>
      <namePart type="family">Raina</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adian</namePart>
      <namePart type="family">Liusie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mark</namePart>
      <namePart type="family">Gales</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2023-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Deutsch</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Rotem</namePart>
        <namePart type="family">Dror</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steffen</namePart>
        <namePart type="family">Eger</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Gao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Christoph</namePart>
        <namePart type="family">Leiter</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juri</namePart>
        <namePart type="family">Opitz</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Andreas</namePart>
        <namePart type="family">Rücklé</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Bali, Indonesia</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Multiple-choice tests are a common approach for assessing candidates’ comprehension skills. Standard multiple-choice reading comprehension exams require candidates to select the correct answer option from a discrete set based on a question in relation to a contextual passage. For appropriate assessment, the distractor answer options must by definition be incorrect but plausible and diverse. However, generating good quality distractors satisfying these criteria is a challenging task for content creators. We propose automated assessment metrics for the quality of distractors in multiple-choice reading comprehension tests. Specifically, we define quality in terms of the incorrectness, plausibility and diversity of the distractor options. We assess incorrectness using the classification ability of a binary multiple-choice reading comprehension system. Plausibility is assessed by considering the distractor confidence - the probability mass associated with the distractor options for a standard multi-class multiple-choice reading comprehension system. Diversity is assessed by pairwise comparison of an embedding-based equivalence metric between the distractors of a question. To further validate the plausibility metric we compare against candidate distributions over multiple-choice questions and agreement with a ChatGPT model’s interpretation of distractor plausibility and diversity.</abstract>
    <identifier type="citekey">raina-etal-2023-assessing</identifier>
    <identifier type="doi">10.18653/v1/2023.eval4nlp-1.2</identifier>
    <location>
      <url>https://aclanthology.org/2023.eval4nlp-1.2/</url>
    </location>
    <part>
      <date>2023-11</date>
      <extent unit="page">
        <start>12</start>
        <end>22</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Assessing Distractors in Multiple-Choice Tests
%A Raina, Vatsal
%A Liusie, Adian
%A Gales, Mark
%Y Deutsch, Daniel
%Y Dror, Rotem
%Y Eger, Steffen
%Y Gao, Yang
%Y Leiter, Christoph
%Y Opitz, Juri
%Y Rücklé, Andreas
%S Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems
%D 2023
%8 November
%I Association for Computational Linguistics
%C Bali, Indonesia
%F raina-etal-2023-assessing
%X Multiple-choice tests are a common approach for assessing candidates’ comprehension skills. Standard multiple-choice reading comprehension exams require candidates to select the correct answer option from a discrete set based on a question in relation to a contextual passage. For appropriate assessment, the distractor answer options must by definition be incorrect but plausible and diverse. However, generating good quality distractors satisfying these criteria is a challenging task for content creators. We propose automated assessment metrics for the quality of distractors in multiple-choice reading comprehension tests. Specifically, we define quality in terms of the incorrectness, plausibility and diversity of the distractor options. We assess incorrectness using the classification ability of a binary multiple-choice reading comprehension system. Plausibility is assessed by considering the distractor confidence - the probability mass associated with the distractor options for a standard multi-class multiple-choice reading comprehension system. Diversity is assessed by pairwise comparison of an embedding-based equivalence metric between the distractors of a question. To further validate the plausibility metric we compare against candidate distributions over multiple-choice questions and agreement with a ChatGPT model’s interpretation of distractor plausibility and diversity.
%R 10.18653/v1/2023.eval4nlp-1.2
%U https://aclanthology.org/2023.eval4nlp-1.2/
%U https://doi.org/10.18653/v1/2023.eval4nlp-1.2
%P 12-22
Markdown (Informal)
[Assessing Distractors in Multiple-Choice Tests](https://aclanthology.org/2023.eval4nlp-1.2/) (Raina et al., Eval4NLP 2023)
ACL
Vatsal Raina, Adian Liusie, and Mark Gales. 2023. Assessing Distractors in Multiple-Choice Tests. In Proceedings of the 4th Workshop on Evaluation and Comparison of NLP Systems, pages 12–22, Bali, Indonesia. Association for Computational Linguistics.
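The abstract describes diversity as a pairwise comparison of an embedding-based equivalence metric between a question's distractors. The sketch below is an informal illustration of that idea only, not the authors' released code: it scores diversity as mean pairwise dissimilarity, assuming the sentence-transformers library and the all-MiniLM-L6-v2 encoder as stand-ins for whatever embedding model and equivalence metric the paper actually uses.

```python
# Hypothetical sketch (not the authors' implementation): distractor diversity
# as the mean pairwise dissimilarity (1 - cosine similarity) between
# embeddings of one question's distractor options.
from itertools import combinations

from sentence_transformers import SentenceTransformer, util

# Assumed general-purpose sentence encoder; the paper's embedding model may differ.
encoder = SentenceTransformer("all-MiniLM-L6-v2")

def distractor_diversity(distractors: list[str]) -> float:
    """Higher values mean the distractors are less semantically equivalent."""
    embeddings = encoder.encode(distractors, convert_to_tensor=True)
    similarities = [
        util.cos_sim(embeddings[i], embeddings[j]).item()
        for i, j in combinations(range(len(distractors)), 2)
    ]
    return 1.0 - sum(similarities) / len(similarities)

if __name__ == "__main__":
    # Two near-paraphrases plus one distinct option should score as less diverse
    # than three unrelated options.
    print(distractor_diversity(["a red car", "a crimson automobile", "a wooden boat"]))
```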