@inproceedings{sedoc-ungar-2020-item,
    title = "Item Response Theory for Efficient Human Evaluation of Chatbots",
    author = "Sedoc, Jo{\~a}o and
      Ungar, Lyle",
    editor = "Eger, Steffen and
      Gao, Yang and
      Peyrard, Maxime and
      Zhao, Wei and
      Hovy, Eduard",
    booktitle = "Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems",
    month = nov,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.eval4nlp-1.3",
    doi = "10.18653/v1/2020.eval4nlp-1.3",
    pages = "21--33",
    abstract = "Conversational agent quality is currently assessed using human evaluation, and often requires an exorbitant number of comparisons to achieve statistical significance. In this paper, we introduce Item Response Theory (IRT) for chatbot evaluation, using a paired comparison in which annotators judge which system responds better to the next turn of a conversation. IRT is widely used in educational testing for simultaneously assessing the ability of test takers and the quality of test questions. It is similarly well suited for chatbot evaluation since it allows the assessment of both models and the prompts used to evaluate them. We use IRT to efficiently assess chatbots, and show that different examples from the evaluation set are better suited for comparing high-quality (nearer to human performance) than low-quality systems. Finally, we use IRT to reduce the number of evaluation examples assessed by human annotators while retaining discriminative power.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sedoc-ungar-2020-item">
    <titleInfo>
      <title>Item Response Theory for Efficient Human Evaluation of Chatbots</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">João</namePart>
      <namePart type="family">Sedoc</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lyle</namePart>
      <namePart type="family">Ungar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Steffen</namePart>
        <namePart type="family">Eger</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yang</namePart>
        <namePart type="family">Gao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Maxime</namePart>
        <namePart type="family">Peyrard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Wei</namePart>
        <namePart type="family">Zhao</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Eduard</namePart>
        <namePart type="family">Hovy</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Conversational agent quality is currently assessed using human evaluation, and often requires an exorbitant number of comparisons to achieve statistical significance. In this paper, we introduce Item Response Theory (IRT) for chatbot evaluation, using a paired comparison in which annotators judge which system responds better to the next turn of a conversation. IRT is widely used in educational testing for simultaneously assessing the ability of test takers and the quality of test questions. It is similarly well suited for chatbot evaluation since it allows the assessment of both models and the prompts used to evaluate them. We use IRT to efficiently assess chatbots, and show that different examples from the evaluation set are better suited for comparing high-quality (nearer to human performance) than low-quality systems. Finally, we use IRT to reduce the number of evaluation examples assessed by human annotators while retaining discriminative power.</abstract>
    <identifier type="citekey">sedoc-ungar-2020-item</identifier>
    <identifier type="doi">10.18653/v1/2020.eval4nlp-1.3</identifier>
    <location>
      <url>https://aclanthology.org/2020.eval4nlp-1.3</url>
    </location>
    <part>
      <date>2020-11</date>
      <extent unit="page">
        <start>21</start>
        <end>33</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Item Response Theory for Efficient Human Evaluation of Chatbots
%A Sedoc, João
%A Ungar, Lyle
%Y Eger, Steffen
%Y Gao, Yang
%Y Peyrard, Maxime
%Y Zhao, Wei
%Y Hovy, Eduard
%S Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F sedoc-ungar-2020-item
%X Conversational agent quality is currently assessed using human evaluation, and often requires an exorbitant number of comparisons to achieve statistical significance. In this paper, we introduce Item Response Theory (IRT) for chatbot evaluation, using a paired comparison in which annotators judge which system responds better to the next turn of a conversation. IRT is widely used in educational testing for simultaneously assessing the ability of test takers and the quality of test questions. It is similarly well suited for chatbot evaluation since it allows the assessment of both models and the prompts used to evaluate them. We use IRT to efficiently assess chatbots, and show that different examples from the evaluation set are better suited for comparing high-quality (nearer to human performance) than low-quality systems. Finally, we use IRT to reduce the number of evaluation examples assessed by human annotators while retaining discriminative power.
%R 10.18653/v1/2020.eval4nlp-1.3
%U https://aclanthology.org/2020.eval4nlp-1.3
%U https://doi.org/10.18653/v1/2020.eval4nlp-1.3
%P 21-33
Markdown (Informal)
[Item Response Theory for Efficient Human Evaluation of Chatbots](https://aclanthology.org/2020.eval4nlp-1.3) (Sedoc & Ungar, Eval4NLP 2020)
ACL
João Sedoc and Lyle Ungar. 2020. Item Response Theory for Efficient Human Evaluation of Chatbots. In Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems, pages 21–33, Online. Association for Computational Linguistics.
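The abstract describes the approach only at a high level: annotators make paired comparisons between systems on evaluation prompts, and an IRT model jointly estimates system ability and prompt quality. As a concrete illustration, below is a minimal, hypothetical sketch of a model in that spirit: a Bradley-Terry-style paired comparison where each system has a latent ability theta_i, each prompt a discrimination a_k, and P(system i beats system j on prompt k) = sigmoid(a_k * (theta_i - theta_j)), fit by maximum likelihood on synthetic data. The model form, parameter names, and data are assumptions made for this sketch, not the paper's actual model, code, or experiments.

```python
# Hypothetical paired-comparison IRT sketch (not the paper's implementation).
import numpy as np
from scipy.optimize import minimize
from scipy.special import expit  # logistic sigmoid

rng = np.random.default_rng(0)
n_systems, n_prompts = 4, 10

# Synthetic ground truth: latent system abilities and prompt discriminations.
true_theta = rng.normal(size=n_systems)
true_disc = np.abs(rng.normal(1.0, 0.3, size=n_prompts))

# Simulate annotator judgments of which system answered prompt k better:
# P(system i beats system j on prompt k) = sigmoid(a_k * (theta_i - theta_j)).
records = []
for _ in range(2000):
    i, j = rng.choice(n_systems, size=2, replace=False)
    k = int(rng.integers(n_prompts))
    p_i_wins = expit(true_disc[k] * (true_theta[i] - true_theta[j]))
    records.append((i, j, k, float(rng.random() < p_i_wins)))

I, J, K, Y = map(np.array, zip(*records))

def neg_log_lik(params):
    """Negative log-likelihood of the observed paired comparisons."""
    theta = params[:n_systems]
    a = np.exp(params[n_systems:])  # log-parameterization keeps a_k > 0
    p = expit(a[K] * (theta[I] - theta[J]))
    eps = 1e-9
    return -np.sum(Y * np.log(p + eps) + (1.0 - Y) * np.log(1.0 - p + eps))

fit = minimize(neg_log_lik, np.zeros(n_systems + n_prompts), method="L-BFGS-B")

# Abilities are identified only up to location and scale, so standardize
# both the estimates and the simulated truth before comparing them.
def standardize(v):
    return (v - v.mean()) / v.std()

theta_hat = fit.x[:n_systems]
a_hat = np.exp(fit.x[n_systems:])
print("estimated abilities:", np.round(standardize(theta_hat), 2))
print("true abilities:     ", np.round(standardize(true_theta), 2))
print("discrimination corr:", np.round(np.corrcoef(a_hat, true_disc)[0, 1], 2))
```

The per-prompt discrimination a_k is what makes this more than a plain Bradley-Terry model: prompts with high estimated discrimination are the ones that separate closely matched, near-human systems, and, as the abstract notes, low-information prompts can be dropped to reduce the number of human judgments while retaining discriminative power.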