@inproceedings{amidei-etal-2018-rethinking,
    title = "Rethinking the Agreement in Human Evaluation Tasks",
    author = "Amidei, Jacopo and
      Piwek, Paul and
      Willis, Alistair",
    editor = "Bender, Emily M. and
      Derczynski, Leon and
      Isabelle, Pierre",
    booktitle = "Proceedings of the 27th International Conference on Computational Linguistics",
    month = aug,
    year = "2018",
    address = "Santa Fe, New Mexico, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/C18-1281",
    pages = "3318--3329",
    abstract = "Human evaluations are broadly thought to be more valuable the higher the inter-annotator agreement. In this paper we examine this idea. We will describe our experiments and analysis within the area of Automatic Question Generation. Our experiments show how annotators diverge in language annotation tasks due to a range of ineliminable factors. For this reason, we believe that annotation schemes for natural language generation tasks that are aimed at evaluating language quality need to be treated with great care. In particular, an unchecked focus on reduction of disagreement among annotators runs the danger of creating generation goals that reward output that is more distant from, rather than closer to, natural human-like language. We conclude the paper by suggesting a new approach to the use of the agreement metrics in natural language generation evaluation tasks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="amidei-etal-2018-rethinking">
    <titleInfo>
      <title>Rethinking the Agreement in Human Evaluation Tasks</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Jacopo</namePart>
      <namePart type="family">Amidei</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Paul</namePart>
      <namePart type="family">Piwek</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alistair</namePart>
      <namePart type="family">Willis</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2018-08</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 27th International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Emily</namePart>
        <namePart type="given">M</namePart>
        <namePart type="family">Bender</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leon</namePart>
        <namePart type="family">Derczynski</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Pierre</namePart>
        <namePart type="family">Isabelle</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Santa Fe, New Mexico, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Human evaluations are broadly thought to be more valuable the higher the inter-annotator agreement. In this paper we examine this idea. We will describe our experiments and analysis within the area of Automatic Question Generation. Our experiments show how annotators diverge in language annotation tasks due to a range of ineliminable factors. For this reason, we believe that annotation schemes for natural language generation tasks that are aimed at evaluating language quality need to be treated with great care. In particular, an unchecked focus on reduction of disagreement among annotators runs the danger of creating generation goals that reward output that is more distant from, rather than closer to, natural human-like language. We conclude the paper by suggesting a new approach to the use of the agreement metrics in natural language generation evaluation tasks.</abstract>
    <identifier type="citekey">amidei-etal-2018-rethinking</identifier>
    <location>
      <url>https://aclanthology.org/C18-1281</url>
    </location>
    <part>
      <date>2018-08</date>
      <extent unit="page">
        <start>3318</start>
        <end>3329</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Rethinking the Agreement in Human Evaluation Tasks
%A Amidei, Jacopo
%A Piwek, Paul
%A Willis, Alistair
%Y Bender, Emily M.
%Y Derczynski, Leon
%Y Isabelle, Pierre
%S Proceedings of the 27th International Conference on Computational Linguistics
%D 2018
%8 August
%I Association for Computational Linguistics
%C Santa Fe, New Mexico, USA
%F amidei-etal-2018-rethinking
%X Human evaluations are broadly thought to be more valuable the higher the inter-annotator agreement. In this paper we examine this idea. We will describe our experiments and analysis within the area of Automatic Question Generation. Our experiments show how annotators diverge in language annotation tasks due to a range of ineliminable factors. For this reason, we believe that annotation schemes for natural language generation tasks that are aimed at evaluating language quality need to be treated with great care. In particular, an unchecked focus on reduction of disagreement among annotators runs the danger of creating generation goals that reward output that is more distant from, rather than closer to, natural human-like language. We conclude the paper by suggesting a new approach to the use of the agreement metrics in natural language generation evaluation tasks.
%U https://aclanthology.org/C18-1281
%P 3318-3329
Markdown (Informal)
[Rethinking the Agreement in Human Evaluation Tasks](https://aclanthology.org/C18-1281) (Amidei et al., COLING 2018)

ACL
Jacopo Amidei, Paul Piwek, and Alistair Willis. 2018. Rethinking the Agreement in Human Evaluation Tasks. In Proceedings of the 27th International Conference on Computational Linguistics, pages 3318–3329, Santa Fe, New Mexico, USA. Association for Computational Linguistics.