BibTeX
@inproceedings{amidei-etal-2019-agreement,
title = "Agreement is overrated: A plea for correlation to assess human evaluation reliability",
author = "Amidei, Jacopo and
Piwek, Paul and
Willis, Alistair",
editor = "van Deemter, Kees and
Lin, Chenghua and
Takamura, Hiroya",
booktitle = "Proceedings of the 12th International Conference on Natural Language Generation",
month = oct # "{--}" # nov,
year = "2019",
address = "Tokyo, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W19-8642",
doi = "10.18653/v1/W19-8642",
pages = "344--354",
abstract = "Inter-Annotator Agreement (IAA) is used as a means of assessing the quality of NLG evaluation data, in particular, its reliability. According to existing scales of IAA interpretation {--} see, for example, Lommel et al. (2014), Liu et al. (2016), Sedoc et al. (2018) and Amidei et al. (2018a) {--} most data collected for NLG evaluation fail the reliability test. We confirmed this trend by analysing papers published over the last 10 years in NLG-specific conferences (in total 135 papers that included some sort of human evaluation study). Following Sampson and Babarczy (2008), Lommel et al. (2014), Joshi et al. (2016) and Amidei et al. (2018b), such phenomena can be explained in terms of irreducible human language variability. Using three case studies, we show the limits of considering IAA as the only criterion for checking evaluation reliability. Given human language variability, we propose that for human evaluation of NLG, correlation coefficients and agreement coefficients should be used together to obtain a better assessment of the evaluation data reliability. This is illustrated using the three case studies.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="amidei-etal-2019-agreement">
<titleInfo>
<title>Agreement is overrated: A plea for correlation to assess human evaluation reliability</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jacopo</namePart>
<namePart type="family">Amidei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paul</namePart>
<namePart type="family">Piwek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alistair</namePart>
<namePart type="family">Willis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2019-oct–nov</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th International Conference on Natural Language Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kees</namePart>
<namePart type="family">van Deemter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenghua</namePart>
<namePart type="family">Lin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroya</namePart>
<namePart type="family">Takamura</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Tokyo, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Inter-Annotator Agreement (IAA) is used as a means of assessing the quality of NLG evaluation data, in particular, its reliability. According to existing scales of IAA interpretation – see, for example, Lommel et al. (2014), Liu et al. (2016), Sedoc et al. (2018) and Amidei et al. (2018a) – most data collected for NLG evaluation fail the reliability test. We confirmed this trend by analysing papers published over the last 10 years in NLG-specific conferences (in total 135 papers that included some sort of human evaluation study). Following Sampson and Babarczy (2008), Lommel et al. (2014), Joshi et al. (2016) and Amidei et al. (2018b), such phenomena can be explained in terms of irreducible human language variability. Using three case studies, we show the limits of considering IAA as the only criterion for checking evaluation reliability. Given human language variability, we propose that for human evaluation of NLG, correlation coefficients and agreement coefficients should be used together to obtain a better assessment of the evaluation data reliability. This is illustrated using the three case studies.</abstract>
<identifier type="citekey">amidei-etal-2019-agreement</identifier>
<identifier type="doi">10.18653/v1/W19-8642</identifier>
<location>
<url>https://aclanthology.org/W19-8642</url>
</location>
<part>
<date>2019-oct–nov</date>
<extent unit="page">
<start>344</start>
<end>354</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Agreement is overrated: A plea for correlation to assess human evaluation reliability
%A Amidei, Jacopo
%A Piwek, Paul
%A Willis, Alistair
%Y van Deemter, Kees
%Y Lin, Chenghua
%Y Takamura, Hiroya
%S Proceedings of the 12th International Conference on Natural Language Generation
%D 2019
%8 oct–nov
%I Association for Computational Linguistics
%C Tokyo, Japan
%F amidei-etal-2019-agreement
%X Inter-Annotator Agreement (IAA) is used as a means of assessing the quality of NLG evaluation data, in particular, its reliability. According to existing scales of IAA interpretation – see, for example, Lommel et al. (2014), Liu et al. (2016), Sedoc et al. (2018) and Amidei et al. (2018a) – most data collected for NLG evaluation fail the reliability test. We confirmed this trend by analysing papers published over the last 10 years in NLG-specific conferences (in total 135 papers that included some sort of human evaluation study). Following Sampson and Babarczy (2008), Lommel et al. (2014), Joshi et al. (2016) and Amidei et al. (2018b), such phenomena can be explained in terms of irreducible human language variability. Using three case studies, we show the limits of considering IAA as the only criterion for checking evaluation reliability. Given human language variability, we propose that for human evaluation of NLG, correlation coefficients and agreement coefficients should be used together to obtain a better assessment of the evaluation data reliability. This is illustrated using the three case studies.
%R 10.18653/v1/W19-8642
%U https://aclanthology.org/W19-8642
%U https://doi.org/10.18653/v1/W19-8642
%P 344-354
Markdown (Informal)
[Agreement is overrated: A plea for correlation to assess human evaluation reliability](https://aclanthology.org/W19-8642) (Amidei et al., INLG 2019)
ACL
Jacopo Amidei, Paul Piwek, and Alistair Willis. 2019. Agreement is overrated: A plea for correlation to assess human evaluation reliability. In Proceedings of the 12th International Conference on Natural Language Generation, pages 344–354, Tokyo, Japan. Association for Computational Linguistics.
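
As a rough illustration of the abstract's proposal (reporting an agreement coefficient and a correlation coefficient together when assessing human evaluation reliability), the sketch below computes Cohen's kappa and Spearman's rho for two hypothetical annotators. This is not code from the paper: the specific coefficients are just one concrete pairing of the coefficient families the authors mention, and the ratings are invented example data.

```python
# Illustrative sketch only: compute an agreement coefficient (Cohen's kappa)
# and a correlation coefficient (Spearman's rho) on the same annotator pair,
# in the spirit of the paper's recommendation to report both.
from scipy.stats import spearmanr
from sklearn.metrics import cohen_kappa_score

# Hypothetical 1-5 fluency ratings from two annotators on ten NLG outputs
# (made-up data, not from the paper).
annotator_a = [5, 4, 4, 3, 5, 2, 4, 3, 5, 4]
annotator_b = [4, 4, 3, 3, 5, 2, 5, 2, 4, 4]

# Agreement: chance-corrected exact agreement on the labels.
kappa = cohen_kappa_score(annotator_a, annotator_b)

# Correlation: do the annotators order the outputs similarly,
# even where their absolute scores differ?
rho, p_value = spearmanr(annotator_a, annotator_b)

print(f"Cohen's kappa: {kappa:.2f}")
print(f"Spearman's rho: {rho:.2f} (p = {p_value:.3f})")
```

In a case like this, kappa can be modest while the correlation is high, which is exactly the situation where the paper argues agreement alone understates the reliability of the evaluation data.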