@inproceedings{belz-etal-2020-disentangling,
    title = "Disentangling the Properties of Human Evaluation Methods: A Classification System to Support Comparability, Meta-Evaluation and Reproducibility Testing",
    author = "Belz, Anya and
      Mille, Simon and
      Howcroft, David M.",
    editor = "Davis, Brian and
      Graham, Yvette and
      Kelleher, John and
      Sripada, Yaji",
    booktitle = "Proceedings of the 13th International Conference on Natural Language Generation",
    month = dec,
    year = "2020",
    address = "Dublin, Ireland",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.inlg-1.24",
    doi = "10.18653/v1/2020.inlg-1.24",
    pages = "183--194",
    abstract = "Current standards for designing and reporting human evaluations in NLP mean it is generally unclear which evaluations are comparable and can be expected to yield similar results when applied to the same system outputs. This has serious implications for reproducibility testing and meta-evaluation, in particular given that human evaluation is considered the gold standard against which the trustworthiness of automatic metrics is gauged. Using examples from NLG, we propose a classification system for evaluations based on disentangling (i) what is being evaluated (which aspect of quality), and (ii) how it is evaluated in specific (a) evaluation modes and (b) experimental designs. We show that this approach provides a basis for determining comparability, hence for comparison of evaluations across papers, meta-evaluation experiments, and reproducibility testing.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="belz-etal-2020-disentangling">
    <titleInfo>
      <title>Disentangling the Properties of Human Evaluation Methods: A Classification System to Support Comparability, Meta-Evaluation and Reproducibility Testing</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Anya</namePart>
      <namePart type="family">Belz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Simon</namePart>
      <namePart type="family">Mille</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="given">M</namePart>
      <namePart type="family">Howcroft</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 13th International Conference on Natural Language Generation</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Brian</namePart>
        <namePart type="family">Davis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yvette</namePart>
        <namePart type="family">Graham</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">John</namePart>
        <namePart type="family">Kelleher</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yaji</namePart>
        <namePart type="family">Sripada</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Dublin, Ireland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Current standards for designing and reporting human evaluations in NLP mean it is generally unclear which evaluations are comparable and can be expected to yield similar results when applied to the same system outputs. This has serious implications for reproducibility testing and meta-evaluation, in particular given that human evaluation is considered the gold standard against which the trustworthiness of automatic metrics is gauged. Using examples from NLG, we propose a classification system for evaluations based on disentangling (i) what is being evaluated (which aspect of quality), and (ii) how it is evaluated in specific (a) evaluation modes and (b) experimental designs. We show that this approach provides a basis for determining comparability, hence for comparison of evaluations across papers, meta-evaluation experiments, and reproducibility testing.</abstract>
<identifier type="citekey">belz-etal-2020-disentangling</identifier>
<identifier type="doi">10.18653/v1/2020.inlg-1.24</identifier>
<location>
<url>https://aclanthology.org/2020.inlg-1.24</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>183</start>
<end>194</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Disentangling the Properties of Human Evaluation Methods: A Classification System to Support Comparability, Meta-Evaluation and Reproducibility Testing
%A Belz, Anya
%A Mille, Simon
%A Howcroft, David M.
%Y Davis, Brian
%Y Graham, Yvette
%Y Kelleher, John
%Y Sripada, Yaji
%S Proceedings of the 13th International Conference on Natural Language Generation
%D 2020
%8 December
%I Association for Computational Linguistics
%C Dublin, Ireland
%F belz-etal-2020-disentangling
%X Current standards for designing and reporting human evaluations in NLP mean it is generally unclear which evaluations are comparable and can be expected to yield similar results when applied to the same system outputs. This has serious implications for reproducibility testing and meta-evaluation, in particular given that human evaluation is considered the gold standard against which the trustworthiness of automatic metrics is gauged. Using examples from NLG, we propose a classification system for evaluations based on disentangling (i) what is being evaluated (which aspect of quality), and (ii) how it is evaluated in specific (a) evaluation modes and (b) experimental designs. We show that this approach provides a basis for determining comparability, hence for comparison of evaluations across papers, meta-evaluation experiments, and reproducibility testing.
%R 10.18653/v1/2020.inlg-1.24
%U https://aclanthology.org/2020.inlg-1.24
%U https://doi.org/10.18653/v1/2020.inlg-1.24
%P 183-194
Markdown (Informal)
[Disentangling the Properties of Human Evaluation Methods: A Classification System to Support Comparability, Meta-Evaluation and Reproducibility Testing](https://aclanthology.org/2020.inlg-1.24) (Belz et al., INLG 2020)
ACL
Anya Belz, Simon Mille, and David M. Howcroft. 2020. [Disentangling the Properties of Human Evaluation Methods: A Classification System to Support Comparability, Meta-Evaluation and Reproducibility Testing](https://aclanthology.org/2020.inlg-1.24). In *Proceedings of the 13th International Conference on Natural Language Generation*, pages 183–194, Dublin, Ireland. Association for Computational Linguistics.