BibTeX
@inproceedings{mathur-etal-2020-tangled,
title = "Tangled up in {BLEU}: Reevaluating the Evaluation of Automatic Machine Translation Evaluation Metrics",
author = "Mathur, Nitika and
Baldwin, Timothy and
Cohn, Trevor",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.448",
doi = "10.18653/v1/2020.acl-main.448",
pages = "4984--4997",
abstract = "Automatic metrics are fundamental for the development and evaluation of machine translation systems. Judging whether, and to what extent, automatic metrics concur with the gold standard of human evaluation is not a straightforward problem. We show that current methods for judging metrics are highly sensitive to the translations used for assessment, particularly the presence of outliers, which often leads to falsely confident conclusions about a metric{'}s efficacy. Finally, we turn to pairwise system ranking, developing a method for thresholding performance improvement under an automatic metric against human judgements, which allows quantification of type I versus type II errors incurred, i.e., insignificant human differences in system quality that are accepted, and significant human differences that are rejected. Together, these findings suggest improvements to the protocols for metric evaluation and system performance evaluation in machine translation.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="mathur-etal-2020-tangled">
    <titleInfo>
      <title>Tangled up in BLEU: Reevaluating the Evaluation of Automatic Machine Translation Evaluation Metrics</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Nitika</namePart>
      <namePart type="family">Mathur</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Timothy</namePart>
      <namePart type="family">Baldwin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Trevor</namePart>
      <namePart type="family">Cohn</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Automatic metrics are fundamental for the development and evaluation of machine translation systems. Judging whether, and to what extent, automatic metrics concur with the gold standard of human evaluation is not a straightforward problem. We show that current methods for judging metrics are highly sensitive to the translations used for assessment, particularly the presence of outliers, which often leads to falsely confident conclusions about a metric’s efficacy. Finally, we turn to pairwise system ranking, developing a method for thresholding performance improvement under an automatic metric against human judgements, which allows quantification of type I versus type II errors incurred, i.e., insignificant human differences in system quality that are accepted, and significant human differences that are rejected. Together, these findings suggest improvements to the protocols for metric evaluation and system performance evaluation in machine translation.</abstract>
    <identifier type="citekey">mathur-etal-2020-tangled</identifier>
    <identifier type="doi">10.18653/v1/2020.acl-main.448</identifier>
    <location>
      <url>https://aclanthology.org/2020.acl-main.448</url>
    </location>
    <part>
      <date>2020-07</date>
      <extent unit="page">
        <start>4984</start>
        <end>4997</end>
      </extent>
    </part>
  </mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Tangled up in BLEU: Reevaluating the Evaluation of Automatic Machine Translation Evaluation Metrics
%A Mathur, Nitika
%A Baldwin, Timothy
%A Cohn, Trevor
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics
%D 2020
%8 July
%I Association for Computational Linguistics
%C Online
%F mathur-etal-2020-tangled
%X Automatic metrics are fundamental for the development and evaluation of machine translation systems. Judging whether, and to what extent, automatic metrics concur with the gold standard of human evaluation is not a straightforward problem. We show that current methods for judging metrics are highly sensitive to the translations used for assessment, particularly the presence of outliers, which often leads to falsely confident conclusions about a metric’s efficacy. Finally, we turn to pairwise system ranking, developing a method for thresholding performance improvement under an automatic metric against human judgements, which allows quantification of type I versus type II errors incurred, i.e., insignificant human differences in system quality that are accepted, and significant human differences that are rejected. Together, these findings suggest improvements to the protocols for metric evaluation and system performance evaluation in machine translation.
%R 10.18653/v1/2020.acl-main.448
%U https://aclanthology.org/2020.acl-main.448
%U https://doi.org/10.18653/v1/2020.acl-main.448
%P 4984-4997
Markdown (Informal)
[Tangled up in BLEU: Reevaluating the Evaluation of Automatic Machine Translation Evaluation Metrics](https://aclanthology.org/2020.acl-main.448) (Mathur et al., ACL 2020)
ACL
Nitika Mathur, Timothy Baldwin, and Trevor Cohn. 2020. [Tangled up in BLEU: Reevaluating the Evaluation of Automatic Machine Translation Evaluation Metrics](https://aclanthology.org/2020.acl-main.448). In *Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics*, pages 4984–4997, Online. Association for Computational Linguistics.