@inproceedings{graham-etal-2020-statistical,
title = "Statistical Power and Translationese in Machine Translation Evaluation",
author = "Graham, Yvette and
Haddow, Barry and
Koehn, Philipp",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.emnlp-main.6",
doi = "10.18653/v1/2020.emnlp-main.6",
pages = "72--81",
abstract = "The term translationese has been used to describe features of translated text, and in this paper, we provide detailed analysis of potential adverse effects of translationese on machine translation evaluation. Our analysis shows differences in conclusions drawn from evaluations that include translationese in test data compared to experiments that tested only with text originally composed in that language. For this reason we recommend that reverse-created test data be omitted from future machine translation test sets. In addition, we provide a re-evaluation of a past machine translation evaluation claiming human-parity of MT. One important issue not previously considered is statistical power of significance tests applied to comparison of human and machine translation. Since the very aim of past evaluations was investigation of ties between human and MT systems, power analysis is of particular importance, to avoid, for example, claims of human parity simply corresponding to Type II error resulting from the application of a low powered test. We provide detailed analysis of tests used in such evaluations to provide an indication of a suitable minimum sample size for future studies.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="graham-etal-2020-statistical">
<titleInfo>
<title>Statistical Power and Translationese in Machine Translation Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yvette</namePart>
<namePart type="family">Graham</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Bonnie</namePart>
<namePart type="family">Webber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Trevor</namePart>
<namePart type="family">Cohn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The term translationese has been used to describe features of translated text, and in this paper, we provide detailed analysis of potential adverse effects of translationese on machine translation evaluation. Our analysis shows differences in conclusions drawn from evaluations that include translationese in test data compared to experiments that tested only with text originally composed in that language. For this reason we recommend that reverse-created test data be omitted from future machine translation test sets. In addition, we provide a re-evaluation of a past machine translation evaluation claiming human-parity of MT. One important issue not previously considered is statistical power of significance tests applied to comparison of human and machine translation. Since the very aim of past evaluations was investigation of ties between human and MT systems, power analysis is of particular importance, to avoid, for example, claims of human parity simply corresponding to Type II error resulting from the application of a low powered test. We provide detailed analysis of tests used in such evaluations to provide an indication of a suitable minimum sample size for future studies.</abstract>
<identifier type="citekey">graham-etal-2020-statistical</identifier>
<identifier type="doi">10.18653/v1/2020.emnlp-main.6</identifier>
<location>
<url>https://aclanthology.org/2020.emnlp-main.6</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>72</start>
<end>81</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Statistical Power and Translationese in Machine Translation Evaluation
%A Graham, Yvette
%A Haddow, Barry
%A Koehn, Philipp
%Y Webber, Bonnie
%Y Cohn, Trevor
%Y He, Yulan
%Y Liu, Yang
%S Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F graham-etal-2020-statistical
%X The term translationese has been used to describe features of translated text, and in this paper, we provide detailed analysis of potential adverse effects of translationese on machine translation evaluation. Our analysis shows differences in conclusions drawn from evaluations that include translationese in test data compared to experiments that tested only with text originally composed in that language. For this reason we recommend that reverse-created test data be omitted from future machine translation test sets. In addition, we provide a re-evaluation of a past machine translation evaluation claiming human-parity of MT. One important issue not previously considered is statistical power of significance tests applied to comparison of human and machine translation. Since the very aim of past evaluations was investigation of ties between human and MT systems, power analysis is of particular importance, to avoid, for example, claims of human parity simply corresponding to Type II error resulting from the application of a low powered test. We provide detailed analysis of tests used in such evaluations to provide an indication of a suitable minimum sample size for future studies.
%R 10.18653/v1/2020.emnlp-main.6
%U https://aclanthology.org/2020.emnlp-main.6
%U https://doi.org/10.18653/v1/2020.emnlp-main.6
%P 72-81
Markdown (Informal)
[Statistical Power and Translationese in Machine Translation Evaluation](https://aclanthology.org/2020.emnlp-main.6) (Graham et al., EMNLP 2020)
ACL