@inproceedings{mathur-etal-2020-results,
title = "Results of the {WMT}20 Metrics Shared Task",
author = "Mathur, Nitika and
Wei, Johnny and
Freitag, Markus and
Ma, Qingsong and
Bojar, Ond{\v{r}}ej",
booktitle = "Proceedings of the Fifth Conference on Machine Translation",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.wmt-1.77",
pages = "688--725",
abstract = "This paper presents the results of the WMT20 Metrics Shared Task. Participants were asked to score the outputs of the translation systems competing in the WMT20 News Translation Task with automatic metrics. Ten research groups submitted 27 metrics, four of which are reference-less {``}metrics{''}. In addition, we computed five baseline metrics, including sentBLEU, BLEU, TER and using the SacreBLEU scorer. All metrics were evaluated on how well they correlate at the system-, document- and segment-level with the WMT20 official human scores. We present an extensive analysis on influence of different reference translations on metric reliability, how well automatic metrics score human translations, and we also flag major discrepancies between metric and human scores when evaluating MT systems. Finally, we investigate whether we can use automatic metrics to flag incorrect human ratings.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="mathur-etal-2020-results">
<titleInfo>
<title>Results of the WMT20 Metrics Shared Task</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nitika</namePart>
<namePart type="family">Mathur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johnny</namePart>
<namePart type="family">Wei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Markus</namePart>
<namePart type="family">Freitag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qingsong</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Bojar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Conference on Machine Translation</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents the results of the WMT20 Metrics Shared Task. Participants were asked to score the outputs of the translation systems competing in the WMT20 News Translation Task with automatic metrics. Ten research groups submitted 27 metrics, four of which are reference-less “metrics”. In addition, we computed five baseline metrics, including sentBLEU, BLEU, TER and using the SacreBLEU scorer. All metrics were evaluated on how well they correlate at the system-, document- and segment-level with the WMT20 official human scores. We present an extensive analysis on influence of different reference translations on metric reliability, how well automatic metrics score human translations, and we also flag major discrepancies between metric and human scores when evaluating MT systems. Finally, we investigate whether we can use automatic metrics to flag incorrect human ratings.</abstract>
<identifier type="citekey">mathur-etal-2020-results</identifier>
<location>
<url>https://aclanthology.org/2020.wmt-1.77</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>688</start>
<end>725</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Results of the WMT20 Metrics Shared Task
%A Mathur, Nitika
%A Wei, Johnny
%A Freitag, Markus
%A Ma, Qingsong
%A Bojar, Ondřej
%S Proceedings of the Fifth Conference on Machine Translation
%D 2020
%8 November
%I Association for Computational Linguistics
%C Online
%F mathur-etal-2020-results
%X This paper presents the results of the WMT20 Metrics Shared Task. Participants were asked to score the outputs of the translation systems competing in the WMT20 News Translation Task with automatic metrics. Ten research groups submitted 27 metrics, four of which are reference-less “metrics”. In addition, we computed five baseline metrics, including sentBLEU, BLEU, TER and using the SacreBLEU scorer. All metrics were evaluated on how well they correlate at the system-, document- and segment-level with the WMT20 official human scores. We present an extensive analysis on influence of different reference translations on metric reliability, how well automatic metrics score human translations, and we also flag major discrepancies between metric and human scores when evaluating MT systems. Finally, we investigate whether we can use automatic metrics to flag incorrect human ratings.
%U https://aclanthology.org/2020.wmt-1.77
%P 688-725
Markdown (Informal)
[Results of the WMT20 Metrics Shared Task](https://aclanthology.org/2020.wmt-1.77) (Mathur et al., WMT 2020)
ACL
- Nitika Mathur, Johnny Wei, Markus Freitag, Qingsong Ma, and Ondřej Bojar. 2020. Results of the WMT20 Metrics Shared Task. In Proceedings of the Fifth Conference on Machine Translation, pages 688–725, Online. Association for Computational Linguistics.