@inproceedings{lo-knowles-2023-data,
title = "Data Sampling and (In)stability in Machine Translation Evaluation",
author = "Lo, Chi-kiu and
Knowles, Rebecca",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.826",
doi = "10.18653/v1/2023.findings-acl.826",
pages = "13064--13074",
abstract = "We analyze the different data sampling approaches used in selecting data for human evaluation and ranking of machine translation systems at the highly influential Conference on Machine Translation (WMT). By using automatic evaluation metrics, we are able to focus on the impact of the data sampling procedure as separate from questions about human annotator consistency. We provide evidence that the latest data sampling approach used at WMT skews the annotated data toward shorter documents, not necessarily representative of the full test set. Lastly, we examine a new data sampling method that uses the available labour budget to sample data in a more representative manner, with the goals of improving representation of various document lengths in the sample and producing more stable rankings of system translation quality.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lo-knowles-2023-data">
<titleInfo>
<title>Data Sampling and (In)stability in Machine Translation Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chi-kiu</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rebecca</namePart>
<namePart type="family">Knowles</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We analyze the different data sampling approaches used in selecting data for human evaluation and ranking of machine translation systems at the highly influential Conference on Machine Translation (WMT). By using automatic evaluation metrics, we are able to focus on the impact of the data sampling procedure as separate from questions about human annotator consistency. We provide evidence that the latest data sampling approach used at WMT skews the annotated data toward shorter documents, not necessarily representative of the full test set. Lastly, we examine a new data sampling method that uses the available labour budget to sample data in a more representative manner, with the goals of improving representation of various document lengths in the sample and producing more stable rankings of system translation quality.</abstract>
<identifier type="citekey">lo-knowles-2023-data</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.826</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.826</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>13064</start>
<end>13074</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Data Sampling and (In)stability in Machine Translation Evaluation
%A Lo, Chi-kiu
%A Knowles, Rebecca
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F lo-knowles-2023-data
%X We analyze the different data sampling approaches used in selecting data for human evaluation and ranking of machine translation systems at the highly influential Conference on Machine Translation (WMT). By using automatic evaluation metrics, we are able to focus on the impact of the data sampling procedure as separate from questions about human annotator consistency. We provide evidence that the latest data sampling approach used at WMT skews the annotated data toward shorter documents, not necessarily representative of the full test set. Lastly, we examine a new data sampling method that uses the available labour budget to sample data in a more representative manner, with the goals of improving representation of various document lengths in the sample and producing more stable rankings of system translation quality.
%R 10.18653/v1/2023.findings-acl.826
%U https://aclanthology.org/2023.findings-acl.826
%U https://doi.org/10.18653/v1/2023.findings-acl.826
%P 13064-13074
Markdown (Informal)
[Data Sampling and (In)stability in Machine Translation Evaluation](https://aclanthology.org/2023.findings-acl.826) (Lo & Knowles, Findings 2023)
ACL