@inproceedings{saldias-fuentes-etal-2022-toward,
title = "Toward More Effective Human Evaluation for Machine Translation",
author = "Sald{\'\i}as Fuentes, Bel{\'e}n and
Foster, George and
Freitag, Markus and
Tan, Qijun",
editor = "Belz, Anya and
Popovi{\'c}, Maja and
Reiter, Ehud and
Shimorina, Anastasia",
booktitle = "Proceedings of the 2nd Workshop on Human Evaluation of NLP Systems (HumEval)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.humeval-1.7",
doi = "10.18653/v1/2022.humeval-1.7",
pages = "76--89",
abstract = "Improvements in text generation technologies such as machine translation have necessitated more costly and time-consuming human evaluation procedures to ensure an accurate signal. We investigate a simple way to reduce cost by reducing the number of text segments that must be annotated in order to accurately predict a score for a complete test set. Using a sampling approach, we demonstrate that information from document membership and automatic metrics can help improve estimates compared to a pure random sampling baseline. We achieve gains of up to 20{\%} in average absolute error by leveraging stratified sampling and control variates. Our techniques can improve estimates made from a fixed annotation budget, are easy to implement, and can be applied to any problem with structure similar to the one we study.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="saldias-fuentes-etal-2022-toward">
<titleInfo>
<title>Toward More Effective Human Evaluation for Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Belén</namePart>
<namePart type="family">Saldías Fuentes</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">George</namePart>
<namePart type="family">Foster</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Markus</namePart>
<namePart type="family">Freitag</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qijun</namePart>
<namePart type="family">Tan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Workshop on Human Evaluation of NLP Systems (HumEval)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anya</namePart>
<namePart type="family">Belz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maja</namePart>
<namePart type="family">Popović</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ehud</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anastasia</namePart>
<namePart type="family">Shimorina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Improvements in text generation technologies such as machine translation have necessitated more costly and time-consuming human evaluation procedures to ensure an accurate signal. We investigate a simple way to reduce cost by reducing the number of text segments that must be annotated in order to accurately predict a score for a complete test set. Using a sampling approach, we demonstrate that information from document membership and automatic metrics can help improve estimates compared to a pure random sampling baseline. We achieve gains of up to 20% in average absolute error by leveraging stratified sampling and control variates. Our techniques can improve estimates made from a fixed annotation budget, are easy to implement, and can be applied to any problem with structure similar to the one we study.</abstract>
<identifier type="citekey">saldias-fuentes-etal-2022-toward</identifier>
<identifier type="doi">10.18653/v1/2022.humeval-1.7</identifier>
<location>
<url>https://aclanthology.org/2022.humeval-1.7</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>76</start>
<end>89</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Toward More Effective Human Evaluation for Machine Translation
%A Saldías Fuentes, Belén
%A Foster, George
%A Freitag, Markus
%A Tan, Qijun
%Y Belz, Anya
%Y Popović, Maja
%Y Reiter, Ehud
%Y Shimorina, Anastasia
%S Proceedings of the 2nd Workshop on Human Evaluation of NLP Systems (HumEval)
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F saldias-fuentes-etal-2022-toward
%X Improvements in text generation technologies such as machine translation have necessitated more costly and time-consuming human evaluation procedures to ensure an accurate signal. We investigate a simple way to reduce cost by reducing the number of text segments that must be annotated in order to accurately predict a score for a complete test set. Using a sampling approach, we demonstrate that information from document membership and automatic metrics can help improve estimates compared to a pure random sampling baseline. We achieve gains of up to 20% in average absolute error by leveraging stratified sampling and control variates. Our techniques can improve estimates made from a fixed annotation budget, are easy to implement, and can be applied to any problem with structure similar to the one we study.
%R 10.18653/v1/2022.humeval-1.7
%U https://aclanthology.org/2022.humeval-1.7
%U https://doi.org/10.18653/v1/2022.humeval-1.7
%P 76-89
Markdown (Informal)
[Toward More Effective Human Evaluation for Machine Translation](https://aclanthology.org/2022.humeval-1.7) (Saldías Fuentes et al., HumEval 2022)
ACL