BibTeX
@inproceedings{denkowski-lavie-2010-choosing,
title = "Choosing the Right Evaluation for Machine Translation: an Examination of Annotator and Automatic Metric Performance on Human Judgment Tasks",
author = "Denkowski, Michael and
Lavie, Alon",
booktitle = "Proceedings of the 9th Conference of the Association for Machine Translation in the Americas: Research Papers",
month = oct # " 31-" # nov # " 4",
year = "2010",
address = "Denver, Colorado, USA",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2010.amta-papers.20",
abstract = "This paper examines the motivation, design, and practical results of several types of human evaluation tasks for machine translation. In addition to considering annotator performance and task informativeness over multiple evaluations, we explore the practicality of tuning automatic evaluation metrics to each judgment type in a comprehensive experiment using the METEOR-NEXT metric. We present results showing clear advantages of tuning to certain types of judgments and discuss causes of inconsistency when tuning to various judgment data, as well as sources of difficulty in the human evaluation tasks themselves.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="denkowski-lavie-2010-choosing">
<titleInfo>
<title>Choosing the Right Evaluation for Machine Translation: an Examination of Annotator and Automatic Metric Performance on Human Judgment Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Michael</namePart>
<namePart type="family">Denkowski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alon</namePart>
<namePart type="family">Lavie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>October 31 - November 4, 2010</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 9th Conference of the Association for Machine Translation in the Americas: Research Papers</title>
</titleInfo>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Denver, Colorado, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper examines the motivation, design, and practical results of several types of human evaluation tasks for machine translation. In addition to considering annotator performance and task informativeness over multiple evaluations, we explore the practicality of tuning automatic evaluation metrics to each judgment type in a comprehensive experiment using the METEOR-NEXT metric. We present results showing clear advantages of tuning to certain types of judgments and discuss causes of inconsistency when tuning to various judgment data, as well as sources of difficulty in the human evaluation tasks themselves.</abstract>
<identifier type="citekey">denkowski-lavie-2010-choosing</identifier>
<location>
<url>https://aclanthology.org/2010.amta-papers.20</url>
</location>
<part>
<date>October 31 - November 4, 2010</date>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T Choosing the Right Evaluation for Machine Translation: an Examination of Annotator and Automatic Metric Performance on Human Judgment Tasks
%A Denkowski, Michael
%A Lavie, Alon
%S Proceedings of the 9th Conference of the Association for Machine Translation in the Americas: Research Papers
%D 2010
%8 October 31-November 4
%I Association for Machine Translation in the Americas
%C Denver, Colorado, USA
%F denkowski-lavie-2010-choosing
%X This paper examines the motivation, design, and practical results of several types of human evaluation tasks for machine translation. In addition to considering annotator performance and task informativeness over multiple evaluations, we explore the practicality of tuning automatic evaluation metrics to each judgment type in a comprehensive experiment using the METEOR-NEXT metric. We present results showing clear advantages of tuning to certain types of judgments and discuss causes of inconsistency when tuning to various judgment data, as well as sources of difficulty in the human evaluation tasks themselves.
%U https://aclanthology.org/2010.amta-papers.20
Markdown (Informal)
[Choosing the Right Evaluation for Machine Translation: an Examination of Annotator and Automatic Metric Performance on Human Judgment Tasks](https://aclanthology.org/2010.amta-papers.20) (Denkowski & Lavie, AMTA 2010)
ACL
Michael Denkowski and Alon Lavie. 2010. [Choosing the Right Evaluation for Machine Translation: an Examination of Annotator and Automatic Metric Performance on Human Judgment Tasks](https://aclanthology.org/2010.amta-papers.20). In *Proceedings of the 9th Conference of the Association for Machine Translation in the Americas: Research Papers*, Denver, Colorado, USA. Association for Machine Translation in the Americas.