@inproceedings{specia-gimenez-2010-combining,
title = "Combining Confidence Estimation and Reference-based Metrics for Segment-level {MT} Evaluation",
author = "Specia, Lucia and
Gim{\'e}nez, Jes{\'u}s",
booktitle = "Proceedings of the 9th Conference of the Association for Machine Translation in the Americas: Research Papers",
month = oct # " 31-" # nov # " 4",
year = "2010",
address = "Denver, Colorado, USA",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2010.amta-papers.3",
abstract = "We describe an effort to improve standard reference-based metrics for Machine Translation (MT) evaluation by enriching them with Confidence Estimation (CE) features and using a learning mechanism trained on human annotations. Reference-based MT evaluation metrics compare the system output against reference translations looking for overlaps at different levels (lexical, syntactic, and semantic). These metrics aim at comparing MT systems or analyzing the progress of a given system and are known to have reasonably good correlation with human judgments at the corpus level, but not at the segment level. CE metrics, on the other hand, target the system in use, providing a quality score to the end-user for each translated segment. They cannot rely on reference translations, and use instead information extracted from the input text, system output and possibly external corpora to train machine learning algorithms. These metrics correlate better with human judgments at the segment level. However, they are usually highly biased by difficulty level of the input segment, and therefore are less appropriate for comparing multiple systems translating the same input segments. We show that these two classes of metrics are complementary and can be combined to provide MT evaluation metrics that achieve higher correlation with human judgments at the segment level.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="specia-gimenez-2010-combining">
    <titleInfo>
      <title>Combining Confidence Estimation and Reference-based Metrics for Segment-level MT Evaluation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Lucia</namePart>
      <namePart type="family">Specia</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jesús</namePart>
      <namePart type="family">Giménez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2010</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 9th Conference of the Association for Machine Translation in the Americas: Research Papers</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Machine Translation in the Americas</publisher>
        <place>
          <placeTerm type="text">Denver, Colorado, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We describe an effort to improve standard reference-based metrics for Machine Translation (MT) evaluation by enriching them with Confidence Estimation (CE) features and using a learning mechanism trained on human annotations. Reference-based MT evaluation metrics compare the system output against reference translations looking for overlaps at different levels (lexical, syntactic, and semantic). These metrics aim at comparing MT systems or analyzing the progress of a given system and are known to have reasonably good correlation with human judgments at the corpus level, but not at the segment level. CE metrics, on the other hand, target the system in use, providing a quality score to the end-user for each translated segment. They cannot rely on reference translations, and use instead information extracted from the input text, system output and possibly external corpora to train machine learning algorithms. These metrics correlate better with human judgments at the segment level. However, they are usually highly biased by the difficulty level of the input segment, and therefore are less appropriate for comparing multiple systems translating the same input segments. We show that these two classes of metrics are complementary and can be combined to provide MT evaluation metrics that achieve higher correlation with human judgments at the segment level.</abstract>
    <identifier type="citekey">specia-gimenez-2010-combining</identifier>
    <location>
      <url>https://aclanthology.org/2010.amta-papers.3</url>
    </location>
    <part>
      <date>October 31 - November 4, 2010</date>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Combining Confidence Estimation and Reference-based Metrics for Segment-level MT Evaluation
%A Specia, Lucia
%A Giménez, Jesús
%S Proceedings of the 9th Conference of the Association for Machine Translation in the Americas: Research Papers
%D 2010
%8 October 31 - November 4
%I Association for Machine Translation in the Americas
%C Denver, Colorado, USA
%F specia-gimenez-2010-combining
%X We describe an effort to improve standard reference-based metrics for Machine Translation (MT) evaluation by enriching them with Confidence Estimation (CE) features and using a learning mechanism trained on human annotations. Reference-based MT evaluation metrics compare the system output against reference translations looking for overlaps at different levels (lexical, syntactic, and semantic). These metrics aim at comparing MT systems or analyzing the progress of a given system and are known to have reasonably good correlation with human judgments at the corpus level, but not at the segment level. CE metrics, on the other hand, target the system in use, providing a quality score to the end-user for each translated segment. They cannot rely on reference translations, and use instead information extracted from the input text, system output and possibly external corpora to train machine learning algorithms. These metrics correlate better with human judgments at the segment level. However, they are usually highly biased by the difficulty level of the input segment, and therefore are less appropriate for comparing multiple systems translating the same input segments. We show that these two classes of metrics are complementary and can be combined to provide MT evaluation metrics that achieve higher correlation with human judgments at the segment level.
%U https://aclanthology.org/2010.amta-papers.3
Markdown (Informal)
[Combining Confidence Estimation and Reference-based Metrics for Segment-level MT Evaluation](https://aclanthology.org/2010.amta-papers.3) (Specia & Giménez, AMTA 2010)
ACL
Lucia Specia and Jesús Giménez. 2010. Combining Confidence Estimation and Reference-based Metrics for Segment-level MT Evaluation. In Proceedings of the 9th Conference of the Association for Machine Translation in the Americas: Research Papers, Denver, Colorado, USA. Association for Machine Translation in the Americas.
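
The abstract describes the approach only at a high level: concatenate reference-based metric scores with reference-free CE features and train a model on human judgments to produce a segment-level quality score. Below is a minimal sketch of that combination idea in Python; the SVR learner, the synthetic data, and all feature names are illustrative assumptions, not the paper's actual features or experimental setup.

# Sketch of the combination idea from the abstract: feed reference-based
# metric scores and reference-free CE features into a learned model trained
# on human judgments. Data, feature names, and the choice of SVR are
# illustrative assumptions, not the paper's exact setup.
import numpy as np
from scipy.stats import pearsonr
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
n = 200  # toy number of translated segments

# Reference-based metric scores per segment (e.g., sentence-level BLEU, METEOR).
ref_scores = rng.uniform(0, 1, size=(n, 2))
# Reference-free CE features (e.g., source length, LM score of the output).
ce_features = rng.uniform(0, 1, size=(n, 3))
# Human quality judgments that the combined metric is trained to predict.
human = 0.5 * ref_scores[:, 0] + 0.3 * ce_features[:, 1] + rng.normal(0, 0.1, n)

X = np.hstack([ref_scores, ce_features])  # combined feature vector per segment
X_tr, X_te, y_tr, y_te = train_test_split(X, human, random_state=0)

model = SVR(kernel="rbf").fit(X_tr, y_tr)  # learned segment-level metric
pred = model.predict(X_te)
print(f"segment-level Pearson r = {pearsonr(pred, y_te)[0]:.2f}")

Segment-level Pearson correlation against held-out human judgments, as printed at the end, is the kind of evaluation criterion the abstract refers to when it claims higher correlation for the combined metrics.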