@inproceedings{lavie-etal-2004-significance,
title = "The significance of recall in automatic metrics for {MT} evaluation",
author = "Lavie, Alon and
Sagae, Kenji and
Jayaraman, Shyamsundar",
editor = "Frederking, Robert E. and
Taylor, Kathryn B.",
booktitle = "Proceedings of the 6th Conference of the Association for Machine Translation in the Americas: Technical Papers",
month = sep # " 28 - " # oct # " 2",
year = "2004",
address = "Washington, USA",
publisher = "Springer",
url = "https://aclanthology.org/2004.amta-papers.16/",
pages = "134--143",
abstract = "Recent research has shown that a balanced harmonic mean (F1 measure) of unigram precision and recall outperforms the widely used BLEU and NIST metrics for Machine Translation evaluation in terms of correlation with human judgments of translation quality. We show that significantly better correlations can be achieved by placing more weight on recall than on precision. While this may seem unexpected, since BLEU and NIST focus on n-gram precision and disregard recall, our experiments show that correlation with human judgments is highest when almost all of the weight is assigned to recall. We also show that stemming is significantly beneficial not just to simpler unigram precision and recall based metrics, but also to BLEU and NIST."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lavie-etal-2004-significance">
<titleInfo>
<title>The significance of recall in automatic metrics for MT evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Alon</namePart>
<namePart type="family">Lavie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenji</namePart>
<namePart type="family">Sagae</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shyamsundar</namePart>
<namePart type="family">Jayaraman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued encoding="w3cdtf" point="start">2004-09-28</dateIssued>
<dateIssued encoding="w3cdtf" point="end">2004-10-02</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 6th Conference of the Association for Machine Translation in the Americas: Technical Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="given">E</namePart>
<namePart type="family">Frederking</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kathryn</namePart>
<namePart type="given">B</namePart>
<namePart type="family">Taylor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Springer</publisher>
<place>
<placeTerm type="text">Washington, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recent research has shown that a balanced harmonic mean (F1 measure) of unigram precision and recall outperforms the widely used BLEU and NIST metrics for Machine Translation evaluation in terms of correlation with human judgments of translation quality. We show that significantly better correlations can be achieved by placing more weight on recall than on precision. While this may seem unexpected, since BLEU and NIST focus on n-gram precision and disregard recall, our experiments show that correlation with human judgments is highest when almost all of the weight is assigned to recall. We also show that stemming is significantly beneficial not just to simpler unigram precision and recall based metrics, but also to BLEU and NIST.</abstract>
<identifier type="citekey">lavie-etal-2004-significance</identifier>
<location>
<url>https://aclanthology.org/2004.amta-papers.16/</url>
</location>
<part>
<date encoding="w3cdtf" point="start">2004-09-28</date>
<date encoding="w3cdtf" point="end">2004-10-02</date>
<extent unit="page">
<start>134</start>
<end>143</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T The significance of recall in automatic metrics for MT evaluation
%A Lavie, Alon
%A Sagae, Kenji
%A Jayaraman, Shyamsundar
%Y Frederking, Robert E.
%Y Taylor, Kathryn B.
%S Proceedings of the 6th Conference of the Association for Machine Translation in the Americas: Technical Papers
%D 2004
%8 September 28 - October 2
%I Springer
%C Washington, USA
%F lavie-etal-2004-significance
%X Recent research has shown that a balanced harmonic mean (F1 measure) of unigram precision and recall outperforms the widely used BLEU and NIST metrics for Machine Translation evaluation in terms of correlation with human judgments of translation quality. We show that significantly better correlations can be achieved by placing more weight on recall than on precision. While this may seem unexpected, since BLEU and NIST focus on n-gram precision and disregard recall, our experiments show that correlation with human judgments is highest when almost all of the weight is assigned to recall. We also show that stemming is significantly beneficial not just to simpler unigram precision and recall based metrics, but also to BLEU and NIST.
%U https://aclanthology.org/2004.amta-papers.16/
%P 134-143
Markdown (Informal)
[The significance of recall in automatic metrics for MT evaluation](https://aclanthology.org/2004.amta-papers.16/) (Lavie et al., AMTA 2004)
ACL