@article{reiter-2018-structured,
title = "A Structured Review of the Validity of {BLEU}",
author = "Reiter, Ehud",
journal = "Computational Linguistics",
volume = "44",
number = "3",
month = sep,
year = "2018",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/J18-3002",
doi = "10.1162/coli_a_00322",
pages = "393--401",
abstract = "The BLEU metric has been widely used in NLP for over 15 years to evaluate NLP systems, especially in machine translation and natural language generation. I present a structured review of the evidence on whether BLEU is a valid evaluation technique{---}in other words, whether BLEU scores correlate with real-world utility and user-satisfaction of NLP systems; this review covers 284 correlations reported in 34 papers. Overall, the evidence supports using BLEU for diagnostic evaluation of MT systems (which is what it was originally proposed for), but does not support using BLEU outside of MT, for evaluation of individual texts, or for scientific hypothesis testing.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="reiter-2018-structured">
<titleInfo>
<title>A Structured Review of the Validity of BLEU</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ehud</namePart>
<namePart type="family">Reiter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2018-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>The BLEU metric has been widely used in NLP for over 15 years to evaluate NLP systems, especially in machine translation and natural language generation. I present a structured review of the evidence on whether BLEU is a valid evaluation technique—in other words, whether BLEU scores correlate with real-world utility and user-satisfaction of NLP systems; this review covers 284 correlations reported in 34 papers. Overall, the evidence supports using BLEU for diagnostic evaluation of MT systems (which is what it was originally proposed for), but does not support using BLEU outside of MT, for evaluation of individual texts, or for scientific hypothesis testing.</abstract>
<identifier type="citekey">reiter-2018-structured</identifier>
<identifier type="doi">10.1162/coli_a_00322</identifier>
<location>
<url>https://aclanthology.org/J18-3002</url>
</location>
<part>
<date>2018-09</date>
<detail type="volume"><number>44</number></detail>
<detail type="issue"><number>3</number></detail>
<extent unit="page">
<start>393</start>
<end>401</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T A Structured Review of the Validity of BLEU
%A Reiter, Ehud
%J Computational Linguistics
%D 2018
%8 September
%V 44
%N 3
%I MIT Press
%C Cambridge, MA
%F reiter-2018-structured
%X The BLEU metric has been widely used in NLP for over 15 years to evaluate NLP systems, especially in machine translation and natural language generation. I present a structured review of the evidence on whether BLEU is a valid evaluation technique—in other words, whether BLEU scores correlate with real-world utility and user-satisfaction of NLP systems; this review covers 284 correlations reported in 34 papers. Overall, the evidence supports using BLEU for diagnostic evaluation of MT systems (which is what it was originally proposed for), but does not support using BLEU outside of MT, for evaluation of individual texts, or for scientific hypothesis testing.
%R 10.1162/coli_a_00322
%U https://aclanthology.org/J18-3002
%U https://doi.org/10.1162/coli_a_00322
%P 393-401
Markdown (Informal)
[A Structured Review of the Validity of BLEU](https://aclanthology.org/J18-3002) (Reiter, CL 2018)
ACL