@inproceedings{coughlin-2003-correlating,
title = "Correlating automated and human assessments of machine translation quality",
author = "Coughlin, Deborah",
booktitle = "Proceedings of Machine Translation Summit IX: Papers",
month = sep # " 23-27",
year = "2003",
address = "New Orleans, USA",
url = "https://aclanthology.org/2003.mtsummit-papers.9",
abstract = "We describe a large-scale investigation of the correlation between human judgments of machine translation quality and the automated metrics that are increasingly used to drive progress in the field. We compare the results of 124 human evaluations of machine translated sentences to the scores generated by two automatic evaluation metrics (BLEU and NIST). When datasets are held constant or file size is sufficiently large, BLEU and NIST scores closely parallel human judgments. Surprisingly, this was true even though these scores were calculated using just one human reference. We suggest that when human evaluators are forced to make decisions without sufficient context or domain expertise, they fall back on strategies that are not unlike determining n-gram precision.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="coughlin-2003-correlating">
<titleInfo>
<title>Correlating automated and human assessments of machine translation quality</title>
</titleInfo>
<name type="personal">
<namePart type="given">Deborah</namePart>
<namePart type="family">Coughlin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2003-sep 23-27</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of Machine Translation Summit IX: Papers</title>
</titleInfo>
<originInfo>
<place>
<placeTerm type="text">New Orleans, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We describe a large-scale investigation of the correlation between human judgments of machine translation quality and the automated metrics that are increasingly used to drive progress in the field. We compare the results of 124 human evaluations of machine translated sentences to the scores generated by two automatic evaluation metrics (BLEU and NIST). When datasets are held constant or file size is sufficiently large, BLEU and NIST scores closely parallel human judgments. Surprisingly, this was true even though these scores were calculated using just one human reference. We suggest that when human evaluators are forced to make decisions without sufficient context or domain expertise, they fall back on strategies that are not unlike determining n-gram precision.</abstract>
<identifier type="citekey">coughlin-2003-correlating</identifier>
<location>
<url>https://aclanthology.org/2003.mtsummit-papers.9</url>
</location>
<part>
<date>2003-sep 23-27</date>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Correlating automated and human assessments of machine translation quality
%A Coughlin, Deborah
%S Proceedings of Machine Translation Summit IX: Papers
%D 2003
%8 sep 23 27
%C New Orleans, USA
%F coughlin-2003-correlating
%X We describe a large-scale investigation of the correlation between human judgments of machine translation quality and the automated metrics that are increasingly used to drive progress in the field. We compare the results of 124 human evaluations of machine translated sentences to the scores generated by two automatic evaluation metrics (BLEU and NIST). When datasets are held constant or file size is sufficiently large, BLEU and NIST scores closely parallel human judgments. Surprisingly, this was true even though these scores were calculated using just one human reference. We suggest that when human evaluators are forced to make decisions without sufficient context or domain expertise, they fall back on strategies that are not unlike determining n-gram precision.
%U https://aclanthology.org/2003.mtsummit-papers.9
Markdown (Informal)
[Correlating automated and human assessments of machine translation quality](https://aclanthology.org/2003.mtsummit-papers.9) (Coughlin, MTSummit 2003)
ACL