% Conference paper record (LREC 2010), https://aclanthology.org/L10-1051/
% Apostrophes are written as ' -- a backtick would typeset as an opening quote in LaTeX.
@inproceedings{sun-2010-mining,
    title = "Mining the Correlation between Human and Automatic Evaluation at Sentence Level",
    author = "Sun, Yanli",
    editor = "Calzolari, Nicoletta and
      Choukri, Khalid and
      Maegaard, Bente and
      Mariani, Joseph and
      Odijk, Jan and
      Piperidis, Stelios and
      Rosner, Mike and
      Tapias, Daniel",
    booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}'10)",
    month = may,
    year = "2010",
    address = "Valletta, Malta",
    publisher = "European Language Resources Association (ELRA)",
    url = "https://aclanthology.org/L10-1051/",
    abstract = "Automatic evaluation metrics are fast and cost-effective measurements of the quality of a Machine Translation (MT) system. However, as humans are the end-user of MT output, human judgement is the benchmark to assess the usefulness of automatic evaluation metrics. While most studies report the correlation between human evaluation and automatic evaluation at corpus level, our study examines their correlation at sentence level. In addition to the statistical correlation scores, such as Spearman's rank-order correlation coefficient, a finer-grained and detailed examination of the sensitivity of automatic metrics compared to human evaluation is also reported in this study. The results show that the threshold for human evaluators to agree with the judgements of automatic metrics varies with the automatic metrics at sentence level. While the automatic scores for two translations are greatly different, human evaluators may consider the translations to be qualitatively similar and vice versa. The detailed analysis of the correlation between automatic and human evaluation allows us determine with increased confidence whether an increase in the automatic scores will be agreed by human evaluators or not."
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey sun-2010-mining: one personal author, with the
     host proceedings (editors, publisher, place) nested under relatedItem. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="sun-2010-mining">
    <titleInfo>
      <title>Mining the Correlation between Human and Automatic Evaluation at Sentence Level</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yanli</namePart>
      <namePart type="family">Sun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2010-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Choukri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bente</namePart>
        <namePart type="family">Maegaard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joseph</namePart>
        <namePart type="family">Mariani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jan</namePart>
        <namePart type="family">Odijk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stelios</namePart>
        <namePart type="family">Piperidis</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mike</namePart>
        <namePart type="family">Rosner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Tapias</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association (ELRA)</publisher>
        <place>
          <placeTerm type="text">Valletta, Malta</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Automatic evaluation metrics are fast and cost-effective measurements of the quality of a Machine Translation (MT) system. However, as humans are the end-user of MT output, human judgement is the benchmark to assess the usefulness of automatic evaluation metrics. While most studies report the correlation between human evaluation and automatic evaluation at corpus level, our study examines their correlation at sentence level. In addition to the statistical correlation scores, such as Spearman’s rank-order correlation coefficient, a finer-grained and detailed examination of the sensitivity of automatic metrics compared to human evaluation is also reported in this study. The results show that the threshold for human evaluators to agree with the judgements of automatic metrics varies with the automatic metrics at sentence level. While the automatic scores for two translations are greatly different, human evaluators may consider the translations to be qualitatively similar and vice versa. The detailed analysis of the correlation between automatic and human evaluation allows us determine with increased confidence whether an increase in the automatic scores will be agreed by human evaluators or not.</abstract>
    <identifier type="citekey">sun-2010-mining</identifier>
    <location>
      <url>https://aclanthology.org/L10-1051/</url>
    </location>
    <part>
      <date>2010-05</date>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Mining the Correlation between Human and Automatic Evaluation at Sentence Level
%A Sun, Yanli
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Piperidis, Stelios
%Y Rosner, Mike
%Y Tapias, Daniel
%S Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC’10)
%D 2010
%8 May
%I European Language Resources Association (ELRA)
%C Valletta, Malta
%F sun-2010-mining
%X Automatic evaluation metrics are fast and cost-effective measurements of the quality of a Machine Translation (MT) system. However, as humans are the end-user of MT output, human judgement is the benchmark to assess the usefulness of automatic evaluation metrics. While most studies report the correlation between human evaluation and automatic evaluation at corpus level, our study examines their correlation at sentence level. In addition to the statistical correlation scores, such as Spearman’s rank-order correlation coefficient, a finer-grained and detailed examination of the sensitivity of automatic metrics compared to human evaluation is also reported in this study. The results show that the threshold for human evaluators to agree with the judgements of automatic metrics varies with the automatic metrics at sentence level. While the automatic scores for two translations are greatly different, human evaluators may consider the translations to be qualitatively similar and vice versa. The detailed analysis of the correlation between automatic and human evaluation allows us determine with increased confidence whether an increase in the automatic scores will be agreed by human evaluators or not.
%U https://aclanthology.org/L10-1051/
Markdown (Informal)
[Mining the Correlation between Human and Automatic Evaluation at Sentence Level](https://aclanthology.org/L10-1051/) (Sun, LREC 2010)
ACL