@inproceedings{deriu-etal-2023-correction,
title = "Correction of Errors in Preference Ratings from Automated Metrics for Text Generation",
author = {Deriu, Jan and
von D{\"a}niken, Pius and
Tuggener, Don and
Cieliebak, Mark},
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-acl.404",
doi = "10.18653/v1/2023.findings-acl.404",
pages = "6456--6474",
abstract = "A major challenge in the field of Text Generation is evaluation: Human evaluations are cost-intensive, and automated metrics often display considerable disagreements with human judgments. In this paper, we propose to apply automated metrics for Text Generation in a preference-based evaluation protocol. The protocol features a statistical model that incorporates various levels of uncertainty to account for the error-proneness of the metrics. We show that existing metrics are generally over-confident in assigning significant differences between systems. As a remedy, the model allows to combine human ratings with automated ratings. We show that it can reduce the required amounts of human ratings to arrive at robust and statistically significant results by more than 50{\%}, while yielding the same evaluation outcome as the pure human evaluation in 95{\%} of cases. We showcase the benefits of the evaluation protocol for three text generation tasks: dialogue systems, machine translation, and text summarization.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="deriu-etal-2023-correction">
<titleInfo>
<title>Correction of Errors in Preference Ratings from Automated Metrics for Text Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Deriu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pius</namePart>
<namePart type="family">von Däniken</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Don</namePart>
<namePart type="family">Tuggener</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Cieliebak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Rogers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jordan</namePart>
<namePart type="family">Boyd-Graber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Naoaki</namePart>
<namePart type="family">Okazaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Toronto, Canada</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>A major challenge in the field of Text Generation is evaluation: Human evaluations are cost-intensive, and automated metrics often display considerable disagreements with human judgments. In this paper, we propose to apply automated metrics for Text Generation in a preference-based evaluation protocol. The protocol features a statistical model that incorporates various levels of uncertainty to account for the error-proneness of the metrics. We show that existing metrics are generally over-confident in assigning significant differences between systems. As a remedy, the model allows to combine human ratings with automated ratings. We show that it can reduce the required amounts of human ratings to arrive at robust and statistically significant results by more than 50%, while yielding the same evaluation outcome as the pure human evaluation in 95% of cases. We showcase the benefits of the evaluation protocol for three text generation tasks: dialogue systems, machine translation, and text summarization.</abstract>
<identifier type="citekey">deriu-etal-2023-correction</identifier>
<identifier type="doi">10.18653/v1/2023.findings-acl.404</identifier>
<location>
<url>https://aclanthology.org/2023.findings-acl.404</url>
</location>
<part>
<date>2023-07</date>
<extent unit="page">
<start>6456</start>
<end>6474</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Correction of Errors in Preference Ratings from Automated Metrics for Text Generation
%A Deriu, Jan
%A von Däniken, Pius
%A Tuggener, Don
%A Cieliebak, Mark
%Y Rogers, Anna
%Y Boyd-Graber, Jordan
%Y Okazaki, Naoaki
%S Findings of the Association for Computational Linguistics: ACL 2023
%D 2023
%8 July
%I Association for Computational Linguistics
%C Toronto, Canada
%F deriu-etal-2023-correction
%X A major challenge in the field of Text Generation is evaluation: Human evaluations are cost-intensive, and automated metrics often display considerable disagreements with human judgments. In this paper, we propose to apply automated metrics for Text Generation in a preference-based evaluation protocol. The protocol features a statistical model that incorporates various levels of uncertainty to account for the error-proneness of the metrics. We show that existing metrics are generally over-confident in assigning significant differences between systems. As a remedy, the model allows to combine human ratings with automated ratings. We show that it can reduce the required amounts of human ratings to arrive at robust and statistically significant results by more than 50%, while yielding the same evaluation outcome as the pure human evaluation in 95% of cases. We showcase the benefits of the evaluation protocol for three text generation tasks: dialogue systems, machine translation, and text summarization.
%R 10.18653/v1/2023.findings-acl.404
%U https://aclanthology.org/2023.findings-acl.404
%U https://doi.org/10.18653/v1/2023.findings-acl.404
%P 6456-6474
Markdown (Informal)
[Correction of Errors in Preference Ratings from Automated Metrics for Text Generation](https://aclanthology.org/2023.findings-acl.404) (Deriu et al., Findings 2023)
ACL