@inproceedings{thompson-etal-2024-improving,
title = "Improving Statistical Significance in Human Evaluation of Automatic Metrics via Soft Pairwise Accuracy",
author = "Thompson, Brian and
Mathur, Nitika and
Deutsch, Daniel and
Khayrallah, Huda",
editor = "Haddow, Barry and
Kocmi, Tom and
Koehn, Philipp and
Monz, Christof",
booktitle = "Proceedings of the Ninth Conference on Machine Translation",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.wmt-1.118",
pages = "1222--1234",
abstract = "Selecting an automatic metric that best emulates human annotators is often non-trivial, because there is no clear definition of {``}best emulates.{''} A meta-metric is required to compare the human judgments to the automatic metric scores, and metric rankings depend on the choice of meta-metric. We propose Soft Pairwise Accuracy (SPA), a new meta-metric that builds on Pairwise Accuracy (PA) but incorporates the statistical significance of both the human judgments and the metric scores. We show that SPA is more stable than PA with respect to changes in the number of systems/segments used for evaluation. We also show that PA can only assign a small set of distinct output values to metrics, and this results in many metrics being artificially assigned the exact same PA score. We demonstrate that SPA fixes this issue. Finally, we show that SPA is more discriminative than PA, producing more statistically significant comparisons between metrics. SPA was selected as the official system-level metric for the 2024 WMT Metrics Shared Task.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="thompson-etal-2024-improving">
<titleInfo>
<title>Improving Statistical Significance in Human Evaluation of Automatic Metrics via Soft Pairwise Accuracy</title>
</titleInfo>
<name type="personal">
<namePart type="given">Brian</namePart>
<namePart type="family">Thompson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nitika</namePart>
<namePart type="family">Mathur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Deutsch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Huda</namePart>
<namePart type="family">Khayrallah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Ninth Conference on Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Barry</namePart>
<namePart type="family">Haddow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tom</namePart>
<namePart type="family">Kocmi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Philipp</namePart>
<namePart type="family">Koehn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christof</namePart>
<namePart type="family">Monz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Selecting an automatic metric that best emulates human annotators is often non-trivial, because there is no clear definition of “best emulates.” A meta-metric is required to compare the human judgments to the automatic metric scores, and metric rankings depend on the choice of meta-metric. We propose Soft Pairwise Accuracy (SPA), a new meta-metric that builds on Pairwise Accuracy (PA) but incorporates the statistical significance of both the human judgments and the metric scores. We show that SPA is more stable than PA with respect to changes in the number of systems/segments used for evaluation. We also show that PA can only assign a small set of distinct output values to metrics, and this results in many metrics being artificially assigned the exact same PA score. We demonstrate that SPA fixes this issue. Finally, we show that SPA is more discriminative than PA, producing more statistically significant comparisons between metrics. SPA was selected as the official system-level metric for the 2024 WMT Metrics Shared Task.</abstract>
<identifier type="citekey">thompson-etal-2024-improving</identifier>
<location>
<url>https://aclanthology.org/2024.wmt-1.118</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>1222</start>
<end>1234</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Improving Statistical Significance in Human Evaluation of Automatic Metrics via Soft Pairwise Accuracy
%A Thompson, Brian
%A Mathur, Nitika
%A Deutsch, Daniel
%A Khayrallah, Huda
%Y Haddow, Barry
%Y Kocmi, Tom
%Y Koehn, Philipp
%Y Monz, Christof
%S Proceedings of the Ninth Conference on Machine Translation
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F thompson-etal-2024-improving
%X Selecting an automatic metric that best emulates human annotators is often non-trivial, because there is no clear definition of “best emulates.” A meta-metric is required to compare the human judgments to the automatic metric scores, and metric rankings depend on the choice of meta-metric. We propose Soft Pairwise Accuracy (SPA), a new meta-metric that builds on Pairwise Accuracy (PA) but incorporates the statistical significance of both the human judgments and the metric scores. We show that SPA is more stable than PA with respect to changes in the number of systems/segments used for evaluation. We also show that PA can only assign a small set of distinct output values to metrics, and this results in many metrics being artificially assigned the exact same PA score. We demonstrate that SPA fixes this issue. Finally, we show that SPA is more discriminative than PA, producing more statistically significant comparisons between metrics. SPA was selected as the official system-level metric for the 2024 WMT Metrics Shared Task.
%U https://aclanthology.org/2024.wmt-1.118
%P 1222-1234
Markdown (Informal)
[Improving Statistical Significance in Human Evaluation of Automatic Metrics via Soft Pairwise Accuracy](https://aclanthology.org/2024.wmt-1.118) (Thompson et al., WMT 2024)
ACL