@inproceedings{noh-etal-2024-beyond,
title = "Beyond Reference: Evaluating High Quality Translations Better than Human References",
author = "Noh, Keonwoong and
Oh, Seokjin and
Jung, Woohwan",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.294",
pages = "5111--5127",
abstract = "In Machine Translation (MT) evaluations, the conventional approach is to compare a translated sentence against its human-created reference sentence. MT metrics provide an absolute score (e.g., from 0 to 1) to a candidate sentence based on the similarity with the reference sentence. Thus, existing MT metrics give the maximum score to the reference sentence. However, this approach overlooks the potential for a candidate sentence to exceed the reference sentence in terms of quality. In particular, recent advancements in Large Language Models (LLMs) have highlighted this issue, as LLM-generated sentences often exceed the quality of human-written sentences. To address the problem, we introduce the Residual score Metric (ResuMe), which evaluates the relative quality between reference and candidate sentences. ResuMe assigns a positive score to candidate sentences that outperform their reference sentences, and a negative score when they fall short. By adding the residual scores from ResuMe to the absolute scores from MT metrics, it can be possible to allocate higher scores to candidate sentences than what reference sentences are received from MT metrics. Experimental results demonstrate that ResuMe enhances the alignments between MT metrics and human judgments both at the segment-level and the system-level.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="noh-etal-2024-beyond">
<titleInfo>
<title>Beyond Reference: Evaluating High Quality Translations Better than Human References</title>
</titleInfo>
<name type="personal">
<namePart type="given">Keonwoong</namePart>
<namePart type="family">Noh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokjin</namePart>
<namePart type="family">Oh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Woohwan</namePart>
<namePart type="family">Jung</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In Machine Translation (MT) evaluations, the conventional approach is to compare a translated sentence against its human-created reference sentence. MT metrics provide an absolute score (e.g., from 0 to 1) to a candidate sentence based on the similarity with the reference sentence. Thus, existing MT metrics give the maximum score to the reference sentence. However, this approach overlooks the potential for a candidate sentence to exceed the reference sentence in terms of quality. In particular, recent advancements in Large Language Models (LLMs) have highlighted this issue, as LLM-generated sentences often exceed the quality of human-written sentences. To address the problem, we introduce the Residual score Metric (ResuMe), which evaluates the relative quality between reference and candidate sentences. ResuMe assigns a positive score to candidate sentences that outperform their reference sentences, and a negative score when they fall short. By adding the residual scores from ResuMe to the absolute scores from MT metrics, it can be possible to allocate higher scores to candidate sentences than what reference sentences are received from MT metrics. Experimental results demonstrate that ResuMe enhances the alignments between MT metrics and human judgments both at the segment-level and the system-level.</abstract>
<identifier type="citekey">noh-etal-2024-beyond</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.294</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>5111</start>
<end>5127</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Beyond Reference: Evaluating High Quality Translations Better than Human References
%A Noh, Keonwoong
%A Oh, Seokjin
%A Jung, Woohwan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F noh-etal-2024-beyond
%X In Machine Translation (MT) evaluations, the conventional approach is to compare a translated sentence against its human-created reference sentence. MT metrics provide an absolute score (e.g., from 0 to 1) to a candidate sentence based on the similarity with the reference sentence. Thus, existing MT metrics give the maximum score to the reference sentence. However, this approach overlooks the potential for a candidate sentence to exceed the reference sentence in terms of quality. In particular, recent advancements in Large Language Models (LLMs) have highlighted this issue, as LLM-generated sentences often exceed the quality of human-written sentences. To address the problem, we introduce the Residual score Metric (ResuMe), which evaluates the relative quality between reference and candidate sentences. ResuMe assigns a positive score to candidate sentences that outperform their reference sentences, and a negative score when they fall short. By adding the residual scores from ResuMe to the absolute scores from MT metrics, it can be possible to allocate higher scores to candidate sentences than what reference sentences are received from MT metrics. Experimental results demonstrate that ResuMe enhances the alignments between MT metrics and human judgments both at the segment-level and the system-level.
%U https://aclanthology.org/2024.emnlp-main.294
%P 5111-5127
Markdown (Informal)
[Beyond Reference: Evaluating High Quality Translations Better than Human References](https://aclanthology.org/2024.emnlp-main.294) (Noh et al., EMNLP 2024)
ACL