% Conference paper (pages 181--201) in the AMTA 2022 proceedings, Volume 2.
@inproceedings{garland-etal-2022-comparison,
  author    = {Garland, Evelyn and
               Berger, Carola and
               Ritzdorf, Jon},
  title     = {Comparison Between {ATA} Grading Framework Scores and Auto Scores},
  editor    = {Campbell, Janice and
               Larocca, Stephen and
               Marciano, Jay and
               Savenkov, Konstantin and
               Yanishevsky, Alex},
  booktitle = {Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)},
  publisher = {Association for Machine Translation in the Americas},
  address   = {Orlando, USA},
  month     = sep,
  year      = {2022},
  pages     = {181--201},
  url       = {https://aclanthology.org/2022.amta-upg.13},
  abstract  = {The authors of this study compared two types of translation quality scores assigned to the same sets of translation samples: 1) the ATA Grading Framework scores assigned by human experts, and 2) auto scores, including BLEU, TER, and COMET (with and without reference). They further explored the impact of different reference translations on the auto scores. Key findings from this study include: 1. auto scores that rely on reference translations depend heavily on which reference is used; 2. referenceless COMET seems promising when it is used to evaluate translations of short passages (250-300 English words); and 3. evidence suggests good agreement between the ATA-Framework score and some auto scores within a middle range, but the relationship becomes non-monotonic beyond the middle range. This study is subject to the limitation of a small sample size and is a retrospective exploratory study not specifically designed to test a pre-defined hypothesis.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="garland-etal-2022-comparison">
    <!-- Title of the paper -->
    <titleInfo>
      <title>Comparison Between ATA Grading Framework Scores and Auto Scores</title>
    </titleInfo>
    <!-- Authors, in byline order -->
    <name type="personal">
      <namePart type="given">Evelyn</namePart>
      <namePart type="family">Garland</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Carola</namePart>
      <namePart type="family">Berger</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jon</namePart>
      <namePart type="family">Ritzdorf</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2022-09</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Janice</namePart>
        <namePart type="family">Campbell</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Stephen</namePart>
        <namePart type="family">Larocca</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jay</namePart>
        <namePart type="family">Marciano</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Konstantin</namePart>
        <namePart type="family">Savenkov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alex</namePart>
        <namePart type="family">Yanishevsky</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Machine Translation in the Americas</publisher>
        <place>
          <placeTerm type="text">Orlando, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>The authors of this study compared two types of translation quality scores assigned to the same sets of translation samples: 1) the ATA Grading Framework scores assigned by human experts, and 2) auto scores, including BLEU, TER, and COMET (with and without reference). They further explored the impact of different reference translations on the auto scores. Key findings from this study include: 1. auto scores that rely on reference translations depend heavily on which reference is used; 2. referenceless COMET seems promising when it is used to evaluate translations of short passages (250-300 English words); and 3. evidence suggests good agreement between the ATA-Framework score and some auto scores within a middle range, but the relationship becomes non-monotonic beyond the middle range. This study is subject to the limitation of a small sample size and is a retrospective exploratory study not specifically designed to test a pre-defined hypothesis.</abstract>
    <!-- Citation key, landing URL and page extent within the host volume -->
    <identifier type="citekey">garland-etal-2022-comparison</identifier>
    <location>
      <url>https://aclanthology.org/2022.amta-upg.13</url>
    </location>
    <part>
      <date>2022-09</date>
      <extent unit="page">
        <start>181</start>
        <end>201</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Comparison Between ATA Grading Framework Scores and Auto Scores
%A Garland, Evelyn
%A Berger, Carola
%A Ritzdorf, Jon
%Y Campbell, Janice
%Y Larocca, Stephen
%Y Marciano, Jay
%Y Savenkov, Konstantin
%Y Yanishevsky, Alex
%S Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track)
%D 2022
%8 September
%I Association for Machine Translation in the Americas
%C Orlando, USA
%F garland-etal-2022-comparison
%X The authors of this study compared two types of translation quality scores assigned to the same sets of translation samples: 1) the ATA Grading Framework scores assigned by human experts, and 2) auto scores, including BLEU, TER, and COMET (with and without reference). They further explored the impact of different reference translations on the auto scores. Key findings from this study include: 1. auto scores that rely on reference translations depend heavily on which reference is used; 2. referenceless COMET seems promising when it is used to evaluate translations of short passages (250-300 English words); and 3. evidence suggests good agreement between the ATA-Framework score and some auto scores within a middle range, but the relationship becomes non-monotonic beyond the middle range. This study is subject to the limitation of a small sample size and is a retrospective exploratory study not specifically designed to test a pre-defined hypothesis.
%U https://aclanthology.org/2022.amta-upg.13
%P 181-201
Markdown (Informal)
[Comparison Between ATA Grading Framework Scores and Auto Scores](https://aclanthology.org/2022.amta-upg.13) (Garland et al., AMTA 2022)
ACL
- Evelyn Garland, Carola Berger, and Jon Ritzdorf. 2022. Comparison Between ATA Grading Framework Scores and Auto Scores. In Proceedings of the 15th Biennial Conference of the Association for Machine Translation in the Americas (Volume 2: Users and Providers Track and Government Track), pages 181–201, Orlando, USA. Association for Machine Translation in the Americas.