@inproceedings{sinitsyna-savenkov-2024-comparative,
  title     = {Comparative Evaluation of Large Language Models for Linguistic Quality Assessment in Machine Translation},
  author    = {Sinitsyna, Daria and Savenkov, Konstantin},
  editor    = {Martindale, Marianna and Campbell, Janice and Savenkov, Konstantin and Goel, Shivali},
  booktitle = {Proceedings of the 16th Conference of the Association for Machine Translation in the Americas (Volume 2: Presentations)},
  month     = sep,
  year      = {2024},
  address   = {Chicago, USA},
  publisher = {Association for Machine Translation in the Americas},
  url       = {https://aclanthology.org/2024.amta-presentations.12},
  pages     = {154--183},
  abstract  = {Building on our GPT-4 LQA research in MT, this study identifies top LLMs for an LQA pipeline with up to three models. LLMs like GPT-4, GPT-4o, GPT-4 Turbo, Google Vertex, Anthropic{'}s Claude 3, and Llama-3 are prompted using MQM error typology. These models generate segment-wise outputs describing translation errors, scored by severity and DQF-MQM penalties. The study evaluates four language pairs: English-Spanish, English-Chinese, English-German, and English-Portuguese, using datasets from our 2024 State of MT Report across eight domains. LLM outputs are correlated with human judgments, ranking models by alignment with human assessments for penalty score, issue presence, type, and severity. This research proposes an LQA pipeline with up to three models, weighted by output quality, highlighting LLMs{'} potential to enhance MT review processes and improve translation quality.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 bibliographic record mirroring the BibTeX entry above in this file. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<!-- @ID matches the BibTeX citation key (also repeated in the citekey identifier below). -->
<mods ID="sinitsyna-savenkov-2024-comparative">
<titleInfo>
<title>Comparative Evaluation of Large Language Models for Linguistic Quality Assessment in Machine Translation</title>
</titleInfo>
<!-- Authors of the paper itself. -->
<name type="personal">
<namePart type="given">Daria</namePart>
<namePart type="family">Sinitsyna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Konstantin</namePart>
<namePart type="family">Savenkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume containing this paper, with its editors,
     publisher, and place of publication. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 16th Conference of the Association for Machine Translation in the Americas (Volume 2: Presentations)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Martindale</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Janice</namePart>
<namePart type="family">Campbell</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Konstantin</namePart>
<namePart type="family">Savenkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shivali</namePart>
<namePart type="family">Goel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Machine Translation in the Americas</publisher>
<place>
<placeTerm type="text">Chicago, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Building on our GPT-4 LQA research in MT, this study identifies top LLMs for an LQA pipeline with up to three models. LLMs like GPT-4, GPT-4o, GPT-4 Turbo, Google Vertex, Anthropic’s Claude 3, and Llama-3 are prompted using MQM error typology. These models generate segment-wise outputs describing translation errors, scored by severity and DQF-MQM penalties. The study evaluates four language pairs: English-Spanish, English-Chinese, English-German, and English-Portuguese, using datasets from our 2024 State of MT Report across eight domains. LLM outputs are correlated with human judgments, ranking models by alignment with human assessments for penalty score, issue presence, type, and severity. This research proposes an LQA pipeline with up to three models, weighted by output quality, highlighting LLMs’ potential to enhance MT review processes and improve translation quality.</abstract>
<identifier type="citekey">sinitsyna-savenkov-2024-comparative</identifier>
<location>
<url>https://aclanthology.org/2024.amta-presentations.12</url>
</location>
<!-- Pagination and issue date within the host volume. -->
<part>
<date>2024-09</date>
<extent unit="page">
<start>154</start>
<end>183</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Comparative Evaluation of Large Language Models for Linguistic Quality Assessment in Machine Translation
%A Sinitsyna, Daria
%A Savenkov, Konstantin
%Y Martindale, Marianna
%Y Campbell, Janice
%Y Savenkov, Konstantin
%Y Goel, Shivali
%S Proceedings of the 16th Conference of the Association for Machine Translation in the Americas (Volume 2: Presentations)
%D 2024
%8 September
%I Association for Machine Translation in the Americas
%C Chicago, USA
%F sinitsyna-savenkov-2024-comparative
%X Building on our GPT-4 LQA research in MT, this study identifies top LLMs for an LQA pipeline with up to three models. LLMs like GPT-4, GPT-4o, GPT-4 Turbo, Google Vertex, Anthropic’s Claude 3, and Llama-3 are prompted using MQM error typology. These models generate segment-wise outputs describing translation errors, scored by severity and DQF-MQM penalties. The study evaluates four language pairs: English-Spanish, English-Chinese, English-German, and English-Portuguese, using datasets from our 2024 State of MT Report across eight domains. LLM outputs are correlated with human judgments, ranking models by alignment with human assessments for penalty score, issue presence, type, and severity. This research proposes an LQA pipeline with up to three models, weighted by output quality, highlighting LLMs’ potential to enhance MT review processes and improve translation quality.
%U https://aclanthology.org/2024.amta-presentations.12
%P 154-183
Markdown (Informal)
[Comparative Evaluation of Large Language Models for Linguistic Quality Assessment in Machine Translation](https://aclanthology.org/2024.amta-presentations.12) (Sinitsyna & Savenkov, AMTA 2024)
ACL