% Conference-paper record (UNLP 2026 proceedings); canonical landing page is in the url field.
% Proper nouns and acronyms in title/booktitle are protected with whole-word braces.
@inproceedings{chaplynskyi-etal-2026-professional,
  title     = {Professional Translators Versus Quality Estimation Models: Reliability and Agreement in {English}-{Ukrainian} Translation Evaluation},
  author    = {Chaplynskyi, Dmytro and
               Zakharov, Kyrylo and
               Ivashkevych, Lesia},
  editor    = {Romanyshyn, Mariana},
  booktitle = {Proceedings of the Fifth {Ukrainian} Natural Language Processing Conference ({UNLP} 2026)},
  month     = may,
  year      = {2026},
  address   = {Lviv, Ukraine},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.unlp-1.10/},
  pages     = {97--107},
  isbn      = {979-8-89176-359-3},
  abstract  = {We extend a prior study comparing automatic Quality Estimation (QE) models with crowdsourced student judgments for English{--}Ukrainian parallel corpus evaluation. Eight professional translators each rate 1,000 sentence pairs on a continuous 0{--}100 scale under one of two paradigms: holistic quality scoring or a two-stage fluency-plus-adequacy protocol, with a repeated task for test{--}retest reliability. Professionals using the holistic scale achieve significantly higher inter-rater reliability than both linguistics students and professionals using separate fluency and adequacy scales, contradicting the expectation that multidimensional evaluation improves agreement. Adequacy correlates strongly with holistic judgments while fluency emerges as a largely independent dimension. Experts also exhibit a significant leniency drift over the session, alongside increasing evaluation speed. We additionally evaluate three LLMs as translation quality judges (Gemini 3 Flash, GPT-5.4, Gemma 3 27B) and find that the two larger models modestly outperform dedicated QE models in correlation with expert scores (r = 0.814{--}0.821 vs. r {\ensuremath{\leq}} 0.747). When prompted for separate fluency and adequacy scores, the LLMs replicate the adequacy-dominance pattern, confirming that meaning preservation drives holistic quality perception across both human and machine judges.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chaplynskyi-etal-2026-professional">
<titleInfo>
<title>Professional Translators Versus Quality Estimation Models: Reliability and Agreement in English-Ukrainian Translation Evaluation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dmytro</namePart>
<namePart type="family">Chaplynskyi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyrylo</namePart>
<namePart type="family">Zakharov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lesia</namePart>
<namePart type="family">Ivashkevych</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Ukrainian Natural Language Processing Conference (UNLP 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mariana</namePart>
<namePart type="family">Romanyshyn</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Lviv, Ukraine</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-359-3</identifier>
</relatedItem>
<abstract>We extend a prior study comparing automatic Quality Estimation (QE) models with crowdsourced student judgments for English–Ukrainian parallel corpus evaluation. Eight professional translators each rate 1,000 sentence pairs on a continuous 0–100 scale under one of two paradigms: holistic quality scoring or a two-stage fluency-plus-adequacy protocol, with a repeated task for test–retest reliability. Professionals using the holistic scale achieve significantly higher inter-rater reliability than both linguistics students and professionals using separate fluency and adequacy scales, contradicting the expectation that multidimensional evaluation improves agreement. Adequacy correlates strongly with holistic judgments while fluency emerges as a largely independent dimension. Experts also exhibit a significant leniency drift over the session, alongside increasing evaluation speed. We additionally evaluate three LLMs as translation quality judges (Gemini 3 Flash, GPT-5.4, Gemma 3 27B) and find that the two larger models modestly outperform dedicated QE models in correlation with expert scores (r = 0.814–0.821 vs. r ≤ 0.747). When prompted for separate fluency and adequacy scores, the LLMs replicate the adequacy-dominance pattern, confirming that meaning preservation drives holistic quality perception across both human and machine judges.</abstract>
<identifier type="citekey">chaplynskyi-etal-2026-professional</identifier>
<location>
<url>https://aclanthology.org/2026.unlp-1.10/</url>
</location>
<part>
<date>2026-05</date>
<extent unit="page">
<start>97</start>
<end>107</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Professional Translators Versus Quality Estimation Models: Reliability and Agreement in English-Ukrainian Translation Evaluation
%A Chaplynskyi, Dmytro
%A Zakharov, Kyrylo
%A Ivashkevych, Lesia
%Y Romanyshyn, Mariana
%S Proceedings of the Fifth Ukrainian Natural Language Processing Conference (UNLP 2026)
%D 2026
%8 May
%I Association for Computational Linguistics
%C Lviv, Ukraine
%@ 979-8-89176-359-3
%F chaplynskyi-etal-2026-professional
%X We extend a prior study comparing automatic Quality Estimation (QE) models with crowdsourced student judgments for English–Ukrainian parallel corpus evaluation. Eight professional translators each rate 1,000 sentence pairs on a continuous 0–100 scale under one of two paradigms: holistic quality scoring or a two-stage fluency-plus-adequacy protocol, with a repeated task for test–retest reliability. Professionals using the holistic scale achieve significantly higher inter-rater reliability than both linguistics students and professionals using separate fluency and adequacy scales, contradicting the expectation that multidimensional evaluation improves agreement. Adequacy correlates strongly with holistic judgments while fluency emerges as a largely independent dimension. Experts also exhibit a significant leniency drift over the session, alongside increasing evaluation speed. We additionally evaluate three LLMs as translation quality judges (Gemini 3 Flash, GPT-5.4, Gemma 3 27B) and find that the two larger models modestly outperform dedicated QE models in correlation with expert scores (r = 0.814–0.821 vs. r ≤ 0.747). When prompted for separate fluency and adequacy scores, the LLMs replicate the adequacy-dominance pattern, confirming that meaning preservation drives holistic quality perception across both human and machine judges.
%U https://aclanthology.org/2026.unlp-1.10/
%P 97-107
Markdown (Informal)
[Professional Translators Versus Quality Estimation Models: Reliability and Agreement in English-Ukrainian Translation Evaluation](https://aclanthology.org/2026.unlp-1.10/) (Chaplynskyi et al., UNLP 2026)
ACL