% Conference paper in the UNLP 2026 proceedings (ACL Anthology record 2026.unlp-1.8).
% The abstract uses real math mode for the minus sign ($-$); escaped dollars would print literally.
@inproceedings{chaplynskyi-etal-2026-semantic,
    title     = {Semantic Fidelity Versus Literary Quality: A Construct Validity Study of Neural Machine Translation Metrics},
    author    = {Chaplynskyi, Dmytro and
                 Kulynych, Ivan and
                 Shvedova, Maria and
                 Ivashkevych, Lesia},
    editor    = {Romanyshyn, Mariana},
    booktitle = {Proceedings of the Fifth {U}krainian Natural Language Processing Conference ({UNLP} 2026)},
    month     = may,
    year      = {2026},
    address   = {Lviv, Ukraine},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.unlp-1.8/},
    pages     = {67--79},
    isbn      = {979-8-89176-359-3},
    abstract  = {Automatic machine translation metrics are the de facto standard for evaluating translation quality. Yet, it remains unclear what they actually measure. We investigate this question using a unique multilingual corpus: seven human Ukrainian translations of George Orwell{'}s Animal Farm, alongside three architecturally distinct AI systems (GPT-5.2, DeepL, and Lapa, a Ukrainian-tuned LLM). Across seven neural metrics, four reference-free and three reference-based, all three AI translations rank at the top. However, stylometric analysis exposes that these same AI translations are not as lexically rich as human ones ($-$18{\%} MTLD), underuse Ukrainian particles (up to 2x fewer) and diminutive morphology (2.6x fewer), and converge on near-identical outputs (LaBSE pairwise similarity 0.941 vs. 0.711 for human pairs). A controlled LLM-as-a-judge experiment demonstrates a clear preference reversal: when the English source is visible, AI ranks first; when it is hidden and the judge evaluates literary quality alone, humans rise to the top and AI falls to the lower ranks. Human evaluation (1,034 pairwise judgments) is balanced across both patterns. We argue that current MT metrics reward semantic fidelity and surface fluency {---} properties optimized by AI systems {---} while failing to capture the lexical richness, cultural adaptation, and stylistic voice that characterize skilled literary translation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for a single conference paper; the host relatedItem below describes the proceedings volume it appears in. -->
  <mods ID="chaplynskyi-etal-2026-semantic">
    <titleInfo>
      <title>Semantic Fidelity Versus Literary Quality: A Construct Validity Study of Neural Machine Translation Metrics</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Dmytro</namePart>
      <namePart type="family">Chaplynskyi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ivan</namePart>
      <namePart type="family">Kulynych</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Maria</namePart>
      <namePart type="family">Shvedova</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lesia</namePart>
      <namePart type="family">Ivashkevych</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editor, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifth Ukrainian Natural Language Processing Conference (UNLP 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Mariana</namePart>
        <namePart type="family">Romanyshyn</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Lviv, Ukraine</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-359-3</identifier>
    </relatedItem>
    <!-- Plain-text abstract: the minus sign is a Unicode character (U+2212), not LaTeX math markup. -->
    <abstract>Automatic machine translation metrics are the de facto standard for evaluating translation quality. Yet, it remains unclear what they actually measure. We investigate this question using a unique multilingual corpus: seven human Ukrainian translations of George Orwell’s Animal Farm, alongside three architecturally distinct AI systems (GPT-5.2, DeepL, and Lapa, a Ukrainian-tuned LLM). Across seven neural metrics, four reference-free and three reference-based, all three AI translations rank at the top. However, stylometric analysis exposes that these same AI translations are not as lexically rich as human ones (−18% MTLD), underuse Ukrainian particles (up to 2x fewer) and diminutive morphology (2.6x fewer), and converge on near-identical outputs (LaBSE pairwise similarity 0.941 vs. 0.711 for human pairs). A controlled LLM-as-a-judge experiment demonstrates a clear preference reversal: when the English source is visible, AI ranks first; when it is hidden and the judge evaluates literary quality alone, humans rise to the top and AI falls to the lower ranks. Human evaluation (1,034 pairwise judgments) is balanced across both patterns. We argue that current MT metrics reward semantic fidelity and surface fluency — properties optimized by AI systems — while failing to capture the lexical richness, cultural adaptation, and stylistic voice that characterize skilled literary translation.</abstract>
    <identifier type="citekey">chaplynskyi-etal-2026-semantic</identifier>
    <location>
      <url>https://aclanthology.org/2026.unlp-1.8/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-05</date>
      <extent unit="page">
        <start>67</start>
        <end>79</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Semantic Fidelity Versus Literary Quality: A Construct Validity Study of Neural Machine Translation Metrics
%A Chaplynskyi, Dmytro
%A Kulynych, Ivan
%A Shvedova, Maria
%A Ivashkevych, Lesia
%Y Romanyshyn, Mariana
%S Proceedings of the Fifth Ukrainian Natural Language Processing Conference (UNLP 2026)
%D 2026
%8 May
%I Association for Computational Linguistics
%C Lviv, Ukraine
%@ 979-8-89176-359-3
%F chaplynskyi-etal-2026-semantic
%X Automatic machine translation metrics are the de facto standard for evaluating translation quality. Yet, it remains unclear what they actually measure. We investigate this question using a unique multilingual corpus: seven human Ukrainian translations of George Orwell’s Animal Farm, alongside three architecturally distinct AI systems (GPT-5.2, DeepL, and Lapa, a Ukrainian-tuned LLM). Across seven neural metrics, four reference-free and three reference-based, all three AI translations rank at the top. However, stylometric analysis exposes that these same AI translations are not as lexically rich as human ones (−18% MTLD), underuse Ukrainian particles (up to 2x fewer) and diminutive morphology (2.6x fewer), and converge on near-identical outputs (LaBSE pairwise similarity 0.941 vs. 0.711 for human pairs). A controlled LLM-as-a-judge experiment demonstrates a clear preference reversal: when the English source is visible, AI ranks first; when it is hidden and the judge evaluates literary quality alone, humans rise to the top and AI falls to the lower ranks. Human evaluation (1,034 pairwise judgments) is balanced across both patterns. We argue that current MT metrics reward semantic fidelity and surface fluency — properties optimized by AI systems — while failing to capture the lexical richness, cultural adaptation, and stylistic voice that characterize skilled literary translation.
%U https://aclanthology.org/2026.unlp-1.8/
%P 67-79
Markdown (Informal)
[Semantic Fidelity Versus Literary Quality: A Construct Validity Study of Neural Machine Translation Metrics](https://aclanthology.org/2026.unlp-1.8/) (Chaplynskyi et al., UNLP 2026)
ACL