@inproceedings{de-vroe-etal-2025-comparing,
title = "Comparing Human and Machine Translations of Generative Language Model Evaluation Datasets",
author = "de Vroe, Sander Bijl and
Stampoulidis, George and
Hakala, Kai and
Rouhe, Aku and
van Heeswijk, Mark and
Karlgren, Jussi",
editor = "Johansson, Richard and
Stymne, Sara",
booktitle = "Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)",
month = mar,
year = "2025",
address = "Tallinn, Estonia",
publisher = "University of Tartu Library",
url = "https://aclanthology.org/2025.nodalida-1.9/",
pages = "80--85",
ISBN = "978-9908-53-109-0",
abstract = "The evaluation of Large Language Models (LLMs) is one of the crucial current challenges in the field of Natural Language Processing (NLP) and becomes even more challenging in the multilingual setting. Since the majority of the community`s benchmarks exist only in English, test sets are now being machine translated at scale into dozens of languages. This work explores the feasibility of that approach, comparing a Finnish machine translation (MT) of ARC-Challenge with a new human translated version. Our findings suggest that since absolute scores are fairly close and model size rankings are preserved, machine translation is adequate in this case. Surprisingly, however, the datasets reverse the order of base models compared to their chat-finetuned counterparts."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="de-vroe-etal-2025-comparing">
<titleInfo>
<title>Comparing Human and Machine Translations of Generative Language Model Evaluation Datasets</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sander</namePart>
<namePart type="given">Bijl</namePart>
<namePart type="family">de Vroe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">George</namePart>
<namePart type="family">Stampoulidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kai</namePart>
<namePart type="family">Hakala</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aku</namePart>
<namePart type="family">Rouhe</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">van Heeswijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jussi</namePart>
<namePart type="family">Karlgren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Richard</namePart>
<namePart type="family">Johansson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Stymne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library</publisher>
<place>
<placeTerm type="text">Tallinn, Estonia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-9908-53-109-0</identifier>
</relatedItem>
<abstract>The evaluation of Large Language Models (LLMs) is one of the crucial current challenges in the field of Natural Language Processing (NLP) and becomes even more challenging in the multilingual setting. Since the majority of the community‘s benchmarks exist only in English, test sets are now being machine translated at scale into dozens of languages. This work explores the feasibility of that approach, comparing a Finnish machine translation (MT) of ARC-Challenge with a new human translated version. Our findings suggest that since absolute scores are fairly close and model size rankings are preserved, machine translation is adequate in this case. Surprisingly, however, the datasets reverse the order of base models compared to their chat-finetuned counterparts.</abstract>
<identifier type="citekey">de-vroe-etal-2025-comparing</identifier>
<location>
<url>https://aclanthology.org/2025.nodalida-1.9/</url>
</location>
<part>
<date>2025-03</date>
<extent unit="page">
<start>80</start>
<end>85</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Comparing Human and Machine Translations of Generative Language Model Evaluation Datasets
%A de Vroe, Sander Bijl
%A Stampoulidis, George
%A Hakala, Kai
%A Rouhe, Aku
%A van Heeswijk, Mark
%A Karlgren, Jussi
%Y Johansson, Richard
%Y Stymne, Sara
%S Proceedings of the Joint 25th Nordic Conference on Computational Linguistics and 11th Baltic Conference on Human Language Technologies (NoDaLiDa/Baltic-HLT 2025)
%D 2025
%8 March
%I University of Tartu Library
%C Tallinn, Estonia
%@ 978-9908-53-109-0
%F de-vroe-etal-2025-comparing
%X The evaluation of Large Language Models (LLMs) is one of the crucial current challenges in the field of Natural Language Processing (NLP) and becomes even more challenging in the multilingual setting. Since the majority of the community‘s benchmarks exist only in English, test sets are now being machine translated at scale into dozens of languages. This work explores the feasibility of that approach, comparing a Finnish machine translation (MT) of ARC-Challenge with a new human translated version. Our findings suggest that since absolute scores are fairly close and model size rankings are preserved, machine translation is adequate in this case. Surprisingly, however, the datasets reverse the order of base models compared to their chat-finetuned counterparts.
%U https://aclanthology.org/2025.nodalida-1.9/
%P 80-85
Markdown (Informal)
[Comparing Human and Machine Translations of Generative Language Model Evaluation Datasets](https://aclanthology.org/2025.nodalida-1.9/) (de Vroe et al., NoDaLiDa 2025)
ACL