@inproceedings{umutlu-etal-2025-evaluating,
title = "Evaluating the Quality of Benchmark Datasets for Low-Resource Languages: A Case Study on {T}urkish",
author = "Umutlu, Elif Ecem and
Cengiz, Ayse Aysu and
Sever, Ahmet Kaan and
Erdem, Seyma and
Aytan, Burak and
Tufan, Busra and
Topraksoy, Abdullah and
Dar{\i}c{\i}, Esra and
Toraman, Cagri",
editor = "Arviv, Ofir and
Clinciu, Miruna and
Dhole, Kaustubh and
Dror, Rotem and
Gehrmann, Sebastian and
Habba, Eliya and
Itzhak, Itay and
Mille, Simon and
Perlitz, Yotam and
Santus, Enrico and
Sedoc, Jo{\~a}o and
Shmueli Scheuer, Michal and
Stanovsky, Gabriel and
Tafjord, Oyvind",
booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
month = jul,
year = "2025",
address = "Vienna, Austria and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.gem-1.41/",
pages = "471--487",
ISBN = "979-8-89176-261-9",
abstract = "The reliance on translated or adapted datasets from English or multilingual resources introduces challenges regarding linguistic and cultural suitability. This study addresses the need for robust and culturally appropriate benchmarks by evaluating the quality of 17 commonly used Turkish benchmark datasets. Using a comprehensive framework that assesses six criteria, both human and LLM-judge annotators provide detailed evaluations to identify dataset strengths and shortcomings.Our results reveal that 70{\%} of the benchmark datasets fail to meet our heuristic quality standards. The correctness of the usage of technical terms is the strongest criterion, but 85{\%} of the criteria are not satisfied in the examined datasets. Although LLM judges demonstrate potential, they are less effective than human annotators, particularly in understanding cultural common sense knowledge and interpreting fluent, unambiguous text. GPT-4o has stronger labeling capabilities for grammatical and technical tasks, while Llama3.3-70B excels at correctness and cultural knowledge evaluation. Our findings emphasize the urgent need for more rigorous quality control in creating and adapting datasets for low-resource languages."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="umutlu-etal-2025-evaluating">
<titleInfo>
<title>Evaluating the Quality of Benchmark Datasets for Low-Resource Languages: A Case Study on Turkish</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elif</namePart>
<namePart type="given">Ecem</namePart>
<namePart type="family">Umutlu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ayse</namePart>
<namePart type="given">Aysu</namePart>
<namePart type="family">Cengiz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ahmet</namePart>
<namePart type="given">Kaan</namePart>
<namePart type="family">Sever</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seyma</namePart>
<namePart type="family">Erdem</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Burak</namePart>
<namePart type="family">Aytan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Busra</namePart>
<namePart type="family">Tufan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdullah</namePart>
<namePart type="family">Topraksoy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Esra</namePart>
<namePart type="family">Darıcı</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cagri</namePart>
<namePart type="family">Toraman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ofir</namePart>
<namePart type="family">Arviv</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Miruna</namePart>
<namePart type="family">Clinciu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kaustubh</namePart>
<namePart type="family">Dhole</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eliya</namePart>
<namePart type="family">Habba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Itay</namePart>
<namePart type="family">Itzhak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yotam</namePart>
<namePart type="family">Perlitz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">João</namePart>
<namePart type="family">Sedoc</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Shmueli Scheuer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oyvind</namePart>
<namePart type="family">Tafjord</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria and virtual meeting</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-261-9</identifier>
</relatedItem>
<abstract>The reliance on translated or adapted datasets from English or multilingual resources introduces challenges regarding linguistic and cultural suitability. This study addresses the need for robust and culturally appropriate benchmarks by evaluating the quality of 17 commonly used Turkish benchmark datasets. Using a comprehensive framework that assesses six criteria, both human and LLM-judge annotators provide detailed evaluations to identify dataset strengths and shortcomings.Our results reveal that 70% of the benchmark datasets fail to meet our heuristic quality standards. The correctness of the usage of technical terms is the strongest criterion, but 85% of the criteria are not satisfied in the examined datasets. Although LLM judges demonstrate potential, they are less effective than human annotators, particularly in understanding cultural common sense knowledge and interpreting fluent, unambiguous text. GPT-4o has stronger labeling capabilities for grammatical and technical tasks, while Llama3.3-70B excels at correctness and cultural knowledge evaluation. Our findings emphasize the urgent need for more rigorous quality control in creating and adapting datasets for low-resource languages.</abstract>
<identifier type="citekey">umutlu-etal-2025-evaluating</identifier>
<location>
<url>https://aclanthology.org/2025.gem-1.41/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>471</start>
<end>487</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Evaluating the Quality of Benchmark Datasets for Low-Resource Languages: A Case Study on Turkish
%A Umutlu, Elif Ecem
%A Cengiz, Ayse Aysu
%A Sever, Ahmet Kaan
%A Erdem, Seyma
%A Aytan, Burak
%A Tufan, Busra
%A Topraksoy, Abdullah
%A Darıcı, Esra
%A Toraman, Cagri
%Y Arviv, Ofir
%Y Clinciu, Miruna
%Y Dhole, Kaustubh
%Y Dror, Rotem
%Y Gehrmann, Sebastian
%Y Habba, Eliya
%Y Itzhak, Itay
%Y Mille, Simon
%Y Perlitz, Yotam
%Y Santus, Enrico
%Y Sedoc, João
%Y Shmueli Scheuer, Michal
%Y Stanovsky, Gabriel
%Y Tafjord, Oyvind
%S Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria and virtual meeting
%@ 979-8-89176-261-9
%F umutlu-etal-2025-evaluating
%X The reliance on translated or adapted datasets from English or multilingual resources introduces challenges regarding linguistic and cultural suitability. This study addresses the need for robust and culturally appropriate benchmarks by evaluating the quality of 17 commonly used Turkish benchmark datasets. Using a comprehensive framework that assesses six criteria, both human and LLM-judge annotators provide detailed evaluations to identify dataset strengths and shortcomings.Our results reveal that 70% of the benchmark datasets fail to meet our heuristic quality standards. The correctness of the usage of technical terms is the strongest criterion, but 85% of the criteria are not satisfied in the examined datasets. Although LLM judges demonstrate potential, they are less effective than human annotators, particularly in understanding cultural common sense knowledge and interpreting fluent, unambiguous text. GPT-4o has stronger labeling capabilities for grammatical and technical tasks, while Llama3.3-70B excels at correctness and cultural knowledge evaluation. Our findings emphasize the urgent need for more rigorous quality control in creating and adapting datasets for low-resource languages.
%U https://aclanthology.org/2025.gem-1.41/
%P 471-487
Markdown (Informal)
[Evaluating the Quality of Benchmark Datasets for Low-Resource Languages: A Case Study on Turkish](https://aclanthology.org/2025.gem-1.41/) (Umutlu et al., GEM 2025)
ACL
- Elif Ecem Umutlu, Ayse Aysu Cengiz, Ahmet Kaan Sever, Seyma Erdem, Burak Aytan, Busra Tufan, Abdullah Topraksoy, Esra Darıcı, and Cagri Toraman. 2025. Evaluating the Quality of Benchmark Datasets for Low-Resource Languages: A Case Study on Turkish. In Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM²), pages 471–487, Vienna, Austria and virtual meeting. Association for Computational Linguistics.