% BibTeX record for the TurkBench paper in the SIGTURK 2026 workshop proceedings.
% Case-sensitive words in title/booktitle are brace-protected as whole words so
% that sentence-casing bibliography styles keep their capitalisation.
@inproceedings{toraman-etal-2026-turkbench,
  title     = {{TurkBench}: A Benchmark for Evaluating {Turkish} Large Language Models},
  author    = {Toraman, Cagri and
               Sever, Ahmet Kaan and
               Cengiz, Ay{\c{s}}e Aysu and
               Arslan, Elif Ecem and
               Sevin{\c{c}}, G{\"o}rkem and
               Kantar, Sarp and
               Birdal, Mete Mert and
               G{\"u}ldemir, Yusuf Faruk and
               Kanburo{\u{g}}lu, Ali Bu{\u{g}}ra and
               Feleko{\u{g}}lu, Sezen and
               K{\"u}t{\"u}k, Birsen {\c{S}}ahin and
               Tufan, B{\"u}{\c{s}}ra and
               Gen{\c{c}}, Elif and
               Co{\c{s}}kun, Serkan and
               Demir, Gupse Ekin and
               Aray{\i}c{\i}, Muhammed Emin and
               Dursun, Olgun and
               Gungor, Onur and
               {\"U}sk{\"u}darl{\i}, Susan and
               Topraksoy, Abdullah and
               Dar{\i}c{\i}, Esra},
  editor    = {Oflazer, Kemal and
               K{\"o}ksal, Abdullatif and
               Varol, Onur},
  booktitle = {Proceedings of the Second Workshop Natural Language Processing for {Turkic} Languages ({SIGTURK} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.sigturk-1.12/},
  pages     = {126--154},
  isbn      = {979-8-89176-370-8},
  abstract  = {With the recent surge in the development of large language models, the need for comprehensive and language-specific evaluation benchmarks has become critical. While significant progress has been made in evaluating English-language models, benchmarks for other languages, particularly those with unique linguistic characteristics such as Turkish, remain less developed. Our study introduces TurkBench, a comprehensive benchmark designed to assess the capabilities of generative large language models in the Turkish language. TurkBench involves 8,151 data samples across 21 distinct subtasks. These are organized under six main categories of evaluation: Knowledge, Language Understanding, Reasoning, Content Moderation, Turkish Grammar and Vocabulary, and Instruction Following. The diverse range of tasks and the culturally relevant data would provide researchers and developers with a valuable tool for evaluating their models and identifying areas for improvement. We further publish our benchmark for online submissions at https://huggingface.co/turkbench}
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey toraman-etal-2026-turkbench: title, authors,
     host proceedings (editors, publisher, place, ISBN), abstract, URL, pages. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="toraman-etal-2026-turkbench">
    <titleInfo>
      <title>TurkBench: A Benchmark for Evaluating Turkish Large Language Models</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Cagri</namePart>
      <namePart type="family">Toraman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ahmet</namePart>
      <namePart type="given">Kaan</namePart>
      <namePart type="family">Sever</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ayşe</namePart>
      <namePart type="given">Aysu</namePart>
      <namePart type="family">Cengiz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Elif</namePart>
      <namePart type="given">Ecem</namePart>
      <namePart type="family">Arslan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Görkem</namePart>
      <namePart type="family">Sevinç</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sarp</namePart>
      <namePart type="family">Kantar</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mete</namePart>
      <namePart type="given">Mert</namePart>
      <namePart type="family">Birdal</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yusuf</namePart>
      <namePart type="given">Faruk</namePart>
      <namePart type="family">Güldemir</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ali</namePart>
      <namePart type="given">Buğra</namePart>
      <namePart type="family">Kanburoğlu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sezen</namePart>
      <namePart type="family">Felekoğlu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Birsen</namePart>
      <namePart type="given">Şahin</namePart>
      <namePart type="family">Kütük</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Büşra</namePart>
      <namePart type="family">Tufan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Elif</namePart>
      <namePart type="family">Genç</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Serkan</namePart>
      <namePart type="family">Coşkun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Gupse</namePart>
      <namePart type="given">Ekin</namePart>
      <namePart type="family">Demir</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Muhammed</namePart>
      <namePart type="given">Emin</namePart>
      <namePart type="family">Arayıcı</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Olgun</namePart>
      <namePart type="family">Dursun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Onur</namePart>
      <namePart type="family">Gungor</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Susan</namePart>
      <namePart type="family">Üsküdarlı</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Abdullah</namePart>
      <namePart type="family">Topraksoy</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Esra</namePart>
      <namePart type="family">Darıcı</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors and publication details -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Workshop Natural Language Processing for Turkic Languages (SIGTURK 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kemal</namePart>
        <namePart type="family">Oflazer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Abdullatif</namePart>
        <namePart type="family">Köksal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Onur</namePart>
        <namePart type="family">Varol</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-370-8</identifier>
    </relatedItem>
    <abstract>With the recent surge in the development of large language models, the need for comprehensive and language-specific evaluation benchmarks has become critical. While significant progress has been made in evaluating English-language models, benchmarks for other languages, particularly those with unique linguistic characteristics such as Turkish, remain less developed. Our study introduces TurkBench, a comprehensive benchmark designed to assess the capabilities of generative large language models in the Turkish language. TurkBench involves 8,151 data samples across 21 distinct subtasks. These are organized under six main categories of evaluation: Knowledge, Language Understanding, Reasoning, Content Moderation, Turkish Grammar and Vocabulary, and Instruction Following. The diverse range of tasks and the culturally relevant data would provide researchers and developers with a valuable tool for evaluating their models and identifying areas for improvement. We further publish our benchmark for online submissions at https://huggingface.co/turkbench</abstract>
    <identifier type="citekey">toraman-etal-2026-turkbench</identifier>
    <location>
      <url>https://aclanthology.org/2026.sigturk-1.12/</url>
    </location>
    <!-- Position of the paper within the host volume -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>126</start>
        <end>154</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T TurkBench: A Benchmark for Evaluating Turkish Large Language Models
%A Toraman, Cagri
%A Sever, Ahmet Kaan
%A Cengiz, Ayşe Aysu
%A Arslan, Elif Ecem
%A Sevinç, Görkem
%A Kantar, Sarp
%A Birdal, Mete Mert
%A Güldemir, Yusuf Faruk
%A Kanburoğlu, Ali Buğra
%A Felekoğlu, Sezen
%A Kütük, Birsen Şahin
%A Tufan, Büşra
%A Genç, Elif
%A Coşkun, Serkan
%A Demir, Gupse Ekin
%A Arayıcı, Muhammed Emin
%A Dursun, Olgun
%A Gungor, Onur
%A Üsküdarlı, Susan
%A Topraksoy, Abdullah
%A Darıcı, Esra
%Y Oflazer, Kemal
%Y Köksal, Abdullatif
%Y Varol, Onur
%S Proceedings of the Second Workshop Natural Language Processing for Turkic Languages (SIGTURK 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-370-8
%F toraman-etal-2026-turkbench
%X With the recent surge in the development of large language models, the need for comprehensive and language-specific evaluation benchmarks has become critical. While significant progress has been made in evaluating English-language models, benchmarks for other languages, particularly those with unique linguistic characteristics such as Turkish, remain less developed. Our study introduces TurkBench, a comprehensive benchmark designed to assess the capabilities of generative large language models in the Turkish language. TurkBench involves 8,151 data samples across 21 distinct subtasks. These are organized under six main categories of evaluation: Knowledge, Language Understanding, Reasoning, Content Moderation, Turkish Grammar and Vocabulary, and Instruction Following. The diverse range of tasks and the culturally relevant data would provide researchers and developers with a valuable tool for evaluating their models and identifying areas for improvement. We further publish our benchmark for online submissions at https://huggingface.co/turkbench
%U https://aclanthology.org/2026.sigturk-1.12/
%P 126-154
Markdown (Informal)
[TurkBench: A Benchmark for Evaluating Turkish Large Language Models](https://aclanthology.org/2026.sigturk-1.12/) (Toraman et al., SIGTURK 2026)
ACL
- Cagri Toraman, Ahmet Kaan Sever, Ayşe Aysu Cengiz, Elif Ecem Arslan, Görkem Sevinç, Sarp Kantar, Mete Mert Birdal, Yusuf Faruk Güldemir, Ali Buğra Kanburoğlu, Sezen Felekoğlu, Birsen Şahin Kütük, Büşra Tufan, Elif Genç, Serkan Coşkun, Gupse Ekin Demir, Muhammed Emin Arayıcı, Olgun Dursun, Onur Gungor, Susan Üsküdarlı, Abdullah Topraksoy, and Esra Darıcı. 2026. TurkBench: A Benchmark for Evaluating Turkish Large Language Models. In Proceedings of the Second Workshop Natural Language Processing for Turkic Languages (SIGTURK 2026), pages 126–154, Rabat, Morocco. Association for Computational Linguistics.