@inproceedings{basar-bisazza-2026-morphology,
title = "A Morphology-Aware Evaluation of {T}urkish Syntax in Large Language Models",
author = "Ba{\c{s}}ar, Ezgi and
Bisazza, Arianna",
editor = {Oflazer, Kemal and
K{\"o}ksal, Abdullatif and
Varol, Onur},
booktitle = "Proceedings of the Second Workshop on Natural Language Processing for {T}urkic Languages ({SIGTURK} 2026)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.sigturk-1.9/",
pages = "95--102",
ISBN = "979-8-89176-370-8",
abstract = "Minimal pair benchmarks have become a common approach for evaluating the syntactic knowledge of language models (LMs). However, the creation of such benchmarks often overlooks language-specific confounders that may affect model performance, particularly in the case of morphologically rich languages. In this paper, we investigate how surface-level factors such as morpheme count, subword count, and sentence length influence the performance of LMs on a Turkish benchmark of linguistic minimal pairs. We further analyze whether a tokenizer{'}s degree of alignment with morphological boundaries can serve as a proxy for model performance. Finally, we test whether the distribution of morphemes in a minimal pair benchmark can skew model performance. Our results show that while surface factors have limited predictive power, they might still serve as a systematic source of bias. Moreover, we find that morphological alignment can roughly correspond to model performance, and morpheme-level imbalances in the benchmark may have a significant influence on results."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="basar-bisazza-2026-morphology">
<titleInfo>
<title>A Morphology-Aware Evaluation of Turkish Syntax in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ezgi</namePart>
<namePart type="family">Başar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arianna</namePart>
<namePart type="family">Bisazza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kemal</namePart>
<namePart type="family">Oflazer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Abdullatif</namePart>
<namePart type="family">Köksal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Onur</namePart>
<namePart type="family">Varol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-370-8</identifier>
</relatedItem>
<abstract>Minimal pair benchmarks have become a common approach for evaluating the syntactic knowledge of language models (LMs). However, the creation of such benchmarks often overlooks language-specific confounders that may affect model performance, particularly in the case of morphologically rich languages. In this paper, we investigate how surface-level factors such as morpheme count, subword count, and sentence length influence the performance of LMs on a Turkish benchmark of linguistic minimal pairs. We further analyze whether a tokenizer’s degree of alignment with morphological boundaries can serve as a proxy for model performance. Finally, we test whether the distribution of morphemes in a minimal pair benchmark can skew model performance. Our results show that while surface factors have limited predictive power, they might still serve as a systematic source of bias. Moreover, we find that morphological alignment can roughly correspond to model performance, and morpheme-level imbalances in the benchmark may have a significant influence on results.</abstract>
<identifier type="citekey">basar-bisazza-2026-morphology</identifier>
<location>
<url>https://aclanthology.org/2026.sigturk-1.9/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>95</start>
<end>102</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Morphology-Aware Evaluation of Turkish Syntax in Large Language Models
%A Başar, Ezgi
%A Bisazza, Arianna
%Y Oflazer, Kemal
%Y Köksal, Abdullatif
%Y Varol, Onur
%S Proceedings of the Second Workshop on Natural Language Processing for Turkic Languages (SIGTURK 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-370-8
%F basar-bisazza-2026-morphology
%X Minimal pair benchmarks have become a common approach for evaluating the syntactic knowledge of language models (LMs). However, the creation of such benchmarks often overlooks language-specific confounders that may affect model performance, particularly in the case of morphologically rich languages. In this paper, we investigate how surface-level factors such as morpheme count, subword count, and sentence length influence the performance of LMs on a Turkish benchmark of linguistic minimal pairs. We further analyze whether a tokenizer’s degree of alignment with morphological boundaries can serve as a proxy for model performance. Finally, we test whether the distribution of morphemes in a minimal pair benchmark can skew model performance. Our results show that while surface factors have limited predictive power, they might still serve as a systematic source of bias. Moreover, we find that morphological alignment can roughly correspond to model performance, and morpheme-level imbalances in the benchmark may have a significant influence on results.
%U https://aclanthology.org/2026.sigturk-1.9/
%P 95-102
Markdown (Informal)
[A Morphology-Aware Evaluation of Turkish Syntax in Large Language Models](https://aclanthology.org/2026.sigturk-1.9/) (Başar & Bisazza, SIGTURK 2026)
ACL