% Workshop paper in the SIGTURK 2026 proceedings (entry type: inproceedings).
% Proper nouns and acronyms are brace-protected as whole words so that
% sentence-casing styles keep their capitalisation.
% NOTE(review): `address` presumably records the workshop venue rather than the
% publisher's city -- confirm before moving it to a venue-specific field.
@inproceedings{samo-merlo-2026-modelling,
  title     = {Modelling the Morphology of Verbal Paradigms: A Case Study in the Tokenization of {Turkish} and {Hebrew}},
  author    = {Samo, Giuseppe and Merlo, Paola},
  editor    = {Oflazer, Kemal and K{\"o}ksal, Abdullatif and Varol, Onur},
  booktitle = {Proceedings of the Second Workshop Natural Language Processing for {Turkic} Languages ({SIGTURK} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.sigturk-1.8/},
  pages     = {82--94},
  isbn      = {979-8-89176-370-8},
  abstract  = {In this paper, we investigate how transformer models represent complex verb paradigms in Turkish and Modern Hebrew, focusing on how tokenization strategies shape this ability. Using the Blackbird Language Matrices task on natural data, we show that for Turkish{---}with its transparent morphological markers{---}both monolingual and multilingual models succeed either when tokenization is highly atomic or breaking words into small subword units. For Hebrew, however, a multilingual model using character-level tokenization fails to capture its non-concatenative morphology, while a monolingual model with unified morpheme-aware segmentation excels. Performance improves on more synthetic datasets, in all models.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier below. -->
  <mods ID="samo-merlo-2026-modelling">
    <titleInfo>
      <title>Modelling the Morphology of Verbal Paradigms: A Case Study in the Tokenization of Turkish and Hebrew</title>
    </titleInfo>
    <!-- Authors of the paper, in listed order. -->
    <name type="personal">
      <namePart type="given">Giuseppe</namePart>
      <namePart type="family">Samo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Paola</namePart>
      <namePart type="family">Merlo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Second Workshop Natural Language Processing for Turkic Languages (SIGTURK 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Kemal</namePart>
        <namePart type="family">Oflazer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Abdullatif</namePart>
        <namePart type="family">Köksal</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Onur</namePart>
        <namePart type="family">Varol</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Rabat, Morocco</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-370-8</identifier>
    </relatedItem>
    <abstract>In this paper, we investigate how transformer models represent complex verb paradigms in Turkish and Modern Hebrew, focusing on how tokenization strategies shape this ability. Using the Blackbird Language Matrices task on natural data, we show that for Turkish—with its transparent morphological markers—both monolingual and multilingual models succeed either when tokenization is highly atomic or breaking words into small subword units. For Hebrew, however, a multilingual model using character-level tokenization fails to capture its non-concatenative morphology, while a monolingual model with unified morpheme-aware segmentation excels. Performance improves on more synthetic datasets, in all models.</abstract>
    <identifier type="citekey">samo-merlo-2026-modelling</identifier>
    <location>
      <url>https://aclanthology.org/2026.sigturk-1.8/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-03</date>
      <extent unit="page">
        <start>82</start>
        <end>94</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Modelling the Morphology of Verbal Paradigms: A Case Study in the Tokenization of Turkish and Hebrew
%A Samo, Giuseppe
%A Merlo, Paola
%Y Oflazer, Kemal
%Y Köksal, Abdullatif
%Y Varol, Onur
%S Proceedings of the Second Workshop Natural Language Processing for Turkic Languages (SIGTURK 2026)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-370-8
%F samo-merlo-2026-modelling
%X In this paper, we investigate how transformer models represent complex verb paradigms in Turkish and Modern Hebrew, focusing on how tokenization strategies shape this ability. Using the Blackbird Language Matrices task on natural data, we show that for Turkish—with its transparent morphological markers—both monolingual and multilingual models succeed either when tokenization is highly atomic or breaking words into small subword units. For Hebrew, however, a multilingual model using character-level tokenization fails to capture its non-concatenative morphology, while a monolingual model with unified morpheme-aware segmentation excels. Performance improves on more synthetic datasets, in all models.
%U https://aclanthology.org/2026.sigturk-1.8/
%P 82-94
Markdown (Informal)
[Modelling the Morphology of Verbal Paradigms: A Case Study in the Tokenization of Turkish and Hebrew](https://aclanthology.org/2026.sigturk-1.8/) (Samo & Merlo, SIGTURK 2026)
ACL