@comment{ACL Anthology record for a paper in the SCiL 2026 proceedings.
  NOTE(review): the address field is kept exactly as exported; confirm whether
  it names the publisher's city or the conference venue before relying on it.}
@inproceedings{baidildinova-etal-2026-quantifying,
  title     = {Quantifying mutual intelligibility gradients in {Turkic} languages using language models},
  author    = {Baidildinova, Moldir and
               Upadhye, Shiva and
               Wagner, Austin and
               Mayer, Connor and
               Futrell, Richard},
  editor    = {Voigt, Rob and
               Warstadt, Alex and
               Feldman, Naomi and
               Linzen, Tal},
  booktitle = {Proceedings of the Society for Computation in Linguistics 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.scil-main.41/},
  pages     = {442--446},
  isbn      = {979-8-89176-412-5},
  abstract  = {Mutual intelligibility (MI) among related languages is a gradient phenomenon shaped by lexical, grammatical, and phonetic-phonological similarity. This study proposes a neural language modeling approach to quantifying MI patterns within the Turkic language family. Using IPA-transcribed naturalistic text from six Turkic languages, we train character-level LSTM models on a source language and fine-tune them on target languages that vary in genealogical distance. Cross-lingual transfer is evaluated using character-level cross-entropy (CE) loss, Area Under the Curve (AUC), and Rate of Change (ROC), which together capture model generalization, learning dynamics, and early-stage adaptation. We further examine whether model performance is predicted by cophenetic distance, lexical similarity, weighted trigram frequency overlap, and differences in vowel harmony index. Overall, the results suggest that character-level language models can approximate MI gradients across Turkic languages: closely related pairs generally show lower CE loss and smaller AUC, while more distant pairs show greater early-stage change. Lexical similarity, local phonotactic overlap, and genealogical distance appear to be the most informative predictors of model convergence. These findings provide preliminary evidence that neural language models trained on naturalistic text can offer a scalable way to model MI patterns, including directional asymmetries, across closely related languages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS v3 record for citekey baidildinova-etal-2026-quantifying.
       Top-level name elements are the paper's authors; the host relatedItem
       describes the proceedings volume (title, editors, publisher, place, ISBN). -->
  <mods ID="baidildinova-etal-2026-quantifying">
    <titleInfo>
      <title>Quantifying mutual intelligibility gradients in Turkic languages using language models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Moldir</namePart>
      <namePart type="family">Baidildinova</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shiva</namePart>
      <namePart type="family">Upadhye</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Austin</namePart>
      <namePart type="family">Wagner</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Connor</namePart>
      <namePart type="family">Mayer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Richard</namePart>
      <namePart type="family">Futrell</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Society for Computation in Linguistics 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Rob</namePart>
        <namePart type="family">Voigt</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Alex</namePart>
        <namePart type="family">Warstadt</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Naomi</namePart>
        <namePart type="family">Feldman</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tal</namePart>
        <namePart type="family">Linzen</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, CA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-412-5</identifier>
    </relatedItem>
    <abstract>Mutual intelligibility (MI) among related languages is a gradient phenomenon shaped by lexical, grammatical, and phonetic-phonological similarity. This study proposes a neural language modeling approach to quantifying MI patterns within the Turkic language family. Using IPA-transcribed naturalistic text from six Turkic languages, we train character-level LSTM models on a source language and fine-tune them on target languages that vary in genealogical distance. Cross-lingual transfer is evaluated using character-level cross-entropy (CE) loss, Area Under the Curve (AUC), and Rate of Change (ROC), which together capture model generalization, learning dynamics, and early-stage adaptation. We further examine whether model performance is predicted by cophenetic distance, lexical similarity, weighted trigram frequency overlap, and differences in vowel harmony index. Overall, the results suggest that character-level language models can approximate MI gradients across Turkic languages: closely related pairs generally show lower CE loss and smaller AUC, while more distant pairs show greater early-stage change. Lexical similarity, local phonotactic overlap, and genealogical distance appear to be the most informative predictors of model convergence. These findings provide preliminary evidence that neural language models trained on naturalistic text can offer a scalable way to model MI patterns, including directional asymmetries, across closely related languages.</abstract>
    <identifier type="citekey">baidildinova-etal-2026-quantifying</identifier>
    <location>
      <url>https://aclanthology.org/2026.scil-main.41/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>442</start>
        <end>446</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Quantifying mutual intelligibility gradients in Turkic languages using language models
%A Baidildinova, Moldir
%A Upadhye, Shiva
%A Wagner, Austin
%A Mayer, Connor
%A Futrell, Richard
%Y Voigt, Rob
%Y Warstadt, Alex
%Y Feldman, Naomi
%Y Linzen, Tal
%S Proceedings of the Society for Computation in Linguistics 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, CA
%@ 979-8-89176-412-5
%F baidildinova-etal-2026-quantifying
%X Mutual intelligibility (MI) among related languages is a gradient phenomenon shaped by lexical, grammatical, and phonetic-phonological similarity. This study proposes a neural language modeling approach to quantifying MI patterns within the Turkic language family. Using IPA-transcribed naturalistic text from six Turkic languages, we train character-level LSTM models on a source language and fine-tune them on target languages that vary in genealogical distance. Cross-lingual transfer is evaluated using character-level cross-entropy (CE) loss, Area Under the Curve (AUC), and Rate of Change (ROC), which together capture model generalization, learning dynamics, and early-stage adaptation. We further examine whether model performance is predicted by cophenetic distance, lexical similarity, weighted trigram frequency overlap, and differences in vowel harmony index. Overall, the results suggest that character-level language models can approximate MI gradients across Turkic languages: closely related pairs generally show lower CE loss and smaller AUC, while more distant pairs show greater early-stage change. Lexical similarity, local phonotactic overlap, and genealogical distance appear to be the most informative predictors of model convergence. These findings provide preliminary evidence that neural language models trained on naturalistic text can offer a scalable way to model MI patterns, including directional asymmetries, across closely related languages.
%U https://aclanthology.org/2026.scil-main.41/
%P 442-446
Markdown (Informal)
[Quantifying mutual intelligibility gradients in Turkic languages using language models](https://aclanthology.org/2026.scil-main.41/) (Baidildinova et al., SCiL 2026)
ACL