% COLING 2025 main-conference paper, ACL Anthology record 2025.coling-main.423.
% Editor "Di Eugenio" is entered in "Last, First" form with the particle kept
% in the surname so BibTeX does not parse "Di" as part of the given name.
@inproceedings{tang-vukovic-2025-nlp,
    title = "{NLP} for preserving {Torlak}, a vulnerable low-resource {Slavic} language",
    author = "Tang, Li and
      Vukovi{\'c}, Teodora",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.423/",
    pages = "6338--6347",
    abstract = "Torlak is an endangered, low-resource Slavic language with a high degree of areal and inter-speaker variation. In previous work, interviews were performed with Torlak speakers in Serbia, near the Bulgarian border, and the transcripts annotated with lemma and morphosyntactic descriptions at token level. As such token-level annotations facilitate cross-language comparison in the context of the Balkan Sprachbund, where multiple languages influenced Torlak over time, including Serbian and Bulgarian. Here, we aim to improve the prediction of morphosyntactic annotations for this low-resource language using the fine-tuning of large language models, comparing several predictive models. We also further fine-tuned the large language models for scoring the degree of {\textquoteleft}Torlakness{\textquoteright} of a sentence by labeling likely Torlak tokens, to facilitate the documentation of additional Torlak transcribed speech with a high degree of Torlak-style non-standard features compared to standard Serbian. Taken together, we hope that these contributions will help to document this endangered language, and improve digital access for its speakers."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for one conference paper; the host proceedings (title,
       editors, publisher, place) is described in the relatedItem element. -->
  <mods ID="tang-vukovic-2025-nlp">
    <titleInfo>
      <title>NLP for preserving Torlak, a vulnerable low-resource Slavic language</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Li</namePart>
      <namePart type="family">Tang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Teodora</namePart>
      <namePart type="family">Vuković</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Owen</namePart>
        <namePart type="family">Rambow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marianna</namePart>
        <namePart type="family">Apidianaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <!-- "Di" belongs to the family name (Di Eugenio), not the given name. -->
        <namePart type="given">Barbara</namePart>
        <namePart type="family">Di Eugenio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Schockaert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Torlak is an endangered, low-resource Slavic language with a high degree of areal and inter-speaker variation. In previous work, interviews were performed with Torlak speakers in Serbia, near the Bulgarian border, and the transcripts annotated with lemma and morphosyntactic descriptions at token level. As such token-level annotations facilitate cross-language comparison in the context of the Balkan Sprachbund, where multiple languages influenced Torlak over time, including Serbian and Bulgarian. Here, we aim to improve the prediction of morphosyntactic annotations for this low-resource language using the fine-tuning of large language models, comparing several predictive models. We also further fine-tuned the large language models for scoring the degree of ‘Torlakness’ of a sentence by labeling likely Torlak tokens, to facilitate the documentation of additional Torlak transcribed speech with a high degree of Torlak-style non-standard features compared to standard Serbian. Taken together, we hope that these contributions will help to document this endangered language, and improve digital access for its speakers.</abstract>
    <identifier type="citekey">tang-vukovic-2025-nlp</identifier>
    <location>
      <url>https://aclanthology.org/2025.coling-main.423/</url>
    </location>
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>6338</start>
        <end>6347</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T NLP for preserving Torlak, a vulnerable low-resource Slavic language
%A Tang, Li
%A Vuković, Teodora
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F tang-vukovic-2025-nlp
%X Torlak is an endangered, low-resource Slavic language with a high degree of areal and inter-speaker variation. In previous work, interviews were performed with Torlak speakers in Serbia, near the Bulgarian border, and the transcripts annotated with lemma and morphosyntactic descriptions at token level. As such token-level annotations facilitate cross-language comparison in the context of the Balkan Sprachbund, where multiple languages influenced Torlak over time, including Serbian and Bulgarian. Here, we aim to improve the prediction of morphosyntactic annotations for this low-resource language using the fine-tuning of large language models, comparing several predictive models. We also further fine-tuned the large language models for scoring the degree of ‘Torlakness’ of a sentence by labeling likely Torlak tokens, to facilitate the documentation of additional Torlak transcribed speech with a high degree of Torlak-style non-standard features compared to standard Serbian. Taken together, we hope that these contributions will help to document this endangered language, and improve digital access for its speakers.
%U https://aclanthology.org/2025.coling-main.423/
%P 6338-6347
Markdown (Informal)
[NLP for preserving Torlak, a vulnerable low-resource Slavic language](https://aclanthology.org/2025.coling-main.423/) (Tang & Vuković, COLING 2025)
ACL