@inproceedings{morozov-etal-2025-bert,
title = "{BERT}-like Models for {S}lavic Morpheme Segmentation",
author = "Morozov, Dmitry and
Astapenka, Lizaveta and
Glazkova, Anna and
Garipov, Timur and
Lyashevskaya, Olga",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.337/",
doi = "10.18653/v1/2025.acl-long.337",
pages = "6795--6815",
ISBN = "979-8-89176-251-0",
abstract = "Automatic morpheme segmentation algorithms are applicable in various tasks, such as building tokenizers and language education. For Slavic languages, the development of such algorithms is complicated by the rich derivational capabilities of these languages. Previous research has shown that, on average, these algorithms have already reached expert-level quality. However, a key unresolved issue is the significant decline in performance when segmenting words containing roots not present in the training data. This problem can be partially addressed by using pre-trained language models to better account for word semantics. In this work, we explored the possibility of fine-tuning BERT-like models for morpheme segmentation using data from Belarusian, Czech, and Russian. We found that for Czech and Russian, our models outperform all previously proposed approaches, achieving word-level accuracy of 92.5-95.1{\%}. For Belarusian, this task was addressed for the first time. The best-performing approach for Belarusian was an ensemble of convolutional neural networks with word-level accuracy of 90.45{\%}."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="morozov-etal-2025-bert">
<titleInfo>
<title>BERT-like Models for Slavic Morpheme Segmentation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dmitry</namePart>
<namePart type="family">Morozov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lizaveta</namePart>
<namePart type="family">Astapenka</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Glazkova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timur</namePart>
<namePart type="family">Garipov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Olga</namePart>
<namePart type="family">Lyashevskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-251-0</identifier>
</relatedItem>
<abstract>Automatic morpheme segmentation algorithms are applicable in various tasks, such as building tokenizers and language education. For Slavic languages, the development of such algorithms is complicated by the rich derivational capabilities of these languages. Previous research has shown that, on average, these algorithms have already reached expert-level quality. However, a key unresolved issue is the significant decline in performance when segmenting words containing roots not present in the training data. This problem can be partially addressed by using pre-trained language models to better account for word semantics. In this work, we explored the possibility of fine-tuning BERT-like models for morpheme segmentation using data from Belarusian, Czech, and Russian. We found that for Czech and Russian, our models outperform all previously proposed approaches, achieving word-level accuracy of 92.5-95.1%. For Belarusian, this task was addressed for the first time. The best-performing approach for Belarusian was an ensemble of convolutional neural networks with word-level accuracy of 90.45%.</abstract>
<identifier type="citekey">morozov-etal-2025-bert</identifier>
<identifier type="doi">10.18653/v1/2025.acl-long.337</identifier>
<location>
<url>https://aclanthology.org/2025.acl-long.337/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>6795</start>
<end>6815</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BERT-like Models for Slavic Morpheme Segmentation
%A Morozov, Dmitry
%A Astapenka, Lizaveta
%A Glazkova, Anna
%A Garipov, Timur
%A Lyashevskaya, Olga
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-251-0
%F morozov-etal-2025-bert
%X Automatic morpheme segmentation algorithms are applicable in various tasks, such as building tokenizers and language education. For Slavic languages, the development of such algorithms is complicated by the rich derivational capabilities of these languages. Previous research has shown that, on average, these algorithms have already reached expert-level quality. However, a key unresolved issue is the significant decline in performance when segmenting words containing roots not present in the training data. This problem can be partially addressed by using pre-trained language models to better account for word semantics. In this work, we explored the possibility of fine-tuning BERT-like models for morpheme segmentation using data from Belarusian, Czech, and Russian. We found that for Czech and Russian, our models outperform all previously proposed approaches, achieving word-level accuracy of 92.5-95.1%. For Belarusian, this task was addressed for the first time. The best-performing approach for Belarusian was an ensemble of convolutional neural networks with word-level accuracy of 90.45%.
%R 10.18653/v1/2025.acl-long.337
%U https://aclanthology.org/2025.acl-long.337/
%U https://doi.org/10.18653/v1/2025.acl-long.337
%P 6795-6815
Markdown (Informal)
[BERT-like Models for Slavic Morpheme Segmentation](https://aclanthology.org/2025.acl-long.337/) (Morozov et al., ACL 2025)
ACL
- Dmitry Morozov, Lizaveta Astapenka, Anna Glazkova, Timur Garipov, and Olga Lyashevskaya. 2025. BERT-like Models for Slavic Morpheme Segmentation. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 6795–6815, Vienna, Austria. Association for Computational Linguistics.