@inproceedings{afanasev-etal-2025-rubic2,
title = "Rubic2: Ensemble Model for {R}ussian Lemmatization",
author = "Afanasev, Ilia and
Glazkova, Anna and
Lyashevskaya, Olga and
Morozov, Dmitry and
Smal, Ivan and
Vlasova, Natalia",
editor = "Piskorski, Jakub and
P{\v{r}}ib{\'a}{\v{n}}, Pavel and
Nakov, Preslav and
Yangarber, Roman and
Marcinczuk, Michal",
booktitle = "Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.bsnlp-1.18/",
doi = "10.18653/v1/2025.bsnlp-1.18",
pages = "157--170",
ISBN = "978-1-959429-57-9",
abstract = "Pre-trained language models have significantly advanced natural language processing (NLP), particularly in analyzing languages with complex morphological structures. This study addresses lemmatization for the Russian language, the errors in which can critically affect the performance of information retrieval, question answering, and other tasks. We present the results of experiments on generative lemmatization using pre-trained language models. Our findings demonstrate that combining generative models with the existing solutions allows achieving performance that surpasses current results for the lemmatization of Russian. This paper also introduces Rubic2, a new ensemble approach that combines the generative BART-base model, fine-tuned on a manually annotated data set of 2.1 million tokens, with the neural model called Rubic which is currently used for morphological annotation and lemmatization in the Russian National Corpus. Extensive experiments show that Rubic2 outperforms current solutions for the lemmatization of Russian, offering superior results across various text domains and contributing to advancements in NLP applications."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="afanasev-etal-2025-rubic2">
<titleInfo>
<title>Rubic2: Ensemble Model for Russian Lemmatization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ilia</namePart>
<namePart type="family">Afanasev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Glazkova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Olga</namePart>
<namePart type="family">Lyashevskaya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dmitry</namePart>
<namePart type="family">Morozov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ivan</namePart>
<namePart type="family">Smal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natalia</namePart>
<namePart type="family">Vlasova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jakub</namePart>
<namePart type="family">Piskorski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pavel</namePart>
<namePart type="family">Přibáň</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Preslav</namePart>
<namePart type="family">Nakov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roman</namePart>
<namePart type="family">Yangarber</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Michal</namePart>
<namePart type="family">Marcinczuk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-1-959429-57-9</identifier>
</relatedItem>
<abstract>Pre-trained language models have significantly advanced natural language processing (NLP), particularly in analyzing languages with complex morphological structures. This study addresses lemmatization for the Russian language, the errors in which can critically affect the performance of information retrieval, question answering, and other tasks. We present the results of experiments on generative lemmatization using pre-trained language models. Our findings demonstrate that combining generative models with the existing solutions allows achieving performance that surpasses current results for the lemmatization of Russian. This paper also introduces Rubic2, a new ensemble approach that combines the generative BART-base model, fine-tuned on a manually annotated data set of 2.1 million tokens, with the neural model called Rubic which is currently used for morphological annotation and lemmatization in the Russian National Corpus. Extensive experiments show that Rubic2 outperforms current solutions for the lemmatization of Russian, offering superior results across various text domains and contributing to advancements in NLP applications.</abstract>
<identifier type="citekey">afanasev-etal-2025-rubic2</identifier>
<identifier type="doi">10.18653/v1/2025.bsnlp-1.18</identifier>
<location>
<url>https://aclanthology.org/2025.bsnlp-1.18/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>157</start>
<end>170</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Rubic2: Ensemble Model for Russian Lemmatization
%A Afanasev, Ilia
%A Glazkova, Anna
%A Lyashevskaya, Olga
%A Morozov, Dmitry
%A Smal, Ivan
%A Vlasova, Natalia
%Y Piskorski, Jakub
%Y Přibáň, Pavel
%Y Nakov, Preslav
%Y Yangarber, Roman
%Y Marcinczuk, Michal
%S Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025)
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 978-1-959429-57-9
%F afanasev-etal-2025-rubic2
%X Pre-trained language models have significantly advanced natural language processing (NLP), particularly in analyzing languages with complex morphological structures. This study addresses lemmatization for the Russian language, the errors in which can critically affect the performance of information retrieval, question answering, and other tasks. We present the results of experiments on generative lemmatization using pre-trained language models. Our findings demonstrate that combining generative models with the existing solutions allows achieving performance that surpasses current results for the lemmatization of Russian. This paper also introduces Rubic2, a new ensemble approach that combines the generative BART-base model, fine-tuned on a manually annotated data set of 2.1 million tokens, with the neural model called Rubic which is currently used for morphological annotation and lemmatization in the Russian National Corpus. Extensive experiments show that Rubic2 outperforms current solutions for the lemmatization of Russian, offering superior results across various text domains and contributing to advancements in NLP applications.
%R 10.18653/v1/2025.bsnlp-1.18
%U https://aclanthology.org/2025.bsnlp-1.18/
%U https://doi.org/10.18653/v1/2025.bsnlp-1.18
%P 157-170
Markdown (Informal)
[Rubic2: Ensemble Model for Russian Lemmatization](https://aclanthology.org/2025.bsnlp-1.18/) (Afanasev et al., BSNLP 2025)
ACL
- Ilia Afanasev, Anna Glazkova, Olga Lyashevskaya, Dmitry Morozov, Ivan Smal, and Natalia Vlasova. 2025. Rubic2: Ensemble Model for Russian Lemmatization. In Proceedings of the 10th Workshop on Slavic Natural Language Processing (Slavic NLP 2025), pages 157–170, Vienna, Austria. Association for Computational Linguistics.