@inproceedings{krol-etal-2025-lemmatization,
title = "Lemmatization of {P}olish Multi-word Expressions",
author = "Kr{\'o}l, Magdalena and
Smywi{\'n}ski-Pohl, Aleksander and
Kaleta, Zbigniew and
Lewkowicz, Pawe{\l}",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1126/",
pages = "22159--22168",
ISBN = "979-8-89176-332-6",
abstract = "This paper explores the lemmatization of multi-word expressions (MWEs) and proper names in Polish {--} tasks complicated by linguistic irregularities and historical factors. Instead of using rule-based methods, we apply a machine learning approach with fine-tuned plT5 and mT5 models. We trained and validated the models on enhanced gold-standard data from the 2019 PolEval task and evaluated the impact of additional fine-tuning on a silver-standard dataset derived from Wikipedia. Two setups were tested: one without context, and one using left-side context of the target MWE. Our best model achieved 86.23{\%} AccCS (Accuracy Case-Sensitive), 89.43{\%} AccCI (Accuracy Case-Insensitive), and a combined score of 88.79{\%}, setting a new state-of-the-art for Polish MWE and named entity lemmatization, as confirmed by the PolEval maintainers. We also evaluated optimization and quantization techniques to reduce model size and inference time with minimal quality loss."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="krol-etal-2025-lemmatization">
<titleInfo>
<title>Lemmatization of Polish Multi-word Expressions</title>
</titleInfo>
<name type="personal">
<namePart type="given">Magdalena</namePart>
<namePart type="family">Król</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Aleksander</namePart>
<namePart type="family">Smywiński-Pohl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zbigniew</namePart>
<namePart type="family">Kaleta</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Paweł</namePart>
<namePart type="family">Lewkowicz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>This paper explores the lemmatization of multi-word expressions (MWEs) and proper names in Polish – tasks complicated by linguistic irregularities and historical factors. Instead of using rule-based methods, we apply a machine learning approach with fine-tuned plT5 and mT5 models. We trained and validated the models on enhanced gold-standard data from the 2019 PolEval task and evaluated the impact of additional fine-tuning on a silver-standard dataset derived from Wikipedia. Two setups were tested: one without context, and one using left-side context of the target MWE. Our best model achieved 86.23% AccCS (Accuracy Case-Sensitive), 89.43% AccCI (Accuracy Case-Insensitive), and a combined score of 88.79%, setting a new state-of-the-art for Polish MWE and named entity lemmatization, as confirmed by the PolEval maintainers. We also evaluated optimization and quantization techniques to reduce model size and inference time with minimal quality loss.</abstract>
<identifier type="citekey">krol-etal-2025-lemmatization</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1126/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>22159</start>
<end>22168</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Lemmatization of Polish Multi-word Expressions
%A Król, Magdalena
%A Smywiński-Pohl, Aleksander
%A Kaleta, Zbigniew
%A Lewkowicz, Paweł
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F krol-etal-2025-lemmatization
%X This paper explores the lemmatization of multi-word expressions (MWEs) and proper names in Polish – tasks complicated by linguistic irregularities and historical factors. Instead of using rule-based methods, we apply a machine learning approach with fine-tuned plT5 and mT5 models. We trained and validated the models on enhanced gold-standard data from the 2019 PolEval task and evaluated the impact of additional fine-tuning on a silver-standard dataset derived from Wikipedia. Two setups were tested: one without context, and one using left-side context of the target MWE. Our best model achieved 86.23% AccCS (Accuracy Case-Sensitive), 89.43% AccCI (Accuracy Case-Insensitive), and a combined score of 88.79%, setting a new state-of-the-art for Polish MWE and named entity lemmatization, as confirmed by the PolEval maintainers. We also evaluated optimization and quantization techniques to reduce model size and inference time with minimal quality loss.
%U https://aclanthology.org/2025.emnlp-main.1126/
%P 22159-22168
Markdown (Informal)
[Lemmatization of Polish Multi-word Expressions](https://aclanthology.org/2025.emnlp-main.1126/) (Król et al., EMNLP 2025)
ACL
- Magdalena Król, Aleksander Smywiński-Pohl, Zbigniew Kaleta, and Paweł Lewkowicz. 2025. Lemmatization of Polish Multi-word Expressions. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 22159–22168, Suzhou, China. Association for Computational Linguistics.