@inproceedings{zalmout-habash-2020-utilizing,
title = "Utilizing Subword Entities in Character-Level Sequence-to-Sequence Lemmatization Models",
author = "Zalmout, Nasser and
Habash, Nizar",
editor = "Scott, Donia and
Bel, Nuria and
Zong, Chengqing",
booktitle = "Proceedings of the 28th International Conference on Computational Linguistics",
month = dec,
year = "2020",
address = "Barcelona, Spain (Online)",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2020.coling-main.412",
doi = "10.18653/v1/2020.coling-main.412",
pages = "4676--4682",
abstract = "In this paper we present a character-level sequence-to-sequence lemmatization model, utilizing several subword features in multiple configurations. In addition to generic n-gram embeddings (using FastText), we experiment with concatenative (stems) and templatic (roots and patterns) morphological subwords. We present several architectures that embed these features directly at the encoder side, or learn them jointly at the decoder side with a multitask learning architecture. The results indicate that using the generic n-gram embeddings (through FastText) outperform the other linguistically-driven subwords. We use Modern Standard Arabic and Egyptian Arabic as test cases, with up to 22{\%} and 13{\%} relative error reduction, respectively, from a strong baseline. An error analysis shows that our best system is even able to handle word/lemma pairs that are both unseen in the training data.",
}
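
As a quick usage sketch, the BibTeX record above can be read programmatically. This is a minimal sketch, not part of the record itself: it assumes the entry is saved to a hypothetical file zalmout2020.bib and that the third-party bibtexparser package (v1 API) is installed.

import bibtexparser  # third-party; pip install bibtexparser

# Parse the BibTeX record above (the filename is hypothetical).
with open("zalmout2020.bib") as f:
    db = bibtexparser.load(f)

entry = db.entries[0]            # fields become lowercase dict keys
print(entry["title"])            # Utilizing Subword Entities in ...
print(entry["pages"])            # 4676--4682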
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zalmout-habash-2020-utilizing">
<titleInfo>
<title>Utilizing Subword Entities in Character-Level Sequence-to-Sequence Lemmatization Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nasser</namePart>
<namePart type="family">Zalmout</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nizar</namePart>
<namePart type="family">Habash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 28th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Donia</namePart>
<namePart type="family">Scott</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nuria</namePart>
<namePart type="family">Bel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>In this paper we present a character-level sequence-to-sequence lemmatization model, utilizing several subword features in multiple configurations. In addition to generic n-gram embeddings (using FastText), we experiment with concatenative (stems) and templatic (roots and patterns) morphological subwords. We present several architectures that embed these features directly at the encoder side, or learn them jointly at the decoder side with a multitask learning architecture. The results indicate that using the generic n-gram embeddings (through FastText) outperforms the other linguistically driven subwords. We use Modern Standard Arabic and Egyptian Arabic as test cases, with up to 22% and 13% relative error reduction, respectively, from a strong baseline. An error analysis shows that our best system is even able to handle word/lemma pairs that are both unseen in the training data.</abstract>
<identifier type="citekey">zalmout-habash-2020-utilizing</identifier>
<identifier type="doi">10.18653/v1/2020.coling-main.412</identifier>
<location>
<url>https://aclanthology.org/2020.coling-main.412</url>
</location>
<part>
<date>2020-12</date>
<extent unit="page">
<start>4676</start>
<end>4682</end>
</extent>
</part>
</mods>
</modsCollection>
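
The MODS record above can be queried with the Python standard library alone. A minimal sketch, assuming the XML is saved to a hypothetical file zalmout2020.xml:

import xml.etree.ElementTree as ET

# The MODS v3 namespace declared on <modsCollection> above.
NS = {"m": "http://www.loc.gov/mods/v3"}

root = ET.parse("zalmout2020.xml").getroot()          # filename is hypothetical
title = root.find(".//m:titleInfo/m:title", NS).text  # first match: article title
start = root.find(".//m:extent/m:start", NS).text
end = root.find(".//m:extent/m:end", NS).text
print(f"{title}, pp. {start}-{end}")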
%0 Conference Proceedings
%T Utilizing Subword Entities in Character-Level Sequence-to-Sequence Lemmatization Models
%A Zalmout, Nasser
%A Habash, Nizar
%Y Scott, Donia
%Y Bel, Nuria
%Y Zong, Chengqing
%S Proceedings of the 28th International Conference on Computational Linguistics
%D 2020
%8 December
%I International Committee on Computational Linguistics
%C Barcelona, Spain (Online)
%F zalmout-habash-2020-utilizing
%X In this paper we present a character-level sequence-to-sequence lemmatization model, utilizing several subword features in multiple configurations. In addition to generic n-gram embeddings (using FastText), we experiment with concatenative (stems) and templatic (roots and patterns) morphological subwords. We present several architectures that embed these features directly at the encoder side, or learn them jointly at the decoder side with a multitask learning architecture. The results indicate that using the generic n-gram embeddings (through FastText) outperforms the other linguistically driven subwords. We use Modern Standard Arabic and Egyptian Arabic as test cases, with up to 22% and 13% relative error reduction, respectively, from a strong baseline. An error analysis shows that our best system is even able to handle word/lemma pairs that are both unseen in the training data.
%R 10.18653/v1/2020.coling-main.412
%U https://aclanthology.org/2020.coling-main.412
%U https://doi.org/10.18653/v1/2020.coling-main.412
%P 4676-4682
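
The Endnote/refer record above is line-oriented: each line carries a percent tag followed by a value, and repeated tags (%A, %Y, %U) accumulate. A minimal sketch of reading it into a dict, again with a hypothetical filename:

# Collect tag -> list of values; repeated tags (%A, %Y, %U) accumulate.
tags = {}
with open("zalmout2020.enw") as f:       # filename is hypothetical
    for line in f:
        if line.startswith("%"):
            tag, _, value = line.rstrip("\n").partition(" ")
            tags.setdefault(tag, []).append(value)

print(tags["%T"][0])   # title
print(tags["%A"])      # ['Zalmout, Nasser', 'Habash, Nizar']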
Markdown (Informal)
[Utilizing Subword Entities in Character-Level Sequence-to-Sequence Lemmatization Models](https://aclanthology.org/2020.coling-main.412) (Zalmout & Habash, COLING 2020)
ACL
Nasser Zalmout and Nizar Habash. 2020. [Utilizing Subword Entities in Character-Level Sequence-to-Sequence Lemmatization Models](https://aclanthology.org/2020.coling-main.412). In *Proceedings of the 28th International Conference on Computational Linguistics*, pages 4676–4682, Barcelona, Spain (Online). International Committee on Computational Linguistics.