@inproceedings{aker-etal-2017-extensible,
title = "An Extensible Multilingual Open Source Lemmatizer",
author = "Aker, Ahmet and
Petrak, Johann and
Sabbah, Firas",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the International Conference Recent Advances in Natural Language Processing, {RANLP} 2017",
month = sep,
year = "2017",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd.",
url = "https://doi.org/10.26615/978-954-452-049-6_006",
doi = "10.26615/978-954-452-049-6_006",
pages = "40--45",
abstract = "We present GATE DictLemmatizer, a multilingual open source lemmatizer for the GATE NLP framework that currently supports English, German, Italian, French, Dutch, and Spanish, and is easily extensible to other languages. The software is freely available under the LGPL license. The lemmatization is based on the Helsinki Finite-State Transducer Technology (HFST) and lemma dictionaries automatically created from Wiktionary. We evaluate the performance of the lemmatizers against TreeTagger, which is only freely available for research purposes. Our evaluation shows that DictLemmatizer achieves similar or even better results than TreeTagger for languages where there is support from HFST. The performance drops when there is no support from HFST and the entire lemmatization process is based on lemma dictionaries. However, the results are still satisfactory given the fact that DictLemmatizer isopen-source and can be easily extended to other languages. The software for extending the lemmatizer by creating word lists from Wiktionary dictionaries is also freely available as open-source software.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="aker-etal-2017-extensible">
<titleInfo>
<title>An Extensible Multilingual Open Source Lemmatizer</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ahmet</namePart>
<namePart type="family">Aker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Johann</namePart>
<namePart type="family">Petrak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Firas</namePart>
<namePart type="family">Sabbah</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruslan</namePart>
<namePart type="family">Mitkov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Galia</namePart>
<namePart type="family">Angelova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>INCOMA Ltd.</publisher>
<place>
<placeTerm type="text">Varna, Bulgaria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We present GATE DictLemmatizer, a multilingual open source lemmatizer for the GATE NLP framework that currently supports English, German, Italian, French, Dutch, and Spanish, and is easily extensible to other languages. The software is freely available under the LGPL license. The lemmatization is based on the Helsinki Finite-State Transducer Technology (HFST) and lemma dictionaries automatically created from Wiktionary. We evaluate the performance of the lemmatizers against TreeTagger, which is only freely available for research purposes. Our evaluation shows that DictLemmatizer achieves similar or even better results than TreeTagger for languages where there is support from HFST. The performance drops when there is no support from HFST and the entire lemmatization process is based on lemma dictionaries. However, the results are still satisfactory given the fact that DictLemmatizer isopen-source and can be easily extended to other languages. The software for extending the lemmatizer by creating word lists from Wiktionary dictionaries is also freely available as open-source software.</abstract>
<identifier type="citekey">aker-etal-2017-extensible</identifier>
<identifier type="doi">10.26615/978-954-452-049-6_006</identifier>
<part>
<date>2017-09</date>
<extent unit="page">
<start>40</start>
<end>45</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T An Extensible Multilingual Open Source Lemmatizer
%A Aker, Ahmet
%A Petrak, Johann
%A Sabbah, Firas
%Y Mitkov, Ruslan
%Y Angelova, Galia
%S Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017
%D 2017
%8 September
%I INCOMA Ltd.
%C Varna, Bulgaria
%F aker-etal-2017-extensible
%X We present GATE DictLemmatizer, a multilingual open source lemmatizer for the GATE NLP framework that currently supports English, German, Italian, French, Dutch, and Spanish, and is easily extensible to other languages. The software is freely available under the LGPL license. The lemmatization is based on the Helsinki Finite-State Transducer Technology (HFST) and lemma dictionaries automatically created from Wiktionary. We evaluate the performance of the lemmatizers against TreeTagger, which is only freely available for research purposes. Our evaluation shows that DictLemmatizer achieves similar or even better results than TreeTagger for languages where there is support from HFST. The performance drops when there is no support from HFST and the entire lemmatization process is based on lemma dictionaries. However, the results are still satisfactory given the fact that DictLemmatizer isopen-source and can be easily extended to other languages. The software for extending the lemmatizer by creating word lists from Wiktionary dictionaries is also freely available as open-source software.
%R 10.26615/978-954-452-049-6_006
%U https://doi.org/10.26615/978-954-452-049-6_006
%P 40-45
Markdown (Informal)
[An Extensible Multilingual Open Source Lemmatizer](https://doi.org/10.26615/978-954-452-049-6_006) (Aker et al., RANLP 2017)
ACL
- Ahmet Aker, Johann Petrak, and Firas Sabbah. 2017. An Extensible Multilingual Open Source Lemmatizer. In Proceedings of the International Conference Recent Advances in Natural Language Processing, RANLP 2017, pages 40–45, Varna, Bulgaria. INCOMA Ltd..