% Workshop paper (VarDial 2020) on tokenizing Sorani and Kurmanji Kurdish.
% Case-sensitive words in title/booktitle are brace-protected as whole words
% so that sentence-casing bibliography styles keep their capitalisation.
@inproceedings{ahmadi-2020-tokenization,
    author    = {Ahmadi, Sina},
    title     = {A Tokenization System for the {Kurdish} Language},
    editor    = {Zampieri, Marcos and
                 Nakov, Preslav and
                 Ljube{\v{s}}i{\'c}, Nikola and
                 Tiedemann, J{\"o}rg and
                 Scherrer, Yves},
    booktitle = {Proceedings of the 7th Workshop on {NLP} for Similar Languages, Varieties and Dialects},
    month     = dec,
    year      = {2020},
    address   = {Barcelona, Spain (Online)},
    publisher = {International Committee on Computational Linguistics (ICCL)},
    pages     = {114--127},
    url       = {https://aclanthology.org/2020.vardial-1.11},
    abstract  = {Tokenization is one of the essential and fundamental tasks in natural language processing. Despite the recent advances in applying unsupervised statistical methods for this task, every language with its writing system and orthography represents specific challenges that should be addressed individually. In this paper, as a preliminary study of its kind, we propose an approach for the tokenization of the Sorani and Kurmanji dialects of Kurdish using a lexicon and a morphological analyzer. We demonstrate how the morphological complexity of the language along with the lack of a unified orthography can be efficiently addressed in tokenization. We also develop an annotated dataset for which our approach outperforms the performance of unsupervised methods.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS export of the record with citekey ahmadi-2020-tokenization. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ahmadi-2020-tokenization">
    <!-- The paper itself: title, author, issue date. -->
    <titleInfo>
      <title>A Tokenization System for the Kurdish Language</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Sina</namePart>
      <namePart type="family">Ahmadi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-12</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marcos</namePart>
        <namePart type="family">Zampieri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Preslav</namePart>
        <namePart type="family">Nakov</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Nikola</namePart>
        <namePart type="family">Ljubešić</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jörg</namePart>
        <namePart type="family">Tiedemann</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Yves</namePart>
        <namePart type="family">Scherrer</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>International Committee on Computational Linguistics (ICCL)</publisher>
        <place>
          <placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Tokenization is one of the essential and fundamental tasks in natural language processing. Despite the recent advances in applying unsupervised statistical methods for this task, every language with its writing system and orthography represents specific challenges that should be addressed individually. In this paper, as a preliminary study of its kind, we propose an approach for the tokenization of the Sorani and Kurmanji dialects of Kurdish using a lexicon and a morphological analyzer. We demonstrate how the morphological complexity of the language along with the lack of a unified orthography can be efficiently addressed in tokenization. We also develop an annotated dataset for which our approach outperforms the performance of unsupervised methods.</abstract>
    <identifier type="citekey">ahmadi-2020-tokenization</identifier>
    <location>
      <url>https://aclanthology.org/2020.vardial-1.11</url>
    </location>
    <!-- Position of the paper within the host volume (page range). -->
    <part>
      <date>2020-12</date>
      <extent unit="page">
        <start>114</start>
        <end>127</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A Tokenization System for the Kurdish Language
%A Ahmadi, Sina
%Y Zampieri, Marcos
%Y Nakov, Preslav
%Y Ljubešić, Nikola
%Y Tiedemann, Jörg
%Y Scherrer, Yves
%S Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects
%D 2020
%8 December
%I International Committee on Computational Linguistics (ICCL)
%C Barcelona, Spain (Online)
%F ahmadi-2020-tokenization
%X Tokenization is one of the essential and fundamental tasks in natural language processing. Despite the recent advances in applying unsupervised statistical methods for this task, every language with its writing system and orthography represents specific challenges that should be addressed individually. In this paper, as a preliminary study of its kind, we propose an approach for the tokenization of the Sorani and Kurmanji dialects of Kurdish using a lexicon and a morphological analyzer. We demonstrate how the morphological complexity of the language along with the lack of a unified orthography can be efficiently addressed in tokenization. We also develop an annotated dataset for which our approach outperforms the performance of unsupervised methods.
%U https://aclanthology.org/2020.vardial-1.11
%P 114-127
Markdown (Informal)
[A Tokenization System for the Kurdish Language](https://aclanthology.org/2020.vardial-1.11) (Ahmadi, VarDial 2020)
ACL
- Sina Ahmadi. 2020. A Tokenization System for the Kurdish Language. In Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects, pages 114–127, Barcelona, Spain (Online). International Committee on Computational Linguistics (ICCL).