@comment{
  Workshop paper by Ahmadi, Azin, Belelli and Anastasopoulos, published in the
  Proceedings of the Second Workshop on NLP Applications to Field Linguistics.
  Proper nouns and acronyms in title and booktitle are brace-protected as whole
  words so that sentence-casing styles keep their capitalisation.
  NOTE(review): address is kept as exported; it presumably names the conference
  venue rather than the publisher's city. Confirm before moving it elsewhere.
}
@inproceedings{ahmadi-etal-2023-approaches,
  title     = {Approaches to Corpus Creation for Low-Resource Language Technology: the Case of {Southern} {Kurdish} and {Laki}},
  author    = {Ahmadi, Sina and
               Azin, Zahra and
               Belelli, Sara and
               Anastasopoulos, Antonios},
  editor    = {Serikov, Oleg and
               Voloshina, Ekaterina and
               Postnikova, Anna and
               Klyachko, Elena and
               Vylomova, Ekaterina and
               Shavrina, Tatiana and
               Le Ferrand, Eric and
               Malykh, Valentin and
               Tyers, Francis and
               Arkhangelskiy, Timofey and
               Mikhailov, Vladislav},
  booktitle = {Proceedings of the Second Workshop on {NLP} Applications to Field Linguistics},
  month     = may,
  year      = {2023},
  address   = {Dubrovnik, Croatia},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.fieldmatters-1.7},
  doi       = {10.18653/v1/2023.fieldmatters-1.7},
  pages     = {52--63},
  abstract  = {One of the major challenges that under-represented and endangered language communities face in language technology is the lack or paucity of language data. This is also the case of the Southern varieties of the Kurdish and Laki languages for which very limited resources are available with insubstantial progress in tools. To tackle this, we provide a few approaches that rely on the content of local news websites, a local radio station that broadcasts content in Southern Kurdish and fieldwork for Laki. In this paper, we describe some of the challenges of such under-represented languages, particularly in writing and standardization, and also, in retrieving sources of data and retro-digitizing handwritten content to create a corpus for Southern Kurdish and Laki. In addition, we study the task of language identification in light of the other variants of Kurdish and Zaza-Gorani languages.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single bibliographic record, ID ahmadi-etal-2023-approaches. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahmadi-etal-2023-approaches">
<!-- Title of the paper. -->
<titleInfo>
<title>Approaches to Corpus Creation for Low-Resource Language Technology: the Case of Southern Kurdish and Laki</title>
</titleInfo>
<!-- Personal names with role "author", one name element per person. -->
<name type="personal">
<namePart type="given">Sina</namePart>
<namePart type="family">Ahmadi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zahra</namePart>
<namePart type="family">Azin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sara</namePart>
<namePart type="family">Belelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Antonios</namePart>
<namePart type="family">Anastasopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<!-- Issue date of the paper, given as year and month. -->
<originInfo>
<dateIssued>2023-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<!-- Host item: the proceedings volume, with its title, editors, publisher and place. -->
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on NLP Applications to Field Linguistics</title>
</titleInfo>
<!-- Personal names with role "editor", one name element per person. -->
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Serikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Voloshina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Postnikova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Klyachko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatiana</namePart>
<namePart type="family">Shavrina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Le Ferrand</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentin</namePart>
<namePart type="family">Malykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Tyers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timofey</namePart>
<namePart type="family">Arkhangelskiy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vladislav</namePart>
<namePart type="family">Mikhailov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<!-- Publisher and place recorded for the host volume. -->
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dubrovnik, Croatia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<!-- Abstract of the paper. -->
<abstract>One of the major challenges that under-represented and endangered language communities face in language technology is the lack or paucity of language data. This is also the case of the Southern varieties of the Kurdish and Laki languages for which very limited resources are available with insubstantial progress in tools. To tackle this, we provide a few approaches that rely on the content of local news websites, a local radio station that broadcasts content in Southern Kurdish and fieldwork for Laki. In this paper, we describe some of the challenges of such under-represented languages, particularly in writing and standardization, and also, in retrieving sources of data and retro-digitizing handwritten content to create a corpus for Southern Kurdish and Laki. In addition, we study the task of language identification in light of the other variants of Kurdish and Zaza-Gorani languages.</abstract>
<!-- Identifiers: citation key and DOI, followed by the landing-page URL. -->
<identifier type="citekey">ahmadi-etal-2023-approaches</identifier>
<identifier type="doi">10.18653/v1/2023.fieldmatters-1.7</identifier>
<location>
<url>https://aclanthology.org/2023.fieldmatters-1.7</url>
</location>
<!-- Part details: date and page extent (start and end page). -->
<part>
<date>2023-05</date>
<extent unit="page">
<start>52</start>
<end>63</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Approaches to Corpus Creation for Low-Resource Language Technology: the Case of Southern Kurdish and Laki
%A Ahmadi, Sina
%A Azin, Zahra
%A Belelli, Sara
%A Anastasopoulos, Antonios
%Y Serikov, Oleg
%Y Voloshina, Ekaterina
%Y Postnikova, Anna
%Y Klyachko, Elena
%Y Vylomova, Ekaterina
%Y Shavrina, Tatiana
%Y Le Ferrand, Eric
%Y Malykh, Valentin
%Y Tyers, Francis
%Y Arkhangelskiy, Timofey
%Y Mikhailov, Vladislav
%S Proceedings of the Second Workshop on NLP Applications to Field Linguistics
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F ahmadi-etal-2023-approaches
%X One of the major challenges that under-represented and endangered language communities face in language technology is the lack or paucity of language data. This is also the case of the Southern varieties of the Kurdish and Laki languages for which very limited resources are available with insubstantial progress in tools. To tackle this, we provide a few approaches that rely on the content of local news websites, a local radio station that broadcasts content in Southern Kurdish and fieldwork for Laki. In this paper, we describe some of the challenges of such under-represented languages, particularly in writing and standardization, and also, in retrieving sources of data and retro-digitizing handwritten content to create a corpus for Southern Kurdish and Laki. In addition, we study the task of language identification in light of the other variants of Kurdish and Zaza-Gorani languages.
%R 10.18653/v1/2023.fieldmatters-1.7
%U https://aclanthology.org/2023.fieldmatters-1.7
%U https://doi.org/10.18653/v1/2023.fieldmatters-1.7
%P 52-63
Markdown (Informal)
[Approaches to Corpus Creation for Low-Resource Language Technology: the Case of Southern Kurdish and Laki](https://aclanthology.org/2023.fieldmatters-1.7) (Ahmadi et al., FieldMatters 2023)
ACL