@inproceedings{gaim-etal-2022-geezswitch,
title = "{G}eez{S}witch: Language Identification in Typologically Related Low-resourced {E}ast {A}frican Languages",
author = "Gaim, Fitsum and
Yang, Wonsuk and
Park, Jong C.",
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
month = jun,
year = "2022",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2022.lrec-1.707",
pages = "6578--6584",
abstract = "Language identification is one of the fundamental tasks in natural language processing that is a prerequisite to data processing and numerous applications. Low-resourced languages with similar typologies are generally confused with each other in real-world applications such as machine translation, affecting the user{'}s experience. In this work, we present a language identification dataset for five typologically and phylogenetically related low-resourced East African languages that use the Ge{'}ez script as a writing system; namely Amharic, Blin, Ge{'}ez, Tigre, and Tigrinya. The dataset is built automatically from selected data sources, but we also performed a manual evaluation to assess its quality. Our approach to constructing the dataset is cost-effective and applicable to other low-resource languages. We integrated the dataset into an existing language-identification tool and also fine-tuned several Transformer based language models, achieving very strong results in all cases. While the task of language identification is easy for the informed person, such datasets can make a difference in real-world deployments and also serve as part of a benchmark for language understanding in the target languages. The data and models are made available at \url{https://github.com/fgaim/geezswitch}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gaim-etal-2022-geezswitch">
<titleInfo>
<title>GeezSwitch: Language Identification in Typologically Related Low-resourced East African Languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Fitsum</namePart>
<namePart type="family">Gaim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wonsuk</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jong</namePart>
<namePart type="given">C</namePart>
<namePart type="family">Park</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Thirteenth Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Language identification is one of the fundamental tasks in natural language processing that is a prerequisite to data processing and numerous applications. Low-resourced languages with similar typologies are generally confused with each other in real-world applications such as machine translation, affecting the user’s experience. In this work, we present a language identification dataset for five typologically and phylogenetically related low-resourced East African languages that use the Ge’ez script as a writing system; namely Amharic, Blin, Ge’ez, Tigre, and Tigrinya. The dataset is built automatically from selected data sources, but we also performed a manual evaluation to assess its quality. Our approach to constructing the dataset is cost-effective and applicable to other low-resource languages. We integrated the dataset into an existing language-identification tool and also fine-tuned several Transformer based language models, achieving very strong results in all cases. While the task of language identification is easy for the informed person, such datasets can make a difference in real-world deployments and also serve as part of a benchmark for language understanding in the target languages. The data and models are made available at https://github.com/fgaim/geezswitch.</abstract>
<identifier type="citekey">gaim-etal-2022-geezswitch</identifier>
<location>
<url>https://aclanthology.org/2022.lrec-1.707</url>
</location>
<part>
<date>2022-06</date>
<extent unit="page">
<start>6578</start>
<end>6584</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T GeezSwitch: Language Identification in Typologically Related Low-resourced East African Languages
%A Gaim, Fitsum
%A Yang, Wonsuk
%A Park, Jong C.
%S Proceedings of the Thirteenth Language Resources and Evaluation Conference
%D 2022
%8 June
%I European Language Resources Association
%C Marseille, France
%F gaim-etal-2022-geezswitch
%X Language identification is one of the fundamental tasks in natural language processing that is a prerequisite to data processing and numerous applications. Low-resourced languages with similar typologies are generally confused with each other in real-world applications such as machine translation, affecting the user’s experience. In this work, we present a language identification dataset for five typologically and phylogenetically related low-resourced East African languages that use the Ge’ez script as a writing system; namely Amharic, Blin, Ge’ez, Tigre, and Tigrinya. The dataset is built automatically from selected data sources, but we also performed a manual evaluation to assess its quality. Our approach to constructing the dataset is cost-effective and applicable to other low-resource languages. We integrated the dataset into an existing language-identification tool and also fine-tuned several Transformer based language models, achieving very strong results in all cases. While the task of language identification is easy for the informed person, such datasets can make a difference in real-world deployments and also serve as part of a benchmark for language understanding in the target languages. The data and models are made available at https://github.com/fgaim/geezswitch.
%U https://aclanthology.org/2022.lrec-1.707
%P 6578-6584
Markdown (Informal)
[GeezSwitch: Language Identification in Typologically Related Low-resourced East African Languages](https://aclanthology.org/2022.lrec-1.707) (Gaim et al., LREC 2022)
ACL