@inproceedings{zaitsev-minchenko-2022-automatic,
title = "Automatic Detection of Borrowings in Low-Resource Languages of the {C}aucasus: {A}ndic branch",
author = "Zaitsev, Konstantin and
Minchenko, Anzhelika",
editor = "Serikov, Oleg and
Voloshina, Ekaterina and
Postnikova, Anna and
Klyachko, Elena and
Neminova, Ekaterina and
Vylomova, Ekaterina and
Shavrina, Tatiana and
Ferrand, Eric Le and
Malykh, Valentin and
Tyers, Francis and
Arkhangelskiy, Timofey and
Mikhailov, Vladislav and
Fenogenova, Alena",
booktitle = "Proceedings of the first workshop on NLP applications to field linguistics",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "International Conference on Computational Linguistics",
url = "https://aclanthology.org/2022.fieldmatters-1.4",
pages = "34--41",
abstract = "Linguistic borrowings occur in all languages. Andic languages of the Caucasus have borrowings from different donor-languages like Russian, Arabic, Persian. To automatically detect these borrowings, we propose a logistic regression model. The model was trained on the dataset which contains words in IPA from dictionaries of Andic languages. To improve model{'}s quality, we compared TfIdf and Count vectorizers and chose the second one. Besides, we added new features to the model. They were extracted using analysis of vectorizer features and using a language model. The model was evaluated by classification quality metrics (precision, recall and F1-score). The best average F1-score of all languages for words in IPA was about 0.78. Experiments showed that our model reaches good results not only with words in IPA but also with words in Cyrillic.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zaitsev-minchenko-2022-automatic">
<titleInfo>
<title>Automatic Detection of Borrowings in Low-Resource Languages of the Caucasus: Andic branch</title>
</titleInfo>
<name type="personal">
<namePart type="given">Konstantin</namePart>
<namePart type="family">Zaitsev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anzhelika</namePart>
<namePart type="family">Minchenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the first workshop on NLP applications to field linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Serikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Voloshina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="family">Postnikova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Elena</namePart>
<namePart type="family">Klyachko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Neminova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Vylomova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tatiana</namePart>
<namePart type="family">Shavrina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="given">Le</namePart>
<namePart type="family">Ferrand</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentin</namePart>
<namePart type="family">Malykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Tyers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Timofey</namePart>
<namePart type="family">Arkhangelskiy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vladislav</namePart>
<namePart type="family">Mikhailov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Alena</namePart>
<namePart type="family">Fenogenova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Conference on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Linguistic borrowings occur in all languages. Andic languages of the Caucasus have borrowings from different donor-languages like Russian, Arabic, Persian. To automatically detect these borrowings, we propose a logistic regression model. The model was trained on the dataset which contains words in IPA from dictionaries of Andic languages. To improve model’s quality, we compared TfIdf and Count vectorizers and chose the second one. Besides, we added new features to the model. They were extracted using analysis of vectorizer features and using a language model. The model was evaluated by classification quality metrics (precision, recall and F1-score). The best average F1-score of all languages for words in IPA was about 0.78. Experiments showed that our model reaches good results not only with words in IPA but also with words in Cyrillic.</abstract>
<identifier type="citekey">zaitsev-minchenko-2022-automatic</identifier>
<location>
<url>https://aclanthology.org/2022.fieldmatters-1.4</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>34</start>
<end>41</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Detection of Borrowings in Low-Resource Languages of the Caucasus: Andic branch
%A Zaitsev, Konstantin
%A Minchenko, Anzhelika
%Y Serikov, Oleg
%Y Voloshina, Ekaterina
%Y Postnikova, Anna
%Y Klyachko, Elena
%Y Neminova, Ekaterina
%Y Vylomova, Ekaterina
%Y Shavrina, Tatiana
%Y Ferrand, Eric Le
%Y Malykh, Valentin
%Y Tyers, Francis
%Y Arkhangelskiy, Timofey
%Y Mikhailov, Vladislav
%Y Fenogenova, Alena
%S Proceedings of the first workshop on NLP applications to field linguistics
%D 2022
%8 October
%I International Conference on Computational Linguistics
%C Gyeongju, Republic of Korea
%F zaitsev-minchenko-2022-automatic
%X Linguistic borrowings occur in all languages. Andic languages of the Caucasus have borrowings from different donor-languages like Russian, Arabic, Persian. To automatically detect these borrowings, we propose a logistic regression model. The model was trained on the dataset which contains words in IPA from dictionaries of Andic languages. To improve model’s quality, we compared TfIdf and Count vectorizers and chose the second one. Besides, we added new features to the model. They were extracted using analysis of vectorizer features and using a language model. The model was evaluated by classification quality metrics (precision, recall and F1-score). The best average F1-score of all languages for words in IPA was about 0.78. Experiments showed that our model reaches good results not only with words in IPA but also with words in Cyrillic.
%U https://aclanthology.org/2022.fieldmatters-1.4
%P 34-41
Markdown (Informal)
[Automatic Detection of Borrowings in Low-Resource Languages of the Caucasus: Andic branch](https://aclanthology.org/2022.fieldmatters-1.4) (Zaitsev & Minchenko, FieldMatters 2022)
ACL