% Workshop paper record (ACL Anthology ID 2023.lchange-1.8).
% Proper nouns in the title are brace-protected as whole words so that
% sentence-casing bibliography styles keep their capitals.
@inproceedings{afanasev-2023-multi-lect,
  title     = {Multi-lect automatic detection of {Swadesh} list items from raw corpus data in {East} {Slavic} languages},
  author    = {Afanasev, Ilia},
  editor    = {Tahmasebi, Nina and
               Montariol, Syrielle and
               Dubossarsky, Haim and
               Kutuzov, Andrey and
               Hengchen, Simon and
               Alfter, David and
               Periti, Francesco and
               Cassotti, Pierluigi},
  booktitle = {Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.lchange-1.8/},
  doi       = {10.18653/v1/2023.lchange-1.8},
  pages     = {76--86},
  abstract  = {The article introduces a novel task of multi-lect automatic detection of Swadesh list items from raw corpora. The task aids the early stage of historical linguistics study by helping the researcher compile word lists for further analysis. In this paper, I test multi-lect automatic detection on the East Slavic lects' data. The training data consists of Ukrainian, Belarusian, and Russian material. I introduce a new dataset for the Ukrainian language. I implement data augmentation techniques to give automatic tools a better understanding of the searched value. The test data consists of the Old East Slavic texts. I train HMM, CRF, and mBERT models, then test and evaluate them by harmonic F1 score. The baseline is a Random Forest classifier. I introduce two different subtasks: the search for new Swadesh list items, and the search for the known Swadesh list items in new lects of the well-established group. The first subtask, given the simultaneously diverse and vague nature of the Swadesh list, currently presents an almost unbeatable challenge for machine learning methods. The second subtask, on the other hand, is easier, and the mBERT model achieves a 0.57 F1 score. This is an impressive result, given how hard it is to formalise the token belonging to a very specific and thematically diverse set of concepts.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="afanasev-2023-multi-lect">
<titleInfo>
<title>Multi-lect automatic detection of Swadesh list items from raw corpus data in East Slavic languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ilia</namePart>
<namePart type="family">Afanasev</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nina</namePart>
<namePart type="family">Tahmasebi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Syrielle</namePart>
<namePart type="family">Montariol</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haim</namePart>
<namePart type="family">Dubossarsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andrey</namePart>
<namePart type="family">Kutuzov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Hengchen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Alfter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francesco</namePart>
<namePart type="family">Periti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierluigi</namePart>
<namePart type="family">Cassotti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The article introduces a novel task of multi-lect automatic detection of Swadesh list items from raw corpora. The task aids the early stage of historical linguistics study by helping the researcher compile word lists for further analysis. In this paper, I test multi-lect automatic detection on the East Slavic lects’ data. The training data consists of Ukrainian, Belarusian, and Russian material. I introduce a new dataset for the Ukrainian language. I implement data augmentation techniques to give automatic tools a better understanding of the searched value. The test data consists of the Old East Slavic texts. I train HMM, CRF, and mBERT models, then test and evaluate them by harmonic F1 score. The baseline is a Random Forest classifier. I introduce two different subtasks: the search for new Swadesh list items, and the search for the known Swadesh list items in new lects of the well-established group. The first subtask, given the simultaneously diverse and vague nature of the Swadesh list, currently presents an almost unbeatable challenge for machine learning methods. The second subtask, on the other hand, is easier, and the mBERT model achieves a 0.57 F1 score. This is an impressive result, given how hard it is to formalise the token belonging to a very specific and thematically diverse set of concepts.</abstract>
<identifier type="citekey">afanasev-2023-multi-lect</identifier>
<identifier type="doi">10.18653/v1/2023.lchange-1.8</identifier>
<location>
<url>https://aclanthology.org/2023.lchange-1.8/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>76</start>
<end>86</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Multi-lect automatic detection of Swadesh list items from raw corpus data in East Slavic languages
%A Afanasev, Ilia
%Y Tahmasebi, Nina
%Y Montariol, Syrielle
%Y Dubossarsky, Haim
%Y Kutuzov, Andrey
%Y Hengchen, Simon
%Y Alfter, David
%Y Periti, Francesco
%Y Cassotti, Pierluigi
%S Proceedings of the 4th Workshop on Computational Approaches to Historical Language Change
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F afanasev-2023-multi-lect
%X The article introduces a novel task of multi-lect automatic detection of Swadesh list items from raw corpora. The task aids the early stage of historical linguistics study by helping the researcher compile word lists for further analysis. In this paper, I test multi-lect automatic detection on the East Slavic lects’ data. The training data consists of Ukrainian, Belarusian, and Russian material. I introduce a new dataset for the Ukrainian language. I implement data augmentation techniques to give automatic tools a better understanding of the searched value. The test data consists of the Old East Slavic texts. I train HMM, CRF, and mBERT models, then test and evaluate them by harmonic F1 score. The baseline is a Random Forest classifier. I introduce two different subtasks: the search for new Swadesh list items, and the search for the known Swadesh list items in new lects of the well-established group. The first subtask, given the simultaneously diverse and vague nature of the Swadesh list, currently presents an almost unbeatable challenge for machine learning methods. The second subtask, on the other hand, is easier, and the mBERT model achieves a 0.57 F1 score. This is an impressive result, given how hard it is to formalise the token belonging to a very specific and thematically diverse set of concepts.
%R 10.18653/v1/2023.lchange-1.8
%U https://aclanthology.org/2023.lchange-1.8/
%U https://doi.org/10.18653/v1/2023.lchange-1.8
%P 76-86
Markdown (Informal)
[Multi-lect automatic detection of Swadesh list items from raw corpus data in East Slavic languages](https://aclanthology.org/2023.lchange-1.8/) (Afanasev, LChange 2023)
ACL