@inproceedings{kuriyozov-etal-2024-bertbek,
title = "{BERT}bek: A Pretrained Language Model for {U}zbek",
author = "Kuriyozov, Elmurod and
Vilares, David and
G{\'o}mez-Rodr{\'\i}guez, Carlos",
editor = "Melero, Maite and
Sakti, Sakriani and
Soria, Claudia",
booktitle = "Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://aclanthology.org/2024.sigul-1.5",
pages = "33--44",
abstract = "Recent advances in neural networks based language representation made it possible for pretrained language models to outperform previous models in many downstream natural language processing (NLP) tasks. These pretrained language models have also shown that if large enough, they exhibit good few-shot abilities, which is especially beneficial for low-resource scenarios. In this respect, although there are some large-scale multilingual pretrained language models available, language-specific pretrained models have demonstrated to be more accurate for monolingual evaluation setups. In this work, we present BERTbek - pretrained language models based on the BERT (Bidirectional Encoder Representations from Transformers) architecture for the low-resource Uzbek language. We also provide a comprehensive evaluation of the models on a number of NLP tasks: sentiment analysis, multi-label topic classification, and named entity recognition, comparing the models with various machine learning methods as well as multilingual BERT (mBERT). Experimental results indicate that our models outperform mBERT and other task-specific baseline models in all three tasks. Additionally, we also show the impact of training data size and quality on the downstream performance of BERT models, by training three different models with different text sources and corpus sizes.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kuriyozov-etal-2024-bertbek">
<titleInfo>
<title>BERTbek: A Pretrained Language Model for Uzbek</title>
</titleInfo>
<name type="personal">
<namePart type="given">Elmurod</namePart>
<namePart type="family">Kuriyozov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Vilares</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carlos</namePart>
<namePart type="family">Gómez-Rodríguez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maite</namePart>
<namePart type="family">Melero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sakriani</namePart>
<namePart type="family">Sakti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Soria</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>ELRA and ICCL</publisher>
<place>
<placeTerm type="text">Torino, Italia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>Recent advances in neural network-based language representation have made it possible for pretrained language models to outperform previous models on many downstream natural language processing (NLP) tasks. These pretrained language models have also shown that, if large enough, they exhibit good few-shot abilities, which is especially beneficial for low-resource scenarios. In this respect, although some large-scale multilingual pretrained language models are available, language-specific pretrained models have proven to be more accurate in monolingual evaluation setups. In this work, we present BERTbek, a family of pretrained language models based on the BERT (Bidirectional Encoder Representations from Transformers) architecture for the low-resource Uzbek language. We also provide a comprehensive evaluation of the models on a number of NLP tasks: sentiment analysis, multi-label topic classification, and named entity recognition, comparing them with various machine learning methods as well as multilingual BERT (mBERT). Experimental results indicate that our models outperform mBERT and other task-specific baselines on all three tasks. Additionally, we show the impact of training data size and quality on the downstream performance of BERT models by training three models with different text sources and corpus sizes.</abstract>
<identifier type="citekey">kuriyozov-etal-2024-bertbek</identifier>
<location>
<url>https://aclanthology.org/2024.sigul-1.5</url>
</location>
<part>
<date>2024-05</date>
<extent unit="page">
<start>33</start>
<end>44</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BERTbek: A Pretrained Language Model for Uzbek
%A Kuriyozov, Elmurod
%A Vilares, David
%A Gómez-Rodríguez, Carlos
%Y Melero, Maite
%Y Sakti, Sakriani
%Y Soria, Claudia
%S Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024
%D 2024
%8 May
%I ELRA and ICCL
%C Torino, Italia
%F kuriyozov-etal-2024-bertbek
%X Recent advances in neural network-based language representation have made it possible for pretrained language models to outperform previous models on many downstream natural language processing (NLP) tasks. These pretrained language models have also shown that, if large enough, they exhibit good few-shot abilities, which is especially beneficial for low-resource scenarios. In this respect, although some large-scale multilingual pretrained language models are available, language-specific pretrained models have proven to be more accurate in monolingual evaluation setups. In this work, we present BERTbek, a family of pretrained language models based on the BERT (Bidirectional Encoder Representations from Transformers) architecture for the low-resource Uzbek language. We also provide a comprehensive evaluation of the models on a number of NLP tasks: sentiment analysis, multi-label topic classification, and named entity recognition, comparing them with various machine learning methods as well as multilingual BERT (mBERT). Experimental results indicate that our models outperform mBERT and other task-specific baselines on all three tasks. Additionally, we show the impact of training data size and quality on the downstream performance of BERT models by training three models with different text sources and corpus sizes.
%U https://aclanthology.org/2024.sigul-1.5
%P 33-44
Markdown (Informal)
[BERTbek: A Pretrained Language Model for Uzbek](https://aclanthology.org/2024.sigul-1.5) (Kuriyozov et al., SIGUL-WS 2024)
ACL
Elmurod Kuriyozov, David Vilares, and Carlos Gómez-Rodríguez. 2024. BERTbek: A Pretrained Language Model for Uzbek. In Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024, pages 33–44, Torino, Italia. ELRA and ICCL.
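
Usage note: the abstract describes BERT-architecture checkpoints for Uzbek that are evaluated by fine-tuning on sentiment analysis, topic classification, and named entity recognition. The sketch below illustrates that downstream pattern with the Hugging Face transformers library. It is only an illustration: the model identifier `bertbek-placeholder`, the binary sentiment labels, and the example sentences are assumptions, not details taken from the paper or this record.

```python
# Minimal sketch: fine-tuning a BERT-style Uzbek model for sentiment analysis.
# "bertbek-placeholder" is a hypothetical model id; substitute the actual
# released BERTbek checkpoint, or "bert-base-multilingual-cased" for an
# mBERT-style baseline comparable to the one mentioned in the abstract.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "bertbek-placeholder"  # hypothetical identifier, not from the paper

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# A tiny batch of Uzbek sentences with assumed binary sentiment labels.
texts = ["Bu kitob juda yaxshi ekan.", "Film menga umuman yoqmadi."]
labels = torch.tensor([1, 0])
batch = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# One illustrative training step; a real setup would loop over a DataLoader.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
model.train()
outputs = model(**batch, labels=labels)
outputs.loss.backward()
optimizer.step()

# Inference: predicted class per sentence.
model.eval()
with torch.no_grad():
    preds = model(**batch).logits.argmax(dim=-1)
print(preds.tolist())
```

Swapping `MODEL_NAME` for a multilingual checkpoint such as `bert-base-multilingual-cased` would give a comparable mBERT starting point, which is the baseline the abstract reports the monolingual models outperforming.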