@inproceedings{ogunremi-etal-2023-mini,
title = "Mini But Mighty: Efficient Multilingual Pretraining with Linguistically-Informed Data Selection",
author = "Ogunremi, Tolulope and
Jurafsky, Dan and
Manning, Christopher",
editor = "Vlachos, Andreas and
Augenstein, Isabelle",
booktitle = "Findings of the Association for Computational Linguistics: EACL 2023",
month = may,
year = "2023",
address = "Dubrovnik, Croatia",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-eacl.93",
doi = "10.18653/v1/2023.findings-eacl.93",
pages = "1251--1266",
abstract = "With the prominence of large pretrained language models, low-resource languages are rarely modelled monolingually and become victims of the {``}curse of multilinguality{''} in massively multilingual models. Recently, AfriBERTa showed that training transformer models from scratch on 1GB of data from many unrelated African languages outperforms massively multilingual models on downstream NLP tasks. Here we extend this direction, focusing on the use of related languages. We propose that training on smaller amounts of data but from related languages could match the performance of models trained on large, unrelated data. We test our hypothesis on the Niger-Congo family and its Bantu and Volta-Niger sub-families, pretraining models with data solely from Niger-Congo languages and finetuning on 4 downstream tasks: NER, part-of-speech tagging, sentiment analysis and text classification. We find that models trained on genetically related languages achieve equal performance on downstream tasks in low-resource languages despite using less training data. We recommend selecting training data based on language-relatedness when pretraining language models for low-resource languages.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ogunremi-etal-2023-mini">
  <titleInfo>
    <title>Mini But Mighty: Efficient Multilingual Pretraining with Linguistically-Informed Data Selection</title>
  </titleInfo>
  <name type="personal">
    <namePart type="given">Tolulope</namePart>
    <namePart type="family">Ogunremi</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Dan</namePart>
    <namePart type="family">Jurafsky</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Christopher</namePart>
    <namePart type="family">Manning</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <originInfo>
    <dateIssued>2023-05</dateIssued>
  </originInfo>
  <typeOfResource>text</typeOfResource>
  <relatedItem type="host">
    <titleInfo>
      <title>Findings of the Association for Computational Linguistics: EACL 2023</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Andreas</namePart>
      <namePart type="family">Vlachos</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Isabelle</namePart>
      <namePart type="family">Augenstein</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <originInfo>
      <publisher>Association for Computational Linguistics</publisher>
      <place>
        <placeTerm type="text">Dubrovnik, Croatia</placeTerm>
      </place>
    </originInfo>
    <genre authority="marcgt">conference publication</genre>
  </relatedItem>
  <abstract>With the prominence of large pretrained language models, low-resource languages are rarely modelled monolingually and become victims of the “curse of multilinguality” in massively multilingual models. Recently, AfriBERTa showed that training transformer models from scratch on 1GB of data from many unrelated African languages outperforms massively multilingual models on downstream NLP tasks. Here we extend this direction, focusing on the use of related languages. We propose that training on smaller amounts of data but from related languages could match the performance of models trained on large, unrelated data. We test our hypothesis on the Niger-Congo family and its Bantu and Volta-Niger sub-families, pretraining models with data solely from Niger-Congo languages and finetuning on 4 downstream tasks: NER, part-of-speech tagging, sentiment analysis and text classification. We find that models trained on genetically related languages achieve equal performance on downstream tasks in low-resource languages despite using less training data. We recommend selecting training data based on language-relatedness when pretraining language models for low-resource languages.</abstract>
  <identifier type="citekey">ogunremi-etal-2023-mini</identifier>
  <identifier type="doi">10.18653/v1/2023.findings-eacl.93</identifier>
  <location>
    <url>https://aclanthology.org/2023.findings-eacl.93</url>
  </location>
  <part>
    <date>2023-05</date>
    <extent unit="page">
      <start>1251</start>
      <end>1266</end>
    </extent>
  </part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Mini But Mighty: Efficient Multilingual Pretraining with Linguistically-Informed Data Selection
%A Ogunremi, Tolulope
%A Jurafsky, Dan
%A Manning, Christopher
%Y Vlachos, Andreas
%Y Augenstein, Isabelle
%S Findings of the Association for Computational Linguistics: EACL 2023
%D 2023
%8 May
%I Association for Computational Linguistics
%C Dubrovnik, Croatia
%F ogunremi-etal-2023-mini
%X With the prominence of large pretrained language models, low-resource languages are rarely modelled monolingually and become victims of the “curse of multilinguality” in massively multilingual models. Recently, AfriBERTa showed that training transformer models from scratch on 1GB of data from many unrelated African languages outperforms massively multilingual models on downstream NLP tasks. Here we extend this direction, focusing on the use of related languages. We propose that training on smaller amounts of data but from related languages could match the performance of models trained on large, unrelated data. We test our hypothesis on the Niger-Congo family and its Bantu and Volta-Niger sub-families, pretraining models with data solely from Niger-Congo languages and finetuning on 4 downstream tasks: NER, part-of-speech tagging, sentiment analysis and text classification. We find that models trained on genetically related languages achieve equal performance on downstream tasks in low-resource languages despite using less training data. We recommend selecting training data based on language-relatedness when pretraining language models for low-resource languages.
%R 10.18653/v1/2023.findings-eacl.93
%U https://aclanthology.org/2023.findings-eacl.93
%U https://doi.org/10.18653/v1/2023.findings-eacl.93
%P 1251-1266
Markdown (Informal)
[Mini But Mighty: Efficient Multilingual Pretraining with Linguistically-Informed Data Selection](https://aclanthology.org/2023.findings-eacl.93) (Ogunremi et al., Findings 2023)
ACL
Tolulope Ogunremi, Dan Jurafsky, and Christopher Manning. 2023. Mini But Mighty: Efficient Multilingual Pretraining with Linguistically-Informed Data Selection. In Findings of the Association for Computational Linguistics: EACL 2023, pages 1251–1266, Dubrovnik, Croatia. Association for Computational Linguistics.