@inproceedings{ahuja-etal-2022-beyond,
title = "Beyond Static models and test sets: Benchmarking the potential of pre-trained models across tasks and languages",
author = "Ahuja, Kabir and
Dandapat, Sandipan and
Sitaram, Sunayana and
Choudhury, Monojit",
editor = "Shavrina, Tatiana and
Mikhailov, Vladislav and
Malykh, Valentin and
Artemova, Ekaterina and
Serikov, Oleg and
Protasov, Vitaly",
booktitle = "Proceedings of NLP Power! The First Workshop on Efficient Benchmarking in NLP",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.nlppower-1.7/",
doi = "10.18653/v1/2022.nlppower-1.7",
pages = "64--74",
abstract = "Although recent Massively Multilingual Language Models (MMLMs) like mBERT and XLMR support around 100 languages, most existing multilingual NLP benchmarks provide evaluation data in only a handful of these languages with little linguistic diversity. We argue that this makes the existing practices in multilingual evaluation unreliable and does not provide a full picture of the performance of MMLMs across the linguistic landscape. We propose that the recent work done in Performance Prediction for NLP tasks can serve as a potential solution in fixing benchmarking in Multilingual NLP by utilizing features related to data and language typology to estimate the performance of an MMLM on different languages. We compare performance prediction with translating test data with a case study on four different multilingual datasets, and observe that these methods can provide reliable estimates of the performance that are often on-par with the translation based approaches, without the need for any additional translation as well as evaluation costs."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ahuja-etal-2022-beyond">
<titleInfo>
<title>Beyond Static models and test sets: Benchmarking the potential of pre-trained models across tasks and languages</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kabir</namePart>
<namePart type="family">Ahuja</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandipan</namePart>
<namePart type="family">Dandapat</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sunayana</namePart>
<namePart type="family">Sitaram</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Monojit</namePart>
<namePart type="family">Choudhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of NLP Power! The First Workshop on Efficient Benchmarking in NLP</title>
</titleInfo>
<name type="personal">
<namePart type="given">Tatiana</namePart>
<namePart type="family">Shavrina</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vladislav</namePart>
<namePart type="family">Mikhailov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Valentin</namePart>
<namePart type="family">Malykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Artemova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Oleg</namePart>
<namePart type="family">Serikov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vitaly</namePart>
<namePart type="family">Protasov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Dublin, Ireland</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Although recent Massively Multilingual Language Models (MMLMs) like mBERT and XLMR support around 100 languages, most existing multilingual NLP benchmarks provide evaluation data in only a handful of these languages with little linguistic diversity. We argue that this makes the existing practices in multilingual evaluation unreliable and does not provide a full picture of the performance of MMLMs across the linguistic landscape. We propose that the recent work done in Performance Prediction for NLP tasks can serve as a potential solution in fixing benchmarking in Multilingual NLP by utilizing features related to data and language typology to estimate the performance of an MMLM on different languages. We compare performance prediction with translating test data with a case study on four different multilingual datasets, and observe that these methods can provide reliable estimates of the performance that are often on-par with the translation based approaches, without the need for any additional translation as well as evaluation costs.</abstract>
<identifier type="citekey">ahuja-etal-2022-beyond</identifier>
<identifier type="doi">10.18653/v1/2022.nlppower-1.7</identifier>
<location>
<url>https://aclanthology.org/2022.nlppower-1.7/</url>
</location>
<part>
<date>2022-05</date>
<extent unit="page">
<start>64</start>
<end>74</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Beyond Static models and test sets: Benchmarking the potential of pre-trained models across tasks and languages
%A Ahuja, Kabir
%A Dandapat, Sandipan
%A Sitaram, Sunayana
%A Choudhury, Monojit
%Y Shavrina, Tatiana
%Y Mikhailov, Vladislav
%Y Malykh, Valentin
%Y Artemova, Ekaterina
%Y Serikov, Oleg
%Y Protasov, Vitaly
%S Proceedings of NLP Power! The First Workshop on Efficient Benchmarking in NLP
%D 2022
%8 May
%I Association for Computational Linguistics
%C Dublin, Ireland
%F ahuja-etal-2022-beyond
%X Although recent Massively Multilingual Language Models (MMLMs) like mBERT and XLMR support around 100 languages, most existing multilingual NLP benchmarks provide evaluation data in only a handful of these languages with little linguistic diversity. We argue that this makes the existing practices in multilingual evaluation unreliable and does not provide a full picture of the performance of MMLMs across the linguistic landscape. We propose that the recent work done in Performance Prediction for NLP tasks can serve as a potential solution in fixing benchmarking in Multilingual NLP by utilizing features related to data and language typology to estimate the performance of an MMLM on different languages. We compare performance prediction with translating test data with a case study on four different multilingual datasets, and observe that these methods can provide reliable estimates of the performance that are often on-par with the translation based approaches, without the need for any additional translation as well as evaluation costs.
%R 10.18653/v1/2022.nlppower-1.7
%U https://aclanthology.org/2022.nlppower-1.7/
%U https://doi.org/10.18653/v1/2022.nlppower-1.7
%P 64-74
Markdown (Informal)
[Beyond Static models and test sets: Benchmarking the potential of pre-trained models across tasks and languages](https://aclanthology.org/2022.nlppower-1.7/) (Ahuja et al., nlppower 2022)
ACL
Kabir Ahuja, Sandipan Dandapat, Sunayana Sitaram, and Monojit Choudhury. 2022. Beyond Static models and test sets: Benchmarking the potential of pre-trained models across tasks and languages. In Proceedings of NLP Power! The First Workshop on Efficient Benchmarking in NLP, pages 64–74, Dublin, Ireland. Association for Computational Linguistics.