@inproceedings{costa-jussa-etal-2025-2m,
title = "2{M}-{BELEBELE}: Highly Multilingual Speech and {A}merican {S}ign {L}anguage Comprehension Dataset",
author = "Costa-juss{\`a}, Marta R. and
Yu, Bokai and
Andrews, Pierre and
Alastruey, Belen and
Camgoz, Necati Cihan and
Chuang, Joe and
Maillard, Jean and
Ropers, Christophe and
Turkatenko, Arina and
Wood, Carleigh",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.569/",
doi = "10.18653/v1/2025.findings-acl.569",
pages = "10893--10904",
ISBN = "979-8-89176-256-5",
abstract = "We introduce the first highly multilingual speech and American Sign Language (ASL) comprehension dataset by extending BELEBELE. Our dataset covers 91 spoken languages at the intersection of BELEBELE and FLEURS, and one sign language (ASL). As a by-product we also extend the Automatic Speech Recognition Benchmark, FLEURS, by 20{\%}. We evaluate 2M-BELEBELE dataset for both 5-shot and zero-shot settings and across languages, the speech comprehension accuracy is {\ensuremath{\approx}} 10{\%} average lower compared to reading comprehension."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="costa-jussa-etal-2025-2m">
<titleInfo>
<title>2M-BELEBELE: Highly Multilingual Speech and American Sign Language Comprehension Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Marta</namePart>
<namePart type="given">R</namePart>
<namePart type="family">Costa-jussà</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bokai</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pierre</namePart>
<namePart type="family">Andrews</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Belen</namePart>
<namePart type="family">Alastruey</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Necati</namePart>
<namePart type="given">Cihan</namePart>
<namePart type="family">Camgoz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joe</namePart>
<namePart type="family">Chuang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jean</namePart>
<namePart type="family">Maillard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Christophe</namePart>
<namePart type="family">Ropers</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arina</namePart>
<namePart type="family">Turkatenko</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carleigh</namePart>
<namePart type="family">Wood</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>We introduce the first highly multilingual speech and American Sign Language (ASL) comprehension dataset by extending BELEBELE. Our dataset covers 91 spoken languages at the intersection of BELEBELE and FLEURS, and one sign language (ASL). As a by-product we also extend the Automatic Speech Recognition Benchmark, FLEURS, by 20%. We evaluate 2M-BELEBELE dataset for both 5-shot and zero-shot settings and across languages, the speech comprehension accuracy is ≈ 10% average lower compared to reading comprehension.</abstract>
<identifier type="citekey">costa-jussa-etal-2025-2m</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.569</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.569/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>10893</start>
<end>10904</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T 2M-BELEBELE: Highly Multilingual Speech and American Sign Language Comprehension Dataset
%A Costa-jussà, Marta R.
%A Yu, Bokai
%A Andrews, Pierre
%A Alastruey, Belen
%A Camgoz, Necati Cihan
%A Chuang, Joe
%A Maillard, Jean
%A Ropers, Christophe
%A Turkatenko, Arina
%A Wood, Carleigh
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F costa-jussa-etal-2025-2m
%X We introduce the first highly multilingual speech and American Sign Language (ASL) comprehension dataset by extending BELEBELE. Our dataset covers 91 spoken languages at the intersection of BELEBELE and FLEURS, and one sign language (ASL). As a by-product we also extend the Automatic Speech Recognition Benchmark, FLEURS, by 20%. We evaluate 2M-BELEBELE dataset for both 5-shot and zero-shot settings and across languages, the speech comprehension accuracy is ≈ 10% average lower compared to reading comprehension.
%R 10.18653/v1/2025.findings-acl.569
%U https://aclanthology.org/2025.findings-acl.569/
%U https://doi.org/10.18653/v1/2025.findings-acl.569
%P 10893-10904
Markdown (Informal)
[2M-BELEBELE: Highly Multilingual Speech and American Sign Language Comprehension Dataset](https://aclanthology.org/2025.findings-acl.569/) (Costa-jussà et al., Findings 2025)
ACL
- Marta R. Costa-jussà, Bokai Yu, Pierre Andrews, Belen Alastruey, Necati Cihan Camgoz, Joe Chuang, Jean Maillard, Christophe Ropers, Arina Turkatenko, and Carleigh Wood. 2025. 2M-BELEBELE: Highly Multilingual Speech and American Sign Language Comprehension Dataset. In Findings of the Association for Computational Linguistics: ACL 2025, pages 10893–10904, Vienna, Austria. Association for Computational Linguistics.