@inproceedings{evaldo-leal-etal-2025-mupe,
title = "{M}u{P}e Life Stories Dataset: Spontaneous Speech in {B}razilian {P}ortuguese with a Case Study Evaluation on {ASR} Bias against Speakers Groups and Topic Modeling",
author = "Evaldo Leal, Sidney and
Candido Junior, Arnaldo and
Marcacini, Ricardo and
Casanova, Edresson and
Gon{\c{c}}alves, Odilon and
Silva Soares, Anderson and
Freitas Lima, Rodrigo and
Stefanel Gris, Lucas Rafael and
Alu{\'i}sio, Sandra",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.407/",
pages = "6076--6087",
abstract = "Recently, several public datasets for automatic speech recognition (ASR) in Brazilian Portuguese (BP) have been released, improving ASR systems performance. However, these datasets lack diversity in terms of age groups, regional accents, and education levels. In this paper, we present a new publicly available dataset consisting of 289 life story interviews (365 hours), featuring a broad range of speakers varying in age, education, and regional accents. First, we demonstrated the presence of bias in current BP ASR models concerning education levels and age groups. Second, we showed that our dataset helps mitigate these biases. Additionally, an ASR model trained on our dataset performed better during evaluation on a diverse test set. Finally, the ASR model trained with our dataset was extrinsically evaluated through a topic modeling task that utilized the automatically transcribed output."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="evaldo-leal-etal-2025-mupe">
<titleInfo>
<title>MuPe Life Stories Dataset: Spontaneous Speech in Brazilian Portuguese with a Case Study Evaluation on ASR Bias against Speakers Groups and Topic Modeling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sidney</namePart>
<namePart type="family">Evaldo Leal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Arnaldo</namePart>
<namePart type="family">Candido Junior</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ricardo</namePart>
<namePart type="family">Marcacini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Edresson</namePart>
<namePart type="family">Casanova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Odilon</namePart>
<namePart type="family">Gonçalves</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anderson</namePart>
<namePart type="family">Silva Soares</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rodrigo</namePart>
<namePart type="family">Freitas Lima</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucas</namePart>
<namePart type="given">Rafael</namePart>
<namePart type="family">Stefanel Gris</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sandra</namePart>
<namePart type="family">Aluísio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recently, several public datasets for automatic speech recognition (ASR) in Brazilian Portuguese (BP) have been released, improving ASR systems performance. However, these datasets lack diversity in terms of age groups, regional accents, and education levels. In this paper, we present a new publicly available dataset consisting of 289 life story interviews (365 hours), featuring a broad range of speakers varying in age, education, and regional accents. First, we demonstrated the presence of bias in current BP ASR models concerning education levels and age groups. Second, we showed that our dataset helps mitigate these biases. Additionally, an ASR model trained on our dataset performed better during evaluation on a diverse test set. Finally, the ASR model trained with our dataset was extrinsically evaluated through a topic modeling task that utilized the automatically transcribed output.</abstract>
<identifier type="citekey">evaldo-leal-etal-2025-mupe</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.407/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>6076</start>
<end>6087</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T MuPe Life Stories Dataset: Spontaneous Speech in Brazilian Portuguese with a Case Study Evaluation on ASR Bias against Speakers Groups and Topic Modeling
%A Evaldo Leal, Sidney
%A Candido Junior, Arnaldo
%A Marcacini, Ricardo
%A Casanova, Edresson
%A Gonçalves, Odilon
%A Silva Soares, Anderson
%A Freitas Lima, Rodrigo
%A Stefanel Gris, Lucas Rafael
%A Aluísio, Sandra
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F evaldo-leal-etal-2025-mupe
%X Recently, several public datasets for automatic speech recognition (ASR) in Brazilian Portuguese (BP) have been released, improving ASR systems performance. However, these datasets lack diversity in terms of age groups, regional accents, and education levels. In this paper, we present a new publicly available dataset consisting of 289 life story interviews (365 hours), featuring a broad range of speakers varying in age, education, and regional accents. First, we demonstrated the presence of bias in current BP ASR models concerning education levels and age groups. Second, we showed that our dataset helps mitigate these biases. Additionally, an ASR model trained on our dataset performed better during evaluation on a diverse test set. Finally, the ASR model trained with our dataset was extrinsically evaluated through a topic modeling task that utilized the automatically transcribed output.
%U https://aclanthology.org/2025.coling-main.407/
%P 6076-6087
Markdown (Informal)
[MuPe Life Stories Dataset: Spontaneous Speech in Brazilian Portuguese with a Case Study Evaluation on ASR Bias against Speakers Groups and Topic Modeling](https://aclanthology.org/2025.coling-main.407/) (Evaldo Leal et al., COLING 2025)
ACL
- Sidney Evaldo Leal, Arnaldo Candido Junior, Ricardo Marcacini, Edresson Casanova, Odilon Gonçalves, Anderson Silva Soares, Rodrigo Freitas Lima, Lucas Rafael Stefanel Gris, and Sandra Aluísio. 2025. MuPe Life Stories Dataset: Spontaneous Speech in Brazilian Portuguese with a Case Study Evaluation on ASR Bias against Speakers Groups and Topic Modeling. In Proceedings of the 31st International Conference on Computational Linguistics, pages 6076–6087, Abu Dhabi, UAE. Association for Computational Linguistics.