@inproceedings{hernandez-mena-etal-2025-automatic,
title = "Automatic Validation of the Non-Validated {S}panish Speech Data of Common Voice 17.0",
author = "Hern{\'a}ndez Mena, Carlos Daniel and
Scalvini, Barbara and
L{\'a}g, D{\'a}vid {\'i}",
editor = "Holdt, {\v{S}}pela Arhar and
Ilinykh, Nikolai and
Scalvini, Barbara and
Bruton, Micaella and
Debess, Iben Nyholm and
Tudor, Crina Madalina",
booktitle = "Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)",
month = mar,
year = "2025",
address = "Tallinn, Estonia",
publisher = "University of Tartu Library, Estonia",
url = "https://aclanthology.org/2025.resourceful-1.12/",
pages = "58--63",
ISBN = "978-9908-53-121-2",
abstract = "Mozilla Common Voice is a crowdsourced project that aims to create a public, multilingual dataset of voice recordings for training speech recognition models. In Common Voice, anyone can contribute by donating or validating recordings in various languages. However, despite the availability of many recordings in certain languages, a significant percentage remains unvalidated by users. This is the case for Spanish, where in version 17.0 of Common Voice, 75{\%} of the 2,220 hours of recordings are unvalidated. In this work, we used the Whisper recognizer to automatically validate approximately 784 hours of recordings which are more than the 562 hours validated by users. To verify the accuracy of the validation, we developed a speech recognition model based on a version of NVIDIA-NeMo's Parakeet, which does not have an official Spanish version. Our final model achieved a WER of less than 4{\%} on the test and validation splits of Common Voice 17.0. Both the model and the speech corpus are publicly available on Hugging Face."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hernandez-mena-etal-2025-automatic">
<titleInfo>
<title>Automatic Validation of the Non-Validated Spanish Speech Data of Common Voice 17.0</title>
</titleInfo>
<name type="personal">
<namePart type="given">Carlos</namePart>
<namePart type="given">Daniel</namePart>
<namePart type="family">Hernández Mena</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Scalvini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dávid</namePart>
<namePart type="given">í</namePart>
<namePart type="family">Lág</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Špela</namePart>
<namePart type="given">Arhar</namePart>
<namePart type="family">Holdt</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nikolai</namePart>
<namePart type="family">Ilinykh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="family">Scalvini</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Micaella</namePart>
<namePart type="family">Bruton</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Iben</namePart>
<namePart type="given">Nyholm</namePart>
<namePart type="family">Debess</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Crina</namePart>
<namePart type="given">Madalina</namePart>
<namePart type="family">Tudor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>University of Tartu Library, Estonia</publisher>
<place>
<placeTerm type="text">Tallinn, Estonia</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">978-9908-53-121-2</identifier>
</relatedItem>
<abstract>Mozilla Common Voice is a crowdsourced project that aims to create a public, multilingual dataset of voice recordings for training speech recognition models. In Common Voice, anyone can contribute by donating or validating recordings in various languages. However, despite the availability of many recordings in certain languages, a significant percentage remains unvalidated by users. This is the case for Spanish, where in version 17.0 of Common Voice, 75% of the 2,220 hours of recordings are unvalidated. In this work, we used the Whisper recognizer to automatically validate approximately 784 hours of recordings which are more than the 562 hours validated by users. To verify the accuracy of the validation, we developed a speech recognition model based on a version of NVIDIA-NeMo’s Parakeet, which does not have an official Spanish version. Our final model achieved a WER of less than 4% on the test and validation splits of Common Voice 17.0. Both the model and the speech corpus are publicly available on Hugging Face.</abstract>
<identifier type="citekey">hernandez-mena-etal-2025-automatic</identifier>
<location>
<url>https://aclanthology.org/2025.resourceful-1.12/</url>
</location>
<part>
<date>2025-03</date>
<extent unit="page">
<start>58</start>
<end>63</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Automatic Validation of the Non-Validated Spanish Speech Data of Common Voice 17.0
%A Hernández Mena, Carlos Daniel
%A Scalvini, Barbara
%A Lág, Dávid í
%Y Holdt, Špela Arhar
%Y Ilinykh, Nikolai
%Y Scalvini, Barbara
%Y Bruton, Micaella
%Y Debess, Iben Nyholm
%Y Tudor, Crina Madalina
%S Proceedings of the Third Workshop on Resources and Representations for Under-Resourced Languages and Domains (RESOURCEFUL-2025)
%D 2025
%8 March
%I University of Tartu Library, Estonia
%C Tallinn, Estonia
%@ 978-9908-53-121-2
%F hernandez-mena-etal-2025-automatic
%X Mozilla Common Voice is a crowdsourced project that aims to create a public, multilingual dataset of voice recordings for training speech recognition models. In Common Voice, anyone can contribute by donating or validating recordings in various languages. However, despite the availability of many recordings in certain languages, a significant percentage remains unvalidated by users. This is the case for Spanish, where in version 17.0 of Common Voice, 75% of the 2,220 hours of recordings are unvalidated. In this work, we used the Whisper recognizer to automatically validate approximately 784 hours of recordings which are more than the 562 hours validated by users. To verify the accuracy of the validation, we developed a speech recognition model based on a version of NVIDIA-NeMo’s Parakeet, which does not have an official Spanish version. Our final model achieved a WER of less than 4% on the test and validation splits of Common Voice 17.0. Both the model and the speech corpus are publicly available on Hugging Face.
%U https://aclanthology.org/2025.resourceful-1.12/
%P 58-63
Markdown (Informal)
[Automatic Validation of the Non-Validated Spanish Speech Data of Common Voice 17.0](https://aclanthology.org/2025.resourceful-1.12/) (Hernández Mena et al., RESOURCEFUL 2025)
ACL