@inproceedings{liu-etal-2025-voxpopulitts,
title = "{V}oxpopuli{TTS}: a large-scale multilingual {TTS} corpus for zero-shot speech generation",
author = "Liu, Wenrui and
Bai, Jionghao and
Cheng, Xize and
Zuo, Jialong and
Jiang, Ziyue and
Ji, Shengpeng and
Fang, Minghui and
Yang, Xiaoda and
Yang, Qian and
Zhao, Zhou",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.685/",
pages = "10293--10297",
abstract = "In recent years, speech generation fields have achieved significant advancements, primarily due to improvements in large TTS (text-to-speech) systems and scalable TTS datasets. However, there is still a lack of large-scale multilingual TTS datasets, which limits the development of cross-language and multilingual TTS systems. Hence, we refine Voxpopuli dataset and propose VoxpopuliTTS dataset. This dataset comprises 30,000 hours of high-quality speech data, across 3 languages with multiple speakers and styles, suitable for various speech tasks such as TTS and ASR. To enhance the quality of speech data from Voxpopuli, we improve the existing processing pipeline by: 1) filtering out low-quality speech-text pairs based on ASR confidence scores, and 2) concatenating short transcripts by checking semantic information completeness to generate the long transcript. Experimental results demonstrate the effectiveness of the VoxpopuliTTS dataset and the proposed processing pipeline."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="liu-etal-2025-voxpopulitts">
<titleInfo>
<title>VoxpopuliTTS: a large-scale multilingual TTS corpus for zero-shot speech generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wenrui</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jionghao</namePart>
<namePart type="family">Bai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xize</namePart>
<namePart type="family">Cheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jialong</namePart>
<namePart type="family">Zuo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ziyue</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shengpeng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Minghui</namePart>
<namePart type="family">Fang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiaoda</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qian</namePart>
<namePart type="family">Yang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhou</namePart>
<namePart type="family">Zhao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In recent years, speech generation fields have achieved significant advancements, primarily due to improvements in large TTS (text-to-speech) systems and scalable TTS datasets. However, there is still a lack of large-scale multilingual TTS datasets, which limits the development of cross-language and multilingual TTS systems. Hence, we refine Voxpopuli dataset and propose VoxpopuliTTS dataset. This dataset comprises 30,000 hours of high-quality speech data, across 3 languages with multiple speakers and styles, suitable for various speech tasks such as TTS and ASR. To enhance the quality of speech data from Voxpopuli, we improve the existing processing pipeline by: 1) filtering out low-quality speech-text pairs based on ASR confidence scores, and 2) concatenating short transcripts by checking semantic information completeness to generate the long transcript. Experimental results demonstrate the effectiveness of the VoxpopuliTTS dataset and the proposed processing pipeline.</abstract>
<identifier type="citekey">liu-etal-2025-voxpopulitts</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.685/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>10293</start>
<end>10297</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T VoxpopuliTTS: a large-scale multilingual TTS corpus for zero-shot speech generation
%A Liu, Wenrui
%A Bai, Jionghao
%A Cheng, Xize
%A Zuo, Jialong
%A Jiang, Ziyue
%A Ji, Shengpeng
%A Fang, Minghui
%A Yang, Xiaoda
%A Yang, Qian
%A Zhao, Zhou
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Eugenio, Barbara Di
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F liu-etal-2025-voxpopulitts
%X In recent years, speech generation fields have achieved significant advancements, primarily due to improvements in large TTS (text-to-speech) systems and scalable TTS datasets. However, there is still a lack of large-scale multilingual TTS datasets, which limits the development of cross-language and multilingual TTS systems. Hence, we refine Voxpopuli dataset and propose VoxpopuliTTS dataset. This dataset comprises 30,000 hours of high-quality speech data, across 3 languages with multiple speakers and styles, suitable for various speech tasks such as TTS and ASR. To enhance the quality of speech data from Voxpopuli, we improve the existing processing pipeline by: 1) filtering out low-quality speech-text pairs based on ASR confidence scores, and 2) concatenating short transcripts by checking semantic information completeness to generate the long transcript. Experimental results demonstrate the effectiveness of the VoxpopuliTTS dataset and the proposed processing pipeline.
%U https://aclanthology.org/2025.coling-main.685/
%P 10293-10297
Markdown (Informal)
[VoxpopuliTTS: a large-scale multilingual TTS corpus for zero-shot speech generation](https://aclanthology.org/2025.coling-main.685/) (Liu et al., COLING 2025)
ACL
- Wenrui Liu, Jionghao Bai, Xize Cheng, Jialong Zuo, Ziyue Jiang, Shengpeng Ji, Minghui Fang, Xiaoda Yang, Qian Yang, and Zhou Zhao. 2025. VoxpopuliTTS: a large-scale multilingual TTS corpus for zero-shot speech generation. In Proceedings of the 31st International Conference on Computational Linguistics, pages 10293–10297, Abu Dhabi, UAE. Association for Computational Linguistics.