@inproceedings{lux-etal-2022-low,
title = "Low-Resource Multilingual and Zero-Shot Multispeaker {TTS}",
author = "Lux, Florian and
Koch, Julia and
Vu, Ngoc Thang",
editor = "He, Yulan and
Ji, Heng and
Li, Sujian and
Liu, Yang and
Chang, Chua-Hui",
booktitle = "Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)",
month = nov,
year = "2022",
address = "Online only",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.aacl-main.56/",
doi = "10.18653/v1/2022.aacl-main.56",
pages = "741--751",
abstract = "While neural methods for text-to-speech (TTS) have shown great advances in modeling multiple speakers, even in zero-shot settings, the amount of data needed for those approaches is generally not feasible for the vast majority of the world`s over 6,000 spoken languages. In this work, we bring together the tasks of zero-shot voice cloning and multilingual low-resource TTS. Using the language agnostic meta learning (LAML) procedure and modifications to a TTS encoder, we show that it is possible for a system to learn speaking a new language using just 5 minutes of training data while retaining the ability to infer the voice of even unseen speakers in the newly learned language. We show the success of our proposed approach in terms of intelligibility, naturalness and similarity to target speaker using objective metrics as well as human studies and provide our code and trained models open source."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lux-etal-2022-low">
<titleInfo>
<title>Low-Resource Multilingual and Zero-Shot Multispeaker TTS</title>
</titleInfo>
<name type="personal">
<namePart type="given">Florian</namePart>
<namePart type="family">Lux</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Julia</namePart>
<namePart type="family">Koch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ngoc</namePart>
<namePart type="given">Thang</namePart>
<namePart type="family">Vu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yulan</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sujian</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yang</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chua-Hui</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online only</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While neural methods for text-to-speech (TTS) have shown great advances in modeling multiple speakers, even in zero-shot settings, the amount of data needed for those approaches is generally not feasible for the vast majority of the world‘s over 6,000 spoken languages. In this work, we bring together the tasks of zero-shot voice cloning and multilingual low-resource TTS. Using the language agnostic meta learning (LAML) procedure and modifications to a TTS encoder, we show that it is possible for a system to learn speaking a new language using just 5 minutes of training data while retaining the ability to infer the voice of even unseen speakers in the newly learned language. We show the success of our proposed approach in terms of intelligibility, naturalness and similarity to target speaker using objective metrics as well as human studies and provide our code and trained models open source.</abstract>
<identifier type="citekey">lux-etal-2022-low</identifier>
<identifier type="doi">10.18653/v1/2022.aacl-main.56</identifier>
<location>
<url>https://aclanthology.org/2022.aacl-main.56/</url>
</location>
<part>
<date>2022-11</date>
<extent unit="page">
<start>741</start>
<end>751</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Low-Resource Multilingual and Zero-Shot Multispeaker TTS
%A Lux, Florian
%A Koch, Julia
%A Vu, Ngoc Thang
%Y He, Yulan
%Y Ji, Heng
%Y Li, Sujian
%Y Liu, Yang
%Y Chang, Chua-Hui
%S Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)
%D 2022
%8 November
%I Association for Computational Linguistics
%C Online only
%F lux-etal-2022-low
%X While neural methods for text-to-speech (TTS) have shown great advances in modeling multiple speakers, even in zero-shot settings, the amount of data needed for those approaches is generally not feasible for the vast majority of the world‘s over 6,000 spoken languages. In this work, we bring together the tasks of zero-shot voice cloning and multilingual low-resource TTS. Using the language agnostic meta learning (LAML) procedure and modifications to a TTS encoder, we show that it is possible for a system to learn speaking a new language using just 5 minutes of training data while retaining the ability to infer the voice of even unseen speakers in the newly learned language. We show the success of our proposed approach in terms of intelligibility, naturalness and similarity to target speaker using objective metrics as well as human studies and provide our code and trained models open source.
%R 10.18653/v1/2022.aacl-main.56
%U https://aclanthology.org/2022.aacl-main.56/
%U https://doi.org/10.18653/v1/2022.aacl-main.56
%P 741-751
Markdown (Informal)
[Low-Resource Multilingual and Zero-Shot Multispeaker TTS](https://aclanthology.org/2022.aacl-main.56/) (Lux et al., AACL-IJCNLP 2022)
ACL
- Florian Lux, Julia Koch, and Ngoc Thang Vu. 2022. Low-Resource Multilingual and Zero-Shot Multispeaker TTS. In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pages 741–751, Online only. Association for Computational Linguistics.