@article{kharitonov-etal-2023-speak,
  author    = {Kharitonov, Eugene and
               Vincent, Damien and
               Borsos, Zal{\'a}n and
               Marinier, Rapha{\"e}l and
               Girgin, Sertan and
               Pietquin, Olivier and
               Sharifi, Matt and
               Tagliasacchi, Marco and
               Zeghidour, Neil},
  title     = {Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {11},
  pages     = {1703--1718},
  year      = {2023},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  doi       = {10.1162/tacl_a_00618},
  url       = {https://aclanthology.org/2023.tacl-1.95},
  abstract  = {We introduce SPEAR-TTS, a multi-speaker text-to-speech (TTS) system that can be trained with minimal supervision. By combining two types of discrete speech representations, we cast TTS as a composition of two sequence-to-sequence tasks: from text to high-level semantic tokens (akin to {``}reading{''}) and from semantic tokens to low-level acoustic tokens ({``}speaking{''}). Decoupling these two tasks enables training of the {``}speaking{''} module using abundant audio-only data, and unlocks the highly efficient combination of pretraining and backtranslation to reduce the need for parallel data when training the {``}reading{''} component. To control the speaker identity, we adopt example prompting, which allows SPEAR-TTS to generalize to unseen speakers using only a short sample of 3 seconds, without any explicit speaker representation or speaker labels. Our experiments demonstrate that SPEAR-TTS achieves a character error rate that is competitive with state-of-the-art methods using only 15 minutes of parallel data, while matching ground-truth speech in naturalness and acoustic quality.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kharitonov-etal-2023-speak">
<titleInfo>
<title>Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eugene</namePart>
<namePart type="family">Kharitonov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Damien</namePart>
<namePart type="family">Vincent</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zalán</namePart>
<namePart type="family">Borsos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raphaël</namePart>
<namePart type="family">Marinier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sertan</namePart>
<namePart type="family">Girgin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Olivier</namePart>
<namePart type="family">Pietquin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Matt</namePart>
<namePart type="family">Sharifi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marco</namePart>
<namePart type="family">Tagliasacchi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Neil</namePart>
<namePart type="family">Zeghidour</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>We introduce SPEAR-TTS, a multi-speaker text-to-speech (TTS) system that can be trained with minimal supervision. By combining two types of discrete speech representations, we cast TTS as a composition of two sequence-to-sequence tasks: from text to high-level semantic tokens (akin to “reading”) and from semantic tokens to low-level acoustic tokens (“speaking”). Decoupling these two tasks enables training of the “speaking” module using abundant audio-only data, and unlocks the highly efficient combination of pretraining and backtranslation to reduce the need for parallel data when training the “reading” component. To control the speaker identity, we adopt example prompting, which allows SPEAR-TTS to generalize to unseen speakers using only a short sample of 3 seconds, without any explicit speaker representation or speaker labels. Our experiments demonstrate that SPEAR-TTS achieves a character error rate that is competitive with state-of-the-art methods using only 15 minutes of parallel data, while matching ground-truth speech in naturalness and acoustic quality.</abstract>
<identifier type="citekey">kharitonov-etal-2023-speak</identifier>
<identifier type="doi">10.1162/tacl_a_00618</identifier>
<location>
<url>https://aclanthology.org/2023.tacl-1.95</url>
</location>
<part>
<date>2023</date>
<detail type="volume"><number>11</number></detail>
<extent unit="page">
<start>1703</start>
<end>1718</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision
%A Kharitonov, Eugene
%A Vincent, Damien
%A Borsos, Zalán
%A Marinier, Raphaël
%A Girgin, Sertan
%A Pietquin, Olivier
%A Sharifi, Matt
%A Tagliasacchi, Marco
%A Zeghidour, Neil
%J Transactions of the Association for Computational Linguistics
%D 2023
%V 11
%I MIT Press
%C Cambridge, MA
%F kharitonov-etal-2023-speak
%X We introduce SPEAR-TTS, a multi-speaker text-to-speech (TTS) system that can be trained with minimal supervision. By combining two types of discrete speech representations, we cast TTS as a composition of two sequence-to-sequence tasks: from text to high-level semantic tokens (akin to “reading”) and from semantic tokens to low-level acoustic tokens (“speaking”). Decoupling these two tasks enables training of the “speaking” module using abundant audio-only data, and unlocks the highly efficient combination of pretraining and backtranslation to reduce the need for parallel data when training the “reading” component. To control the speaker identity, we adopt example prompting, which allows SPEAR-TTS to generalize to unseen speakers using only a short sample of 3 seconds, without any explicit speaker representation or speaker labels. Our experiments demonstrate that SPEAR-TTS achieves a character error rate that is competitive with state-of-the-art methods using only 15 minutes of parallel data, while matching ground-truth speech in naturalness and acoustic quality.
%R 10.1162/tacl_a_00618
%U https://aclanthology.org/2023.tacl-1.95
%U https://doi.org/10.1162/tacl_a_00618
%P 1703-1718
Markdown (Informal)
[Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision](https://aclanthology.org/2023.tacl-1.95) (Kharitonov et al., TACL 2023)
ACL
- Eugene Kharitonov, Damien Vincent, Zalán Borsos, Raphaël Marinier, Sertan Girgin, Olivier Pietquin, Matt Sharifi, Marco Tagliasacchi, and Neil Zeghidour. 2023. Speak, Read and Prompt: High-Fidelity Text-to-Speech with Minimal Supervision. Transactions of the Association for Computational Linguistics, 11:1703–1718.