@inproceedings{latif-etal-2021-controlling,
title = "Controlling Prosody in End-to-End {TTS}: A Case Study on Contrastive Focus Generation",
author = "Latif, Siddique and
Kim, Inyoung and
Calapodescu, Ioan and
Besacier, Laurent",
editor = "Bisazza, Arianna and
Abend, Omri",
booktitle = "Proceedings of the 25th Conference on Computational Natural Language Learning",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.conll-1.42/",
doi = "10.18653/v1/2021.conll-1.42",
pages = "544--551",
abstract = "While End-2-End Text-to-Speech (TTS) has made significant progresses over the past few years, these systems still lack intuitive user controls over prosody. For instance, generating speech with fine-grained prosody control (prosodic prominence, contextually appropriate emotions) is still an open challenge. In this paper, we investigate whether we can control prosody directly from the input text, in order to code information related to contrastive focus which emphasizes a specific word that is contrary to the presuppositions of the interlocutor. We build and share a specific dataset for this purpose and show that it allows to train a TTS system were this fine-grained prosodic feature can be correctly conveyed using control tokens. Our evaluation compares synthetic and natural utterances and shows that prosodic patterns of contrastive focus (variations of Fo, Intensity and Duration) can be learnt accurately. Such a milestone is important to allow, for example, smart speakers to be programmatically controlled in terms of output prosody."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="latif-etal-2021-controlling">
<titleInfo>
<title>Controlling Prosody in End-to-End TTS: A Case Study on Contrastive Focus Generation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Siddique</namePart>
<namePart type="family">Latif</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Inyoung</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ioan</namePart>
<namePart type="family">Calapodescu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Laurent</namePart>
<namePart type="family">Besacier</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 25th Conference on Computational Natural Language Learning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Arianna</namePart>
<namePart type="family">Bisazza</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Omri</namePart>
<namePart type="family">Abend</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>While End-2-End Text-to-Speech (TTS) has made significant progresses over the past few years, these systems still lack intuitive user controls over prosody. For instance, generating speech with fine-grained prosody control (prosodic prominence, contextually appropriate emotions) is still an open challenge. In this paper, we investigate whether we can control prosody directly from the input text, in order to code information related to contrastive focus which emphasizes a specific word that is contrary to the presuppositions of the interlocutor. We build and share a specific dataset for this purpose and show that it allows to train a TTS system were this fine-grained prosodic feature can be correctly conveyed using control tokens. Our evaluation compares synthetic and natural utterances and shows that prosodic patterns of contrastive focus (variations of Fo, Intensity and Duration) can be learnt accurately. Such a milestone is important to allow, for example, smart speakers to be programmatically controlled in terms of output prosody.</abstract>
<identifier type="citekey">latif-etal-2021-controlling</identifier>
<identifier type="doi">10.18653/v1/2021.conll-1.42</identifier>
<location>
<url>https://aclanthology.org/2021.conll-1.42/</url>
</location>
<part>
<date>2021-11</date>
<extent unit="page">
<start>544</start>
<end>551</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Controlling Prosody in End-to-End TTS: A Case Study on Contrastive Focus Generation
%A Latif, Siddique
%A Kim, Inyoung
%A Calapodescu, Ioan
%A Besacier, Laurent
%Y Bisazza, Arianna
%Y Abend, Omri
%S Proceedings of the 25th Conference on Computational Natural Language Learning
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F latif-etal-2021-controlling
%X While End-2-End Text-to-Speech (TTS) has made significant progresses over the past few years, these systems still lack intuitive user controls over prosody. For instance, generating speech with fine-grained prosody control (prosodic prominence, contextually appropriate emotions) is still an open challenge. In this paper, we investigate whether we can control prosody directly from the input text, in order to code information related to contrastive focus which emphasizes a specific word that is contrary to the presuppositions of the interlocutor. We build and share a specific dataset for this purpose and show that it allows to train a TTS system were this fine-grained prosodic feature can be correctly conveyed using control tokens. Our evaluation compares synthetic and natural utterances and shows that prosodic patterns of contrastive focus (variations of Fo, Intensity and Duration) can be learnt accurately. Such a milestone is important to allow, for example, smart speakers to be programmatically controlled in terms of output prosody.
%R 10.18653/v1/2021.conll-1.42
%U https://aclanthology.org/2021.conll-1.42/
%U https://doi.org/10.18653/v1/2021.conll-1.42
%P 544-551
Markdown (Informal)
[Controlling Prosody in End-to-End TTS: A Case Study on Contrastive Focus Generation](https://aclanthology.org/2021.conll-1.42/) (Latif et al., CoNLL 2021)
ACL