@inproceedings{lorenzo-trueba-etal-2016-continuous,
title = "Continuous Expressive Speaking Styles Synthesis based on {CVSM} and {MR}-{HMM}",
author = "Lorenzo-Trueba, Jaime and
Barra-Chicote, Roberto and
Gallardo-Antolin, Ascension and
Yamagishi, Junichi and
Montero, Juan M.",
editor = "Matsumoto, Yuji and
Prasad, Rashmi",
booktitle = "Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
url = "https://aclanthology.org/C16-1036",
pages = "369--376",
abstract = "This paper introduces a continuous system capable of automatically producing the most adequate speaking style to synthesize a desired target text. This is done thanks to a joint modeling of the acoustic and lexical parameters of the speaker models by adapting the CVSM projection of the training texts using MR-HMM techniques. As such, we consider that as long as sufficient variety in the training data is available, we should be able to model a continuous lexical space into a continuous acoustic space. The proposed continuous automatic text to speech system was evaluated by means of a perceptual evaluation in order to compare them with traditional approaches to the task. The system proved to be capable of conveying the correct expressiveness (average adequacy of 3.6) with an expressive strength comparable to oracle traditional expressive speech synthesis (average of 3.6) although with a drop in speech quality mainly due to the semi-continuous nature of the data (average quality of 2.9). This means that the proposed system is capable of improving traditional neutral systems without requiring any additional user interaction.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="lorenzo-trueba-etal-2016-continuous">
<titleInfo>
<title>Continuous Expressive Speaking Styles Synthesis based on CVSM and MR-HMM</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jaime</namePart>
<namePart type="family">Lorenzo-Trueba</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Barra-Chicote</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ascension</namePart>
<namePart type="family">Gallardo-Antolin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Junichi</namePart>
<namePart type="family">Yamagishi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="given">M</namePart>
<namePart type="family">Montero</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2016-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yuji</namePart>
<namePart type="family">Matsumoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rashmi</namePart>
<namePart type="family">Prasad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The COLING 2016 Organizing Committee</publisher>
<place>
<placeTerm type="text">Osaka, Japan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces a continuous system capable of automatically producing the most adequate speaking style to synthesize a desired target text. This is done thanks to a joint modeling of the acoustic and lexical parameters of the speaker models by adapting the CVSM projection of the training texts using MR-HMM techniques. As such, we consider that as long as sufficient variety in the training data is available, we should be able to model a continuous lexical space into a continuous acoustic space. The proposed continuous automatic text to speech system was evaluated by means of a perceptual evaluation in order to compare them with traditional approaches to the task. The system proved to be capable of conveying the correct expressiveness (average adequacy of 3.6) with an expressive strength comparable to oracle traditional expressive speech synthesis (average of 3.6) although with a drop in speech quality mainly due to the semi-continuous nature of the data (average quality of 2.9). This means that the proposed system is capable of improving traditional neutral systems without requiring any additional user interaction.</abstract>
<identifier type="citekey">lorenzo-trueba-etal-2016-continuous</identifier>
<location>
<url>https://aclanthology.org/C16-1036</url>
</location>
<part>
<date>2016-12</date>
<extent unit="page">
<start>369</start>
<end>376</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Continuous Expressive Speaking Styles Synthesis based on CVSM and MR-HMM
%A Lorenzo-Trueba, Jaime
%A Barra-Chicote, Roberto
%A Gallardo-Antolin, Ascension
%A Yamagishi, Junichi
%A Montero, Juan M.
%Y Matsumoto, Yuji
%Y Prasad, Rashmi
%S Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers
%D 2016
%8 December
%I The COLING 2016 Organizing Committee
%C Osaka, Japan
%F lorenzo-trueba-etal-2016-continuous
%X This paper introduces a continuous system capable of automatically producing the most adequate speaking style to synthesize a desired target text. This is done thanks to a joint modeling of the acoustic and lexical parameters of the speaker models by adapting the CVSM projection of the training texts using MR-HMM techniques. As such, we consider that as long as sufficient variety in the training data is available, we should be able to model a continuous lexical space into a continuous acoustic space. The proposed continuous automatic text to speech system was evaluated by means of a perceptual evaluation in order to compare them with traditional approaches to the task. The system proved to be capable of conveying the correct expressiveness (average adequacy of 3.6) with an expressive strength comparable to oracle traditional expressive speech synthesis (average of 3.6) although with a drop in speech quality mainly due to the semi-continuous nature of the data (average quality of 2.9). This means that the proposed system is capable of improving traditional neutral systems without requiring any additional user interaction.
%U https://aclanthology.org/C16-1036
%P 369-376
Markdown (Informal)
[Continuous Expressive Speaking Styles Synthesis based on CVSM and MR-HMM](https://aclanthology.org/C16-1036) (Lorenzo-Trueba et al., COLING 2016)
ACL
- Jaime Lorenzo-Trueba, Roberto Barra-Chicote, Ascension Gallardo-Antolin, Junichi Yamagishi, and Juan M. Montero. 2016. Continuous Expressive Speaking Styles Synthesis based on CVSM and MR-HMM. In Proceedings of COLING 2016, the 26th International Conference on Computational Linguistics: Technical Papers, pages 369–376, Osaka, Japan. The COLING 2016 Organizing Committee.