@inproceedings{wang-huang-2021-incorporating,
title = "Incorporating speaker embedding and post-filter network for improving speaker similarity of personalized speech synthesis system",
author = "Wang, Sheng-Yao and
Huang, Yi-Chin",
editor = "Lee, Lung-Hao and
Chang, Chia-Hui and
Chen, Kuan-Yu",
booktitle = "Proceedings of the 33rd Conference on Computational Linguistics and Speech Processing (ROCLING 2021)",
month = oct,
year = "2021",
address = "Taoyuan, Taiwan",
publisher = "The Association for Computational Linguistics and Chinese Language Processing (ACLCLP)",
url = "https://aclanthology.org/2021.rocling-1.42",
pages = "326--332",
abstract = "In recent years, speech synthesis system can generate speech with high speech quality. However, multi-speaker text-to-speech (TTS) system still require large amount of speech data for each target speaker. In this study, we would like to construct a multi-speaker TTS system by incorporating two sub modules into artificial neural network-based speech synthesis system to alleviate this problem. First module is to add speaker embedding into encoding module for generating speech while a large amount of the speech data from target speaker is not necessary. For speaker embedding method, in our study, two main speaker embedding methods, namely speaker verification embedding and voice conversion embedding, are compared to deciding which one is suitable for our personalized TTS system. Second, we substituted the conventional post-net module, which is adopted to enhance the output spectrum sequence, to further improving the speech quality of the generated speech utterance. Here, a post-filter network is used. Finally, experiment results showed that the speaker embedding is useful by adding it into encoding module and the resultant speech utterance indeed perceived as the target speaker. Also, the post-filter network not only improving the speech quality and also enhancing the speaker similarity of the generated speech utterances. The constructed TTS system can generate a speech utterance of the target speaker in fewer than 2 seconds. In the future, we would like to further investigate the controllability of the speaking rate or perceived emotion state of the generated speech.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-huang-2021-incorporating">
<titleInfo>
<title>Incorporating speaker embedding and post-filter network for improving speaker similarity of personalized speech synthesis system</title>
</titleInfo>
<name type="personal">
<namePart type="given">Sheng-Yao</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi-Chin</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 33rd Conference on Computational Linguistics and Speech Processing (ROCLING 2021)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lung-Hao</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chia-Hui</namePart>
<namePart type="family">Chang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kuan-Yu</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>The Association for Computational Linguistics and Chinese Language Processing (ACLCLP)</publisher>
<place>
<placeTerm type="text">Taoyuan, Taiwan</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In recent years, speech synthesis system can generate speech with high speech quality. However, multi-speaker text-to-speech (TTS) system still require large amount of speech data for each target speaker. In this study, we would like to construct a multi-speaker TTS system by incorporating two sub modules into artificial neural network-based speech synthesis system to alleviate this problem. First module is to add speaker embedding into encoding module for generating speech while a large amount of the speech data from target speaker is not necessary. For speaker embedding method, in our study, two main speaker embedding methods, namely speaker verification embedding and voice conversion embedding, are compared to deciding which one is suitable for our personalized TTS system. Second, we substituted the conventional post-net module, which is adopted to enhance the output spectrum sequence, to further improving the speech quality of the generated speech utterance. Here, a post-filter network is used. Finally, experiment results showed that the speaker embedding is useful by adding it into encoding module and the resultant speech utterance indeed perceived as the target speaker. Also, the post-filter network not only improving the speech quality and also enhancing the speaker similarity of the generated speech utterances. The constructed TTS system can generate a speech utterance of the target speaker in fewer than 2 seconds. In the future, we would like to further investigate the controllability of the speaking rate or perceived emotion state of the generated speech.</abstract>
<identifier type="citekey">wang-huang-2021-incorporating</identifier>
<location>
<url>https://aclanthology.org/2021.rocling-1.42</url>
</location>
<part>
<date>2021-10</date>
<extent unit="page">
<start>326</start>
<end>332</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Incorporating speaker embedding and post-filter network for improving speaker similarity of personalized speech synthesis system
%A Wang, Sheng-Yao
%A Huang, Yi-Chin
%Y Lee, Lung-Hao
%Y Chang, Chia-Hui
%Y Chen, Kuan-Yu
%S Proceedings of the 33rd Conference on Computational Linguistics and Speech Processing (ROCLING 2021)
%D 2021
%8 October
%I The Association for Computational Linguistics and Chinese Language Processing (ACLCLP)
%C Taoyuan, Taiwan
%F wang-huang-2021-incorporating
%X In recent years, speech synthesis system can generate speech with high speech quality. However, multi-speaker text-to-speech (TTS) system still require large amount of speech data for each target speaker. In this study, we would like to construct a multi-speaker TTS system by incorporating two sub modules into artificial neural network-based speech synthesis system to alleviate this problem. First module is to add speaker embedding into encoding module for generating speech while a large amount of the speech data from target speaker is not necessary. For speaker embedding method, in our study, two main speaker embedding methods, namely speaker verification embedding and voice conversion embedding, are compared to deciding which one is suitable for our personalized TTS system. Second, we substituted the conventional post-net module, which is adopted to enhance the output spectrum sequence, to further improving the speech quality of the generated speech utterance. Here, a post-filter network is used. Finally, experiment results showed that the speaker embedding is useful by adding it into encoding module and the resultant speech utterance indeed perceived as the target speaker. Also, the post-filter network not only improving the speech quality and also enhancing the speaker similarity of the generated speech utterances. The constructed TTS system can generate a speech utterance of the target speaker in fewer than 2 seconds. In the future, we would like to further investigate the controllability of the speaking rate or perceived emotion state of the generated speech.
%U https://aclanthology.org/2021.rocling-1.42
%P 326-332
Markdown (Informal)
[Incorporating speaker embedding and post-filter network for improving speaker similarity of personalized speech synthesis system](https://aclanthology.org/2021.rocling-1.42) (Wang & Huang, ROCLING 2021)
ACL