@inproceedings{tankala-etal-2024-storico,
title = "{STOR}i{C}o: Storytelling {TTS} for {H}indi with Character Voice Modulation",
author = "Tankala, Pavan and
Jyothi, Preethi and
Rao, Preeti and
Bhattacharyya, Pushpak",
editor = "Graham, Yvette and
Purver, Matthew",
booktitle = "Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = mar,
year = "2024",
address = "St. Julian{'}s, Malta",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.eacl-short.37",
pages = "426--431",
    abstract = "We present a new Hindi text-to-speech (TTS) dataset and demonstrate its utility for the expressive synthesis of children{'}s audio stories. The dataset comprises narration by a single female speaker who modifies her voice to produce different story characters. Annotations for dialogue identification, character labelling, and character attribution are provided, all of which are expected to facilitate the learning of character voice and speaking styles. Experiments are conducted using different versions of the annotated dataset that enable training a multi-speaker TTS model on the single-speaker data. Subjective tests show that the multi-speaker model improves expressiveness and character voice consistency compared to the baseline single-speaker TTS. With the multi-speaker model, objective evaluations show comparable word error rates, better speaker voice consistency, and higher correlations with ground-truth emotion attributes. We release a new 16.8-hour storytelling speech dataset in Hindi and propose effective solutions for expressive TTS with narrator voice modulation and character voice consistency.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="tankala-etal-2024-storico">
    <titleInfo>
      <title>STORiCo: Storytelling TTS for Hindi with Character Voice Modulation</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Pavan</namePart>
      <namePart type="family">Tankala</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Preethi</namePart>
      <namePart type="family">Jyothi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Preeti</namePart>
      <namePart type="family">Rao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pushpak</namePart>
      <namePart type="family">Bhattacharyya</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2024-03</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yvette</namePart>
        <namePart type="family">Graham</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Matthew</namePart>
        <namePart type="family">Purver</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">St. Julian’s, Malta</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We present a new Hindi text-to-speech (TTS) dataset and demonstrate its utility for the expressive synthesis of children’s audio stories. The dataset comprises narration by a single female speaker who modifies her voice to produce different story characters. Annotations for dialogue identification, character labelling, and character attribution are provided, all of which are expected to facilitate the learning of character voice and speaking styles. Experiments are conducted using different versions of the annotated dataset that enable training a multi-speaker TTS model on the single-speaker data. Subjective tests show that the multi-speaker model improves expressiveness and character voice consistency compared to the baseline single-speaker TTS. With the multi-speaker model, objective evaluations show comparable word error rates, better speaker voice consistency, and higher correlations with ground-truth emotion attributes. We release a new 16.8-hour storytelling speech dataset in Hindi and propose effective solutions for expressive TTS with narrator voice modulation and character voice consistency.</abstract>
    <identifier type="citekey">tankala-etal-2024-storico</identifier>
    <location>
      <url>https://aclanthology.org/2024.eacl-short.37</url>
    </location>
    <part>
      <date>2024-03</date>
      <extent unit="page">
        <start>426</start>
        <end>431</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T STORiCo: Storytelling TTS for Hindi with Character Voice Modulation
%A Tankala, Pavan
%A Jyothi, Preethi
%A Rao, Preeti
%A Bhattacharyya, Pushpak
%Y Graham, Yvette
%Y Purver, Matthew
%S Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2024
%8 March
%I Association for Computational Linguistics
%C St. Julian’s, Malta
%F tankala-etal-2024-storico
%X We present a new Hindi text-to-speech (TTS) dataset and demonstrate its utility for the expressive synthesis of children’s audio stories. The dataset comprises narration by a single female speaker who modifies her voice to produce different story characters. Annotations for dialogue identification, character labelling, and character attribution are provided, all of which are expected to facilitate the learning of character voice and speaking styles. Experiments are conducted using different versions of the annotated dataset that enable training a multi-speaker TTS model on the single-speaker data. Subjective tests show that the multi-speaker model improves expressiveness and character voice consistency compared to the baseline single-speaker TTS. With the multi-speaker model, objective evaluations show comparable word error rates, better speaker voice consistency, and higher correlations with ground-truth emotion attributes. We release a new 16.8-hour storytelling speech dataset in Hindi and propose effective solutions for expressive TTS with narrator voice modulation and character voice consistency.
%U https://aclanthology.org/2024.eacl-short.37
%P 426-431
Markdown (Informal)
[STORiCo: Storytelling TTS for Hindi with Character Voice Modulation](https://aclanthology.org/2024.eacl-short.37) (Tankala et al., EACL 2024)
ACL
Pavan Tankala, Preethi Jyothi, Preeti Rao, and Pushpak Bhattacharyya. 2024. STORiCo: Storytelling TTS for Hindi with Character Voice Modulation. In Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers), pages 426–431, St. Julian’s, Malta. Association for Computational Linguistics.