% Conference paper from the CHiPSAL 2025 workshop proceedings (ACL Anthology record 2025.chipsal-1.18).
@inproceedings{chhetri-etal-2025-impacts,
  title     = {Impacts of Vocoder Selection on {Tacotron}-based {Nepali} Text-To-Speech Synthesis},
  author    = {Chhetri, Ganesh Dhakal and
               Dahal, Kiran Chandra and
               Poudyal, Prakash},
  editor    = {Sarveswaran, Kengatharaiyer and
               Vaidya, Ashwini and
               Bal, Bal Krishna and
               Shams, Sana and
               Thapa, Surendrabikram},
  booktitle = {Proceedings of the First Workshop on Challenges in Processing South Asian Languages ({CHiPSAL} 2025)},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi, UAE},
  publisher = {International Committee on Computational Linguistics},
  url       = {https://aclanthology.org/2025.chipsal-1.18/},
  pages     = {185--192},
  abstract  = {Text-to-speech (TTS) technology enhances human-computer interaction and increases content accessibility. Tacotron and other deep learning models have enhanced the naturalness of text-to-speech systems. The vocoder, which transforms mel-spectrograms into audio waveforms, significantly influences voice quality. This study evaluates Tacotron2 vocoders for Nepali text-to-speech synthesis. While English language vocoders have been thoroughly examined, Nepali language vocoders remain underexplored. The study utilizes the WaveNet and MelGAN vocoders to generate speech from mel-spectrograms produced by Tacotron2 for Nepali text. In order to assess the quality of voice synthesis, this paper studies the mel-cepstral distortion (MCD) and Mean Opinion Score (MOS) for speech produced by both vocoders. The comparative investigation of the Tacotron2 + MelGAN and Tacotron2 + WaveNet models, utilizing the Nepali OpenSLR and News male voice datasets, consistently reveals the advantage of Tacotron2 + MelGAN in terms of naturalness and accuracy. The Tacotron2 + MelGAN model achieved an average MOS score of 4.245 on the Nepali OpenSLR dataset and 2.885 on the male voice dataset.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chhetri-etal-2025-impacts">
<titleInfo>
<title>Impacts of Vocoder Selection on Tacotron-based Nepali Text-To-Speech Synthesis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ganesh</namePart>
<namePart type="given">Dhakal</namePart>
<namePart type="family">Chhetri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kiran</namePart>
<namePart type="given">Chandra</namePart>
<namePart type="family">Dahal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Prakash</namePart>
<namePart type="family">Poudyal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the First Workshop on Challenges in Processing South Asian Languages (CHiPSAL 2025)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kengatharaiyer</namePart>
<namePart type="family">Sarveswaran</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashwini</namePart>
<namePart type="family">Vaidya</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bal</namePart>
<namePart type="given">Krishna</namePart>
<namePart type="family">Bal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sana</namePart>
<namePart type="family">Shams</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Surendrabikram</namePart>
<namePart type="family">Thapa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Text-to-speech (TTS) technology enhances human-computer interaction and increases content accessibility. Tacotron and other deep learning models have enhanced the naturalness of text-to-speech systems. The vocoder, which transforms mel-spectrograms into audio waveforms, significantly influences voice quality. This study evaluates Tacotron2 vocoders for Nepali text-to-speech synthesis. While English language vocoders have been thoroughly examined, Nepali language vocoders remain underexplored. The study utilizes the WaveNet and MelGAN vocoders to generate speech from mel-spectrograms produced by Tacotron2 for Nepali text. In order to assess the quality of voice synthesis, this paper studies the mel-cepstral distortion (MCD) and Mean Opinion Score (MOS) for speech produced by both vocoders. The comparative investigation of the Tacotron2 + MelGAN and Tacotron2 + WaveNet models, utilizing the Nepali OpenSLR and News male voice datasets, consistently reveals the advantage of Tacotron2 + MelGAN in terms of naturalness and accuracy. The Tacotron2 + MelGAN model achieved an average MOS score of 4.245 on the Nepali OpenSLR dataset and 2.885 on the male voice dataset.</abstract>
<identifier type="citekey">chhetri-etal-2025-impacts</identifier>
<location>
<url>https://aclanthology.org/2025.chipsal-1.18/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>185</start>
<end>192</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Impacts of Vocoder Selection on Tacotron-based Nepali Text-To-Speech Synthesis
%A Chhetri, Ganesh Dhakal
%A Dahal, Kiran Chandra
%A Poudyal, Prakash
%Y Sarveswaran, Kengatharaiyer
%Y Vaidya, Ashwini
%Y Bal, Bal Krishna
%Y Shams, Sana
%Y Thapa, Surendrabikram
%S Proceedings of the First Workshop on Challenges in Processing South Asian Languages (CHiPSAL 2025)
%D 2025
%8 January
%I International Committee on Computational Linguistics
%C Abu Dhabi, UAE
%F chhetri-etal-2025-impacts
%X Text-to-speech (TTS) technology enhances human-computer interaction and increases content accessibility. Tacotron and other deep learning models have enhanced the naturalness of text-to-speech systems. The vocoder, which transforms mel-spectrograms into audio waveforms, significantly influences voice quality. This study evaluates Tacotron2 vocoders for Nepali text-to-speech synthesis. While English language vocoders have been thoroughly examined, Nepali language vocoders remain underexplored. The study utilizes the WaveNet and MelGAN vocoders to generate speech from mel-spectrograms produced by Tacotron2 for Nepali text. In order to assess the quality of voice synthesis, this paper studies the mel-cepstral distortion (MCD) and Mean Opinion Score (MOS) for speech produced by both vocoders. The comparative investigation of the Tacotron2 + MelGAN and Tacotron2 + WaveNet models, utilizing the Nepali OpenSLR and News male voice datasets, consistently reveals the advantage of Tacotron2 + MelGAN in terms of naturalness and accuracy. The Tacotron2 + MelGAN model achieved an average MOS score of 4.245 on the Nepali OpenSLR dataset and 2.885 on the male voice dataset.
%U https://aclanthology.org/2025.chipsal-1.18/
%P 185-192
Markdown (Informal)
[Impacts of Vocoder Selection on Tacotron-based Nepali Text-To-Speech Synthesis](https://aclanthology.org/2025.chipsal-1.18/) (Chhetri et al., CHiPSAL 2025)
ACL