@inproceedings{abdelali-etal-2022-natiq,
title = "{N}ati{Q}: An End-to-end Text-to-Speech System for {A}rabic",
author = "Abdelali, Ahmed and
Durrani, Nadir and
Demiroglu, Cenk and
Dalvi, Fahim and
Mubarak, Hamdy and
Darwish, Kareem",
booktitle = "Proceedings of the The Seventh Arabic Natural Language Processing Workshop (WANLP)",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates (Hybrid)",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.wanlp-1.38",
doi = "10.18653/v1/2022.wanlp-1.38",
pages = "394--398",
abstract = "NatiQ is end-to-end text-to-speech system for Arabic. Our speech synthesizer uses an encoder-decoder architecture with attention. We used both tacotron-based models (tacotron- 1 and tacotron-2) and the faster transformer model for generating mel-spectrograms from characters. We concatenated Tacotron1 with the WaveRNN vocoder, Tacotron2 with the WaveGlow vocoder and ESPnet transformer with the parallel wavegan vocoder to synthesize waveforms from the spectrograms. We used in-house speech data for two voices: 1) neu- tral male {``}Hamza{''}- narrating general content and news, and 2) expressive female {``}Amina{''}- narrating children story books to train our models. Our best systems achieve an aver- age Mean Opinion Score (MOS) of 4.21 and 4.40 for Amina and Hamza respectively. The objective evaluation of the systems using word and character error rate (WER and CER) as well as the response time measured by real- time factor favored the end-to-end architecture ESPnet. NatiQ demo is available online at \url{https://tts.qcri.org}.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="abdelali-etal-2022-natiq">
<titleInfo>
<title>NatiQ: An End-to-end Text-to-Speech System for Arabic</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ahmed</namePart>
<namePart type="family">Abdelali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nadir</namePart>
<namePart type="family">Durrani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cenk</namePart>
<namePart type="family">Demiroglu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fahim</namePart>
<namePart type="family">Dalvi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hamdy</namePart>
<namePart type="family">Mubarak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kareem</namePart>
<namePart type="family">Darwish</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the The Seventh Arabic Natural Language Processing Workshop (WANLP)</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, United Arab Emirates (Hybrid)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>NatiQ is end-to-end text-to-speech system for Arabic. Our speech synthesizer uses an encoder-decoder architecture with attention. We used both tacotron-based models (tacotron- 1 and tacotron-2) and the faster transformer model for generating mel-spectrograms from characters. We concatenated Tacotron1 with the WaveRNN vocoder, Tacotron2 with the WaveGlow vocoder and ESPnet transformer with the parallel wavegan vocoder to synthesize waveforms from the spectrograms. We used in-house speech data for two voices: 1) neu- tral male “Hamza”- narrating general content and news, and 2) expressive female “Amina”- narrating children story books to train our models. Our best systems achieve an aver- age Mean Opinion Score (MOS) of 4.21 and 4.40 for Amina and Hamza respectively. The objective evaluation of the systems using word and character error rate (WER and CER) as well as the response time measured by real- time factor favored the end-to-end architecture ESPnet. NatiQ demo is available online at https://tts.qcri.org.</abstract>
<identifier type="citekey">abdelali-etal-2022-natiq</identifier>
<identifier type="doi">10.18653/v1/2022.wanlp-1.38</identifier>
<location>
<url>https://aclanthology.org/2022.wanlp-1.38</url>
</location>
<part>
<date>2022-12</date>
<extent unit="page">
<start>394</start>
<end>398</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T NatiQ: An End-to-end Text-to-Speech System for Arabic
%A Abdelali, Ahmed
%A Durrani, Nadir
%A Demiroglu, Cenk
%A Dalvi, Fahim
%A Mubarak, Hamdy
%A Darwish, Kareem
%S Proceedings of the The Seventh Arabic Natural Language Processing Workshop (WANLP)
%D 2022
%8 December
%I Association for Computational Linguistics
%C Abu Dhabi, United Arab Emirates (Hybrid)
%F abdelali-etal-2022-natiq
%X NatiQ is end-to-end text-to-speech system for Arabic. Our speech synthesizer uses an encoder-decoder architecture with attention. We used both tacotron-based models (tacotron- 1 and tacotron-2) and the faster transformer model for generating mel-spectrograms from characters. We concatenated Tacotron1 with the WaveRNN vocoder, Tacotron2 with the WaveGlow vocoder and ESPnet transformer with the parallel wavegan vocoder to synthesize waveforms from the spectrograms. We used in-house speech data for two voices: 1) neu- tral male “Hamza”- narrating general content and news, and 2) expressive female “Amina”- narrating children story books to train our models. Our best systems achieve an aver- age Mean Opinion Score (MOS) of 4.21 and 4.40 for Amina and Hamza respectively. The objective evaluation of the systems using word and character error rate (WER and CER) as well as the response time measured by real- time factor favored the end-to-end architecture ESPnet. NatiQ demo is available online at https://tts.qcri.org.
%R 10.18653/v1/2022.wanlp-1.38
%U https://aclanthology.org/2022.wanlp-1.38
%U https://doi.org/10.18653/v1/2022.wanlp-1.38
%P 394-398
Markdown (Informal)
[NatiQ: An End-to-end Text-to-Speech System for Arabic](https://aclanthology.org/2022.wanlp-1.38) (Abdelali et al., WANLP 2022)
ACL
- Ahmed Abdelali, Nadir Durrani, Cenk Demiroglu, Fahim Dalvi, Hamdy Mubarak, and Kareem Darwish. 2022. NatiQ: An End-to-end Text-to-Speech System for Arabic. In Proceedings of the The Seventh Arabic Natural Language Processing Workshop (WANLP), pages 394–398, Abu Dhabi, United Arab Emirates (Hybrid). Association for Computational Linguistics.