@inproceedings{toyin-etal-2024-sttatts,
title = "{STTATTS}: Unified Speech-To-Text And Text-To-Speech Model",
author = "Toyin, Hawau and
Li, Hao and
Aldarmaki, Hanan",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.findings-emnlp.401",
pages = "6853--6863",
abstract = "Speech recognition and speech synthesis models are typically trained separately, each with its own set of learning objectives, training data, and model parameters, resulting in two distinct large networks. We propose a parameter-efficient approach to learning ASR and TTS jointly via a multi-task learning objective and shared parameters. Our evaluation demonstrates thatthe performance of our multi-task model is comparable to that of individually trained models while significantly savingcomputational and memory costs ({\textasciitilde}50{\%} reduction in the total number of parameters required for the two tasks combined). We experiment with English as a resource-rich language, and Arabic as a relatively low-resource language due to shortage of TTS data. Our models are trained with publicly available data, and both the training code and model checkpoints are openly available for further research.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="toyin-etal-2024-sttatts">
<titleInfo>
<title>STTATTS: Unified Speech-To-Text And Text-To-Speech Model</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hawau</namePart>
<namePart type="family">Toyin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hanan</namePart>
<namePart type="family">Aldarmaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2024</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Speech recognition and speech synthesis models are typically trained separately, each with its own set of learning objectives, training data, and model parameters, resulting in two distinct large networks. We propose a parameter-efficient approach to learning ASR and TTS jointly via a multi-task learning objective and shared parameters. Our evaluation demonstrates thatthe performance of our multi-task model is comparable to that of individually trained models while significantly savingcomputational and memory costs (~50% reduction in the total number of parameters required for the two tasks combined). We experiment with English as a resource-rich language, and Arabic as a relatively low-resource language due to shortage of TTS data. Our models are trained with publicly available data, and both the training code and model checkpoints are openly available for further research.</abstract>
<identifier type="citekey">toyin-etal-2024-sttatts</identifier>
<location>
<url>https://aclanthology.org/2024.findings-emnlp.401</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>6853</start>
<end>6863</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T STTATTS: Unified Speech-To-Text And Text-To-Speech Model
%A Toyin, Hawau
%A Li, Hao
%A Aldarmaki, Hanan
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Findings of the Association for Computational Linguistics: EMNLP 2024
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F toyin-etal-2024-sttatts
%X Speech recognition and speech synthesis models are typically trained separately, each with its own set of learning objectives, training data, and model parameters, resulting in two distinct large networks. We propose a parameter-efficient approach to learning ASR and TTS jointly via a multi-task learning objective and shared parameters. Our evaluation demonstrates thatthe performance of our multi-task model is comparable to that of individually trained models while significantly savingcomputational and memory costs (~50% reduction in the total number of parameters required for the two tasks combined). We experiment with English as a resource-rich language, and Arabic as a relatively low-resource language due to shortage of TTS data. Our models are trained with publicly available data, and both the training code and model checkpoints are openly available for further research.
%U https://aclanthology.org/2024.findings-emnlp.401
%P 6853-6863
Markdown (Informal)
[STTATTS: Unified Speech-To-Text And Text-To-Speech Model](https://aclanthology.org/2024.findings-emnlp.401) (Toyin et al., Findings 2024)
ACL