@inproceedings{wang-etal-2024-spikevoice,
title = "{S}pike{V}oice: High-Quality Text-to-Speech Via Efficient Spiking Neural Network",
author = "Wang, Kexin and
Zhang, Jiahong and
Ren, Yong and
Yao, Man and
Shang, Di and
Xu, Bo and
Li, Guoqi",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.luhme-long.429/",
doi = "10.18653/v1/2024.acl-long.429",
pages = "7927--7940",
abstract = "Brain-inspired Spiking Neural Network (SNN) has demonstrated its effectiveness and efficiency in vision, natural language, and speech understanding tasks, indicating their capacity to {\textquotedblleft}see{\textquotedblright}, {\textquotedblleft}listen{\textquotedblright}, and {\textquotedblleft}read{\textquotedblright}. In this paper, we design SpikeVoice, which performs high-quality Text-To-Speech (TTS) via SNN, to explore the potential of SNN to {\textquotedblleft}speak{\textquotedblright}. A major obstacle to using SNN for such generative tasks lies in the demand for models to grasp long-term dependencies. The serial nature of spiking neurons, however, leads to the invisibility of information at future spiking time steps, limiting SNN models to capture sequence dependencies solely within the same time step. We term this phenomenon {\textquotedblleft}partial-time dependency{\textquotedblright}. To address this issue, we introduce Spiking Temporal-Sequential Attention (STSA) in the SpikeVoice. To the best of our knowledge, SpikeVoice is the first TTS work in the SNN field. We perform experiments using four well-established datasets that cover both Chinese and English languages, encompassing scenarios with both single-speaker and multi-speaker configurations. The results demonstrate that SpikeVoice can achieve results comparable to Artificial Neural Networks (ANN) with only 10.5{\%} energy consumption of ANN. Both our demo and code are available as supplementary material."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2024-spikevoice">
<titleInfo>
<title>SpikeVoice: High-Quality Text-to-Speech Via Efficient Spiking Neural Network</title>
</titleInfo>
<name type="personal">
<namePart type="given">Kexin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiahong</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yong</namePart>
<namePart type="family">Ren</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Man</namePart>
<namePart type="family">Yao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Di</namePart>
<namePart type="family">Shang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bo</namePart>
<namePart type="family">Xu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guoqi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lun-Wei</namePart>
<namePart type="family">Ku</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Andre</namePart>
<namePart type="family">Martins</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vivek</namePart>
<namePart type="family">Srikumar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Bangkok, Thailand</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Brain-inspired Spiking Neural Network (SNN) has demonstrated its effectiveness and efficiency in vision, natural language, and speech understanding tasks, indicating their capacity to “see”, “listen”, and “read”. In this paper, we design SpikeVoice, which performs high-quality Text-To-Speech (TTS) via SNN, to explore the potential of SNN to “speak”. A major obstacle to using SNN for such generative tasks lies in the demand for models to grasp long-term dependencies. The serial nature of spiking neurons, however, leads to the invisibility of information at future spiking time steps, limiting SNN models to capture sequence dependencies solely within the same time step. We term this phenomenon “partial-time dependency”. To address this issue, we introduce Spiking Temporal-Sequential Attention (STSA) in the SpikeVoice. To the best of our knowledge, SpikeVoice is the first TTS work in the SNN field. We perform experiments using four well-established datasets that cover both Chinese and English languages, encompassing scenarios with both single-speaker and multi-speaker configurations. The results demonstrate that SpikeVoice can achieve results comparable to Artificial Neural Networks (ANN) with only 10.5% energy consumption of ANN. Both our demo and code are available as supplementary material.</abstract>
<identifier type="citekey">wang-etal-2024-spikevoice</identifier>
<identifier type="doi">10.18653/v1/2024.acl-long.429</identifier>
<location>
<url>https://aclanthology.org/2024.luhme-long.429/</url>
</location>
<part>
<date>2024-08</date>
<extent unit="page">
<start>7927</start>
<end>7940</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T SpikeVoice: High-Quality Text-to-Speech Via Efficient Spiking Neural Network
%A Wang, Kexin
%A Zhang, Jiahong
%A Ren, Yong
%A Yao, Man
%A Shang, Di
%A Xu, Bo
%A Li, Guoqi
%Y Ku, Lun-Wei
%Y Martins, Andre
%Y Srikumar, Vivek
%S Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2024
%8 August
%I Association for Computational Linguistics
%C Bangkok, Thailand
%F wang-etal-2024-spikevoice
%X Brain-inspired Spiking Neural Network (SNN) has demonstrated its effectiveness and efficiency in vision, natural language, and speech understanding tasks, indicating their capacity to “see”, “listen”, and “read”. In this paper, we design SpikeVoice, which performs high-quality Text-To-Speech (TTS) via SNN, to explore the potential of SNN to “speak”. A major obstacle to using SNN for such generative tasks lies in the demand for models to grasp long-term dependencies. The serial nature of spiking neurons, however, leads to the invisibility of information at future spiking time steps, limiting SNN models to capture sequence dependencies solely within the same time step. We term this phenomenon “partial-time dependency”. To address this issue, we introduce Spiking Temporal-Sequential Attention (STSA) in the SpikeVoice. To the best of our knowledge, SpikeVoice is the first TTS work in the SNN field. We perform experiments using four well-established datasets that cover both Chinese and English languages, encompassing scenarios with both single-speaker and multi-speaker configurations. The results demonstrate that SpikeVoice can achieve results comparable to Artificial Neural Networks (ANN) with only 10.5% energy consumption of ANN. Both our demo and code are available as supplementary material.
%R 10.18653/v1/2024.acl-long.429
%U https://aclanthology.org/2024.luhme-long.429/
%U https://doi.org/10.18653/v1/2024.acl-long.429
%P 7927-7940
Markdown (Informal)
[SpikeVoice: High-Quality Text-to-Speech Via Efficient Spiking Neural Network](https://aclanthology.org/2024.luhme-long.429/) (Wang et al., ACL 2024)
ACL