% ProsodyFlow (Wang et al., COLING 2025); ACL Anthology record 2025.coling-main.518.
% Names use the "Last, First" form. The compound surname "Di Eugenio" is kept
% entirely before the comma so that "Di" is not parsed as a given name.
@inproceedings{wang-etal-2025-prosodyflow,
  title     = {{ProsodyFlow}: High-fidelity Text-to-Speech through Conditional Flow Matching and Prosody Modeling with Large Speech Language Models},
  author    = {Wang, Haoyu and
               Shan, Sizhe and
               Guo, Yinlin and
               Wang, Yuehai},
  editor    = {Rambow, Owen and
               Wanner, Leo and
               Apidianaki, Marianna and
               Al-Khalifa, Hend and
               Di Eugenio, Barbara and
               Schockaert, Steven},
  booktitle = {Proceedings of the 31st International Conference on Computational Linguistics},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.coling-main.518/},
  pages     = {7748--7753},
  abstract  = {Text-to-speech (TTS) has seen significant advancements in high-quality, expressive speech synthesis. However, achieving diverse and natural prosody in synthesized speech remains challenging. In this paper, we propose ProsodyFlow, an end-to-end TTS model that integrates large self-supervised speech models and conditional flow matching to model prosodic features effectively. Our approach involves using a speech LLM to extract acoustic features, mapping these features into a prosody latent space, and then employing conditional flow matching to generate prosodic vectors conditioned on the input text. Experiments on the LJSpeech dataset show that ProsodyFlow improves synthesis quality and efficiency compared to existing models, achieving more prosodic and expressive speech synthesizing.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for the conference paper with citekey wang-etal-2025-prosodyflow. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="wang-etal-2025-prosodyflow">
    <titleInfo>
      <title>ProsodyFlow: High-fidelity Text-to-Speech through Conditional Flow Matching and Prosody Modeling with Large Speech Language Models</title>
    </titleInfo>
    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Haoyu</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sizhe</namePart>
      <namePart type="family">Shan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yinlin</namePart>
      <namePart type="family">Guo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuehai</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2025-01</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors and publisher. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 31st International Conference on Computational Linguistics</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Owen</namePart>
        <namePart type="family">Rambow</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Leo</namePart>
        <namePart type="family">Wanner</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Marianna</namePart>
        <namePart type="family">Apidianaki</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Hend</namePart>
        <namePart type="family">Al-Khalifa</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <!-- "Di Eugenio" is a compound family name; "Di" is not a given name. -->
      <name type="personal">
        <namePart type="given">Barbara</namePart>
        <namePart type="family">Di Eugenio</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Steven</namePart>
        <namePart type="family">Schockaert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Abu Dhabi, UAE</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Text-to-speech (TTS) has seen significant advancements in high-quality, expressive speech synthesis. However, achieving diverse and natural prosody in synthesized speech remains challenging. In this paper, we propose ProsodyFlow, an end-to-end TTS model that integrates large self-supervised speech models and conditional flow matching to model prosodic features effectively. Our approach involves using a speech LLM to extract acoustic features, mapping these features into a prosody latent space, and then employing conditional flow matching to generate prosodic vectors conditioned on the input text. Experiments on the LJSpeech dataset show that ProsodyFlow improves synthesis quality and efficiency compared to existing models, achieving more prosodic and expressive speech synthesizing.</abstract>
    <identifier type="citekey">wang-etal-2025-prosodyflow</identifier>
    <location>
      <url>https://aclanthology.org/2025.coling-main.518/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2025-01</date>
      <extent unit="page">
        <start>7748</start>
        <end>7753</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ProsodyFlow: High-fidelity Text-to-Speech through Conditional Flow Matching and Prosody Modeling with Large Speech Language Models
%A Wang, Haoyu
%A Shan, Sizhe
%A Guo, Yinlin
%A Wang, Yuehai
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F wang-etal-2025-prosodyflow
%X Text-to-speech (TTS) has seen significant advancements in high-quality, expressive speech synthesis. However, achieving diverse and natural prosody in synthesized speech remains challenging. In this paper, we propose ProsodyFlow, an end-to-end TTS model that integrates large self-supervised speech models and conditional flow matching to model prosodic features effectively. Our approach involves using a speech LLM to extract acoustic features, mapping these features into a prosody latent space, and then employing conditional flow matching to generate prosodic vectors conditioned on the input text. Experiments on the LJSpeech dataset show that ProsodyFlow improves synthesis quality and efficiency compared to existing models, achieving more prosodic and expressive speech synthesizing.
%U https://aclanthology.org/2025.coling-main.518/
%P 7748-7753
Markdown (Informal)
[ProsodyFlow: High-fidelity Text-to-Speech through Conditional Flow Matching and Prosody Modeling with Large Speech Language Models](https://aclanthology.org/2025.coling-main.518/) (Wang et al., COLING 2025)
ACL