@comment{Conference paper record. Words whose capitalisation must survive style recasing are braced as whole words.}
@inproceedings{das-jena-2026-resodiff,
  author    = {Das, Gyanendra and Jena, Sai Satyam},
  title     = {{ResoDiff-44k}: High-Fidelity Cross-Lingual Speech and Singing Synthesis via Discrete Diffusion},
  editor    = {Li, Yunyao and Rehm, Georg and Tu, Mei},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {2183--2190},
  isbn      = {979-8-89176-394-4},
  url       = {https://aclanthology.org/2026.acl-industry.146/},
  abstract  = {While large-scale generative speech models have achieved remarkable semantic coherence, industrial deployment remains constrained by a fidelity ceiling typically capped at lower sampling rates. A fundamental limitation is the reliance on intermediate mel-spectrograms, a low-dimensional bottleneck that discards phase and high-frequency information, causing artifacts in expressive scenarios like singing. In this work, we introduce ResoDiff-44k, a production-grade generative foundation model designed for cinema-quality, 44.1kHz audio synthesis. Departing from standard masked audio modeling and mel-spectrogram inversion, ResoDiff-44k leverages Discrete Diffusion over a pure Descript Audio Codec latent space. We pre-train ResoDiff-44k on a massive 150K-hour multilingual dataset to establish a robust acoustic prior, followed by targeted fine-tuning on a curated regional mixed-language and singing corpus. Our experiments demonstrate that replacing the standard prediction head with a discrete diffusion trajectory significantly reduces misalignment in long sequences. We report a double-blind subjective evaluation showing that ResoDiff-44k achieves a 4.6 Mean Opinion Score in 44.1kHz singing synthesis and a 71{\%} reduction in character error rate on regional mixed-language prompts compared to strong baselines. The proposed pipeline offers a viable path for deploying high-fidelity, culturally adaptive conversational agents.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record for the paper; its ID equals the citekey identifier below. -->
  <mods ID="das-jena-2026-resodiff">
    <titleInfo>
      <title>ResoDiff-44k: High-Fidelity Cross-Lingual Speech and Singing Synthesis via Discrete Diffusion</title>
    </titleInfo>
    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Gyanendra</namePart>
      <namePart type="family">Das</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sai</namePart>
      <namePart type="given">Satyam</namePart>
      <namePart type="family">Jena</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Yunyao</namePart>
        <namePart type="family">Li</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Georg</namePart>
        <namePart type="family">Rehm</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Mei</namePart>
        <namePart type="family">Tu</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, USA</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-394-4</identifier>
    </relatedItem>
    <abstract>While large-scale generative speech models have achieved remarkable semantic coherence, industrial deployment remains constrained by a fidelity ceiling typically capped at lower sampling rates. A fundamental limitation is the reliance on intermediate mel-spectrograms, a low-dimensional bottleneck that discards phase and high-frequency information, causing artifacts in expressive scenarios like singing. In this work, we introduce ResoDiff-44k, a production-grade generative foundation model designed for cinema-quality, 44.1kHz audio synthesis. Departing from standard masked audio modeling and mel-spectrogram inversion, ResoDiff-44k leverages Discrete Diffusion over a pure Descript Audio Codec latent space. We pre-train ResoDiff-44k on a massive 150K-hour multilingual dataset to establish a robust acoustic prior, followed by targeted fine-tuning on a curated regional mixed-language and singing corpus. Our experiments demonstrate that replacing the standard prediction head with a discrete diffusion trajectory significantly reduces misalignment in long sequences. We report a double-blind subjective evaluation showing that ResoDiff-44k achieves a 4.6 Mean Opinion Score in 44.1kHz singing synthesis and a 71% reduction in character error rate on regional mixed-language prompts compared to strong baselines. The proposed pipeline offers a viable path for deploying high-fidelity, culturally adaptive conversational agents.</abstract>
    <identifier type="citekey">das-jena-2026-resodiff</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-industry.146/</url>
    </location>
    <!-- Position of the paper within the host volume (page range). -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>2183</start>
        <end>2190</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ResoDiff-44k: High-Fidelity Cross-Lingual Speech and Singing Synthesis via Discrete Diffusion
%A Das, Gyanendra
%A Jena, Sai Satyam
%Y Li, Yunyao
%Y Rehm, Georg
%Y Tu, Mei
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-394-4
%F das-jena-2026-resodiff
%X While large-scale generative speech models have achieved remarkable semantic coherence, industrial deployment remains constrained by a fidelity ceiling typically capped at lower sampling rates. A fundamental limitation is the reliance on intermediate mel-spectrograms, a low-dimensional bottleneck that discards phase and high-frequency information, causing artifacts in expressive scenarios like singing. In this work, we introduce ResoDiff-44k, a production-grade generative foundation model designed for cinema-quality, 44.1kHz audio synthesis. Departing from standard masked audio modeling and mel-spectrogram inversion, ResoDiff-44k leverages Discrete Diffusion over a pure Descript Audio Codec latent space. We pre-train ResoDiff-44k on a massive 150K-hour multilingual dataset to establish a robust acoustic prior, followed by targeted fine-tuning on a curated regional mixed-language and singing corpus. Our experiments demonstrate that replacing the standard prediction head with a discrete diffusion trajectory significantly reduces misalignment in long sequences. We report a double-blind subjective evaluation showing that ResoDiff-44k achieves a 4.6 Mean Opinion Score in 44.1kHz singing synthesis and a 71% reduction in character error rate on regional mixed-language prompts compared to strong baselines. The proposed pipeline offers a viable path for deploying high-fidelity, culturally adaptive conversational agents.
%U https://aclanthology.org/2026.acl-industry.146/
%P 2183-2190
Markdown (Informal)
[ResoDiff-44k: High-Fidelity Cross-Lingual Speech and Singing Synthesis via Discrete Diffusion](https://aclanthology.org/2026.acl-industry.146/) (Das & Jena, ACL 2026)
ACL