@inproceedings{shi-etal-2026-affectcodec,
title = "{A}ffect{C}odec: Emotion-Preserving Neural Speech Codec for Expressive Speech Modeling",
author = "Shi, Jiacheng and
Du, Hongfei and
Song, Xinyuan and
Hong, Y. Alicia and
Zhang, Yanfu and
Gao, Ashley",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.442/",
pages = "9102--9124",
ISBN = "979-8-89176-395-1",
abstract = "Neural speech codecs provide discrete representations for speech language models, but emotional cues are often degraded during quantization. Existing codecs mainly optimize acoustic reconstruction, leaving emotion expressiveness insufficiently modeled at the representation level. We propose an emotion-guided neural speech codec that explicitly preserves emotional information while maintaining semantic fidelity and prosodic naturalness. Our framework combines emotion{--}semantic guided latent modulation, relation-preserving emotional{--}semantic distillation, and emotion-weighted semantic alignment to retain emotionally salient cues under compression. Extensive evaluations across speech reconstruction, emotion recognition, and downstream text to speech generation demonstrate improved emotion consistency and perceptual quality without sacrificing content accuracy."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="shi-etal-2026-affectcodec">
<titleInfo>
<title>AffectCodec: Emotion-Preserving Neural Speech Codec for Expressive Speech Modeling</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jiacheng</namePart>
<namePart type="family">Shi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hongfei</namePart>
<namePart type="family">Du</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xinyuan</namePart>
<namePart type="family">Song</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Y</namePart>
<namePart type="given">Alicia</namePart>
<namePart type="family">Hong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yanfu</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashley</namePart>
<namePart type="family">Gao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Neural speech codecs provide discrete representations for speech language models, but emotional cues are often degraded during quantization. Existing codecs mainly optimize acoustic reconstruction, leaving emotion expressiveness insufficiently modeled at the representation level. We propose an emotion-guided neural speech codec that explicitly preserves emotional information while maintaining semantic fidelity and prosodic naturalness. Our framework combines emotion–semantic guided latent modulation, relation-preserving emotional–semantic distillation, and emotion-weighted semantic alignment to retain emotionally salient cues under compression. Extensive evaluations across speech reconstruction, emotion recognition, and downstream text to speech generation demonstrate improved emotion consistency and perceptual quality without sacrificing content accuracy.</abstract>
<identifier type="citekey">shi-etal-2026-affectcodec</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.442/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>9102</start>
<end>9124</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T AffectCodec: Emotion-Preserving Neural Speech Codec for Expressive Speech Modeling
%A Shi, Jiacheng
%A Du, Hongfei
%A Song, Xinyuan
%A Hong, Y. Alicia
%A Zhang, Yanfu
%A Gao, Ashley
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F shi-etal-2026-affectcodec
%X Neural speech codecs provide discrete representations for speech language models, but emotional cues are often degraded during quantization. Existing codecs mainly optimize acoustic reconstruction, leaving emotion expressiveness insufficiently modeled at the representation level. We propose an emotion-guided neural speech codec that explicitly preserves emotional information while maintaining semantic fidelity and prosodic naturalness. Our framework combines emotion–semantic guided latent modulation, relation-preserving emotional–semantic distillation, and emotion-weighted semantic alignment to retain emotionally salient cues under compression. Extensive evaluations across speech reconstruction, emotion recognition, and downstream text to speech generation demonstrate improved emotion consistency and perceptual quality without sacrificing content accuracy.
%U https://aclanthology.org/2026.findings-acl.442/
%P 9102-9124
Markdown (Informal)
[AffectCodec: Emotion-Preserving Neural Speech Codec for Expressive Speech Modeling](https://aclanthology.org/2026.findings-acl.442/) (Shi et al., Findings 2026)
ACL