% Conference paper by Sharma and Wang in the Findings of ACL 2026 volume.
% The same record is repeated after this entry in MODS XML and EndNote form.
% NOTE(review): "address" looks like the conference location rather than the
% publisher's city; confirm before relying on it under a style that prints it.
@inproceedings{sharma-wang-2026-unified,
  title     = {A Unified Feature Mixture Framework for Joint Speech and Singing Deepfake Detection},
  author    = {Sharma, Aastha and
               Wang, Guangjing},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1245/},
  pages     = {24853--24863},
  isbn      = {979-8-89176-395-1},
  abstract  = {High-fidelity audio generation techniques, such as voice conversion and singing voice synthesis, have significantly increased the risk of audio deepfakes. Although existing methods perform well on conversational speech deepfake detection, they fail severely under the speech-to-singing domain shift. To address this limitation, we propose GenuVoice, a unified deepfake detector based on a multi-branch mixture-of-experts architecture that integrates three complementary feature views: Wav2Vec 2.0 representations, log-mel spectrograms, and mel-frequency cepstral coefficients (MFCC). Each expert is trained to remain independently discriminative, while a learned gating network dynamically weights expert contributions. A speech-retentive multi-domain fine-tuning strategy enables adaptation to singing without degrading speech performance. GenuVoice achieves 1.82{\%} Equal Error Rate (EER) on CtrSVDD, compared to 37{--}62{\%} for existing speech-trained detectors, while preserving strong speech performance (0.38{\%} EER on ASVspoof 2019) and generalizing to unseen generators (8.89{\%} EER on held-out ASVspoof 2021). Extensive ablations confirm the importance of multi-expert fusion and speech retention, establishing GenuVoice as an effective unified detector for speech and singing deepfakes. The implementation code is available at \url{https://github.com/aastha-sharma/genuvoice}},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper; the host relatedItem below describes the proceedings volume and its editors. -->
  <mods ID="sharma-wang-2026-unified">
    <titleInfo>
      <title>A Unified Feature Mixture Framework for Joint Speech and Singing Deepfake Detection</title>
    </titleInfo>
    <!-- Paper authors -->
    <name type="personal">
      <namePart type="given">Aastha</namePart>
      <namePart type="family">Sharma</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Guangjing</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Containing volume: title, editors, publisher, place, genre and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>High-fidelity audio generation techniques, such as voice conversion and singing voice synthesis, have significantly increased the risk of audio deepfakes. Although existing methods perform well on conversational speech deepfake detection, they fail severely under the speech-to-singing domain shift. To address this limitation, we propose GenuVoice, a unified deepfake detector based on a multi-branch mixture-of-experts architecture that integrates three complementary feature views: Wav2Vec 2.0 representations, log-mel spectrograms, and mel-frequency cepstral coefficients (MFCC). Each expert is trained to remain independently discriminative, while a learned gating network dynamically weights expert contributions. A speech-retentive multi-domain fine-tuning strategy enables adaptation to singing without degrading speech performance. GenuVoice achieves 1.82% Equal Error Rate (EER) on CtrSVDD, compared to 37–62% for existing speech-trained detectors, while preserving strong speech performance (0.38% EER on ASVspoof 2019) and generalizing to unseen generators (8.89% EER on held-out ASVspoof 2021). Extensive ablations confirm the importance of multi-expert fusion and speech retention, establishing GenuVoice as an effective unified detector for speech and singing deepfakes. The implementation code is available at https://github.com/aastha-sharma/genuvoice</abstract>
    <identifier type="citekey">sharma-wang-2026-unified</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1245/</url>
    </location>
    <!-- Issue date and page extent of the paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>24853</start>
        <end>24863</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T A Unified Feature Mixture Framework for Joint Speech and Singing Deepfake Detection
%A Sharma, Aastha
%A Wang, Guangjing
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F sharma-wang-2026-unified
%X High-fidelity audio generation techniques, such as voice conversion and singing voice synthesis, have significantly increased the risk of audio deepfakes. Although existing methods perform well on conversational speech deepfake detection, they fail severely under the speech-to-singing domain shift. To address this limitation, we propose GenuVoice, a unified deepfake detector based on a multi-branch mixture-of-experts architecture that integrates three complementary feature views: Wav2Vec 2.0 representations, log-mel spectrograms, and mel-frequency cepstral coefficients (MFCC). Each expert is trained to remain independently discriminative, while a learned gating network dynamically weights expert contributions. A speech-retentive multi-domain fine-tuning strategy enables adaptation to singing without degrading speech performance. GenuVoice achieves 1.82% Equal Error Rate (EER) on CtrSVDD, compared to 37–62% for existing speech-trained detectors, while preserving strong speech performance (0.38% EER on ASVspoof 2019) and generalizing to unseen generators (8.89% EER on held-out ASVspoof 2021). Extensive ablations confirm the importance of multi-expert fusion and speech retention, establishing GenuVoice as an effective unified detector for speech and singing deepfakes. The implementation code is available at https://github.com/aastha-sharma/genuvoice
%U https://aclanthology.org/2026.findings-acl.1245/
%P 24853-24863
Markdown (Informal)
[A Unified Feature Mixture Framework for Joint Speech and Singing Deepfake Detection](https://aclanthology.org/2026.findings-acl.1245/) (Sharma & Wang, Findings 2026)
ACL