@comment{Conference-paper record for citation key tran-etal-2026-depth.
  The address field holds the location given by the source record.
  NOTE(review): this file also carries MODS XML and EndNote copies of the
  same record below the entry. The EndNote ISBN tag further down contains an
  at-sign, which a BibTeX parser may try to read as the start of an entry;
  consider keeping each export format in its own file.}
@inproceedings{tran-etal-2026-depth,
  title     = {When depth is redundant: Efficient transformer-based speech anti-spoofing},
  author    = {Tran, Hoan My and
               Lolive, Damien and
               Sini, Aghilas and
               Delhay, Arnaud and
               Marteau, Pierre-Francois and
               Guennec, David},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.318/},
  pages     = {6380--6397},
  isbn      = {979-8-89176-395-1},
  abstract  = {Detecting speech deepfakes is critical for protecting society against fraud, identity theft, and the misuse of modern speech synthesis technologies. Despite recent progress, existing countermeasures often exhibit limited generalization to unseen spoofing attacks, particularly in out-of-domain evaluation settings, even when achieving strong in-domain performance. Transformer architectures have become ubiquitous in anti-spoofing, serving both as feature extractors (e.g., wav2vec 2.0) and as classifiers. However, deep transformer stacks exhibit substantial representational redundancy across adjacent layers, with similarity increasing toward deeper layers. As a result, task-specific specialization is largely concentrated in the final layers, while shallow layers remain underutilized during fine-tuning. In this work, we analyze the layer-wise behavior of transformer-based classifiers for speech deepfake detection and propose a training strategy that explicitly aligns shallow and intermediate representations with those of the final transformer layer. By encouraging all layers to mimic the task-specialized representation learned at depth, the model more effectively exploits early-layer features while preserving discriminative capacity in deeper layers. This design improves robustness to unseen spoofing attacks and enhances out-of-domain generalization. Extensive experiments across multiple benchmark datasets demonstrate consistent performance gains over strong baselines.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier near the end of the record. -->
  <mods ID="tran-etal-2026-depth">
    <titleInfo>
      <title>When depth is redundant: Efficient transformer-based speech anti-spoofing</title>
    </titleInfo>
    <!-- Authors of the paper, one name element per person. -->
    <name type="personal">
      <namePart type="given">Hoan</namePart>
      <namePart type="given">My</namePart>
      <namePart type="family">Tran</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Damien</namePart>
      <namePart type="family">Lolive</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Aghilas</namePart>
      <namePart type="family">Sini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Arnaud</namePart>
      <namePart type="family">Delhay</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pierre-Francois</namePart>
      <namePart type="family">Marteau</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">David</namePart>
      <namePart type="family">Guennec</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Detecting speech deepfakes is critical for protecting society against fraud, identity theft, and the misuse of modern speech synthesis technologies. Despite recent progress, existing countermeasures often exhibit limited generalization to unseen spoofing attacks, particularly in out-of-domain evaluation settings, even when achieving strong in-domain performance. Transformer architectures have become ubiquitous in anti-spoofing, serving both as feature extractors (e.g., wav2vec 2.0) and as classifiers. However, deep transformer stacks exhibit substantial representational redundancy across adjacent layers, with similarity increasing toward deeper layers. As a result, task-specific specialization is largely concentrated in the final layers, while shallow layers remain underutilized during fine-tuning. In this work, we analyze the layer-wise behavior of transformer-based classifiers for speech deepfake detection and propose a training strategy that explicitly aligns shallow and intermediate representations with those of the final transformer layer. By encouraging all layers to mimic the task-specialized representation learned at depth, the model more effectively exploits early-layer features while preserving discriminative capacity in deeper layers. This design improves robustness to unseen spoofing attacks and enhances out-of-domain generalization. Extensive experiments across multiple benchmark datasets demonstrate consistent performance gains over strong baselines.</abstract>
    <identifier type="citekey">tran-etal-2026-depth</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.318/</url>
    </location>
    <!-- Position of the paper within the host volume: issue date and page range. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>6380</start>
        <end>6397</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T When depth is redundant: Efficient transformer-based speech anti-spoofing
%A Tran, Hoan My
%A Lolive, Damien
%A Sini, Aghilas
%A Delhay, Arnaud
%A Marteau, Pierre-Francois
%A Guennec, David
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F tran-etal-2026-depth
%X Detecting speech deepfakes is critical for protecting society against fraud, identity theft, and the misuse of modern speech synthesis technologies. Despite recent progress, existing countermeasures often exhibit limited generalization to unseen spoofing attacks, particularly in out-of-domain evaluation settings, even when achieving strong in-domain performance. Transformer architectures have become ubiquitous in anti-spoofing, serving both as feature extractors (e.g., wav2vec 2.0) and as classifiers. However, deep transformer stacks exhibit substantial representational redundancy across adjacent layers, with similarity increasing toward deeper layers. As a result, task-specific specialization is largely concentrated in the final layers, while shallow layers remain underutilized during fine-tuning. In this work, we analyze the layer-wise behavior of transformer-based classifiers for speech deepfake detection and propose a training strategy that explicitly aligns shallow and intermediate representations with those of the final transformer layer. By encouraging all layers to mimic the task-specialized representation learned at depth, the model more effectively exploits early-layer features while preserving discriminative capacity in deeper layers. This design improves robustness to unseen spoofing attacks and enhances out-of-domain generalization. Extensive experiments across multiple benchmark datasets demonstrate consistent performance gains over strong baselines.
%U https://aclanthology.org/2026.findings-acl.318/
%P 6380-6397
Markdown (Informal)
[When depth is redundant: Efficient transformer-based speech anti-spoofing](https://aclanthology.org/2026.findings-acl.318/) (Tran et al., Findings 2026)
ACL