@inproceedings{dash-etal-2026-tokens,
title = "Not All Tokens Are Equal: Per-Dimension Top-K Pooling for Adversarially Robust {BERT} Classification",
author = "Dash, Manoranjan and
Aralikatti, Shivam Anand and
Sheth, Shanay and
Shinde, Pranav",
editor = "Mille, Simon and
Gehrmann, Sebastian and
Schmidtov{\'a}, Patr{\'i}cia and
Du{\v{s}}ek, Ond{\v{r}}ej and
Fadaee, Marzieh and
Lo, Kyle and
Santus, Enrico and
Stanovsky, Gabriel",
booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.gem-main.29/",
pages = "285--295",
ISBN = "979-8-89176-423-1",
abstract = "Contextual text classification with BERT typically relies on the [CLS] token representation for downstream prediction. While effective under standard conditions, [CLS]-based pooling is brittle under adversarial perturbation, as its single-vector representation is indiscriminately influenced by injected adversarial tokens. We propose Per-Dimension Top-K Average Pooling, a pooling strategy that, for each hidden dimension, selectively aggregates only the top-K token activations rather than the full sequence {---} effectively controlling which tokens contribute to the final representation. This token-level selectivity acts as a natural filter against adversarial injection: tokens that do not rank among the top-K for a given dimension are suppressed from aggregation. We evaluate our approach against CLS, Global Average Pooling (GAP), Global Max Pooling (GMP), and Hybrid variants across three text classification domains: spam detection (Enron and LingSpam), automated essay scoring (ASAP), and hate speech classification. On the Enron spam dataset under adversarial attack, our best Hybrid (K=3) variant reduces the Attack Success Rate from 70.65{\%} to 37.07{\%} while maintaining clean accuracy above 99{\%}, compared to CLS which degrades to 63.64{\%} adversarial accuracy. Representation-level analyses further corroborate these findings: Top-K pooling variants exhibit substantially lower cosine similarity shift under attack, and adversarially injected tokens enter the top-K selection in far fewer dimensions compared to CLS. These results suggest that per-dimension token selectivity offers a principled and lightweight mechanism for adversarial robustness in BERT-based classifiers without any modification to the underlying model architecture."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dash-etal-2026-tokens">
<titleInfo>
<title>Not All Tokens Are Equal: Per-Dimension Top-K Pooling for Adversarially Robust BERT Classification</title>
</titleInfo>
<name type="personal">
<namePart type="given">Manoranjan</namePart>
<namePart type="family">Dash</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shivam</namePart>
<namePart type="given">Anand</namePart>
<namePart type="family">Aralikatti</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shanay</namePart>
<namePart type="family">Sheth</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pranav</namePart>
<namePart type="family">Shinde</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Simon</namePart>
<namePart type="family">Mille</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Gehrmann</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrícia</namePart>
<namePart type="family">Schmidtová</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondřej</namePart>
<namePart type="family">Dušek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marzieh</namePart>
<namePart type="family">Fadaee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kyle</namePart>
<namePart type="family">Lo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gabriel</namePart>
<namePart type="family">Stanovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-423-1</identifier>
</relatedItem>
<abstract>Contextual text classification with BERT typically relies on the [CLS] token representation for downstream prediction. While effective under standard conditions, [CLS]-based pooling is brittle under adversarial perturbation, as its single-vector representation is indiscriminately influenced by injected adversarial tokens. We propose Per-Dimension Top-K Average Pooling, a pooling strategy that, for each hidden dimension, selectively aggregates only the top-K token activations rather than the full sequence — effectively controlling which tokens contribute to the final representation. This token-level selectivity acts as a natural filter against adversarial injection: tokens that do not rank among the top-K for a given dimension are suppressed from aggregation. We evaluate our approach against CLS, Global Average Pooling (GAP), Global Max Pooling (GMP), and Hybrid variants across three text classification domains: spam detection (Enron and LingSpam), automated essay scoring (ASAP), and hate speech classification. On the Enron spam dataset under adversarial attack, our best Hybrid (K=3) variant reduces the Attack Success Rate from 70.65% to 37.07% while maintaining clean accuracy above 99%, compared to CLS which degrades to 63.64% adversarial accuracy. Representation-level analyses further corroborate these findings: Top-K pooling variants exhibit substantially lower cosine similarity shift under attack, and adversarially injected tokens enter the top-K selection in far fewer dimensions compared to CLS. These results suggest that per-dimension token selectivity offers a principled and lightweight mechanism for adversarial robustness in BERT-based classifiers without any modification to the underlying model architecture.</abstract>
<identifier type="citekey">dash-etal-2026-tokens</identifier>
<location>
<url>https://aclanthology.org/2026.gem-main.29/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>285</start>
<end>295</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Not All Tokens Are Equal: Per-Dimension Top-K Pooling for Adversarially Robust BERT Classification
%A Dash, Manoranjan
%A Aralikatti, Shivam Anand
%A Sheth, Shanay
%A Shinde, Pranav
%Y Mille, Simon
%Y Gehrmann, Sebastian
%Y Schmidtová, Patrícia
%Y Dušek, Ondřej
%Y Fadaee, Marzieh
%Y Lo, Kyle
%Y Santus, Enrico
%Y Stanovsky, Gabriel
%S Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics (GEM)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-423-1
%F dash-etal-2026-tokens
%X Contextual text classification with BERT typically relies on the [CLS] token representation for downstream prediction. While effective under standard conditions, [CLS]-based pooling is brittle under adversarial perturbation, as its single-vector representation is indiscriminately influenced by injected adversarial tokens. We propose Per-Dimension Top-K Average Pooling, a pooling strategy that, for each hidden dimension, selectively aggregates only the top-K token activations rather than the full sequence — effectively controlling which tokens contribute to the final representation. This token-level selectivity acts as a natural filter against adversarial injection: tokens that do not rank among the top-K for a given dimension are suppressed from aggregation. We evaluate our approach against CLS, Global Average Pooling (GAP), Global Max Pooling (GMP), and Hybrid variants across three text classification domains: spam detection (Enron and LingSpam), automated essay scoring (ASAP), and hate speech classification. On the Enron spam dataset under adversarial attack, our best Hybrid (K=3) variant reduces the Attack Success Rate from 70.65% to 37.07% while maintaining clean accuracy above 99%, compared to CLS which degrades to 63.64% adversarial accuracy. Representation-level analyses further corroborate these findings: Top-K pooling variants exhibit substantially lower cosine similarity shift under attack, and adversarially injected tokens enter the top-K selection in far fewer dimensions compared to CLS. These results suggest that per-dimension token selectivity offers a principled and lightweight mechanism for adversarial robustness in BERT-based classifiers without any modification to the underlying model architecture.
%U https://aclanthology.org/2026.gem-main.29/
%P 285-295
Markdown (Informal)
[Not All Tokens Are Equal: Per-Dimension Top-K Pooling for Adversarially Robust BERT Classification](https://aclanthology.org/2026.gem-main.29/) (Dash et al., GEM 2026)
ACL