% Conference-paper record for the ACL Anthology item 2026.acl-srw.24.
% Text outside an entry is ignored by BibTeX, so these lines act as comments.
@inproceedings{tsai-etal-2026-shape,
    title     = {The Shape of Vulnerability: How Adversarial Perturbations Reshape the Topology of Language Model Latent Spaces},
    author    = {Tsai, Angelina and
                 Subramanian, Shreya and
                 Liu, Catherine and
                 Lopez, Kimberly and
                 Zinn-Brooks, Leif and
                 Schulz, Alexia E. and
                 Uchendu, Adaku},
    editor    = {T.Y.S.S., Santosh and
                 Rodriguez, Juan Diego and
                 de Gibert, Ona},
    booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.acl-srw.24/},
    pages     = {290--306},
    isbn      = {979-8-89176-393-7},
    abstract  = {Adversarial perturbations in the context of large language models (LLMs) are subtle changes added to input data (i.e., images or text) that are designed to alter predictions or outputs of machine learning models. We introduce several novel visualizations using topological data analysis (TDA) (leveraging persistent homology) to characterize how adversarial perturbations act on text inputs, specifically, how sandbagging and code-injection attacks alter the geometric structure of attention heads in transformer models. By computing persistent homology metrics from attention maps across different model architectures (such as BERT, RoBERTa, ELECTRA, DistilGPT, etc.), we find that adversarial inputs alter higher-dimensional topological features ($H_{1}$ loops and $H_{2}$ voids) in ways that distinguish them from clean, non-adversarial inputs.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey tsai-etal-2026-shape. -->
  <mods ID="tsai-etal-2026-shape">
    <titleInfo>
      <title>The Shape of Vulnerability: How Adversarial Perturbations Reshape the Topology of Language Model Latent Spaces</title>
    </titleInfo>
    <!-- Paper authors, in byline order. -->
    <name type="personal">
      <namePart type="given">Angelina</namePart>
      <namePart type="family">Tsai</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shreya</namePart>
      <namePart type="family">Subramanian</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Catherine</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kimberly</namePart>
      <namePart type="family">Lopez</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Leif</namePart>
      <namePart type="family">Zinn-Brooks</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alexia</namePart>
      <namePart type="given">E</namePart>
      <namePart type="family">Schulz</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Adaku</namePart>
      <namePart type="family">Uchendu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume (title, editors, publisher, place, ISBN). -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Santosh</namePart>
        <namePart type="family">T.Y.S.S.</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Juan</namePart>
        <namePart type="given">Diego</namePart>
        <namePart type="family">Rodriguez</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Ona</namePart>
        <namePart type="family">de Gibert</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-393-7</identifier>
    </relatedItem>
    <abstract>Adversarial perturbations in the context of large language models (LLMs) are subtle changes added to input data (i.e., images or text) that are designed to alter predictions or outputs of machine learning models. We introduce several novel visualizations using topological data analysis (TDA) (leveraging persistent homology) to characterize how adversarial perturbations act on text inputs, specifically, how sandbagging and code-injection attacks alter the geometric structure of attention heads in transformer models. By computing persistent homology metrics from attention maps across different model architectures (such as BERT, RoBERTa, ELECTRA, DistilGPT, etc.), we find that adversarial inputs alter higher-dimensional topological features (H₁ loops and H₂ voids) in ways that distinguish them from clean, non-adversarial inputs.</abstract>
    <identifier type="citekey">tsai-etal-2026-shape</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-srw.24/</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>290</start>
        <end>306</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T The Shape of Vulnerability: How Adversarial Perturbations Reshape the Topology of Language Model Latent Spaces
%A Tsai, Angelina
%A Subramanian, Shreya
%A Liu, Catherine
%A Lopez, Kimberly
%A Zinn-Brooks, Leif
%A Schulz, Alexia E.
%A Uchendu, Adaku
%Y T.Y.S.S., Santosh
%Y Rodriguez, Juan Diego
%Y de Gibert, Ona
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-393-7
%F tsai-etal-2026-shape
%X Adversarial perturbations in the context of large language models (LLMs) are subtle changes added to input data (i.e., images or text) that are designed to alter predictions or outputs of machine learning models. We introduce several novel visualizations using topological data analysis (TDA) (leveraging persistent homology) to characterize how adversarial perturbations act on text inputs, specifically, how sandbagging and code-injection attacks alter the geometric structure of attention heads in transformer models. By computing persistent homology metrics from attention maps across different model architectures (such as BERT, RoBERTa, ELECTRA, DistilGPT, etc.), we find that adversarial inputs alter higher-dimensional topological features (H₁ loops and H₂ voids) in ways that distinguish them from clean, non-adversarial inputs.
%U https://aclanthology.org/2026.acl-srw.24/
%P 290-306
Markdown (Informal)
[The Shape of Vulnerability: How Adversarial Perturbations Reshape the Topology of Language Model Latent Spaces](https://aclanthology.org/2026.acl-srw.24/) (Tsai et al., ACL 2026)
ACL
- Angelina Tsai, Shreya Subramanian, Catherine Liu, Kimberly Lopez, Leif Zinn-Brooks, Alexia E. Schulz, and Adaku Uchendu. 2026. The Shape of Vulnerability: How Adversarial Perturbations Reshape the Topology of Language Model Latent Spaces. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026), pages 290–306, San Diego, California, United States. Association for Computational Linguistics.