@inproceedings{moffett-dhingra-2025-close,
    title = "Close or Cloze? Assessing the Robustness of Large Language Models to Adversarial Perturbations via Word Recovery",
    author = "Moffett, Luke and
      Dhingra, Bhuwan",
    editor = "Rambow, Owen and
      Wanner, Leo and
      Apidianaki, Marianna and
      Al-Khalifa, Hend and
      Di Eugenio, Barbara and
      Schockaert, Steven",
    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
    month = jan,
    year = "2025",
    address = "Abu Dhabi, UAE",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.coling-main.467/",
    pages = "6999--7019",
    abstract = "The current generation of large language models (LLMs) shows a surprising degree of robustness to adversarial perturbations, but it is unclear when these models implicitly recover the original text and when they rely on surrounding context. To isolate this recovery faculty of language models, we study a new diagnostic task {---} Adversarial Word Recovery {---} an extension of spellchecking where the inputs may be adversarial. We collect a new dataset using 9 popular perturbation attack strategies from the literature and organize them using a taxonomy of phonetic, typo, and visual attacks. We use this dataset to study the word recovery performance of the current generation of LLMs, finding that proprietary models (GPT-4, GPT-3.5, and PaLM 2) match or surpass human performance. Conversely, open-source models (Llama-2, Mistral, Falcon) show a material gap relative to human performance, especially on visual attacks. For these open models, we show that word recovery performance without context correlates with word recovery performance with context and ultimately affects downstream performance on a hateful, offensive, and toxic content classification task. Finally, to show that improving word recovery can improve robustness, we mitigate these attacks with a small ByT5 model tuned to recover visually attacked words."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="moffett-dhingra-2025-close">
<titleInfo>
<title>Close or Cloze? Assessing the Robustness of Large Language Models to Adversarial Perturbations via Word Recovery</title>
</titleInfo>
<name type="personal">
<namePart type="given">Luke</namePart>
<namePart type="family">Moffett</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bhuwan</namePart>
<namePart type="family">Dhingra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-01</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 31st International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Owen</namePart>
<namePart type="family">Rambow</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marianna</namePart>
<namePart type="family">Apidianaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hend</namePart>
<namePart type="family">Al-Khalifa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Barbara</namePart>
<namePart type="given">Di</namePart>
<namePart type="family">Eugenio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Steven</namePart>
<namePart type="family">Schockaert</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Abu Dhabi, UAE</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
    <abstract>The current generation of large language models (LLMs) shows a surprising degree of robustness to adversarial perturbations, but it is unclear when these models implicitly recover the original text and when they rely on surrounding context. To isolate this recovery faculty of language models, we study a new diagnostic task — Adversarial Word Recovery — an extension of spellchecking where the inputs may be adversarial. We collect a new dataset using 9 popular perturbation attack strategies from the literature and organize them using a taxonomy of phonetic, typo, and visual attacks. We use this dataset to study the word recovery performance of the current generation of LLMs, finding that proprietary models (GPT-4, GPT-3.5, and PaLM 2) match or surpass human performance. Conversely, open-source models (Llama-2, Mistral, Falcon) show a material gap relative to human performance, especially on visual attacks. For these open models, we show that word recovery performance without context correlates with word recovery performance with context and ultimately affects downstream performance on a hateful, offensive, and toxic content classification task. Finally, to show that improving word recovery can improve robustness, we mitigate these attacks with a small ByT5 model tuned to recover visually attacked words.</abstract>
<identifier type="citekey">moffett-dhingra-2025-close</identifier>
<location>
<url>https://aclanthology.org/2025.coling-main.467/</url>
</location>
<part>
<date>2025-01</date>
<extent unit="page">
<start>6999</start>
<end>7019</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Close or Cloze? Assessing the Robustness of Large Language Models to Adversarial Perturbations via Word Recovery
%A Moffett, Luke
%A Dhingra, Bhuwan
%Y Rambow, Owen
%Y Wanner, Leo
%Y Apidianaki, Marianna
%Y Al-Khalifa, Hend
%Y Di Eugenio, Barbara
%Y Schockaert, Steven
%S Proceedings of the 31st International Conference on Computational Linguistics
%D 2025
%8 January
%I Association for Computational Linguistics
%C Abu Dhabi, UAE
%F moffett-dhingra-2025-close
%X The current generation of large language models (LLMs) shows a surprising degree of robustness to adversarial perturbations, but it is unclear when these models implicitly recover the original text and when they rely on surrounding context. To isolate this recovery faculty of language models, we study a new diagnostic task — Adversarial Word Recovery — an extension of spellchecking where the inputs may be adversarial. We collect a new dataset using 9 popular perturbation attack strategies from the literature and organize them using a taxonomy of phonetic, typo, and visual attacks. We use this dataset to study the word recovery performance of the current generation of LLMs, finding that proprietary models (GPT-4, GPT-3.5, and PaLM 2) match or surpass human performance. Conversely, open-source models (Llama-2, Mistral, Falcon) show a material gap relative to human performance, especially on visual attacks. For these open models, we show that word recovery performance without context correlates with word recovery performance with context and ultimately affects downstream performance on a hateful, offensive, and toxic content classification task. Finally, to show that improving word recovery can improve robustness, we mitigate these attacks with a small ByT5 model tuned to recover visually attacked words.
%U https://aclanthology.org/2025.coling-main.467/
%P 6999-7019
Markdown (Informal)
[Close or Cloze? Assessing the Robustness of Large Language Models to Adversarial Perturbations via Word Recovery](https://aclanthology.org/2025.coling-main.467/) (Moffett & Dhingra, COLING 2025)
ACL
Luke Moffett and Bhuwan Dhingra. 2025. Close or Cloze? Assessing the Robustness of Large Language Models to Adversarial Perturbations via Word Recovery. In Proceedings of the 31st International Conference on Computational Linguistics, pages 6999–7019, Abu Dhabi, UAE. Association for Computational Linguistics.