@comment{ACL Anthology record for Findings of ACL 2026, paper 2158. The literal set braces in the abstract are backslash-escaped so they survive typesetting; unescaped braces are TeX grouping and are silently dropped.}
@inproceedings{alavi-etal-2026-agents,
    title = "More Agents Improve Math Problem Solving but Adversarial Robustness Gap Persists",
    author = "Alavi, Khashayar and
      Yeltay, Zhastay and
      Flek, Lucie and
      Karimi, Akbar",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.findings-acl.2158/",
    doi = "10.18653/v1/2026.findings-acl.2158",
    pages = "43457--43475",
    isbn = "979-8-89176-395-1",
    abstract = "When LLM agents work together, they seem to be more powerful than a single LLM in mathematical question answering. However, are they also more robust to adversarial inputs? We investigate this question using adversarially perturbed math questions. These perturbations include punctuation noise with three intensities (10{\%}, 30{\%}, 50{\%}), plus real-world and human-like typos (WikiTypo, R2ATA). Using a unified sampling-and-voting framework (Agent Forest), we evaluate six open-source models (Qwen3-4B/14B, Llama3.1-8B, Mistral-7B, Gemma3-4B/12B) across four benchmarks (GSM8K, MATH, MMLU{--}Math, MultiArith), with various numbers of agents n = \{1,2,5,10,15,20,25\}. Our findings show that 1) Noise type matters: punctuation noise harm scales with its severity, and the human typos remain the dominant bottleneck, yielding the largest gaps to Clean accuracy and the highest attack success rate (ASR) even with a large number of agents; 2) Collaboration reliably improves accuracy as the number of agents, n, increases, with the largest gains from n=1 to n=5 and diminishing returns beyond n$\approx$10. However, the adversarial robustness gap persists regardless of the agent count."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="alavi-etal-2026-agents">
<titleInfo>
<title>More Agents Improve Math Problem Solving but Adversarial Robustness Gap Persists</title>
</titleInfo>
<name type="personal">
<namePart type="given">Khashayar</namePart>
<namePart type="family">Alavi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhastay</namePart>
<namePart type="family">Yeltay</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucie</namePart>
<namePart type="family">Flek</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akbar</namePart>
<namePart type="family">Karimi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>When LLM agents work together, they seem to be more powerful than a single LLM in mathematical question answering. However, are they also more robust to adversarial inputs? We investigate this question using adversarially perturbed math questions. These perturbations include punctuation noise with three intensities (10%, 30%, 50%), plus real-world and human-like typos (WikiTypo, R2ATA). Using a unified sampling-and-voting framework (Agent Forest), we evaluate six open-source models (Qwen3-4B/14B, Llama3.1-8B, Mistral-7B, Gemma3-4B/12B) across four benchmarks (GSM8K, MATH, MMLU–Math, MultiArith), with various numbers of agents n = {1,2,5,10,15,20,25}. Our findings show that 1) Noise type matters: punctuation noise harm scales with its severity, and the human typos remain the dominant bottleneck, yielding the largest gaps to Clean accuracy and the highest attack success rate (ASR) even with a large number of agents; 2) Collaboration reliably improves accuracy as the number of agents, n, increases, with the largest gains from n=1 to n=5 and diminishing returns beyond n≈10. However, the adversarial robustness gap persists regardless of the agent count.</abstract>
<identifier type="citekey">alavi-etal-2026-agents</identifier>
<identifier type="doi">10.18653/v1/2026.findings-acl.2158</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.2158/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>43457</start>
<end>43475</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T More Agents Improve Math Problem Solving but Adversarial Robustness Gap Persists
%A Alavi, Khashayar
%A Yeltay, Zhastay
%A Flek, Lucie
%A Karimi, Akbar
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F alavi-etal-2026-agents
%X When LLM agents work together, they seem to be more powerful than a single LLM in mathematical question answering. However, are they also more robust to adversarial inputs? We investigate this question using adversarially perturbed math questions. These perturbations include punctuation noise with three intensities (10%, 30%, 50%), plus real-world and human-like typos (WikiTypo, R2ATA). Using a unified sampling-and-voting framework (Agent Forest), we evaluate six open-source models (Qwen3-4B/14B, Llama3.1-8B, Mistral-7B, Gemma3-4B/12B) across four benchmarks (GSM8K, MATH, MMLU–Math, MultiArith), with various numbers of agents n = {1,2,5,10,15,20,25}. Our findings show that 1) Noise type matters: punctuation noise harm scales with its severity, and the human typos remain the dominant bottleneck, yielding the largest gaps to Clean accuracy and the highest attack success rate (ASR) even with a large number of agents; 2) Collaboration reliably improves accuracy as the number of agents, n, increases, with the largest gains from n=1 to n=5 and diminishing returns beyond n≈10. However, the adversarial robustness gap persists regardless of the agent count.
%R 10.18653/v1/2026.findings-acl.2158
%U https://aclanthology.org/2026.findings-acl.2158/
%U https://doi.org/10.18653/v1/2026.findings-acl.2158
%P 43457-43475
Markdown (Informal)
[More Agents Improve Math Problem Solving but Adversarial Robustness Gap Persists](https://aclanthology.org/2026.findings-acl.2158/) (Alavi et al., Findings 2026)
ACL