@inproceedings{hou-etal-2026-toward,
title = "Toward Automated Robustness Evaluation of Mathematical Reasoning",
author = "Hou, Yutao and
Xiao, Zeguan and
Yu, Fei and
Jiang, Yihan and
Shuguang, Ma and
Dai, Zhaoqian and
Huang, Hailiang and
Chen, Yun and
Chen, Guanhua",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-acl.691/",
pages = "14117--14139",
ISBN = "979-8-89176-395-1",
abstract = "Large Language Models (LLMs) have demonstrated remarkable capabilities in various reasoning-intensive tasks. However, these models exhibit unexpected brittleness, often failing on simple variations of the same underlying task. Existing robustness evaluations predominantly rely on hand-crafted templates or a limited set of perturbation rules. Consequently, such approaches lack the adaptability to probe latent vulnerabilities unique to specific models and remain susceptible to data contamination. To address this, we propose the Math Stress Tester (MaSTer), an automated framework inspired by software stress testing. MaSTer generates adversarial variants via a multi-round rewrite-verify loop, ensuring semantic consistency while successfully inducing model failure. Our framework generates benchmark variants dynamically for each LLM, thus minimizing the risk of data contamination. Experiments on GSM8K and MATH-500 demonstrate the effectiveness of MaSTer on mathematical tasks. Additionally, we validate the framework{'}s extensibility to non-mathematical tasks, highlighting its broad applicability. Furthermore, we demonstrate that the synthesized variants generated by MaSTer can be utilized as a fine-tuning dataset to significantly enhance the model{'}s robustness."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hou-etal-2026-toward">
<titleInfo>
<title>Toward Automated Robustness Evaluation of Mathematical Reasoning</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yutao</namePart>
<namePart type="family">Hou</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zeguan</namePart>
<namePart type="family">Xiao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yihan</namePart>
<namePart type="family">Jiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ma</namePart>
<namePart type="family">Shuguang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhaoqian</namePart>
<namePart type="family">Dai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hailiang</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Guanhua</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Maria</namePart>
<namePart type="family">Liakata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Viviane</namePart>
<namePart type="given">P</namePart>
<namePart type="family">Moreira</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiajun</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Jurgens</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, United States</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-395-1</identifier>
</relatedItem>
<abstract>Large Language Models (LLMs) have demonstrated remarkable capabilities in various reasoning-intensive tasks. However, these models exhibit unexpected brittleness, often failing on simple variations of the same underlying task. Existing robustness evaluations predominantly rely on hand-crafted templates or a limited set of perturbation rules. Consequently, such approaches lack the adaptability to probe latent vulnerabilities unique to specific models and remain susceptible to data contamination. To address this, we propose the Math Stress Tester (MaSTer), an automated framework inspired by software stress testing. MaSTer generates adversarial variants via a multi-round rewrite-verify loop, ensuring semantic consistency while successfully inducing model failure. Our framework generates benchmark variants dynamically for each LLM, thus minimizing the risk of data contamination. Experiments on GSM8K and MATH-500 demonstrate the effectiveness of MaSTer on mathematical tasks. Additionally, we validate the framework’s extensibility to non-mathematical tasks, highlighting its broad applicability. Furthermore, we demonstrate that the synthesized variants generated by MaSTer can be utilized as a fine-tuning dataset to significantly enhance the model’s robustness.</abstract>
<identifier type="citekey">hou-etal-2026-toward</identifier>
<location>
<url>https://aclanthology.org/2026.findings-acl.691/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>14117</start>
<end>14139</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Toward Automated Robustness Evaluation of Mathematical Reasoning
%A Hou, Yutao
%A Xiao, Zeguan
%A Yu, Fei
%A Jiang, Yihan
%A Shuguang, Ma
%A Dai, Zhaoqian
%A Huang, Hailiang
%A Chen, Yun
%A Chen, Guanhua
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F hou-etal-2026-toward
%X Large Language Models (LLMs) have demonstrated remarkable capabilities in various reasoning-intensive tasks. However, these models exhibit unexpected brittleness, often failing on simple variations of the same underlying task. Existing robustness evaluations predominantly rely on hand-crafted templates or a limited set of perturbation rules. Consequently, such approaches lack the adaptability to probe latent vulnerabilities unique to specific models and remain susceptible to data contamination. To address this, we propose the Math Stress Tester (MaSTer), an automated framework inspired by software stress testing. MaSTer generates adversarial variants via a multi-round rewrite-verify loop, ensuring semantic consistency while successfully inducing model failure. Our framework generates benchmark variants dynamically for each LLM, thus minimizing the risk of data contamination. Experiments on GSM8K and MATH-500 demonstrate the effectiveness of MaSTer on mathematical tasks. Additionally, we validate the framework’s extensibility to non-mathematical tasks, highlighting its broad applicability. Furthermore, we demonstrate that the synthesized variants generated by MaSTer can be utilized as a fine-tuning dataset to significantly enhance the model’s robustness.
%U https://aclanthology.org/2026.findings-acl.691/
%P 14117-14139
Markdown (Informal)
[Toward Automated Robustness Evaluation of Mathematical Reasoning](https://aclanthology.org/2026.findings-acl.691/) (Hou et al., Findings 2026)
ACL
- Yutao Hou, Zeguan Xiao, Fei Yu, Yihan Jiang, Ma Shuguang, Zhaoqian Dai, Hailiang Huang, Yun Chen, and Guanhua Chen. 2026. Toward Automated Robustness Evaluation of Mathematical Reasoning. In Findings of the Association for Computational Linguistics: ACL 2026, pages 14117–14139, San Diego, California, United States. Association for Computational Linguistics.