% Findings of ACL 2026 paper; record 2026.findings-acl.1748 on aclanthology.org.
@inproceedings{zhang-etal-2026-gsm,
  title     = {{GSM}-Noise: Exploring and Enhancing Large Language Models' Reasoning under Noisy Inputs},
  author    = {Zhang, Zhengxin and
               Huang, Chengyu and
               Liu, Xufu and
               Zhao, Dan and
               Su, Jinyan and
               Cardie, Claire},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1748/},
  pages     = {35020--35045},
  isbn      = {979-8-89176-395-1},
  abstract  = {Large language models (LLMs) have demonstrated impressive reasoning capabilities, yet they often struggle when dealing with complex, ill-formed, or noisy inputs that frequently occur in interactions with real users. LLMs typically lack crucial refining capabilities needed to filter out irrelevant details, restructure key points before reasoning over the text and responding, resulting in suboptimal performance and incorrect answers. From an information theory perspective, this behavior is akin to decoding a high-entropy problem without first reducing its entropy. In this work, we first introduce GSM-Noise, a benchmark featuring grade-school math problems systematically perturbed to reflect real-world input variability. We show that the reasoning ability of open-source models (e.g., LLaMA and Qwen series) can be compromised by noise, while closed-source models are more robust. To improve LLM robustness under noisy conditions, we propose that LLMs first refine inputs {---} thereby reducing their entropy {---} before engaging in in-depth analysis. We investigate three approaches to instill this refinement capability: prompt engineering (PE), supervised finetuning (SFT), and reinforcement learning (RL). Experimental results show that input refinement leads to consistent performance gains: 2{--}12{\%} with PE, 4{--}13{\%} with SFT, and 3{--}25{\%} with RL. These results highlight the importance of incorporating an explicit refinement phase to enhance the robustness and reliability of LLM reasoning in real-world scenarios.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey zhang-etal-2026-gsm. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="zhang-etal-2026-gsm">
    <titleInfo>
      <title>GSM-Noise: Exploring and Enhancing Large Language Models’ Reasoning under Noisy Inputs</title>
    </titleInfo>
    <!-- Authors of the paper -->
    <name type="personal">
      <namePart type="given">Zhengxin</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chengyu</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xufu</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Dan</namePart>
      <namePart type="family">Zhao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jinyan</namePart>
      <namePart type="family">Su</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Claire</namePart>
      <namePart type="family">Cardie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host volume: proceedings title, editors, publisher, place, genre and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Large language models (LLMs) have demonstrated impressive reasoning capabilities, yet they often struggle when dealing with complex, ill-formed, or noisy inputs that frequently occur in interactions with real users. LLMs typically lack crucial refining capabilities needed to filter out irrelevant details, restructure key points before reasoning over the text and responding, resulting in suboptimal performance and incorrect answers. From an information theory perspective, this behavior is akin to decoding a high-entropy problem without first reducing its entropy. In this work, we first introduce GSM-Noise, a benchmark featuring grade-school math problems systematically perturbed to reflect real-world input variability. We show that the reasoning ability of open-source models (e.g., LLaMA and Qwen series) can be compromised by noise, while closed-source models are more robust. To improve LLM robustness under noisy conditions, we propose that LLMs first refine inputs — thereby reducing their entropy — before engaging in in-depth analysis. We investigate three approaches to instill this refinement capability: prompt engineering (PE), supervised finetuning (SFT), and reinforcement learning (RL). Experimental results show that input refinement leads to consistent performance gains: 2–12% with PE, 4–13% with SFT, and 3–25% with RL. These results highlight the importance of incorporating an explicit refinement phase to enhance the robustness and reliability of LLM reasoning in real-world scenarios.</abstract>
    <identifier type="citekey">zhang-etal-2026-gsm</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1748/</url>
    </location>
    <!-- Issue date and page extent of this paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>35020</start>
        <end>35045</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T GSM-Noise: Exploring and Enhancing Large Language Models’ Reasoning under Noisy Inputs
%A Zhang, Zhengxin
%A Huang, Chengyu
%A Liu, Xufu
%A Zhao, Dan
%A Su, Jinyan
%A Cardie, Claire
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F zhang-etal-2026-gsm
%X Large language models (LLMs) have demonstrated impressive reasoning capabilities, yet they often struggle when dealing with complex, ill-formed, or noisy inputs that frequently occur in interactions with real users. LLMs typically lack crucial refining capabilities needed to filter out irrelevant details, restructure key points before reasoning over the text and responding, resulting in suboptimal performance and incorrect answers. From an information theory perspective, this behavior is akin to decoding a high-entropy problem without first reducing its entropy. In this work, we first introduce GSM-Noise, a benchmark featuring grade-school math problems systematically perturbed to reflect real-world input variability. We show that the reasoning ability of open-source models (e.g., LLaMA and Qwen series) can be compromised by noise, while closed-source models are more robust. To improve LLM robustness under noisy conditions, we propose that LLMs first refine inputs — thereby reducing their entropy — before engaging in in-depth analysis. We investigate three approaches to instill this refinement capability: prompt engineering (PE), supervised finetuning (SFT), and reinforcement learning (RL). Experimental results show that input refinement leads to consistent performance gains: 2–12% with PE, 4–13% with SFT, and 3–25% with RL. These results highlight the importance of incorporating an explicit refinement phase to enhance the robustness and reliability of LLM reasoning in real-world scenarios.
%U https://aclanthology.org/2026.findings-acl.1748/
%P 35020-35045
Markdown (Informal)
[GSM-Noise: Exploring and Enhancing Large Language Models’ Reasoning under Noisy Inputs](https://aclanthology.org/2026.findings-acl.1748/) (Zhang et al., Findings 2026)
ACL