@comment{Entry for the ACL Anthology record 2026.acl-long.844 (see the url field).}
@inproceedings{yuan-etal-2026-curing,
    title     = "Curing Miracle Steps in {LLM} Mathematical Reasoning with Rubric Rewards",
    author    = "Yuan, Youliang and
                 Mang, Qiuyang and
                 Chen, Jingbang and
                 Wan, Hong and
                 Liu, Xiaoyuan and
                 Xu, Junjielong and
                 Huang, Jen-tse and
                 Wang, Wenxuan and
                 Jiao, Wenxiang and
                 He, Pinjia",
    editor    = "Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David",
    booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
    month     = jul,
    year      = "2026",
    address   = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url       = "https://aclanthology.org/2026.acl-long.844/",
    pages     = "18556--18577",
    isbn      = "979-8-89176-390-6",
    abstract  = "In this paper, we observe that current models are susceptible to reward hacking, leading to a substantial overestimation of a model{'}s reasoning ability. This is evidenced by a high incidence of ``false positives''{---}solutions that reach the correct answer through an unsound process. Through a systematic analysis with human verification, we establish a taxonomy of these failure modes, identifying patterns like Miracle Steps{---}abrupt jumps to a correct output without a valid preceding derivation. Probing experiments suggest that these Miracle Steps are linked to answer-recall shortcuts, including memorization from pretraining, where the model accesses the correct answer independently of its reasoning chain. To mitigate this systemic issue, we introduce the Rubric Reward Model (RRM), a process-oriented reward function that evaluates the entire reasoning trajectory against problem-specific rubrics. The RRM explicitly penalizes logical flaws and encourages rigorous deduction. When integrated into an RL pipeline, RRM-based training consistently outperforms outcome-only supervision across four math benchmarks. Notably, it boosts Verified Pass@1024 on AIME2024 from 26.7{\%} to 62.6{\%} and reduces the incidence of Miracle Steps by 71{\%}. Our work demonstrates that rewarding the solution process is crucial for building accurate and reliable models."
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 record for citekey yuan-etal-2026-curing: ten authors, followed by the host proceedings volume (four editors) as a relatedItem. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="yuan-etal-2026-curing">
    <titleInfo>
      <title>Curing Miracle Steps in LLM Mathematical Reasoning with Rubric Rewards</title>
    </titleInfo>
    <!-- Authors, in publication order -->
    <name type="personal">
      <namePart type="given">Youliang</namePart>
      <namePart type="family">Yuan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Qiuyang</namePart>
      <namePart type="family">Mang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jingbang</namePart>
      <namePart type="family">Chen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hong</namePart>
      <namePart type="family">Wan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xiaoyuan</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Junjielong</namePart>
      <namePart type="family">Xu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Jen-tse</namePart>
      <namePart type="family">Huang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenxuan</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wenxiang</namePart>
      <namePart type="family">Jiao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pinjia</namePart>
      <namePart type="family">He</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-390-6</identifier>
    </relatedItem>
    <abstract>In this paper, we observe that current models are susceptible to reward hacking, leading to a substantial overestimation of a model’s reasoning ability. This is evidenced by a high incidence of “false positives”—solutions that reach the correct answer through an unsound process. Through a systematic analysis with human verification, we establish a taxonomy of these failure modes, identifying patterns like Miracle Steps—abrupt jumps to a correct output without a valid preceding derivation. Probing experiments suggest that these Miracle Steps are linked to answer-recall shortcuts, including memorization from pretraining, where the model accesses the correct answer independently of its reasoning chain. To mitigate this systemic issue, we introduce the Rubric Reward Model (RRM), a process-oriented reward function that evaluates the entire reasoning trajectory against problem-specific rubrics. The RRM explicitly penalizes logical flaws and encourages rigorous deduction. When integrated into an RL pipeline, RRM-based training consistently outperforms outcome-only supervision across four math benchmarks. Notably, it boosts Verified Pass@1024 on AIME2024 from 26.7% to 62.6% and reduces the incidence of Miracle Steps by 71%. Our work demonstrates that rewarding the solution process is crucial for building accurate and reliable models.</abstract>
    <identifier type="citekey">yuan-etal-2026-curing</identifier>
    <location>
      <url>https://aclanthology.org/2026.acl-long.844/</url>
    </location>
    <!-- Page range of this paper within the host volume -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>18556</start>
        <end>18577</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Curing Miracle Steps in LLM Mathematical Reasoning with Rubric Rewards
%A Yuan, Youliang
%A Mang, Qiuyang
%A Chen, Jingbang
%A Wan, Hong
%A Liu, Xiaoyuan
%A Xu, Junjielong
%A Huang, Jen-tse
%A Wang, Wenxuan
%A Jiao, Wenxiang
%A He, Pinjia
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-390-6
%F yuan-etal-2026-curing
%X In this paper, we observe that current models are susceptible to reward hacking, leading to a substantial overestimation of a model’s reasoning ability. This is evidenced by a high incidence of “false positives”—solutions that reach the correct answer through an unsound process. Through a systematic analysis with human verification, we establish a taxonomy of these failure modes, identifying patterns like Miracle Steps—abrupt jumps to a correct output without a valid preceding derivation. Probing experiments suggest that these Miracle Steps are linked to answer-recall shortcuts, including memorization from pretraining, where the model accesses the correct answer independently of its reasoning chain. To mitigate this systemic issue, we introduce the Rubric Reward Model (RRM), a process-oriented reward function that evaluates the entire reasoning trajectory against problem-specific rubrics. The RRM explicitly penalizes logical flaws and encourages rigorous deduction. When integrated into an RL pipeline, RRM-based training consistently outperforms outcome-only supervision across four math benchmarks. Notably, it boosts Verified Pass@1024 on AIME2024 from 26.7% to 62.6% and reduces the incidence of Miracle Steps by 71%. Our work demonstrates that rewarding the solution process is crucial for building accurate and reliable models.
%U https://aclanthology.org/2026.acl-long.844/
%P 18556-18577
Markdown (Informal)
[Curing Miracle Steps in LLM Mathematical Reasoning with Rubric Rewards](https://aclanthology.org/2026.acl-long.844/) (Yuan et al., ACL 2026)
ACL
- Youliang Yuan, Qiuyang Mang, Jingbang Chen, Hong Wan, Xiaoyuan Liu, Junjielong Xu, Jen-tse Huang, Wenxuan Wang, Wenxiang Jiao, and Pinjia He. 2026. Curing Miracle Steps in LLM Mathematical Reasoning with Rubric Rewards. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 18556–18577, San Diego, California, United States. Association for Computational Linguistics.