@inproceedings{wang-etal-2026-reflect,
title = "Reflect, Rewrite, Repeat: How Simple Arithmetic Enables Advanced Reasoning in Small Language Models",
author = "Wang, Mengdie Flora and
Xie, Haochen and
Kim, Mun Young and
Chaudhury, Baishali and
Ashok, Meghana and
Gunturu, Suren and
Hong, Sungmin and
Woo, Jae Oh",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'i}s",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.findings-eacl.69/",
pages = "1341--1363",
ISBN = "979-8-89176-386-9",
abstract = "Contemporary advancements in language model reasoning typically require computationally intensive reinforcement learning (RL) and massive datasets, creating barriers for resource-constrained teams. In this work, we demonstrate that high-quality, iterative training on minimal data can rival modern RL approaches. We introduce a resource-efficient framework that combines Direct Preference Optimization (DPO) and Supervised Fine-Tuning (SFT) with selective guidance from larger models, iteratively refining solutions through a ``reflect, rewrite, repeat'' cycle (R$^3$). Using Qwen 2.5 7B and Qwen 2.5 Math 7B as base models, our method shows meaningful performance improvements across arithmetic, symbolic and cognitive reasoning benchmarks{---}including GSM8K (83.1{\%} {\textrightarrow} 88.6{\%}), AIME{'}25@10 (20.0{\%} {\textrightarrow} 30.0{\%}) and LastLetterConcat (40.7{\%} {\textrightarrow} 53.3{\%}) problems. The model-agnostic nature of our R$^3$ framework is further demonstrated through substantial improvements when applied to Mistral and LLaMA-based models. Remarkably, these gains are achieved using mere 700 basic arithmetic training samples, in stark contrast to the hundreds of thousands of examples typically required by RL-based systems. Our results suggest that reasoning improvements need not strictly depend on large-scale data. By emphasizing strategically curated training grounded in foundational principles, we achieve competitive generalization with minimal resource overhead. Our R$^3$ pipeline also generates high-quality SFT data with high-fidelity reasoning traces as byproduct, further enabling scalable and annotation-free fine-tuning. Code is available.[{\ensuremath{<}}https://github.com/aws-samples/sample-for-reflect-rewrite-repeat{\ensuremath{>}}]"
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wang-etal-2026-reflect">
<titleInfo>
<title>Reflect, Rewrite, Repeat: How Simple Arithmetic Enables Advanced Reasoning in Small Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Mengdie</namePart>
<namePart type="given">Flora</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haochen</namePart>
<namePart type="family">Xie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mun</namePart>
<namePart type="given">Young</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Baishali</namePart>
<namePart type="family">Chaudhury</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Meghana</namePart>
<namePart type="family">Ashok</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Suren</namePart>
<namePart type="family">Gunturu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sungmin</namePart>
<namePart type="family">Hong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jae</namePart>
<namePart type="given">Oh</namePart>
<namePart type="family">Woo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EACL 2026</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-386-9</identifier>
</relatedItem>
<abstract>Contemporary advancements in language model reasoning typically require computationally intensive reinforcement learning (RL) and massive datasets, creating barriers for resource-constrained teams. In this work, we demonstrate that high-quality, iterative training on minimal data can rival modern RL approaches. We introduce a resource-efficient framework that combines Direct Preference Optimization (DPO) and Supervised Fine-Tuning (SFT) with selective guidance from larger models, iteratively refining solutions through a “reflect, rewrite, repeat” cycle (R³). Using Qwen 2.5 7B and Qwen 2.5 Math 7B as base models, our method shows meaningful performance improvements across arithmetic, symbolic and cognitive reasoning benchmarks—including GSM8K (83.1% → 88.6%), AIME’25@10 (20.0% → 30.0%) and LastLetterConcat (40.7% → 53.3%) problems. The model-agnostic nature of our R³ framework is further demonstrated through substantial improvements when applied to Mistral and LLaMA-based models. Remarkably, these gains are achieved using mere 700 basic arithmetic training samples, in stark contrast to the hundreds of thousands of examples typically required by RL-based systems. Our results suggest that reasoning improvements need not strictly depend on large-scale data. By emphasizing strategically curated training grounded in foundational principles, we achieve competitive generalization with minimal resource overhead. Our R³ pipeline also generates high-quality SFT data with high-fidelity reasoning traces as byproduct, further enabling scalable and annotation-free fine-tuning. Code is available.[&lt;https://github.com/aws-samples/sample-for-reflect-rewrite-repeat&gt;]</abstract>
<identifier type="citekey">wang-etal-2026-reflect</identifier>
<location>
<url>https://aclanthology.org/2026.findings-eacl.69/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>1341</start>
<end>1363</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Reflect, Rewrite, Repeat: How Simple Arithmetic Enables Advanced Reasoning in Small Language Models
%A Wang, Mengdie Flora
%A Xie, Haochen
%A Kim, Mun Young
%A Chaudhury, Baishali
%A Ashok, Meghana
%A Gunturu, Suren
%A Hong, Sungmin
%A Woo, Jae Oh
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Findings of the Association for Computational Linguistics: EACL 2026
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-386-9
%F wang-etal-2026-reflect
%X Contemporary advancements in language model reasoning typically require computationally intensive reinforcement learning (RL) and massive datasets, creating barriers for resource-constrained teams. In this work, we demonstrate that high-quality, iterative training on minimal data can rival modern RL approaches. We introduce a resource-efficient framework that combines Direct Preference Optimization (DPO) and Supervised Fine-Tuning (SFT) with selective guidance from larger models, iteratively refining solutions through a “reflect, rewrite, repeat” cycle (R³). Using Qwen 2.5 7B and Qwen 2.5 Math 7B as base models, our method shows meaningful performance improvements across arithmetic, symbolic and cognitive reasoning benchmarks—including GSM8K (83.1% → 88.6%), AIME’25@10 (20.0% → 30.0%) and LastLetterConcat (40.7% → 53.3%) problems. The model-agnostic nature of our R³ framework is further demonstrated through substantial improvements when applied to Mistral and LLaMA-based models. Remarkably, these gains are achieved using mere 700 basic arithmetic training samples, in stark contrast to the hundreds of thousands of examples typically required by RL-based systems. Our results suggest that reasoning improvements need not strictly depend on large-scale data. By emphasizing strategically curated training grounded in foundational principles, we achieve competitive generalization with minimal resource overhead. Our R³ pipeline also generates high-quality SFT data with high-fidelity reasoning traces as byproduct, further enabling scalable and annotation-free fine-tuning. Code is available.[<https://github.com/aws-samples/sample-for-reflect-rewrite-repeat>]
%U https://aclanthology.org/2026.findings-eacl.69/
%P 1341-1363
Markdown (Informal)
[Reflect, Rewrite, Repeat: How Simple Arithmetic Enables Advanced Reasoning in Small Language Models](https://aclanthology.org/2026.findings-eacl.69/) (Wang et al., Findings 2026)
ACL
- Mengdie Flora Wang, Haochen Xie, Mun Young Kim, Baishali Chaudhury, Meghana Ashok, Suren Gunturu, Sungmin Hong, and Jae Oh Woo. 2026. Reflect, Rewrite, Repeat: How Simple Arithmetic Enables Advanced Reasoning in Small Language Models. In Findings of the Association for Computational Linguistics: EACL 2026, pages 1341–1363, Rabat, Morocco. Association for Computational Linguistics.