@comment{
  Conference paper record for ThinkSLM, published in the EMNLP 2025 proceedings.
  Notes for maintainers:
  - The abstract is stored as plain text with no inline LaTeX formatting, in
    line with the other export formats of this record found in the same file.
  - Case protection in the title braces the whole word, not single letters.
  - address is kept as exported; it looks like the conference location rather
    than the city of the publisher (NOTE for review: confirm before normalising).
}
@inproceedings{srivastava-etal-2025-thinkslm,
  title     = {{ThinkSLM}: Towards Reasoning in Small Language Models},
  author    = {Srivastava, Gaurav and
               Cao, Shuxiang and
               Wang, Xuan},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-main.1659/},
  doi       = {10.18653/v1/2025.emnlp-main.1659},
  pages     = {32612--32662},
  isbn      = {979-8-89176-332-6},
  abstract  = {Reasoning has long been viewed as an emergent property of large language models (LLMs). However, recent studies challenge this assumption, showing that small language models (SLMs) can also achieve competitive reasoning performance. This paper introduces ThinkSLM, the first extensive benchmark to systematically evaluate and study the reasoning abilities of SLMs trained from scratch or derived from LLMs through quantization, pruning, and distillation. We first establish a reliable evaluation criterion comparing available methods and LLM judges against our human evaluations. Then we present a study evaluating 72 diverse SLMs from six major model families across 17 reasoning benchmarks. We repeat all our experiments three times to ensure a robust assessment. Our findings show that: 1) reasoning ability in SLMs is strongly influenced by training methods and data quality rather than solely model scale; 2) quantization preserves reasoning capability, while pruning significantly disrupts it; 3) larger models consistently exhibit higher robustness against adversarial perturbations and intermediate reasoning, but certain smaller models closely match or exceed the larger models' performance. Our findings challenge the assumption that scaling is the only way to achieve strong reasoning. Instead, we foresee a future where SLMs with strong reasoning capabilities can be developed through structured training or post-training compression. Our ThinkSLM Leaderboard is publicly available at: https://ctrl-gaurav.github.io/thinkslm.github.io/.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID equals the citekey identifier further down. -->
  <mods ID="srivastava-etal-2025-thinkslm">

    <!-- Title of the paper. -->
    <titleInfo>
      <title>ThinkSLM: Towards Reasoning in Small Language Models</title>
    </titleInfo>

    <!-- Authors, in the order they are listed. -->
    <name type="personal">
      <namePart type="given">Gaurav</namePart>
      <namePart type="family">Srivastava</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Shuxiang</namePart>
      <namePart type="family">Cao</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Xuan</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <!-- Issue date of the paper (year-month). -->
    <originInfo>
      <dateIssued>2025-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Christos</namePart>
        <namePart type="family">Christodoulopoulos</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Tanmoy</namePart>
        <namePart type="family">Chakraborty</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Carolyn</namePart>
        <namePart type="family">Rose</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Violet</namePart>
        <namePart type="family">Peng</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Suzhou, China</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-332-6</identifier>
    </relatedItem>

    <!-- Abstract as plain text (no inline markup). -->
    <abstract>Reasoning has long been viewed as an emergent property of large language models (LLMs). However, recent studies challenge this assumption, showing that small language models (SLMs) can also achieve competitive reasoning performance. This paper introduces ThinkSLM, the first extensive benchmark to systematically evaluate and study the reasoning abilities of SLMs trained from scratch or derived from LLMs through quantization, pruning, and distillation. We first establish a reliable evaluation criterion comparing available methods and LLM judges against our human evaluations. Then we present a study evaluating 72 diverse SLMs from six major model families across 17 reasoning benchmarks. We repeat all our experiments three times to ensure a robust assessment. Our findings show that: 1) reasoning ability in SLMs is strongly influenced by training methods and data quality rather than solely model scale; 2) quantization preserves reasoning capability, while pruning significantly disrupts it; 3) larger models consistently exhibit higher robustness against adversarial perturbations and intermediate reasoning, but certain smaller models closely match or exceed the larger models’ performance. Our findings challenge the assumption that scaling is the only way to achieve strong reasoning. Instead, we foresee a future where SLMs with strong reasoning capabilities can be developed through structured training or post-training compression. Our ThinkSLM Leaderboard is publicly available at: https://ctrl-gaurav.github.io/thinkslm.github.io/.</abstract>

    <!-- Identifiers and landing page for the paper itself. -->
    <identifier type="citekey">srivastava-etal-2025-thinkslm</identifier>
    <identifier type="doi">10.18653/v1/2025.emnlp-main.1659</identifier>
    <location>
      <url>https://aclanthology.org/2025.emnlp-main.1659/</url>
    </location>

    <!-- Position within the host volume: date and page range. -->
    <part>
      <date>2025-11</date>
      <extent unit="page">
        <start>32612</start>
        <end>32662</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T ThinkSLM: Towards Reasoning in Small Language Models
%A Srivastava, Gaurav
%A Cao, Shuxiang
%A Wang, Xuan
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F srivastava-etal-2025-thinkslm
%X Reasoning has long been viewed as an emergent property of large language models (LLMs). However, recent studies challenge this assumption, showing that small language models (SLMs) can also achieve competitive reasoning performance. This paper introduces ThinkSLM, the first extensive benchmark to systematically evaluate and study the reasoning abilities of SLMs trained from scratch or derived from LLMs through quantization, pruning, and distillation. We first establish a reliable evaluation criterion comparing available methods and LLM judges against our human evaluations. Then we present a study evaluating 72 diverse SLMs from six major model families across 17 reasoning benchmarks. We repeat all our experiments three times to ensure a robust assessment. Our findings show that: 1) reasoning ability in SLMs is strongly influenced by training methods and data quality rather than solely model scale; 2) quantization preserves reasoning capability, while pruning significantly disrupts it; 3) larger models consistently exhibit higher robustness against adversarial perturbations and intermediate reasoning, but certain smaller models closely match or exceed the larger models’ performance. Our findings challenge the assumption that scaling is the only way to achieve strong reasoning. Instead, we foresee a future where SLMs with strong reasoning capabilities can be developed through structured training or post-training compression. Our ThinkSLM Leaderboard is publicly available at: https://ctrl-gaurav.github.io/thinkslm.github.io/.
%R 10.18653/v1/2025.emnlp-main.1659
%U https://aclanthology.org/2025.emnlp-main.1659/
%U https://doi.org/10.18653/v1/2025.emnlp-main.1659
%P 32612-32662
Markdown (Informal)
[ThinkSLM: Towards Reasoning in Small Language Models](https://aclanthology.org/2025.emnlp-main.1659/) (Srivastava et al., EMNLP 2025)
ACL
- Gaurav Srivastava, Shuxiang Cao, and Xuan Wang. 2025. ThinkSLM: Towards Reasoning in Small Language Models. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 32612–32662, Suzhou, China. Association for Computational Linguistics.