@inproceedings{chen-etal-2025-war,
title = "War of Thoughts: Competition Stimulates Stronger Reasoning in Large Language Models",
author = "Chen, Yibin and
Liu, Jinyi and
Zheng, Yan and
Yuan, Yifu and
Hao, Jianye",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.1118/",
doi = "10.18653/v1/2025.findings-acl.1118",
pages = "21716--21737",
ISBN = "979-8-89176-256-5",
abstract = "Recent advances in Large Language Models (LLMs) have reshaped the landscape of reasoning tasks, particularly through test-time scaling (TTS) to enhance LLM reasoning. Prior research has used structures such as trees or graphs to guide LLMs in searching for optimal solutions. These methods are time-consuming and require a strong reward model (RM) to support effective solution space exploration. Tournament-style approaches eliminate the reliance on RMs through comparative evaluation but suffer from transitivity dilemmas, leading to unstable ordering. To address these issues, we propose War of Thoughts (**WoT**), a novel post-hoc method that enhances reasoning without finetuning. WoT comprises two distinct stages: (1) *Exploration*, in which diverse and meaningful candidate solutions are generated through contrastive demonstrations and multi-granularity reasoning specifications; and (2) *Competition*, where these candidate solutions are subjected to multiple rounds of matchups within a competitive arena. Throughout this iterative process, the solutions are optimized and improved, with the optimal solution being determined based on Elo ratings. Extensive experiments across various LLMs demonstrate the superiority of WoT, surpassing baselines by **10{--}30{\%}**. WoT can effectively stimulate stronger reasoning abilities, achieving impressive TTS performance in both generation budget and model size. It shows higher scalability efficiency compared to the baseline within the same budget. Notably, WoT exhibits excellent scalability with model size, even outperforming a 72B model despite using a 7B model."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="chen-etal-2025-war">
<titleInfo>
<title>War of Thoughts: Competition Stimulates Stronger Reasoning in Large Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yibin</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jinyi</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yan</namePart>
<namePart type="family">Zheng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yifu</namePart>
<namePart type="family">Yuan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jianye</namePart>
<namePart type="family">Hao</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: ACL 2025</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wanxiang</namePart>
<namePart type="family">Che</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Nabende</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ekaterina</namePart>
<namePart type="family">Shutova</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohammad</namePart>
<namePart type="given">Taher</namePart>
<namePart type="family">Pilehvar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Vienna, Austria</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-256-5</identifier>
</relatedItem>
<abstract>Recent advances in Large Language Models (LLMs) have reshaped the landscape of reasoning tasks, particularly through test-time scaling (TTS) to enhance LLM reasoning. Prior research has used structures such as trees or graphs to guide LLMs in searching for optimal solutions. These methods are time-consuming and require a strong reward model (RM) to support effective solution space exploration. Tournament-style approaches eliminate the reliance on RMs through comparative evaluation but suffer from transitivity dilemmas, leading to unstable ordering. To address these issues, we propose War of Thoughts (**WoT**), a novel post-hoc method that enhances reasoning without finetuning. WoT comprises two distinct stages: (1) *Exploration*, in which diverse and meaningful candidate solutions are generated through contrastive demonstrations and multi-granularity reasoning specifications; and (2) *Competition*, where these candidate solutions are subjected to multiple rounds of matchups within a competitive arena. Throughout this iterative process, the solutions are optimized and improved, with the optimal solution being determined based on Elo ratings. Extensive experiments across various LLMs demonstrate the superiority of WoT, surpassing baselines by **10–30%**. WoT can effectively stimulate stronger reasoning abilities, achieving impressive TTS performance in both generation budget and model size. It shows higher scalability efficiency compared to the baseline within the same budget. Notably, WoT exhibits excellent scalability with model size, even outperforming a 72B model despite using a 7B model.</abstract>
<identifier type="citekey">chen-etal-2025-war</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.1118</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.1118/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>21716</start>
<end>21737</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T War of Thoughts: Competition Stimulates Stronger Reasoning in Large Language Models
%A Chen, Yibin
%A Liu, Jinyi
%A Zheng, Yan
%A Yuan, Yifu
%A Hao, Jianye
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F chen-etal-2025-war
%X Recent advances in Large Language Models (LLMs) have reshaped the landscape of reasoning tasks, particularly through test-time scaling (TTS) to enhance LLM reasoning. Prior research has used structures such as trees or graphs to guide LLMs in searching for optimal solutions. These methods are time-consuming and require a strong reward model (RM) to support effective solution space exploration. Tournament-style approaches eliminate the reliance on RMs through comparative evaluation but suffer from transitivity dilemmas, leading to unstable ordering. To address these issues, we propose War of Thoughts (**WoT**), a novel post-hoc method that enhances reasoning without finetuning. WoT comprises two distinct stages: (1) *Exploration*, in which diverse and meaningful candidate solutions are generated through contrastive demonstrations and multi-granularity reasoning specifications; and (2) *Competition*, where these candidate solutions are subjected to multiple rounds of matchups within a competitive arena. Throughout this iterative process, the solutions are optimized and improved, with the optimal solution being determined based on Elo ratings. Extensive experiments across various LLMs demonstrate the superiority of WoT, surpassing baselines by **10–30%**. WoT can effectively stimulate stronger reasoning abilities, achieving impressive TTS performance in both generation budget and model size. It shows higher scalability efficiency compared to the baseline within the same budget. Notably, WoT exhibits excellent scalability with model size, even outperforming a 72B model despite using a 7B model.
%R 10.18653/v1/2025.findings-acl.1118
%U https://aclanthology.org/2025.findings-acl.1118/
%U https://doi.org/10.18653/v1/2025.findings-acl.1118
%P 21716-21737
Markdown (Informal)
[War of Thoughts: Competition Stimulates Stronger Reasoning in Large Language Models](https://aclanthology.org/2025.findings-acl.1118/) (Chen et al., Findings 2025)
ACL
Yibin Chen, Jinyi Liu, Yan Zheng, Yifu Yuan, and Jianye Hao. 2025. War of Thoughts: Competition Stimulates Stronger Reasoning in Large Language Models. In Findings of the Association for Computational Linguistics: ACL 2025, pages 21716–21737, Vienna, Austria. Association for Computational Linguistics.
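
The abstract above describes ranking candidate solutions through multiple rounds of pairwise matchups scored with Elo ratings. As an informal illustration only (this is not the authors' implementation; the `judge` comparator, the number of rounds, and the K-factor are hypothetical placeholders), a standard Elo update over pairwise matchups might look like this:

```python
import random

def elo_expected(r_a: float, r_b: float) -> float:
    """Expected score of A against B under the standard Elo model."""
    return 1.0 / (1.0 + 10 ** ((r_b - r_a) / 400))

def run_matchups(candidates, judge, rounds: int = 5, k: float = 32.0):
    """Rank candidate solutions by Elo after several rounds of pairwise matchups.

    `judge(a, b)` is a hypothetical comparator returning 1.0 if `a` wins,
    0.0 if `b` wins, and 0.5 for a tie (e.g., an LLM acting as a judge).
    """
    ratings = {c: 1000.0 for c in candidates}
    for _ in range(rounds):
        pool = list(candidates)
        random.shuffle(pool)
        # Pair off adjacent candidates for this round of matchups.
        for a, b in zip(pool[::2], pool[1::2]):
            score_a = judge(a, b)
            exp_a = elo_expected(ratings[a], ratings[b])
            ratings[a] += k * (score_a - exp_a)
            ratings[b] += k * ((1.0 - score_a) - (1.0 - exp_a))
    # The highest-rated candidate is taken as the final answer.
    return sorted(ratings.items(), key=lambda kv: kv[1], reverse=True)
```

This sketch only shows the generic Elo bookkeeping; how candidates are generated (the paper's Exploration stage) and how matchups are judged and solutions revised (its Competition stage) are described in the paper itself.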