% Conference paper in the Findings of ACL 2026 volume (record hosted at aclanthology.org).
% The abstract is stored as plain text: emphasis markup is left to the bibliography style,
% and the capitals in "Self-Imitation and self-Guidance MechAnisms" spell out SIGMA.
@inproceedings{shi-etal-2026-exploration,
    title     = {Exploration-Exploitation Reshaping towards Efficient Reasoning for Large Language Models},
    author    = {Shi, Yufeng and
                 Luo, Weilin and
                 Zhang, Yuxiang and
                 Zhang, Zongmeng and
                 Liu, Haoyang and
                 Wang, Yubing and
                 Wang, Bin and
                 Zhou, Wengang and
                 Li, Houqiang},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.findings-acl.1520/},
    pages     = {30392--30407},
    isbn      = {979-8-89176-395-1},
    abstract  = {While excelling at solving complex problems, Large Reasoning Models (LRMs) are still constrained by the overthinking issue. Most current studies rely on reward shaping in Reinforcement Learning (RL) to shorten the Chain-of-Thought (CoT) of LRMs, remaining sample-inefficient and non-robust due to the absence of guided exploration and prioritized exploitation. To address these issues, we propose a novel policy optimization framework with Self-Imitation and self-Guidance MechAnisms (SIGMA), which reshapes the exploration and exploitation through two core components: (i) self-imitation exploitation, which enables the prioritized exploitation of high-value prompts and rollouts by introducing a self-imitated loss and a dynamic sampling strategy based on compression rate; (ii) self-guidance exploration, which provides a preference-aware exploration guidance through diverse and pluggable self-rewriting strategies. Experiments across various datasets indicate that our method achieves superior reasoning efficiency without compromising, and even facilitating, the overall accuracy. Furthermore, ablation studies show that the proposed mechanisms can provide flexible control interfaces for the tradeoff between the reasoning accuracy and efficiency of LRMs.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- Single MODS record; its ID matches the citekey identifier further down. -->
  <mods ID="shi-etal-2026-exploration">
    <titleInfo>
      <title>Exploration-Exploitation Reshaping towards Efficient Reasoning for Large Language Models</title>
    </titleInfo>

    <!-- Paper authors, one <name> element per person. -->
    <name type="personal">
      <namePart type="given">Yufeng</namePart>
      <namePart type="family">Shi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Weilin</namePart>
      <namePart type="family">Luo</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yuxiang</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zongmeng</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Haoyang</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yubing</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bin</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wengang</namePart>
      <namePart type="family">Zhou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Houqiang</namePart>
      <namePart type="family">Li</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>

    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>

    <!-- Host item: the proceedings volume, with its editors, publisher, place and ISBN. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>

    <!-- Abstract text is kept verbatim, including its inline emphasis markers. -->
    <abstract>While excelling at solving complex problems, Large Reasoning Models (LRMs) are still constrained by the overthinking issue. Most current studies rely on reward shaping in Reinforcement Learning (RL) to shorten the Chain-of-Thought (CoT) of LRMs, remaining sample-inefficient and non-robust due to the absence of guided exploration and prioritized exploitation. To address these issues, we propose a novel policy optimization framework with **S**elf-**I**mitation and self-**G**uidance **M**ech**A**nisms (SIGMA), which reshapes the exploration and exploitation through two core components: (i) **self-imitation exploitation**, which enables the prioritized exploitation of high-value prompts and rollouts by introducing a self-imitated loss and a dynamic sampling strategy based on compression rate; (ii) **self-guidance exploration**, which provides a preference-aware exploration guidance through diverse and pluggable self-rewriting strategies. Experiments across various datasets indicate that our method achieves superior reasoning efficiency without compromising, and even facilitating, the overall accuracy. Furthermore, ablation studies show that the proposed mechanisms can provide flexible control interfaces for the tradeoff between the reasoning accuracy and efficiency of LRMs.</abstract>

    <!-- Identifiers and location of the paper itself. -->
    <identifier type="citekey">shi-etal-2026-exploration</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1520/</url>
    </location>

    <!-- Position of the paper inside the host volume. -->
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>30392</start>
        <end>30407</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Exploration-Exploitation Reshaping towards Efficient Reasoning for Large Language Models
%A Shi, Yufeng
%A Luo, Weilin
%A Zhang, Yuxiang
%A Zhang, Zongmeng
%A Liu, Haoyang
%A Wang, Yubing
%A Wang, Bin
%A Zhou, Wengang
%A Li, Houqiang
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F shi-etal-2026-exploration
%X While excelling at solving complex problems, Large Reasoning Models (LRMs) are still constrained by the overthinking issue. Most current studies rely on reward shaping in Reinforcement Learning (RL) to shorten the Chain-of-Thought (CoT) of LRMs, remaining sample-inefficient and non-robust due to the absence of guided exploration and prioritized exploitation. To address these issues, we propose a novel policy optimization framework with **S**elf-**I**mitation and self-**G**uidance **M**ech**A**nisms (SIGMA), which reshapes the exploration and exploitation through two core components: (i) **self-imitation exploitation**, which enables the prioritized exploitation of high-value prompts and rollouts by introducing a self-imitated loss and a dynamic sampling strategy based on compression rate; (ii) **self-guidance exploration**, which provides a preference-aware exploration guidance through diverse and pluggable self-rewriting strategies. Experiments across various datasets indicate that our method achieves superior reasoning efficiency without compromising, and even facilitating, the overall accuracy. Furthermore, ablation studies show that the proposed mechanisms can provide flexible control interfaces for the tradeoff between the reasoning accuracy and efficiency of LRMs.
%U https://aclanthology.org/2026.findings-acl.1520/
%P 30392-30407
Markdown (Informal)
[Exploration-Exploitation Reshaping towards Efficient Reasoning for Large Language Models](https://aclanthology.org/2026.findings-acl.1520/) (Shi et al., Findings 2026)
ACL
- Yufeng Shi, Weilin Luo, Yuxiang Zhang, Zongmeng Zhang, Haoyang Liu, Yubing Wang, Bin Wang, Wengang Zhou, and Houqiang Li. 2026. Exploration-Exploitation Reshaping towards Efficient Reasoning for Large Language Models. In Findings of the Association for Computational Linguistics: ACL 2026, pages 30392–30407, San Diego, California, United States. Association for Computational Linguistics.