@inproceedings{yu-etal-2025-accelerate,
title = "Accelerate Parallelizable Reasoning via Parallel Decoding within One Sequence",
author = "Yu, Yijiong and
Wang, Wei and
Chen, Ran and
Pei, Ji",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.457/",
doi = "10.18653/v1/2025.emnlp-main.457",
pages = "9007--9014",
ISBN = "979-8-89176-332-6",
abstract = "Recent advances in reasoning models have demonstrated significant improvements in accuracy by employing detailed and comprehensive reasoning processes. However, generating these lengthy reasoning sequences is computationally expensive and time-consuming. To address this inefficiency, we leverage the inherent parallelizability of certain tasks to accelerate the reasoning process. Specifically, when multiple parallel reasoning steps exist, we decode multiple tokens per forward pass via a tree-like attention mask within a single sequence, avoiding additional memory usage. Experimental results show that our method achieves up to nearly 100{\%} speedup in decoding while basically maintaining the answer quality. Our code is available in https://github.com/yuyijiong/parallel-decoding-in-one-sequence"
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yu-etal-2025-accelerate">
<titleInfo>
<title>Accelerate Parallelizable Reasoning via Parallel Decoding within One Sequence</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yijiong</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wei</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ran</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ji</namePart>
<namePart type="family">Pei</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Recent advances in reasoning models have demonstrated significant improvements in accuracy by employing detailed and comprehensive reasoning processes. However, generating these lengthy reasoning sequences is computationally expensive and time-consuming. To address this inefficiency, we leverage the inherent parallelizability of certain tasks to accelerate the reasoning process. Specifically, when multiple parallel reasoning steps exist, we decode multiple tokens per forward pass via a tree-like attention mask within a single sequence, avoiding additional memory usage. Experimental results show that our method achieves up to nearly 100% speedup in decoding while basically maintaining the answer quality. Our code is available in https://github.com/yuyijiong/parallel-decoding-in-one-sequence</abstract>
<identifier type="citekey">yu-etal-2025-accelerate</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.457</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.457/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>9007</start>
<end>9014</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Accelerate Parallelizable Reasoning via Parallel Decoding within One Sequence
%A Yu, Yijiong
%A Wang, Wei
%A Chen, Ran
%A Pei, Ji
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F yu-etal-2025-accelerate
%X Recent advances in reasoning models have demonstrated significant improvements in accuracy by employing detailed and comprehensive reasoning processes. However, generating these lengthy reasoning sequences is computationally expensive and time-consuming. To address this inefficiency, we leverage the inherent parallelizability of certain tasks to accelerate the reasoning process. Specifically, when multiple parallel reasoning steps exist, we decode multiple tokens per forward pass via a tree-like attention mask within a single sequence, avoiding additional memory usage. Experimental results show that our method achieves up to nearly 100% speedup in decoding while basically maintaining the answer quality. Our code is available in https://github.com/yuyijiong/parallel-decoding-in-one-sequence
%R 10.18653/v1/2025.emnlp-main.457
%U https://aclanthology.org/2025.emnlp-main.457/
%U https://doi.org/10.18653/v1/2025.emnlp-main.457
%P 9007-9014
Markdown (Informal)
[Accelerate Parallelizable Reasoning via Parallel Decoding within One Sequence](https://aclanthology.org/2025.emnlp-main.457/) (Yu et al., EMNLP 2025)
ACL