% BibTeX record for ACL Anthology paper 2026.findings-acl.1938.
% NOTE(review): the third author is stored as family "Xi", given "Wang";
% this may be a swapped name order -- confirm against the paper before citing.
@inproceedings{wang-etal-2026-time,
  title     = {Time-for-Accuracy: Formalizing Chain-of-Thought as an Expansion of Logical Depth},
  author    = {Wang, Yue and
               Zhang, Zhi and
               Xi, Wang and
               Sun, Chengjie and
               Shan, Lili and
               Liu, Bingquan},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.1938/},
  pages     = {38925--38941},
  isbn      = {979-8-89176-395-1},
  abstract  = {Chain-of-thought (CoT) often improves multi-step reasoning, but it remains unclear what kind of additional sequential computation longer traces actually enable. We connect CoT to Bennett{'}s logical depth, separating an answer{'}s description length from the sequential effort required to derive it, and view a CoT budget of $T$ steps as a qualitative cap on realizable sequential computation. To operationalize realized depth beyond raw length, we introduce Effective Logical Depth (ELD), a deletion-based measure of step necessity under a specified inference interface. Across depth-controlled prefix-sum tasks and GSM8K rationale perturbations, we observe two consistent signatures of a Time-for-Accuracy tradeoff: (i) plateau-to-transition accuracy curves as the budget increases from being below to matching the task{'}s required depth, and (ii) sparse, position-dependent deletion sensitivity concentrated in early steps for deeper instances. On GSM8K, an Extract interface, where the model reads off the answer from the remaining rationale, remains near-perfect even after prefix deletions, whereas a Repair interface, where the model must re-solve from truncated rationale context, degrades markedly. Moreover, Socratic human rationales are consistently more robust than Main rationales under Repair. These results suggest that longer CoT helps primarily when it enables additional effective sequential computation, and that deletion-based diagnostics can distinguish computational steps from redundant ones.}
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for the paper with citekey wang-etal-2026-time.
       Top-level <name> elements are the authors; the <relatedItem type="host">
       element describes the containing proceedings volume and its editors. -->
  <mods ID="wang-etal-2026-time">
    <titleInfo>
      <title>Time-for-Accuracy: Formalizing Chain-of-Thought as an Expansion of Logical Depth</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Yue</namePart>
      <namePart type="family">Wang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Zhi</namePart>
      <namePart type="family">Zhang</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wang</namePart>
      <namePart type="family">Xi</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Chengjie</namePart>
      <namePart type="family">Sun</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lili</namePart>
      <namePart type="family">Shan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Bingquan</namePart>
      <namePart type="family">Liu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2026-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: ACL 2026</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Maria</namePart>
        <namePart type="family">Liakata</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Viviane</namePart>
        <namePart type="given">P</namePart>
        <namePart type="family">Moreira</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jiajun</namePart>
        <namePart type="family">Zhang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">David</namePart>
        <namePart type="family">Jurgens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">San Diego, California, United States</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-8-89176-395-1</identifier>
    </relatedItem>
    <abstract>Chain-of-thought (CoT) often improves multi-step reasoning, but it remains unclear what kind of additional sequential computation longer traces actually enable. We connect CoT to Bennett’s logical depth, separating an answer’s description length from the sequential effort required to derive it, and view a CoT budget of T steps as a qualitative cap on realizable sequential computation. To operationalize realized depth beyond raw length, we introduce Effective Logical Depth (ELD), a deletion-based measure of step necessity under a specified inference interface. Across depth-controlled prefix-sum tasks and GSM8K rationale perturbations, we observe two consistent signatures of a Time-for-Accuracy tradeoff: (i) plateau-to-transition accuracy curves as the budget increases from being below to matching the task’s required depth, and (ii) sparse, position-dependent deletion sensitivity concentrated in early steps for deeper instances. On GSM8K, an Extract interface, where the model reads off the answer from the remaining rationale, remains near-perfect even after prefix deletions, whereas a Repair interface, where the model must re-solve from truncated rationale context, degrades markedly. Moreover, Socratic human rationales are consistently more robust than Main rationales under Repair. These results suggest that longer CoT helps primarily when it enables additional effective sequential computation, and that deletion-based diagnostics can distinguish computational steps from redundant ones.</abstract>
    <identifier type="citekey">wang-etal-2026-time</identifier>
    <location>
      <url>https://aclanthology.org/2026.findings-acl.1938/</url>
    </location>
    <part>
      <date>2026-07</date>
      <extent unit="page">
        <start>38925</start>
        <end>38941</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Time-for-Accuracy: Formalizing Chain-of-Thought as an Expansion of Logical Depth
%A Wang, Yue
%A Zhang, Zhi
%A Xi, Wang
%A Sun, Chengjie
%A Shan, Lili
%A Liu, Bingquan
%Y Liakata, Maria
%Y Moreira, Viviane P.
%Y Zhang, Jiajun
%Y Jurgens, David
%S Findings of the Association for Computational Linguistics: ACL 2026
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, United States
%@ 979-8-89176-395-1
%F wang-etal-2026-time
%X Chain-of-thought (CoT) often improves multi-step reasoning, but it remains unclear what kind of additional sequential computation longer traces actually enable. We connect CoT to Bennett’s logical depth, separating an answer’s description length from the sequential effort required to derive it, and view a CoT budget of T steps as a qualitative cap on realizable sequential computation. To operationalize realized depth beyond raw length, we introduce Effective Logical Depth (ELD), a deletion-based measure of step necessity under a specified inference interface. Across depth-controlled prefix-sum tasks and GSM8K rationale perturbations, we observe two consistent signatures of a Time-for-Accuracy tradeoff: (i) plateau-to-transition accuracy curves as the budget increases from being below to matching the task’s required depth, and (ii) sparse, position-dependent deletion sensitivity concentrated in early steps for deeper instances. On GSM8K, an Extract interface, where the model reads off the answer from the remaining rationale, remains near-perfect even after prefix deletions, whereas a Repair interface, where the model must re-solve from truncated rationale context, degrades markedly. Moreover, Socratic human rationales are consistently more robust than Main rationales under Repair. These results suggest that longer CoT helps primarily when it enables additional effective sequential computation, and that deletion-based diagnostics can distinguish computational steps from redundant ones.
%U https://aclanthology.org/2026.findings-acl.1938/
%P 38925-38941
Markdown (Informal)
[Time-for-Accuracy: Formalizing Chain-of-Thought as an Expansion of Logical Depth](https://aclanthology.org/2026.findings-acl.1938/) (Wang et al., Findings 2026)
ACL