@inproceedings{thakur-etal-2026-think,
title = "Think Like You Execute: Verifiable Chain of Thought from Program Traces",
author = "Thakur, Shailja and
Saxena, Vaibhav and
Kulkarni, Rohan and
Singh, Shivdeep and
Selvam, Parameswaran and
Kanayama, Hiroshi and
Patel, Hima",
editor = "Li, Yunyao and
Rehm, Georg and
Tu, Mei",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)",
month = jul,
year = "2026",
address = "San Diego, California, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.acl-industry.53/",
pages = "775--795",
ISBN = "979-8-89176-394-4",
abstract = "Teaching language models to reason about code execution is still an open problem. Current synthetic Chain-of-Thought (CoT) training data often consists of plausible-sounding explanations generated by teacher models, not verifiable accounts of actual program behavior. This causes models to learn logically flawed reasoning patterns despite syntactic correctness. We address this by grounding CoT generation directly in program execution traces. Our pipeline instruments code to capture dynamic behavior, narrates execution traces into natural language, and actively verifies each rationale against the trace. We systematically create 54,000 execution-verified, bi-directional rationales that teach models to reason both forward (input$\rightarrow$output) and backward (output$\rightarrow$input). Models fine-tuned on our verified data achieve substantial improvements, with a performance boost of +24.2 on LiveCodeBench-Exec, +22.3 on CruxEval-Output, and +21.1 on CruxEval-Input, demonstrating that verification quality directly determines both reasoning and code generation capabilities."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="thakur-etal-2026-think">
<titleInfo>
<title>Think Like You Execute: Verifiable Chain of Thought from Program Traces</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shailja</namePart>
<namePart type="family">Thakur</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Vaibhav</namePart>
<namePart type="family">Saxena</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rohan</namePart>
<namePart type="family">Kulkarni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shivdeep</namePart>
<namePart type="family">Singh</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Parameswaran</namePart>
<namePart type="family">Selvam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hiroshi</namePart>
<namePart type="family">Kanayama</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hima</namePart>
<namePart type="family">Patel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Georg</namePart>
<namePart type="family">Rehm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mei</namePart>
<namePart type="family">Tu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">San Diego, California, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-394-4</identifier>
</relatedItem>
<abstract>Teaching language models to reason about code execution is still an open problem. Current synthetic Chain-of-Thought (CoT) training data often consists of plausible-sounding explanations generated by teacher models, not verifiable accounts of actual program behavior. This causes models to learn logically flawed reasoning patterns despite syntactic correctness. We address this by grounding CoT generation directly in program execution traces. Our pipeline instruments code to capture dynamic behavior, narrates execution traces into natural language, and actively verifies each rationale against the trace. We systematically create 54,000 execution-verified, bi-directional rationales that teach models to reason both forward (input→output) and backward (output→input). Models fine-tuned on our verified data achieve substantial improvements, with a performance boost of +24.2 on LiveCodeBench-Exec, +22.3 on CruxEval-Output, and +21.1 on CruxEval-Input, demonstrating that verification quality directly determines both reasoning and code generation capabilities.</abstract>
<identifier type="citekey">thakur-etal-2026-think</identifier>
<location>
<url>https://aclanthology.org/2026.acl-industry.53/</url>
</location>
<part>
<date>2026-07</date>
<extent unit="page">
<start>775</start>
<end>795</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Think Like You Execute: Verifiable Chain of Thought from Program Traces
%A Thakur, Shailja
%A Saxena, Vaibhav
%A Kulkarni, Rohan
%A Singh, Shivdeep
%A Selvam, Parameswaran
%A Kanayama, Hiroshi
%A Patel, Hima
%Y Li, Yunyao
%Y Rehm, Georg
%Y Tu, Mei
%S Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026)
%D 2026
%8 July
%I Association for Computational Linguistics
%C San Diego, California, USA
%@ 979-8-89176-394-4
%F thakur-etal-2026-think
%X Teaching language models to reason about code execution is still an open problem. Current synthetic Chain-of-Thought (CoT) training data often consists of plausible-sounding explanations generated by teacher models, not verifiable accounts of actual program behavior. This causes models to learn logically flawed reasoning patterns despite syntactic correctness. We address this by grounding CoT generation directly in program execution traces. Our pipeline instruments code to capture dynamic behavior, narrates execution traces into natural language, and actively verifies each rationale against the trace. We systematically create 54,000 execution-verified, bi-directional rationales that teach models to reason both forward (input→output) and backward (output→input). Models fine-tuned on our verified data achieve substantial improvements, with a performance boost of +24.2 on LiveCodeBench-Exec, +22.3 on CruxEval-Output, and +21.1 on CruxEval-Input, demonstrating that verification quality directly determines both reasoning and code generation capabilities.
%U https://aclanthology.org/2026.acl-industry.53/
%P 775-795
Markdown (Informal)
[Think Like You Execute: Verifiable Chain of Thought from Program Traces](https://aclanthology.org/2026.acl-industry.53/) (Thakur et al., ACL 2026)
ACL
- Shailja Thakur, Vaibhav Saxena, Rohan Kulkarni, Shivdeep Singh, Parameswaran Selvam, Hiroshi Kanayama, and Hima Patel. 2026. Think Like You Execute: Verifiable Chain of Thought from Program Traces. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (ACL 2026), pages 775–795, San Diego, California, USA. Association for Computational Linguistics.