@inproceedings{gautam-2026-training,
title = "Training in Step-by-Step Formal Reasoning Improves Pronominal Reasoning in Language Models",
author = "Gautam, Vagrant",
editor = "Demberg, Vera and
Inui, Kentaro and
Marquez, Llu{\'\i}s",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 2: Short Papers)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2026.eacl-short.7/",
pages = "121--135",
ISBN = "979-8-89176-381-4",
abstract = "Large reasoning models are trained to solve problems by decomposing them into steps. While they show impressive progress on reasoning tasks, ``reasoning'' here is typically limited to formal reasoning, i.e., math, code, and logic. An open question is whether these abilities transfer to {\_}pronominal reasoning{\_}, where step-by-step thinking in non-reasoning models worsens performance, but code pre-training may help. I answer this question by evaluating six pairs of original and DeepSeek-distilled models (1.5B-70B parameters) on six challenging datasets for English pronoun resolution (identifying whom a pronoun refers to) and pronoun fidelity (learning and applying a pronoun mapping correctly). Performance improves statistically significantly on all datasets (31{\%} relative increase), indicating that distilling step-by-step formal reasoning does in fact help with pronominal reasoning, in part by improving instruction-following. With a qualitative evaluation of 720 generations, I show that improvements occur across granular error types, and come from plausible-looking reasoning chains employing a variety of reasoning strategies. However, the gains put models just above random performance on these datasets, leaving plenty of room for improvement."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gautam-2026-training">
<titleInfo>
<title>Training in Step-by-Step Formal Reasoning Improves Pronominal Reasoning in Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vagrant</namePart>
<namePart type="family">Gautam</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2026-03</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Vera</namePart>
<namePart type="family">Demberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lluís</namePart>
<namePart type="family">Marquez</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Rabat, Morocco</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-381-4</identifier>
</relatedItem>
<abstract>Large reasoning models are trained to solve problems by decomposing them into steps. While they show impressive progress on reasoning tasks, “reasoning” here is typically limited to formal reasoning, i.e., math, code, and logic. An open question is whether these abilities transfer to _pronominal reasoning_, where step-by-step thinking in non-reasoning models worsens performance, but code pre-training may help. I answer this question by evaluating six pairs of original and DeepSeek-distilled models (1.5B-70B parameters) on six challenging datasets for English pronoun resolution (identifying whom a pronoun refers to) and pronoun fidelity (learning and applying a pronoun mapping correctly). Performance improves statistically significantly on all datasets (31% relative increase), indicating that distilling step-by-step formal reasoning does in fact help with pronominal reasoning, in part by improving instruction-following. With a qualitative evaluation of 720 generations, I show that improvements occur across granular error types, and come from plausible-looking reasoning chains employing a variety of reasoning strategies. However, the gains put models just above random performance on these datasets, leaving plenty of room for improvement.</abstract>
<identifier type="citekey">gautam-2026-training</identifier>
<location>
<url>https://aclanthology.org/2026.eacl-short.7/</url>
</location>
<part>
<date>2026-03</date>
<extent unit="page">
<start>121</start>
<end>135</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Training in Step-by-Step Formal Reasoning Improves Pronominal Reasoning in Language Models
%A Gautam, Vagrant
%Y Demberg, Vera
%Y Inui, Kentaro
%Y Marquez, Lluís
%S Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 2: Short Papers)
%D 2026
%8 March
%I Association for Computational Linguistics
%C Rabat, Morocco
%@ 979-8-89176-381-4
%F gautam-2026-training
%X Large reasoning models are trained to solve problems by decomposing them into steps. While they show impressive progress on reasoning tasks, “reasoning” here is typically limited to formal reasoning, i.e., math, code, and logic. An open question is whether these abilities transfer to _pronominal reasoning_, where step-by-step thinking in non-reasoning models worsens performance, but code pre-training may help. I answer this question by evaluating six pairs of original and DeepSeek-distilled models (1.5B-70B parameters) on six challenging datasets for English pronoun resolution (identifying whom a pronoun refers to) and pronoun fidelity (learning and applying a pronoun mapping correctly). Performance improves statistically significantly on all datasets (31% relative increase), indicating that distilling step-by-step formal reasoning does in fact help with pronominal reasoning, in part by improving instruction-following. With a qualitative evaluation of 720 generations, I show that improvements occur across granular error types, and come from plausible-looking reasoning chains employing a variety of reasoning strategies. However, the gains put models just above random performance on these datasets, leaving plenty of room for improvement.
%U https://aclanthology.org/2026.eacl-short.7/
%P 121-135
Markdown (Informal)
[Training in Step-by-Step Formal Reasoning Improves Pronominal Reasoning in Language Models](https://aclanthology.org/2026.eacl-short.7/) (Gautam, EACL 2026)
ACL