@inproceedings{peixu-etal-2025-autoregressive,
title = "How do autoregressive transformers solve full addition?",
author = "Peixu, Wang and
Yu, Chen and
Ming, Yu and
Xiang, Cheng",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.643/",
pages = "12743--12767",
ISBN = "979-8-89176-332-6",
abstract = "Large pre-trained language models have demonstrated impressive capabilities, but there is still much to learn about how they operate. In this study, we conduct an investigation of the autoregressive transformer{'}s ability to perform basic addition operations. Specifically, by using causal analysis we found that a few different attention heads in the middle layers control the addition carry, with each head processing carries of different lengths. Due to the lack of global focus on the sequence within these attention heads, the model struggles to handle long-sequence addition tasks. By performing inference intervention on mistral-7B, partial task performance can be restored, with the accuracy on 20-digit long-sequence additions from 2{\%} to 38{\%}. Through fine-tuning, a new mechanism branches out for handling complex cases, yet it still faces challenges with length generalization. Our research reveals how the models perform basic arithmetic task, and further provides insights into the debate on whether these models are merely statistical."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="peixu-etal-2025-autoregressive">
<titleInfo>
<title>How do autoregressive transformers solve full addition?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Wang</namePart>
<namePart type="family">Peixu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chen</namePart>
<namePart type="family">Yu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yu</namePart>
<namePart type="family">Ming</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Cheng</namePart>
<namePart type="family">Xiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Large pre-trained language models have demonstrated impressive capabilities, but there is still much to learn about how they operate. In this study, we investigate the autoregressive transformer’s ability to perform basic addition operations. Specifically, using causal analysis we found that a few different attention heads in the middle layers control the addition carry, with each head processing carries of different lengths. Because these attention heads lack global focus on the sequence, the model struggles to handle long-sequence addition tasks. By performing inference intervention on Mistral-7B, partial task performance can be restored, with the accuracy on 20-digit long-sequence additions rising from 2% to 38%. Through fine-tuning, a new mechanism branches out for handling complex cases, yet it still faces challenges with length generalization. Our research reveals how these models perform a basic arithmetic task, and further provides insights into the debate on whether these models are merely statistical.</abstract>
<identifier type="citekey">peixu-etal-2025-autoregressive</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.643/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>12743</start>
<end>12767</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T How do autoregressive transformers solve full addition?
%A Peixu, Wang
%A Yu, Chen
%A Ming, Yu
%A Xiang, Cheng
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F peixu-etal-2025-autoregressive
%X Large pre-trained language models have demonstrated impressive capabilities, but there is still much to learn about how they operate. In this study, we investigate the autoregressive transformer’s ability to perform basic addition operations. Specifically, using causal analysis we found that a few different attention heads in the middle layers control the addition carry, with each head processing carries of different lengths. Because these attention heads lack global focus on the sequence, the model struggles to handle long-sequence addition tasks. By performing inference intervention on Mistral-7B, partial task performance can be restored, with the accuracy on 20-digit long-sequence additions rising from 2% to 38%. Through fine-tuning, a new mechanism branches out for handling complex cases, yet it still faces challenges with length generalization. Our research reveals how these models perform a basic arithmetic task, and further provides insights into the debate on whether these models are merely statistical.
%U https://aclanthology.org/2025.emnlp-main.643/
%P 12743-12767
Markdown (Informal)
[How do autoregressive transformers solve full addition?](https://aclanthology.org/2025.emnlp-main.643/) (Peixu et al., EMNLP 2025)
ACL
Wang Peixu, Chen Yu, Yu Ming, and Cheng Xiang. 2025. How do autoregressive transformers solve full addition?. In Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing, pages 12743–12767, Suzhou, China. Association for Computational Linguistics.