@article{strobl-etal-2025-transformers,
title = "Transformers as Transducers",
author = "Strobl, Lena and
Angluin, Dana and
Chiang, David and
Rawski, Jonathan and
Sabharwal, Ashish",
journal = "Transactions of the Association for Computational Linguistics",
volume = "13",
year = "2025",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2025.tacl-1.9/",
doi = "10.1162/tacl_a_00736",
pages = "200--219",
abstract = "We study the sequence-to-sequence mapping capacity of transformers by relating them to finite transducers, and find that they can express surprisingly large classes of (total functional) transductions. We do so using variants of RASP, a programming language designed to help people {\textquotedblleft}think like transformers,{\textquotedblright} as an intermediate representation. We extend the existing Boolean variant B-RASP to sequence-to-sequence transductions and show that it computes exactly the first-order rational transductions (such as string rotation). Then, we introduce two new extensions. B-RASP[pos] enables calculations on positions (such as copying the first half of a string) and contains all first-order regular transductions. S-RASP adds prefix sum, which enables additional arithmetic operations (such as squaring a string) and contains all first-order polyregular transductions. Finally, we show that masked average-hard attention transformers can simulate S-RASP."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="strobl-etal-2025-transformers">
<titleInfo>
<title>Transformers as Transducers</title>
</titleInfo>
<name type="personal">
<namePart type="given">Lena</namePart>
<namePart type="family">Strobl</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Dana</namePart>
<namePart type="family">Angluin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Chiang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jonathan</namePart>
<namePart type="family">Rawski</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ashish</namePart>
<namePart type="family">Sabharwal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>We study the sequence-to-sequence mapping capacity of transformers by relating them to finite transducers, and find that they can express surprisingly large classes of (total functional) transductions. We do so using variants of RASP, a programming language designed to help people “think like transformers,” as an intermediate representation. We extend the existing Boolean variant B-RASP to sequence-to-sequence transductions and show that it computes exactly the first-order rational transductions (such as string rotation). Then, we introduce two new extensions. B-RASP[pos] enables calculations on positions (such as copying the first half of a string) and contains all first-order regular transductions. S-RASP adds prefix sum, which enables additional arithmetic operations (such as squaring a string) and contains all first-order polyregular transductions. Finally, we show that masked average-hard attention transformers can simulate S-RASP.</abstract>
<identifier type="citekey">strobl-etal-2025-transformers</identifier>
<identifier type="doi">10.1162/tacl_a_00736</identifier>
<location>
<url>https://aclanthology.org/2025.tacl-1.9/</url>
</location>
<part>
<date>2025</date>
<detail type="volume"><number>13</number></detail>
<extent unit="page">
<start>200</start>
<end>219</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Transformers as Transducers
%A Strobl, Lena
%A Angluin, Dana
%A Chiang, David
%A Rawski, Jonathan
%A Sabharwal, Ashish
%J Transactions of the Association for Computational Linguistics
%D 2025
%V 13
%I MIT Press
%C Cambridge, MA
%F strobl-etal-2025-transformers
%X We study the sequence-to-sequence mapping capacity of transformers by relating them to finite transducers, and find that they can express surprisingly large classes of (total functional) transductions. We do so using variants of RASP, a programming language designed to help people “think like transformers,” as an intermediate representation. We extend the existing Boolean variant B-RASP to sequence-to-sequence transductions and show that it computes exactly the first-order rational transductions (such as string rotation). Then, we introduce two new extensions. B-RASP[pos] enables calculations on positions (such as copying the first half of a string) and contains all first-order regular transductions. S-RASP adds prefix sum, which enables additional arithmetic operations (such as squaring a string) and contains all first-order polyregular transductions. Finally, we show that masked average-hard attention transformers can simulate S-RASP.
%R 10.1162/tacl_a_00736
%U https://aclanthology.org/2025.tacl-1.9/
%U https://doi.org/10.1162/tacl_a_00736
%P 200-219
Markdown (Informal)
[Transformers as Transducers](https://aclanthology.org/2025.tacl-1.9/) (Strobl et al., TACL 2025)
ACL
- Lena Strobl, Dana Angluin, David Chiang, Jonathan Rawski, and Ashish Sabharwal. 2025. Transformers as Transducers. Transactions of the Association for Computational Linguistics, 13:200–219.