@inproceedings{irie-2025-positional,
title = "Why Are Positional Encodings Nonessential for Deep Autoregressive Transformers? A Petroglyph Revisited",
author = "Irie, Kazuki",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-acl.30/",
doi = "10.18653/v1/2025.findings-acl.30",
pages = "551--559",
ISBN = "979-8-89176-256-5",
abstract = "Do autoregressive Transformer language models require explicit positional encodings (PEs)? The answer is `no' provided they have more than one layer{---}they can distinguish sequences with permuted tokens without the need for explicit PEs. This follows from the fact that a cascade of (permutation invariant) set processors can collectively exhibit sequence-sensitive behavior in the autoregressive setting. This property has been known since early efforts (contemporary with GPT-2) adopting the Transformer for language modeling. However, this result does not appear to have been well disseminated, leading to recent rediscoveries. This may be partially due to a sudden growth of the language modeling community after the advent of GPT-2/3, but perhaps also due to the lack of a clear explanation in prior work, despite being commonly understood by practitioners in the past. Here we review the long-forgotten explanation why explicit PEs are nonessential for multi-layer autoregressive Transformers (in contrast, one-layer models require PEs to discern order information of their inputs), as well as the origin of this result, and hope to re-establish it as a common knowledge."
}

<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="irie-2025-positional">
    <titleInfo>
        <title>Why Are Positional Encodings Nonessential for Deep Autoregressive Transformers? A Petroglyph Revisited</title>
    </titleInfo>
    <name type="personal">
        <namePart type="given">Kazuki</namePart>
        <namePart type="family">Irie</namePart>
        <role>
            <roleTerm authority="marcrelator" type="text">author</roleTerm>
        </role>
    </name>
    <originInfo>
        <dateIssued>2025-07</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
        <titleInfo>
            <title>Findings of the Association for Computational Linguistics: ACL 2025</title>
        </titleInfo>
        <name type="personal">
            <namePart type="given">Wanxiang</namePart>
            <namePart type="family">Che</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Joyce</namePart>
            <namePart type="family">Nabende</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Ekaterina</namePart>
            <namePart type="family">Shutova</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <name type="personal">
            <namePart type="given">Mohammad</namePart>
            <namePart type="given">Taher</namePart>
            <namePart type="family">Pilehvar</namePart>
            <role>
                <roleTerm authority="marcrelator" type="text">editor</roleTerm>
            </role>
        </name>
        <originInfo>
            <publisher>Association for Computational Linguistics</publisher>
            <place>
                <placeTerm type="text">Vienna, Austria</placeTerm>
            </place>
        </originInfo>
        <genre authority="marcgt">conference publication</genre>
        <identifier type="isbn">979-8-89176-256-5</identifier>
    </relatedItem>
    <abstract>Do autoregressive Transformer language models require explicit positional encodings (PEs)? The answer is ‘no’ provided they have more than one layer—they can distinguish sequences with permuted tokens without the need for explicit PEs. This follows from the fact that a cascade of (permutation invariant) set processors can collectively exhibit sequence-sensitive behavior in the autoregressive setting. This property has been known since early efforts (contemporary with GPT-2) adopting the Transformer for language modeling. However, this result does not appear to have been well disseminated, leading to recent rediscoveries. This may be partially due to a sudden growth of the language modeling community after the advent of GPT-2/3, but perhaps also due to the lack of a clear explanation in prior work, despite being commonly understood by practitioners in the past. Here we review the long-forgotten explanation of why explicit PEs are nonessential for multi-layer autoregressive Transformers (in contrast, one-layer models require PEs to discern order information of their inputs), as well as the origin of this result, and hope to re-establish it as common knowledge.</abstract>
<identifier type="citekey">irie-2025-positional</identifier>
<identifier type="doi">10.18653/v1/2025.findings-acl.30</identifier>
<location>
<url>https://aclanthology.org/2025.findings-acl.30/</url>
</location>
<part>
<date>2025-07</date>
<extent unit="page">
<start>551</start>
<end>559</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Why Are Positional Encodings Nonessential for Deep Autoregressive Transformers? A Petroglyph Revisited
%A Irie, Kazuki
%Y Che, Wanxiang
%Y Nabende, Joyce
%Y Shutova, Ekaterina
%Y Pilehvar, Mohammad Taher
%S Findings of the Association for Computational Linguistics: ACL 2025
%D 2025
%8 July
%I Association for Computational Linguistics
%C Vienna, Austria
%@ 979-8-89176-256-5
%F irie-2025-positional
%X Do autoregressive Transformer language models require explicit positional encodings (PEs)? The answer is ‘no’ provided they have more than one layer—they can distinguish sequences with permuted tokens without the need for explicit PEs. This follows from the fact that a cascade of (permutation invariant) set processors can collectively exhibit sequence-sensitive behavior in the autoregressive setting. This property has been known since early efforts (contemporary with GPT-2) adopting the Transformer for language modeling. However, this result does not appear to have been well disseminated, leading to recent rediscoveries. This may be partially due to a sudden growth of the language modeling community after the advent of GPT-2/3, but perhaps also due to the lack of a clear explanation in prior work, despite being commonly understood by practitioners in the past. Here we review the long-forgotten explanation of why explicit PEs are nonessential for multi-layer autoregressive Transformers (in contrast, one-layer models require PEs to discern order information of their inputs), as well as the origin of this result, and hope to re-establish it as common knowledge.
%R 10.18653/v1/2025.findings-acl.30
%U https://aclanthology.org/2025.findings-acl.30/
%U https://doi.org/10.18653/v1/2025.findings-acl.30
%P 551-559
Markdown (Informal)
[Why Are Positional Encodings Nonessential for Deep Autoregressive Transformers? A Petroglyph Revisited](https://aclanthology.org/2025.findings-acl.30/) (Irie, Findings 2025)
ACL
Kazuki Irie. 2025. Why Are Positional Encodings Nonessential for Deep Autoregressive Transformers? A Petroglyph Revisited. In Findings of the Association for Computational Linguistics: ACL 2025, pages 551–559, Vienna, Austria. Association for Computational Linguistics.
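
The abstract's central claim can be illustrated with a small experiment. The following is a minimal sketch, not taken from the paper, assuming PyTorch and a toy single-head causal self-attention layer with random weights and no positional encodings: at the last position, a one-layer model gives identical outputs for a sequence and a prefix-permuted copy of it, whereas a two-layer stack generally does not.

```python
import torch
import torch.nn as nn

torch.manual_seed(0)

class CausalSelfAttention(nn.Module):
    """Single-head causal self-attention with a residual connection; no PEs."""
    def __init__(self, d):
        super().__init__()
        self.q = nn.Linear(d, d, bias=False)
        self.k = nn.Linear(d, d, bias=False)
        self.v = nn.Linear(d, d, bias=False)

    def forward(self, x):                                  # x: (T, d)
        T = x.shape[0]
        scores = self.q(x) @ self.k(x).T / x.shape[-1] ** 0.5
        mask = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1)
        scores = scores.masked_fill(mask, float("-inf"))   # causal mask
        return x + torch.softmax(scores, dim=-1) @ self.v(x)

d = 16
emb = nn.Embedding(10, d)                                  # toy vocabulary of 10 tokens
layer1, layer2 = CausalSelfAttention(d), CausalSelfAttention(d)

seq_a = torch.tensor([1, 2, 3])                            # "a b c"
seq_b = torch.tensor([2, 1, 3])                            # "b a c": prefix permuted, last token fixed

with torch.no_grad():
    for name, model in [("1 layer ", lambda x: layer1(x)),
                        ("2 layers", lambda x: layer2(layer1(x)))]:
        out_a = model(emb(seq_a))[-1]                      # representation at the last position
        out_b = model(emb(seq_b))[-1]
        print(name, "last-position outputs equal:",
              torch.allclose(out_a, out_b, atol=1e-5))
# Expected: 1 layer -> True (order-blind), 2 layers -> typically False (order-sensitive)
```

With the hypothetical setup above, the script should print True for the one-layer case and (with random weights) False for the two-layer case, reflecting the paper's point that under causal masking a stack of otherwise permutation-invariant layers can recover order information without explicit PEs.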